How to parse HTML using Python HTMLParser

A sample code snippet showing how to use the Python module HTMLParser to extract, from a well-formed HTML document, the values of multiple input tags such as:

<input name="fileIDs" value="123456" />

Code snippet:

import urllib, urllib2
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect the ``value`` of every ``<input name="fileIDs" ...>`` tag.

    The whole document is parsed during construction. Afterwards the
    collected values are available, in document order, in ``self.fileids``
    or through ``get_fileids()``.
    """

    def __init__(self, fh):
        """Parse the document read from {fh} and collect its fileIDs values.

        {fh} must be an input stream returned by open() or urllib2.urlopen();
        it is consumed with a single read() call.
        """
        HTMLParser.__init__(self)
        self.fileids = []
        # feed() invokes handle_starttag() for each start tag it encounters.
        self.feed(fh.read())

    def handle_starttag(self, tag, attrs):
        """Record the value of an <input> tag whose name is 'fileIDs'.

        {attrs} is the list of (name, value) pairs passed in by HTMLParser.
        Tags that lack a ``name`` attribute, or a matching tag that carries
        no ``value``, are skipped rather than raising KeyError.
        """
        if tag != 'input':
            return
        attrD = dict(attrs)
        # .get() so inputs without a name (e.g. submit buttons) are ignored.
        if attrD.get('name') != 'fileIDs':
            return
        value = attrD.get('value')
        if value is not None:
            self.fileids.append(value)

    def get_fileids(self):
        """Return the list of collected fileIDs values (same object as self.fileids)."""
        return self.fileids
# Build an opener whose HTTP handler is created with debuglevel=1.
http_handler = urllib2.HTTPHandler(debuglevel=1)
opener = urllib2.build_opener(http_handler)
# Send a browser-like User-agent header with every request made by this opener.
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
# Fetch the page and hand the response stream to the parser, which reads and
# parses it during construction.
response = opener.open("http://www.example.com/200.html")
myparser = MyHTMLParser(response)
Reference:

One thought on “How to parse HTML using Python HTMLParser”

Leave a comment