How to parse HTML using Python HTMLParser

A sample code snippet showing how to use the Python module HTMLParser to extract values from a well-formed HTML document that contains multiple input tags like this one:

<input name="fileIDs" value="123456" />

Code snippet:

import urllib, urllib2
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect the ``value`` of every ``<input name="fileIDs">`` tag.

    The whole document is parsed inside the constructor; afterwards the
    collected values are available from ``get_fileids()`` (or directly
    from the ``fileids`` attribute) in document order.
    """

    def __init__(self, fh):
        """
        {fh} must be an input stream returned by open() or urllib2.urlopen()

        Only ``fh.read()`` is called on it; the stream is not closed here,
        so closing it stays the caller's responsibility.
        """
        # Explicit base-class call rather than super(): this keeps the
        # original call style, which does not require a new-style class.
        HTMLParser.__init__(self)
        self.fileids = []
        self.feed(fh.read())

    def handle_starttag(self, tag, attrs):
        """Record the value of an ``<input>`` tag whose name is ``fileIDs``.

        {attrs} is the list of (name, value) pairs supplied by HTMLParser.
        Inputs without a ``name`` attribute, with a different name, or
        without a usable ``value`` are ignored instead of raising KeyError.
        """
        if tag != 'input':
            return
        attr_map = dict(attrs)
        # .get() instead of indexing: many <input> tags (submit buttons,
        # for example) legitimately carry no "name" attribute at all.
        if attr_map.get('name') != 'fileIDs':
            return
        # A missing "value", or a bare `value` attribute with no data
        # (reported by the parser as None), carries no file ID.
        value = attr_map.get('value')
        if value is not None:
            self.fileids.append(value)

    def get_fileids(self):
        """Return the list of collected file IDs, in document order."""
        return self.fileids
# Build an opener with handler debugging switched on (debuglevel=1) and a
# browser-like User-agent header in place of the opener's default headers.
opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=1))
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
response = opener.open("http://www.example.com/200.html")
try:
    # MyHTMLParser reads and parses the whole response in its constructor;
    # the collected IDs are then available via myparser.get_fileids().
    myparser = MyHTMLParser(response)
finally:
    # Always release the HTTP connection, even if parsing raises.
    response.close()
Reference:
Advertisement

One Response to “How to parse HTML using Python HTMLParser”

  1. Yoga Says:

    That was a truly amazing post!!

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Connecting to %s


Follow

Get every new post delivered to your Inbox.