Mailing List Archive

HTML table parser
Thanks for all the example htmllib code that people sent my way.
After some reading and looking at examples, I found htmllib to be
perfect for what I wanted. Along the way I came up with an html
table parser that others may find useful, so here it is. It is
intended to be inherited along with htmllib.HTMLParser to create
a parser for data tables.

Mike

--
Michael A. Miller mmiller@jlab.org
Department of Physics, University of Illinois, Urbana-Champaign



# Module-level boolean constants, derived from comparisons so that no
# built-in boolean names are required.  TableParser uses them for its
# ``finished`` flag.
TRUE = (0 == 0)
FALSE = (0 != 0)

class HTMLTableRow:
    """One table row: a list of heading cells plus a list of data cells.

    The first row of a table normally carries only column headings and
    no data.  Later rows carry one (or more) row headings followed by
    their data cells.
    """

    def __init__(self):
        # Both lists start out empty and are filled in by the owning table.
        self.data = list()
        self.headings = list()


class HTMLTableData:
    """Storage class for table data parsed by TableParser.

    The table is held in ``self.data`` as a list of HTMLTableRow objects.
    Headings and data cells are always appended to the most recently
    added row, so new_row() must be called before add_heading() or
    add_data().
    """

    def __init__(self):
        # One HTMLTableRow per row, in the order the rows were added.
        self.data = []

    def new_row(self):
        """Append a fresh, empty HTMLTableRow to the table."""
        self.data.append(HTMLTableRow())

    def rows(self):
        """Return the number of rows added so far."""
        return len(self.data)

    def columns(self):
        """Return the number of data cells in the first row.

        Returns 0 for an empty table.  Headings are not counted, so a
        table whose first row holds only column headings also reports 0.
        """
        if self.rows() > 0:
            return len(self.data[0].data)
        return 0

    def last_row(self):
        """Return the most recently added row.

        Raises IndexError if no row has been added yet.
        """
        return self.data[-1]

    def add_data(self, data):
        """Append a data cell to the last row.

        Raises IndexError if no row has been added yet.
        """
        # No column bookkeeping is needed here: columns() derives the
        # count on demand.  The previous code compared the bound method
        # ``self.rows`` with 1 (never true) and, had it run, would have
        # replaced the columns() method with a failed method + int sum.
        self.data[-1].data.append(data)

    def add_heading(self, data):
        """Append a heading cell to the last row.

        Raises IndexError if no row has been added yet.
        """
        self.data[-1].headings.append(data)

    def __repr__(self):
        """Return a compact text rendering of the table.

        Each row becomes one line.  A row without data shows all of its
        headings; a row with data shows only its first heading (if any)
        followed by the data cells.  Every cell is truncated to 30
        characters and right-justified in a 30-character field.
        """
        width = 30
        cell_format = ' %%%ds ' % (width,)
        pieces = ['>>> Table <<<\n']
        for row in self.data:
            if len(row.data) == 0:
                cells = row.headings
            else:
                # Only the first heading is shown next to the data.
                cells = list(row.headings[:1]) + list(row.data)
            pieces.extend([cell_format % (cell[:width],) for cell in cells])
            pieces.append('\n')
        return ''.join(pieces)

    __str__ = __repr__

    def full_repr(self):
        """Return a verbose text rendering of the table.

        Each row produces two lines: all of its headings, each tagged
        ``(h)-``, and then all of its data cells, each tagged ``(d)-``.
        Every cell is truncated to 40 characters and right-justified in
        a 40-character field.
        """
        width = 40
        heading_format = ' (h)-%%%ds ' % (width,)
        data_format = ' (d)-%%%ds ' % (width,)
        pieces = ['>>> Full Table <<<\n']
        for row in self.data:
            pieces.extend(
                [heading_format % (cell[:width],) for cell in row.headings])
            pieces.append('\n')
            pieces.extend(
                [data_format % (cell[:width],) for cell in row.data])
            pieces.append('\n')
        return ''.join(pieces)


class TableParser:
    """Tag handlers that collect the HTML tables seen while parsing.

    Every <table> start tag appends a new HTMLTableData to the ``tables``
    list.  <tr>, <td> and <th> tags are recorded in the last table of
    that list, so the handlers fail if a <tr>, <td> or <th> tag arrives
    before any <table> tag.

    Table structure is like this:

        <tr> <th> <th> ... <th>    <--- column headings
        <tr> <th> <td> ... <td>
        <tr> <th> <td> ... <td>
        <tr> <th> <td> ... <td>
               ^
               row headings

    The cell handlers call self.save_bgn() and self.save_end(), which
    are not defined in this class; they must be supplied by the class it
    is combined with (presumably htmllib.HTMLParser -- confirm).
    """

    def __init__(self):
        self.tables = []
        # Set to TRUE by end_table() and back to FALSE by start_table().
        self.finished = FALSE

    def current_table(self):
        """Return the most recently started table.

        Raises IndexError if no <table> tag has been seen yet.
        """
        return self.tables[-1]

    # --- <table> ---------------------------------------------------------

    def start_table(self, attrs):
        """Begin a new table and mark parsing as unfinished."""
        self.finished = FALSE
        self.tables.append(HTMLTableData())

    def end_table(self):
        """Mark the table as finished."""
        self.finished = TRUE

    # --- <tr> ------------------------------------------------------------

    def start_tr(self, attrs):
        """Open a new row in the current table."""
        table = self.current_table()
        table.new_row()

    def end_tr(self):
        """Nothing to do when a row closes."""

    # --- <th> ------------------------------------------------------------

    def start_th(self, attrs):
        """Start capturing the text of a heading cell."""
        self.save_bgn()

    def end_th(self):
        """Store the captured text as a heading in the current row."""
        heading_text = self.save_end()
        self.current_table().add_heading(heading_text)

    # --- <td> ------------------------------------------------------------

    def start_td(self, attrs):
        """Start capturing the text of a data cell."""
        self.save_bgn()

    def end_td(self):
        """Store the captured text as a data cell in the current row."""
        cell_text = self.save_end()
        self.current_table().add_data(cell_text)