buildframework/helium/external/python/lib/common/docutils-0.5-py2.5.egg/docutils/parsers/rst/tableparser.py
changeset 179 d8ac696cc51f
equal deleted inserted replaced
1:be27ed110b50 179:d8ac696cc51f
       
     1 # $Id: tableparser.py 4564 2006-05-21 20:44:42Z wiemann $
       
     2 # Author: David Goodger <goodger@python.org>
       
     3 # Copyright: This module has been placed in the public domain.
       
     4 
       
     5 """
       
     6 This module defines table parser classes,which parse plaintext-graphic tables
       
     7 and produce a well-formed data structure suitable for building a CALS table.
       
     8 
       
     9 :Classes:
       
    10     - `GridTableParser`: Parse fully-formed tables represented with a grid.
       
    11     - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
       
    12       borders.
       
    13 
       
    14 :Exception class: `TableMarkupError`
       
    15 
       
    16 :Function:
       
    17     `update_dict_of_lists()`: Merge two dictionaries containing list values.
       
    18 """
       
    19 
       
    20 __docformat__ = 'reStructuredText'
       
    21 
       
    22 
       
    23 import re
       
    24 import sys
       
    25 from docutils import DataError
       
    26 
       
    27 
       
    28 class TableMarkupError(DataError): pass
       
    29 
       
    30 
       
    31 class TableParser:
       
    32 
       
    33     """
       
    34     Abstract superclass for the common parts of the syntax-specific parsers.
       
    35     """
       
    36 
       
    37     head_body_separator_pat = None
       
    38     """Matches the row separator between head rows and body rows."""
       
    39 
       
    40     double_width_pad_char = '\x00'
       
    41     """Padding character for East Asian double-width text."""
       
    42 
       
    43     def parse(self, block):
       
    44         """
       
    45         Analyze the text `block` and return a table data structure.
       
    46 
       
    47         Given a plaintext-graphic table in `block` (list of lines of text; no
       
    48         whitespace padding), parse the table, construct and return the data
       
    49         necessary to construct a CALS table or equivalent.
       
    50 
       
    51         Raise `TableMarkupError` if there is any problem with the markup.
       
    52         """
       
    53         self.setup(block)
       
    54         self.find_head_body_sep()
       
    55         self.parse_table()
       
    56         structure = self.structure_from_cells()
       
    57         return structure
       
    58 
       
    59     def find_head_body_sep(self):
       
    60         """Look for a head/body row separator line; store the line index."""
       
    61         for i in range(len(self.block)):
       
    62             line = self.block[i]
       
    63             if self.head_body_separator_pat.match(line):
       
    64                 if self.head_body_sep:
       
    65                     raise TableMarkupError(
       
    66                         'Multiple head/body row separators in table (at line '
       
    67                         'offset %s and %s); only one allowed.'
       
    68                         % (self.head_body_sep, i))
       
    69                 else:
       
    70                     self.head_body_sep = i
       
    71                     self.block[i] = line.replace('=', '-')
       
    72         if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
       
    73                                                              - 1):
       
    74             raise TableMarkupError('The head/body row separator may not be '
       
    75                                    'the first or last line of the table.')
       
    76 
       
    77 
       
    78 class GridTableParser(TableParser):
       
    79 
       
    80     """
       
    81     Parse a grid table using `parse()`.
       
    82 
       
    83     Here's an example of a grid table::
       
    84 
       
    85         +------------------------+------------+----------+----------+
       
    86         | Header row, column 1   | Header 2   | Header 3 | Header 4 |
       
    87         +========================+============+==========+==========+
       
    88         | body row 1, column 1   | column 2   | column 3 | column 4 |
       
    89         +------------------------+------------+----------+----------+
       
    90         | body row 2             | Cells may span columns.          |
       
    91         +------------------------+------------+---------------------+
       
    92         | body row 3             | Cells may  | - Table cells       |
       
    93         +------------------------+ span rows. | - contain           |
       
    94         | body row 4             |            | - body elements.    |
       
    95         +------------------------+------------+---------------------+
       
    96 
       
    97     Intersections use '+', row separators use '-' (except for one optional
       
    98     head/body row separator, which uses '='), and column separators use '|'.
       
    99 
       
   100     Passing the above table to the `parse()` method will result in the
       
   101     following data structure::
       
   102 
       
   103         ([24, 12, 10, 10],
       
   104          [[(0, 0, 1, ['Header row, column 1']),
       
   105            (0, 0, 1, ['Header 2']),
       
   106            (0, 0, 1, ['Header 3']),
       
   107            (0, 0, 1, ['Header 4'])]],
       
   108          [[(0, 0, 3, ['body row 1, column 1']),
       
   109            (0, 0, 3, ['column 2']),
       
   110            (0, 0, 3, ['column 3']),
       
   111            (0, 0, 3, ['column 4'])],
       
   112           [(0, 0, 5, ['body row 2']),
       
   113            (0, 2, 5, ['Cells may span columns.']),
       
   114            None,
       
   115            None],
       
   116           [(0, 0, 7, ['body row 3']),
       
   117            (1, 0, 7, ['Cells may', 'span rows.', '']),
       
   118            (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
       
   119            None],
       
   120           [(0, 0, 9, ['body row 4']), None, None, None]])
       
   121 
       
   122     The first item is a list containing column widths (colspecs). The second
       
   123     item is a list of head rows, and the third is a list of body rows. Each
       
   124     row contains a list of cells. Each cell is either None (for a cell unused
       
   125     because of another cell's span), or a tuple. A cell tuple contains four
       
   126     items: the number of extra rows used by the cell in a vertical span
       
   127     (morerows); the number of extra columns used by the cell in a horizontal
       
   128     span (morecols); the line offset of the first line of the cell contents;
       
   129     and the cell contents, a list of lines of text.
       
   130     """
       
   131 
       
   132     head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')
       
   133 
       
   134     def setup(self, block):
       
   135         self.block = block[:]           # make a copy; it may be modified
       
   136         self.block.disconnect()         # don't propagate changes to parent
       
   137         self.bottom = len(block) - 1
       
   138         self.right = len(block[0]) - 1
       
   139         self.head_body_sep = None
       
   140         self.done = [-1] * len(block[0])
       
   141         self.cells = []
       
   142         self.rowseps = {0: [0]}
       
   143         self.colseps = {0: [0]}
       
   144 
       
   145     def parse_table(self):
       
   146         """
       
   147         Start with a queue of upper-left corners, containing the upper-left
       
   148         corner of the table itself. Trace out one rectangular cell, remember
       
   149         it, and add its upper-right and lower-left corners to the queue of
       
   150         potential upper-left corners of further cells. Process the queue in
       
   151         top-to-bottom order, keeping track of how much of each text column has
       
   152         been seen.
       
   153 
       
   154         We'll end up knowing all the row and column boundaries, cell positions
       
   155         and their dimensions.
       
   156         """
       
   157         corners = [(0, 0)]
       
   158         while corners:
       
   159             top, left = corners.pop(0)
       
   160             if top == self.bottom or left == self.right \
       
   161                   or top <= self.done[left]:
       
   162                 continue
       
   163             result = self.scan_cell(top, left)
       
   164             if not result:
       
   165                 continue
       
   166             bottom, right, rowseps, colseps = result
       
   167             update_dict_of_lists(self.rowseps, rowseps)
       
   168             update_dict_of_lists(self.colseps, colseps)
       
   169             self.mark_done(top, left, bottom, right)
       
   170             cellblock = self.block.get_2D_block(top + 1, left + 1,
       
   171                                                 bottom, right)
       
   172             cellblock.disconnect()      # lines in cell can't sync with parent
       
   173             cellblock.replace(self.double_width_pad_char, '')
       
   174             self.cells.append((top, left, bottom, right, cellblock))
       
   175             corners.extend([(top, right), (bottom, left)])
       
   176             corners.sort()
       
   177         if not self.check_parse_complete():
       
   178             raise TableMarkupError('Malformed table; parse incomplete.')
       
   179 
       
   180     def mark_done(self, top, left, bottom, right):
       
   181         """For keeping track of how much of each text column has been seen."""
       
   182         before = top - 1
       
   183         after = bottom - 1
       
   184         for col in range(left, right):
       
   185             assert self.done[col] == before
       
   186             self.done[col] = after
       
   187 
       
   188     def check_parse_complete(self):
       
   189         """Each text column should have been completely seen."""
       
   190         last = self.bottom - 1
       
   191         for col in range(self.right):
       
   192             if self.done[col] != last:
       
   193                 return None
       
   194         return 1
       
   195 
       
   196     def scan_cell(self, top, left):
       
   197         """Starting at the top-left corner, start tracing out a cell."""
       
   198         assert self.block[top][left] == '+'
       
   199         result = self.scan_right(top, left)
       
   200         return result
       
   201 
       
   202     def scan_right(self, top, left):
       
   203         """
       
   204         Look for the top-right corner of the cell, and make note of all column
       
   205         boundaries ('+').
       
   206         """
       
   207         colseps = {}
       
   208         line = self.block[top]
       
   209         for i in range(left + 1, self.right + 1):
       
   210             if line[i] == '+':
       
   211                 colseps[i] = [top]
       
   212                 result = self.scan_down(top, left, i)
       
   213                 if result:
       
   214                     bottom, rowseps, newcolseps = result
       
   215                     update_dict_of_lists(colseps, newcolseps)
       
   216                     return bottom, i, rowseps, colseps
       
   217             elif line[i] != '-':
       
   218                 return None
       
   219         return None
       
   220 
       
   221     def scan_down(self, top, left, right):
       
   222         """
       
   223         Look for the bottom-right corner of the cell, making note of all row
       
   224         boundaries.
       
   225         """
       
   226         rowseps = {}
       
   227         for i in range(top + 1, self.bottom + 1):
       
   228             if self.block[i][right] == '+':
       
   229                 rowseps[i] = [right]
       
   230                 result = self.scan_left(top, left, i, right)
       
   231                 if result:
       
   232                     newrowseps, colseps = result
       
   233                     update_dict_of_lists(rowseps, newrowseps)
       
   234                     return i, rowseps, colseps
       
   235             elif self.block[i][right] != '|':
       
   236                 return None
       
   237         return None
       
   238 
       
   239     def scan_left(self, top, left, bottom, right):
       
   240         """
       
   241         Noting column boundaries, look for the bottom-left corner of the cell.
       
   242         It must line up with the starting point.
       
   243         """
       
   244         colseps = {}
       
   245         line = self.block[bottom]
       
   246         for i in range(right - 1, left, -1):
       
   247             if line[i] == '+':
       
   248                 colseps[i] = [bottom]
       
   249             elif line[i] != '-':
       
   250                 return None
       
   251         if line[left] != '+':
       
   252             return None
       
   253         result = self.scan_up(top, left, bottom, right)
       
   254         if result is not None:
       
   255             rowseps = result
       
   256             return rowseps, colseps
       
   257         return None
       
   258 
       
   259     def scan_up(self, top, left, bottom, right):
       
   260         """
       
   261         Noting row boundaries, see if we can return to the starting point.
       
   262         """
       
   263         rowseps = {}
       
   264         for i in range(bottom - 1, top, -1):
       
   265             if self.block[i][left] == '+':
       
   266                 rowseps[i] = [left]
       
   267             elif self.block[i][left] != '|':
       
   268                 return None
       
   269         return rowseps
       
   270 
       
   271     def structure_from_cells(self):
       
   272         """
       
   273         From the data collected by `scan_cell()`, convert to the final data
       
   274         structure.
       
   275         """
       
   276         rowseps = self.rowseps.keys()   # list of row boundaries
       
   277         rowseps.sort()
       
   278         rowindex = {}
       
   279         for i in range(len(rowseps)):
       
   280             rowindex[rowseps[i]] = i    # row boundary -> row number mapping
       
   281         colseps = self.colseps.keys()   # list of column boundaries
       
   282         colseps.sort()
       
   283         colindex = {}
       
   284         for i in range(len(colseps)):
       
   285             colindex[colseps[i]] = i    # column boundary -> col number map
       
   286         colspecs = [(colseps[i] - colseps[i - 1] - 1)
       
   287                     for i in range(1, len(colseps))] # list of column widths
       
   288         # prepare an empty table with the correct number of rows & columns
       
   289         onerow = [None for i in range(len(colseps) - 1)]
       
   290         rows = [onerow[:] for i in range(len(rowseps) - 1)]
       
   291         # keep track of # of cells remaining; should reduce to zero
       
   292         remaining = (len(rowseps) - 1) * (len(colseps) - 1)
       
   293         for top, left, bottom, right, block in self.cells:
       
   294             rownum = rowindex[top]
       
   295             colnum = colindex[left]
       
   296             assert rows[rownum][colnum] is None, (
       
   297                   'Cell (row %s, column %s) already used.'
       
   298                   % (rownum + 1, colnum + 1))
       
   299             morerows = rowindex[bottom] - rownum - 1
       
   300             morecols = colindex[right] - colnum - 1
       
   301             remaining -= (morerows + 1) * (morecols + 1)
       
   302             # write the cell into the table
       
   303             rows[rownum][colnum] = (morerows, morecols, top + 1, block)
       
   304         assert remaining == 0, 'Unused cells remaining.'
       
   305         if self.head_body_sep:          # separate head rows from body rows
       
   306             numheadrows = rowindex[self.head_body_sep]
       
   307             headrows = rows[:numheadrows]
       
   308             bodyrows = rows[numheadrows:]
       
   309         else:
       
   310             headrows = []
       
   311             bodyrows = rows
       
   312         return (colspecs, headrows, bodyrows)
       
   313 
       
   314 
       
   315 class SimpleTableParser(TableParser):
       
   316 
       
   317     """
       
   318     Parse a simple table using `parse()`.
       
   319 
       
   320     Here's an example of a simple table::
       
   321 
       
   322         =====  =====
       
   323         col 1  col 2
       
   324         =====  =====
       
   325         1      Second column of row 1.
       
   326         2      Second column of row 2.
       
   327                Second line of paragraph.
       
   328         3      - Second column of row 3.
       
   329 
       
   330                - Second item in bullet
       
   331                  list (row 3, column 2).
       
   332         4 is a span
       
   333         ------------
       
   334         5
       
   335         =====  =====
       
   336 
       
   337     Top and bottom borders use '=', column span underlines use '-', column
       
   338     separation is indicated with spaces.
       
   339 
       
   340     Passing the above table to the `parse()` method will result in the
       
   341     following data structure, whose interpretation is the same as for
       
   342     `GridTableParser`::
       
   343 
       
   344         ([5, 25],
       
   345          [[(0, 0, 1, ['col 1']),
       
   346            (0, 0, 1, ['col 2'])]],
       
   347          [[(0, 0, 3, ['1']),
       
   348            (0, 0, 3, ['Second column of row 1.'])],
       
   349           [(0, 0, 4, ['2']),
       
   350            (0, 0, 4, ['Second column of row 2.',
       
   351                       'Second line of paragraph.'])],
       
   352           [(0, 0, 6, ['3']),
       
   353            (0, 0, 6, ['- Second column of row 3.',
       
   354                       '',
       
   355                       '- Second item in bullet',
       
   356                       '  list (row 3, column 2).'])],
       
   357           [(0, 1, 10, ['4 is a span'])],
       
   358           [(0, 0, 12, ['5']),
       
   359            (0, 0, 12, [''])]])
       
   360     """
       
   361 
       
   362     head_body_separator_pat = re.compile('=[ =]*$')
       
   363     span_pat = re.compile('-[ -]*$')
       
   364 
       
   365     def setup(self, block):
       
   366         self.block = block[:]           # make a copy; it will be modified
       
   367         self.block.disconnect()         # don't propagate changes to parent
       
   368         # Convert top & bottom borders to column span underlines:
       
   369         self.block[0] = self.block[0].replace('=', '-')
       
   370         self.block[-1] = self.block[-1].replace('=', '-')
       
   371         self.head_body_sep = None
       
   372         self.columns = []
       
   373         self.border_end = None
       
   374         self.table = []
       
   375         self.done = [-1] * len(block[0])
       
   376         self.rowseps = {0: [0]}
       
   377         self.colseps = {0: [0]}
       
   378 
       
   379     def parse_table(self):
       
   380         """
       
   381         First determine the column boundaries from the top border, then
       
   382         process rows.  Each row may consist of multiple lines; accumulate
       
   383         lines until a row is complete.  Call `self.parse_row` to finish the
       
   384         job.
       
   385         """
       
   386         # Top border must fully describe all table columns.
       
   387         self.columns = self.parse_columns(self.block[0], 0)
       
   388         self.border_end = self.columns[-1][1]
       
   389         firststart, firstend = self.columns[0]
       
   390         offset = 1                      # skip top border
       
   391         start = 1
       
   392         text_found = None
       
   393         while offset < len(self.block):
       
   394             line = self.block[offset]
       
   395             if self.span_pat.match(line):
       
   396                 # Column span underline or border; row is complete.
       
   397                 self.parse_row(self.block[start:offset], start,
       
   398                                (line.rstrip(), offset))
       
   399                 start = offset + 1
       
   400                 text_found = None
       
   401             elif line[firststart:firstend].strip():
       
   402                 # First column not blank, therefore it's a new row.
       
   403                 if text_found and offset != start:
       
   404                     self.parse_row(self.block[start:offset], start)
       
   405                 start = offset
       
   406                 text_found = 1
       
   407             elif not text_found:
       
   408                 start = offset + 1
       
   409             offset += 1
       
   410 
       
   411     def parse_columns(self, line, offset):
       
   412         """
       
   413         Given a column span underline, return a list of (begin, end) pairs.
       
   414         """
       
   415         cols = []
       
   416         end = 0
       
   417         while 1:
       
   418             begin = line.find('-', end)
       
   419             end = line.find(' ', begin)
       
   420             if begin < 0:
       
   421                 break
       
   422             if end < 0:
       
   423                 end = len(line)
       
   424             cols.append((begin, end))
       
   425         if self.columns:
       
   426             if cols[-1][1] != self.border_end:
       
   427                 raise TableMarkupError('Column span incomplete at line '
       
   428                                        'offset %s.' % offset)
       
   429             # Allow for an unbounded rightmost column:
       
   430             cols[-1] = (cols[-1][0], self.columns[-1][1])
       
   431         return cols
       
   432 
       
   433     def init_row(self, colspec, offset):
       
   434         i = 0
       
   435         cells = []
       
   436         for start, end in colspec:
       
   437             morecols = 0
       
   438             try:
       
   439                 assert start == self.columns[i][0]
       
   440                 while end != self.columns[i][1]:
       
   441                     i += 1
       
   442                     morecols += 1
       
   443             except (AssertionError, IndexError):
       
   444                 raise TableMarkupError('Column span alignment problem at '
       
   445                                        'line offset %s.' % (offset + 1))
       
   446             cells.append([0, morecols, offset, []])
       
   447             i += 1
       
   448         return cells
       
   449 
       
   450     def parse_row(self, lines, start, spanline=None):
       
   451         """
       
   452         Given the text `lines` of a row, parse it and append to `self.table`.
       
   453 
       
   454         The row is parsed according to the current column spec (either
       
   455         `spanline` if provided or `self.columns`).  For each column, extract
       
   456         text from each line, and check for text in column margins.  Finally,
       
   457         adjust for insigificant whitespace.
       
   458         """
       
   459         if not (lines or spanline):
       
   460             # No new row, just blank lines.
       
   461             return
       
   462         if spanline:
       
   463             columns = self.parse_columns(*spanline)
       
   464             span_offset = spanline[1]
       
   465         else:
       
   466             columns = self.columns[:]
       
   467             span_offset = start
       
   468         self.check_columns(lines, start, columns)
       
   469         row = self.init_row(columns, start)
       
   470         for i in range(len(columns)):
       
   471             start, end = columns[i]
       
   472             cellblock = lines.get_2D_block(0, start, len(lines), end)
       
   473             cellblock.disconnect()      # lines in cell can't sync with parent
       
   474             cellblock.replace(self.double_width_pad_char, '')
       
   475             row[i][3] = cellblock
       
   476         self.table.append(row)
       
   477 
       
   478     def check_columns(self, lines, first_line, columns):
       
   479         """
       
   480         Check for text in column margins and text overflow in the last column.
       
   481         Raise TableMarkupError if anything but whitespace is in column margins.
       
   482         Adjust the end value for the last column if there is text overflow.
       
   483         """
       
   484         # "Infinite" value for a dummy last column's beginning, used to
       
   485         # check for text overflow:
       
   486         columns.append((sys.maxint, None))
       
   487         lastcol = len(columns) - 2
       
   488         for i in range(len(columns) - 1):
       
   489             start, end = columns[i]
       
   490             nextstart = columns[i+1][0]
       
   491             offset = 0
       
   492             for line in lines:
       
   493                 if i == lastcol and line[end:].strip():
       
   494                     text = line[start:].rstrip()
       
   495                     new_end = start + len(text)
       
   496                     columns[i] = (start, new_end)
       
   497                     main_start, main_end = self.columns[-1]
       
   498                     if new_end > main_end:
       
   499                         self.columns[-1] = (main_start, new_end)
       
   500                 elif line[end:nextstart].strip():
       
   501                     raise TableMarkupError('Text in column margin at line '
       
   502                                            'offset %s.' % (first_line + offset))
       
   503                 offset += 1
       
   504         columns.pop()
       
   505 
       
   506     def structure_from_cells(self):
       
   507         colspecs = [end - start for start, end in self.columns]
       
   508         first_body_row = 0
       
   509         if self.head_body_sep:
       
   510             for i in range(len(self.table)):
       
   511                 if self.table[i][0][2] > self.head_body_sep:
       
   512                     first_body_row = i
       
   513                     break
       
   514         return (colspecs, self.table[:first_body_row],
       
   515                 self.table[first_body_row:])
       
   516 
       
   517 
       
   518 def update_dict_of_lists(master, newdata):
       
   519     """
       
   520     Extend the list values of `master` with those from `newdata`.
       
   521 
       
   522     Both parameters must be dictionaries containing list values.
       
   523     """
       
   524     for key, values in newdata.items():
       
   525         master.setdefault(key, []).extend(values)