buildframework/helium/external/python/lib/common/docutils-0.5-py2.5.egg/docutils/parsers/rst/tableparser.py
author wbernard
Wed, 23 Dec 2009 19:29:07 +0200
changeset 179 d8ac696cc51f
permissions -rw-r--r--
helium_7.0-r14027
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
179
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     1
# $Id: tableparser.py 4564 2006-05-21 20:44:42Z wiemann $
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     2
# Author: David Goodger <goodger@python.org>
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     3
# Copyright: This module has been placed in the public domain.
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     4
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     5
"""
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     6
This module defines table parser classes, which parse plaintext-graphic tables
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     7
and produce a well-formed data structure suitable for building a CALS table.
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     8
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
     9
:Classes:
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    10
    - `GridTableParser`: Parse fully-formed tables represented with a grid.
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    11
    - `SimpleTableParser`: Parse simple tables, delimited by top & bottom
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    12
      borders.
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    13
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    14
:Exception class: `TableMarkupError`
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    15
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    16
:Function:
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    17
    `update_dict_of_lists()`: Merge two dictionaries containing list values.
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    18
"""
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    19
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    20
__docformat__ = 'reStructuredText'
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    21
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    22
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    23
import re
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    24
import sys
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    25
from docutils import DataError
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    26
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    27
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    28
class TableMarkupError(DataError):

    """Raised by the table parsers when a table's markup is malformed."""
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    29
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    30
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    31
class TableParser:

    """
    Abstract superclass for the common parts of the syntax-specific parsers.

    `parse()` drives the whole process; it relies on `setup()`,
    `parse_table()` and `structure_from_cells()`, which are not defined here
    and must be supplied by subclasses, together with a compiled
    `head_body_separator_pat`.
    """

    head_body_separator_pat = None
    """Matches the row separator between head rows and body rows."""

    double_width_pad_char = '\x00'
    """Padding character for East Asian double-width text."""

    def parse(self, block):
        """
        Analyze the text `block` and return a table data structure.

        Given a plaintext-graphic table in `block` (list of lines of text; no
        whitespace padding), parse the table, construct and return the data
        necessary to construct a CALS table or equivalent.

        Raise `TableMarkupError` if there is any problem with the markup.
        """
        self.setup(block)
        self.find_head_body_sep()
        self.parse_table()
        return self.structure_from_cells()

    def find_head_body_sep(self):
        """
        Look for a head/body row separator line; store the line index.

        The index of the matching line is stored in `self.head_body_sep`
        (which `setup()` is expected to have initialised to None), and the
        '=' characters of that line are replaced by '-' in `self.block`.

        Raise `TableMarkupError` if more than one separator line is found, or
        if the separator is the first or last line of the table.
        """
        # Index loop: the matching line is rewritten in place below.
        for i in range(len(self.block)):
            line = self.block[i]
            if self.head_body_separator_pat.match(line):
                # Compare against None explicitly: a separator found at line
                # offset 0 is falsy but still counts as "already found".
                if self.head_body_sep is not None:
                    raise TableMarkupError(
                        'Multiple head/body row separators in table (at line '
                        'offset %s and %s); only one allowed.'
                        % (self.head_body_sep, i))
                self.head_body_sep = i
                self.block[i] = line.replace('=', '-')
        if self.head_body_sep == 0 or self.head_body_sep == (len(self.block)
                                                             - 1):
            raise TableMarkupError('The head/body row separator may not be '
                                   'the first or last line of the table.')
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    76
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    77
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
    78
class GridTableParser(TableParser):

    """
    Parse a grid table using `parse()`.

    Here's an example of a grid table::

        +------------------------+------------+----------+----------+
        | Header row, column 1   | Header 2   | Header 3 | Header 4 |
        +========================+============+==========+==========+
        | body row 1, column 1   | column 2   | column 3 | column 4 |
        +------------------------+------------+----------+----------+
        | body row 2             | Cells may span columns.          |
        +------------------------+------------+---------------------+
        | body row 3             | Cells may  | - Table cells       |
        +------------------------+ span rows. | - contain           |
        | body row 4             |            | - body elements.    |
        +------------------------+------------+---------------------+

    Intersections use '+', row separators use '-' (except for one optional
    head/body row separator, which uses '='), and column separators use '|'.

    Passing the above table to the `parse()` method will result in the
    following data structure::

        ([24, 12, 10, 10],
         [[(0, 0, 1, ['Header row, column 1']),
           (0, 0, 1, ['Header 2']),
           (0, 0, 1, ['Header 3']),
           (0, 0, 1, ['Header 4'])]],
         [[(0, 0, 3, ['body row 1, column 1']),
           (0, 0, 3, ['column 2']),
           (0, 0, 3, ['column 3']),
           (0, 0, 3, ['column 4'])],
          [(0, 0, 5, ['body row 2']),
           (0, 2, 5, ['Cells may span columns.']),
           None,
           None],
          [(0, 0, 7, ['body row 3']),
           (1, 0, 7, ['Cells may', 'span rows.', '']),
           (1, 1, 7, ['- Table cells', '- contain', '- body elements.']),
           None],
          [(0, 0, 9, ['body row 4']), None, None, None]])

    The first item is a list containing column widths (colspecs). The second
    item is a list of head rows, and the third is a list of body rows. Each
    row contains a list of cells. Each cell is either None (for a cell unused
    because of another cell's span), or a tuple. A cell tuple contains four
    items: the number of extra rows used by the cell in a vertical span
    (morerows); the number of extra columns used by the cell in a horizontal
    span (morecols); the line offset of the first line of the cell contents;
    and the cell contents, a list of lines of text.
    """

    head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$')

    def setup(self, block):
        """
        Store a private copy of `block` and reset all parsing state.

        Sets `bottom`/`right` (index of the last line / of the last character
        of the first line), `head_body_sep` (None until found), `done` (one
        entry per text column, initially -1), the list of traced `cells`, and
        the `rowseps`/`colseps` boundary dictionaries, each seeded with the
        boundary at offset 0.
        """
        self.block = block[:]           # make a copy; it may be modified
        self.block.disconnect()         # don't propagate changes to parent
        self.bottom = len(block) - 1
        self.right = len(block[0]) - 1
        self.head_body_sep = None
        self.done = [-1] * len(block[0])
        self.cells = []
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        Start with a queue of upper-left corners, containing the upper-left
        corner of the table itself. Trace out one rectangular cell, remember
        it, and add its upper-right and lower-left corners to the queue of
        potential upper-left corners of further cells. Process the queue in
        top-to-bottom order, keeping track of how much of each text column has
        been seen.

        We'll end up knowing all the row and column boundaries, cell positions
        and their dimensions.

        Raise `TableMarkupError` if some text column was not completely seen
        once the queue is exhausted.
        """
        corners = [(0, 0)]
        while corners:
            top, left = corners.pop(0)
            # Skip corners on the bottom/right border of the table and
            # corners lying inside an area that has already been covered.
            if top == self.bottom or left == self.right \
                  or top <= self.done[left]:
                continue
            result = self.scan_cell(top, left)
            if not result:
                continue
            bottom, right, rowseps, colseps = result
            update_dict_of_lists(self.rowseps, rowseps)
            update_dict_of_lists(self.colseps, colseps)
            self.mark_done(top, left, bottom, right)
            cellblock = self.block.get_2D_block(top + 1, left + 1,
                                                bottom, right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            self.cells.append((top, left, bottom, right, cellblock))
            corners.extend([(top, right), (bottom, left)])
            corners.sort()              # keep the queue in top-to-bottom order
        if not self.check_parse_complete():
            raise TableMarkupError('Malformed table; parse incomplete.')

    def mark_done(self, top, left, bottom, right):
        """For keeping track of how much of each text column has been seen."""
        before = top - 1
        after = bottom - 1
        for col in range(left, right):
            # Each column must have been seen exactly up to the line above
            # this cell's top border.
            assert self.done[col] == before
            self.done[col] = after

    def check_parse_complete(self):
        """
        Each text column should have been completely seen.

        Return True if every text column (except the rightmost one) has been
        seen down to the line above the bottom border, False otherwise.
        """
        last = self.bottom - 1
        for col in range(self.right):
            if self.done[col] != last:
                return False
        return True

    def scan_cell(self, top, left):
        """
        Starting at the top-left corner, start tracing out a cell.

        Return the result of `scan_right()`: a ``(bottom, right, rowseps,
        colseps)`` tuple, or None if no cell could be traced.
        """
        assert self.block[top][left] == '+'
        return self.scan_right(top, left)

    def scan_right(self, top, left):
        """
        Look for the top-right corner of the cell, and make note of all column
        boundaries ('+').

        Return ``(bottom, right, rowseps, colseps)`` for the first candidate
        corner from which the cell outline can be completed, or None.
        """
        colseps = {}
        line = self.block[top]
        for i in range(left + 1, self.right + 1):
            if line[i] == '+':
                colseps[i] = [top]
                result = self.scan_down(top, left, i)
                if result:
                    bottom, rowseps, newcolseps = result
                    update_dict_of_lists(colseps, newcolseps)
                    return bottom, i, rowseps, colseps
            elif line[i] != '-':
                return None
        return None

    def scan_down(self, top, left, right):
        """
        Look for the bottom-right corner of the cell, making note of all row
        boundaries.

        Return ``(bottom, rowseps, colseps)`` for the first candidate corner
        from which the cell outline can be completed, or None.
        """
        rowseps = {}
        for i in range(top + 1, self.bottom + 1):
            if self.block[i][right] == '+':
                rowseps[i] = [right]
                result = self.scan_left(top, left, i, right)
                if result:
                    newrowseps, colseps = result
                    update_dict_of_lists(rowseps, newrowseps)
                    return i, rowseps, colseps
            elif self.block[i][right] != '|':
                return None
        return None

    def scan_left(self, top, left, bottom, right):
        """
        Noting column boundaries, look for the bottom-left corner of the cell.
        It must line up with the starting point.

        Return ``(rowseps, colseps)``, or None if the bottom border is broken
        or the left border cannot be traced back up.
        """
        colseps = {}
        line = self.block[bottom]
        for i in range(right - 1, left, -1):
            if line[i] == '+':
                colseps[i] = [bottom]
            elif line[i] != '-':
                return None
        if line[left] != '+':
            return None
        result = self.scan_up(top, left, bottom, right)
        # `scan_up()` may legitimately return an empty dictionary, so test
        # against None rather than for truth.
        if result is not None:
            rowseps = result
            return rowseps, colseps
        return None

    def scan_up(self, top, left, bottom, right):
        """
        Noting row boundaries, see if we can return to the starting point.

        Return a dictionary of the row boundaries found on the left border
        (possibly empty), or None if the left border is broken.
        """
        rowseps = {}
        for i in range(bottom - 1, top, -1):
            if self.block[i][left] == '+':
                rowseps[i] = [left]
            elif self.block[i][left] != '|':
                return None
        return rowseps

    def structure_from_cells(self):
        """
        From the data collected by `scan_cell()`, convert to the final data
        structure.

        Return a ``(colspecs, headrows, bodyrows)`` tuple, as described in the
        class docstring.
        """
        # sorted() instead of keys() followed by list.sort(): the latter
        # fails where keys() does not return a list.
        rowseps = sorted(self.rowseps.keys())   # list of row boundaries
        rowindex = {}
        for i, rowsep in enumerate(rowseps):
            rowindex[rowsep] = i        # row boundary -> row number mapping
        colseps = sorted(self.colseps.keys())   # list of column boundaries
        colindex = {}
        for i, colsep in enumerate(colseps):
            colindex[colsep] = i        # column boundary -> col number map
        colspecs = [(colseps[i] - colseps[i - 1] - 1)
                    for i in range(1, len(colseps))] # list of column widths
        # prepare an empty table with the correct number of rows & columns
        onerow = [None for i in range(len(colseps) - 1)]
        rows = [onerow[:] for i in range(len(rowseps) - 1)]
        # keep track of # of cells remaining; should reduce to zero
        remaining = (len(rowseps) - 1) * (len(colseps) - 1)
        for top, left, bottom, right, block in self.cells:
            rownum = rowindex[top]
            colnum = colindex[left]
            assert rows[rownum][colnum] is None, (
                  'Cell (row %s, column %s) already used.'
                  % (rownum + 1, colnum + 1))
            morerows = rowindex[bottom] - rownum - 1
            morecols = colindex[right] - colnum - 1
            remaining -= (morerows + 1) * (morecols + 1)
            # write the cell into the table
            rows[rownum][colnum] = (morerows, morecols, top + 1, block)
        assert remaining == 0, 'Unused cells remaining.'
        if self.head_body_sep:          # separate head rows from body rows
            numheadrows = rowindex[self.head_body_sep]
            headrows = rows[:numheadrows]
            bodyrows = rows[numheadrows:]
        else:
            headrows = []
            bodyrows = rows
        return (colspecs, headrows, bodyrows)
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
   313
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
   314
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
   315
class SimpleTableParser(TableParser):

    """
    Parse a simple table using `parse()`.

    Here's an example of a simple table::

        =====  =====
        col 1  col 2
        =====  =====
        1      Second column of row 1.
        2      Second column of row 2.
               Second line of paragraph.
        3      - Second column of row 3.

               - Second item in bullet
                 list (row 3, column 2).
        4 is a span
        ------------
        5
        =====  =====

    Top and bottom borders use '=', column span underlines use '-', column
    separation is indicated with spaces.

    Passing the above table to the `parse()` method will result in the
    following data structure, whose interpretation is the same as for
    `GridTableParser`::

        ([5, 25],
         [[(0, 0, 1, ['col 1']),
           (0, 0, 1, ['col 2'])]],
         [[(0, 0, 3, ['1']),
           (0, 0, 3, ['Second column of row 1.'])],
          [(0, 0, 4, ['2']),
           (0, 0, 4, ['Second column of row 2.',
                      'Second line of paragraph.'])],
          [(0, 0, 6, ['3']),
           (0, 0, 6, ['- Second column of row 3.',
                      '',
                      '- Second item in bullet',
                      '  list (row 3, column 2).'])],
          [(0, 1, 10, ['4 is a span'])],
          [(0, 0, 12, ['5']),
           (0, 0, 12, [''])]])
    """

    # A line made only of '=' and spaces, starting with '='.
    head_body_separator_pat = re.compile(r'=[ =]*$')
    # A line made only of '-' and spaces, starting with '-'.
    span_pat = re.compile(r'-[ -]*$')

    def setup(self, block):
        """
        Reset all per-table state and store a private copy of `block`.

        The top and bottom '=' borders of the copy are rewritten with '-'
        so that `parse_table` can treat them like column span underlines.
        """
        self.block = block[:]           # make a copy; it will be modified
        self.block.disconnect()         # don't propagate changes to parent
        # Convert top & bottom borders to column span underlines:
        self.block[0] = self.block[0].replace('=', '-')
        self.block[-1] = self.block[-1].replace('=', '-')
        self.head_body_sep = None
        self.columns = []
        self.border_end = None
        self.table = []
        # NOTE(review): `done`, `rowseps` and `colseps` are not read by any
        # method of this class; presumably they exist for the shared
        # `TableParser` machinery -- confirm before removing.
        self.done = [-1] * len(block[0])
        self.rowseps = {0: [0]}
        self.colseps = {0: [0]}

    def parse_table(self):
        """
        First determine the column boundaries from the top border, then
        process rows.  Each row may consist of multiple lines; accumulate
        lines until a row is complete.  Call `self.parse_row` to finish the
        job.
        """
        # Top border must fully describe all table columns.
        self.columns = self.parse_columns(self.block[0], 0)
        self.border_end = self.columns[-1][1]
        firststart, firstend = self.columns[0]
        offset = 1                      # skip top border
        start = 1                       # first line of the pending row
        text_found = False              # pending row has first-column text?
        while offset < len(self.block):
            line = self.block[offset]
            if self.span_pat.match(line):
                # Column span underline or border; row is complete.
                self.parse_row(self.block[start:offset], start,
                               (line.rstrip(), offset))
                start = offset + 1
                text_found = False
            elif line[firststart:firstend].strip():
                # First column not blank, therefore it's a new row.
                if text_found and offset != start:
                    self.parse_row(self.block[start:offset], start)
                start = offset
                text_found = True
            elif not text_found:
                # Nothing accumulated yet: skip this line so it does not
                # become part of the next row.
                start = offset + 1
            offset += 1

    def parse_columns(self, line, offset):
        """
        Given a column span underline, return a list of (begin, end) pairs.

        Each pair delimits one run of characters in `line` that starts at
        a '-' and stops at the next space (or at the end of the line).
        `offset` is only used in the error message.

        Once `self.columns` has been established, raise `TableMarkupError`
        if the last run does not end at `self.border_end`; otherwise the
        last pair's end is replaced by the end of the last table column.
        """
        cols = []
        end = 0
        while 1:
            begin = line.find('-', end)
            if begin < 0:
                break
            end = line.find(' ', begin)
            if end < 0:
                end = len(line)
            cols.append((begin, end))
        if self.columns:
            if cols[-1][1] != self.border_end:
                raise TableMarkupError('Column span incomplete at line '
                                       'offset %s.' % offset)
            # Allow for an unbounded rightmost column:
            cols[-1] = (cols[-1][0], self.columns[-1][1])
        return cols

    def init_row(self, colspec, offset):
        """
        Return a list of empty cells, one per (start, end) pair in `colspec`.

        Each cell is a list ``[0, morecols, offset, []]`` where `morecols`
        is the number of additional table columns the cell spans.  Raise
        `TableMarkupError` if a pair does not line up with the boundaries
        recorded in `self.columns`.
        """
        i = 0
        cells = []
        for start, end in colspec:
            morecols = 0
            # An explicit comparison is used instead of `assert` so that
            # the check is still performed when Python runs with -O.
            try:
                aligned = (start == self.columns[i][0])
                while aligned and end != self.columns[i][1]:
                    i += 1
                    morecols += 1
            except IndexError:
                # Ran past the last table column without finding `end`.
                aligned = False
            if not aligned:
                raise TableMarkupError('Column span alignment problem at '
                                       'line offset %s.' % (offset + 1))
            cells.append([0, morecols, offset, []])
            i += 1
        return cells

    def parse_row(self, lines, start, spanline=None):
        """
        Given the text `lines` of a row, parse it and append to `self.table`.

        The row is parsed according to the current column spec (either
        `spanline` if provided or `self.columns`).  For each column, extract
        text from each line, and check for text in column margins.  Finally,
        adjust for insignificant whitespace.

        `start` is the line offset of the row's first line.  `spanline`,
        if given, is a ``(text, offset)`` pair passed on to
        `parse_columns`.
        """
        if not (lines or spanline):
            # No new row, just blank lines.
            return
        if spanline:
            columns = self.parse_columns(*spanline)
        else:
            columns = self.columns[:]
        # May widen the last entry of `columns` (and of `self.columns`).
        self.check_columns(lines, start, columns)
        row = self.init_row(columns, start)
        for i, (left, right) in enumerate(columns):
            cellblock = lines.get_2D_block(0, left, len(lines), right)
            cellblock.disconnect()      # lines in cell can't sync with parent
            cellblock.replace(self.double_width_pad_char, '')
            row[i][3] = cellblock
        self.table.append(row)

    def check_columns(self, lines, first_line, columns):
        """
        Check for text in column margins and text overflow in the last column.
        Raise TableMarkupError if anything but whitespace is in column margins.
        Adjust the end value for the last column if there is text overflow.

        `columns` is modified in place; `self.columns[-1]` is widened too
        when the overflow extends past its current end.  `first_line` is
        only used to compute the line offset reported in the error.
        """
        lastcol = len(columns) - 1
        for i in range(len(columns)):
            # `start`/`end` deliberately keep the values the column had on
            # entry, even if `columns[i]` is widened below.
            start, end = columns[i]
            if i == lastcol:
                # The last column has no right-hand margin: text beyond
                # its end is overflow and widens the column.
                for line in lines:
                    if line[end:].strip():
                        text = line[start:].rstrip()
                        new_end = start + len(text)
                        columns[i] = (start, new_end)
                        main_start, main_end = self.columns[-1]
                        if new_end > main_end:
                            self.columns[-1] = (main_start, new_end)
            else:
                nextstart = columns[i + 1][0]
                for offset, line in enumerate(lines):
                    if line[end:nextstart].strip():
                        raise TableMarkupError(
                            'Text in column margin at line '
                            'offset %s.' % (first_line + offset))

    def structure_from_cells(self):
        """
        Return a ``(colspecs, headrows, bodyrows)`` tuple for the table.

        `colspecs` holds the width of each column in `self.columns`.  When
        `self.head_body_sep` is set, the body starts at the first row whose
        recorded line offset is greater than it; otherwise (or if no such
        row exists) every row is a body row.
        """
        colspecs = [end - start for start, end in self.columns]
        first_body_row = 0
        if self.head_body_sep:
            for i, row in enumerate(self.table):
                if row[0][2] > self.head_body_sep:
                    first_body_row = i
                    break
        return (colspecs, self.table[:first_body_row],
                self.table[first_body_row:])
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
   516
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
   517
d8ac696cc51f helium_7.0-r14027
wbernard
parents:
diff changeset
   518
def update_dict_of_lists(master, newdata):
    """
    Merge `newdata` into `master`, concatenating list values key by key.

    Both arguments are dictionaries whose values are lists.  A key that is
    missing from `master` is created with an empty list before the values
    from `newdata` are appended.  `master` is modified in place.
    """
    for key, additions in newdata.items():
        bucket = master.setdefault(key, [])
        bucket.extend(additions)