|
1 # $Id: tableparser.py 4564 2006-05-21 20:44:42Z wiemann $ |
|
2 # Author: David Goodger <goodger@python.org> |
|
3 # Copyright: This module has been placed in the public domain. |
|
4 |
|
5 """ |
|
6 This module defines table parser classes,which parse plaintext-graphic tables |
|
7 and produce a well-formed data structure suitable for building a CALS table. |
|
8 |
|
9 :Classes: |
|
10 - `GridTableParser`: Parse fully-formed tables represented with a grid. |
|
11 - `SimpleTableParser`: Parse simple tables, delimited by top & bottom |
|
12 borders. |
|
13 |
|
14 :Exception class: `TableMarkupError` |
|
15 |
|
16 :Function: |
|
17 `update_dict_of_lists()`: Merge two dictionaries containing list values. |
|
18 """ |
|
19 |
|
20 __docformat__ = 'reStructuredText' |
|
21 |
|
22 |
|
23 import re |
|
24 import sys |
|
25 from docutils import DataError |
|
26 |
|
27 |
|
28 class TableMarkupError(DataError): pass |
|
29 |
|
30 |
|
31 class TableParser: |
|
32 |
|
33 """ |
|
34 Abstract superclass for the common parts of the syntax-specific parsers. |
|
35 """ |
|
36 |
|
37 head_body_separator_pat = None |
|
38 """Matches the row separator between head rows and body rows.""" |
|
39 |
|
40 double_width_pad_char = '\x00' |
|
41 """Padding character for East Asian double-width text.""" |
|
42 |
|
43 def parse(self, block): |
|
44 """ |
|
45 Analyze the text `block` and return a table data structure. |
|
46 |
|
47 Given a plaintext-graphic table in `block` (list of lines of text; no |
|
48 whitespace padding), parse the table, construct and return the data |
|
49 necessary to construct a CALS table or equivalent. |
|
50 |
|
51 Raise `TableMarkupError` if there is any problem with the markup. |
|
52 """ |
|
53 self.setup(block) |
|
54 self.find_head_body_sep() |
|
55 self.parse_table() |
|
56 structure = self.structure_from_cells() |
|
57 return structure |
|
58 |
|
59 def find_head_body_sep(self): |
|
60 """Look for a head/body row separator line; store the line index.""" |
|
61 for i in range(len(self.block)): |
|
62 line = self.block[i] |
|
63 if self.head_body_separator_pat.match(line): |
|
64 if self.head_body_sep: |
|
65 raise TableMarkupError( |
|
66 'Multiple head/body row separators in table (at line ' |
|
67 'offset %s and %s); only one allowed.' |
|
68 % (self.head_body_sep, i)) |
|
69 else: |
|
70 self.head_body_sep = i |
|
71 self.block[i] = line.replace('=', '-') |
|
72 if self.head_body_sep == 0 or self.head_body_sep == (len(self.block) |
|
73 - 1): |
|
74 raise TableMarkupError('The head/body row separator may not be ' |
|
75 'the first or last line of the table.') |
|
76 |
|
77 |
|
78 class GridTableParser(TableParser): |
|
79 |
|
80 """ |
|
81 Parse a grid table using `parse()`. |
|
82 |
|
83 Here's an example of a grid table:: |
|
84 |
|
85 +------------------------+------------+----------+----------+ |
|
86 | Header row, column 1 | Header 2 | Header 3 | Header 4 | |
|
87 +========================+============+==========+==========+ |
|
88 | body row 1, column 1 | column 2 | column 3 | column 4 | |
|
89 +------------------------+------------+----------+----------+ |
|
90 | body row 2 | Cells may span columns. | |
|
91 +------------------------+------------+---------------------+ |
|
92 | body row 3 | Cells may | - Table cells | |
|
93 +------------------------+ span rows. | - contain | |
|
94 | body row 4 | | - body elements. | |
|
95 +------------------------+------------+---------------------+ |
|
96 |
|
97 Intersections use '+', row separators use '-' (except for one optional |
|
98 head/body row separator, which uses '='), and column separators use '|'. |
|
99 |
|
100 Passing the above table to the `parse()` method will result in the |
|
101 following data structure:: |
|
102 |
|
103 ([24, 12, 10, 10], |
|
104 [[(0, 0, 1, ['Header row, column 1']), |
|
105 (0, 0, 1, ['Header 2']), |
|
106 (0, 0, 1, ['Header 3']), |
|
107 (0, 0, 1, ['Header 4'])]], |
|
108 [[(0, 0, 3, ['body row 1, column 1']), |
|
109 (0, 0, 3, ['column 2']), |
|
110 (0, 0, 3, ['column 3']), |
|
111 (0, 0, 3, ['column 4'])], |
|
112 [(0, 0, 5, ['body row 2']), |
|
113 (0, 2, 5, ['Cells may span columns.']), |
|
114 None, |
|
115 None], |
|
116 [(0, 0, 7, ['body row 3']), |
|
117 (1, 0, 7, ['Cells may', 'span rows.', '']), |
|
118 (1, 1, 7, ['- Table cells', '- contain', '- body elements.']), |
|
119 None], |
|
120 [(0, 0, 9, ['body row 4']), None, None, None]]) |
|
121 |
|
122 The first item is a list containing column widths (colspecs). The second |
|
123 item is a list of head rows, and the third is a list of body rows. Each |
|
124 row contains a list of cells. Each cell is either None (for a cell unused |
|
125 because of another cell's span), or a tuple. A cell tuple contains four |
|
126 items: the number of extra rows used by the cell in a vertical span |
|
127 (morerows); the number of extra columns used by the cell in a horizontal |
|
128 span (morecols); the line offset of the first line of the cell contents; |
|
129 and the cell contents, a list of lines of text. |
|
130 """ |
|
131 |
|
132 head_body_separator_pat = re.compile(r'\+=[=+]+=\+ *$') |
|
133 |
|
134 def setup(self, block): |
|
135 self.block = block[:] # make a copy; it may be modified |
|
136 self.block.disconnect() # don't propagate changes to parent |
|
137 self.bottom = len(block) - 1 |
|
138 self.right = len(block[0]) - 1 |
|
139 self.head_body_sep = None |
|
140 self.done = [-1] * len(block[0]) |
|
141 self.cells = [] |
|
142 self.rowseps = {0: [0]} |
|
143 self.colseps = {0: [0]} |
|
144 |
|
145 def parse_table(self): |
|
146 """ |
|
147 Start with a queue of upper-left corners, containing the upper-left |
|
148 corner of the table itself. Trace out one rectangular cell, remember |
|
149 it, and add its upper-right and lower-left corners to the queue of |
|
150 potential upper-left corners of further cells. Process the queue in |
|
151 top-to-bottom order, keeping track of how much of each text column has |
|
152 been seen. |
|
153 |
|
154 We'll end up knowing all the row and column boundaries, cell positions |
|
155 and their dimensions. |
|
156 """ |
|
157 corners = [(0, 0)] |
|
158 while corners: |
|
159 top, left = corners.pop(0) |
|
160 if top == self.bottom or left == self.right \ |
|
161 or top <= self.done[left]: |
|
162 continue |
|
163 result = self.scan_cell(top, left) |
|
164 if not result: |
|
165 continue |
|
166 bottom, right, rowseps, colseps = result |
|
167 update_dict_of_lists(self.rowseps, rowseps) |
|
168 update_dict_of_lists(self.colseps, colseps) |
|
169 self.mark_done(top, left, bottom, right) |
|
170 cellblock = self.block.get_2D_block(top + 1, left + 1, |
|
171 bottom, right) |
|
172 cellblock.disconnect() # lines in cell can't sync with parent |
|
173 cellblock.replace(self.double_width_pad_char, '') |
|
174 self.cells.append((top, left, bottom, right, cellblock)) |
|
175 corners.extend([(top, right), (bottom, left)]) |
|
176 corners.sort() |
|
177 if not self.check_parse_complete(): |
|
178 raise TableMarkupError('Malformed table; parse incomplete.') |
|
179 |
|
180 def mark_done(self, top, left, bottom, right): |
|
181 """For keeping track of how much of each text column has been seen.""" |
|
182 before = top - 1 |
|
183 after = bottom - 1 |
|
184 for col in range(left, right): |
|
185 assert self.done[col] == before |
|
186 self.done[col] = after |
|
187 |
|
188 def check_parse_complete(self): |
|
189 """Each text column should have been completely seen.""" |
|
190 last = self.bottom - 1 |
|
191 for col in range(self.right): |
|
192 if self.done[col] != last: |
|
193 return None |
|
194 return 1 |
|
195 |
|
196 def scan_cell(self, top, left): |
|
197 """Starting at the top-left corner, start tracing out a cell.""" |
|
198 assert self.block[top][left] == '+' |
|
199 result = self.scan_right(top, left) |
|
200 return result |
|
201 |
|
202 def scan_right(self, top, left): |
|
203 """ |
|
204 Look for the top-right corner of the cell, and make note of all column |
|
205 boundaries ('+'). |
|
206 """ |
|
207 colseps = {} |
|
208 line = self.block[top] |
|
209 for i in range(left + 1, self.right + 1): |
|
210 if line[i] == '+': |
|
211 colseps[i] = [top] |
|
212 result = self.scan_down(top, left, i) |
|
213 if result: |
|
214 bottom, rowseps, newcolseps = result |
|
215 update_dict_of_lists(colseps, newcolseps) |
|
216 return bottom, i, rowseps, colseps |
|
217 elif line[i] != '-': |
|
218 return None |
|
219 return None |
|
220 |
|
221 def scan_down(self, top, left, right): |
|
222 """ |
|
223 Look for the bottom-right corner of the cell, making note of all row |
|
224 boundaries. |
|
225 """ |
|
226 rowseps = {} |
|
227 for i in range(top + 1, self.bottom + 1): |
|
228 if self.block[i][right] == '+': |
|
229 rowseps[i] = [right] |
|
230 result = self.scan_left(top, left, i, right) |
|
231 if result: |
|
232 newrowseps, colseps = result |
|
233 update_dict_of_lists(rowseps, newrowseps) |
|
234 return i, rowseps, colseps |
|
235 elif self.block[i][right] != '|': |
|
236 return None |
|
237 return None |
|
238 |
|
239 def scan_left(self, top, left, bottom, right): |
|
240 """ |
|
241 Noting column boundaries, look for the bottom-left corner of the cell. |
|
242 It must line up with the starting point. |
|
243 """ |
|
244 colseps = {} |
|
245 line = self.block[bottom] |
|
246 for i in range(right - 1, left, -1): |
|
247 if line[i] == '+': |
|
248 colseps[i] = [bottom] |
|
249 elif line[i] != '-': |
|
250 return None |
|
251 if line[left] != '+': |
|
252 return None |
|
253 result = self.scan_up(top, left, bottom, right) |
|
254 if result is not None: |
|
255 rowseps = result |
|
256 return rowseps, colseps |
|
257 return None |
|
258 |
|
259 def scan_up(self, top, left, bottom, right): |
|
260 """ |
|
261 Noting row boundaries, see if we can return to the starting point. |
|
262 """ |
|
263 rowseps = {} |
|
264 for i in range(bottom - 1, top, -1): |
|
265 if self.block[i][left] == '+': |
|
266 rowseps[i] = [left] |
|
267 elif self.block[i][left] != '|': |
|
268 return None |
|
269 return rowseps |
|
270 |
|
271 def structure_from_cells(self): |
|
272 """ |
|
273 From the data collected by `scan_cell()`, convert to the final data |
|
274 structure. |
|
275 """ |
|
276 rowseps = self.rowseps.keys() # list of row boundaries |
|
277 rowseps.sort() |
|
278 rowindex = {} |
|
279 for i in range(len(rowseps)): |
|
280 rowindex[rowseps[i]] = i # row boundary -> row number mapping |
|
281 colseps = self.colseps.keys() # list of column boundaries |
|
282 colseps.sort() |
|
283 colindex = {} |
|
284 for i in range(len(colseps)): |
|
285 colindex[colseps[i]] = i # column boundary -> col number map |
|
286 colspecs = [(colseps[i] - colseps[i - 1] - 1) |
|
287 for i in range(1, len(colseps))] # list of column widths |
|
288 # prepare an empty table with the correct number of rows & columns |
|
289 onerow = [None for i in range(len(colseps) - 1)] |
|
290 rows = [onerow[:] for i in range(len(rowseps) - 1)] |
|
291 # keep track of # of cells remaining; should reduce to zero |
|
292 remaining = (len(rowseps) - 1) * (len(colseps) - 1) |
|
293 for top, left, bottom, right, block in self.cells: |
|
294 rownum = rowindex[top] |
|
295 colnum = colindex[left] |
|
296 assert rows[rownum][colnum] is None, ( |
|
297 'Cell (row %s, column %s) already used.' |
|
298 % (rownum + 1, colnum + 1)) |
|
299 morerows = rowindex[bottom] - rownum - 1 |
|
300 morecols = colindex[right] - colnum - 1 |
|
301 remaining -= (morerows + 1) * (morecols + 1) |
|
302 # write the cell into the table |
|
303 rows[rownum][colnum] = (morerows, morecols, top + 1, block) |
|
304 assert remaining == 0, 'Unused cells remaining.' |
|
305 if self.head_body_sep: # separate head rows from body rows |
|
306 numheadrows = rowindex[self.head_body_sep] |
|
307 headrows = rows[:numheadrows] |
|
308 bodyrows = rows[numheadrows:] |
|
309 else: |
|
310 headrows = [] |
|
311 bodyrows = rows |
|
312 return (colspecs, headrows, bodyrows) |
|
313 |
|
314 |
|
315 class SimpleTableParser(TableParser): |
|
316 |
|
317 """ |
|
318 Parse a simple table using `parse()`. |
|
319 |
|
320 Here's an example of a simple table:: |
|
321 |
|
322 ===== ===== |
|
323 col 1 col 2 |
|
324 ===== ===== |
|
325 1 Second column of row 1. |
|
326 2 Second column of row 2. |
|
327 Second line of paragraph. |
|
328 3 - Second column of row 3. |
|
329 |
|
330 - Second item in bullet |
|
331 list (row 3, column 2). |
|
332 4 is a span |
|
333 ------------ |
|
334 5 |
|
335 ===== ===== |
|
336 |
|
337 Top and bottom borders use '=', column span underlines use '-', column |
|
338 separation is indicated with spaces. |
|
339 |
|
340 Passing the above table to the `parse()` method will result in the |
|
341 following data structure, whose interpretation is the same as for |
|
342 `GridTableParser`:: |
|
343 |
|
344 ([5, 25], |
|
345 [[(0, 0, 1, ['col 1']), |
|
346 (0, 0, 1, ['col 2'])]], |
|
347 [[(0, 0, 3, ['1']), |
|
348 (0, 0, 3, ['Second column of row 1.'])], |
|
349 [(0, 0, 4, ['2']), |
|
350 (0, 0, 4, ['Second column of row 2.', |
|
351 'Second line of paragraph.'])], |
|
352 [(0, 0, 6, ['3']), |
|
353 (0, 0, 6, ['- Second column of row 3.', |
|
354 '', |
|
355 '- Second item in bullet', |
|
356 ' list (row 3, column 2).'])], |
|
357 [(0, 1, 10, ['4 is a span'])], |
|
358 [(0, 0, 12, ['5']), |
|
359 (0, 0, 12, [''])]]) |
|
360 """ |
|
361 |
|
362 head_body_separator_pat = re.compile('=[ =]*$') |
|
363 span_pat = re.compile('-[ -]*$') |
|
364 |
|
365 def setup(self, block): |
|
366 self.block = block[:] # make a copy; it will be modified |
|
367 self.block.disconnect() # don't propagate changes to parent |
|
368 # Convert top & bottom borders to column span underlines: |
|
369 self.block[0] = self.block[0].replace('=', '-') |
|
370 self.block[-1] = self.block[-1].replace('=', '-') |
|
371 self.head_body_sep = None |
|
372 self.columns = [] |
|
373 self.border_end = None |
|
374 self.table = [] |
|
375 self.done = [-1] * len(block[0]) |
|
376 self.rowseps = {0: [0]} |
|
377 self.colseps = {0: [0]} |
|
378 |
|
379 def parse_table(self): |
|
380 """ |
|
381 First determine the column boundaries from the top border, then |
|
382 process rows. Each row may consist of multiple lines; accumulate |
|
383 lines until a row is complete. Call `self.parse_row` to finish the |
|
384 job. |
|
385 """ |
|
386 # Top border must fully describe all table columns. |
|
387 self.columns = self.parse_columns(self.block[0], 0) |
|
388 self.border_end = self.columns[-1][1] |
|
389 firststart, firstend = self.columns[0] |
|
390 offset = 1 # skip top border |
|
391 start = 1 |
|
392 text_found = None |
|
393 while offset < len(self.block): |
|
394 line = self.block[offset] |
|
395 if self.span_pat.match(line): |
|
396 # Column span underline or border; row is complete. |
|
397 self.parse_row(self.block[start:offset], start, |
|
398 (line.rstrip(), offset)) |
|
399 start = offset + 1 |
|
400 text_found = None |
|
401 elif line[firststart:firstend].strip(): |
|
402 # First column not blank, therefore it's a new row. |
|
403 if text_found and offset != start: |
|
404 self.parse_row(self.block[start:offset], start) |
|
405 start = offset |
|
406 text_found = 1 |
|
407 elif not text_found: |
|
408 start = offset + 1 |
|
409 offset += 1 |
|
410 |
|
411 def parse_columns(self, line, offset): |
|
412 """ |
|
413 Given a column span underline, return a list of (begin, end) pairs. |
|
414 """ |
|
415 cols = [] |
|
416 end = 0 |
|
417 while 1: |
|
418 begin = line.find('-', end) |
|
419 end = line.find(' ', begin) |
|
420 if begin < 0: |
|
421 break |
|
422 if end < 0: |
|
423 end = len(line) |
|
424 cols.append((begin, end)) |
|
425 if self.columns: |
|
426 if cols[-1][1] != self.border_end: |
|
427 raise TableMarkupError('Column span incomplete at line ' |
|
428 'offset %s.' % offset) |
|
429 # Allow for an unbounded rightmost column: |
|
430 cols[-1] = (cols[-1][0], self.columns[-1][1]) |
|
431 return cols |
|
432 |
|
433 def init_row(self, colspec, offset): |
|
434 i = 0 |
|
435 cells = [] |
|
436 for start, end in colspec: |
|
437 morecols = 0 |
|
438 try: |
|
439 assert start == self.columns[i][0] |
|
440 while end != self.columns[i][1]: |
|
441 i += 1 |
|
442 morecols += 1 |
|
443 except (AssertionError, IndexError): |
|
444 raise TableMarkupError('Column span alignment problem at ' |
|
445 'line offset %s.' % (offset + 1)) |
|
446 cells.append([0, morecols, offset, []]) |
|
447 i += 1 |
|
448 return cells |
|
449 |
|
450 def parse_row(self, lines, start, spanline=None): |
|
451 """ |
|
452 Given the text `lines` of a row, parse it and append to `self.table`. |
|
453 |
|
454 The row is parsed according to the current column spec (either |
|
455 `spanline` if provided or `self.columns`). For each column, extract |
|
456 text from each line, and check for text in column margins. Finally, |
|
457 adjust for insigificant whitespace. |
|
458 """ |
|
459 if not (lines or spanline): |
|
460 # No new row, just blank lines. |
|
461 return |
|
462 if spanline: |
|
463 columns = self.parse_columns(*spanline) |
|
464 span_offset = spanline[1] |
|
465 else: |
|
466 columns = self.columns[:] |
|
467 span_offset = start |
|
468 self.check_columns(lines, start, columns) |
|
469 row = self.init_row(columns, start) |
|
470 for i in range(len(columns)): |
|
471 start, end = columns[i] |
|
472 cellblock = lines.get_2D_block(0, start, len(lines), end) |
|
473 cellblock.disconnect() # lines in cell can't sync with parent |
|
474 cellblock.replace(self.double_width_pad_char, '') |
|
475 row[i][3] = cellblock |
|
476 self.table.append(row) |
|
477 |
|
478 def check_columns(self, lines, first_line, columns): |
|
479 """ |
|
480 Check for text in column margins and text overflow in the last column. |
|
481 Raise TableMarkupError if anything but whitespace is in column margins. |
|
482 Adjust the end value for the last column if there is text overflow. |
|
483 """ |
|
484 # "Infinite" value for a dummy last column's beginning, used to |
|
485 # check for text overflow: |
|
486 columns.append((sys.maxint, None)) |
|
487 lastcol = len(columns) - 2 |
|
488 for i in range(len(columns) - 1): |
|
489 start, end = columns[i] |
|
490 nextstart = columns[i+1][0] |
|
491 offset = 0 |
|
492 for line in lines: |
|
493 if i == lastcol and line[end:].strip(): |
|
494 text = line[start:].rstrip() |
|
495 new_end = start + len(text) |
|
496 columns[i] = (start, new_end) |
|
497 main_start, main_end = self.columns[-1] |
|
498 if new_end > main_end: |
|
499 self.columns[-1] = (main_start, new_end) |
|
500 elif line[end:nextstart].strip(): |
|
501 raise TableMarkupError('Text in column margin at line ' |
|
502 'offset %s.' % (first_line + offset)) |
|
503 offset += 1 |
|
504 columns.pop() |
|
505 |
|
506 def structure_from_cells(self): |
|
507 colspecs = [end - start for start, end in self.columns] |
|
508 first_body_row = 0 |
|
509 if self.head_body_sep: |
|
510 for i in range(len(self.table)): |
|
511 if self.table[i][0][2] > self.head_body_sep: |
|
512 first_body_row = i |
|
513 break |
|
514 return (colspecs, self.table[:first_body_row], |
|
515 self.table[first_body_row:]) |
|
516 |
|
517 |
|
518 def update_dict_of_lists(master, newdata): |
|
519 """ |
|
520 Extend the list values of `master` with those from `newdata`. |
|
521 |
|
522 Both parameters must be dictionaries containing list values. |
|
523 """ |
|
524 for key, values in newdata.items(): |
|
525 master.setdefault(key, []).extend(values) |