|
1 /* |
|
2 * libxml2_htmlparser.c : an HTML 4.0 non-verifying parser |
|
3 * |
|
4 * See Copyright for the status of this software. |
|
5 * |
|
6 * daniel@veillard.com |
|
7 * Portion Copyright © 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. |
|
8 */ |
|
9 |
|
10 #define IN_LIBXML |
|
11 #include "xmlenglibxml.h" |
|
12 |
|
13 #include <string.h> |
|
14 #if defined(HAVE_CTYPE_H) |
|
15 #include <ctype.h> |
|
16 #endif |
|
17 #ifdef HAVE_STDLIB_H |
|
18 #include <stdlib.h> |
|
19 #endif |
|
20 #ifdef HAVE_SYS_STAT_H |
|
21 #include <sys/stat.h> |
|
22 #endif |
|
23 #ifdef HAVE_FCNTL_H |
|
24 #include <fcntl.h> |
|
25 #endif |
|
26 #ifdef HAVE_UNISTD_H |
|
27 #include <unistd.h> |
|
28 #endif |
|
29 #ifdef HAVE_ZLIB_H |
|
30 #include <zlib.h> |
|
31 #endif |
|
32 |
|
33 |
|
34 #include <stdapis/libxml2/libxml2_globals.h> |
|
35 #include <stdapis/libxml2/libxml2_xmlmemory.h> |
|
36 #include <stdapis/libxml2/libxml2_tree.h> |
|
37 #include <stdapis/libxml2/libxml2_parser.h> |
|
38 #include <stdapis/libxml2/libxml2_parserinternals.h> |
|
39 #include <stdapis/libxml2/libxml2_xmlerror.h> |
|
40 #include "libxml2_xmlerror2.h" |
|
41 #include "libxml2_htmlparser.h" |
|
42 #include "libxml2_htmltree.h" |
|
43 #include "libxml2_entities.h" |
|
44 #include <stdapis/libxml2/libxml2_encoding.h> |
|
45 #include <stdapis/libxml2/libxml2_valid.h> |
|
46 #include <stdapis/libxml2/libxml2_xmlio.h> |
|
47 #include <stdapis/libxml2/libxml2_uri.h> |
|
48 |
|
49 #define HTML_MAX_NAMELEN 1000 |
|
50 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 |
|
51 #define HTML_PARSER_BUFFER_SIZE 100 |
|
52 |
|
53 #ifdef LIBXML_HTML_ENABLED |
|
54 |
|
55 /* #define DEBUG */ |
|
56 /* #define DEBUG_PUSH */ |
|
57 |
|
/*
 * File-local integer flag, fixed at 1.
 * NOTE(review): it is not referenced anywhere in the part of the file
 * reviewed here; the name suggests it governs how omitted (implied) items
 * are handled by default -- confirm against its users before relying on it.
 */
58 static const int htmlOmittedDefaultValue = 1; |
|
59 |
|
60 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len, |
|
61 xmlChar end, xmlChar end2, xmlChar end3); |
|
62 static void htmlParseComment(htmlParserCtxtPtr ctxt); |
|
63 |
|
64 /************************************************************************ |
|
65 * * |
|
66 * Some factorized error routines * |
|
67 * * |
|
68 ************************************************************************/ |
|
69 |
|
70 /** |
|
71 * htmlErrMemory: |
|
72 * @param ctxt an HTML parser context |
|
73 * @param extra extra information |
|
74 * |
|
75 * Handle an out-of-memory error |
|
76 */ |
|
77 void |
|
78 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); // moved to XSLT-enabled part of this file |
|
79 |
|
80 /** |
|
81 * htmlParseErr: |
|
82 * @param ctxt an HTML parser context |
|
83 * @param error the error number |
|
84 * @param msg the error message |
|
85 * @param str1 string infor |
|
86 * @param str2 string infor |
|
87 * |
|
88 * Handle a fatal parser error, i.e. violating Well-Formedness constraints |
|
89 */ |
|
90 static void |
|
91 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, |
|
92 const char *msg, const xmlChar *str1, const xmlChar *str2) |
|
93 { |
|
94 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
|
95 (ctxt->instate == XML_PARSER_EOF)) |
|
96 return; |
|
97 ctxt->errNo = error; |
|
98 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, |
|
99 XML_ERR_ERROR, NULL, 0, |
|
100 (const char *) str1, (const char *) str2, |
|
101 NULL, 0, 0, |
|
102 msg, str1, str2); |
|
103 ctxt->wellFormed = 0; |
|
104 } |
|
105 |
|
106 /** |
|
107 * htmlParseErrInt: |
|
108 * @param ctxt an HTML parser context |
|
109 * @param error the error number |
|
110 * @param msg the error message |
|
111 * @param val integer info |
|
112 * |
|
113 * Handle a fatal parser error, i.e. violating Well-Formedness constraints |
|
114 */ |
|
115 static void |
|
116 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error, |
|
117 const char *msg, int val) |
|
118 { |
|
119 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
|
120 (ctxt->instate == XML_PARSER_EOF)) |
|
121 return; |
|
122 ctxt->errNo = error; |
|
123 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error, |
|
124 XML_ERR_ERROR, NULL, 0, NULL, NULL, |
|
125 NULL, val, 0, msg, val); |
|
126 ctxt->wellFormed = 0; |
|
127 } |
|
128 |
|
129 /************************************************************************ |
|
130 * * |
|
131 * Parser stacks related functions and macros * |
|
132 * * |
|
133 ************************************************************************/ |
|
134 |
|
135 /** |
|
136 * htmlnamePush: |
|
137 * @param ctxt an HTML parser context |
|
138 * @param value the element name |
|
139 * |
|
140 * Pushes a new element name on top of the name stack |
|
141 * |
|
142 * Returns 0 in case of error, the index in the stack otherwise |
|
143 */ |
|
144 static int |
|
145 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value) |
|
146 { |
|
147 if (ctxt->nameNr >= ctxt->nameMax) { |
|
148 void* allocTmp; // DONE: Fix xmlRealloc |
|
149 allocTmp = xmlRealloc((xmlChar**)ctxt->nameTab, |
|
150 ctxt->nameMax * 2 * sizeof(ctxt->nameTab[0])); |
|
151 if (!allocTmp) { |
|
152 htmlErrMemory(ctxt, NULL); |
|
153 return (0); |
|
154 } |
|
155 ctxt->nameMax *= 2; |
|
156 ctxt->nameTab = (const xmlChar**) allocTmp; |
|
157 } |
|
158 ctxt->nameTab[ctxt->nameNr] = value; |
|
159 ctxt->name = value; |
|
160 return (ctxt->nameNr++); |
|
161 } |
|
162 /** |
|
163 * htmlnamePop: |
|
164 * @param ctxt an HTML parser context |
|
165 * |
|
166 * Pops the top element name from the name stack |
|
167 * |
|
168 * Returns the name just removed |
|
169 */ |
|
170 static const xmlChar * |
|
171 htmlnamePop(htmlParserCtxtPtr ctxt) |
|
172 { |
|
173 const xmlChar *ret; |
|
174 |
|
175 if (ctxt->nameNr <= 0) |
|
176 return (0); |
|
177 ctxt->nameNr--; |
|
178 if (ctxt->nameNr < 0) |
|
179 return (0); |
|
180 if (ctxt->nameNr > 0) |
|
181 ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; |
|
182 else |
|
183 ctxt->name = NULL; |
|
184 ret = ctxt->nameTab[ctxt->nameNr]; |
|
185 ctxt->nameTab[ctxt->nameNr] = 0; |
|
186 return (ret); |
|
187 } |
|
188 |
|
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. Every one of them expects a parser context variable
 * named `ctxt` to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed.
 *   CUR      the current xmlChar value as an int. It reads a single xmlChar
 *            and should be used only to compare to ASCII values, otherwise
 *            it would break when running with UTF-8 encoding.
 *   RAW      -1 while ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)   the n'th next xmlChar. Same as CUR, it should be used only
 *            to compare on ASCII based substrings.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase. Same as CUR,
 *            it should be used only to compare on ASCII based substrings.
 *   SKIP(n)  skip n xmlChar (advances nbChars, cur and col by n); must be
 *            used only to skip ASCII defined strings without newlines.
 *   SHRINK   call xmlParserInputShrink() once more than 2 * INPUT_CHUNK has
 *            been consumed and less than 2 * INPUT_CHUNK remains.
 *   GROW     call xmlParserInputGrow() when not in progressive mode and
 *            less than INPUT_CHUNK remains.
 *   SKIP_BLANKS  shorthand for htmlSkipBlankChars(ctxt).
 *
 * Character oriented macros
 *
 *   CURRENT      same expansion as CUR: the current xmlChar as an int.
 *   CUR_CHAR(l)  htmlCurrentChar(): the current character, its length in
 *                xmlChars being stored in l.
 *   CUR_SCHAR(s, l)  xmlStringCurrentChar() on the string s.
 *   NEXT         xmlNextChar(ctxt): skip to the next character.
 *   NEXTL(l)     skip the current character of l xmlChars long, keeping
 *                line/col up to date and clearing ctxt->token.
 *   COPY_BUF(l,b,i,v)  store the character v of length l into buffer b at
 *                index i, advancing i.
 *
 * SKIP is parenthesized and the statement-like macros (SHRINK, GROW, NEXTL,
 * COPY_BUF) are wrapped in do { } while (0) so that they behave as a single
 * expression/statement wherever they are used, including in if/else bodies.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), \
                   ctxt->input->cur += (val), \
                   ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do { \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input); \
  } while (0)

#define GROW do { \
    if ((ctxt->progressive == 0) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK); \
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do { \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v); \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v)); \
  } while (0)
|
270 |
|
271 /** |
|
272 * htmlCurrentChar: |
|
273 * @param ctxt the HTML parser context |
|
274 * @param len pointer to the length of the char read |
|
275 * |
|
276 * The current char value, if using UTF-8 this may actually span multiple |
|
277 * bytes in the input buffer. Implement the end of line normalization: |
|
278 * 2.11 End-of-Line Handling |
|
279 * If the encoding is unspecified, in the case we find an ISO-Latin-1 |
|
280 * char, then the encoding converter is plugged in automatically. |
|
281 * |
|
282 * Returns the current char value and its length |
|
283 */ |
|
284 |
|
285 static int |
|
286 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { |
|
287 if (ctxt->instate == XML_PARSER_EOF) |
|
288 return(0); |
|
289 |
|
290 if (ctxt->token != 0) { |
|
291 *len = 0; |
|
292 return(ctxt->token); |
|
293 } |
|
294 if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { |
|
295 /* |
|
296 * We are supposed to handle UTF8, check it's valid |
|
297 * From rfc2044: encoding of the Unicode values on UTF-8: |
|
298 * |
|
299 * UCS-4 range (hex.) UTF-8 octet sequence (binary) |
|
300 * 0000 0000-0000 007F 0xxxxxxx |
|
301 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx |
|
302 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx |
|
303 * |
|
304 * Check for the 0x110000 limit too |
|
305 */ |
|
306 const unsigned char *cur = ctxt->input->cur; |
|
307 unsigned char c; |
|
308 unsigned int val; |
|
309 |
|
310 c = *cur; |
|
311 if (c & 0x80) { |
|
312 if (cur[1] == 0) |
|
313 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); |
|
314 if ((cur[1] & 0xc0) != 0x80) |
|
315 goto encoding_error; |
|
316 if ((c & 0xe0) == 0xe0) { |
|
317 |
|
318 if (cur[2] == 0) |
|
319 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); |
|
320 if ((cur[2] & 0xc0) != 0x80) |
|
321 goto encoding_error; |
|
322 if ((c & 0xf0) == 0xf0) { |
|
323 if (cur[3] == 0) |
|
324 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); |
|
325 if (((c & 0xf8) != 0xf0) || |
|
326 ((cur[3] & 0xc0) != 0x80)) |
|
327 goto encoding_error; |
|
328 /* 4-byte code */ |
|
329 *len = 4; |
|
330 val = (cur[0] & 0x7) << 18; |
|
331 val |= (cur[1] & 0x3f) << 12; |
|
332 val |= (cur[2] & 0x3f) << 6; |
|
333 val |= cur[3] & 0x3f; |
|
334 } else { |
|
335 /* 3-byte code */ |
|
336 *len = 3; |
|
337 val = (cur[0] & 0xf) << 12; |
|
338 val |= (cur[1] & 0x3f) << 6; |
|
339 val |= cur[2] & 0x3f; |
|
340 } |
|
341 } else { |
|
342 /* 2-byte code */ |
|
343 *len = 2; |
|
344 val = (cur[0] & 0x1f) << 6; |
|
345 val |= cur[1] & 0x3f; |
|
346 } |
|
347 if (!IS_CHAR(val)) { |
|
348 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, |
|
349 "Char 0x%X out of allowed range\n", val); |
|
350 } |
|
351 return(val); |
|
352 } else { |
|
353 /* 1-byte code */ |
|
354 *len = 1; |
|
355 return((int) *ctxt->input->cur); |
|
356 } |
|
357 } |
|
358 /* |
|
359 * Assume it's a fixed length encoding (1) with |
|
360 * a compatible encoding for the ASCII set, since |
|
361 * XML constructs only use < 128 chars |
|
362 */ |
|
363 *len = 1; |
|
364 if ((int) *ctxt->input->cur < 0x80) |
|
365 return((int) *ctxt->input->cur); |
|
366 |
|
367 /* |
|
368 * Humm this is bad, do an automatic flow conversion |
|
369 */ |
|
370 xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); |
|
371 ctxt->charset = XML_CHAR_ENCODING_UTF8; |
|
372 return(xmlCurrentChar(ctxt, len)); |
|
373 |
|
374 encoding_error: |
|
375 /* |
|
376 * If we detect an UTF8 error that probably mean that the |
|
377 * input encoding didn't get properly advertized in the |
|
378 * declaration header. Report the error and switch the encoding |
|
379 * to ISO-Latin-1 (if you don't like this policy, just declare the |
|
380 * encoding !) |
|
381 */ |
|
382 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, |
|
383 "Input is not proper UTF-8, indicate encoding !\n", |
|
384 NULL, NULL); |
|
385 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) { |
|
386 ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", |
|
387 ctxt->input->cur[0], ctxt->input->cur[1], |
|
388 ctxt->input->cur[2], ctxt->input->cur[3]); |
|
389 } |
|
390 |
|
391 ctxt->charset = XML_CHAR_ENCODING_8859_1; |
|
392 *len = 1; |
|
393 return((int) *ctxt->input->cur); |
|
394 } |
|
395 |
|
396 /** |
|
397 * htmlSkipBlankChars: |
|
398 * @param ctxt the HTML parser context |
|
399 * |
|
400 * skip all blanks character found at that point in the input streams. |
|
401 * |
|
402 * Returns the number of space chars skipped |
|
403 */ |
|
404 |
|
405 static int |
|
406 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { |
|
407 int res = 0; |
|
408 |
|
409 while (IS_BLANK_CH(*(ctxt->input->cur))) { |
|
410 if ((*ctxt->input->cur == 0) && |
|
411 (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) { |
|
412 xmlPopInput(ctxt); |
|
413 } else { |
|
414 if (*(ctxt->input->cur) == '\n') { |
|
415 ctxt->input->line++; ctxt->input->col = 1; |
|
416 } else ctxt->input->col++; |
|
417 ctxt->input->cur++; |
|
418 ctxt->nbChars++; |
|
419 if (*ctxt->input->cur == 0) |
|
420 xmlParserInputGrow(ctxt->input, INPUT_CHUNK); |
|
421 } |
|
422 res++; |
|
423 } |
|
424 return(res); |
|
425 } |
|
426 |
|
427 |
|
428 #endif /* defined(LIBXML_HTML_ENABLED */ |
|
429 |
|
430 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) |
|
431 |
|
432 /************************************************************************ |
|
433 * * |
|
434 * The list of HTML elements and their properties * |
|
435 * * |
|
436 ************************************************************************/ |
|
437 |
|
438 /* |
|
439 * Start Tag: 1 means the start tag can be omitted |
|
440 * End Tag: 1 means the end tag can be omitted |
|
441 * 2 means it's forbidden (empty elements) |
|
442 * 3 means the tag is stylistic and should be closed easily |
|
443 * Depr: this element is deprecated |
|
444 * DTD: 1 means that this element is valid only in the Loose DTD |
|
445 * 2 means that this element is valid only in the Frameset DTD |
|
446 * |
|
447 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description |
|
448 , subElements , impliedsubelt , Attributes, userdata |
|
449 */ |
|
450 |
|
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each macro below expands to a comma separated list of string literals
 * that is spliced into the array initializers further down. The lists must
 * be joined with explicit commas: two adjacent string literals are
 * concatenated by the compiler, which silently merges two names into one
 * bogus entry. PCDATA and MODIFIER expand to nothing, which is why they are
 * the only ones written without a separating comma.
 *
 * Every array is terminated by a NULL entry.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define PCDATA
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define LIST "ul", "ol", "dir", "menu"
#define MODIFIER
#define FLOW BLOCK,INLINE
#define EMPTY NULL

// TO DO libxslt added 2nd const in between
static const char* const html_flow [] = { FLOW, NULL } ;
static const char* const html_inline [] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define I18N "lang", "dir"
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define ATTRS COREATTRS,I18N,EVENTS
#define CELLHALIGN "align", "char", "charoff"
#define CELLVALIGN "valign"

static const char* const html_attrs [] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs [] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs [] = { COREATTRS, NULL } ;
static const char* const i18n_attrs [] = { I18N, NULL } ;

/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
    "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
    "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;
static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
    "archive", "alt", "name", "height", "width", "align",
    "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
    "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
    { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
    "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
    "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* NULL terminator added: every consumer of these lists stops at NULL. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE,
    "pre", "p", "div", "center", "noscript", "noframes", "blockquote",
    "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype",
    "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name",
    "src", "frameborder", "marginwidth", "marginheight", "noresize",
    "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base",
    "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name",
    "src", "frameborder", "marginwidth", "marginheight", "scrolling",
    "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name",
    "height", "width", "usemap", "ismap", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value",
    "checked", "disabled", "readonly", "size", "maxlength", "src", "alt",
    "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur",
    "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href",
    "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid",
    "codebase", "data", "type", "codetype", "archive", "standby",
    "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s",
    "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe",
    NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size",
    "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange",
    NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
/* Comma after ATTRS added: "onkeyup" and "summary" used to be merged. */
static const char* const table_attrs[] = { ATTRS, "summary", "width",
    "border", "frame", "rules", "cellspacing", "cellpadding",
    "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup",
    "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers",
    "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled",
    "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect",
    "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast used by the element table to drop the inner const of the lists. */
#define DECL (const char**)
|
582 |
|
583 static const htmlElemDesc html40ElementTable [] = { |
|
584 { "a", 0, 0, 0, 0, 0, 0, 1, "anchor ", |
|
585 DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL |
|
586 }, |
|
587 { "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form", |
|
588 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
589 }, |
|
590 { "acronym", 0, 0, 0, 0, 0, 0, 1, "", |
|
591 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
592 }, |
|
593 { "address", 0, 0, 0, 0, 0, 0, 0, "information on author ", |
|
594 DECL inline_p , NULL , DECL html_attrs, NULL, NULL |
|
595 }, |
|
596 { "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ", |
|
597 DECL flow_param , NULL , NULL , DECL applet_attrs, NULL |
|
598 }, |
|
599 { "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ", |
|
600 EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr |
|
601 }, |
|
602 { "b", 0, 3, 0, 0, 0, 0, 1, "bold text style", |
|
603 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
604 }, |
|
605 { "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ", |
|
606 EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs |
|
607 }, |
|
608 { "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " , |
|
609 EMPTY , NULL , NULL, DECL basefont_attrs, NULL |
|
610 }, |
|
611 { "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ", |
|
612 DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr |
|
613 }, |
|
614 { "big", 0, 3, 0, 0, 0, 0, 1, "large text style", |
|
615 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
616 }, |
|
617 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ", |
|
618 DECL html_flow , NULL , DECL quote_attrs , NULL, NULL |
|
619 }, |
|
620 { "body", 1, 1, 0, 0, 0, 0, 0, "document body ", |
|
621 DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL |
|
622 }, |
|
623 { "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ", |
|
624 EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL |
|
625 }, |
|
626 { "button", 0, 0, 0, 0, 0, 0, 2, "push button ", |
|
627 DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL |
|
628 }, |
|
629 { "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ", |
|
630 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
631 }, |
|
632 { "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ", |
|
633 DECL html_flow , NULL , NULL, DECL html_attrs, NULL |
|
634 }, |
|
635 { "cite", 0, 0, 0, 0, 0, 0, 1, "citation", |
|
636 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
637 }, |
|
638 { "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment", |
|
639 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
640 }, |
|
641 { "col", 0, 2, 2, 1, 0, 0, 0, "table column ", |
|
642 EMPTY , NULL , DECL col_attrs , NULL, NULL |
|
643 }, |
|
644 { "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ", |
|
645 DECL col_elt , "col" , DECL col_attrs , NULL, NULL |
|
646 }, |
|
647 { "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ", |
|
648 DECL html_flow , NULL , DECL html_attrs, NULL, NULL |
|
649 }, |
|
650 { "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ", |
|
651 DECL html_flow , NULL , DECL edit_attrs , NULL, NULL |
|
652 }, |
|
653 { "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition", |
|
654 DECL html_inline , NULL , DECL html_attrs, NULL, NULL |
|
655 }, |
|
656 { "dir", 0, 0, 0, 0, 1, 1, 0, "directory list", |
|
657 DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL |
|
658 }, |
|
659 { "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container", |
|
660 DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL |
|
661 }, |
|
662 { "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ", |
|
663 DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL |
|
664 }, |
|
665 { "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ", |
|
666 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
667 }, |
|
668 { "em", 0, 3, 0, 0, 0, 0, 1, "emphasis", |
|
669 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
670 }, |
|
671 { "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ", |
|
672 DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL |
|
673 }, |
|
674 { "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ", |
|
675 DECL html_inline, NULL, NULL, DECL font_attrs, NULL |
|
676 }, |
|
677 { "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ", |
|
678 DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr |
|
679 }, |
|
680 { "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " , |
|
681 EMPTY, NULL, NULL, DECL frame_attrs, NULL |
|
682 }, |
|
683 { "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" , |
|
684 DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL |
|
685 }, |
|
686 { "h1", 0, 0, 0, 0, 0, 0, 0, "heading ", |
|
687 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
688 }, |
|
689 { "h2", 0, 0, 0, 0, 0, 0, 0, "heading ", |
|
690 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
691 }, |
|
692 { "h3", 0, 0, 0, 0, 0, 0, 0, "heading ", |
|
693 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
694 }, |
|
695 { "h4", 0, 0, 0, 0, 0, 0, 0, "heading ", |
|
696 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
697 }, |
|
698 { "h5", 0, 0, 0, 0, 0, 0, 0, "heading ", |
|
699 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
700 }, |
|
701 { "h6", 0, 0, 0, 0, 0, 0, 0, "heading ", |
|
702 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
703 }, |
|
704 { "head", 1, 1, 0, 0, 0, 0, 0, "document head ", |
|
705 DECL head_contents, NULL, DECL head_attrs, NULL, NULL |
|
706 }, |
|
707 { "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " , |
|
708 EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL |
|
709 }, |
|
710 { "html", 1, 1, 0, 0, 0, 0, 0, "document root element ", |
|
711 DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL |
|
712 }, |
|
713 { "i", 0, 3, 0, 0, 0, 0, 1, "italic text style", |
|
714 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
715 }, |
|
716 { "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ", |
|
717 DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL |
|
718 }, |
|
719 { "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ", |
|
720 EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs |
|
721 }, |
|
722 { "input", 0, 2, 2, 1, 0, 0, 1, "form control ", |
|
723 EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL |
|
724 }, |
|
725 { "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text", |
|
726 DECL html_flow, NULL, DECL edit_attrs, NULL, NULL |
|
727 }, |
|
728 { "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ", |
|
729 EMPTY, NULL, NULL, DECL prompt_attrs, NULL |
|
730 }, |
|
731 { "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user", |
|
732 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
733 }, |
|
734 { "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ", |
|
735 DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL |
|
736 }, |
|
737 { "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ", |
|
738 DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL |
|
739 }, |
|
740 { "li", 0, 1, 1, 0, 0, 0, 0, "list item ", |
|
741 DECL html_flow, NULL, DECL html_attrs, NULL, NULL |
|
742 }, |
|
743 { "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ", |
|
744 EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL |
|
745 }, |
|
746 { "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ", |
|
747 DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr |
|
748 }, |
|
749 { "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ", |
|
750 DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL |
|
751 }, |
|
752 { "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ", |
|
753 EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr |
|
754 }, |
|
755 { "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ", |
|
756 DECL noframes_content, "body" , DECL html_attrs, NULL, NULL |
|
757 }, |
|
758 { "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ", |
|
759 DECL html_flow, "div", DECL html_attrs, NULL, NULL |
|
760 }, |
|
761 { "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ", |
|
762 DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL |
|
763 }, |
|
764 { "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ", |
|
765 DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL |
|
766 }, |
|
767 { "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ", |
|
768 DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr |
|
769 }, |
|
770 { "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " , |
|
771 DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL |
|
772 }, |
|
773 { "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ", |
|
774 DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL |
|
775 }, |
|
776 { "param", 0, 2, 2, 1, 0, 0, 0, "named property value ", |
|
777 EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr |
|
778 }, |
|
779 { "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ", |
|
780 DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL |
|
781 }, |
|
782 { "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ", |
|
783 DECL html_inline, NULL, DECL quote_attrs, NULL, NULL |
|
784 }, |
|
785 { "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style", |
|
786 DECL html_inline, NULL, NULL, DECL html_attrs, NULL |
|
787 }, |
|
788 { "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.", |
|
789 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
790 }, |
|
791 { "script", 0, 0, 0, 0, 0, 0, 2, "script statements ", |
|
792 DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr |
|
793 }, |
|
794 { "select", 0, 0, 0, 0, 0, 0, 1, "option selector ", |
|
795 DECL select_content, NULL, DECL select_attrs, NULL, NULL |
|
796 }, |
|
797 { "small", 0, 3, 0, 0, 0, 0, 1, "small text style", |
|
798 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
799 }, |
|
800 { "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ", |
|
801 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
802 }, |
|
803 { "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text", |
|
804 DECL html_inline, NULL, NULL, DECL html_attrs, NULL |
|
805 }, |
|
806 { "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis", |
|
807 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
808 }, |
|
809 { "style", 0, 0, 0, 0, 0, 0, 0, "style info ", |
|
810 DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr |
|
811 }, |
|
812 { "sub", 0, 3, 0, 0, 0, 0, 1, "subscript", |
|
813 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
814 }, |
|
815 { "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ", |
|
816 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
817 }, |
|
818 { "table", 0, 0, 0, 0, 0, 0, 0, "", |
|
819 DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL |
|
820 }, |
|
821 { "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ", |
|
822 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL |
|
823 }, |
|
824 { "td", 0, 0, 0, 0, 0, 0, 0, "table data cell", |
|
825 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL |
|
826 }, |
|
827 { "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ", |
|
828 DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr |
|
829 }, |
|
830 { "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ", |
|
831 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL |
|
832 }, |
|
833 { "th", 0, 1, 0, 0, 0, 0, 0, "table header cell", |
|
834 DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL |
|
835 }, |
|
836 { "thead", 0, 1, 0, 0, 0, 0, 0, "table header ", |
|
837 DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL |
|
838 }, |
|
839 { "title", 0, 0, 0, 0, 0, 0, 0, "document title ", |
|
840 DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL |
|
841 }, |
|
842 { "tr", 0, 0, 0, 0, 0, 0, 0, "table row ", |
|
843 DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL |
|
844 }, |
|
845 { "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style", |
|
846 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
847 }, |
|
848 { "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style", |
|
849 DECL html_inline, NULL, NULL, DECL html_attrs, NULL |
|
850 }, |
|
851 { "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ", |
|
852 DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL |
|
853 }, |
|
854 { "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument", |
|
855 DECL html_inline, NULL, DECL html_attrs, NULL, NULL |
|
856 } |
|
857 }; |
|
858 |
|
859 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */ |
|
860 |
|
861 #ifdef LIBXML_HTML_ENABLED |
|
862 |
|
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups.  The first
 * string of a group is the name of a start tag; the strings that follow
 * it, up to the group's NULL, are the names of open elements which that
 * start tag closes implicitly.  An additional NULL after the last group
 * marks the end of the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",       "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
                  "dl", "ul", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", "head", NULL,
    "head",       "p", NULL,
    "title",      "p", NULL,
    "body",       "head", "style", "link", "title", "p", NULL,
    "frameset",   "head", "style", "link", "title", "p", NULL,
    "li",         "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
                  "pre", "listing", "xmp", "head", "li", NULL,
    "hr",         "p", "head", NULL,
    "h1",         "p", "head", NULL,
    "h2",         "p", "head", NULL,
    "h3",         "p", "head", NULL,
    "h4",         "p", "head", NULL,
    "h5",         "p", "head", NULL,
    "h6",         "p", "head", NULL,
    "dir",        "p", "head", NULL,
    "address",    "p", "head", "ul", NULL,
    "pre",        "p", "head", "ul", NULL,
    "listing",    "p", "head", NULL,
    "xmp",        "p", "head", NULL,
    "blockquote", "p", "head", NULL,
    "dl",         "p", "dt", "menu", "dir", "address", "pre", "listing",
                  "xmp", "head", NULL,
    "dt",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dd", NULL,
    "dd",         "p", "menu", "dir", "address", "pre", "listing", "xmp",
                  "head", "dt", NULL,
    "ul",         "p", "head", "ol", "menu", "dir", "address", "pre",
                  "listing", "xmp", NULL,
    "ol",         "p", "head", "ul", NULL,
    "menu",       "p", "head", "ul", NULL,
    "p",          "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",        "p", "head", NULL,
    "noscript",   "p", "head", NULL,
    "center",     "font", "b", "i", "p", "head", NULL,
    "a",          "a", NULL,
    "caption",    "p", NULL,
    "colgroup",   "caption", "colgroup", "col", "p", NULL,
    "col",        "caption", "col", "p", NULL,
    "table",      "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
                  "listing", "xmp", "a", NULL,
    "th",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",         "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",         "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",      "caption", "col", "colgroup", NULL,
    "tfoot",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tbody", "p", NULL,
    "tbody",      "th", "td", "tr", "caption", "col", "colgroup", "thead",
                  "tfoot", "tbody", "p", NULL,
    "optgroup",   "option", NULL,
    "option",     "option", NULL,
    "fieldset",   "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
                  "pre", "listing", "xmp", "a", NULL,
    NULL
};
|
923 |
|
/*
 * The list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 * The list is terminated by a NULL entry.
 */
static const char * const htmlNoContentElements[] = {
    "html", "head", "body",
    NULL
};
|
937 |
|
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The array has no NULL terminator; users must bound their iteration
 * with sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]).
 */
static const char * const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
|
963 |
|
/*
 * Priorities used by the HTML parser when it meets broken pages.
 * Every element has an "end tag priority"; an end tag may only close
 * open elements whose priority is lower than or equal to its own,
 * which lets the parser decide what to do with stray end tags.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* priority of the element's end tag */
} elementPriority;

/*
 * Elements with a non-default priority.  The final entry has a NULL
 * name and carries the priority used for every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
|
991 |
|
992 /************************************************************************ |
|
993 * * |
|
994 * functions to handle HTML specific data * |
|
995 * * |
|
996 ************************************************************************/ |
|
997 |
|
998 /** |
|
999 * htmlInitAutoClose: |
|
1000 * |
|
1001 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. |
|
1002 * This is not reentrant. Call xmlInitParser() once before processing in |
|
1003 * case of use in multithreaded programs. |
|
1004 */ |
|
1005 void |
|
1006 htmlInitAutoClose(void) { |
|
1007 int indx, i = 0; |
|
1008 |
|
1009 if (htmlStartCloseIndexinitialized) return; |
|
1010 |
|
1011 for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL; |
|
1012 indx = 0; |
|
1013 while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) { |
|
1014 // libxslt port: (const char**) cast was added |
|
1015 htmlStartCloseIndex[indx++] = (const char**)&htmlStartClose[i]; |
|
1016 while(htmlStartClose[i++]) {}; |
|
1017 i++; |
|
1018 } |
|
1019 htmlStartCloseIndexinitialized = 1; |
|
1020 } |
|
1021 |
|
1022 /** |
|
1023 * htmlGetEndPriority: |
|
1024 * @param name The name of the element to look up the priority for. |
|
1025 * |
|
1026 * Return value: The "endtag" priority. |
|
1027 **/ |
|
1028 static int |
|
1029 htmlGetEndPriority (const xmlChar *name) { |
|
1030 int i = 0; |
|
1031 |
|
1032 while ((htmlEndPriority[i].name != NULL) && |
|
1033 (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name))) |
|
1034 i++; |
|
1035 |
|
1036 return(htmlEndPriority[i].priority); |
|
1037 } |
|
1038 |
|
1039 |
|
1040 /** |
|
1041 * htmlCheckAutoClose: |
|
1042 * @param newtag The new tag name |
|
1043 * @param oldtag The old tag name |
|
1044 * |
|
1045 * Checks whether the new tag is one of the registered valid tags for |
|
1046 * closing old. |
|
1047 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names. |
|
1048 * |
|
1049 * Returns 0 if no, 1 if yes. |
|
1050 */ |
|
1051 static int |
|
1052 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag) |
|
1053 { |
|
1054 int i, indx; |
|
1055 const char **closed = NULL; |
|
1056 |
|
1057 if (htmlStartCloseIndexinitialized == 0) |
|
1058 htmlInitAutoClose(); |
|
1059 |
|
1060 /* inefficient, but not a big deal */ |
|
1061 for (indx = 0; indx < 100; indx++) { |
|
1062 closed = htmlStartCloseIndex[indx]; |
|
1063 if (closed == NULL) |
|
1064 return (0); |
|
1065 if (xmlStrEqual(BAD_CAST * closed, newtag)) |
|
1066 break; |
|
1067 } |
|
1068 |
|
1069 i = closed - htmlStartClose; |
|
1070 i++; |
|
1071 while (htmlStartClose[i] != NULL) { |
|
1072 if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) { |
|
1073 return (1); |
|
1074 } |
|
1075 i++; |
|
1076 } |
|
1077 return (0); |
|
1078 } |
|
1079 |
|
1080 /** |
|
1081 * htmlAutoCloseOnClose: |
|
1082 * @param ctxt an HTML parser context |
|
1083 * @param newtag The new tag name |
|
1084 * @param force force the tag closure |
|
1085 * |
|
1086 * The HTML DTD allows an ending tag to implicitly close other tags. |
|
1087 */ |
|
1088 static void |
|
1089 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) |
|
1090 { |
|
1091 const htmlElemDesc *info; |
|
1092 int i, priority; |
|
1093 |
|
1094 priority = htmlGetEndPriority(newtag); |
|
1095 |
|
1096 for (i = (ctxt->nameNr - 1); i >= 0; i--) { |
|
1097 |
|
1098 if (xmlStrEqual(newtag, ctxt->nameTab[i])) |
|
1099 break; |
|
1100 /* |
|
1101 * A missplaced endtag can only close elements with lower |
|
1102 * or equal priority, so if we find an element with higher |
|
1103 * priority before we find an element with |
|
1104 * matching name, we just ignore this endtag |
|
1105 */ |
|
1106 if (htmlGetEndPriority(ctxt->nameTab[i]) > priority) |
|
1107 return; |
|
1108 } |
|
1109 if (i < 0) |
|
1110 return; |
|
1111 |
|
1112 while (!xmlStrEqual(newtag, ctxt->name)) { |
|
1113 info = htmlTagLookup(ctxt->name); |
|
1114 if ((info != NULL) && (info->endTag == 3)) { |
|
1115 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, |
|
1116 "Opening and ending tag mismatch: %s and %s\n", |
|
1117 newtag, ctxt->name); |
|
1118 } |
|
1119 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
1120 ctxt->sax->endElement(ctxt->userData, ctxt->name); |
|
1121 htmlnamePop(ctxt); |
|
1122 } |
|
1123 } |
|
1124 |
|
1125 /** |
|
1126 * htmlAutoCloseOnEnd: |
|
1127 * @param ctxt an HTML parser context |
|
1128 * |
|
1129 * Close all remaining tags at the end of the stream |
|
1130 */ |
|
1131 static void |
|
1132 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) |
|
1133 { |
|
1134 int i; |
|
1135 |
|
1136 if (ctxt->nameNr == 0) |
|
1137 return; |
|
1138 for (i = (ctxt->nameNr - 1); i >= 0; i--) { |
|
1139 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
1140 ctxt->sax->endElement(ctxt->userData, ctxt->name); |
|
1141 htmlnamePop(ctxt); |
|
1142 } |
|
1143 } |
|
1144 |
|
1145 /** |
|
1146 * htmlAutoClose: |
|
1147 * @param ctxt an HTML parser context |
|
1148 * @param newtag The new tag name or NULL |
|
1149 * |
|
1150 * The HTML DTD allows a tag to implicitly close other tags. |
|
1151 * The list is kept in htmlStartClose array. This function is |
|
1152 * called when a new tag has been detected and generates the |
|
1153 * appropriates closes if possible/needed. |
|
1154 * If newtag is NULL this mean we are at the end of the resource |
|
1155 * and we should check |
|
1156 */ |
|
1157 static void |
|
1158 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) |
|
1159 { |
|
1160 while ((newtag != NULL) && (ctxt->name != NULL) && |
|
1161 (htmlCheckAutoClose(newtag, ctxt->name))) { |
|
1162 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
1163 ctxt->sax->endElement(ctxt->userData, ctxt->name); |
|
1164 htmlnamePop(ctxt); |
|
1165 } |
|
1166 if (newtag == NULL) { |
|
1167 htmlAutoCloseOnEnd(ctxt); |
|
1168 return; |
|
1169 } |
|
1170 while ((newtag == NULL) && (ctxt->name != NULL) && |
|
1171 ((xmlStrEqual(ctxt->name, BAD_CAST "head")) || |
|
1172 (xmlStrEqual(ctxt->name, BAD_CAST "body")) || |
|
1173 (xmlStrEqual(ctxt->name, BAD_CAST "html")))) { |
|
1174 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
1175 ctxt->sax->endElement(ctxt->userData, ctxt->name); |
|
1176 htmlnamePop(ctxt); |
|
1177 } |
|
1178 } |
|
1179 |
|
1180 /** |
|
1181 * htmlAutoCloseTag: |
|
1182 * @param doc the HTML document |
|
1183 * @param name The tag name |
|
1184 * @param elem the HTML element |
|
1185 * |
|
1186 * The HTML DTD allows a tag to implicitly close other tags. |
|
1187 * The list is kept in htmlStartClose array. This function checks |
|
1188 * if the element or one of it's children would autoclose the |
|
1189 * given tag. |
|
1190 * |
|
1191 * Returns 1 if autoclose, 0 otherwise |
|
1192 */ |
|
1193 int |
|
1194 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) { |
|
1195 htmlNodePtr child; |
|
1196 |
|
1197 if (elem == NULL) return(1); |
|
1198 if (xmlStrEqual(name, elem->name)) return(0); |
|
1199 if (htmlCheckAutoClose(elem->name, name)) return(1); |
|
1200 child = elem->children; |
|
1201 while (child != NULL) { |
|
1202 if (htmlAutoCloseTag(doc, name, child)) return(1); |
|
1203 child = child->next; |
|
1204 } |
|
1205 return(0); |
|
1206 } |
|
1207 |
|
1208 /** |
|
1209 * htmlIsAutoClosed: |
|
1210 * @param doc the HTML document |
|
1211 * @param elem the HTML element |
|
1212 * |
|
1213 * The HTML DTD allows a tag to implicitly close other tags. |
|
1214 * The list is kept in htmlStartClose array. This function checks |
|
1215 * if a tag is autoclosed by one of it's child |
|
1216 * |
|
1217 * Returns 1 if autoclosed, 0 otherwise |
|
1218 */ |
|
1219 int |
|
1220 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) { |
|
1221 htmlNodePtr child; |
|
1222 |
|
1223 if (elem == NULL) return(1); |
|
1224 child = elem->children; |
|
1225 while (child != NULL) { |
|
1226 if (htmlAutoCloseTag(doc, elem->name, child)) return(1); |
|
1227 child = child->next; |
|
1228 } |
|
1229 return(0); |
|
1230 } |
|
1231 |
|
1232 /** |
|
1233 * htmlCheckImplied: |
|
1234 * @param ctxt an HTML parser context |
|
1235 * @param newtag The new tag name |
|
1236 * |
|
1237 * The HTML DTD allows a tag to exists only implicitly |
|
1238 * called when a new tag has been detected and generates the |
|
1239 * appropriates implicit tags if missing |
|
1240 */ |
|
1241 static void |
|
1242 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { |
|
1243 if (!htmlOmittedDefaultValue) |
|
1244 return; |
|
1245 if (xmlStrEqual(newtag, BAD_CAST"html")) |
|
1246 return; |
|
1247 if (ctxt->nameNr <= 0) { |
|
1248 htmlnamePush(ctxt, BAD_CAST"html"); |
|
1249 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) |
|
1250 ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL); |
|
1251 } |
|
1252 if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head"))) |
|
1253 return; |
|
1254 if ((ctxt->nameNr <= 1) && |
|
1255 ((xmlStrEqual(newtag, BAD_CAST"script")) || |
|
1256 (xmlStrEqual(newtag, BAD_CAST"style")) || |
|
1257 (xmlStrEqual(newtag, BAD_CAST"meta")) || |
|
1258 (xmlStrEqual(newtag, BAD_CAST"link")) || |
|
1259 (xmlStrEqual(newtag, BAD_CAST"title")) || |
|
1260 (xmlStrEqual(newtag, BAD_CAST"base")))) { |
|
1261 /* |
|
1262 * dropped OBJECT ... i you put it first BODY will be |
|
1263 * assumed ! |
|
1264 */ |
|
1265 htmlnamePush(ctxt, BAD_CAST"head"); |
|
1266 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) |
|
1267 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL); |
|
1268 } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) && |
|
1269 (!xmlStrEqual(newtag, BAD_CAST"frame")) && |
|
1270 (!xmlStrEqual(newtag, BAD_CAST"frameset"))) { |
|
1271 int i; |
|
1272 for (i = 0;i < ctxt->nameNr;i++) { |
|
1273 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) { |
|
1274 return; |
|
1275 } |
|
1276 if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) { |
|
1277 return; |
|
1278 } |
|
1279 } |
|
1280 |
|
1281 htmlnamePush(ctxt, BAD_CAST"body"); |
|
1282 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) |
|
1283 ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL); |
|
1284 } |
|
1285 } |
|
1286 |
|
1287 /** |
|
1288 * htmlCheckParagraph |
|
1289 * @param ctxt an HTML parser context |
|
1290 * |
|
1291 * Check whether a p element need to be implied before inserting |
|
1292 * characters in the current element. |
|
1293 * |
|
1294 * Returns 1 if a paragraph has been inserted, 0 if not and -1 |
|
1295 * in case of error. |
|
1296 */ |
|
1297 |
|
1298 static int |
|
1299 htmlCheckParagraph(htmlParserCtxtPtr ctxt) { |
|
1300 const xmlChar *tag; |
|
1301 int i; |
|
1302 |
|
1303 if (ctxt == NULL) |
|
1304 return(-1); |
|
1305 tag = ctxt->name; |
|
1306 if (tag == NULL) { |
|
1307 htmlAutoClose(ctxt, BAD_CAST"p"); |
|
1308 htmlCheckImplied(ctxt, BAD_CAST"p"); |
|
1309 htmlnamePush(ctxt, BAD_CAST"p"); |
|
1310 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) |
|
1311 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); |
|
1312 return(1); |
|
1313 } |
|
1314 if (!htmlOmittedDefaultValue) |
|
1315 return(0); |
|
1316 for (i = 0; htmlNoContentElements[i] != NULL; i++) { |
|
1317 if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) { |
|
1318 htmlAutoClose(ctxt, BAD_CAST"p"); |
|
1319 htmlCheckImplied(ctxt, BAD_CAST"p"); |
|
1320 htmlnamePush(ctxt, BAD_CAST"p"); |
|
1321 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) |
|
1322 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL); |
|
1323 return(1); |
|
1324 } |
|
1325 } |
|
1326 return(0); |
|
1327 } |
|
1328 |
|
1329 /** |
|
1330 * htmlIsScriptAttribute: |
|
1331 * @param name an attribute name |
|
1332 * |
|
1333 * Check if an attribute is of content type Script |
|
1334 * |
|
1335 * Returns 1 is the attribute is a script 0 otherwise |
|
1336 */ |
|
1337 int |
|
1338 htmlIsScriptAttribute(const xmlChar *name) { |
|
1339 unsigned int i; |
|
1340 |
|
1341 if (name == NULL) |
|
1342 return(0); |
|
1343 /* |
|
1344 * all script attributes start with 'on' |
|
1345 */ |
|
1346 if ((name[0] != 'o') || (name[1] != 'n')) |
|
1347 return(0); |
|
1348 for (i = 0; |
|
1349 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]); |
|
1350 i++) { |
|
1351 if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i])) |
|
1352 return(1); |
|
1353 } |
|
1354 return(0); |
|
1355 } |
|
1356 |
|
1357 /************************************************************************ |
|
1358 * * |
|
1359 * The list of HTML predefined entities * |
|
1360 * * |
|
1361 ************************************************************************/ |
|
1362 |
|
1363 |
|
1364 static const htmlEntityDesc html40EntitiesTable[] = { |
|
1365 /* |
|
1366 * the 4 absolute ones, plus apostrophe. |
|
1367 */ |
|
1368 { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" }, |
|
1369 { 38, "amp", "ampersand, U+0026 ISOnum" }, |
|
1370 { 39, "apos", "single quote" }, |
|
1371 { 60, "lt", "less-than sign, U+003C ISOnum" }, |
|
1372 { 62, "gt", "greater-than sign, U+003E ISOnum" }, |
|
1373 |
|
1374 /* |
|
1375 * A bunch still in the 128-255 range |
|
1376 * Replacing them depend really on the charset used. |
|
1377 */ |
|
1378 { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" }, |
|
1379 { 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" }, |
|
1380 { 162, "cent", "cent sign, U+00A2 ISOnum" }, |
|
1381 { 163, "pound","pound sign, U+00A3 ISOnum" }, |
|
1382 { 164, "curren","currency sign, U+00A4 ISOnum" }, |
|
1383 { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" }, |
|
1384 { 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" }, |
|
1385 { 167, "sect", "section sign, U+00A7 ISOnum" }, |
|
1386 { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" }, |
|
1387 { 169, "copy", "copyright sign, U+00A9 ISOnum" }, |
|
1388 { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" }, |
|
1389 { 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" }, |
|
1390 { 172, "not", "not sign, U+00AC ISOnum" }, |
|
1391 { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" }, |
|
1392 { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" }, |
|
1393 { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" }, |
|
1394 { 176, "deg", "degree sign, U+00B0 ISOnum" }, |
|
1395 { 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" }, |
|
1396 { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" }, |
|
1397 { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" }, |
|
1398 { 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" }, |
|
1399 { 181, "micro","micro sign, U+00B5 ISOnum" }, |
|
1400 { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" }, |
|
1401 { 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" }, |
|
1402 { 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" }, |
|
1403 { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" }, |
|
1404 { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" }, |
|
1405 { 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" }, |
|
1406 { 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" }, |
|
1407 { 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" }, |
|
1408 { 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" }, |
|
1409 { 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" }, |
|
1410 { 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" }, |
|
1411 { 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" }, |
|
1412 { 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" }, |
|
1413 { 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" }, |
|
1414 { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" }, |
|
1415 { 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" }, |
|
1416 { 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" }, |
|
1417 { 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" }, |
|
1418 { 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" }, |
|
1419 { 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" }, |
|
1420 { 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" }, |
|
1421 { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" }, |
|
1422 { 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" }, |
|
1423 { 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" }, |
|
1424 { 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" }, |
|
1425 { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" }, |
|
1426 { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" }, |
|
1427 { 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" }, |
|
1428 { 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" }, |
|
1429 { 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" }, |
|
1430 { 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" }, |
|
1431 { 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" }, |
|
1432 { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" }, |
|
1433 { 215, "times","multiplication sign, U+00D7 ISOnum" }, |
|
1434 { 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" }, |
|
1435 { 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" }, |
|
1436 { 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" }, |
|
1437 { 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" }, |
|
1438 { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" }, |
|
1439 { 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" }, |
|
1440 { 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" }, |
|
1441 { 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" }, |
|
1442 { 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" }, |
|
1443 { 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" }, |
|
1444 { 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" }, |
|
1445 { 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" }, |
|
1446 { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" }, |
|
1447 { 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" }, |
|
1448 { 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" }, |
|
1449 { 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" }, |
|
1450 { 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" }, |
|
1451 { 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" }, |
|
1452 { 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" }, |
|
1453 { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" }, |
|
1454 { 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" }, |
|
1455 { 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" }, |
|
1456 { 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" }, |
|
1457 { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" }, |
|
1458 { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" }, |
|
1459 { 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" }, |
|
1460 { 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" }, |
|
1461 { 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" }, |
|
1462 { 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" }, |
|
1463 { 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" }, |
|
1464 { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" }, |
|
1465 { 247, "divide","division sign, U+00F7 ISOnum" }, |
|
1466 { 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" }, |
|
1467 { 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" }, |
|
1468 { 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" }, |
|
1469 { 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" }, |
|
1470 { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" }, |
|
1471 { 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" }, |
|
1472 { 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" }, |
|
1473 { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" }, |
|
1474 |
|
1475 { 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" }, |
|
1476 { 339, "oelig","latin small ligature oe, U+0153 ISOlat2" }, |
|
1477 { 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" }, |
|
1478 { 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" }, |
|
1479 { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" }, |
|
1480 |
|
1481 /* |
|
1482 * Anything below should really be kept as entities references |
|
1483 */ |
|
1484 { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" }, |
|
1485 |
|
1486 { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" }, |
|
1487 { 732, "tilde","small tilde, U+02DC ISOdia" }, |
|
1488 |
|
1489 { 913, "Alpha","greek capital letter alpha, U+0391" }, |
|
1490 { 914, "Beta", "greek capital letter beta, U+0392" }, |
|
1491 { 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" }, |
|
1492 { 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" }, |
|
1493 { 917, "Epsilon","greek capital letter epsilon, U+0395" }, |
|
1494 { 918, "Zeta", "greek capital letter zeta, U+0396" }, |
|
1495 { 919, "Eta", "greek capital letter eta, U+0397" }, |
|
1496 { 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" }, |
|
1497 { 921, "Iota", "greek capital letter iota, U+0399" }, |
|
1498 { 922, "Kappa","greek capital letter kappa, U+039A" }, |
|
1499 { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" }, |
|
1500 { 924, "Mu", "greek capital letter mu, U+039C" }, |
|
1501 { 925, "Nu", "greek capital letter nu, U+039D" }, |
|
1502 { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" }, |
|
1503 { 927, "Omicron","greek capital letter omicron, U+039F" }, |
|
1504 { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" }, |
|
1505 { 929, "Rho", "greek capital letter rho, U+03A1" }, |
|
1506 { 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" }, |
|
1507 { 932, "Tau", "greek capital letter tau, U+03A4" }, |
|
1508 { 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" }, |
|
1509 { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" }, |
|
1510 { 935, "Chi", "greek capital letter chi, U+03A7" }, |
|
1511 { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" }, |
|
1512 { 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" }, |
|
1513 |
|
1514 { 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" }, |
|
1515 { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" }, |
|
1516 { 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" }, |
|
1517 { 948, "delta","greek small letter delta, U+03B4 ISOgrk3" }, |
|
1518 { 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" }, |
|
1519 { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" }, |
|
1520 { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" }, |
|
1521 { 952, "theta","greek small letter theta, U+03B8 ISOgrk3" }, |
|
1522 { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" }, |
|
1523 { 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" }, |
|
1524 { 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" }, |
|
1525 { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" }, |
|
1526 { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" }, |
|
1527 { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" }, |
|
1528 { 959, "omicron","greek small letter omicron, U+03BF NEW" }, |
|
1529 { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" }, |
|
1530 { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" }, |
|
1531 { 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" }, |
|
1532 { 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" }, |
|
1533 { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" }, |
|
1534 { 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" }, |
|
1535 { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" }, |
|
1536 { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" }, |
|
1537 { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" }, |
|
1538 { 969, "omega","greek small letter omega, U+03C9 ISOgrk3" }, |
|
1539 { 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" }, |
|
1540 { 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" }, |
|
1541 { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" }, |
|
1542 |
|
1543 { 8194, "ensp", "en space, U+2002 ISOpub" }, |
|
1544 { 8195, "emsp", "em space, U+2003 ISOpub" }, |
|
1545 { 8201, "thinsp","thin space, U+2009 ISOpub" }, |
|
1546 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" }, |
|
1547 { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" }, |
|
1548 { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" }, |
|
1549 { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" }, |
|
1550 { 8211, "ndash","en dash, U+2013 ISOpub" }, |
|
1551 { 8212, "mdash","em dash, U+2014 ISOpub" }, |
|
1552 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" }, |
|
1553 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" }, |
|
1554 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" }, |
|
1555 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" }, |
|
1556 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" }, |
|
1557 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" }, |
|
1558 { 8224, "dagger","dagger, U+2020 ISOpub" }, |
|
1559 { 8225, "Dagger","double dagger, U+2021 ISOpub" }, |
|
1560 |
|
1561 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" }, |
|
1562 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" }, |
|
1563 |
|
1564 { 8240, "permil","per mille sign, U+2030 ISOtech" }, |
|
1565 |
|
1566 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" }, |
|
1567 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" }, |
|
1568 |
|
1569 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" }, |
|
1570 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" }, |
|
1571 |
|
1572 { 8254, "oline","overline = spacing overscore, U+203E NEW" }, |
|
1573 { 8260, "frasl","fraction slash, U+2044 NEW" }, |
|
1574 |
|
1575 { 8364, "euro", "euro sign, U+20AC NEW" }, |
|
1576 |
|
1577 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" }, |
|
1578 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" }, |
|
1579 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" }, |
|
1580 { 8482, "trade","trade mark sign, U+2122 ISOnum" }, |
|
1581 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" }, |
|
1582 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" }, |
|
1583 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" }, |
|
1584 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" }, |
|
1585 { 8595, "darr", "downwards arrow, U+2193 ISOnum" }, |
|
1586 { 8596, "harr", "left right arrow, U+2194 ISOamsa" }, |
|
1587 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" }, |
|
1588 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" }, |
|
1589 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" }, |
|
1590 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" }, |
|
1591 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" }, |
|
1592 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" }, |
|
1593 |
|
1594 { 8704, "forall","for all, U+2200 ISOtech" }, |
|
1595 { 8706, "part", "partial differential, U+2202 ISOtech" }, |
|
1596 { 8707, "exist","there exists, U+2203 ISOtech" }, |
|
1597 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" }, |
|
1598 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" }, |
|
1599 { 8712, "isin", "element of, U+2208 ISOtech" }, |
|
1600 { 8713, "notin","not an element of, U+2209 ISOtech" }, |
|
1601 { 8715, "ni", "contains as member, U+220B ISOtech" }, |
|
1602 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" }, |
|
1603 { 8721, "sum", "n-ary summation, U+2211 ISOamsb" }, |
|
1604 { 8722, "minus","minus sign, U+2212 ISOtech" }, |
|
1605 { 8727, "lowast","asterisk operator, U+2217 ISOtech" }, |
|
1606 { 8730, "radic","square root = radical sign, U+221A ISOtech" }, |
|
1607 { 8733, "prop", "proportional to, U+221D ISOtech" }, |
|
1608 { 8734, "infin","infinity, U+221E ISOtech" }, |
|
1609 { 8736, "ang", "angle, U+2220 ISOamso" }, |
|
1610 { 8743, "and", "logical and = wedge, U+2227 ISOtech" }, |
|
1611 { 8744, "or", "logical or = vee, U+2228 ISOtech" }, |
|
1612 { 8745, "cap", "intersection = cap, U+2229 ISOtech" }, |
|
1613 { 8746, "cup", "union = cup, U+222A ISOtech" }, |
|
1614 { 8747, "int", "integral, U+222B ISOtech" }, |
|
1615 { 8756, "there4","therefore, U+2234 ISOtech" }, |
|
1616 { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" }, |
|
1617 { 8773, "cong", "approximately equal to, U+2245 ISOtech" }, |
|
1618 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" }, |
|
1619 { 8800, "ne", "not equal to, U+2260 ISOtech" }, |
|
1620 { 8801, "equiv","identical to, U+2261 ISOtech" }, |
|
1621 { 8804, "le", "less-than or equal to, U+2264 ISOtech" }, |
|
1622 { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" }, |
|
1623 { 8834, "sub", "subset of, U+2282 ISOtech" }, |
|
1624 { 8835, "sup", "superset of, U+2283 ISOtech" }, |
|
1625 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" }, |
|
1626 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" }, |
|
1627 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" }, |
|
1628 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" }, |
|
1629 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" }, |
|
1630 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" }, |
|
1631 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" }, |
|
1632 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" }, |
|
1633 { 8969, "rceil","right ceiling, U+2309 ISOamsc" }, |
|
1634 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" }, |
|
1635 { 8971, "rfloor","right floor, U+230B ISOamsc" }, |
|
1636 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" }, |
|
1637 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" }, |
|
1638 { 9674, "loz", "lozenge, U+25CA ISOpub" }, |
|
1639 |
|
1640 { 9824, "spades","black spade suit, U+2660 ISOpub" }, |
|
1641 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" }, |
|
1642 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" }, |
|
1643 { 9830, "diams","black diamond suit, U+2666 ISOpub" }, |
|
1644 |
|
1645 }; |
|
1646 |
|
1647 /************************************************************************ |
|
1648 * * |
|
1649 * Commodity functions to handle entities * |
|
1650 * * |
|
1651 ************************************************************************/ |
|
1652 |
|
/*
 * growBuffer:
 * Doubles <buffer>_size and reallocates <buffer> to the new size.
 * When xmlRealloc() fails the old buffer is freed, the failure is
 * reported through htmlErrMemory(ctxt, ...) and the ENCLOSING function
 * returns NULL.
 *
 * The expansion site must therefore provide `ctxt` and `<buffer>_size`,
 * and must be a function returning a pointer.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *grownBuf;                                                  \
    buffer##_size = buffer##_size * 2;                                  \
    grownBuf = (xmlChar *) xmlRealloc(buffer,                           \
                                      buffer##_size * sizeof(xmlChar)); \
    if (grownBuf == NULL) {                                             \
        xmlFree(buffer);                                                \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        return(NULL);                                                   \
    }                                                                   \
    buffer = grownBuf;                                                  \
}
|
1667 |
|
1668 /** |
|
1669 * htmlEntityLookup: |
|
1670 * @param name the entity name |
|
1671 * |
|
1672 * Lookup the given entity in EntitiesTable |
|
1673 * |
|
1674 |
|
1675 * |
|
1676 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. |
|
1677 */ |
|
1678 const htmlEntityDesc * |
|
1679 htmlEntityLookup(const xmlChar *name) { |
|
1680 unsigned int i; |
|
1681 |
|
1682 for (i = 0;i < (sizeof(html40EntitiesTable)/ |
|
1683 sizeof(html40EntitiesTable[0]));i++) { |
|
1684 if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) { |
|
1685 return((htmlEntityDescPtr) &html40EntitiesTable[i]); |
|
1686 } |
|
1687 } |
|
1688 return(NULL); |
|
1689 } |
|
1690 |
|
1691 /** |
|
1692 * htmlEntityValueLookup: |
|
1693 * @param value the entity's unicode value |
|
1694 * |
|
1695 * Lookup the given entity in EntitiesTable |
|
1696 * |
|
1697 |
|
1698 * |
|
1699 * Returns the associated htmlEntityDescPtr if found, NULL otherwise. |
|
1700 */ |
|
1701 const htmlEntityDesc * |
|
1702 htmlEntityValueLookup(unsigned int value) { |
|
1703 unsigned int i; |
|
1704 |
|
1705 for (i = 0;i < (sizeof(html40EntitiesTable)/ |
|
1706 sizeof(html40EntitiesTable[0]));i++) { |
|
1707 if (html40EntitiesTable[i].value >= value) { |
|
1708 if (html40EntitiesTable[i].value > value) |
|
1709 break; |
|
1710 return((htmlEntityDescPtr) &html40EntitiesTable[i]); |
|
1711 } |
|
1712 } |
|
1713 return(NULL); |
|
1714 } |
|
1715 |
|
1716 /** |
|
1717 * UTF8ToHtml: |
|
1718 * @param out a pointer to an array of bytes to store the result |
|
1719 * @param outlen the length of out |
|
1720 * @param in a pointer to an array of UTF-8 chars |
|
1721 * @param inlen the length of in |
|
1722 * |
|
1723 * Take a block of UTF-8 chars in and try to convert it to an ASCII |
|
1724 * plus HTML entities block of chars out. |
|
1725 * |
|
1726 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise |
|
1727 * The value of inlen after return is the number of octets consumed |
|
1728 * as the return value is positive, else unpredictable. |
|
1729 * The value of outlen after return is the number of octets consumed. |
|
1730 */ |
|
1731 int |
|
1732 UTF8ToHtml(unsigned char* out, int *outlen, |
|
1733 const unsigned char* in, int *inlen) { |
|
1734 const unsigned char* processed = in; |
|
1735 const unsigned char* outend; |
|
1736 const unsigned char* outstart = out; |
|
1737 const unsigned char* instart = in; |
|
1738 const unsigned char* inend; |
|
1739 unsigned int c, d; |
|
1740 int trailing; |
|
1741 |
|
1742 if (in == NULL) { |
|
1743 /* |
|
1744 * initialization nothing to do |
|
1745 */ |
|
1746 *outlen = 0; |
|
1747 *inlen = 0; |
|
1748 return(0); |
|
1749 } |
|
1750 inend = in + (*inlen); |
|
1751 outend = out + (*outlen); |
|
1752 while (in < inend) { |
|
1753 d = *in++; |
|
1754 if (d < 0x80) { c= d; trailing= 0; } |
|
1755 else if (d < 0xC0) { |
|
1756 /* trailing byte in leading position */ |
|
1757 *outlen = out - outstart; |
|
1758 *inlen = processed - instart; |
|
1759 return(-2); |
|
1760 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } |
|
1761 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } |
|
1762 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } |
|
1763 else { |
|
1764 /* no chance for this in Ascii */ |
|
1765 *outlen = out - outstart; |
|
1766 *inlen = processed - instart; |
|
1767 return(-2); |
|
1768 } |
|
1769 |
|
1770 if (inend - in < trailing) { |
|
1771 break; |
|
1772 } |
|
1773 |
|
1774 for ( ; trailing; trailing--) { |
|
1775 if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) |
|
1776 break; |
|
1777 c <<= 6; |
|
1778 c |= d & 0x3F; |
|
1779 } |
|
1780 |
|
1781 /* assertion: c is a single UTF-4 value */ |
|
1782 if (c < 0x80) { |
|
1783 if (out + 1 >= outend) |
|
1784 break; |
|
1785 *out++ = c; |
|
1786 } else { |
|
1787 int len; |
|
1788 const htmlEntityDesc * ent; |
|
1789 |
|
1790 /* |
|
1791 * Try to lookup a predefined HTML entity for it |
|
1792 */ |
|
1793 |
|
1794 ent = htmlEntityValueLookup(c); |
|
1795 if (ent == NULL) { |
|
1796 /* no chance for this in Ascii */ |
|
1797 *outlen = out - outstart; |
|
1798 *inlen = processed - instart; |
|
1799 return(-2); |
|
1800 } |
|
1801 len = strlen(ent->name); |
|
1802 if (out + 2 + len >= outend) |
|
1803 break; |
|
1804 *out++ = '&'; |
|
1805 memcpy(out, ent->name, len); |
|
1806 out += len; |
|
1807 *out++ = ';'; |
|
1808 } |
|
1809 processed = in; |
|
1810 } |
|
1811 *outlen = out - outstart; |
|
1812 *inlen = processed - instart; |
|
1813 return(0); |
|
1814 } |
|
1815 |
|
1816 /** |
|
1817 * htmlEncodeEntities: |
|
1818 * @param out a pointer to an array of bytes to store the result |
|
1819 * @param outlen the length of out |
|
1820 * @param in a pointer to an array of UTF-8 chars |
|
1821 * @param inlen the length of in |
|
1822 * @param quoteChar the quote character to escape (' or ") or zero. |
|
1823 * |
|
1824 * Take a block of UTF-8 chars in and try to convert it to an ASCII |
|
1825 * plus HTML entities block of chars out. |
|
1826 * |
|
1827 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise |
|
1828 * The value of inlen after return is the number of octets consumed |
|
1829 * as the return value is positive, else unpredictable. |
|
1830 * The value of outlen after return is the number of octets consumed. |
|
1831 */ |
|
1832 int |
|
1833 htmlEncodeEntities(unsigned char* out, int *outlen, |
|
1834 const unsigned char* in, int *inlen, int quoteChar) { |
|
1835 const unsigned char* processed = in; |
|
1836 const unsigned char* outend = out + (*outlen); |
|
1837 const unsigned char* outstart = out; |
|
1838 const unsigned char* instart = in; |
|
1839 const unsigned char* inend = in + (*inlen); |
|
1840 unsigned int c, d; |
|
1841 int trailing; |
|
1842 |
|
1843 while (in < inend) { |
|
1844 d = *in++; |
|
1845 if (d < 0x80) { c= d; trailing= 0; } |
|
1846 else if (d < 0xC0) { |
|
1847 /* trailing byte in leading position */ |
|
1848 *outlen = out - outstart; |
|
1849 *inlen = processed - instart; |
|
1850 return(-2); |
|
1851 } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } |
|
1852 else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } |
|
1853 else if (d < 0xF8) { c= d & 0x07; trailing= 3; } |
|
1854 else { |
|
1855 /* no chance for this in Ascii */ |
|
1856 *outlen = out - outstart; |
|
1857 *inlen = processed - instart; |
|
1858 return(-2); |
|
1859 } |
|
1860 |
|
1861 if (inend - in < trailing) |
|
1862 break; |
|
1863 |
|
1864 while (trailing--) { |
|
1865 if (((d= *in++) & 0xC0) != 0x80) { |
|
1866 *outlen = out - outstart; |
|
1867 *inlen = processed - instart; |
|
1868 return(-2); |
|
1869 } |
|
1870 c <<= 6; |
|
1871 c |= d & 0x3F; |
|
1872 } |
|
1873 |
|
1874 /* assertion: c is a single UTF-4 value */ |
|
1875 if ((c < 0x80) && (c != (unsigned int) quoteChar) && |
|
1876 (c != '&') && (c != '<') && (c != '>')) { |
|
1877 if (out >= outend) |
|
1878 break; |
|
1879 *out++ = c; |
|
1880 } else { |
|
1881 const htmlEntityDesc * ent; |
|
1882 const char *cp; |
|
1883 char nbuf[16]; |
|
1884 int len; |
|
1885 |
|
1886 /* |
|
1887 * Try to lookup a predefined HTML entity for it |
|
1888 */ |
|
1889 ent = htmlEntityValueLookup(c); |
|
1890 if (ent == NULL) { |
|
1891 snprintf(nbuf, sizeof(nbuf), "#%u", c); |
|
1892 cp = nbuf; |
|
1893 } |
|
1894 else |
|
1895 cp = ent->name; |
|
1896 len = strlen(cp); |
|
1897 if (out + 2 + len > outend) |
|
1898 break; |
|
1899 *out++ = '&'; |
|
1900 memcpy(out, cp, len); |
|
1901 out += len; |
|
1902 *out++ = ';'; |
|
1903 } |
|
1904 processed = in; |
|
1905 } |
|
1906 *outlen = out - outstart; |
|
1907 *inlen = processed - instart; |
|
1908 return(0); |
|
1909 } |
|
1910 |
|
1911 /************************************************************************ |
|
1912 * * |
|
1913 * Commodity functions to handle streams * |
|
1914 * * |
|
1915 ************************************************************************/ |
|
1916 |
|
1917 /** |
|
1918 * htmlNewInputStream: |
|
1919 * @param ctxt an HTML parser context |
|
1920 * |
|
1921 * Create a new input stream structure |
|
1922 * Returns the new input stream or NULL |
|
1923 */ |
|
1924 static htmlParserInputPtr |
|
1925 htmlNewInputStream(htmlParserCtxtPtr ctxt) { |
|
1926 htmlParserInputPtr input; |
|
1927 |
|
1928 input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput)); |
|
1929 if (input == NULL) { |
|
1930 htmlErrMemory(ctxt, "couldn't allocate a new input stream\n"); |
|
1931 return(NULL); |
|
1932 } |
|
1933 memset(input, 0, sizeof(htmlParserInput)); |
|
1934 input->filename = NULL; |
|
1935 input->directory = NULL; |
|
1936 input->base = NULL; |
|
1937 input->cur = NULL; |
|
1938 input->buf = NULL; |
|
1939 input->line = 1; |
|
1940 input->col = 1; |
|
1941 input->buf = NULL; |
|
1942 input->free = NULL; |
|
1943 input->version = NULL; |
|
1944 input->consumed = 0; |
|
1945 input->length = 0; |
|
1946 return(input); |
|
1947 } |
|
1948 |
|
1949 |
|
1950 /************************************************************************ |
|
1951 * * |
|
1952 * Commodity functions, cleanup needed ? * |
|
1953 * * |
|
1954 ************************************************************************/ |
|
/*
 * All tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
|
1969 |
|
1970 /** |
|
1971 * areBlanks: |
|
1972 * @param ctxt an HTML parser context |
|
1973 * @param str a xmlChar * |
|
1974 * @param len the size of str |
|
1975 * |
|
1976 * Is this a sequence of blank chars that one can ignore ? |
|
1977 * |
|
1978 * Returns 1 if ignorable 0 otherwise. |
|
1979 */ |
|
1980 |
|
1981 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { |
|
1982 unsigned int i; |
|
1983 int j; |
|
1984 xmlNodePtr lastChild; |
|
1985 |
|
1986 for (j = 0;j < len;j++) |
|
1987 if (!(IS_BLANK_CH(str[j]))) return(0); |
|
1988 |
|
1989 if (CUR == 0) return(1); |
|
1990 if (CUR != '<') return(0); |
|
1991 if (ctxt->name == NULL) |
|
1992 return(1); |
|
1993 if (xmlStrEqual(ctxt->name, BAD_CAST"html")) |
|
1994 return(1); |
|
1995 if (xmlStrEqual(ctxt->name, BAD_CAST"head")) |
|
1996 return(1); |
|
1997 if (xmlStrEqual(ctxt->name, BAD_CAST"body")) |
|
1998 return(1); |
|
1999 if (ctxt->node == NULL) return(0); |
|
2000 lastChild = xmlGetLastChild(ctxt->node); |
|
2001 while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) |
|
2002 lastChild = lastChild->prev; |
|
2003 if (lastChild == NULL) { |
|
2004 if ((ctxt->node->type != XML_ELEMENT_NODE) && |
|
2005 (ctxt->node->content != NULL)) return(0); |
|
2006 /* keep ws in constructs like ...<b> </b>... |
|
2007 for all tags "b" allowing PCDATA */ |
|
2008 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { |
|
2009 if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) { |
|
2010 return(0); |
|
2011 } |
|
2012 } |
|
2013 } else if (xmlNodeIsText(lastChild)) { |
|
2014 return(0); |
|
2015 } else { |
|
2016 /* keep ws in constructs like <p><b>xy</b> <i>z</i><p> |
|
2017 for all tags "p" allowing PCDATA */ |
|
2018 for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) { |
|
2019 if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) { |
|
2020 return(0); |
|
2021 } |
|
2022 } |
|
2023 } |
|
2024 return(1); |
|
2025 } |
|
2026 #endif /* defined(LIBXML_HTML_ENABLED */ |
|
2027 |
|
2028 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) |
|
2029 |
|
2030 /** |
|
2031 * htmlErrMemory: |
|
2032 * @param ctxt an HTML parser context |
|
2033 * @param extra extra informations |
|
2034 * |
|
2035 * Handle a redefinition of attribute error |
|
2036 */ |
|
2037 static void |
|
2038 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra) |
|
2039 { |
|
2040 if ((ctxt != NULL) && (ctxt->disableSAX != 0) && |
|
2041 (ctxt->instate == XML_PARSER_EOF)) |
|
2042 return; |
|
2043 if (ctxt != NULL) { |
|
2044 ctxt->errNo = XML_ERR_NO_MEMORY; |
|
2045 ctxt->instate = XML_PARSER_EOF; |
|
2046 ctxt->disableSAX = 1; |
|
2047 } |
|
2048 if (extra) |
|
2049 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, |
|
2050 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra, |
|
2051 NULL, NULL, 0, 0, |
|
2052 "Memory allocation failed : %s\n", extra); |
|
2053 else |
|
2054 __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER, |
|
2055 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL, |
|
2056 NULL, NULL, 0, 0, "Memory allocation failed\n"); |
|
2057 } |
|
2058 |
|
2059 /** |
|
2060 * htmlNewDocNoDtD: |
|
2061 * @param URI URI for the dtd, or NULL |
|
2062 * @param ExternalID the external ID of the DTD, or NULL |
|
2063 * |
|
2064 * Creates a new HTML document without a DTD node if URI and ExternalID |
|
2065 * are NULL |
|
2066 * |
|
2067 * Returns a new document, do not initialize the DTD if not provided |
|
2068 */ |
|
2069 XMLPUBFUNEXPORT htmlDocPtr |
|
2070 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) { |
|
2071 xmlDocPtr cur; |
|
2072 |
|
2073 /* |
|
2074 * Allocate a new document and fill the fields. |
|
2075 */ |
|
2076 cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc)); |
|
2077 if (cur == NULL) { |
|
2078 htmlErrMemory(NULL, "HTML document creation failed\n"); |
|
2079 return(NULL); |
|
2080 } |
|
2081 memset(cur, 0, sizeof(xmlDoc)); |
|
2082 |
|
2083 cur->type = XML_HTML_DOCUMENT_NODE; |
|
2084 #ifdef XE_ENABLE_GS_CACHING |
|
2085 cur->cachedGs = xmlGetGlobalState(); |
|
2086 #endif |
|
2087 |
|
2088 //cur->version = NULL; |
|
2089 //cur->intSubset = NULL; |
|
2090 cur->doc = cur; |
|
2091 //cur->name = NULL; |
|
2092 //cur->children = NULL; |
|
2093 //cur->extSubset = NULL; |
|
2094 //cur->oldNs = NULL; |
|
2095 //cur->encoding = NULL; |
|
2096 cur->standalone = 1; |
|
2097 //cur->compression = 0; |
|
2098 //cur->ids = NULL; |
|
2099 //cur->refs = NULL; |
|
2100 //cur->_private = NULL; |
|
2101 |
|
2102 if (ExternalID || URI) |
|
2103 xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI); |
|
2104 |
|
2105 return(cur); |
|
2106 } |
|
2107 |
|
2108 /** |
|
2109 * htmlNewDoc: |
|
2110 * @param URI URI for the dtd, or NULL |
|
2111 * @param ExternalID the external ID of the DTD, or NULL |
|
2112 * |
|
2113 * Creates a new HTML document |
|
2114 * |
|
2115 * Returns a new document |
|
2116 */ |
|
2117 XMLPUBFUNEXPORT htmlDocPtr |
|
2118 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) { |
|
2119 if ((URI == NULL) && (ExternalID == NULL)) |
|
2120 return(htmlNewDocNoDtD( |
|
2121 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd", |
|
2122 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN")); |
|
2123 |
|
2124 return(htmlNewDocNoDtD(URI, ExternalID)); |
|
2125 } |
|
2126 |
|
2127 /** |
|
2128 * htmlTagLookup: |
|
2129 * @param tag The tag name in lowercase |
|
2130 * |
|
2131 * Lookup the HTML tag in the ElementTable |
|
2132 * |
|
2133 * Returns the related htmlElemDescPtr or NULL if not found. |
|
2134 */ |
|
2135 XMLPUBFUNEXPORT const htmlElemDesc * |
|
2136 htmlTagLookup(const xmlChar *tag) { |
|
2137 unsigned int i; |
|
2138 |
|
2139 for (i = 0; i < (sizeof(html40ElementTable) / |
|
2140 sizeof(html40ElementTable[0]));i++) { |
|
2141 if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name)) |
|
2142 return((htmlElemDescPtr) &html40ElementTable[i]); |
|
2143 } |
|
2144 return(NULL); |
|
2145 } |
|
2146 |
|
2147 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */ |
|
2148 |
|
2149 #if defined(LIBXML_HTML_ENABLED) |
|
2150 |
|
2151 /************************************************************************ |
|
2152 * * |
|
2153 * The parser itself * |
|
2154 * Relates to http://www.w3.org/TR/html40 * |
|
2155 * * |
|
2156 ************************************************************************/ |
|
2157 |
|
2158 /************************************************************************ |
|
2159 * * |
|
2160 * The parser itself * |
|
2161 * * |
|
2162 ************************************************************************/ |
|
2163 |
|
2164 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); |
|
2165 |
|
2166 /** |
|
2167 * htmlParseHTMLName: |
|
2168 * @param ctxt an HTML parser context |
|
2169 * |
|
2170 * parse an HTML tag or attribute name, note that we convert it to lowercase |
|
2171 * since HTML names are not case-sensitive. |
|
2172 * |
|
2173 * Returns the Tag Name parsed or NULL |
|
2174 */ |
|
2175 |
|
2176 static const xmlChar * |
|
2177 htmlParseHTMLName(htmlParserCtxtPtr ctxt) { |
|
2178 int i = 0; |
|
2179 xmlChar loc[HTML_PARSER_BUFFER_SIZE]; |
|
2180 |
|
2181 if (!IS_LETTER_CH(CUR) && (CUR != '_') && |
|
2182 (CUR != ':')) return(NULL); |
|
2183 |
|
2184 while ((i < HTML_PARSER_BUFFER_SIZE) && |
|
2185 ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) || |
|
2186 (CUR == ':') || (CUR == '-') || (CUR == '_'))) { |
|
2187 if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20; |
|
2188 else loc[i] = CUR; |
|
2189 i++; |
|
2190 |
|
2191 NEXT; |
|
2192 } |
|
2193 |
|
2194 return(xmlDictLookup(ctxt->dict, loc, i)); |
|
2195 } |
|
2196 |
|
2197 /** |
|
2198 * htmlParseName: |
|
2199 * @param ctxt an HTML parser context |
|
2200 * |
|
2201 * parse an HTML name, this routine is case sensitive. |
|
2202 * |
|
2203 * Returns the Name parsed or NULL |
|
2204 */ |
|
2205 |
|
2206 static const xmlChar * |
|
2207 htmlParseName(htmlParserCtxtPtr ctxt) { |
|
2208 const xmlChar *in; |
|
2209 const xmlChar *ret; |
|
2210 int count = 0; |
|
2211 |
|
2212 GROW; |
|
2213 |
|
2214 /* |
|
2215 * Accelerator for simple ASCII names |
|
2216 */ |
|
2217 in = ctxt->input->cur; |
|
2218 if (((*in >= 0x61) && (*in <= 0x7A)) || |
|
2219 ((*in >= 0x41) && (*in <= 0x5A)) || |
|
2220 (*in == '_') || (*in == ':')) { |
|
2221 in++; |
|
2222 while (((*in >= 0x61) && (*in <= 0x7A)) || |
|
2223 ((*in >= 0x41) && (*in <= 0x5A)) || |
|
2224 ((*in >= 0x30) && (*in <= 0x39)) || |
|
2225 (*in == '_') || (*in == '-') || |
|
2226 (*in == ':') || (*in == '.')) |
|
2227 in++; |
|
2228 if ((*in > 0) && (*in < 0x80)) { |
|
2229 count = in - ctxt->input->cur; |
|
2230 ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count); |
|
2231 ctxt->input->cur = in; |
|
2232 ctxt->nbChars += count; |
|
2233 ctxt->input->col += count; |
|
2234 return(ret); |
|
2235 } |
|
2236 } |
|
2237 return(htmlParseNameComplex(ctxt)); |
|
2238 } |
|
2239 |
|
2240 static const xmlChar * |
|
2241 htmlParseNameComplex(xmlParserCtxtPtr ctxt) { |
|
2242 int len = 0, l; |
|
2243 int c; |
|
2244 int count = 0; |
|
2245 |
|
2246 /* |
|
2247 * Handler for more complex cases |
|
2248 */ |
|
2249 GROW; |
|
2250 c = CUR_CHAR(l); |
|
2251 if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */ |
|
2252 (!IS_LETTER(c) && (c != '_') && |
|
2253 (c != ':'))) { |
|
2254 return(NULL); |
|
2255 } |
|
2256 |
|
2257 while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */ |
|
2258 ((IS_LETTER(c)) || (IS_DIGIT(c)) || |
|
2259 (c == '.') || (c == '-') || |
|
2260 (c == '_') || (c == ':') || |
|
2261 (IS_COMBINING(c)) || |
|
2262 (IS_EXTENDER(c)))) { |
|
2263 if (count++ > 100) { |
|
2264 count = 0; |
|
2265 GROW; |
|
2266 } |
|
2267 len += l; |
|
2268 NEXTL(l); |
|
2269 c = CUR_CHAR(l); |
|
2270 } |
|
2271 return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len)); |
|
2272 } |
|
2273 |
|
2274 |
|
2275 /** |
|
2276 * htmlParseHTMLAttribute: |
|
2277 * @param ctxt an HTML parser context |
|
2278 * @param stop a char stop value |
|
2279 * |
|
2280 * parse an HTML attribute value till the stop (quote), if |
|
2281 * stop is 0 then it stops at the first space |
|
2282 * |
|
2283 * Returns the attribute parsed or NULL |
|
2284 */ |
|
2285 |
|
2286 static xmlChar * |
|
2287 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) { |
|
2288 xmlChar *buffer = NULL; |
|
2289 int buffer_size = 0; |
|
2290 xmlChar *out = NULL; |
|
2291 const xmlChar *name = NULL; |
|
2292 const xmlChar *cur = NULL; |
|
2293 const htmlEntityDesc * ent; |
|
2294 |
|
2295 /* |
|
2296 * allocate a translation buffer. |
|
2297 */ |
|
2298 buffer_size = HTML_PARSER_BUFFER_SIZE; |
|
2299 buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar)); |
|
2300 if (buffer == NULL) { |
|
2301 htmlErrMemory(ctxt, "buffer allocation failed\n"); |
|
2302 return(NULL); |
|
2303 } |
|
2304 out = buffer; |
|
2305 |
|
2306 /* |
|
2307 * Ok loop until we reach one of the ending chars |
|
2308 */ |
|
2309 while ((CUR != 0) && (CUR != stop)) { |
|
2310 if ((stop == 0) && (CUR == '>')) break; |
|
2311 if ((stop == 0) && (IS_BLANK_CH(CUR))) break; |
|
2312 if (CUR == '&') { |
|
2313 if (NXT(1) == '#') { |
|
2314 unsigned int c; |
|
2315 int bits; |
|
2316 |
|
2317 c = htmlParseCharRef(ctxt); |
|
2318 if (c < 0x80) |
|
2319 { *out++ = c; bits= -6; } |
|
2320 else if (c < 0x800) |
|
2321 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } |
|
2322 else if (c < 0x10000) |
|
2323 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } |
|
2324 else |
|
2325 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } |
|
2326 |
|
2327 for ( ; bits >= 0; bits-= 6) { |
|
2328 *out++ = ((c >> bits) & 0x3F) | 0x80; |
|
2329 } |
|
2330 |
|
2331 if (out - buffer > buffer_size - 100) { |
|
2332 int indx = out - buffer; |
|
2333 |
|
2334 growBuffer(buffer); |
|
2335 out = &buffer[indx]; |
|
2336 } |
|
2337 } else { |
|
2338 ent = htmlParseEntityRef(ctxt, &name); |
|
2339 if (name == NULL) { |
|
2340 *out++ = '&'; |
|
2341 if (out - buffer > buffer_size - 100) { |
|
2342 int indx = out - buffer; |
|
2343 |
|
2344 growBuffer(buffer); |
|
2345 out = &buffer[indx]; |
|
2346 } |
|
2347 } else if (ent == NULL) { |
|
2348 *out++ = '&'; |
|
2349 cur = name; |
|
2350 while (*cur != 0) { |
|
2351 if (out - buffer > buffer_size - 100) { |
|
2352 int indx = out - buffer; |
|
2353 |
|
2354 growBuffer(buffer); |
|
2355 out = &buffer[indx]; |
|
2356 } |
|
2357 *out++ = *cur++; |
|
2358 } |
|
2359 } else { |
|
2360 unsigned int c; |
|
2361 int bits; |
|
2362 |
|
2363 if (out - buffer > buffer_size - 100) { |
|
2364 int indx = out - buffer; |
|
2365 |
|
2366 growBuffer(buffer); |
|
2367 out = &buffer[indx]; |
|
2368 } |
|
2369 c = (xmlChar)ent->value; |
|
2370 if (c < 0x80) |
|
2371 { *out++ = c; bits= -6; } |
|
2372 else if (c < 0x800) |
|
2373 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } |
|
2374 else if (c < 0x10000) |
|
2375 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } |
|
2376 else |
|
2377 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } |
|
2378 |
|
2379 for ( ; bits >= 0; bits-= 6) { |
|
2380 *out++ = ((c >> bits) & 0x3F) | 0x80; |
|
2381 } |
|
2382 } |
|
2383 } |
|
2384 } else { |
|
2385 unsigned int c; |
|
2386 int bits, l; |
|
2387 |
|
2388 if (out - buffer > buffer_size - 100) { |
|
2389 int indx = out - buffer; |
|
2390 |
|
2391 growBuffer(buffer); |
|
2392 out = &buffer[indx]; |
|
2393 } |
|
2394 c = CUR_CHAR(l); |
|
2395 if (c < 0x80) |
|
2396 { *out++ = c; bits= -6; } |
|
2397 else if (c < 0x800) |
|
2398 { *out++ =((c >> 6) & 0x1F) | 0xC0; bits= 0; } |
|
2399 else if (c < 0x10000) |
|
2400 { *out++ =((c >> 12) & 0x0F) | 0xE0; bits= 6; } |
|
2401 else |
|
2402 { *out++ =((c >> 18) & 0x07) | 0xF0; bits= 12; } |
|
2403 |
|
2404 for ( ; bits >= 0; bits-= 6) { |
|
2405 *out++ = ((c >> bits) & 0x3F) | 0x80; |
|
2406 } |
|
2407 NEXT; |
|
2408 } |
|
2409 } |
|
2410 *out++ = 0; |
|
2411 return(buffer); |
|
2412 } |
|
2413 |
|
2414 /** |
|
2415 * htmlParseEntityRef: |
|
2416 * @param ctxt an HTML parser context |
|
2417 * @param str location to store the entity name |
|
2418 * |
|
2419 * parse an HTML ENTITY references |
|
2420 * |
|
2421 * [68] EntityRef ::= '&' Name ';' |
|
2422 * |
|
2423 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise, |
|
2424 * if non-NULL *str will have to be freed by the caller. |
|
2425 */ |
|
2426 const htmlEntityDesc * |
|
2427 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) { |
|
2428 const xmlChar *name; |
|
2429 const htmlEntityDesc * ent = NULL; |
|
2430 *str = NULL; |
|
2431 |
|
2432 if (CUR == '&') { |
|
2433 NEXT; |
|
2434 name = htmlParseName(ctxt); |
|
2435 if (name == NULL) { |
|
2436 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, |
|
2437 "htmlParseEntityRef: no name\n", NULL, NULL); |
|
2438 } else { |
|
2439 GROW; |
|
2440 if (CUR == ';') { |
|
2441 *str = name; |
|
2442 |
|
2443 /* |
|
2444 * Lookup the entity in the table. |
|
2445 */ |
|
2446 ent = htmlEntityLookup(name); |
|
2447 if (ent != NULL) /* OK that's ugly !!! */ |
|
2448 NEXT; |
|
2449 } else { |
|
2450 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING, |
|
2451 "htmlParseEntityRef: expecting ';'\n", |
|
2452 NULL, NULL); |
|
2453 *str = name; |
|
2454 } |
|
2455 } |
|
2456 } |
|
2457 return(ent); |
|
2458 } |
|
2459 |
|
2460 /** |
|
2461 * htmlParseAttValue: |
|
2462 * @param ctxt an HTML parser context |
|
2463 * |
|
2464 * parse a value for an attribute |
|
2465 * Note: the parser won't do substitution of entities here, this |
|
2466 * will be handled later in xmlStringGetNodeList, unless it was |
|
2467 * asked for ctxt->replaceEntities != 0 |
|
2468 * |
|
2469 * Returns the AttValue parsed or NULL. |
|
2470 */ |
|
2471 |
|
2472 static xmlChar * |
|
2473 htmlParseAttValue(htmlParserCtxtPtr ctxt) { |
|
2474 xmlChar *ret = NULL; |
|
2475 |
|
2476 if (CUR == '"') { |
|
2477 NEXT; |
|
2478 ret = htmlParseHTMLAttribute(ctxt, '"'); |
|
2479 if (CUR != '"') { |
|
2480 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, |
|
2481 "AttValue: \" expected\n", NULL, NULL); |
|
2482 } else |
|
2483 NEXT; |
|
2484 } else if (CUR == '\'') { |
|
2485 NEXT; |
|
2486 ret = htmlParseHTMLAttribute(ctxt, '\''); |
|
2487 if (CUR != '\'') { |
|
2488 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED, |
|
2489 "AttValue: ' expected\n", NULL, NULL); |
|
2490 } else |
|
2491 NEXT; |
|
2492 } else { |
|
2493 /* |
|
2494 * That's an HTMLism, the attribute value may not be quoted |
|
2495 */ |
|
2496 ret = htmlParseHTMLAttribute(ctxt, 0); |
|
2497 if (ret == NULL) { |
|
2498 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE, |
|
2499 "AttValue: no value found\n", NULL, NULL); |
|
2500 } |
|
2501 } |
|
2502 return(ret); |
|
2503 } |
|
2504 |
|
2505 /** |
|
2506 * htmlParseSystemLiteral: |
|
2507 * @param ctxt an HTML parser context |
|
2508 * |
|
2509 * parse an HTML Literal |
|
2510 * |
|
2511 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") |
|
2512 * |
|
2513 * Returns the SystemLiteral parsed or NULL |
|
2514 */ |
|
2515 |
|
2516 static xmlChar * |
|
2517 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { |
|
2518 const xmlChar *q; |
|
2519 xmlChar *ret = NULL; |
|
2520 |
|
2521 if (CUR == '"') { |
|
2522 NEXT; |
|
2523 q = CUR_PTR; |
|
2524 while ((IS_CHAR_CH(CUR)) && (CUR != '"')) |
|
2525 NEXT; |
|
2526 if (!IS_CHAR_CH(CUR)) { |
|
2527 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
|
2528 "Unfinished SystemLiteral\n", NULL, NULL); |
|
2529 } else { |
|
2530 ret = xmlStrndup(q, CUR_PTR - q); |
|
2531 NEXT; |
|
2532 } |
|
2533 } else if (CUR == '\'') { |
|
2534 NEXT; |
|
2535 q = CUR_PTR; |
|
2536 while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) |
|
2537 NEXT; |
|
2538 if (!IS_CHAR_CH(CUR)) { |
|
2539 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
|
2540 "Unfinished SystemLiteral\n", NULL, NULL); |
|
2541 } else { |
|
2542 ret = xmlStrndup(q, CUR_PTR - q); |
|
2543 NEXT; |
|
2544 } |
|
2545 } else { |
|
2546 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, |
|
2547 " or ' expected\n", NULL, NULL); |
|
2548 } |
|
2549 |
|
2550 return(ret); |
|
2551 } |
|
2552 |
|
2553 /** |
|
2554 * htmlParsePubidLiteral: |
|
2555 * @param ctxt an HTML parser context |
|
2556 * |
|
2557 * parse an HTML public literal |
|
2558 * |
|
2559 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'" |
|
2560 * |
|
2561 * Returns the PubidLiteral parsed or NULL. |
|
2562 */ |
|
2563 |
|
2564 static xmlChar * |
|
2565 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { |
|
2566 const xmlChar *q; |
|
2567 xmlChar *ret = NULL; |
|
2568 /* |
|
2569 * Name ::= (Letter | '_') (NameChar)* |
|
2570 */ |
|
2571 if (CUR == '"') { |
|
2572 NEXT; |
|
2573 q = CUR_PTR; |
|
2574 while (IS_PUBIDCHAR_CH(CUR)) NEXT; |
|
2575 if (CUR != '"') { |
|
2576 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
|
2577 "Unfinished PubidLiteral\n", NULL, NULL); |
|
2578 } else { |
|
2579 ret = xmlStrndup(q, CUR_PTR - q); |
|
2580 NEXT; |
|
2581 } |
|
2582 } else if (CUR == '\'') { |
|
2583 NEXT; |
|
2584 q = CUR_PTR; |
|
2585 while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')) |
|
2586 NEXT; |
|
2587 if (CUR != '\'') { |
|
2588 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, |
|
2589 "Unfinished PubidLiteral\n", NULL, NULL); |
|
2590 } else { |
|
2591 ret = xmlStrndup(q, CUR_PTR - q); |
|
2592 NEXT; |
|
2593 } |
|
2594 } else { |
|
2595 htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, |
|
2596 "PubidLiteral \" or ' expected\n", NULL, NULL); |
|
2597 } |
|
2598 |
|
2599 return(ret); |
|
2600 } |
|
2601 |
|
2602 /** |
|
2603 * htmlParseScript: |
|
2604 * @param ctxt an HTML parser context |
|
2605 * |
|
2606 * parse the content of an HTML SCRIPT or STYLE element |
|
2607 * http://www.w3.org/TR/html4/sgml/dtd.html#Script |
|
2608 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet |
|
2609 * http://www.w3.org/TR/html4/types.html#type-script |
|
2610 * http://www.w3.org/TR/html4/types.html#h-6.15 |
|
2611 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1 |
|
2612 * |
|
2613 * Script data ( %Script; in the DTD) can be the content of the SCRIPT |
|
2614 * element and the value of intrinsic event attributes. User agents must |
|
2615 * not evaluate script data as HTML markup but instead must pass it on as |
|
2616 * data to a script engine. |
|
2617 * NOTES: |
|
2618 * - The content is passed like CDATA |
|
2619 * - the attributes for style and scripting "onXXX" are also described |
|
2620 * as CDATA but SGML allows entities references in attributes so their |
|
2621 * processing is identical as other attributes |
|
2622 */ |
|
2623 static void |
|
2624 htmlParseScript(htmlParserCtxtPtr ctxt) { |
|
2625 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1]; |
|
2626 int nbchar = 0; |
|
2627 xmlChar cur; |
|
2628 |
|
2629 SHRINK; |
|
2630 cur = CUR; |
|
2631 while (IS_CHAR_CH(cur)) { |
|
2632 if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') && |
|
2633 (NXT(3) == '-')) { |
|
2634 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { |
|
2635 if (ctxt->sax->cdataBlock!= NULL) { |
|
2636 /* |
|
2637 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE |
|
2638 */ |
|
2639 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); |
|
2640 } else if (ctxt->sax->characters != NULL) { |
|
2641 ctxt->sax->characters(ctxt->userData, buf, nbchar); |
|
2642 } |
|
2643 } |
|
2644 nbchar = 0; |
|
2645 htmlParseComment(ctxt); |
|
2646 cur = CUR; |
|
2647 continue; |
|
2648 } else if ((cur == '<') && (NXT(1) == '/')) { |
|
2649 /* |
|
2650 * One should break here, the specification is clear: |
|
2651 * Authors should therefore escape "</" within the content. |
|
2652 * Escape mechanisms are specific to each scripting or |
|
2653 * style sheet language. |
|
2654 */ |
|
2655 if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || |
|
2656 ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) |
|
2657 break; /* while */ |
|
2658 } |
|
2659 buf[nbchar++] = cur; |
|
2660 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { |
|
2661 if (ctxt->sax->cdataBlock!= NULL) { |
|
2662 /* |
|
2663 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE |
|
2664 */ |
|
2665 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); |
|
2666 } else if (ctxt->sax->characters != NULL) { |
|
2667 ctxt->sax->characters(ctxt->userData, buf, nbchar); |
|
2668 } |
|
2669 nbchar = 0; |
|
2670 } |
|
2671 NEXT; |
|
2672 cur = CUR; |
|
2673 } |
|
2674 if (!(IS_CHAR_CH(cur))) { |
|
2675 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, |
|
2676 "Invalid char in CDATA 0x%X\n", cur); |
|
2677 NEXT; |
|
2678 } |
|
2679 |
|
2680 if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { |
|
2681 if (ctxt->sax->cdataBlock!= NULL) { |
|
2682 /* |
|
2683 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE |
|
2684 */ |
|
2685 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar); |
|
2686 } else if (ctxt->sax->characters != NULL) { |
|
2687 ctxt->sax->characters(ctxt->userData, buf, nbchar); |
|
2688 } |
|
2689 } |
|
2690 } |
|
2691 |
|
2692 |
|
2693 /** |
|
2694 * htmlParseCharData: |
|
2695 * @param ctxt an HTML parser context |
|
2696 * |
|
2697 * parse a CharData section. |
|
2698 * if we are within a CDATA section ']]>' marks an end of section. |
|
2699 * |
|
2700 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) |
|
2701 */ |
|
2702 |
|
2703 static void |
|
2704 htmlParseCharData(htmlParserCtxtPtr ctxt) { |
|
2705 xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; |
|
2706 int nbchar = 0; |
|
2707 int cur, l; |
|
2708 |
|
2709 SHRINK; |
|
2710 cur = CUR_CHAR(l); |
|
2711 while (((cur != '<') || (ctxt->token == '<')) && |
|
2712 ((cur != '&') || (ctxt->token == '&')) && |
|
2713 (IS_CHAR(cur))) { |
|
2714 COPY_BUF(l,buf,nbchar,cur); |
|
2715 if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { |
|
2716 /* |
|
2717 * Ok the segment is to be consumed as chars. |
|
2718 */ |
|
2719 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { |
|
2720 if (areBlanks(ctxt, buf, nbchar)) { |
|
2721 if (ctxt->sax->ignorableWhitespace != NULL) |
|
2722 ctxt->sax->ignorableWhitespace(ctxt->userData, |
|
2723 buf, nbchar); |
|
2724 } else { |
|
2725 htmlCheckParagraph(ctxt); |
|
2726 if (ctxt->sax->characters != NULL) |
|
2727 ctxt->sax->characters(ctxt->userData, buf, nbchar); |
|
2728 } |
|
2729 } |
|
2730 nbchar = 0; |
|
2731 } |
|
2732 NEXTL(l); |
|
2733 cur = CUR_CHAR(l); |
|
2734 if (cur == 0) { |
|
2735 SHRINK; |
|
2736 GROW; |
|
2737 cur = CUR_CHAR(l); |
|
2738 } |
|
2739 } |
|
2740 if (nbchar != 0) { |
|
2741 /* |
|
2742 * Ok the segment is to be consumed as chars. |
|
2743 */ |
|
2744 if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { |
|
2745 if (areBlanks(ctxt, buf, nbchar)) { |
|
2746 if (ctxt->sax->ignorableWhitespace != NULL) |
|
2747 ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); |
|
2748 } else { |
|
2749 htmlCheckParagraph(ctxt); |
|
2750 if (ctxt->sax->characters != NULL) |
|
2751 ctxt->sax->characters(ctxt->userData, buf, nbchar); |
|
2752 } |
|
2753 } |
|
2754 } else { |
|
2755 /* |
|
2756 * Loop detection |
|
2757 */ |
|
2758 if (cur == 0) |
|
2759 ctxt->instate = XML_PARSER_EOF; |
|
2760 } |
|
2761 } |
|
2762 |
|
2763 /** |
|
2764 * htmlParseExternalID: |
|
2765 * @param ctxt an HTML parser context |
|
2766 * @param publicID a xmlChar** receiving PubidLiteral |
|
2767 * |
|
2768 * Parse an External ID or a Public ID |
|
2769 * |
|
2770 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral |
|
2771 * | 'PUBLIC' S PubidLiteral S SystemLiteral |
|
2772 * |
|
2773 * [83] PublicID ::= 'PUBLIC' S PubidLiteral |
|
2774 * |
|
2775 * Returns the function returns SystemLiteral and in the second |
|
2776 * case publicID receives PubidLiteral, is strict is off |
|
2777 * it is possible to return NULL and have publicID set. |
|
2778 */ |
|
2779 |
|
2780 static xmlChar * |
|
2781 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) { |
|
2782 xmlChar *URI = NULL; |
|
2783 |
|
2784 if ((UPPER == 'S') && (UPP(1) == 'Y') && |
|
2785 (UPP(2) == 'S') && (UPP(3) == 'T') && |
|
2786 (UPP(4) == 'E') && (UPP(5) == 'M')) { |
|
2787 SKIP(6); |
|
2788 if (!IS_BLANK_CH(CUR)) { |
|
2789 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, |
|
2790 "Space required after 'SYSTEM'\n", NULL, NULL); |
|
2791 } |
|
2792 SKIP_BLANKS; |
|
2793 URI = htmlParseSystemLiteral(ctxt); |
|
2794 if (URI == NULL) { |
|
2795 htmlParseErr(ctxt, XML_ERR_URI_REQUIRED, |
|
2796 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL); |
|
2797 } |
|
2798 } else if ((UPPER == 'P') && (UPP(1) == 'U') && |
|
2799 (UPP(2) == 'B') && (UPP(3) == 'L') && |
|
2800 (UPP(4) == 'I') && (UPP(5) == 'C')) { |
|
2801 SKIP(6); |
|
2802 if (!IS_BLANK_CH(CUR)) { |
|
2803 htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED, |
|
2804 "Space required after 'PUBLIC'\n", NULL, NULL); |
|
2805 } |
|
2806 SKIP_BLANKS; |
|
2807 *publicID = htmlParsePubidLiteral(ctxt); |
|
2808 if (*publicID == NULL) { |
|
2809 htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED, |
|
2810 "htmlParseExternalID: PUBLIC, no Public Identifier\n", |
|
2811 NULL, NULL); |
|
2812 } |
|
2813 SKIP_BLANKS; |
|
2814 if ((CUR == '"') || (CUR == '\'')) { |
|
2815 URI = htmlParseSystemLiteral(ctxt); |
|
2816 } |
|
2817 } |
|
2818 return(URI); |
|
2819 } |
|
2820 |
|
2821 /** |
|
2822 * htmlParseComment: |
|
2823 * @param ctxt an HTML parser context |
|
2824 * |
|
2825 * Parse an XML (SGML) comment <!-- .... --> |
|
2826 * |
|
2827 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->' |
|
2828 */ |
|
2829 static void |
|
2830 htmlParseComment(htmlParserCtxtPtr ctxt) |
|
2831 { |
|
2832 xmlChar* buf = NULL; |
|
2833 int len; |
|
2834 int size = HTML_PARSER_BUFFER_SIZE; |
|
2835 int q, ql; |
|
2836 int r, rl; |
|
2837 int cur, l; |
|
2838 xmlParserInputState state; |
|
2839 |
|
2840 /* |
|
2841 * Check that there is a comment right here. |
|
2842 */ |
|
2843 if ((RAW != '<') || (NXT(1) != '!') || |
|
2844 (NXT(2) != '-') || (NXT(3) != '-')) return; |
|
2845 |
|
2846 state = ctxt->instate; |
|
2847 ctxt->instate = XML_PARSER_COMMENT; |
|
2848 SHRINK; |
|
2849 SKIP(4); |
|
2850 buf = (xmlChar*) xmlMallocAtomic(size * sizeof(xmlChar)); |
|
2851 if (!buf) |
|
2852 goto OOM_exit; |
|
2853 // Now we must free 'buf' before returning |
|
2854 q = CUR_CHAR(ql); |
|
2855 NEXTL(ql); |
|
2856 r = CUR_CHAR(rl); |
|
2857 NEXTL(rl); |
|
2858 cur = CUR_CHAR(l); |
|
2859 len = 0; |
|
2860 while (IS_CHAR(cur) && |
|
2861 ((cur != '>') || (r != '-') || (q != '-'))) |
|
2862 { |
|
2863 if (len + 5 >= size) |
|
2864 { // DONE: Fix xmlRealloc |
|
2865 void* tmp; |
|
2866 size *= 2; |
|
2867 tmp = xmlRealloc(buf, size * sizeof(xmlChar)); |
|
2868 if (!tmp) |
|
2869 { |
|
2870 OOM: |
|
2871 xmlFree(buf); |
|
2872 OOM_exit: |
|
2873 htmlErrMemory(ctxt, "buffer allocation failed\n"); |
|
2874 ctxt->instate = state; |
|
2875 return; |
|
2876 } |
|
2877 buf = (xmlChar*) tmp; |
|
2878 } |
|
2879 COPY_BUF(ql,buf,len,q); |
|
2880 q = r; |
|
2881 ql = rl; |
|
2882 r = cur; |
|
2883 rl = l; |
|
2884 NEXTL(l); |
|
2885 cur = CUR_CHAR(l); |
|
2886 if (cur == 0) { |
|
2887 SHRINK; |
|
2888 GROW; |
|
2889 cur = CUR_CHAR(l); |
|
2890 } |
|
2891 } // end of "while good character and not the end of comment (-->)" |
|
2892 |
|
2893 buf[len] = 0; |
|
2894 if (!IS_CHAR(cur)) { |
|
2895 htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED, |
|
2896 "Comment not terminated \n<!--%.50s\n", buf, NULL); |
|
2897 xmlFree(buf); |
|
2898 } else { |
|
2899 NEXT; |
|
2900 if (ctxt->sax && |
|
2901 ctxt->sax->comment && |
|
2902 !ctxt->disableSAX) |
|
2903 { |
|
2904 ctxt->sax->comment(ctxt->userData, buf); |
|
2905 } |
|
2906 } |
|
2907 xmlFree(buf); |
|
2908 ctxt->instate = state; |
|
2909 } |
|
2910 |
|
2911 /** |
|
2912 * htmlParseCharRef: |
|
2913 * @param ctxt an HTML parser context |
|
2914 * |
|
2915 * parse Reference declarations |
|
2916 * |
|
2917 * [66] CharRef ::= '&#' [0-9]+ ';' | |
|
2918 * '&#x' [0-9a-fA-F]+ ';' |
|
2919 * |
|
2920 * Returns the value parsed (as an int) |
|
2921 */ |
|
2922 int |
|
2923 htmlParseCharRef(htmlParserCtxtPtr ctxt) { |
|
2924 int val = 0; |
|
2925 |
|
2926 if ((CUR == '&') && (NXT(1) == '#') && |
|
2927 ((NXT(2) == 'x') || NXT(2) == 'X')) { |
|
2928 SKIP(3); |
|
2929 while (CUR != ';') { |
|
2930 if ((CUR >= '0') && (CUR <= '9')) |
|
2931 val = val * 16 + (CUR - '0'); |
|
2932 else if ((CUR >= 'a') && (CUR <= 'f')) |
|
2933 val = val * 16 + (CUR - 'a') + 10; |
|
2934 else if ((CUR >= 'A') && (CUR <= 'F')) |
|
2935 val = val * 16 + (CUR - 'A') + 10; |
|
2936 else { |
|
2937 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, |
|
2938 "htmlParseCharRef: invalid hexadecimal value\n", |
|
2939 NULL, NULL); |
|
2940 return(0); |
|
2941 } |
|
2942 NEXT; |
|
2943 } |
|
2944 if (CUR == ';') |
|
2945 NEXT; |
|
2946 } else if ((CUR == '&') && (NXT(1) == '#')) { |
|
2947 SKIP(2); |
|
2948 while (CUR != ';') { |
|
2949 if ((CUR >= '0') && (CUR <= '9')) |
|
2950 val = val * 10 + (CUR - '0'); |
|
2951 else { |
|
2952 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, |
|
2953 "htmlParseCharRef: invalid decimal value\n", |
|
2954 NULL, NULL); |
|
2955 return(0); |
|
2956 } |
|
2957 NEXT; |
|
2958 } |
|
2959 if (CUR == ';') |
|
2960 NEXT; |
|
2961 } else { |
|
2962 htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF, |
|
2963 "htmlParseCharRef: invalid value\n", NULL, NULL); |
|
2964 } |
|
2965 /* |
|
2966 * Check the value IS_CHAR ... |
|
2967 */ |
|
2968 if (IS_CHAR(val)) { |
|
2969 return(val); |
|
2970 } else { |
|
2971 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, |
|
2972 "htmlParseCharRef: invalid xmlChar value %d\n", |
|
2973 val); |
|
2974 } |
|
2975 return(0); |
|
2976 } |
|
2977 |
|
2978 |
|
2979 /** |
|
2980 * htmlParseDocTypeDecl: |
|
2981 * @param ctxt an HTML parser context |
|
2982 * |
|
2983 * parse a DOCTYPE declaration |
|
2984 * |
|
2985 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? |
|
2986 * ('[' (markupdecl | PEReference | S)* ']' S?)? '>' |
|
2987 */ |
|
2988 |
|
2989 static void |
|
2990 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { |
|
2991 const xmlChar *name; |
|
2992 xmlChar *ExternalID = NULL; |
|
2993 xmlChar *URI = NULL; |
|
2994 |
|
2995 /* |
|
2996 * We know that '<!DOCTYPE' has been detected. |
|
2997 */ |
|
2998 SKIP(9); |
|
2999 |
|
3000 SKIP_BLANKS; |
|
3001 |
|
3002 /* |
|
3003 * Parse the DOCTYPE name. |
|
3004 */ |
|
3005 name = htmlParseName(ctxt); |
|
3006 if (name == NULL) { |
|
3007 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, |
|
3008 "htmlParseDocTypeDecl : no DOCTYPE name !\n", |
|
3009 NULL, NULL); |
|
3010 } |
|
3011 /* |
|
3012 * Check that upper(name) == "HTML" !!!!!!!!!!!!! |
|
3013 */ |
|
3014 |
|
3015 SKIP_BLANKS; |
|
3016 |
|
3017 /* |
|
3018 * Check for SystemID and ExternalID |
|
3019 */ |
|
3020 URI = htmlParseExternalID(ctxt, &ExternalID); |
|
3021 SKIP_BLANKS; |
|
3022 |
|
3023 /* |
|
3024 * We should be at the end of the DOCTYPE declaration. |
|
3025 */ |
|
3026 if (CUR != '>') { |
|
3027 htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, |
|
3028 "DOCTYPE improperly terminated\n", NULL, NULL); |
|
3029 /* We shouldn't try to resynchronize ... */ |
|
3030 } |
|
3031 NEXT; |
|
3032 |
|
3033 /* |
|
3034 * Create or update the document accordingly to the DOCTYPE |
|
3035 */ |
|
3036 if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) && |
|
3037 (!ctxt->disableSAX)) |
|
3038 ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI); |
|
3039 |
|
3040 /* |
|
3041 * Cleanup, since we don't use all those identifiers |
|
3042 */ |
|
3043 if (URI != NULL) xmlFree(URI); |
|
3044 if (ExternalID != NULL) xmlFree(ExternalID); |
|
3045 } |
|
3046 |
|
3047 /** |
|
3048 * htmlParseAttribute: |
|
3049 * @param ctxt an HTML parser context |
|
3050 * @param value a xmlChar ** used to store the value of the attribute |
|
3051 * |
|
3052 * parse an attribute |
|
3053 * |
|
3054 * [41] Attribute ::= Name Eq AttValue |
|
3055 * |
|
3056 * [25] Eq ::= S? '=' S? |
|
3057 * |
|
3058 * With namespace: |
|
3059 * |
|
3060 * [NS 11] Attribute ::= QName Eq AttValue |
|
3061 * |
|
3062 * Also the case QName == xmlns:??? is handled independently as a namespace |
|
3063 * definition. |
|
3064 * |
|
3065 * Returns the attribute name, and the value in *value. |
|
3066 */ |
|
3067 |
|
3068 static const xmlChar * |
|
3069 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { |
|
3070 const xmlChar *name; |
|
3071 xmlChar *val = NULL; |
|
3072 |
|
3073 *value = NULL; |
|
3074 name = htmlParseHTMLName(ctxt); |
|
3075 if (name == NULL) { |
|
3076 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, |
|
3077 "error parsing attribute name\n", NULL, NULL); |
|
3078 return(NULL); |
|
3079 } |
|
3080 |
|
3081 /* |
|
3082 * read the value |
|
3083 */ |
|
3084 SKIP_BLANKS; |
|
3085 if (CUR == '=') { |
|
3086 NEXT; |
|
3087 SKIP_BLANKS; |
|
3088 val = htmlParseAttValue(ctxt); |
|
3089 /****** |
|
3090 } else { |
|
3091 |
|
3092 if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) |
|
3093 ctxt->sax->warning(ctxt->userData, |
|
3094 "No value for attribute %s\n", name); */ |
|
3095 } |
|
3096 |
|
3097 *value = val; |
|
3098 return(name); |
|
3099 } |
|
3100 |
|
3101 /** |
|
3102 * htmlCheckEncoding: |
|
3103 * @param ctxt an HTML parser context |
|
3104 * @param attvalue the attribute value |
|
3105 * |
|
3106 * Checks an http-equiv attribute from a Meta tag to detect |
|
3107 * the encoding |
|
3108 * If a new encoding is detected the parser is switched to decode |
|
3109 * it and pass UTF8 |
|
3110 */ |
|
3111 static void |
|
3112 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { |
|
3113 const xmlChar *encoding; |
|
3114 |
|
3115 if ((ctxt == NULL) || (attvalue == NULL)) |
|
3116 return; |
|
3117 |
|
3118 /* do not change encoding */ |
|
3119 if (ctxt->input->encoding != NULL) |
|
3120 return; |
|
3121 |
|
3122 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); |
|
3123 if (encoding != NULL) { |
|
3124 encoding += 8; |
|
3125 } else { |
|
3126 encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); |
|
3127 if (encoding != NULL) |
|
3128 encoding += 9; |
|
3129 } |
|
3130 if (encoding != NULL) { |
|
3131 xmlCharEncoding enc; |
|
3132 xmlCharEncodingHandlerPtr handler; |
|
3133 |
|
3134 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; |
|
3135 |
|
3136 if (ctxt->input->encoding != NULL) |
|
3137 xmlFree((xmlChar *) ctxt->input->encoding); |
|
3138 ctxt->input->encoding = xmlStrdup(encoding); |
|
3139 |
|
3140 enc = xmlParseCharEncoding((const char *) encoding); |
|
3141 /* |
|
3142 * registered set of known encodings |
|
3143 */ |
|
3144 if (enc != XML_CHAR_ENCODING_ERROR) { |
|
3145 xmlSwitchEncoding(ctxt, enc); |
|
3146 ctxt->charset = XML_CHAR_ENCODING_UTF8; |
|
3147 } else { |
|
3148 /* |
|
3149 * fallback for unknown encodings |
|
3150 */ |
|
3151 handler = xmlFindCharEncodingHandler((const char *) encoding); |
|
3152 if (handler != NULL) { |
|
3153 xmlSwitchToEncoding(ctxt, handler); |
|
3154 ctxt->charset = XML_CHAR_ENCODING_UTF8; |
|
3155 } else { |
|
3156 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; |
|
3157 } |
|
3158 } |
|
3159 |
|
3160 if ((ctxt->input->buf != NULL) && |
|
3161 (ctxt->input->buf->encoder != NULL) && |
|
3162 (ctxt->input->buf->raw != NULL) && |
|
3163 (ctxt->input->buf->buffer != NULL)) { |
|
3164 int nbchars; |
|
3165 int processed; |
|
3166 |
|
3167 /* |
|
3168 * convert as much as possible to the parser reading buffer. |
|
3169 */ |
|
3170 processed = ctxt->input->cur - ctxt->input->base; |
|
3171 xmlBufferShrink(ctxt->input->buf->buffer, processed); |
|
3172 nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, |
|
3173 ctxt->input->buf->buffer, |
|
3174 ctxt->input->buf->raw); |
|
3175 if (nbchars < 0) { |
|
3176 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, |
|
3177 "htmlCheckEncoding: encoder error\n", |
|
3178 NULL, NULL); |
|
3179 } |
|
3180 ctxt->input->base = |
|
3181 ctxt->input->cur = ctxt->input->buf->buffer->content; |
|
3182 } |
|
3183 } |
|
3184 } |
|
3185 |
|
3186 /** |
|
3187 * htmlCheckMeta: |
|
3188 * @param ctxt an HTML parser context |
|
3189 * @param atts the attributes values |
|
3190 * |
|
3191 * Checks an attributes from a Meta tag |
|
3192 */ |
|
3193 static void |
|
3194 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { |
|
3195 int i; |
|
3196 const xmlChar *att, *value; |
|
3197 int http = 0; |
|
3198 const xmlChar *content = NULL; |
|
3199 |
|
3200 if ((ctxt == NULL) || (atts == NULL)) |
|
3201 return; |
|
3202 |
|
3203 i = 0; |
|
3204 att = atts[i++]; |
|
3205 while (att != NULL) { |
|
3206 value = atts[i++]; |
|
3207 if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) |
|
3208 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) |
|
3209 http = 1; |
|
3210 else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) |
|
3211 content = value; |
|
3212 att = atts[i++]; |
|
3213 } |
|
3214 if ((http) && (content != NULL)) |
|
3215 htmlCheckEncoding(ctxt, content); |
|
3216 |
|
3217 } |
|
3218 |
|
3219 /** |
|
3220 * htmlParseStartTag: |
|
3221 * @param ctxt an HTML parser context |
|
3222 * |
|
3223 * parse a start of tag either for rule element or |
|
3224 * EmptyElement. In both case we don't parse the tag closing chars. |
|
3225 * |
|
3226 * [40] STag ::= '<' Name (S Attribute)* S? '>' |
|
3227 * |
|
3228 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' |
|
3229 * |
|
3230 * With namespace: |
|
3231 * |
|
3232 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>' |
|
3233 * |
|
3234 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>' |
|
3235 * |
|
3236 */ |
|
3237 |
|
3238 static void |
|
3239 htmlParseStartTag(htmlParserCtxtPtr ctxt) { |
|
3240 const xmlChar *name; |
|
3241 const xmlChar *attname; |
|
3242 xmlChar *attvalue; |
|
3243 const xmlChar **atts = ctxt->atts; |
|
3244 int nbatts = 0; |
|
3245 int maxatts = ctxt->maxatts; |
|
3246 int meta = 0; |
|
3247 int i; |
|
3248 |
|
3249 if (CUR != '<') return; |
|
3250 NEXT; |
|
3251 |
|
3252 GROW; |
|
3253 name = htmlParseHTMLName(ctxt); |
|
3254 if (name == NULL) { |
|
3255 htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, |
|
3256 "htmlParseStartTag: invalid element name\n", |
|
3257 NULL, NULL); |
|
3258 /* Dump the bogus tag like browsers do */ |
|
3259 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) |
|
3260 NEXT; |
|
3261 return; |
|
3262 } |
|
3263 if (xmlStrEqual(name, BAD_CAST"meta")) |
|
3264 meta = 1; |
|
3265 |
|
3266 /* |
|
3267 * Check for auto-closure of HTML elements. |
|
3268 */ |
|
3269 htmlAutoClose(ctxt, name); |
|
3270 |
|
3271 /* |
|
3272 * Check for implied HTML elements. |
|
3273 */ |
|
3274 htmlCheckImplied(ctxt, name); |
|
3275 |
|
3276 /* |
|
3277 * Avoid html at any level > 0, head at any level != 1 |
|
3278 * or any attempt to recurse body |
|
3279 */ |
|
3280 if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) { |
|
3281 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, |
|
3282 "htmlParseStartTag: misplaced <html> tag\n", |
|
3283 name, NULL); |
|
3284 return; |
|
3285 } |
|
3286 if ((ctxt->nameNr != 1) && |
|
3287 (xmlStrEqual(name, BAD_CAST"head"))) { |
|
3288 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, |
|
3289 "htmlParseStartTag: misplaced <head> tag\n", |
|
3290 name, NULL); |
|
3291 return; |
|
3292 } |
|
3293 if (xmlStrEqual(name, BAD_CAST"body")) { |
|
3294 int indx; |
|
3295 for (indx = 0;indx < ctxt->nameNr;indx++) { |
|
3296 if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) { |
|
3297 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, |
|
3298 "htmlParseStartTag: misplaced <body> tag\n", |
|
3299 name, NULL); |
|
3300 while ((IS_CHAR_CH(CUR)) && (CUR != '>')) |
|
3301 NEXT; |
|
3302 return; |
|
3303 } |
|
3304 } |
|
3305 } |
|
3306 |
|
3307 /* |
|
3308 * Now parse the attributes, it ends up with the ending |
|
3309 * |
|
3310 * (S Attribute)* S? |
|
3311 */ |
|
3312 SKIP_BLANKS; |
|
3313 while ((IS_CHAR_CH(CUR)) && |
|
3314 (CUR != '>') && |
|
3315 ((CUR != '/') || (NXT(1) != '>'))) { |
|
3316 long cons = ctxt->nbChars; |
|
3317 |
|
3318 GROW; |
|
3319 attname = htmlParseAttribute(ctxt, &attvalue); |
|
3320 if (attname != NULL) { |
|
3321 |
|
3322 /* |
|
3323 * Well formedness requires at most one declaration of an attribute |
|
3324 */ |
|
3325 for (i = 0; i < nbatts;i += 2) { |
|
3326 if (xmlStrEqual(atts[i], attname)) { |
|
3327 htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED, |
|
3328 "Attribute %s redefined\n", attname, NULL); |
|
3329 if (attvalue != NULL) |
|
3330 xmlFree(attvalue); |
|
3331 goto failed; |
|
3332 } |
|
3333 } |
|
3334 |
|
3335 /* |
|
3336 * Add the pair to atts |
|
3337 */ |
|
3338 if (atts == NULL) { |
|
3339 maxatts = 22; /* allow for 10 attrs by default */ |
|
3340 atts = (const xmlChar **) |
|
3341 xmlMalloc(maxatts * sizeof(xmlChar *)); |
|
3342 if (atts == NULL) { |
|
3343 htmlErrMemory(ctxt, NULL); |
|
3344 if (attvalue != NULL) |
|
3345 xmlFree(attvalue); |
|
3346 goto failed; |
|
3347 } |
|
3348 ctxt->atts = atts; |
|
3349 ctxt->maxatts = maxatts; |
|
3350 } else if (nbatts + 4 > maxatts) { |
|
3351 const xmlChar **n; |
|
3352 |
|
3353 maxatts *= 2; |
|
3354 n = (const xmlChar **) xmlRealloc((void *) atts, |
|
3355 maxatts * sizeof(const xmlChar *)); |
|
3356 if (n == NULL) { |
|
3357 htmlErrMemory(ctxt, NULL); |
|
3358 if (attvalue != NULL) |
|
3359 xmlFree(attvalue); |
|
3360 goto failed; |
|
3361 } |
|
3362 atts = n; |
|
3363 ctxt->atts = atts; |
|
3364 ctxt->maxatts = maxatts; |
|
3365 } |
|
3366 atts[nbatts++] = attname; |
|
3367 atts[nbatts++] = attvalue; |
|
3368 atts[nbatts] = NULL; |
|
3369 atts[nbatts + 1] = NULL; |
|
3370 } |
|
3371 else { |
|
3372 if (attvalue != NULL) |
|
3373 xmlFree(attvalue); |
|
3374 /* Dump the bogus attribute string up to the next blank or |
|
3375 * the end of the tag. */ |
|
3376 while ((IS_CHAR_CH(CUR)) && |
|
3377 !(IS_BLANK_CH(CUR)) && (CUR != '>') && |
|
3378 ((CUR != '/') || (NXT(1) != '>'))) |
|
3379 NEXT; |
|
3380 } |
|
3381 |
|
3382 failed: |
|
3383 SKIP_BLANKS; |
|
3384 if (cons == ctxt->nbChars) { |
|
3385 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
3386 "htmlParseStartTag: problem parsing attributes\n", |
|
3387 NULL, NULL); |
|
3388 break; |
|
3389 } |
|
3390 } |
|
3391 |
|
3392 /* |
|
3393 * Handle specific association to the META tag |
|
3394 */ |
|
3395 if (meta) |
|
3396 htmlCheckMeta(ctxt, atts); |
|
3397 |
|
3398 /* |
|
3399 * SAX: Start of Element ! |
|
3400 */ |
|
3401 htmlnamePush(ctxt, name); |
|
3402 if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { |
|
3403 if (nbatts != 0) |
|
3404 ctxt->sax->startElement(ctxt->userData, name, atts); |
|
3405 else |
|
3406 ctxt->sax->startElement(ctxt->userData, name, NULL); |
|
3407 } |
|
3408 |
|
3409 if (atts != NULL) { |
|
3410 for (i = 1;i < nbatts;i += 2) { |
|
3411 if (atts[i] != NULL) |
|
3412 xmlFree((xmlChar *) atts[i]); |
|
3413 } |
|
3414 } |
|
3415 } |
|
3416 |
|
3417 /** |
|
3418 * htmlParseEndTag: |
|
3419 * @param ctxt an HTML parser context |
|
3420 * |
|
3421 * parse an end of tag |
|
3422 * |
|
3423 * [42] ETag ::= '</' Name S? '>' |
|
3424 * |
|
3425 * With namespace |
|
3426 * |
|
3427 * [NS 9] ETag ::= '</' QName S? '>' |
|
3428 * |
|
3429 * Returns 1 if the current level should be closed. |
|
3430 */ |
|
3431 |
|
3432 static int |
|
3433 htmlParseEndTag(htmlParserCtxtPtr ctxt) |
|
3434 { |
|
3435 const xmlChar *name; |
|
3436 const xmlChar *oldname; |
|
3437 int i, ret; |
|
3438 |
|
3439 if ((CUR != '<') || (NXT(1) != '/')) { |
|
3440 htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED, |
|
3441 "htmlParseEndTag: '</' not found\n", NULL, NULL); |
|
3442 return (0); |
|
3443 } |
|
3444 SKIP(2); |
|
3445 |
|
3446 name = htmlParseHTMLName(ctxt); |
|
3447 if (name == NULL) |
|
3448 return (0); |
|
3449 |
|
3450 /* |
|
3451 * We should definitely be at the ending "S? '>'" part |
|
3452 */ |
|
3453 SKIP_BLANKS; |
|
3454 if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { |
|
3455 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, |
|
3456 "End tag : expected '>'\n", NULL, NULL); |
|
3457 } else |
|
3458 NEXT; |
|
3459 |
|
3460 /* |
|
3461 * If the name read is not one of the element in the parsing stack |
|
3462 * then return, it's just an error. |
|
3463 */ |
|
3464 for (i = (ctxt->nameNr - 1); i >= 0; i--) { |
|
3465 if (xmlStrEqual(name, ctxt->nameTab[i])) |
|
3466 break; |
|
3467 } |
|
3468 if (i < 0) { |
|
3469 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, |
|
3470 "Unexpected end tag : %s\n", name, NULL); |
|
3471 return (0); |
|
3472 } |
|
3473 |
|
3474 |
|
3475 /* |
|
3476 * Check for auto-closure of HTML elements. |
|
3477 */ |
|
3478 |
|
3479 htmlAutoCloseOnClose(ctxt, name); |
|
3480 |
|
3481 /* |
|
3482 * Well formedness constraints, opening and closing must match. |
|
3483 * With the exception that the autoclose may have popped stuff out |
|
3484 * of the stack. |
|
3485 */ |
|
3486 if (!xmlStrEqual(name, ctxt->name)) { |
|
3487 if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) { |
|
3488 htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, |
|
3489 "Opening and ending tag mismatch: %s and %s\n", |
|
3490 name, ctxt->name); |
|
3491 } |
|
3492 } |
|
3493 |
|
3494 /* |
|
3495 * SAX: End of Tag |
|
3496 */ |
|
3497 oldname = ctxt->name; |
|
3498 if ((oldname != NULL) && (xmlStrEqual(oldname, name))) { |
|
3499 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
3500 ctxt->sax->endElement(ctxt->userData, name); |
|
3501 htmlnamePop(ctxt); |
|
3502 ret = 1; |
|
3503 } else { |
|
3504 ret = 0; |
|
3505 } |
|
3506 |
|
3507 return (ret); |
|
3508 } |
|
3509 |
|
3510 |
|
3511 /** |
|
3512 * htmlParseReference: |
|
3513 * @param ctxt an HTML parser context |
|
3514 * |
|
3515 * parse and handle entity references in content, |
|
3516 * this will end-up in a call to character() since this is either a |
|
3517 * CharRef, or a predefined entity. |
|
3518 */ |
|
3519 static void |
|
3520 htmlParseReference(htmlParserCtxtPtr ctxt) { |
|
3521 const htmlEntityDesc * ent; |
|
3522 xmlChar out[6]; |
|
3523 const xmlChar *name; |
|
3524 if (CUR != '&') return; |
|
3525 |
|
3526 if (NXT(1) == '#') { |
|
3527 unsigned int c; |
|
3528 int bits, i = 0; |
|
3529 |
|
3530 c = htmlParseCharRef(ctxt); |
|
3531 if (c == 0) |
|
3532 return; |
|
3533 |
|
3534 if (c < 0x80) { out[i++]= c; bits= -6; } |
|
3535 else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } |
|
3536 else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } |
|
3537 else { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } |
|
3538 |
|
3539 for ( ; bits >= 0; bits-= 6) { |
|
3540 out[i++]= ((c >> bits) & 0x3F) | 0x80; |
|
3541 } |
|
3542 out[i] = 0; |
|
3543 |
|
3544 htmlCheckParagraph(ctxt); |
|
3545 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) |
|
3546 ctxt->sax->characters(ctxt->userData, out, i); |
|
3547 } else { |
|
3548 ent = htmlParseEntityRef(ctxt, &name); |
|
3549 if (name == NULL) { |
|
3550 htmlCheckParagraph(ctxt); |
|
3551 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) |
|
3552 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); |
|
3553 return; |
|
3554 } |
|
3555 if ((ent == NULL) || !(ent->value > 0)) { |
|
3556 htmlCheckParagraph(ctxt); |
|
3557 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) { |
|
3558 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1); |
|
3559 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name)); |
|
3560 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */ |
|
3561 } |
|
3562 } else { |
|
3563 unsigned int c; |
|
3564 int bits, i = 0; |
|
3565 |
|
3566 c = ent->value; |
|
3567 if (c < 0x80) |
|
3568 { out[i++]= c; bits= -6; } |
|
3569 else if (c < 0x800) |
|
3570 { out[i++]=((c >> 6) & 0x1F) | 0xC0; bits= 0; } |
|
3571 else if (c < 0x10000) |
|
3572 { out[i++]=((c >> 12) & 0x0F) | 0xE0; bits= 6; } |
|
3573 else |
|
3574 { out[i++]=((c >> 18) & 0x07) | 0xF0; bits= 12; } |
|
3575 |
|
3576 for ( ; bits >= 0; bits-= 6) { |
|
3577 out[i++]= ((c >> bits) & 0x3F) | 0x80; |
|
3578 } |
|
3579 out[i] = 0; |
|
3580 |
|
3581 htmlCheckParagraph(ctxt); |
|
3582 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) |
|
3583 ctxt->sax->characters(ctxt->userData, out, i); |
|
3584 } |
|
3585 } |
|
3586 } |
|
3587 |
|
3588 /** |
|
3589 * htmlParseContent: |
|
3590 * @param ctxt an HTML parser context |
|
3591 * @param name the node name |
|
3592 * |
|
3593 * Parse a content: comment, sub-element, reference or text. |
|
3594 * |
|
3595 */ |
|
3596 |
|
3597 static void |
|
3598 htmlParseContent(htmlParserCtxtPtr ctxt) { |
|
3599 xmlChar *currentNode; |
|
3600 int depth; |
|
3601 |
|
3602 currentNode = xmlStrdup(ctxt->name); |
|
3603 depth = ctxt->nameNr; |
|
3604 while (1) { |
|
3605 long cons = ctxt->nbChars; |
|
3606 |
|
3607 GROW; |
|
3608 /* |
|
3609 * Our tag or one of it's parent or children is ending. |
|
3610 */ |
|
3611 if ((CUR == '<') && (NXT(1) == '/')) { |
|
3612 if (htmlParseEndTag(ctxt) && |
|
3613 ((currentNode != NULL) || (ctxt->nameNr == 0))) { |
|
3614 if (currentNode != NULL) |
|
3615 xmlFree(currentNode); |
|
3616 return; |
|
3617 } |
|
3618 continue; /* while */ |
|
3619 } |
|
3620 |
|
3621 /* |
|
3622 * Has this node been popped out during parsing of |
|
3623 * the next element |
|
3624 */ |
|
3625 if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) && |
|
3626 (!xmlStrEqual(currentNode, ctxt->name))) |
|
3627 { |
|
3628 if (currentNode != NULL) xmlFree(currentNode); |
|
3629 return; |
|
3630 } |
|
3631 |
|
3632 if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) || |
|
3633 (xmlStrEqual(currentNode, BAD_CAST"style")))) { |
|
3634 /* |
|
3635 * Handle SCRIPT/STYLE separately |
|
3636 */ |
|
3637 htmlParseScript(ctxt); |
|
3638 } else { |
|
3639 /* |
|
3640 * Sometimes DOCTYPE arrives in the middle of the document |
|
3641 */ |
|
3642 if ((CUR == '<') && (NXT(1) == '!') && |
|
3643 (UPP(2) == 'D') && (UPP(3) == 'O') && |
|
3644 (UPP(4) == 'C') && (UPP(5) == 'T') && |
|
3645 (UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
3646 (UPP(8) == 'E')) { |
|
3647 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, |
|
3648 "Misplaced DOCTYPE declaration\n", |
|
3649 BAD_CAST "DOCTYPE" , NULL); |
|
3650 htmlParseDocTypeDecl(ctxt); |
|
3651 } |
|
3652 |
|
3653 /* |
|
3654 * First case : a comment |
|
3655 */ |
|
3656 if ((CUR == '<') && (NXT(1) == '!') && |
|
3657 (NXT(2) == '-') && (NXT(3) == '-')) { |
|
3658 htmlParseComment(ctxt); |
|
3659 } |
|
3660 |
|
3661 /* |
|
3662 * Second case : a sub-element. |
|
3663 */ |
|
3664 else if (CUR == '<') { |
|
3665 htmlParseElement(ctxt); |
|
3666 } |
|
3667 |
|
3668 /* |
|
3669 * Third case : a reference. If if has not been resolved, |
|
3670 * parsing returns it's Name, create the node |
|
3671 */ |
|
3672 else if (CUR == '&') { |
|
3673 htmlParseReference(ctxt); |
|
3674 } |
|
3675 |
|
3676 /* |
|
3677 * Fourth : end of the resource |
|
3678 */ |
|
3679 else if (CUR == 0) { |
|
3680 htmlAutoCloseOnEnd(ctxt); |
|
3681 break; |
|
3682 } |
|
3683 |
|
3684 /* |
|
3685 * Last case, text. Note that References are handled directly. |
|
3686 */ |
|
3687 else { |
|
3688 htmlParseCharData(ctxt); |
|
3689 } |
|
3690 |
|
3691 if (cons == ctxt->nbChars) { |
|
3692 if (ctxt->node != NULL) { |
|
3693 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
3694 "detected an error in element content\n", |
|
3695 NULL, NULL); |
|
3696 } |
|
3697 break; |
|
3698 } |
|
3699 } |
|
3700 GROW; |
|
3701 } |
|
3702 if (currentNode != NULL) xmlFree(currentNode); |
|
3703 } |
|
3704 |
|
3705 /** |
|
3706 * htmlParseElement: |
|
3707 * @param ctxt an HTML parser context |
|
3708 * |
|
3709 * parse an HTML element, this is highly recursive |
|
3710 * |
|
3711 * [39] element ::= EmptyElemTag | STag content ETag |
|
3712 * |
|
3713 * [41] Attribute ::= Name Eq AttValue |
|
3714 */ |
|
3715 |
|
3716 void |
|
3717 htmlParseElement(htmlParserCtxtPtr ctxt) { |
|
3718 const xmlChar *name; |
|
3719 xmlChar *currentNode = NULL; |
|
3720 const htmlElemDesc * info; |
|
3721 htmlParserNodeInfo node_info; |
|
3722 const xmlChar *oldname; |
|
3723 int depth = ctxt->nameNr; |
|
3724 const xmlChar *oldptr; |
|
3725 |
|
3726 /* Capture start position */ |
|
3727 if (ctxt->record_info) { |
|
3728 node_info.begin_pos = ctxt->input->consumed + |
|
3729 (CUR_PTR - ctxt->input->base); |
|
3730 node_info.begin_line = ctxt->input->line; |
|
3731 } |
|
3732 |
|
3733 oldname = ctxt->name; |
|
3734 htmlParseStartTag(ctxt); |
|
3735 name = ctxt->name; |
|
3736 if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) || |
|
3737 (name == NULL)) { |
|
3738 if (CUR == '>') |
|
3739 NEXT; |
|
3740 return; |
|
3741 } |
|
3742 |
|
3743 /* |
|
3744 * Lookup the info for that element. |
|
3745 */ |
|
3746 info = htmlTagLookup(name); |
|
3747 if (info == NULL) { |
|
3748 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, |
|
3749 "Tag %s invalid\n", name, NULL); |
|
3750 } |
|
3751 |
|
3752 /* |
|
3753 * Check for an Empty Element labeled the XML/SGML way |
|
3754 */ |
|
3755 if ((CUR == '/') && (NXT(1) == '>')) { |
|
3756 SKIP(2); |
|
3757 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
3758 ctxt->sax->endElement(ctxt->userData, name); |
|
3759 htmlnamePop(ctxt); |
|
3760 return; |
|
3761 } |
|
3762 |
|
3763 if (CUR == '>') { |
|
3764 NEXT; |
|
3765 } else { |
|
3766 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, |
|
3767 "Couldn't find end of Start Tag %s\n", name, NULL); |
|
3768 |
|
3769 /* |
|
3770 * end of parsing of this node. |
|
3771 */ |
|
3772 if (xmlStrEqual(name, ctxt->name)) { |
|
3773 nodePop(ctxt); |
|
3774 htmlnamePop(ctxt); |
|
3775 } |
|
3776 |
|
3777 /* |
|
3778 * Capture end position and add node |
|
3779 */ |
|
3780 if ( currentNode != NULL && ctxt->record_info ) { |
|
3781 node_info.end_pos = ctxt->input->consumed + |
|
3782 (CUR_PTR - ctxt->input->base); |
|
3783 node_info.end_line = ctxt->input->line; |
|
3784 node_info.node = ctxt->node; |
|
3785 xmlParserAddNodeInfo(ctxt, &node_info); |
|
3786 } |
|
3787 return; |
|
3788 } |
|
3789 |
|
3790 /* |
|
3791 * Check for an Empty Element from DTD definition |
|
3792 */ |
|
3793 if ((info != NULL) && (info->empty)) { |
|
3794 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
3795 ctxt->sax->endElement(ctxt->userData, name); |
|
3796 htmlnamePop(ctxt); |
|
3797 return; |
|
3798 } |
|
3799 |
|
3800 /* |
|
3801 * Parse the content of the element: |
|
3802 */ |
|
3803 currentNode = xmlStrdup(ctxt->name); |
|
3804 depth = ctxt->nameNr; |
|
3805 while (IS_CHAR_CH(CUR)) { |
|
3806 oldptr = ctxt->input->cur; |
|
3807 htmlParseContent(ctxt); |
|
3808 if (oldptr==ctxt->input->cur) break; |
|
3809 if (ctxt->nameNr < depth) break; |
|
3810 } |
|
3811 |
|
3812 /* |
|
3813 * Capture end position and add node |
|
3814 */ |
|
3815 if ( currentNode != NULL && ctxt->record_info ) { |
|
3816 node_info.end_pos = ctxt->input->consumed + |
|
3817 (CUR_PTR - ctxt->input->base); |
|
3818 node_info.end_line = ctxt->input->line; |
|
3819 node_info.node = ctxt->node; |
|
3820 xmlParserAddNodeInfo(ctxt, &node_info); |
|
3821 } |
|
3822 if (!IS_CHAR_CH(CUR)) { |
|
3823 htmlAutoCloseOnEnd(ctxt); |
|
3824 } |
|
3825 |
|
3826 if (currentNode != NULL) |
|
3827 xmlFree(currentNode); |
|
3828 } |
|
3829 |
|
3830 /** |
|
3831 * htmlParseDocument: |
|
3832 * @param ctxt an HTML parser context |
|
3833 * |
|
3834 * parse an HTML document (and build a tree if using the standard SAX |
|
3835 * interface). |
|
3836 * |
|
3837 * Returns 0, -1 in case of error. the parser context is augmented |
|
3838 * as a result of the parsing. |
|
3839 */ |
|
3840 |
|
3841 int |
|
3842 htmlParseDocument(htmlParserCtxtPtr ctxt) { |
|
3843 xmlDtdPtr dtd; |
|
3844 |
|
3845 xmlInitParser(); |
|
3846 |
|
3847 htmlDefaultSAXHandlerInit(); |
|
3848 ctxt->html = 1; |
|
3849 |
|
3850 GROW; |
|
3851 /* |
|
3852 * SAX: beginning of the document processing. |
|
3853 */ |
|
3854 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) |
|
3855 ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); |
|
3856 |
|
3857 /* |
|
3858 * Wipe out everything which is before the first '<' |
|
3859 */ |
|
3860 SKIP_BLANKS; |
|
3861 if (CUR == 0) { |
|
3862 htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, |
|
3863 "Document is empty\n", NULL, NULL); |
|
3864 } |
|
3865 |
|
3866 if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) |
|
3867 ctxt->sax->startDocument(ctxt->userData); |
|
3868 |
|
3869 |
|
3870 /* |
|
3871 * Parse possible comments before any content |
|
3872 */ |
|
3873 while ((CUR == '<') && (NXT(1) == '!') && |
|
3874 (NXT(2) == '-') && (NXT(3) == '-')) { |
|
3875 htmlParseComment(ctxt); |
|
3876 SKIP_BLANKS; |
|
3877 } |
|
3878 |
|
3879 |
|
3880 /* |
|
3881 * Then possibly doc type declaration(s) and more Misc |
|
3882 * (doctypedecl Misc*)? |
|
3883 */ |
|
3884 if ((CUR == '<') && (NXT(1) == '!') && |
|
3885 (UPP(2) == 'D') && (UPP(3) == 'O') && |
|
3886 (UPP(4) == 'C') && (UPP(5) == 'T') && |
|
3887 (UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
3888 (UPP(8) == 'E')) { |
|
3889 htmlParseDocTypeDecl(ctxt); |
|
3890 } |
|
3891 SKIP_BLANKS; |
|
3892 |
|
3893 /* |
|
3894 * Parse possible comments before any content |
|
3895 */ |
|
3896 while ((CUR == '<') && (NXT(1) == '!') && |
|
3897 (NXT(2) == '-') && (NXT(3) == '-')) { |
|
3898 htmlParseComment(ctxt); |
|
3899 SKIP_BLANKS; |
|
3900 } |
|
3901 |
|
3902 /* |
|
3903 * Time to start parsing the tree itself |
|
3904 */ |
|
3905 htmlParseContent(ctxt); |
|
3906 |
|
3907 /* |
|
3908 * autoclose |
|
3909 */ |
|
3910 if (CUR == 0) |
|
3911 htmlAutoCloseOnEnd(ctxt); |
|
3912 |
|
3913 |
|
3914 /* |
|
3915 * SAX: end of the document processing. |
|
3916 */ |
|
3917 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
3918 ctxt->sax->endDocument(ctxt->userData); |
|
3919 |
|
3920 if (ctxt->myDoc != NULL) { |
|
3921 dtd = xmlGetIntSubset(ctxt->myDoc); |
|
3922 if (dtd == NULL) |
|
3923 ctxt->myDoc->intSubset = |
|
3924 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", |
|
3925 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", |
|
3926 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); |
|
3927 } |
|
3928 if (! ctxt->wellFormed) return(-1); |
|
3929 return(0); |
|
3930 } |
|
3931 |
|
3932 |
|
3933 /************************************************************************ |
|
3934 * * |
|
3935 * Parser contexts handling * |
|
3936 * * |
|
3937 ************************************************************************/ |
|
3938 |
|
3939 /** |
|
3940 * htmlInitParserCtxt: |
|
3941 * @param ctxt an HTML parser context |
|
3942 * |
|
3943 * Initialize a parser context |
|
3944 * |
|
3945 * Returns 0 in case of success and -1 in case of error |
|
3946 */ |
|
3947 |
|
3948 static int |
|
3949 htmlInitParserCtxt(htmlParserCtxtPtr ctxt) |
|
3950 { |
|
3951 htmlSAXHandler *sax; |
|
3952 |
|
3953 if (ctxt == NULL) return(-1); |
|
3954 memset(ctxt, 0, sizeof(htmlParserCtxt)); |
|
3955 // NOTE: All assignments ctxt->XX = 0; were commented as unnecessary |
|
3956 ctxt->dict = xmlDictCreate(); |
|
3957 if (ctxt->dict == NULL) { |
|
3958 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); |
|
3959 return(-1); |
|
3960 } |
|
3961 sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler)); |
|
3962 if (sax == NULL) { |
|
3963 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); |
|
3964 return(-1); |
|
3965 } |
|
3966 else |
|
3967 memset(sax, 0, sizeof(htmlSAXHandler)); |
|
3968 |
|
3969 /* Allocate the Input stack */ |
|
3970 ctxt->inputTab = (htmlParserInputPtr *) |
|
3971 xmlMalloc(5 * sizeof(htmlParserInputPtr)); |
|
3972 if (ctxt->inputTab == NULL) { |
|
3973 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); |
|
3974 //ctxt->inputNr = 0; |
|
3975 //ctxt->inputMax = 0; |
|
3976 //ctxt->input = NULL; |
|
3977 return(-1); |
|
3978 } |
|
3979 //ctxt->inputNr = 0; |
|
3980 ctxt->inputMax = 5; |
|
3981 //ctxt->input = NULL; |
|
3982 //ctxt->version = NULL; |
|
3983 //ctxt->encoding = NULL; |
|
3984 ctxt->standalone = -1; |
|
3985 ctxt->instate = XML_PARSER_START; |
|
3986 |
|
3987 /* Allocate the Node stack */ |
|
3988 ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr)); |
|
3989 if (ctxt->nodeTab == NULL) { |
|
3990 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); |
|
3991 //ctxt->nodeNr = 0; |
|
3992 //ctxt->nodeMax = 0; |
|
3993 //ctxt->node = NULL; |
|
3994 //ctxt->inputNr = 0; |
|
3995 //ctxt->inputMax = 0; |
|
3996 //ctxt->input = NULL; |
|
3997 return(-1); |
|
3998 } |
|
3999 //ctxt->nodeNr = 0; |
|
4000 ctxt->nodeMax = 10; |
|
4001 //ctxt->node = NULL; |
|
4002 |
|
4003 /* Allocate the Name stack */ |
|
4004 ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *)); |
|
4005 if (ctxt->nameTab == NULL) { |
|
4006 htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n"); |
|
4007 //ctxt->nameNr = 0; |
|
4008 ctxt->nameMax = 10; |
|
4009 //ctxt->name = NULL; |
|
4010 //ctxt->nodeNr = 0; |
|
4011 //ctxt->nodeMax = 0; |
|
4012 //ctxt->node = NULL; |
|
4013 //ctxt->inputNr = 0; |
|
4014 //ctxt->inputMax = 0; |
|
4015 ctxt->input = NULL; |
|
4016 return(-1); |
|
4017 } |
|
4018 //ctxt->nameNr = 0; |
|
4019 ctxt->nameMax = 10; |
|
4020 //ctxt->name = NULL; |
|
4021 |
|
4022 if (sax == NULL) |
|
4023 ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler; |
|
4024 else { |
|
4025 ctxt->sax = sax; |
|
4026 memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1)); |
|
4027 } |
|
4028 ctxt->userData = ctxt; |
|
4029 //ctxt->myDoc = NULL; |
|
4030 ctxt->wellFormed = 1; |
|
4031 //ctxt->replaceEntities = 0; |
|
4032 #ifdef LIBXML_ENABLE_NODE_LINEINFO |
|
4033 ctxt->linenumbers = xmlLineNumbersDefaultValue; |
|
4034 #endif |
|
4035 ctxt->html = 1; |
|
4036 ctxt->vctxt.userData = ctxt; |
|
4037 ctxt->vctxt.error = xmlParserValidityError; |
|
4038 ctxt->vctxt.warning = xmlParserValidityWarning; |
|
4039 //ctxt->record_info = 0; |
|
4040 //ctxt->validate = 0; |
|
4041 //ctxt->nbChars = 0; |
|
4042 //ctxt->checkIndex = 0; |
|
4043 //ctxt->catalogs = NULL; |
|
4044 xmlInitNodeInfoSeq(&ctxt->node_seq); |
|
4045 return(0); |
|
4046 } |
|
4047 |
|
4048 /** |
|
4049 * htmlFreeParserCtxt: |
|
4050 * @param ctxt an HTML parser context |
|
4051 * |
|
4052 * Free all the memory used by a parser context. However the parsed |
|
4053 * document in ctxt->myDoc is not freed. |
|
4054 */ |
|
4055 |
|
4056 void |
|
4057 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt) |
|
4058 { |
|
4059 xmlFreeParserCtxt(ctxt); |
|
4060 } |
|
4061 |
|
4062 /** |
|
4063 * htmlNewParserCtxt: |
|
4064 * |
|
4065 * Allocate and initialize a new parser context. |
|
4066 * |
|
4067 * Returns the xmlParserCtxtPtr or NULL |
|
4068 */ |
|
4069 |
|
4070 static htmlParserCtxtPtr |
|
4071 htmlNewParserCtxt(void) |
|
4072 { |
|
4073 xmlParserCtxtPtr ctxt; |
|
4074 |
|
4075 ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt)); |
|
4076 if (ctxt == NULL) { |
|
4077 htmlErrMemory(NULL, "NewParserCtxt: out of memory\n"); |
|
4078 return(NULL); |
|
4079 } |
|
4080 memset(ctxt, 0, sizeof(xmlParserCtxt)); |
|
4081 #ifdef XE_ENABLE_GS_CACHING |
|
4082 ctxt->cachedGs = xmlGetGlobalState(); |
|
4083 #endif |
|
4084 |
|
4085 if (htmlInitParserCtxt(ctxt) < 0) { |
|
4086 htmlFreeParserCtxt(ctxt); |
|
4087 return(NULL); |
|
4088 } |
|
4089 return(ctxt); |
|
4090 } |
|
4091 |
|
4092 /** |
|
4093 * htmlCreateMemoryParserCtxt: |
|
4094 * @param buffer a pointer to a char array |
|
4095 * @param size the size of the array |
|
4096 * |
|
4097 * Create a parser context for an HTML in-memory document. |
|
4098 * |
|
4099 * Returns the new parser context or NULL |
|
4100 */ |
|
4101 htmlParserCtxtPtr |
|
4102 htmlCreateMemoryParserCtxt(const char *buffer, int size) { |
|
4103 |
|
4104 xmlParserCtxtPtr ctxt; |
|
4105 xmlParserInputPtr input; |
|
4106 xmlParserInputBufferPtr buf; |
|
4107 |
|
4108 if (buffer == NULL) |
|
4109 return(NULL); |
|
4110 if (size <= 0) |
|
4111 return(NULL); |
|
4112 |
|
4113 ctxt = htmlNewParserCtxt(); |
|
4114 if (ctxt == NULL) |
|
4115 return(NULL); |
|
4116 |
|
4117 buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); |
|
4118 if (buf == NULL) return(NULL); |
|
4119 |
|
4120 input = xmlNewInputStream(ctxt); |
|
4121 if (input == NULL) { |
|
4122 xmlFreeParserCtxt(ctxt); |
|
4123 return(NULL); |
|
4124 } |
|
4125 |
|
4126 input->filename = NULL; |
|
4127 input->buf = buf; |
|
4128 input->base = input->buf->buffer->content; |
|
4129 input->cur = input->buf->buffer->content; |
|
4130 input->end = &input->buf->buffer->content[input->buf->buffer->use]; |
|
4131 |
|
4132 inputPush(ctxt, input); |
|
4133 return(ctxt); |
|
4134 } |
|
4135 |
|
4136 /** |
|
4137 * htmlCreateDocParserCtxt: |
|
4138 * @param cur a pointer to an array of xmlChar |
|
4139 * @param encoding a free form C string describing the HTML document encoding, or NULL |
|
4140 * |
|
4141 * Create a parser context for an HTML document. |
|
4142 * |
|
4143 |
|
4144 * |
|
4145 * Returns the new parser context or NULL |
|
4146 */ |
|
4147 static htmlParserCtxtPtr |
|
4148 htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) { |
|
4149 int len; |
|
4150 htmlParserCtxtPtr ctxt; |
|
4151 |
|
4152 if (cur == NULL) |
|
4153 return(NULL); |
|
4154 len = xmlStrlen(cur); |
|
4155 ctxt = htmlCreateMemoryParserCtxt((char *)cur, len); |
|
4156 |
|
4157 if (encoding != NULL) { |
|
4158 xmlCharEncoding enc; |
|
4159 xmlCharEncodingHandlerPtr handler; |
|
4160 |
|
4161 if (ctxt->input->encoding != NULL) |
|
4162 xmlFree((xmlChar *) ctxt->input->encoding); |
|
4163 ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); |
|
4164 |
|
4165 enc = xmlParseCharEncoding(encoding); |
|
4166 /* |
|
4167 * registered set of known encodings |
|
4168 */ |
|
4169 if (enc != XML_CHAR_ENCODING_ERROR) { |
|
4170 xmlSwitchEncoding(ctxt, enc); |
|
4171 if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { |
|
4172 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, |
|
4173 "Unsupported encoding %s\n", |
|
4174 (const xmlChar *) encoding, NULL); |
|
4175 } |
|
4176 } else { |
|
4177 /* |
|
4178 * fallback for unknown encodings |
|
4179 */ |
|
4180 handler = xmlFindCharEncodingHandler((const char *) encoding); |
|
4181 if (handler != NULL) { |
|
4182 xmlSwitchToEncoding(ctxt, handler); |
|
4183 } else { |
|
4184 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, |
|
4185 "Unsupported encoding %s\n", |
|
4186 (const xmlChar *) encoding, NULL); |
|
4187 } |
|
4188 } |
|
4189 } |
|
4190 return(ctxt); |
|
4191 } |
|
4192 |
|
4193 #ifdef LIBXML_PUSH_ENABLED |
|
4194 /************************************************************************ |
|
4195 * * |
|
4196 * Progressive parsing interfaces * |
|
4197 * * |
|
4198 ************************************************************************/ |
|
4199 |
|
4200 /** |
|
4201 * htmlParseLookupSequence: |
|
4202 * @param ctxt an HTML parser context |
|
4203 * @param first the first char to lookup |
|
4204 * @param next the next char to lookup or zero |
|
4205 * @param third the next char to lookup or zero |
|
4206 * @param comment flag to force checking inside comments |
|
4207 * |
|
4208 * Try to find if a sequence (first, next, third) or just (first next) or |
|
4209 * (first) is available in the input stream. |
|
4210 * This function has a side effect of (possibly) incrementing ctxt->checkIndex |
|
4211 * to avoid rescanning sequences of bytes, it DOES change the state of the |
|
4212 * parser, do not use liberally. |
|
4213 * This is basically similar to xmlParseLookupSequence() |
|
4214 * |
|
4215 * Returns the index to the current parsing point if the full sequence |
|
4216 * is available, -1 otherwise. |
|
4217 */ |
|
4218 static int |
|
4219 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, |
|
4220 xmlChar next, xmlChar third, int iscomment) { |
|
4221 int base, len; |
|
4222 htmlParserInputPtr in; |
|
4223 const xmlChar *buf; |
|
4224 int incomment = 0; |
|
4225 |
|
4226 in = ctxt->input; |
|
4227 if (in == NULL) return(-1); |
|
4228 base = in->cur - in->base; |
|
4229 if (base < 0) return(-1); |
|
4230 if (ctxt->checkIndex > base) |
|
4231 base = ctxt->checkIndex; |
|
4232 if (in->buf == NULL) { |
|
4233 buf = in->base; |
|
4234 len = in->length; |
|
4235 } else { |
|
4236 buf = in->buf->buffer->content; |
|
4237 len = in->buf->buffer->use; |
|
4238 } |
|
4239 /* take into account the sequence length */ |
|
4240 if (third) len -= 2; |
|
4241 else if (next) len --; |
|
4242 for (;base < len;base++) { |
|
4243 if (!incomment && (base + 4 < len) && !iscomment) { |
|
4244 if ((buf[base] == '<') && (buf[base + 1] == '!') && |
|
4245 (buf[base + 2] == '-') && (buf[base + 3] == '-')) { |
|
4246 incomment = 1; |
|
4247 /* do not increment past <! - some people use <!--> */ |
|
4248 base += 2; |
|
4249 } |
|
4250 } |
|
4251 if (incomment) { |
|
4252 if (base + 3 > len) |
|
4253 return(-1); |
|
4254 if ((buf[base] == '-') && (buf[base + 1] == '-') && |
|
4255 (buf[base + 2] == '>')) { |
|
4256 incomment = 0; |
|
4257 base += 2; |
|
4258 } |
|
4259 continue; |
|
4260 } |
|
4261 if (buf[base] == first) { |
|
4262 if (third != 0) { |
|
4263 if ((buf[base + 1] != next) || |
|
4264 (buf[base + 2] != third)) continue; |
|
4265 } else if (next != 0) { |
|
4266 if (buf[base + 1] != next) continue; |
|
4267 } |
|
4268 ctxt->checkIndex = 0; |
|
4269 #ifdef DEBUG_PUSH |
|
4270 if (next == 0) |
|
4271 xmlGenericError(xmlGenericErrorContext, |
|
4272 "HPP: lookup '%c' found at %d\n", |
|
4273 first, base); |
|
4274 else if (third == 0) |
|
4275 xmlGenericError(xmlGenericErrorContext, |
|
4276 "HPP: lookup '%c%c' found at %d\n", |
|
4277 first, next, base); |
|
4278 else |
|
4279 xmlGenericError(xmlGenericErrorContext, |
|
4280 "HPP: lookup '%c%c%c' found at %d\n", |
|
4281 first, next, third, base); |
|
4282 #endif |
|
4283 return(base - (in->cur - in->base)); |
|
4284 } |
|
4285 } |
|
4286 ctxt->checkIndex = base; |
|
4287 #ifdef DEBUG_PUSH |
|
4288 if (next == 0) |
|
4289 xmlGenericError(xmlGenericErrorContext, |
|
4290 "HPP: lookup '%c' failed\n", first); |
|
4291 else if (third == 0) |
|
4292 xmlGenericError(xmlGenericErrorContext, |
|
4293 "HPP: lookup '%c%c' failed\n", first, next); |
|
4294 else |
|
4295 xmlGenericError(xmlGenericErrorContext, |
|
4296 "HPP: lookup '%c%c%c' failed\n", first, next, third); |
|
4297 #endif |
|
4298 return(-1); |
|
4299 } |
|
4300 |
|
4301 /** |
|
4302 * htmlParseTryOrFinish: |
|
4303 * @param ctxt an HTML parser context |
|
4304 * @param terminate last chunk indicator |
|
4305 * |
|
4306 * Try to progress on parsing |
|
4307 * |
|
4308 * Returns zero if no parsing was possible |
|
4309 */ |
|
4310 static int |
|
4311 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { |
|
4312 int ret = 0; |
|
4313 htmlParserInputPtr in; |
|
4314 int avail = 0; |
|
4315 xmlChar cur, next; |
|
4316 |
|
4317 #ifdef DEBUG_PUSH |
|
4318 switch (ctxt->instate) { |
|
4319 case XML_PARSER_EOF: |
|
4320 xmlGenericError(xmlGenericErrorContext, |
|
4321 "HPP: try EOF\n"); break; |
|
4322 case XML_PARSER_START: |
|
4323 xmlGenericError(xmlGenericErrorContext, |
|
4324 "HPP: try START\n"); break; |
|
4325 case XML_PARSER_MISC: |
|
4326 xmlGenericError(xmlGenericErrorContext, |
|
4327 "HPP: try MISC\n");break; |
|
4328 case XML_PARSER_COMMENT: |
|
4329 xmlGenericError(xmlGenericErrorContext, |
|
4330 "HPP: try COMMENT\n");break; |
|
4331 case XML_PARSER_PROLOG: |
|
4332 xmlGenericError(xmlGenericErrorContext, |
|
4333 "HPP: try PROLOG\n");break; |
|
4334 case XML_PARSER_START_TAG: |
|
4335 xmlGenericError(xmlGenericErrorContext, |
|
4336 "HPP: try START_TAG\n");break; |
|
4337 case XML_PARSER_CONTENT: |
|
4338 xmlGenericError(xmlGenericErrorContext, |
|
4339 "HPP: try CONTENT\n");break; |
|
4340 case XML_PARSER_CDATA_SECTION: |
|
4341 xmlGenericError(xmlGenericErrorContext, |
|
4342 "HPP: try CDATA_SECTION\n");break; |
|
4343 case XML_PARSER_END_TAG: |
|
4344 xmlGenericError(xmlGenericErrorContext, |
|
4345 "HPP: try END_TAG\n");break; |
|
4346 case XML_PARSER_ENTITY_DECL: |
|
4347 xmlGenericError(xmlGenericErrorContext, |
|
4348 "HPP: try ENTITY_DECL\n");break; |
|
4349 case XML_PARSER_ENTITY_VALUE: |
|
4350 xmlGenericError(xmlGenericErrorContext, |
|
4351 "HPP: try ENTITY_VALUE\n");break; |
|
4352 case XML_PARSER_ATTRIBUTE_VALUE: |
|
4353 xmlGenericError(xmlGenericErrorContext, |
|
4354 "HPP: try ATTRIBUTE_VALUE\n");break; |
|
4355 case XML_PARSER_DTD: |
|
4356 xmlGenericError(xmlGenericErrorContext, |
|
4357 "HPP: try DTD\n");break; |
|
4358 case XML_PARSER_EPILOG: |
|
4359 xmlGenericError(xmlGenericErrorContext, |
|
4360 "HPP: try EPILOG\n");break; |
|
4361 case XML_PARSER_PI: |
|
4362 xmlGenericError(xmlGenericErrorContext, |
|
4363 "HPP: try PI\n");break; |
|
4364 case XML_PARSER_SYSTEM_LITERAL: |
|
4365 xmlGenericError(xmlGenericErrorContext, |
|
4366 "HPP: try SYSTEM_LITERAL\n");break; |
|
4367 } |
|
4368 #endif |
|
4369 |
|
4370 while (1) { |
|
4371 |
|
4372 in = ctxt->input; |
|
4373 if (in == NULL) break; |
|
4374 if (in->buf == NULL) |
|
4375 avail = in->length - (in->cur - in->base); |
|
4376 else |
|
4377 avail = in->buf->buffer->use - (in->cur - in->base); |
|
4378 if ((avail == 0) && (terminate)) { |
|
4379 htmlAutoCloseOnEnd(ctxt); |
|
4380 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { |
|
4381 /* |
|
4382 * SAX: end of the document processing. |
|
4383 */ |
|
4384 ctxt->instate = XML_PARSER_EOF; |
|
4385 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
4386 ctxt->sax->endDocument(ctxt->userData); |
|
4387 } |
|
4388 } |
|
4389 if (avail < 1) |
|
4390 goto done; |
|
4391 cur = in->cur[0]; |
|
4392 if (cur == 0) { |
|
4393 SKIP(1); |
|
4394 continue; |
|
4395 } |
|
4396 |
|
4397 switch (ctxt->instate) { |
|
4398 case XML_PARSER_EOF: |
|
4399 /* |
|
4400 * Document parsing is done ! |
|
4401 */ |
|
4402 goto done; |
|
4403 case XML_PARSER_START: |
|
4404 /* |
|
4405 * Very first chars read from the document flow. |
|
4406 */ |
|
4407 cur = in->cur[0]; |
|
4408 if (IS_BLANK_CH(cur)) { |
|
4409 SKIP_BLANKS; |
|
4410 if (in->buf == NULL) |
|
4411 avail = in->length - (in->cur - in->base); |
|
4412 else |
|
4413 avail = in->buf->buffer->use - (in->cur - in->base); |
|
4414 } |
|
4415 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) |
|
4416 ctxt->sax->setDocumentLocator(ctxt->userData, |
|
4417 &xmlDefaultSAXLocator); |
|
4418 if ((ctxt->sax) && (ctxt->sax->startDocument) && |
|
4419 (!ctxt->disableSAX)) |
|
4420 ctxt->sax->startDocument(ctxt->userData); |
|
4421 |
|
4422 cur = in->cur[0]; |
|
4423 next = in->cur[1]; |
|
4424 if ((cur == '<') && (next == '!') && |
|
4425 (UPP(2) == 'D') && (UPP(3) == 'O') && |
|
4426 (UPP(4) == 'C') && (UPP(5) == 'T') && |
|
4427 (UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
4428 (UPP(8) == 'E')) { |
|
4429 if ((!terminate) && |
|
4430 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) |
|
4431 goto done; |
|
4432 #ifdef DEBUG_PUSH |
|
4433 xmlGenericError(xmlGenericErrorContext, |
|
4434 "HPP: Parsing internal subset\n"); |
|
4435 #endif |
|
4436 htmlParseDocTypeDecl(ctxt); |
|
4437 ctxt->instate = XML_PARSER_PROLOG; |
|
4438 #ifdef DEBUG_PUSH |
|
4439 xmlGenericError(xmlGenericErrorContext, |
|
4440 "HPP: entering PROLOG\n"); |
|
4441 #endif |
|
4442 } else { |
|
4443 ctxt->instate = XML_PARSER_MISC; |
|
4444 } |
|
4445 #ifdef DEBUG_PUSH |
|
4446 xmlGenericError(xmlGenericErrorContext, |
|
4447 "HPP: entering MISC\n"); |
|
4448 #endif |
|
4449 break; |
|
4450 case XML_PARSER_MISC: |
|
4451 SKIP_BLANKS; |
|
4452 if (in->buf == NULL) |
|
4453 avail = in->length - (in->cur - in->base); |
|
4454 else |
|
4455 avail = in->buf->buffer->use - (in->cur - in->base); |
|
4456 if (avail < 2) |
|
4457 goto done; |
|
4458 cur = in->cur[0]; |
|
4459 next = in->cur[1]; |
|
4460 if ((cur == '<') && (next == '!') && |
|
4461 (in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
4462 if ((!terminate) && |
|
4463 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0)) |
|
4464 goto done; |
|
4465 #ifdef DEBUG_PUSH |
|
4466 xmlGenericError(xmlGenericErrorContext, |
|
4467 "HPP: Parsing Comment\n"); |
|
4468 #endif |
|
4469 htmlParseComment(ctxt); |
|
4470 ctxt->instate = XML_PARSER_MISC; |
|
4471 } else if ((cur == '<') && (next == '!') && |
|
4472 (UPP(2) == 'D') && (UPP(3) == 'O') && |
|
4473 (UPP(4) == 'C') && (UPP(5) == 'T') && |
|
4474 (UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
4475 (UPP(8) == 'E')) { |
|
4476 if ((!terminate) && |
|
4477 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) |
|
4478 goto done; |
|
4479 #ifdef DEBUG_PUSH |
|
4480 xmlGenericError(xmlGenericErrorContext, |
|
4481 "HPP: Parsing internal subset\n"); |
|
4482 #endif |
|
4483 htmlParseDocTypeDecl(ctxt); |
|
4484 ctxt->instate = XML_PARSER_PROLOG; |
|
4485 #ifdef DEBUG_PUSH |
|
4486 xmlGenericError(xmlGenericErrorContext, |
|
4487 "HPP: entering PROLOG\n"); |
|
4488 #endif |
|
4489 } else if ((cur == '<') && (next == '!') && |
|
4490 (avail < 9)) { |
|
4491 goto done; |
|
4492 } else { |
|
4493 ctxt->instate = XML_PARSER_START_TAG; |
|
4494 #ifdef DEBUG_PUSH |
|
4495 xmlGenericError(xmlGenericErrorContext, |
|
4496 "HPP: entering START_TAG\n"); |
|
4497 #endif |
|
4498 } |
|
4499 break; |
|
4500 case XML_PARSER_PROLOG: |
|
4501 SKIP_BLANKS; |
|
4502 if (in->buf == NULL) |
|
4503 avail = in->length - (in->cur - in->base); |
|
4504 else |
|
4505 avail = in->buf->buffer->use - (in->cur - in->base); |
|
4506 if (avail < 2) |
|
4507 goto done; |
|
4508 cur = in->cur[0]; |
|
4509 next = in->cur[1]; |
|
4510 if ((cur == '<') && (next == '!') && |
|
4511 (in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
4512 if ((!terminate) && |
|
4513 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0)) |
|
4514 goto done; |
|
4515 #ifdef DEBUG_PUSH |
|
4516 xmlGenericError(xmlGenericErrorContext, |
|
4517 "HPP: Parsing Comment\n"); |
|
4518 #endif |
|
4519 htmlParseComment(ctxt); |
|
4520 ctxt->instate = XML_PARSER_PROLOG; |
|
4521 } else if ((cur == '<') && (next == '!') && |
|
4522 (avail < 4)) { |
|
4523 goto done; |
|
4524 } else { |
|
4525 ctxt->instate = XML_PARSER_START_TAG; |
|
4526 #ifdef DEBUG_PUSH |
|
4527 xmlGenericError(xmlGenericErrorContext, |
|
4528 "HPP: entering START_TAG\n"); |
|
4529 #endif |
|
4530 } |
|
4531 break; |
|
4532 case XML_PARSER_EPILOG: |
|
4533 if (in->buf == NULL) |
|
4534 avail = in->length - (in->cur - in->base); |
|
4535 else |
|
4536 avail = in->buf->buffer->use - (in->cur - in->base); |
|
4537 if (avail < 1) |
|
4538 goto done; |
|
4539 cur = in->cur[0]; |
|
4540 if (IS_BLANK_CH(cur)) { |
|
4541 htmlParseCharData(ctxt); |
|
4542 goto done; |
|
4543 } |
|
4544 if (avail < 2) |
|
4545 goto done; |
|
4546 next = in->cur[1]; |
|
4547 if ((cur == '<') && (next == '!') && |
|
4548 (in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
4549 if ((!terminate) && |
|
4550 (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0)) |
|
4551 goto done; |
|
4552 #ifdef DEBUG_PUSH |
|
4553 xmlGenericError(xmlGenericErrorContext, |
|
4554 "HPP: Parsing Comment\n"); |
|
4555 #endif |
|
4556 htmlParseComment(ctxt); |
|
4557 ctxt->instate = XML_PARSER_EPILOG; |
|
4558 } else if ((cur == '<') && (next == '!') && |
|
4559 (avail < 4)) { |
|
4560 goto done; |
|
4561 } else { |
|
4562 ctxt->errNo = XML_ERR_DOCUMENT_END; |
|
4563 ctxt->wellFormed = 0; |
|
4564 ctxt->instate = XML_PARSER_EOF; |
|
4565 #ifdef DEBUG_PUSH |
|
4566 xmlGenericError(xmlGenericErrorContext, |
|
4567 "HPP: entering EOF\n"); |
|
4568 #endif |
|
4569 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
4570 ctxt->sax->endDocument(ctxt->userData); |
|
4571 goto done; |
|
4572 } |
|
4573 break; |
|
4574 case XML_PARSER_START_TAG: { |
|
4575 const xmlChar *name, *oldname; |
|
4576 int depth = ctxt->nameNr; |
|
4577 const htmlElemDesc * info; |
|
4578 |
|
4579 if (avail < 2) |
|
4580 goto done; |
|
4581 cur = in->cur[0]; |
|
4582 if (cur != '<') { |
|
4583 ctxt->instate = XML_PARSER_CONTENT; |
|
4584 #ifdef DEBUG_PUSH |
|
4585 xmlGenericError(xmlGenericErrorContext, |
|
4586 "HPP: entering CONTENT\n"); |
|
4587 #endif |
|
4588 break; |
|
4589 } |
|
4590 if (in->cur[1] == '/') { |
|
4591 ctxt->instate = XML_PARSER_END_TAG; |
|
4592 ctxt->checkIndex = 0; |
|
4593 #ifdef DEBUG_PUSH |
|
4594 xmlGenericError(xmlGenericErrorContext, |
|
4595 "HPP: entering END_TAG\n"); |
|
4596 #endif |
|
4597 break; |
|
4598 } |
|
4599 if ((!terminate) && |
|
4600 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) |
|
4601 goto done; |
|
4602 |
|
4603 oldname = ctxt->name; |
|
4604 htmlParseStartTag(ctxt); |
|
4605 name = ctxt->name; |
|
4606 if (((depth == ctxt->nameNr) && |
|
4607 (xmlStrEqual(oldname, ctxt->name))) || |
|
4608 (name == NULL)) { |
|
4609 if (CUR == '>') |
|
4610 NEXT; |
|
4611 break; |
|
4612 } |
|
4613 |
|
4614 /* |
|
4615 * Lookup the info for that element. |
|
4616 */ |
|
4617 info = htmlTagLookup(name); |
|
4618 if (info == NULL) { |
|
4619 htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG, |
|
4620 "Tag %s invalid\n", name, NULL); |
|
4621 } |
|
4622 |
|
4623 /* |
|
4624 * Check for an Empty Element labeled the XML/SGML way |
|
4625 */ |
|
4626 if ((CUR == '/') && (NXT(1) == '>')) { |
|
4627 SKIP(2); |
|
4628 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
4629 ctxt->sax->endElement(ctxt->userData, name); |
|
4630 oldname = htmlnamePop(ctxt); |
|
4631 ctxt->instate = XML_PARSER_CONTENT; |
|
4632 #ifdef DEBUG_PUSH |
|
4633 xmlGenericError(xmlGenericErrorContext, |
|
4634 "HPP: entering CONTENT\n"); |
|
4635 #endif |
|
4636 break; |
|
4637 } |
|
4638 |
|
4639 if (CUR == '>') { |
|
4640 NEXT; |
|
4641 } else { |
|
4642 htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, |
|
4643 "Couldn't find end of Start Tag %s\n", |
|
4644 name, NULL); |
|
4645 |
|
4646 /* |
|
4647 * end of parsing of this node. |
|
4648 */ |
|
4649 if (xmlStrEqual(name, ctxt->name)) { |
|
4650 nodePop(ctxt); |
|
4651 oldname = htmlnamePop(ctxt); |
|
4652 } |
|
4653 |
|
4654 ctxt->instate = XML_PARSER_CONTENT; |
|
4655 #ifdef DEBUG_PUSH |
|
4656 xmlGenericError(xmlGenericErrorContext, |
|
4657 "HPP: entering CONTENT\n"); |
|
4658 #endif |
|
4659 break; |
|
4660 } |
|
4661 |
|
4662 /* |
|
4663 * Check for an Empty Element from DTD definition |
|
4664 */ |
|
4665 if ((info != NULL) && (info->empty)) { |
|
4666 if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) |
|
4667 ctxt->sax->endElement(ctxt->userData, name); |
|
4668 oldname = htmlnamePop(ctxt); |
|
4669 } |
|
4670 ctxt->instate = XML_PARSER_CONTENT; |
|
4671 #ifdef DEBUG_PUSH |
|
4672 xmlGenericError(xmlGenericErrorContext, |
|
4673 "HPP: entering CONTENT\n"); |
|
4674 #endif |
|
4675 break; |
|
4676 } |
|
4677 case XML_PARSER_CONTENT: { |
|
4678 long cons; |
|
4679 /* |
|
4680 * Handle preparsed entities and charRef |
|
4681 */ |
|
4682 if (ctxt->token != 0) { |
|
4683 xmlChar chr[2] = { 0 , 0 } ; |
|
4684 |
|
4685 chr[0] = (xmlChar) ctxt->token; |
|
4686 htmlCheckParagraph(ctxt); |
|
4687 if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) |
|
4688 ctxt->sax->characters(ctxt->userData, chr, 1); |
|
4689 ctxt->token = 0; |
|
4690 ctxt->checkIndex = 0; |
|
4691 } |
|
4692 if ((avail == 1) && (terminate)) { |
|
4693 cur = in->cur[0]; |
|
4694 if ((cur != '<') && (cur != '&')) { |
|
4695 if (ctxt->sax != NULL) { |
|
4696 if (IS_BLANK_CH(cur)) { |
|
4697 if (ctxt->sax->ignorableWhitespace != NULL) |
|
4698 ctxt->sax->ignorableWhitespace( |
|
4699 ctxt->userData, &cur, 1); |
|
4700 } else { |
|
4701 htmlCheckParagraph(ctxt); |
|
4702 if (ctxt->sax->characters != NULL) |
|
4703 ctxt->sax->characters( |
|
4704 ctxt->userData, &cur, 1); |
|
4705 } |
|
4706 } |
|
4707 ctxt->token = 0; |
|
4708 ctxt->checkIndex = 0; |
|
4709 in->cur++; |
|
4710 break; |
|
4711 } |
|
4712 } |
|
4713 if (avail < 2) |
|
4714 goto done; |
|
4715 cur = in->cur[0]; |
|
4716 next = in->cur[1]; |
|
4717 cons = ctxt->nbChars; |
|
4718 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) || |
|
4719 (xmlStrEqual(ctxt->name, BAD_CAST"style"))) { |
|
4720 /* |
|
4721 * Handle SCRIPT/STYLE separately |
|
4722 */ |
|
4723 if ((!terminate) && |
|
4724 (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0)) |
|
4725 goto done; |
|
4726 htmlParseScript(ctxt); |
|
4727 if ((cur == '<') && (next == '/')) { |
|
4728 ctxt->instate = XML_PARSER_END_TAG; |
|
4729 ctxt->checkIndex = 0; |
|
4730 #ifdef DEBUG_PUSH |
|
4731 xmlGenericError(xmlGenericErrorContext, |
|
4732 "HPP: entering END_TAG\n"); |
|
4733 #endif |
|
4734 break; |
|
4735 } |
|
4736 } else { |
|
4737 /* |
|
4738 * Sometimes DOCTYPE arrives in the middle of the document |
|
4739 */ |
|
4740 if ((cur == '<') && (next == '!') && |
|
4741 (UPP(2) == 'D') && (UPP(3) == 'O') && |
|
4742 (UPP(4) == 'C') && (UPP(5) == 'T') && |
|
4743 (UPP(6) == 'Y') && (UPP(7) == 'P') && |
|
4744 (UPP(8) == 'E')) { |
|
4745 if ((!terminate) && |
|
4746 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) |
|
4747 goto done; |
|
4748 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, |
|
4749 "Misplaced DOCTYPE declaration\n", |
|
4750 BAD_CAST "DOCTYPE" , NULL); |
|
4751 htmlParseDocTypeDecl(ctxt); |
|
4752 } else if ((cur == '<') && (next == '!') && |
|
4753 (in->cur[2] == '-') && (in->cur[3] == '-')) { |
|
4754 if ((!terminate) && |
|
4755 (htmlParseLookupSequence( |
|
4756 ctxt, '-', '-', '>', 1) < 0)) |
|
4757 goto done; |
|
4758 #ifdef DEBUG_PUSH |
|
4759 xmlGenericError(xmlGenericErrorContext, |
|
4760 "HPP: Parsing Comment\n"); |
|
4761 #endif |
|
4762 htmlParseComment(ctxt); |
|
4763 ctxt->instate = XML_PARSER_CONTENT; |
|
4764 } else if ((cur == '<') && (next == '!') && (avail < 4)) { |
|
4765 goto done; |
|
4766 } else if ((cur == '<') && (next == '/')) { |
|
4767 ctxt->instate = XML_PARSER_END_TAG; |
|
4768 ctxt->checkIndex = 0; |
|
4769 #ifdef DEBUG_PUSH |
|
4770 xmlGenericError(xmlGenericErrorContext, |
|
4771 "HPP: entering END_TAG\n"); |
|
4772 #endif |
|
4773 break; |
|
4774 } else if (cur == '<') { |
|
4775 ctxt->instate = XML_PARSER_START_TAG; |
|
4776 ctxt->checkIndex = 0; |
|
4777 #ifdef DEBUG_PUSH |
|
4778 xmlGenericError(xmlGenericErrorContext, |
|
4779 "HPP: entering START_TAG\n"); |
|
4780 #endif |
|
4781 break; |
|
4782 } else if (cur == '&') { |
|
4783 if ((!terminate) && |
|
4784 (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0)) |
|
4785 goto done; |
|
4786 #ifdef DEBUG_PUSH |
|
4787 xmlGenericError(xmlGenericErrorContext, |
|
4788 "HPP: Parsing Reference\n"); |
|
4789 #endif |
|
4790 |
|
4791 htmlParseReference(ctxt); |
|
4792 } else { |
|
4793 /* |
|
4794 * check that the text sequence is complete |
|
4795 * before handing out the data to the parser |
|
4796 * to avoid problems with erroneous end of |
|
4797 * data detection. |
|
4798 */ |
|
4799 if ((!terminate) && |
|
4800 (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0)) |
|
4801 goto done; |
|
4802 ctxt->checkIndex = 0; |
|
4803 #ifdef DEBUG_PUSH |
|
4804 xmlGenericError(xmlGenericErrorContext, |
|
4805 "HPP: Parsing char data\n"); |
|
4806 #endif |
|
4807 htmlParseCharData(ctxt); |
|
4808 } |
|
4809 } |
|
4810 if (cons == ctxt->nbChars) { |
|
4811 if (ctxt->node != NULL) { |
|
4812 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4813 "detected an error in element content\n", |
|
4814 NULL, NULL); |
|
4815 } |
|
4816 NEXT; |
|
4817 break; |
|
4818 } |
|
4819 |
|
4820 break; |
|
4821 } |
|
4822 case XML_PARSER_END_TAG: |
|
4823 if (avail < 2) |
|
4824 goto done; |
|
4825 if ((!terminate) && |
|
4826 (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) |
|
4827 goto done; |
|
4828 htmlParseEndTag(ctxt); |
|
4829 if (ctxt->nameNr == 0) { |
|
4830 ctxt->instate = XML_PARSER_EPILOG; |
|
4831 } else { |
|
4832 ctxt->instate = XML_PARSER_CONTENT; |
|
4833 } |
|
4834 ctxt->checkIndex = 0; |
|
4835 #ifdef DEBUG_PUSH |
|
4836 xmlGenericError(xmlGenericErrorContext, |
|
4837 "HPP: entering CONTENT\n"); |
|
4838 #endif |
|
4839 break; |
|
4840 case XML_PARSER_CDATA_SECTION: |
|
4841 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4842 "HPP: internal error, state == CDATA\n", |
|
4843 NULL, NULL); |
|
4844 ctxt->instate = XML_PARSER_CONTENT; |
|
4845 ctxt->checkIndex = 0; |
|
4846 #ifdef DEBUG_PUSH |
|
4847 xmlGenericError(xmlGenericErrorContext, |
|
4848 "HPP: entering CONTENT\n"); |
|
4849 #endif |
|
4850 break; |
|
4851 case XML_PARSER_DTD: |
|
4852 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4853 "HPP: internal error, state == DTD\n", |
|
4854 NULL, NULL); |
|
4855 ctxt->instate = XML_PARSER_CONTENT; |
|
4856 ctxt->checkIndex = 0; |
|
4857 #ifdef DEBUG_PUSH |
|
4858 xmlGenericError(xmlGenericErrorContext, |
|
4859 "HPP: entering CONTENT\n"); |
|
4860 #endif |
|
4861 break; |
|
4862 case XML_PARSER_COMMENT: |
|
4863 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4864 "HPP: internal error, state == COMMENT\n", |
|
4865 NULL, NULL); |
|
4866 ctxt->instate = XML_PARSER_CONTENT; |
|
4867 ctxt->checkIndex = 0; |
|
4868 #ifdef DEBUG_PUSH |
|
4869 xmlGenericError(xmlGenericErrorContext, |
|
4870 "HPP: entering CONTENT\n"); |
|
4871 #endif |
|
4872 break; |
|
4873 case XML_PARSER_PI: |
|
4874 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4875 "HPP: internal error, state == PI\n", |
|
4876 NULL, NULL); |
|
4877 ctxt->instate = XML_PARSER_CONTENT; |
|
4878 ctxt->checkIndex = 0; |
|
4879 #ifdef DEBUG_PUSH |
|
4880 xmlGenericError(xmlGenericErrorContext, |
|
4881 "HPP: entering CONTENT\n"); |
|
4882 #endif |
|
4883 break; |
|
4884 case XML_PARSER_ENTITY_DECL: |
|
4885 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4886 "HPP: internal error, state == ENTITY_DECL\n", |
|
4887 NULL, NULL); |
|
4888 ctxt->instate = XML_PARSER_CONTENT; |
|
4889 ctxt->checkIndex = 0; |
|
4890 #ifdef DEBUG_PUSH |
|
4891 xmlGenericError(xmlGenericErrorContext, |
|
4892 "HPP: entering CONTENT\n"); |
|
4893 #endif |
|
4894 break; |
|
4895 case XML_PARSER_ENTITY_VALUE: |
|
4896 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4897 "HPP: internal error, state == ENTITY_VALUE\n", |
|
4898 NULL, NULL); |
|
4899 ctxt->instate = XML_PARSER_CONTENT; |
|
4900 ctxt->checkIndex = 0; |
|
4901 #ifdef DEBUG_PUSH |
|
4902 xmlGenericError(xmlGenericErrorContext, |
|
4903 "HPP: entering DTD\n"); |
|
4904 #endif |
|
4905 break; |
|
4906 case XML_PARSER_ATTRIBUTE_VALUE: |
|
4907 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4908 "HPP: internal error, state == ATTRIBUTE_VALUE\n", |
|
4909 NULL, NULL); |
|
4910 ctxt->instate = XML_PARSER_START_TAG; |
|
4911 ctxt->checkIndex = 0; |
|
4912 #ifdef DEBUG_PUSH |
|
4913 xmlGenericError(xmlGenericErrorContext, |
|
4914 "HPP: entering START_TAG\n"); |
|
4915 #endif |
|
4916 break; |
|
4917 case XML_PARSER_SYSTEM_LITERAL: |
|
4918 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4919 "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n", |
|
4920 NULL, NULL); |
|
4921 ctxt->instate = XML_PARSER_CONTENT; |
|
4922 ctxt->checkIndex = 0; |
|
4923 #ifdef DEBUG_PUSH |
|
4924 xmlGenericError(xmlGenericErrorContext, |
|
4925 "HPP: entering CONTENT\n"); |
|
4926 #endif |
|
4927 break; |
|
4928 case XML_PARSER_IGNORE: |
|
4929 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4930 "HPP: internal error, state == XML_PARSER_IGNORE\n", |
|
4931 NULL, NULL); |
|
4932 ctxt->instate = XML_PARSER_CONTENT; |
|
4933 ctxt->checkIndex = 0; |
|
4934 #ifdef DEBUG_PUSH |
|
4935 xmlGenericError(xmlGenericErrorContext, |
|
4936 "HPP: entering CONTENT\n"); |
|
4937 #endif |
|
4938 break; |
|
4939 case XML_PARSER_PUBLIC_LITERAL: |
|
4940 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR, |
|
4941 "HPP: internal error, state == XML_PARSER_LITERAL\n", |
|
4942 NULL, NULL); |
|
4943 ctxt->instate = XML_PARSER_CONTENT; |
|
4944 ctxt->checkIndex = 0; |
|
4945 #ifdef DEBUG_PUSH |
|
4946 xmlGenericError(xmlGenericErrorContext, |
|
4947 "HPP: entering CONTENT\n"); |
|
4948 #endif |
|
4949 break; |
|
4950 |
|
4951 } |
|
4952 } |
|
4953 done: |
|
4954 if ((avail == 0) && (terminate)) { |
|
4955 htmlAutoCloseOnEnd(ctxt); |
|
4956 if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { |
|
4957 /* |
|
4958 * SAX: end of the document processing. |
|
4959 */ |
|
4960 ctxt->instate = XML_PARSER_EOF; |
|
4961 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
4962 ctxt->sax->endDocument(ctxt->userData); |
|
4963 } |
|
4964 } |
|
4965 if ((ctxt->myDoc != NULL) && |
|
4966 ((terminate) || (ctxt->instate == XML_PARSER_EOF) || |
|
4967 (ctxt->instate == XML_PARSER_EPILOG))) { |
|
4968 xmlDtdPtr dtd; |
|
4969 dtd = xmlGetIntSubset(ctxt->myDoc); |
|
4970 if (dtd == NULL) |
|
4971 ctxt->myDoc->intSubset = |
|
4972 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", |
|
4973 BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN", |
|
4974 BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd"); |
|
4975 } |
|
4976 #ifdef DEBUG_PUSH |
|
4977 xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret); |
|
4978 #endif |
|
4979 return(ret); |
|
4980 } |
|
4981 |
|
4982 /** |
|
4983 * htmlParseChunk: |
|
4984 * @param ctxt an HTML parser context |
|
4985 * @param chunk an char array |
|
4986 * @param size the size in byte of the chunk |
|
4987 * @param terminate last chunk indicator |
|
4988 * |
|
4989 * Parse a Chunk of memory |
|
4990 * |
|
4991 * Returns zero if no error, the xmlParserErrors otherwise. |
|
4992 */ |
|
4993 int |
|
4994 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, |
|
4995 int terminate) { |
|
4996 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && |
|
4997 (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { |
|
4998 int base = ctxt->input->base - ctxt->input->buf->buffer->content; |
|
4999 int cur = ctxt->input->cur - ctxt->input->base; |
|
5000 |
|
5001 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); |
|
5002 ctxt->input->base = ctxt->input->buf->buffer->content + base; |
|
5003 ctxt->input->cur = ctxt->input->base + cur; |
|
5004 #ifdef DEBUG_PUSH |
|
5005 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); |
|
5006 #endif |
|
5007 |
|
5008 #if 0 |
|
5009 if ((terminate) || (ctxt->input->buf->buffer->use > 80)) |
|
5010 htmlParseTryOrFinish(ctxt, terminate); |
|
5011 #endif |
|
5012 } else if (ctxt->instate != XML_PARSER_EOF) { |
|
5013 if ((ctxt->input != NULL) && ctxt->input->buf != NULL) { |
|
5014 xmlParserInputBufferPtr in = ctxt->input->buf; |
|
5015 if ((in->encoder != NULL) && (in->buffer != NULL) && |
|
5016 (in->raw != NULL)) { |
|
5017 int nbchars; |
|
5018 |
|
5019 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); |
|
5020 if (nbchars < 0) { |
|
5021 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, |
|
5022 "encoder error\n", NULL, NULL); |
|
5023 return(XML_ERR_INVALID_ENCODING); |
|
5024 } |
|
5025 } |
|
5026 } |
|
5027 } |
|
5028 htmlParseTryOrFinish(ctxt, terminate); |
|
5029 if (terminate) { |
|
5030 if ((ctxt->instate != XML_PARSER_EOF) && |
|
5031 (ctxt->instate != XML_PARSER_EPILOG) && |
|
5032 (ctxt->instate != XML_PARSER_MISC)) { |
|
5033 ctxt->errNo = XML_ERR_DOCUMENT_END; |
|
5034 ctxt->wellFormed = 0; |
|
5035 } |
|
5036 if (ctxt->instate != XML_PARSER_EOF) { |
|
5037 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL)) |
|
5038 ctxt->sax->endDocument(ctxt->userData); |
|
5039 } |
|
5040 ctxt->instate = XML_PARSER_EOF; |
|
5041 } |
|
5042 return((xmlParserErrors) ctxt->errNo); |
|
5043 } |
|
5044 #endif /* LIBXML_PUSH_ENABLED */ |
|
5045 |
|
5046 /************************************************************************ |
|
5047 * * |
|
5048 * User entry points * |
|
5049 * * |
|
5050 ************************************************************************/ |
|
5051 |
|
5052 /** |
|
5053 * htmlCreatePushParserCtxt: |
|
5054 * @param sax a SAX handler |
|
5055 * @param user_data The user data returned on SAX callbacks |
|
5056 * @param chunk a pointer to an array of chars |
|
5057 * @param size number of chars in the array |
|
5058 * @param filename an optional file name or URI |
|
5059 * @param enc an optional encoding |
|
5060 * |
|
5061 * Create a parser context for using the HTML parser in push mode |
|
5062 * The value of filename is used for fetching external entities |
|
5063 * and error/warning reports. |
|
5064 * |
|
5065 * Returns the new parser context or NULL |
|
5066 */ |
|
5067 htmlParserCtxtPtr |
|
5068 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, |
|
5069 const char *chunk, int size, const char *filename, |
|
5070 xmlCharEncoding enc) { |
|
5071 htmlParserCtxtPtr ctxt; |
|
5072 htmlParserInputPtr inputStream; |
|
5073 xmlParserInputBufferPtr buf; |
|
5074 |
|
5075 xmlInitParser(); |
|
5076 |
|
5077 buf = xmlAllocParserInputBuffer(enc); |
|
5078 if (buf == NULL) return(NULL); |
|
5079 |
|
5080 ctxt = htmlNewParserCtxt(); |
|
5081 if (ctxt == NULL) { |
|
5082 xmlFreeParserInputBuffer(buf); |
|
5083 return(NULL); |
|
5084 } |
|
5085 if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) |
|
5086 ctxt->charset=XML_CHAR_ENCODING_UTF8; |
|
5087 if (sax != NULL) { |
|
5088 if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler) |
|
5089 xmlFree(ctxt->sax); |
|
5090 ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler)); |
|
5091 if (ctxt->sax == NULL) { |
|
5092 xmlFree(buf); |
|
5093 xmlFree(ctxt); |
|
5094 return(NULL); |
|
5095 } |
|
5096 memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler)); |
|
5097 if (user_data != NULL) |
|
5098 ctxt->userData = user_data; |
|
5099 } |
|
5100 if (filename == NULL) { |
|
5101 ctxt->directory = NULL; |
|
5102 } else { |
|
5103 ctxt->directory = xmlParserGetDirectory(filename); |
|
5104 } |
|
5105 |
|
5106 inputStream = htmlNewInputStream(ctxt); |
|
5107 if (inputStream == NULL) { |
|
5108 xmlFreeParserCtxt(ctxt); |
|
5109 xmlFree(buf); |
|
5110 return(NULL); |
|
5111 } |
|
5112 |
|
5113 if (filename == NULL) |
|
5114 inputStream->filename = NULL; |
|
5115 else |
|
5116 inputStream->filename = (char *) |
|
5117 xmlCanonicPath((const xmlChar *) filename); |
|
5118 inputStream->buf = buf; |
|
5119 inputStream->base = inputStream->buf->buffer->content; |
|
5120 inputStream->cur = inputStream->buf->buffer->content; |
|
5121 inputStream->end = |
|
5122 &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; |
|
5123 |
|
5124 inputPush(ctxt, inputStream); |
|
5125 |
|
5126 if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && |
|
5127 (ctxt->input->buf != NULL)) { |
|
5128 int base = ctxt->input->base - ctxt->input->buf->buffer->content; |
|
5129 int cur = ctxt->input->cur - ctxt->input->base; |
|
5130 |
|
5131 xmlParserInputBufferPush(ctxt->input->buf, size, chunk); |
|
5132 |
|
5133 ctxt->input->base = ctxt->input->buf->buffer->content + base; |
|
5134 ctxt->input->cur = ctxt->input->base + cur; |
|
5135 ctxt->input->end = |
|
5136 &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; |
|
5137 #ifdef DEBUG_PUSH |
|
5138 xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); |
|
5139 #endif |
|
5140 } |
|
5141 |
|
5142 return(ctxt); |
|
5143 } |
|
5144 |
|
5145 /** |
|
5146 * htmlSAXParseDoc: |
|
5147 * @param cur a pointer to an array of xmlChar |
|
5148 * @param encoding a free form C string describing the HTML document encoding, or NULL |
|
5149 * @param sax the SAX handler block |
|
5150 * @param userData if using SAX, this pointer will be provided on callbacks. |
|
5151 * |
|
5152 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks |
|
5153 * to handle parse events. If sax is NULL, fallback to the default DOM |
|
5154 * behavior and return a tree. |
|
5155 * |
|
5156 * Returns the resulting document tree unless SAX is NULL or the document is |
|
5157 * not well formed. |
|
5158 */ |
|
5159 |
|
5160 htmlDocPtr |
|
5161 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) { |
|
5162 htmlDocPtr ret; |
|
5163 htmlParserCtxtPtr ctxt; |
|
5164 |
|
5165 xmlInitParser(); |
|
5166 |
|
5167 if (cur == NULL) return(NULL); |
|
5168 |
|
5169 |
|
5170 ctxt = htmlCreateDocParserCtxt(cur, encoding); |
|
5171 if (ctxt == NULL) return(NULL); |
|
5172 if (sax != NULL) { |
|
5173 if (ctxt->sax != NULL) xmlFree (ctxt->sax); |
|
5174 ctxt->sax = sax; |
|
5175 ctxt->userData = userData; |
|
5176 } |
|
5177 |
|
5178 htmlParseDocument(ctxt); |
|
5179 ret = ctxt->myDoc; |
|
5180 if (sax != NULL) { |
|
5181 ctxt->sax = NULL; |
|
5182 ctxt->userData = NULL; |
|
5183 } |
|
5184 htmlFreeParserCtxt(ctxt); |
|
5185 |
|
5186 return(ret); |
|
5187 } |
|
5188 |
|
5189 /** |
|
5190 * htmlParseDoc: |
|
5191 * @param cur a pointer to an array of xmlChar |
|
5192 * @param encoding a free form C string describing the HTML document encoding, or NULL |
|
5193 * |
|
5194 * parse an HTML in-memory document and build a tree. |
|
5195 * |
|
5196 * Returns the resulting document tree |
|
5197 */ |
|
5198 |
|
5199 htmlDocPtr |
|
5200 htmlParseDoc(xmlChar *cur, const char *encoding) { |
|
5201 return(htmlSAXParseDoc(cur, encoding, NULL, NULL)); |
|
5202 } |
|
5203 |
|
5204 |
|
5205 /** |
|
5206 * htmlCreateFileParserCtxt: |
|
5207 * @param filename the filename |
|
5208 * @param encoding a free form C string describing the HTML document encoding, or NULL |
|
5209 * |
|
5210 * Create a parser context for a file content. |
|
5211 * Automatic support for ZLIB/Compress compressed document is provided |
|
5212 * by default if found at compile-time. |
|
5213 * |
|
5214 * Returns the new parser context or NULL |
|
5215 */ |
|
5216 htmlParserCtxtPtr |
|
5217 htmlCreateFileParserCtxt(const char *filename, const char *encoding) |
|
5218 { |
|
5219 htmlParserCtxtPtr ctxt; |
|
5220 htmlParserInputPtr inputStream; |
|
5221 char *canonicFilename; |
|
5222 /* htmlCharEncoding enc; */ |
|
5223 xmlChar *content, *content_line = (xmlChar *) "charset="; |
|
5224 |
|
5225 ctxt = htmlNewParserCtxt(); |
|
5226 if (ctxt == NULL) { |
|
5227 return(NULL); |
|
5228 } |
|
5229 canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename); |
|
5230 if (canonicFilename == NULL) { |
|
5231 #ifdef LIBXML_SAX1_ENABLED |
|
5232 if (xmlDefaultSAXHandler.error != NULL) { |
|
5233 xmlDefaultSAXHandler.error(NULL, "out of memory\n"); |
|
5234 } |
|
5235 #endif |
|
5236 xmlFreeParserCtxt(ctxt); |
|
5237 return(NULL); |
|
5238 } |
|
5239 |
|
5240 inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt); |
|
5241 xmlFree(canonicFilename); |
|
5242 if (inputStream == NULL) { |
|
5243 xmlFreeParserCtxt(ctxt); |
|
5244 return(NULL); |
|
5245 } |
|
5246 |
|
5247 inputPush(ctxt, inputStream); |
|
5248 |
|
5249 /* set encoding */ |
|
5250 if (encoding) { |
|
5251 content = (xmlChar*)xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1); |
|
5252 if (content) { |
|
5253 strcpy ((char *)content, (char *)content_line); |
|
5254 strcat ((char *)content, (char *)encoding); |
|
5255 htmlCheckEncoding (ctxt, content); |
|
5256 xmlFree (content); |
|
5257 } |
|
5258 } |
|
5259 |
|
5260 return(ctxt); |
|
5261 } |
|
5262 |
|
5263 /** |
|
5264 * htmlSAXParseFile: |
|
5265 * @param filename the filename |
|
5266 * @param encoding a free form C string describing the HTML document encoding, or NULL |
|
5267 * @param sax the SAX handler block |
|
5268 * @param userData if using SAX, this pointer will be provided on callbacks. |
|
5269 * |
|
5270 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress |
|
5271 * compressed document is provided by default if found at compile-time. |
|
5272 * It use the given SAX function block to handle the parsing callback. |
|
5273 * If sax is NULL, fallback to the default DOM tree building routines. |
|
5274 * |
|
5275 * Returns the resulting document tree unless SAX is NULL or the document is |
|
5276 * not well formed. |
|
5277 */ |
|
5278 |
|
5279 htmlDocPtr |
|
5280 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, |
|
5281 void *userData) { |
|
5282 htmlDocPtr ret; |
|
5283 htmlParserCtxtPtr ctxt; |
|
5284 htmlSAXHandlerPtr oldsax = NULL; |
|
5285 |
|
5286 xmlInitParser(); |
|
5287 |
|
5288 ctxt = htmlCreateFileParserCtxt(filename, encoding); |
|
5289 if (ctxt == NULL) return(NULL); |
|
5290 if (sax != NULL) { |
|
5291 oldsax = ctxt->sax; |
|
5292 ctxt->sax = sax; |
|
5293 ctxt->userData = userData; |
|
5294 } |
|
5295 |
|
5296 htmlParseDocument(ctxt); |
|
5297 |
|
5298 ret = ctxt->myDoc; |
|
5299 if (sax != NULL) { |
|
5300 ctxt->sax = oldsax; |
|
5301 ctxt->userData = NULL; |
|
5302 } |
|
5303 htmlFreeParserCtxt(ctxt); |
|
5304 |
|
5305 return(ret); |
|
5306 } |
|
5307 |
|
5308 /** |
|
5309 * htmlParseFile: |
|
5310 * @param filename the filename |
|
5311 * @param encoding a free form C string describing the HTML document encoding, or NULL |
|
5312 * |
|
5313 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress |
|
5314 * compressed document is provided by default if found at compile-time. |
|
5315 * |
|
5316 * Returns the resulting document tree |
|
5317 */ |
|
5318 |
|
5319 htmlDocPtr |
|
5320 htmlParseFile(const char *filename, const char *encoding) { |
|
5321 return(htmlSAXParseFile(filename, encoding, NULL, NULL)); |
|
5322 } |
|
5323 |
|
5324 /** |
|
5325 * htmlHandleOmittedElem: |
|
5326 * @param val int 0 or 1 |
|
5327 * |
|
5328 * Set and return the previous value for handling HTML omitted tags. |
|
5329 * |
|
5330 * Returns the last value for 0 for no handling, 1 for auto insertion. |
|
5331 */ |
|
5332 |
|
5333 int |
|
5334 htmlHandleOmittedElem(int val) { |
|
5335 int old = htmlOmittedDefaultValue; |
|
5336 |
|
5337 |
|
5338 return(old); |
|
5339 } |
|
5340 |
|
5341 /** |
|
5342 * htmlElementAllowedHere: |
|
5343 * @param parent HTML parent element |
|
5344 * @param elt HTML element |
|
5345 * |
|
5346 * Checks whether an HTML element may be a direct child of a parent element. |
|
5347 * Note - doesn't check for deprecated elements |
|
5348 * |
|
5349 * Returns 1 if allowed; 0 otherwise. |
|
5350 */ |
|
5351 int |
|
5352 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) { |
|
5353 const char** p ; |
|
5354 |
|
5355 if ( ! elt || ! parent || ! parent->subelts ) |
|
5356 return 0 ; |
|
5357 |
|
5358 for ( p = parent->subelts; *p; ++p ) |
|
5359 if ( !xmlStrcmp((const xmlChar *)*p, elt) ) |
|
5360 return 1 ; |
|
5361 |
|
5362 return 0 ; |
|
5363 } |
|
5364 /** |
|
5365 * htmlElementStatusHere: |
|
5366 * @param parent HTML parent element |
|
5367 * @param elt HTML element |
|
5368 * |
|
5369 * Checks whether an HTML element may be a direct child of a parent element. |
|
5370 * and if so whether it is valid or deprecated. |
|
5371 * |
|
5372 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
|
5373 */ |
|
5374 htmlStatus |
|
5375 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) { |
|
5376 if ( ! parent || ! elt ) |
|
5377 return HTML_INVALID ; |
|
5378 if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) ) |
|
5379 return HTML_INVALID ; |
|
5380 |
|
5381 return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ; |
|
5382 } |
|
5383 /** |
|
5384 * htmlAttrAllowed: |
|
5385 * @param elt HTML element |
|
5386 * @param attr HTML attribute |
|
5387 * @param legacy whether to allow deprecated attributes |
|
5388 * |
|
5389 * Checks whether an attribute is valid for an element |
|
5390 * Has full knowledge of Required and Deprecated attributes |
|
5391 * |
|
5392 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
|
5393 */ |
|
5394 htmlStatus |
|
5395 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) { |
|
5396 const char** p ; |
|
5397 |
|
5398 if ( !elt || ! attr ) |
|
5399 return HTML_INVALID ; |
|
5400 |
|
5401 if ( elt->attrs_req ) |
|
5402 for ( p = elt->attrs_req; *p; ++p) |
|
5403 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) |
|
5404 return HTML_REQUIRED ; |
|
5405 |
|
5406 if ( elt->attrs_opt ) |
|
5407 for ( p = elt->attrs_opt; *p; ++p) |
|
5408 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) |
|
5409 return HTML_VALID ; |
|
5410 |
|
5411 if ( legacy && elt->attrs_depr ) |
|
5412 for ( p = elt->attrs_depr; *p; ++p) |
|
5413 if ( !xmlStrcmp((const xmlChar*)*p, attr) ) |
|
5414 return HTML_DEPRECATED ; |
|
5415 |
|
5416 return HTML_INVALID ; |
|
5417 } |
|
5418 /** |
|
5419 * htmlNodeStatus: |
|
5420 * @param node an htmlNodePtr in a tree |
|
5421 * @param legacy whether to allow deprecated elements (YES is faster here |
|
5422 * for Element nodes) |
|
5423 * |
|
5424 * Checks whether the tree node is valid. Experimental (the author |
|
5425 * only uses the HTML enhancements in a SAX parser) |
|
5426 * |
|
5427 * Return: for Element nodes, a return from htmlElementAllowedHere (if |
|
5428 * legacy allowed) or htmlElementStatusHere (otherwise). |
|
5429 * for Attribute nodes, a return from htmlAttrAllowed |
|
5430 * for other nodes, HTML_NA (no checks performed) |
|
5431 */ |
|
5432 htmlStatus |
|
5433 htmlNodeStatus(const htmlNodePtr node, int legacy) { |
|
5434 if ( ! node ) |
|
5435 return HTML_INVALID ; |
|
5436 |
|
5437 switch ( node->type ) { |
|
5438 case XML_ELEMENT_NODE: |
|
5439 return legacy |
|
5440 ? ( htmlElementAllowedHere ( |
|
5441 htmlTagLookup(node->parent->name) , node->name |
|
5442 ) ? HTML_VALID : HTML_INVALID ) |
|
5443 : htmlElementStatusHere( |
|
5444 htmlTagLookup(node->parent->name) , |
|
5445 htmlTagLookup(node->name) ) |
|
5446 ; |
|
5447 case XML_ATTRIBUTE_NODE: |
|
5448 return htmlAttrAllowed( |
|
5449 htmlTagLookup(node->parent->name) , node->name, legacy) ; |
|
5450 default: return HTML_NA ; |
|
5451 } |
|
5452 } |
|
5453 /************************************************************************ |
|
5454 * * |
|
5455 * New set (2.6.0) of simpler and more flexible APIs * |
|
5456 * * |
|
5457 ************************************************************************/ |
|
/**
 * DICT_FREE:
 * @param str a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the macro
 * is expanded; a NULL dict means the string is never dictionary-owned.
 *
 * The expansion is wrapped in do { } while (0) so that it behaves as a
 * single statement (safe inside an unbraced if/else) and must be followed
 * by a semicolon. Note that str is evaluated more than once.
 */
#define DICT_FREE(str)                                              \
    do {                                                            \
        if ((str) && ((!dict) ||                                    \
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))      \
            xmlFree((char *)(str));                                 \
    } while (0)
|
5469 |
|
5470 /** |
|
5471 * htmlCtxtReset: |
|
5472 * @param ctxt an HTML parser context |
|
5473 * |
|
5474 * Reset a parser context |
|
5475 */ |
|
5476 void |
|
5477 htmlCtxtReset(htmlParserCtxtPtr ctxt) |
|
5478 { |
|
5479 xmlParserInputPtr input; |
|
5480 xmlDictPtr dict = ctxt->dict; |
|
5481 |
|
5482 while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */ |
|
5483 xmlFreeInputStream(input); |
|
5484 } |
|
5485 ctxt->inputNr = 0; |
|
5486 ctxt->input = NULL; |
|
5487 |
|
5488 ctxt->spaceNr = 0; |
|
5489 ctxt->spaceTab[0] = -1; |
|
5490 ctxt->space = &ctxt->spaceTab[0]; |
|
5491 |
|
5492 |
|
5493 ctxt->nodeNr = 0; |
|
5494 ctxt->node = NULL; |
|
5495 |
|
5496 ctxt->nameNr = 0; |
|
5497 ctxt->name = NULL; |
|
5498 |
|
5499 DICT_FREE(ctxt->version); |
|
5500 ctxt->version = NULL; |
|
5501 DICT_FREE(ctxt->encoding); |
|
5502 ctxt->encoding = NULL; |
|
5503 DICT_FREE(ctxt->directory); |
|
5504 ctxt->directory = NULL; |
|
5505 DICT_FREE(ctxt->extSubURI); |
|
5506 ctxt->extSubURI = NULL; |
|
5507 DICT_FREE(ctxt->extSubSystem); |
|
5508 ctxt->extSubSystem = NULL; |
|
5509 if (ctxt->myDoc != NULL) |
|
5510 xmlFreeDoc(ctxt->myDoc); |
|
5511 ctxt->myDoc = NULL; |
|
5512 |
|
5513 ctxt->standalone = -1; |
|
5514 ctxt->hasExternalSubset = 0; |
|
5515 ctxt->hasPErefs = 0; |
|
5516 ctxt->html = 1; |
|
5517 ctxt->external = 0; |
|
5518 ctxt->instate = XML_PARSER_START; |
|
5519 ctxt->token = 0; |
|
5520 |
|
5521 ctxt->wellFormed = 1; |
|
5522 ctxt->nsWellFormed = 1; |
|
5523 ctxt->valid = 1; |
|
5524 ctxt->vctxt.userData = ctxt; |
|
5525 ctxt->vctxt.error = xmlParserValidityError; |
|
5526 ctxt->vctxt.warning = xmlParserValidityWarning; |
|
5527 ctxt->record_info = 0; |
|
5528 ctxt->nbChars = 0; |
|
5529 ctxt->checkIndex = 0; |
|
5530 ctxt->inSubset = 0; |
|
5531 ctxt->errNo = XML_ERR_OK; |
|
5532 ctxt->depth = 0; |
|
5533 ctxt->charset = XML_CHAR_ENCODING_UTF8; |
|
5534 ctxt->catalogs = NULL; |
|
5535 xmlInitNodeInfoSeq(&ctxt->node_seq); |
|
5536 |
|
5537 if (ctxt->attsDefault != NULL) { |
|
5538 xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree); |
|
5539 ctxt->attsDefault = NULL; |
|
5540 } |
|
5541 if (ctxt->attsSpecial != NULL) { |
|
5542 xmlHashFree(ctxt->attsSpecial, NULL); |
|
5543 ctxt->attsSpecial = NULL; |
|
5544 } |
|
5545 } |
|
5546 |
|
5547 /** |
|
5548 * htmlCtxtUseOptions: |
|
5549 * @param ctxt an HTML parser context |
|
5550 * @param options a combination of htmlParserOption(s) |
|
5551 * |
|
5552 * Applies the options to the parser context |
|
5553 * |
|
5554 * Returns 0 in case of success, the set of unknown or unimplemented options |
|
5555 * in case of error. |
|
5556 */ |
|
5557 int |
|
5558 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) |
|
5559 { |
|
5560 if (options & HTML_PARSE_NOWARNING) { |
|
5561 ctxt->sax->warning = NULL; |
|
5562 ctxt->vctxt.warning = NULL; |
|
5563 options -= XML_PARSE_NOWARNING; |
|
5564 ctxt->options |= XML_PARSE_NOWARNING; |
|
5565 } |
|
5566 if (options & HTML_PARSE_NOERROR) { |
|
5567 ctxt->sax->error = NULL; |
|
5568 ctxt->vctxt.error = NULL; |
|
5569 ctxt->sax->fatalError = NULL; |
|
5570 options -= XML_PARSE_NOERROR; |
|
5571 ctxt->options |= XML_PARSE_NOERROR; |
|
5572 } |
|
5573 if (options & HTML_PARSE_PEDANTIC) { |
|
5574 ctxt->pedantic = 1; |
|
5575 options -= XML_PARSE_PEDANTIC; |
|
5576 ctxt->options |= XML_PARSE_PEDANTIC; |
|
5577 } else |
|
5578 ctxt->pedantic = 0; |
|
5579 if (options & XML_PARSE_NOBLANKS) { |
|
5580 ctxt->keepBlanks = 0; |
|
5581 ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; |
|
5582 options -= XML_PARSE_NOBLANKS; |
|
5583 ctxt->options |= XML_PARSE_NOBLANKS; |
|
5584 } else |
|
5585 ctxt->keepBlanks = 1; |
|
5586 ctxt->dictNames = 0; |
|
5587 return (options); |
|
5588 } |
|
5589 |
|
5590 /** |
|
5591 * htmlDoRead: |
|
5592 * @param ctxt an HTML parser context |
|
5593 * @param URL the base URL to use for the document |
|
5594 * @param encoding the document encoding, or NULL |
|
5595 * @param options a combination of htmlParserOption(s) |
|
5596 * @param reuse keep the context for reuse |
|
5597 * |
|
5598 * Common front-end for the htmlRead functions |
|
5599 * |
|
5600 * Returns the resulting document tree or NULL |
|
5601 */ |
|
5602 static htmlDocPtr |
|
5603 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, |
|
5604 int options, int reuse) |
|
5605 { |
|
5606 htmlDocPtr ret; |
|
5607 |
|
5608 htmlCtxtUseOptions(ctxt, options); |
|
5609 ctxt->html = 1; |
|
5610 if (encoding != NULL) { |
|
5611 xmlCharEncodingHandlerPtr hdlr; |
|
5612 |
|
5613 hdlr = xmlFindCharEncodingHandler(encoding); |
|
5614 if (hdlr != NULL) |
|
5615 xmlSwitchToEncoding(ctxt, hdlr); |
|
5616 } |
|
5617 if ((URL != NULL) && (ctxt->input != NULL) && |
|
5618 (ctxt->input->filename == NULL)) |
|
5619 ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL); |
|
5620 htmlParseDocument(ctxt); |
|
5621 ret = ctxt->myDoc; |
|
5622 ctxt->myDoc = NULL; |
|
5623 if (!reuse) { |
|
5624 if ((ctxt->dictNames) && |
|
5625 (ret != NULL) && |
|
5626 (ret->dict == ctxt->dict)) |
|
5627 ctxt->dict = NULL; |
|
5628 xmlFreeParserCtxt(ctxt); |
|
5629 } |
|
5630 return (ret); |
|
5631 } |
|
5632 |
|
5633 /** |
|
5634 * htmlReadDoc: |
|
5635 * @param cur a pointer to a zero terminated string |
|
5636 * @param URL the base URL to use for the document |
|
5637 * @param encoding the document encoding, or NULL |
|
5638 * @param options a combination of htmlParserOption(s) |
|
5639 * |
|
5640 * parse an XML in-memory document and build a tree. |
|
5641 * |
|
5642 * Returns the resulting document tree |
|
5643 */ |
|
5644 htmlDocPtr |
|
5645 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options) |
|
5646 { |
|
5647 htmlParserCtxtPtr ctxt; |
|
5648 |
|
5649 if (cur == NULL) |
|
5650 return (NULL); |
|
5651 |
|
5652 ctxt = xmlCreateDocParserCtxt(cur, sizeof(cur)); |
|
5653 if (ctxt == NULL) |
|
5654 return (NULL); |
|
5655 return (htmlDoRead(ctxt, URL, encoding, options, 0)); |
|
5656 } |
|
5657 |
|
5658 /** |
|
5659 * htmlReadFile: |
|
5660 * @param filename a file or URL |
|
5661 * @param encoding the document encoding, or NULL |
|
5662 * @param options a combination of htmlParserOption(s) |
|
5663 * |
|
5664 * parse an XML file from the filesystem or the network. |
|
5665 * |
|
5666 * Returns the resulting document tree |
|
5667 */ |
|
5668 htmlDocPtr |
|
5669 htmlReadFile(const char *filename, const char *encoding, int options) |
|
5670 { |
|
5671 htmlParserCtxtPtr ctxt; |
|
5672 |
|
5673 ctxt = htmlCreateFileParserCtxt(filename, encoding); |
|
5674 if (ctxt == NULL) |
|
5675 return (NULL); |
|
5676 return (htmlDoRead(ctxt, NULL, NULL, options, 0)); |
|
5677 } |
|
5678 |
|
5679 /** |
|
5680 * htmlReadMemory: |
|
5681 * @param buffer a pointer to a char array |
|
5682 * @param size the size of the array |
|
5683 * @param URL the base URL to use for the document |
|
5684 * @param encoding the document encoding, or NULL |
|
5685 * @param options a combination of htmlParserOption(s) |
|
5686 * |
|
5687 * parse an XML in-memory document and build a tree. |
|
5688 * |
|
5689 * Returns the resulting document tree |
|
5690 */ |
|
5691 htmlDocPtr |
|
5692 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options) |
|
5693 { |
|
5694 htmlParserCtxtPtr ctxt; |
|
5695 |
|
5696 ctxt = xmlCreateMemoryParserCtxt(buffer, size); |
|
5697 if (ctxt == NULL) |
|
5698 return (NULL); |
|
5699 return (htmlDoRead(ctxt, URL, encoding, options, 0)); |
|
5700 } |
|
5701 |
|
5702 /** |
|
5703 * htmlReadFd: |
|
5704 * @param fd an open file descriptor |
|
5705 * @param URL the base URL to use for the document |
|
5706 * @param encoding the document encoding, or NULL |
|
5707 * @param options a combination of htmlParserOption(s) |
|
5708 * |
|
5709 * parse an XML from a file descriptor and build a tree. |
|
5710 * |
|
5711 * Returns the resulting document tree |
|
5712 */ |
|
5713 htmlDocPtr |
|
5714 htmlReadFd(int fd, const char *URL, const char *encoding, int options) |
|
5715 { |
|
5716 htmlParserCtxtPtr ctxt; |
|
5717 xmlParserInputBufferPtr input; |
|
5718 xmlParserInputPtr stream; |
|
5719 |
|
5720 if (fd < 0) |
|
5721 return (NULL); |
|
5722 |
|
5723 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); |
|
5724 if (input == NULL) |
|
5725 return (NULL); |
|
5726 ctxt = xmlNewParserCtxt(); |
|
5727 if (ctxt == NULL) { |
|
5728 xmlFreeParserInputBuffer(input); |
|
5729 return (NULL); |
|
5730 } |
|
5731 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); |
|
5732 if (stream == NULL) { |
|
5733 xmlFreeParserInputBuffer(input); |
|
5734 xmlFreeParserCtxt(ctxt); |
|
5735 return (NULL); |
|
5736 } |
|
5737 inputPush(ctxt, stream); |
|
5738 return (htmlDoRead(ctxt, URL, encoding, options, 0)); |
|
5739 } |
|
5740 |
|
5741 /** |
|
5742 * htmlReadIO: |
|
5743 * @param ioread an I/O read function |
|
5744 * @param ioclose an I/O close function |
|
5745 * @param ioctx an I/O handler |
|
5746 * @param URL the base URL to use for the document |
|
5747 * @param encoding the document encoding, or NULL |
|
5748 * @param options a combination of htmlParserOption(s) |
|
5749 * |
|
5750 * parse an HTML document from I/O functions and source and build a tree. |
|
5751 * |
|
5752 * Returns the resulting document tree |
|
5753 */ |
|
5754 htmlDocPtr |
|
5755 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, |
|
5756 void *ioctx, const char *URL, const char *encoding, int options) |
|
5757 { |
|
5758 htmlParserCtxtPtr ctxt; |
|
5759 xmlParserInputBufferPtr input; |
|
5760 xmlParserInputPtr stream; |
|
5761 |
|
5762 if (ioread == NULL) |
|
5763 return (NULL); |
|
5764 |
|
5765 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, |
|
5766 XML_CHAR_ENCODING_NONE); |
|
5767 if (input == NULL) |
|
5768 return (NULL); |
|
5769 ctxt = xmlNewParserCtxt(); |
|
5770 if (ctxt == NULL) { |
|
5771 xmlFreeParserInputBuffer(input); |
|
5772 return (NULL); |
|
5773 } |
|
5774 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); |
|
5775 if (stream == NULL) { |
|
5776 xmlFreeParserInputBuffer(input); |
|
5777 xmlFreeParserCtxt(ctxt); |
|
5778 return (NULL); |
|
5779 } |
|
5780 inputPush(ctxt, stream); |
|
5781 return (htmlDoRead(ctxt, URL, encoding, options, 0)); |
|
5782 } |
|
5783 |
|
5784 /** |
|
5785 * htmlCtxtReadDoc: |
|
5786 * @param ctxt an HTML parser context |
|
5787 * @param cur a pointer to a zero terminated string |
|
5788 * @param URL the base URL to use for the document |
|
5789 * @param encoding the document encoding, or NULL |
|
5790 * @param options a combination of htmlParserOption(s) |
|
5791 * |
|
5792 * parse an XML in-memory document and build a tree. |
|
5793 * This reuses the existing ctxt parser context |
|
5794 * |
|
5795 * Returns the resulting document tree |
|
5796 */ |
|
5797 htmlDocPtr |
|
5798 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur, |
|
5799 const char *URL, const char *encoding, int options) |
|
5800 { |
|
5801 xmlParserInputPtr stream; |
|
5802 |
|
5803 if (cur == NULL) |
|
5804 return (NULL); |
|
5805 if (ctxt == NULL) |
|
5806 return (NULL); |
|
5807 |
|
5808 htmlCtxtReset(ctxt); |
|
5809 |
|
5810 stream = xmlNewStringInputStream(ctxt, cur); |
|
5811 if (stream == NULL) { |
|
5812 return (NULL); |
|
5813 } |
|
5814 inputPush(ctxt, stream); |
|
5815 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
|
5816 } |
|
5817 |
|
5818 /** |
|
5819 * htmlCtxtReadFile: |
|
5820 * @param ctxt an HTML parser context |
|
5821 * @param filename a file or URL |
|
5822 * @param encoding the document encoding, or NULL |
|
5823 * @param options a combination of htmlParserOption(s) |
|
5824 * |
|
5825 * parse an XML file from the filesystem or the network. |
|
5826 * This reuses the existing ctxt parser context |
|
5827 * |
|
5828 * Returns the resulting document tree |
|
5829 */ |
|
5830 htmlDocPtr |
|
5831 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename, |
|
5832 const char *encoding, int options) |
|
5833 { |
|
5834 xmlParserInputPtr stream; |
|
5835 |
|
5836 if (filename == NULL) |
|
5837 return (NULL); |
|
5838 if (ctxt == NULL) |
|
5839 return (NULL); |
|
5840 |
|
5841 htmlCtxtReset(ctxt); |
|
5842 |
|
5843 stream = xmlNewInputFromFile(ctxt, filename); |
|
5844 if (stream == NULL) { |
|
5845 return (NULL); |
|
5846 } |
|
5847 inputPush(ctxt, stream); |
|
5848 return (htmlDoRead(ctxt, NULL, encoding, options, 1)); |
|
5849 } |
|
5850 |
|
5851 /** |
|
5852 * htmlCtxtReadMemory: |
|
5853 * @param ctxt an HTML parser context |
|
5854 * @param buffer a pointer to a char array |
|
5855 * @param size the size of the array |
|
5856 * @param URL the base URL to use for the document |
|
5857 * @param encoding the document encoding, or NULL |
|
5858 * @param options a combination of htmlParserOption(s) |
|
5859 * |
|
5860 * parse an XML in-memory document and build a tree. |
|
5861 * This reuses the existing ctxt parser context |
|
5862 * |
|
5863 * Returns the resulting document tree |
|
5864 */ |
|
5865 htmlDocPtr |
|
5866 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size, |
|
5867 const char *URL, const char *encoding, int options) |
|
5868 { |
|
5869 xmlParserInputBufferPtr input; |
|
5870 xmlParserInputPtr stream; |
|
5871 |
|
5872 if (ctxt == NULL) |
|
5873 return (NULL); |
|
5874 if (buffer == NULL) |
|
5875 return (NULL); |
|
5876 |
|
5877 htmlCtxtReset(ctxt); |
|
5878 |
|
5879 input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE); |
|
5880 if (input == NULL) { |
|
5881 return(NULL); |
|
5882 } |
|
5883 |
|
5884 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); |
|
5885 if (stream == NULL) { |
|
5886 xmlFreeParserInputBuffer(input); |
|
5887 return(NULL); |
|
5888 } |
|
5889 |
|
5890 inputPush(ctxt, stream); |
|
5891 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
|
5892 } |
|
5893 |
|
5894 /** |
|
5895 * htmlCtxtReadFd: |
|
5896 * @param ctxt an HTML parser context |
|
5897 * @param fd an open file descriptor |
|
5898 * @param URL the base URL to use for the document |
|
5899 * @param encoding the document encoding, or NULL |
|
5900 * @param options a combination of htmlParserOption(s) |
|
5901 * |
|
5902 * parse an XML from a file descriptor and build a tree. |
|
5903 * This reuses the existing ctxt parser context |
|
5904 * |
|
5905 * Returns the resulting document tree |
|
5906 */ |
|
5907 htmlDocPtr |
|
5908 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd, |
|
5909 const char *URL, const char *encoding, int options) |
|
5910 { |
|
5911 xmlParserInputBufferPtr input; |
|
5912 xmlParserInputPtr stream; |
|
5913 |
|
5914 if (fd < 0) |
|
5915 return (NULL); |
|
5916 if (ctxt == NULL) |
|
5917 return (NULL); |
|
5918 |
|
5919 htmlCtxtReset(ctxt); |
|
5920 |
|
5921 |
|
5922 input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE); |
|
5923 if (input == NULL) |
|
5924 return (NULL); |
|
5925 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); |
|
5926 if (stream == NULL) { |
|
5927 xmlFreeParserInputBuffer(input); |
|
5928 return (NULL); |
|
5929 } |
|
5930 inputPush(ctxt, stream); |
|
5931 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
|
5932 } |
|
5933 |
|
5934 /** |
|
5935 * htmlCtxtReadIO: |
|
5936 * @param ctxt an HTML parser context |
|
5937 * @param ioread an I/O read function |
|
5938 * @param ioclose an I/O close function |
|
5939 * @param ioctx an I/O handler |
|
5940 * @param URL the base URL to use for the document |
|
5941 * @param encoding the document encoding, or NULL |
|
5942 * @param options a combination of htmlParserOption(s) |
|
5943 * |
|
5944 * parse an HTML document from I/O functions and source and build a tree. |
|
5945 * This reuses the existing ctxt parser context |
|
5946 * |
|
5947 * Returns the resulting document tree |
|
5948 */ |
|
5949 htmlDocPtr |
|
5950 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, |
|
5951 xmlInputCloseCallback ioclose, void *ioctx, |
|
5952 const char *URL, |
|
5953 const char *encoding, int options) |
|
5954 { |
|
5955 xmlParserInputBufferPtr input; |
|
5956 xmlParserInputPtr stream; |
|
5957 |
|
5958 if (ioread == NULL) |
|
5959 return (NULL); |
|
5960 if (ctxt == NULL) |
|
5961 return (NULL); |
|
5962 |
|
5963 htmlCtxtReset(ctxt); |
|
5964 |
|
5965 input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx, |
|
5966 XML_CHAR_ENCODING_NONE); |
|
5967 if (input == NULL) |
|
5968 return (NULL); |
|
5969 stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE); |
|
5970 if (stream == NULL) { |
|
5971 xmlFreeParserInputBuffer(input); |
|
5972 return (NULL); |
|
5973 } |
|
5974 inputPush(ctxt, stream); |
|
5975 return (htmlDoRead(ctxt, URL, encoding, options, 1)); |
|
5976 } |
|
5977 |
|
5978 #endif /* LIBXML_HTML_ENABLED */ |