xml/libxml2libs/src/libxml2/libxml2_htmlparser.c
changeset 0 e35f40988205
equal deleted inserted replaced
-1:000000000000 0:e35f40988205
       
     1 /*
       
     2  * libxml2_htmlparser.c : an HTML 4.0 non-verifying parser
       
     3  *
       
     4  * See Copyright for the status of this software.
       
     5  *
       
     6  * daniel@veillard.com
       
     7  * Portion Copyright © 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
       
     8  */
       
     9 
       
    10 #define IN_LIBXML
       
    11 #include "xmlenglibxml.h"
       
    12 
       
    13 #include <string.h>
       
    14 #if defined(HAVE_CTYPE_H)
       
    15 #include <ctype.h>
       
    16 #endif
       
    17 #ifdef HAVE_STDLIB_H
       
    18 #include <stdlib.h>
       
    19 #endif
       
    20 #ifdef HAVE_SYS_STAT_H
       
    21 #include <sys/stat.h>
       
    22 #endif
       
    23 #ifdef HAVE_FCNTL_H
       
    24 #include <fcntl.h>
       
    25 #endif
       
    26 #ifdef HAVE_UNISTD_H
       
    27 #include <unistd.h>
       
    28 #endif
       
    29 #ifdef HAVE_ZLIB_H
       
    30 #include <zlib.h>
       
    31 #endif
       
    32 
       
    33 
       
    34 #include <stdapis/libxml2/libxml2_globals.h>
       
    35 #include <stdapis/libxml2/libxml2_xmlmemory.h>
       
    36 #include <stdapis/libxml2/libxml2_tree.h>
       
    37 #include <stdapis/libxml2/libxml2_parser.h>
       
    38 #include <stdapis/libxml2/libxml2_parserinternals.h>
       
    39 #include <stdapis/libxml2/libxml2_xmlerror.h>
       
    40 #include "libxml2_xmlerror2.h"
       
    41 #include "libxml2_htmlparser.h"
       
    42 #include "libxml2_htmltree.h"
       
    43 #include "libxml2_entities.h"
       
    44 #include <stdapis/libxml2/libxml2_encoding.h>
       
    45 #include <stdapis/libxml2/libxml2_valid.h>
       
    46 #include <stdapis/libxml2/libxml2_xmlio.h>
       
    47 #include <stdapis/libxml2/libxml2_uri.h>
       
    48 
       
    49 #define HTML_MAX_NAMELEN 1000
       
    50 #define HTML_PARSER_BIG_BUFFER_SIZE 1000
       
    51 #define HTML_PARSER_BUFFER_SIZE 100
       
    52 
       
    53 #ifdef LIBXML_HTML_ENABLED
       
    54 
       
    55 /* #define DEBUG */
       
    56 /* #define DEBUG_PUSH */
       
    57 
       
    58 static const int htmlOmittedDefaultValue = 1;
       
    59 
       
    60 xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
       
    61                              xmlChar end, xmlChar  end2, xmlChar end3);
       
    62 static void htmlParseComment(htmlParserCtxtPtr ctxt);
       
    63 
       
    64 /************************************************************************
       
    65  *                                                                      *
       
    66  *              Some factorized error routines                          *
       
    67  *                                                                      *
       
    68  ************************************************************************/
       
    69 
       
    70 /**
       
    71  * htmlErrMemory:
       
    72  * @param ctxt an HTML parser context
       
     73  * @param extra extra information
       
    74  *
       
     75  * Handle a memory allocation failure
       
    76  */
       
    77 void
       
    78 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); // moved to XSLT-enabled part of this file
       
    79 
       
    80 /**
       
    81  * htmlParseErr:
       
    82  * @param ctxt an HTML parser context
       
    83  * @param error the error number
       
    84  * @param msg the error message
       
    85  * @param str1 string infor
       
    86  * @param str2 string infor
       
    87  *
       
    88  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
       
    89  */
       
    90 static void
       
    91 htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
       
    92              const char *msg, const xmlChar *str1, const xmlChar *str2)
       
    93 {
       
    94     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
       
    95         (ctxt->instate == XML_PARSER_EOF))
       
    96         return;
       
    97     ctxt->errNo = error;
       
    98     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
       
    99                     XML_ERR_ERROR, NULL, 0,
       
   100                     (const char *) str1, (const char *) str2,
       
   101                     NULL, 0, 0,
       
   102                     msg, str1, str2);
       
   103     ctxt->wellFormed = 0;
       
   104 }
       
   105 
       
   106 /**
       
   107  * htmlParseErrInt:
       
   108  * @param ctxt an HTML parser context
       
   109  * @param error the error number
       
   110  * @param msg the error message
       
   111  * @param val integer info
       
   112  *
       
   113  * Handle a fatal parser error, i.e. violating Well-Formedness constraints
       
   114  */
       
   115 static void
       
   116 htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
       
   117              const char *msg, int val)
       
   118 {
       
   119     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
       
   120         (ctxt->instate == XML_PARSER_EOF))
       
   121         return;
       
   122     ctxt->errNo = error;
       
   123     __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
       
   124                     XML_ERR_ERROR, NULL, 0, NULL, NULL,
       
   125                     NULL, val, 0, msg, val);
       
   126     ctxt->wellFormed = 0;
       
   127 }
       
   128 
       
   129 /************************************************************************
       
   130  *                                                                      *
       
   131  *              Parser stacks related functions and macros              *
       
   132  *                                                                      *
       
   133  ************************************************************************/
       
   134 
       
   135 /**
       
   136  * htmlnamePush:
       
   137  * @param ctxt an HTML parser context
       
   138  * @param value the element name
       
   139  *
       
   140  * Pushes a new element name on top of the name stack
       
   141  *
       
   142  * Returns 0 in case of error, the index in the stack otherwise
       
   143  */
       
   144 static int
       
   145 htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
       
   146 {
       
   147     if (ctxt->nameNr >= ctxt->nameMax) {
       
   148         void* allocTmp; // DONE: Fix xmlRealloc
       
   149         allocTmp = xmlRealloc((xmlChar**)ctxt->nameTab,
       
   150                                ctxt->nameMax * 2 * sizeof(ctxt->nameTab[0]));
       
   151         if (!allocTmp) {
       
   152             htmlErrMemory(ctxt, NULL);
       
   153             return (0);
       
   154         }
       
   155         ctxt->nameMax *= 2;
       
   156         ctxt->nameTab = (const xmlChar**) allocTmp;
       
   157     }
       
   158     ctxt->nameTab[ctxt->nameNr] = value;
       
   159     ctxt->name = value;
       
   160     return (ctxt->nameNr++);
       
   161 }
       
   162 /**
       
   163  * htmlnamePop:
       
   164  * @param ctxt an HTML parser context
       
   165  *
       
   166  * Pops the top element name from the name stack
       
   167  *
       
   168  * Returns the name just removed
       
   169  */
       
   170 static const xmlChar *
       
   171 htmlnamePop(htmlParserCtxtPtr ctxt)
       
   172 {
       
   173     const xmlChar *ret;
       
   174 
       
   175     if (ctxt->nameNr <= 0)
       
   176         return (0);
       
   177     ctxt->nameNr--;
       
   178     if (ctxt->nameNr < 0)
       
   179         return (0);
       
   180     if (ctxt->nameNr > 0)
       
   181         ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
       
   182     else
       
   183         ctxt->name = NULL;
       
   184     ret = ctxt->nameTab[ctxt->nameNr];
       
   185     ctxt->nameTab[ctxt->nameNr] = 0;
       
   186     return (ret);
       
   187 }
       
   188 
       
   189 /*
       
   190  * Macros for accessing the content. Those should be used only by the parser,
       
   191  * and not exported.
       
   192  *
       
    193  * Dirty macros, i.e. one needs to make assumptions about the context to use them
       
   194  *
       
   195  *   CUR_PTR return the current pointer to the xmlChar to be parsed.
       
   196  *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
       
   197  *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
       
   198  *           in UNICODE mode. This should be used internally by the parser
       
   199  *           only to compare to ASCII values otherwise it would break when
       
   200  *           running with UTF-8 encoding.
       
    201  *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
       
   202  *           to compare on ASCII based substring.
       
   203  *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
       
   204  *           it should be used only to compare on ASCII based substring.
       
   205  *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
       
   206  *           strings without newlines within the parser.
       
   207  *
       
   208  * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
       
   209  *
       
   210  *   CURRENT Returns the current char value, with the full decoding of
       
   211  *           UTF-8 if we are using this mode. It returns an int.
       
   212  *   NEXT    Skip to the next character, this does the proper decoding
       
   213  *           in UTF-8 mode. 
       
   214  *   NEXTL(l) Skip the current unicode character of l xmlChars long.
       
   215  *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
       
   216  */
       
   217 
       
   218 #define UPPER (toupper(*ctxt->input->cur))
       
   219 
       
   220 #define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
       
   221 
       
   222 #define NXT(val) ctxt->input->cur[(val)]
       
   223 
       
   224 #define UPP(val) (toupper(ctxt->input->cur[(val)]))
       
   225 
       
   226 #define CUR_PTR ctxt->input->cur
       
   227 
       
   228 #define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
       
   229                    (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
       
   230         xmlParserInputShrink(ctxt->input)
       
   231 
       
   232 #define GROW if ((ctxt->progressive == 0) &&                            \
       
   233                  (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))   \
       
   234         xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
       
   235 
       
   236 #define CURRENT ((int) (*ctxt->input->cur))
       
   237 
       
   238 #define SKIP_BLANKS htmlSkipBlankChars(ctxt)
       
   239 
       
    240 /* Imported from XML */
       
   241 
       
   242 /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
       
   243 #define CUR ((int) (*ctxt->input->cur))
       
   244 #define NEXT xmlNextChar(ctxt)
       
   245 
       
   246 #define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
       
   247 #define NXT(val) ctxt->input->cur[(val)]
       
   248 #define CUR_PTR ctxt->input->cur
       
   249 
       
   250 
       
   251 #define NEXTL(l) do {                                                   \
       
   252     if (*(ctxt->input->cur) == '\n') {                                  \
       
   253         ctxt->input->line++; ctxt->input->col = 1;                      \
       
   254     } else ctxt->input->col++;                                          \
       
   255     ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;            \
       
   256   } while (0)
       
   257 
       
   258 /************
       
   259     \
       
   260     if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);     \
       
   261     if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
       
   262  ************/
       
   263 
       
   264 #define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
       
   265 #define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
       
   266 
       
   267 #define COPY_BUF(l,b,i,v)                                               \
       
   268     if (l == 1) b[i++] = (xmlChar) v;                                   \
       
   269     else i += xmlCopyChar(l,&b[i],v)
       
   270 
       
   271 /**
       
   272  * htmlCurrentChar:
       
   273  * @param ctxt the HTML parser context
       
   274  * @param len pointer to the length of the char read
       
   275  *
       
   276  * The current char value, if using UTF-8 this may actually span multiple
       
   277  * bytes in the input buffer. Implement the end of line normalization:
       
   278  * 2.11 End-of-Line Handling
       
   279  * If the encoding is unspecified, in the case we find an ISO-Latin-1
       
   280  * char, then the encoding converter is plugged in automatically.
       
   281  *
       
   282  * Returns the current char value and its length
       
   283  */
       
   284 
       
   285 static int
       
   286 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
       
   287     if (ctxt->instate == XML_PARSER_EOF)
       
   288         return(0);
       
   289 
       
   290     if (ctxt->token != 0) {
       
   291         *len = 0;
       
   292         return(ctxt->token);
       
   293     }
       
   294     if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
       
   295         /*
       
   296          * We are supposed to handle UTF8, check it's valid
       
   297          * From rfc2044: encoding of the Unicode values on UTF-8:
       
   298          *
       
   299          * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
       
   300          * 0000 0000-0000 007F   0xxxxxxx
       
   301          * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
       
   302          * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
       
   303          *
       
   304          * Check for the 0x110000 limit too
       
   305          */
       
   306         const unsigned char *cur = ctxt->input->cur;
       
   307         unsigned char c;
       
   308         unsigned int val;
       
   309 
       
   310         c = *cur;
       
   311         if (c & 0x80) {
       
   312             if (cur[1] == 0)
       
   313                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
       
   314             if ((cur[1] & 0xc0) != 0x80)
       
   315                 goto encoding_error;
       
   316             if ((c & 0xe0) == 0xe0) {
       
   317 
       
   318                 if (cur[2] == 0)
       
   319                     xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
       
   320                 if ((cur[2] & 0xc0) != 0x80)
       
   321                     goto encoding_error;
       
   322                 if ((c & 0xf0) == 0xf0) {
       
   323                     if (cur[3] == 0)
       
   324                         xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
       
   325                     if (((c & 0xf8) != 0xf0) ||
       
   326                         ((cur[3] & 0xc0) != 0x80))
       
   327                         goto encoding_error;
       
   328                     /* 4-byte code */
       
   329                     *len = 4;
       
   330                     val = (cur[0] & 0x7) << 18;
       
   331                     val |= (cur[1] & 0x3f) << 12;
       
   332                     val |= (cur[2] & 0x3f) << 6;
       
   333                     val |= cur[3] & 0x3f;
       
   334                 } else {
       
   335                   /* 3-byte code */
       
   336                     *len = 3;
       
   337                     val = (cur[0] & 0xf) << 12;
       
   338                     val |= (cur[1] & 0x3f) << 6;
       
   339                     val |= cur[2] & 0x3f;
       
   340                 }
       
   341             } else {
       
   342               /* 2-byte code */
       
   343                 *len = 2;
       
   344                 val = (cur[0] & 0x1f) << 6;
       
   345                 val |= cur[1] & 0x3f;
       
   346             }
       
   347             if (!IS_CHAR(val)) {
       
   348                 htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
       
   349                                 "Char 0x%X out of allowed range\n", val);
       
   350         }
       
   351             return(val);
       
   352         } else {
       
   353             /* 1-byte code */
       
   354             *len = 1;
       
   355             return((int) *ctxt->input->cur);
       
   356         }
       
   357     }
       
   358     /*
       
   359      * Assume it's a fixed length encoding (1) with
       
   360      * a compatible encoding for the ASCII set, since
       
   361      * XML constructs only use < 128 chars
       
   362      */
       
   363     *len = 1;
       
   364     if ((int) *ctxt->input->cur < 0x80)
       
   365         return((int) *ctxt->input->cur);
       
   366 
       
   367     /*
       
   368      * Humm this is bad, do an automatic flow conversion
       
   369      */
       
   370     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
       
   371     ctxt->charset = XML_CHAR_ENCODING_UTF8;
       
   372     return(xmlCurrentChar(ctxt, len));
       
   373 
       
   374 encoding_error:
       
   375     /*
       
   376      * If we detect an UTF8 error that probably mean that the
       
   377      * input encoding didn't get properly advertized in the
       
   378      * declaration header. Report the error and switch the encoding
       
   379      * to ISO-Latin-1 (if you don't like this policy, just declare the
       
   380      * encoding !)
       
   381      */
       
   382     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
       
   383                  "Input is not proper UTF-8, indicate encoding !\n",
       
   384                  NULL, NULL);
       
   385     if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
       
   386         ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
       
   387                         ctxt->input->cur[0], ctxt->input->cur[1],
       
   388                         ctxt->input->cur[2], ctxt->input->cur[3]);
       
   389     }
       
   390 
       
   391     ctxt->charset = XML_CHAR_ENCODING_8859_1;
       
   392     *len = 1;
       
   393     return((int) *ctxt->input->cur);
       
   394 }
       
   395 
       
   396 /**
       
   397  * htmlSkipBlankChars:
       
   398  * @param ctxt the HTML parser context
       
   399  *
       
   400  * skip all blanks character found at that point in the input streams.
       
   401  *
       
   402  * Returns the number of space chars skipped
       
   403  */
       
   404 
       
   405 static int
       
   406 htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
       
   407     int res = 0;
       
   408 
       
   409     while (IS_BLANK_CH(*(ctxt->input->cur))) {
       
   410         if ((*ctxt->input->cur == 0) &&
       
   411             (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
       
   412                 xmlPopInput(ctxt);
       
   413         } else {
       
   414             if (*(ctxt->input->cur) == '\n') {
       
   415                 ctxt->input->line++; ctxt->input->col = 1;
       
   416             } else ctxt->input->col++;
       
   417             ctxt->input->cur++;
       
   418             ctxt->nbChars++;
       
   419             if (*ctxt->input->cur == 0)
       
   420                 xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
       
   421         }
       
   422         res++;
       
   423     }
       
   424     return(res);
       
   425 }
       
   426 
       
   427 
       
    428 #endif  /* defined(LIBXML_HTML_ENABLED) */
       
   429 
       
   430 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
       
   431 
       
   432 /************************************************************************
       
   433  *                                                                      *
       
   434  *      The list of HTML elements and their properties                  *
       
   435  *                                                                      *
       
   436  ************************************************************************/
       
   437 
       
   438 /*
       
    439  *  Start Tag: 1 means the start tag can be omitted
       
    440  *  End Tag:   1 means the end tag can be omitted
       
   441  *             2 means it's forbidden (empty elements)
       
   442  *             3 means the tag is stylistic and should be closed easily
       
   443  *  Depr:      this element is deprecated
       
   444  *  DTD:       1 means that this element is valid only in the Loose DTD
       
   445  *             2 means that this element is valid only in the Frameset DTD
       
   446  *
       
   447  * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
       
   448         , subElements , impliedsubelt , Attributes, userdata
       
   449  */
       
   450 
       
   451 /* Definitions and a couple of vars for HTML Elements */
       
   452 
       
   453 #define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
       
   454 #define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
       
   455 #define SPECIAL "a", "img", "applet", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
       
   456 #define INLINE PCDATA FONTSTYLE PHRASE SPECIAL FORMCTRL
       
   457 #define BLOCK HEADING LIST "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
       
   458 #define FORMCTRL "input", "select", "textarea", "label", "button"
       
   459 #define PCDATA
       
   460 #define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
       
   461 #define LIST "ul", "ol", "dir", "menu"
       
   462 #define MODIFIER
       
   463 #define FLOW BLOCK,INLINE
       
   464 #define EMPTY NULL
       
   465 
       
   466 // TO DO libxslt added 2nd const in between
       
   467 static const char* const html_flow [] = { FLOW, NULL } ;
       
   468 static const char* const html_inline [] = { INLINE, NULL } ;
       
   469 
       
   470 /* placeholders: elts with content but no subelements */
       
   471 static const char* const html_pcdata[] = { NULL } ;
       
   472 #define html_cdata html_pcdata
       
   473 
       
   474 
       
   475 /* ... and for HTML Attributes */
       
   476 
       
   477 #define COREATTRS "id", "class", "style", "title"
       
   478 #define I18N "lang", "dir"
       
   479 #define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
       
   480 #define ATTRS COREATTRS,I18N,EVENTS
       
   481 #define CELLHALIGN "align", "char", "charoff"
       
   482 #define CELLVALIGN "valign"
       
   483 
       
   484 static const char* const html_attrs [] = { ATTRS, NULL } ;
       
   485 static const char* const core_i18n_attrs [] = { COREATTRS, I18N, NULL } ;
       
   486 static const char* const core_attrs [] = { COREATTRS, NULL } ;
       
   487 static const char* const i18n_attrs [] = { I18N, NULL } ;
       
   488 
       
   489 /* Other declarations that should go inline ... */
       
   490 static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
       
   491         "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
       
   492         "tabindex", "onfocus", "onblur", NULL } ;
       
   493 static const char* const target_attr[] = { "target", NULL } ;
       
   494 static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
       
   495 static const char* const alt_attr[] = { "alt", NULL } ;
       
   496 static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
       
   497 static const char* const href_attrs[] = { "href", NULL } ;
       
   498 static const char* const clear_attrs[] = { "clear", NULL } ;
       
   499 static const char* const inline_p[] = { INLINE, "p", NULL } ;
       
   500 static const char* const flow_param[] = { FLOW, "param", NULL } ;
       
   501 static const char* const applet_attrs[] = { COREATTRS , "codebase",
       
   502                 "archive", "alt", "name", "height", "width", "align",
       
   503                 "hspace", "vspace", NULL } ;
       
   504 static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
       
   505         "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
       
   506 static const char* const basefont_attrs[] =
       
   507         { "id", "size", "color", "face", NULL } ;
       
   508 static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
       
   509 static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
       
   510 static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
       
   511 static const char* const body_depr[] = { "background", "bgcolor", "text",
       
   512         "link", "vlink", "alink", NULL } ;
       
   513 static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
       
   514         "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
       
   515 
       
   516 
       
   517 static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
       
   518 static const char* const col_elt[] = { "col", NULL } ;
       
   519 static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
       
   520 static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
       
   521 static const char* const dl_contents[] = { "dt", "dd", NULL } ;
       
   522 static const char* const compact_attr[] = { "compact", NULL } ;
       
   523 static const char* const label_attr[] = { "label", NULL } ;
       
   524 static const char* const fieldset_contents[] = { FLOW, "legend" } ;
       
   525 static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
       
   526 static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
       
   527 static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
       
   528 static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
       
   529 static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
       
   530 static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
       
   531 static const char* const head_attrs[] = { I18N, "profile", NULL } ;
       
   532 static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
       
   533 static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
       
   534 static const char* const version_attr[] = { "version", NULL } ;
       
   535 static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
       
   536 static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
       
   537 static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
       
   538 static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
       
   539 static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
       
   540 static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
       
   541 static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
       
   542 static const char* const align_attr[] = { "align", NULL } ;
       
   543 static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
       
   544 static const char* const map_contents[] = { BLOCK, "area", NULL } ;
       
   545 static const char* const name_attr[] = { "name", NULL } ;
       
   546 static const char* const action_attr[] = { "action", NULL } ;
       
   547 static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
       
   548 static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
       
   549 static const char* const content_attr[] = { "content", NULL } ;
       
   550 static const char* const type_attr[] = { "type", NULL } ;
       
   551 static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
       
   552 static const char* const object_contents[] = { FLOW, "param", NULL } ;
       
   553 static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
       
   554 static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
       
   555 static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
       
   556 static const char* const option_elt[] = { "option", NULL } ;
       
   557 static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
       
   558 static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
       
   559 static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
       
   560 static const char* const width_attr[] = { "width", NULL } ;
       
   561 static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
       
   562 static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
       
   563 static const char* const language_attr[] = { "language", NULL } ;
       
   564 static const char* const select_content[] = { "optgroup", "option", NULL } ;
       
   565 static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
       
   566 static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
       
   567 static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
       
   568 static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
       
   569 static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
       
   570 static const char* const tr_elt[] = { "tr", NULL } ;
       
   571 static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
       
   572 static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
       
   573 static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
       
   574 static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
       
   575 static const char* const tr_contents[] = { "th", "td", NULL } ;
       
   576 static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
       
   577 static const char* const li_elt[] = { "li", NULL } ;
       
   578 static const char* const ul_depr[] = { "type", "compact", NULL} ;
       
   579 static const char* const dir_attr[] = { "dir", NULL} ;
       
   580 
       
   581 #define DECL (const char**)
       
   582 
       
   583 static const htmlElemDesc  html40ElementTable [] = {
       
   584 { "a",          0, 0, 0, 0, 0, 0, 1, "anchor ",
       
   585         DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
       
   586 },
       
   587 { "abbr",       0, 0, 0, 0, 0, 0, 1, "abbreviated form",
       
   588         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   589 },
       
   590 { "acronym",    0, 0, 0, 0, 0, 0, 1, "",
       
   591         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   592 },
       
   593 { "address",    0, 0, 0, 0, 0, 0, 0, "information on author ",
       
   594         DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
       
   595 },
       
   596 { "applet",     0, 0, 0, 0, 1, 1, 2, "java applet ",
       
   597         DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
       
   598 },
       
   599 { "area",       0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
       
   600         EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
       
   601 },
       
   602 { "b",          0, 3, 0, 0, 0, 0, 1, "bold text style",
       
   603         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   604 },
       
   605 { "base",       0, 2, 2, 1, 0, 0, 0, "document base uri ",
       
   606         EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
       
   607 },
       
   608 { "basefont",   0, 2, 2, 1, 1, 1, 1, "base font size " ,
       
   609         EMPTY , NULL , NULL, DECL basefont_attrs, NULL
       
   610 },
       
   611 { "bdo",        0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
       
   612         DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
       
   613 },
       
   614 { "big",        0, 3, 0, 0, 0, 0, 1, "large text style",
       
   615         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   616 },
       
   617 { "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
       
   618         DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
       
   619 },
       
   620 { "body",       1, 1, 0, 0, 0, 0, 0, "document body ",
       
   621         DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
       
   622 },
       
   623 { "br",         0, 2, 2, 1, 0, 0, 1, "forced line break ",
       
   624         EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
       
   625 },
       
   626 { "button",     0, 0, 0, 0, 0, 0, 2, "push button ",
       
   627         DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
       
   628 },
       
   629 { "caption",    0, 0, 0, 0, 0, 0, 0, "table caption ",
       
   630         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   631 },
       
   632 { "center",     0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
       
   633         DECL html_flow , NULL , NULL, DECL html_attrs, NULL
       
   634 },
       
   635 { "cite",       0, 0, 0, 0, 0, 0, 1, "citation",
       
   636         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   637 },
       
   638 { "code",       0, 0, 0, 0, 0, 0, 1, "computer code fragment",
       
   639         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   640 },
       
   641 { "col",        0, 2, 2, 1, 0, 0, 0, "table column ",
       
   642         EMPTY , NULL , DECL col_attrs , NULL, NULL
       
   643 },
       
   644 { "colgroup",   0, 1, 0, 0, 0, 0, 0, "table column group ",
       
   645         DECL col_elt , "col" , DECL col_attrs , NULL, NULL
       
   646 },
       
   647 { "dd",         0, 1, 0, 0, 0, 0, 0, "definition description ",
       
   648         DECL html_flow , NULL , DECL html_attrs, NULL, NULL
       
   649 },
       
   650 { "del",        0, 0, 0, 0, 0, 0, 2, "deleted text ",
       
   651         DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
       
   652 },
       
   653 { "dfn",        0, 0, 0, 0, 0, 0, 1, "instance definition",
       
   654         DECL html_inline , NULL , DECL html_attrs, NULL, NULL
       
   655 },
       
   656 { "dir",        0, 0, 0, 0, 1, 1, 0, "directory list",
       
   657         DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
       
   658 },
       
   659 { "div",        0, 0, 0, 0, 0, 0, 0, "generic language/style container",
       
   660         DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
       
   661 },
       
   662 { "dl",         0, 0, 0, 0, 0, 0, 0, "definition list ",
       
   663         DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
       
   664 },
       
   665 { "dt",         0, 1, 0, 0, 0, 0, 0, "definition term ",
       
   666         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   667 },
       
   668 { "em",         0, 3, 0, 0, 0, 0, 1, "emphasis",
       
   669         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   670 },
       
   671 { "fieldset",   0, 0, 0, 0, 0, 0, 0, "form control group ",
       
   672         DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
       
   673 },
       
   674 { "font",       0, 3, 0, 0, 1, 1, 1, "local change to font ",
       
   675         DECL html_inline, NULL, NULL, DECL font_attrs, NULL
       
   676 },
       
   677 { "form",       0, 0, 0, 0, 0, 0, 0, "interactive form ",
       
   678         DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
       
   679 },
       
   680 { "frame",      0, 2, 2, 1, 0, 2, 0, "subwindow " ,
       
   681         EMPTY, NULL, NULL, DECL frame_attrs, NULL
       
   682 },
       
   683 { "frameset",   0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
       
   684         DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
       
   685 },
       
   686 { "h1",         0, 0, 0, 0, 0, 0, 0, "heading ",
       
   687         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   688 },
       
   689 { "h2",         0, 0, 0, 0, 0, 0, 0, "heading ",
       
   690         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   691 },
       
   692 { "h3",         0, 0, 0, 0, 0, 0, 0, "heading ",
       
   693         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   694 },
       
   695 { "h4",         0, 0, 0, 0, 0, 0, 0, "heading ",
       
   696         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   697 },
       
   698 { "h5",         0, 0, 0, 0, 0, 0, 0, "heading ",
       
   699         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   700 },
       
   701 { "h6",         0, 0, 0, 0, 0, 0, 0, "heading ",
       
   702         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   703 },
       
   704 { "head",       1, 1, 0, 0, 0, 0, 0, "document head ",
       
   705         DECL head_contents, NULL, DECL head_attrs, NULL, NULL
       
   706 },
       
   707 { "hr",         0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
       
   708         EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
       
   709 },
       
   710 { "html",       1, 1, 0, 0, 0, 0, 0, "document root element ",
       
   711         DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
       
   712 },
       
   713 { "i",          0, 3, 0, 0, 0, 0, 1, "italic text style",
       
   714         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   715 },
       
   716 { "iframe",     0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
       
   717         DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
       
   718 },
       
   719 { "img",        0, 2, 2, 1, 0, 0, 1, "embedded image ",
       
   720         EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
       
   721 },
       
   722 { "input",      0, 2, 2, 1, 0, 0, 1, "form control ",
       
   723         EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
       
   724 },
       
   725 { "ins",        0, 0, 0, 0, 0, 0, 2, "inserted text",
       
   726         DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
       
   727 },
       
   728 { "isindex",    0, 2, 2, 1, 1, 1, 0, "single line prompt ",
       
   729         EMPTY, NULL, NULL, DECL prompt_attrs, NULL
       
   730 },
       
   731 { "kbd",        0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
       
   732         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   733 },
       
   734 { "label",      0, 0, 0, 0, 0, 0, 1, "form field label text ",
       
   735         DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
       
   736 },
       
   737 { "legend",     0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
       
   738         DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
       
   739 },
       
   740 { "li",         0, 1, 1, 0, 0, 0, 0, "list item ",
       
   741         DECL html_flow, NULL, DECL html_attrs, NULL, NULL
       
   742 },
       
   743 { "link",       0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
       
   744         EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
       
   745 },
       
   746 { "map",        0, 0, 0, 0, 0, 0, 2, "client-side image map ",
       
   747         DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
       
   748 },
       
   749 { "menu",       0, 0, 0, 0, 1, 1, 0, "menu list ",
       
   750         DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
       
   751 },
       
   752 { "meta",       0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
       
   753         EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
       
   754 },
       
   755 { "noframes",   0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
       
   756         DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
       
   757 },
       
   758 { "noscript",   0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
       
   759         DECL html_flow, "div", DECL html_attrs, NULL, NULL
       
   760 },
       
   761 { "object",     0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
       
   762         DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
       
   763 },
       
   764 { "ol",         0, 0, 0, 0, 0, 0, 0, "ordered list ",
       
   765         DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
       
   766 },
       
   767 { "optgroup",   0, 0, 0, 0, 0, 0, 0, "option group ",
       
   768         DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
       
   769 },
       
   770 { "option",     0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
       
   771         DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
       
   772 },
       
   773 { "p",          0, 1, 0, 0, 0, 0, 0, "paragraph ",
       
   774         DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
       
   775 },
       
   776 { "param",      0, 2, 2, 1, 0, 0, 0, "named property value ",
       
   777         EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
       
   778 },
       
   779 { "pre",        0, 0, 0, 0, 0, 0, 0, "preformatted text ",
       
   780         DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
       
   781 },
       
   782 { "q",          0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
       
   783         DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
       
   784 },
       
   785 { "s",          0, 3, 0, 0, 1, 1, 1, "strike-through text style",
       
   786         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
       
   787 },
       
   788 { "samp",       0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
       
   789         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   790 },
       
   791 { "script",     0, 0, 0, 0, 0, 0, 2, "script statements ",
       
   792         DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
       
   793 },
       
   794 { "select",     0, 0, 0, 0, 0, 0, 1, "option selector ",
       
   795         DECL select_content, NULL, DECL select_attrs, NULL, NULL
       
   796 },
       
   797 { "small",      0, 3, 0, 0, 0, 0, 1, "small text style",
       
   798         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   799 },
       
   800 { "span",       0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
       
   801         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   802 },
       
   803 { "strike",     0, 3, 0, 0, 1, 1, 1, "strike-through text",
       
   804         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
       
   805 },
       
   806 { "strong",     0, 3, 0, 0, 0, 0, 1, "strong emphasis",
       
   807         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   808 },
       
   809 { "style",      0, 0, 0, 0, 0, 0, 0, "style info ",
       
   810         DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
       
   811 },
       
   812 { "sub",        0, 3, 0, 0, 0, 0, 1, "subscript",
       
   813         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   814 },
       
   815 { "sup",        0, 3, 0, 0, 0, 0, 1, "superscript ",
       
   816         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   817 },
       
   818 { "table",      0, 0, 0, 0, 0, 0, 0, "",
       
   819         DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
       
   820 },
       
   821 { "tbody",      1, 0, 0, 0, 0, 0, 0, "table body ",
       
   822         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
       
   823 },
       
   824 { "td",         0, 0, 0, 0, 0, 0, 0, "table data cell",
       
   825         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
       
   826 },
       
   827 { "textarea",   0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
       
   828         DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
       
   829 },
       
   830 { "tfoot",      0, 1, 0, 0, 0, 0, 0, "table footer ",
       
   831         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
       
   832 },
       
   833 { "th",         0, 1, 0, 0, 0, 0, 0, "table header cell",
       
   834         DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
       
   835 },
       
   836 { "thead",      0, 1, 0, 0, 0, 0, 0, "table header ",
       
   837         DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
       
   838 },
       
   839 { "title",      0, 0, 0, 0, 0, 0, 0, "document title ",
       
   840         DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
       
   841 },
       
   842 { "tr",         0, 0, 0, 0, 0, 0, 0, "table row ",
       
   843         DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
       
   844 },
       
   845 { "tt",         0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
       
   846         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   847 },
       
   848 { "u",          0, 3, 0, 0, 1, 1, 1, "underlined text style",
       
   849         DECL html_inline, NULL, NULL, DECL html_attrs, NULL
       
   850 },
       
   851 { "ul",         0, 0, 0, 0, 0, 0, 0, "unordered list ",
       
   852         DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
       
   853 },
       
   854 { "var",        0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
       
   855         DECL html_inline, NULL, DECL html_attrs, NULL, NULL
       
   856 }
       
   857 };
       
   858 
       
   859 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */
       
   860 
       
   861 #ifdef  LIBXML_HTML_ENABLED
       
   862 
       
   863 /*
       
   864  * start tags that imply the end of current element
       
   865  */
       
   866 static const char * const htmlStartClose [] = {
       
   867 "form",         "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
       
   868                 "dl", "ul", "ol", "menu", "dir", "address", "pre",
       
   869                 "listing", "xmp", "head", NULL,
       
   870 "head",         "p", NULL,
       
   871 "title",        "p", NULL,
       
   872 "body",         "head", "style", "link", "title", "p", NULL,
       
   873 "frameset",     "head", "style", "link", "title", "p", NULL,
       
   874 "li",           "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
       
   875                 "pre", "listing", "xmp", "head", "li", NULL,
       
   876 "hr",           "p", "head", NULL,
       
   877 "h1",           "p", "head", NULL,
       
   878 "h2",           "p", "head", NULL,
       
   879 "h3",           "p", "head", NULL,
       
   880 "h4",           "p", "head", NULL,
       
   881 "h5",           "p", "head", NULL,
       
   882 "h6",           "p", "head", NULL,
       
   883 "dir",          "p", "head", NULL,
       
   884 "address",      "p", "head", "ul", NULL,
       
   885 "pre",          "p", "head", "ul", NULL,
       
   886 "listing",      "p", "head", NULL,
       
   887 "xmp",          "p", "head", NULL,
       
   888 "blockquote",   "p", "head", NULL,
       
   889 "dl",           "p", "dt", "menu", "dir", "address", "pre", "listing",
       
   890                 "xmp", "head", NULL,
       
   891 "dt",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
       
   892                 "head", "dd", NULL,
       
   893 "dd",           "p", "menu", "dir", "address", "pre", "listing", "xmp",
       
   894                 "head", "dt", NULL,
       
   895 "ul",           "p", "head", "ol", "menu", "dir", "address", "pre",
       
   896                 "listing", "xmp", NULL,
       
   897 "ol",           "p", "head", "ul", NULL,
       
   898 "menu",         "p", "head", "ul", NULL,
       
   899 "p",            "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
       
   900 "div",          "p", "head", NULL,
       
   901 "noscript",     "p", "head", NULL,
       
   902 "center",       "font", "b", "i", "p", "head", NULL,
       
   903 "a",            "a", NULL,
       
   904 "caption",      "p", NULL,
       
   905 "colgroup",     "caption", "colgroup", "col", "p", NULL,
       
   906 "col",          "caption", "col", "p", NULL,
       
   907 "table",        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
       
   908                 "listing", "xmp", "a", NULL,
       
   909 "th",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
       
   910 "td",           "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
       
   911 "tr",           "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
       
   912 "thead",        "caption", "col", "colgroup", NULL,
       
   913 "tfoot",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
       
   914                 "tbody", "p", NULL,
       
   915 "tbody",        "th", "td", "tr", "caption", "col", "colgroup", "thead",
       
   916                 "tfoot", "tbody", "p", NULL,
       
   917 "optgroup",     "option", NULL,
       
   918 "option",       "option", NULL,
       
   919 "fieldset",     "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
       
   920                 "pre", "listing", "xmp", "a", NULL,
       
   921 NULL
       
   922 };
       
   923 
       
   924 /*
       
   925  * The list of HTML elements which are supposed not to have
       
   926  * CDATA content and where a p element will be implied
       
   927  *
       
   928 
       
   929  
       
   930  */
       
   931 static const char * const htmlNoContentElements[] = {
       
   932     "html",
       
   933     "head",
       
   934     "body",
       
   935     NULL
       
   936 };
       
   937 
       
   938 /*
       
   939  * The list of HTML attributes which are of content %Script;
       
   940  * NOTE: when adding ones, check htmlIsScriptAttribute() since
       
   941  *       it assumes the name starts with 'on'
       
   942  */
       
   943 static const char * const htmlScriptAttributes[] = {
       
   944     "onclick",
       
   945     "ondblclick",
       
   946     "onmousedown",
       
   947     "onmouseup",
       
   948     "onmouseover",
       
   949     "onmousemove",
       
   950     "onmouseout",
       
   951     "onkeypress",
       
   952     "onkeydown",
       
   953     "onkeyup",
       
   954     "onload",
       
   955     "onunload",
       
   956     "onfocus",
       
   957     "onblur",
       
   958     "onsubmit",
       
   959     "onrest",
       
   960     "onchange",
       
   961     "onselect"
       
   962 };
       
   963 
       
   964 /*
       
   965  * This table is used by the htmlparser to know what to do with
       
   966  * broken html pages. By assigning different priorities to different
       
   967  * elements the parser can decide how to handle extra endtags.
       
   968  * Endtags are only allowed to close elements with lower or equal
       
   969  * priority.
       
   970  */
       
   971 
       
   972 typedef struct {
       
   973     const char *name;
       
   974     int priority;
       
   975 } elementPriority;
       
   976 
       
   977 static const elementPriority htmlEndPriority [] = {
       
   978     {"div",   150},
       
   979     {"td",    160},
       
   980     {"th",    160},
       
   981     {"tr",    170},
       
   982     {"thead", 180},
       
   983     {"tbody", 180},
       
   984     {"tfoot", 180},
       
   985     {"table", 190},
       
   986     {"head",  200},
       
   987     {"body",  200},
       
   988     {"html",  220},
       
   989     {NULL,    100} /* Default priority */
       
   990 };
       
   991 
       
   992 /************************************************************************
       
   993  *                                                                      *
       
   994  *      functions to handle HTML specific data                          *
       
   995  *                                                                      *
       
   996  ************************************************************************/
       
   997 
       
   998 /**
       
   999  * htmlInitAutoClose:
       
  1000  *
       
  1001  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
       
  1002  * This is not reentrant. Call xmlInitParser() once before processing in
       
  1003  * case of use in multithreaded programs.
       
  1004  */
       
  1005 void
       
  1006 htmlInitAutoClose(void) {
       
  1007     int indx, i = 0;
       
  1008 
       
  1009     if (htmlStartCloseIndexinitialized) return;
       
  1010 
       
  1011     for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
       
  1012     indx = 0;
       
  1013     while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
       
  1014         // libxslt port: (const char**) cast was added
       
  1015         htmlStartCloseIndex[indx++] = (const char**)&htmlStartClose[i];
       
  1016         while(htmlStartClose[i++]) {};
       
  1017             i++;
       
  1018     }
       
  1019     htmlStartCloseIndexinitialized = 1;
       
  1020 }
       
  1021 
       
  1022 /**
       
  1023  * htmlGetEndPriority:
       
  1024  * @param name The name of the element to look up the priority for.
       
  1025  *
       
  1026  * Return value: The "endtag" priority.
       
  1027  **/
       
  1028 static int
       
  1029 htmlGetEndPriority (const xmlChar *name) {
       
  1030     int i = 0;
       
  1031 
       
  1032     while ((htmlEndPriority[i].name != NULL) &&
       
  1033            (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
       
  1034         i++;
       
  1035 
       
  1036     return(htmlEndPriority[i].priority);
       
  1037 }
       
  1038 
       
  1039 
       
  1040 /**
       
  1041  * htmlCheckAutoClose:
       
  1042  * @param newtag The new tag name
       
  1043  * @param oldtag The old tag name
       
  1044  *
       
  1045  * Checks whether the new tag is one of the registered valid tags for
       
  1046  * closing old.
       
  1047  * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
       
  1048  *
       
  1049  * Returns 0 if no, 1 if yes.
       
  1050  */
       
  1051 static int
       
  1052 htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
       
  1053 {
       
  1054     int i, indx;
       
  1055     const char **closed = NULL;
       
  1056 
       
  1057     if (htmlStartCloseIndexinitialized == 0)
       
  1058         htmlInitAutoClose();
       
  1059 
       
  1060     /* inefficient, but not a big deal */
       
  1061     for (indx = 0; indx < 100; indx++) {
       
  1062         closed = htmlStartCloseIndex[indx];
       
  1063         if (closed == NULL)
       
  1064             return (0);
       
  1065         if (xmlStrEqual(BAD_CAST * closed, newtag))
       
  1066             break;
       
  1067     }
       
  1068 
       
  1069     i = closed - htmlStartClose;
       
  1070     i++;
       
  1071     while (htmlStartClose[i] != NULL) {
       
  1072         if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
       
  1073             return (1);
       
  1074         }
       
  1075         i++;
       
  1076     }
       
  1077     return (0);
       
  1078 }
       
  1079 
       
  1080 /**
       
  1081  * htmlAutoCloseOnClose:
       
  1082  * @param ctxt an HTML parser context
       
  1083  * @param newtag The new tag name
       
  1084  * @param force force the tag closure
       
  1085  *
       
  1086  * The HTML DTD allows an ending tag to implicitly close other tags.
       
  1087  */
       
  1088 static void
       
  1089 htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
       
  1090 {
       
  1091     const htmlElemDesc *info;
       
  1092     int i, priority;
       
  1093 
       
  1094     priority = htmlGetEndPriority(newtag);
       
  1095 
       
  1096     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
       
  1097 
       
  1098         if (xmlStrEqual(newtag, ctxt->nameTab[i]))
       
  1099             break;
       
  1100         /*
       
  1101          * A missplaced endtag can only close elements with lower
       
  1102          * or equal priority, so if we find an element with higher
       
  1103          * priority before we find an element with
       
  1104          * matching name, we just ignore this endtag
       
  1105          */
       
  1106         if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
       
  1107             return;
       
  1108     }
       
  1109     if (i < 0)
       
  1110         return;
       
  1111 
       
  1112     while (!xmlStrEqual(newtag, ctxt->name)) {
       
  1113         info = htmlTagLookup(ctxt->name);
       
  1114         if ((info != NULL) && (info->endTag == 3)) {
       
  1115             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
       
  1116                          "Opening and ending tag mismatch: %s and %s\n",
       
  1117                          newtag, ctxt->name);
       
  1118         }
       
  1119         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  1120             ctxt->sax->endElement(ctxt->userData, ctxt->name);
       
  1121         htmlnamePop(ctxt);
       
  1122     }
       
  1123 }
       
  1124 
       
  1125 /**
       
  1126  * htmlAutoCloseOnEnd:
       
  1127  * @param ctxt an HTML parser context
       
  1128  *
       
  1129  * Close all remaining tags at the end of the stream
       
  1130  */
       
  1131 static void
       
  1132 htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
       
  1133 {
       
  1134     int i;
       
  1135 
       
  1136     if (ctxt->nameNr == 0)
       
  1137         return;
       
  1138     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
       
  1139         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  1140             ctxt->sax->endElement(ctxt->userData, ctxt->name);
       
  1141         htmlnamePop(ctxt);
       
  1142     }
       
  1143 }
       
  1144 
       
  1145 /**
       
  1146  * htmlAutoClose:
       
  1147  * @param ctxt an HTML parser context
       
  1148  * @param newtag The new tag name or NULL
       
  1149  *
       
  1150  * The HTML DTD allows a tag to implicitly close other tags.
       
  1151  * The list is kept in htmlStartClose array. This function is
       
  1152  * called when a new tag has been detected and generates the
       
  1153  * appropriates closes if possible/needed.
       
  1154  * If newtag is NULL this mean we are at the end of the resource
       
  1155  * and we should check
       
  1156  */
       
  1157 static void
       
  1158 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
       
  1159 {
       
  1160     while ((newtag != NULL) && (ctxt->name != NULL) &&
       
  1161            (htmlCheckAutoClose(newtag, ctxt->name))) {
       
  1162         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  1163             ctxt->sax->endElement(ctxt->userData, ctxt->name);
       
  1164         htmlnamePop(ctxt);
       
  1165     }
       
  1166     if (newtag == NULL) {
       
  1167         htmlAutoCloseOnEnd(ctxt);
       
  1168         return;
       
  1169     }
       
  1170     while ((newtag == NULL) && (ctxt->name != NULL) &&
       
  1171            ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
       
  1172             (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
       
  1173             (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
       
  1174         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  1175             ctxt->sax->endElement(ctxt->userData, ctxt->name);
       
  1176         htmlnamePop(ctxt);
       
  1177     }
       
  1178 }
       
  1179 
       
  1180 /**
       
  1181  * htmlAutoCloseTag:
       
  1182  * @param doc the HTML document
       
  1183  * @param name The tag name
       
  1184  * @param elem the HTML element
       
  1185  *
       
  1186  * The HTML DTD allows a tag to implicitly close other tags.
       
  1187  * The list is kept in htmlStartClose array. This function checks
       
  1188  * if the element or one of it's children would autoclose the
       
  1189  * given tag.
       
  1190  *
       
  1191  * Returns 1 if autoclose, 0 otherwise
       
  1192  */
       
  1193 int
       
  1194 htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
       
  1195     htmlNodePtr child;
       
  1196 
       
  1197     if (elem == NULL) return(1);
       
  1198     if (xmlStrEqual(name, elem->name)) return(0);
       
  1199     if (htmlCheckAutoClose(elem->name, name)) return(1);
       
  1200     child = elem->children;
       
  1201     while (child != NULL) {
       
  1202         if (htmlAutoCloseTag(doc, name, child)) return(1);
       
  1203         child = child->next;
       
  1204     }
       
  1205     return(0);
       
  1206 }
       
  1207 
       
  1208 /**
       
  1209  * htmlIsAutoClosed:
       
  1210  * @param doc the HTML document
       
  1211  * @param elem the HTML element
       
  1212  *
       
  1213  * The HTML DTD allows a tag to implicitly close other tags.
       
  1214  * The list is kept in htmlStartClose array. This function checks
       
  1215  * if a tag is autoclosed by one of it's child
       
  1216  *
       
  1217  * Returns 1 if autoclosed, 0 otherwise
       
  1218  */
       
  1219 int
       
  1220 htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
       
  1221     htmlNodePtr child;
       
  1222 
       
  1223     if (elem == NULL) return(1);
       
  1224     child = elem->children;
       
  1225     while (child != NULL) {
       
  1226         if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
       
  1227         child = child->next;
       
  1228     }
       
  1229     return(0);
       
  1230 }
       
  1231 
       
  1232 /**
       
  1233  * htmlCheckImplied:
       
  1234  * @param ctxt an HTML parser context
       
  1235  * @param newtag The new tag name
       
  1236  *
       
  1237  * The HTML DTD allows a tag to exists only implicitly
       
  1238  * called when a new tag has been detected and generates the
       
  1239  * appropriates implicit tags if missing
       
  1240  */
       
  1241 static void
       
  1242 htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
       
  1243     if (!htmlOmittedDefaultValue)
       
  1244         return;
       
  1245     if (xmlStrEqual(newtag, BAD_CAST"html"))
       
  1246         return;
       
  1247     if (ctxt->nameNr <= 0) {
       
  1248         htmlnamePush(ctxt, BAD_CAST"html");
       
  1249         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
       
  1250             ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
       
  1251     }
       
  1252     if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
       
  1253         return;
       
  1254     if ((ctxt->nameNr <= 1) &&
       
  1255         ((xmlStrEqual(newtag, BAD_CAST"script")) ||
       
  1256          (xmlStrEqual(newtag, BAD_CAST"style")) ||
       
  1257          (xmlStrEqual(newtag, BAD_CAST"meta")) ||
       
  1258          (xmlStrEqual(newtag, BAD_CAST"link")) ||
       
  1259          (xmlStrEqual(newtag, BAD_CAST"title")) ||
       
  1260          (xmlStrEqual(newtag, BAD_CAST"base")))) {
       
  1261             /*
       
  1262              * dropped OBJECT ... i you put it first BODY will be
       
  1263              * assumed !
       
  1264              */
       
  1265             htmlnamePush(ctxt, BAD_CAST"head");
       
  1266             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
       
  1267                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
       
  1268     } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
       
  1269                (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
       
  1270                (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
       
  1271         int i;
       
  1272         for (i = 0;i < ctxt->nameNr;i++) {
       
  1273             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
       
  1274                 return;
       
  1275             }
       
  1276             if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
       
  1277                 return;
       
  1278             }
       
  1279         }
       
  1280 
       
  1281         htmlnamePush(ctxt, BAD_CAST"body");
       
  1282         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
       
  1283             ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
       
  1284     }
       
  1285 }
       
  1286 
       
  1287 /**
       
  1288  * htmlCheckParagraph
       
  1289  * @param ctxt an HTML parser context
       
  1290  *
       
  1291  * Check whether a p element need to be implied before inserting
       
  1292  * characters in the current element.
       
  1293  *
       
  1294  * Returns 1 if a paragraph has been inserted, 0 if not and -1
       
  1295  *         in case of error.
       
  1296  */
       
  1297 
       
  1298 static int
       
  1299 htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
       
  1300     const xmlChar *tag;
       
  1301     int i;
       
  1302 
       
  1303     if (ctxt == NULL)
       
  1304         return(-1);
       
  1305     tag = ctxt->name;
       
  1306     if (tag == NULL) {
       
  1307         htmlAutoClose(ctxt, BAD_CAST"p");
       
  1308         htmlCheckImplied(ctxt, BAD_CAST"p");
       
  1309         htmlnamePush(ctxt, BAD_CAST"p");
       
  1310         if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
       
  1311             ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
       
  1312         return(1);
       
  1313     }
       
  1314     if (!htmlOmittedDefaultValue)
       
  1315         return(0);
       
  1316     for (i = 0; htmlNoContentElements[i] != NULL; i++) {
       
  1317         if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
       
  1318             htmlAutoClose(ctxt, BAD_CAST"p");
       
  1319             htmlCheckImplied(ctxt, BAD_CAST"p");
       
  1320             htmlnamePush(ctxt, BAD_CAST"p");
       
  1321             if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
       
  1322                 ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
       
  1323             return(1);
       
  1324         }
       
  1325     }
       
  1326     return(0);
       
  1327 }
       
  1328 
       
  1329 /**
       
  1330  * htmlIsScriptAttribute:
       
  1331  * @param name an attribute name
       
  1332  *
       
  1333  * Check if an attribute is of content type Script
       
  1334  *
       
  1335  * Returns 1 is the attribute is a script 0 otherwise
       
  1336  */
       
  1337 int
       
  1338 htmlIsScriptAttribute(const xmlChar *name) {
       
  1339     unsigned int i;
       
  1340 
       
  1341     if (name == NULL)
       
  1342         return(0);
       
  1343     /*
       
  1344      * all script attributes start with 'on'
       
  1345      */
       
  1346     if ((name[0] != 'o') || (name[1] != 'n'))
       
  1347         return(0);
       
  1348     for (i = 0;
       
  1349          i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
       
  1350          i++) {
       
  1351         if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
       
  1352             return(1);
       
  1353     }
       
  1354     return(0);
       
  1355 }
       
  1356 
       
  1357 /************************************************************************
       
  1358  *                                                                      *
       
  1359  *              The list of HTML predefined entities                    *
       
  1360  *                                                                      *
       
  1361  ************************************************************************/
       
  1362 
       
  1363 
       
  1364 static const htmlEntityDesc html40EntitiesTable[] = {
       
  1365 /*
       
  1366  * the 4 absolute ones, plus apostrophe.
       
  1367  */
       
  1368 { 34,   "quot", "quotation mark = APL quote, U+0022 ISOnum" },
       
  1369 { 38,   "amp",  "ampersand, U+0026 ISOnum" },
       
  1370 { 39,   "apos", "single quote" },
       
  1371 { 60,   "lt",   "less-than sign, U+003C ISOnum" },
       
  1372 { 62,   "gt",   "greater-than sign, U+003E ISOnum" },
       
  1373 
       
  1374 /*
       
  1375  * A bunch still in the 128-255 range
       
  1376  * Replacing them depend really on the charset used.
       
  1377  */
       
  1378 { 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
       
  1379 { 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
       
  1380 { 162,  "cent", "cent sign, U+00A2 ISOnum" },
       
  1381 { 163,  "pound","pound sign, U+00A3 ISOnum" },
       
  1382 { 164,  "curren","currency sign, U+00A4 ISOnum" },
       
  1383 { 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
       
  1384 { 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
       
  1385 { 167,  "sect", "section sign, U+00A7 ISOnum" },
       
  1386 { 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
       
  1387 { 169,  "copy", "copyright sign, U+00A9 ISOnum" },
       
  1388 { 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
       
  1389 { 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
       
  1390 { 172,  "not",  "not sign, U+00AC ISOnum" },
       
  1391 { 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
       
  1392 { 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
       
  1393 { 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
       
  1394 { 176,  "deg",  "degree sign, U+00B0 ISOnum" },
       
  1395 { 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
       
  1396 { 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
       
  1397 { 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
       
  1398 { 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
       
  1399 { 181,  "micro","micro sign, U+00B5 ISOnum" },
       
  1400 { 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
       
  1401 { 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
       
  1402 { 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
       
  1403 { 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
       
  1404 { 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
       
  1405 { 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
       
  1406 { 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
       
  1407 { 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
       
  1408 { 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
       
  1409 { 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
       
  1410 { 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
       
  1411 { 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
       
  1412 { 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
       
  1413 { 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
       
  1414 { 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
       
  1415 { 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
       
  1416 { 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
       
  1417 { 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
       
  1418 { 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
       
  1419 { 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
       
  1420 { 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
       
  1421 { 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
       
  1422 { 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
       
  1423 { 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
       
  1424 { 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
       
  1425 { 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
       
  1426 { 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
       
  1427 { 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
       
  1428 { 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
       
  1429 { 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
       
  1430 { 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
       
  1431 { 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
       
  1432 { 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
       
  1433 { 215,  "times","multiplication sign, U+00D7 ISOnum" },
       
  1434 { 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
       
  1435 { 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
       
  1436 { 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
       
  1437 { 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
       
  1438 { 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
       
  1439 { 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
       
  1440 { 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
       
  1441 { 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
       
  1442 { 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
       
  1443 { 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
       
  1444 { 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
       
  1445 { 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
       
  1446 { 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
       
  1447 { 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
       
  1448 { 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
       
  1449 { 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
       
  1450 { 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
       
  1451 { 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
       
  1452 { 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
       
  1453 { 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
       
  1454 { 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
       
  1455 { 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
       
  1456 { 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
       
  1457 { 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
       
  1458 { 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
       
  1459 { 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
       
  1460 { 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
       
  1461 { 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
       
  1462 { 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
       
  1463 { 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
       
  1464 { 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
       
  1465 { 247,  "divide","division sign, U+00F7 ISOnum" },
       
  1466 { 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
       
  1467 { 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
       
  1468 { 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
       
  1469 { 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
       
  1470 { 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
       
  1471 { 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
       
  1472 { 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
       
  1473 { 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
       
  1474 
       
  1475 { 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
       
  1476 { 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
       
  1477 { 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
       
  1478 { 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
       
  1479 { 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
       
  1480 
       
  1481 /*
       
  1482  * Anything below should really be kept as entities references
       
  1483  */
       
  1484 { 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
       
  1485 
       
  1486 { 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
       
  1487 { 732,  "tilde","small tilde, U+02DC ISOdia" },
       
  1488 
       
  1489 { 913,  "Alpha","greek capital letter alpha, U+0391" },
       
  1490 { 914,  "Beta", "greek capital letter beta, U+0392" },
       
  1491 { 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
       
  1492 { 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
       
  1493 { 917,  "Epsilon","greek capital letter epsilon, U+0395" },
       
  1494 { 918,  "Zeta", "greek capital letter zeta, U+0396" },
       
  1495 { 919,  "Eta",  "greek capital letter eta, U+0397" },
       
  1496 { 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
       
  1497 { 921,  "Iota", "greek capital letter iota, U+0399" },
       
  1498 { 922,  "Kappa","greek capital letter kappa, U+039A" },
       
  1499 { 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
       
  1500 { 924,  "Mu",   "greek capital letter mu, U+039C" },
       
  1501 { 925,  "Nu",   "greek capital letter nu, U+039D" },
       
  1502 { 926,  "Xi",   "greek capital letter xi, U+039E ISOgrk3" },
       
  1503 { 927,  "Omicron","greek capital letter omicron, U+039F" },
       
  1504 { 928,  "Pi",   "greek capital letter pi, U+03A0 ISOgrk3" },
       
  1505 { 929,  "Rho",  "greek capital letter rho, U+03A1" },
       
  1506 { 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
       
  1507 { 932,  "Tau",  "greek capital letter tau, U+03A4" },
       
  1508 { 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
       
  1509 { 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
       
  1510 { 935,  "Chi",  "greek capital letter chi, U+03A7" },
       
  1511 { 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
       
  1512 { 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
       
  1513 
       
  1514 { 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
       
  1515 { 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
       
  1516 { 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
       
  1517 { 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
       
  1518 { 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
       
  1519 { 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
       
  1520 { 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
       
  1521 { 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
       
  1522 { 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
       
  1523 { 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
       
  1524 { 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
       
  1525 { 956,  "mu",   "greek small letter mu, U+03BC ISOgrk3" },
       
  1526 { 957,  "nu",   "greek small letter nu, U+03BD ISOgrk3" },
       
  1527 { 958,  "xi",   "greek small letter xi, U+03BE ISOgrk3" },
       
  1528 { 959,  "omicron","greek small letter omicron, U+03BF NEW" },
       
  1529 { 960,  "pi",   "greek small letter pi, U+03C0 ISOgrk3" },
       
  1530 { 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
       
  1531 { 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
       
  1532 { 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
       
  1533 { 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
       
  1534 { 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
       
  1535 { 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
       
  1536 { 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
       
  1537 { 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
       
  1538 { 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
       
  1539 { 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
       
  1540 { 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
       
  1541 { 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
       
  1542 
       
  1543 { 8194, "ensp", "en space, U+2002 ISOpub" },
       
  1544 { 8195, "emsp", "em space, U+2003 ISOpub" },
       
  1545 { 8201, "thinsp","thin space, U+2009 ISOpub" },
       
  1546 { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
       
  1547 { 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
       
  1548 { 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
       
  1549 { 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
       
  1550 { 8211, "ndash","en dash, U+2013 ISOpub" },
       
  1551 { 8212, "mdash","em dash, U+2014 ISOpub" },
       
  1552 { 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
       
  1553 { 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
       
  1554 { 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
       
  1555 { 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
       
  1556 { 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
       
  1557 { 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
       
  1558 { 8224, "dagger","dagger, U+2020 ISOpub" },
       
  1559 { 8225, "Dagger","double dagger, U+2021 ISOpub" },
       
  1560 
       
  1561 { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
       
  1562 { 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
       
  1563 
       
  1564 { 8240, "permil","per mille sign, U+2030 ISOtech" },
       
  1565 
       
  1566 { 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
       
  1567 { 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
       
  1568 
       
  1569 { 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
       
  1570 { 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
       
  1571 
       
  1572 { 8254, "oline","overline = spacing overscore, U+203E NEW" },
       
  1573 { 8260, "frasl","fraction slash, U+2044 NEW" },
       
  1574 
       
  1575 { 8364, "euro", "euro sign, U+20AC NEW" },
       
  1576 
       
  1577 { 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
       
  1578 { 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
       
  1579 { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
       
  1580 { 8482, "trade","trade mark sign, U+2122 ISOnum" },
       
  1581 { 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
       
  1582 { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
       
  1583 { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
       
  1584 { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
       
  1585 { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
       
  1586 { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
       
  1587 { 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
       
  1588 { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
       
  1589 { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
       
  1590 { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
       
  1591 { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
       
  1592 { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
       
  1593 
       
  1594 { 8704, "forall","for all, U+2200 ISOtech" },
       
  1595 { 8706, "part", "partial differential, U+2202 ISOtech" },
       
  1596 { 8707, "exist","there exists, U+2203 ISOtech" },
       
  1597 { 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
       
  1598 { 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
       
  1599 { 8712, "isin", "element of, U+2208 ISOtech" },
       
  1600 { 8713, "notin","not an element of, U+2209 ISOtech" },
       
  1601 { 8715, "ni",   "contains as member, U+220B ISOtech" },
       
  1602 { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
       
  1603 { 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
       
  1604 { 8722, "minus","minus sign, U+2212 ISOtech" },
       
  1605 { 8727, "lowast","asterisk operator, U+2217 ISOtech" },
       
  1606 { 8730, "radic","square root = radical sign, U+221A ISOtech" },
       
  1607 { 8733, "prop", "proportional to, U+221D ISOtech" },
       
  1608 { 8734, "infin","infinity, U+221E ISOtech" },
       
  1609 { 8736, "ang",  "angle, U+2220 ISOamso" },
       
  1610 { 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
       
  1611 { 8744, "or",   "logical or = vee, U+2228 ISOtech" },
       
  1612 { 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
       
  1613 { 8746, "cup",  "union = cup, U+222A ISOtech" },
       
  1614 { 8747, "int",  "integral, U+222B ISOtech" },
       
  1615 { 8756, "there4","therefore, U+2234 ISOtech" },
       
  1616 { 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
       
  1617 { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
       
  1618 { 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
       
  1619 { 8800, "ne",   "not equal to, U+2260 ISOtech" },
       
  1620 { 8801, "equiv","identical to, U+2261 ISOtech" },
       
  1621 { 8804, "le",   "less-than or equal to, U+2264 ISOtech" },
       
  1622 { 8805, "ge",   "greater-than or equal to, U+2265 ISOtech" },
       
  1623 { 8834, "sub",  "subset of, U+2282 ISOtech" },
       
  1624 { 8835, "sup",  "superset of, U+2283 ISOtech" },
       
  1625 { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
       
  1626 { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
       
  1627 { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
       
  1628 { 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
       
  1629 { 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
       
  1630 { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
       
  1631 { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
       
  1632 { 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
       
  1633 { 8969, "rceil","right ceiling, U+2309 ISOamsc" },
       
  1634 { 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
       
  1635 { 8971, "rfloor","right floor, U+230B ISOamsc" },
       
  1636 { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
       
  1637 { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
       
  1638 { 9674, "loz",  "lozenge, U+25CA ISOpub" },
       
  1639 
       
  1640 { 9824, "spades","black spade suit, U+2660 ISOpub" },
       
  1641 { 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
       
  1642 { 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
       
  1643 { 9830, "diams","black diamond suit, U+2666 ISOpub" },
       
  1644 
       
  1645 };
       
  1646 
       
  1647 /************************************************************************
       
  1648  *                                                                      *
       
  1649  *              Commodity functions to handle entities                  *
       
  1650  *                                                                      *
       
  1651  ************************************************************************/
       
  1652 
       
  1653 /*
       
  1654  * Macro used to grow the current buffer. Buffer is freed in OOM.
       
  1655  */ // DONE: Fix xmlRealloc  
       
  1656 #define growBuffer(buffer) {                                            \
       
  1657     void* allocTmp;                                                     \
       
  1658     buffer##_size *= 2;                                                 \
       
  1659     allocTmp = xmlRealloc(buffer, buffer##_size * sizeof(xmlChar));     \
       
  1660     if (!allocTmp) {                                                    \
       
  1661         xmlFree(buffer);                                                \
       
  1662         htmlErrMemory(ctxt, "growing buffer\n");                        \
       
  1663         return(NULL);                                                   \
       
  1664     }                                                                   \
       
  1665     buffer = (xmlChar*) allocTmp;                                       \
       
  1666 }
       
  1667 
       
  1668 /**
       
  1669  * htmlEntityLookup:
       
  1670  * @param name the entity name
       
  1671  *
       
  1672  * Lookup the given entity in EntitiesTable
       
  1673  *
       
  1674  
       
  1675  *
       
  1676  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
       
  1677  */
       
  1678 const htmlEntityDesc *
       
  1679 htmlEntityLookup(const xmlChar *name) {
       
  1680     unsigned int i;
       
  1681 
       
  1682     for (i = 0;i < (sizeof(html40EntitiesTable)/
       
  1683                     sizeof(html40EntitiesTable[0]));i++) {
       
  1684         if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
       
  1685             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
       
  1686         }
       
  1687     }
       
  1688     return(NULL);
       
  1689 }
       
  1690 
       
  1691 /**
       
  1692  * htmlEntityValueLookup:
       
  1693  * @param value the entity's unicode value
       
  1694  *
       
  1695  * Lookup the given entity in EntitiesTable
       
  1696  *
       
  1697  
       
  1698  *
       
  1699  * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
       
  1700  */
       
  1701 const htmlEntityDesc *
       
  1702 htmlEntityValueLookup(unsigned int value) {
       
  1703     unsigned int i;
       
  1704 
       
  1705     for (i = 0;i < (sizeof(html40EntitiesTable)/
       
  1706                     sizeof(html40EntitiesTable[0]));i++) {
       
  1707         if (html40EntitiesTable[i].value >= value) {
       
  1708             if (html40EntitiesTable[i].value > value)
       
  1709                 break;
       
  1710             return((htmlEntityDescPtr) &html40EntitiesTable[i]);
       
  1711         }
       
  1712     }
       
  1713     return(NULL);
       
  1714 }
       
  1715 
       
  1716 /**
       
  1717  * UTF8ToHtml:
       
  1718  * @param out a pointer to an array of bytes to store the result
       
  1719  * @param outlen the length of out
       
  1720  * @param in a pointer to an array of UTF-8 chars
       
  1721  * @param inlen the length of in
       
  1722  *
       
  1723  * Take a block of UTF-8 chars in and try to convert it to an ASCII
       
  1724  * plus HTML entities block of chars out.
       
  1725  *
       
  1726  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
       
  1727  * The value of inlen after return is the number of octets consumed
       
  1728  *     as the return value is positive, else unpredictable.
       
  1729  * The value of outlen after return is the number of octets consumed.
       
  1730  */
       
  1731 int
       
  1732 UTF8ToHtml(unsigned char* out, int *outlen,
       
  1733               const unsigned char* in, int *inlen) {
       
  1734     const unsigned char* processed = in;
       
  1735     const unsigned char* outend;
       
  1736     const unsigned char* outstart = out;
       
  1737     const unsigned char* instart = in;
       
  1738     const unsigned char* inend;
       
  1739     unsigned int c, d;
       
  1740     int trailing;
       
  1741 
       
  1742     if (in == NULL) {
       
  1743         /*
       
  1744          * initialization nothing to do
       
  1745          */
       
  1746         *outlen = 0;
       
  1747         *inlen = 0;
       
  1748         return(0);
       
  1749     }
       
  1750     inend = in + (*inlen);
       
  1751     outend = out + (*outlen);
       
  1752     while (in < inend) {
       
  1753         d = *in++;
       
  1754         if      (d < 0x80)  { c= d; trailing= 0; }
       
  1755         else if (d < 0xC0) {
       
  1756             /* trailing byte in leading position */
       
  1757             *outlen = out - outstart;
       
  1758             *inlen = processed - instart;
       
  1759             return(-2);
       
  1760         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
       
  1761         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
       
  1762         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
       
  1763         else {
       
  1764             /* no chance for this in Ascii */
       
  1765             *outlen = out - outstart;
       
  1766             *inlen = processed - instart;
       
  1767             return(-2);
       
  1768         }
       
  1769 
       
  1770         if (inend - in < trailing) {
       
  1771             break;
       
  1772         }
       
  1773 
       
  1774         for ( ; trailing; trailing--) {
       
  1775             if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
       
  1776                 break;
       
  1777             c <<= 6;
       
  1778             c |= d & 0x3F;
       
  1779         }
       
  1780 
       
  1781         /* assertion: c is a single UTF-4 value */
       
  1782         if (c < 0x80) {
       
  1783             if (out + 1 >= outend)
       
  1784                 break;
       
  1785             *out++ = c;
       
  1786         } else {
       
  1787             int len;
       
  1788             const htmlEntityDesc * ent;
       
  1789 
       
  1790             /*
       
  1791              * Try to lookup a predefined HTML entity for it
       
  1792              */
       
  1793 
       
  1794             ent = htmlEntityValueLookup(c);
       
  1795             if (ent == NULL) {
       
  1796                 /* no chance for this in Ascii */
       
  1797                 *outlen = out - outstart;
       
  1798                 *inlen = processed - instart;
       
  1799                 return(-2);
       
  1800             }
       
  1801             len = strlen(ent->name);
       
  1802             if (out + 2 + len >= outend)
       
  1803                 break;
       
  1804             *out++ = '&';
       
  1805             memcpy(out, ent->name, len);
       
  1806             out += len;
       
  1807             *out++ = ';';
       
  1808         }
       
  1809         processed = in;
       
  1810     }
       
  1811     *outlen = out - outstart;
       
  1812     *inlen = processed - instart;
       
  1813     return(0);
       
  1814 }
       
  1815 
       
  1816 /**
       
  1817  * htmlEncodeEntities:
       
  1818  * @param out a pointer to an array of bytes to store the result
       
  1819  * @param outlen the length of out
       
  1820  * @param in a pointer to an array of UTF-8 chars
       
  1821  * @param inlen the length of in
       
  1822  * @param quoteChar the quote character to escape (' or ") or zero.
       
  1823  *
       
  1824  * Take a block of UTF-8 chars in and try to convert it to an ASCII
       
  1825  * plus HTML entities block of chars out.
       
  1826  *
       
  1827  * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
       
  1828  * The value of inlen after return is the number of octets consumed
       
  1829  *     as the return value is positive, else unpredictable.
       
  1830  * The value of outlen after return is the number of octets consumed.
       
  1831  */
       
  1832 int
       
  1833 htmlEncodeEntities(unsigned char* out, int *outlen,
       
  1834                    const unsigned char* in, int *inlen, int quoteChar) {
       
  1835     const unsigned char* processed = in;
       
  1836     const unsigned char* outend = out + (*outlen);
       
  1837     const unsigned char* outstart = out;
       
  1838     const unsigned char* instart = in;
       
  1839     const unsigned char* inend = in + (*inlen);
       
  1840     unsigned int c, d;
       
  1841     int trailing;
       
  1842 
       
  1843     while (in < inend) {
       
  1844         d = *in++;
       
  1845         if      (d < 0x80)  { c= d; trailing= 0; }
       
  1846         else if (d < 0xC0) {
       
  1847             /* trailing byte in leading position */
       
  1848             *outlen = out - outstart;
       
  1849             *inlen = processed - instart;
       
  1850             return(-2);
       
  1851         } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
       
  1852         else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
       
  1853         else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
       
  1854         else {
       
  1855             /* no chance for this in Ascii */
       
  1856             *outlen = out - outstart;
       
  1857             *inlen = processed - instart;
       
  1858             return(-2);
       
  1859         }
       
  1860 
       
  1861         if (inend - in < trailing)
       
  1862             break;
       
  1863 
       
  1864         while (trailing--) {
       
  1865             if (((d= *in++) & 0xC0) != 0x80) {
       
  1866                 *outlen = out - outstart;
       
  1867                 *inlen = processed - instart;
       
  1868                 return(-2);
       
  1869             }
       
  1870             c <<= 6;
       
  1871             c |= d & 0x3F;
       
  1872         }
       
  1873 
       
  1874         /* assertion: c is a single UTF-4 value */
       
  1875         if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
       
  1876             (c != '&') && (c != '<') && (c != '>')) {
       
  1877             if (out >= outend)
       
  1878                 break;
       
  1879             *out++ = c;
       
  1880         } else {
       
  1881             const htmlEntityDesc * ent;
       
  1882             const char *cp;
       
  1883             char nbuf[16];
       
  1884             int len;
       
  1885 
       
  1886             /*
       
  1887              * Try to lookup a predefined HTML entity for it
       
  1888              */
       
  1889             ent = htmlEntityValueLookup(c);
       
  1890             if (ent == NULL) {
       
  1891                 snprintf(nbuf, sizeof(nbuf), "#%u", c);
       
  1892                 cp = nbuf;
       
  1893             }
       
  1894             else
       
  1895                 cp = ent->name;
       
  1896             len = strlen(cp);
       
  1897             if (out + 2 + len > outend)
       
  1898                 break;
       
  1899             *out++ = '&';
       
  1900             memcpy(out, cp, len);
       
  1901             out += len;
       
  1902             *out++ = ';';
       
  1903         }
       
  1904         processed = in;
       
  1905     }
       
  1906     *outlen = out - outstart;
       
  1907     *inlen = processed - instart;
       
  1908     return(0);
       
  1909 }
       
  1910 
       
  1911 /************************************************************************
       
  1912  *                                                                      *
       
  1913  *              Commodity functions to handle streams                   *
       
  1914  *                                                                      *
       
  1915  ************************************************************************/
       
  1916 
       
  1917 /**
       
  1918  * htmlNewInputStream:
       
  1919  * @param ctxt an HTML parser context
       
  1920  *
       
  1921  * Create a new input stream structure
       
  1922  * Returns the new input stream or NULL
       
  1923  */
       
  1924 static htmlParserInputPtr
       
  1925 htmlNewInputStream(htmlParserCtxtPtr ctxt) {
       
  1926     htmlParserInputPtr input;
       
  1927 
       
  1928     input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
       
  1929     if (input == NULL) {
       
  1930         htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
       
  1931         return(NULL);
       
  1932     }
       
  1933     memset(input, 0, sizeof(htmlParserInput));
       
  1934     input->filename = NULL;
       
  1935     input->directory = NULL;
       
  1936     input->base = NULL;
       
  1937     input->cur = NULL;
       
  1938     input->buf = NULL;
       
  1939     input->line = 1;
       
  1940     input->col = 1;
       
  1941     input->buf = NULL;
       
  1942     input->free = NULL;
       
  1943     input->version = NULL;
       
  1944     input->consumed = 0;
       
  1945     input->length = 0;
       
  1946     return(input);
       
  1947 }
       
  1948 
       
  1949 
       
  1950 /************************************************************************
       
  1951  *                                                                      *
       
  1952  *              Commodity functions, cleanup needed ?                   *
       
  1953  *                                                                      *
       
  1954  ************************************************************************/
       
  1955 /*
       
  1956  * all tags allowing pc data from the html 4.01 loose dtd
       
  1957  * NOTE: it might be more apropriate to integrate this information
       
  1958  * into the html40ElementTable array but I don't want to risk any
       
  1959  * binary incomptibility
       
  1960  */
       
  1961 static const char * const allowPCData[] = {
       
  1962     "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
       
  1963     "blockquote", "body", "button", "caption", "center", "cite", "code",
       
  1964     "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
       
  1965     "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
       
  1966     "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
       
  1967     "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
       
  1968 };
       
  1969 
       
  1970 /**
       
  1971  * areBlanks:
       
  1972  * @param ctxt an HTML parser context
       
  1973  * @param str a xmlChar *
       
  1974  * @param len the size of str
       
  1975  *
       
  1976  * Is this a sequence of blank chars that one can ignore ?
       
  1977  *
       
  1978  * Returns 1 if ignorable 0 otherwise.
       
  1979  */
       
  1980 
       
  1981 static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
       
  1982     unsigned int i;
       
  1983     int j;
       
  1984     xmlNodePtr lastChild;
       
  1985 
       
  1986     for (j = 0;j < len;j++)
       
  1987         if (!(IS_BLANK_CH(str[j]))) return(0);
       
  1988 
       
  1989     if (CUR == 0) return(1);
       
  1990     if (CUR != '<') return(0);
       
  1991     if (ctxt->name == NULL)
       
  1992         return(1);
       
  1993     if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
       
  1994         return(1);
       
  1995     if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
       
  1996         return(1);
       
  1997     if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
       
  1998         return(1);
       
  1999     if (ctxt->node == NULL) return(0);
       
  2000     lastChild = xmlGetLastChild(ctxt->node);
       
  2001     while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
       
  2002         lastChild = lastChild->prev;
       
  2003     if (lastChild == NULL) {
       
  2004         if ((ctxt->node->type != XML_ELEMENT_NODE) &&
       
  2005             (ctxt->node->content != NULL)) return(0);
       
  2006         /* keep ws in constructs like ...<b> </b>...
       
  2007            for all tags "b" allowing PCDATA */
       
  2008         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
       
  2009             if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
       
  2010                 return(0);
       
  2011             }
       
  2012         }
       
  2013     } else if (xmlNodeIsText(lastChild)) {
       
  2014         return(0);
       
  2015     } else {
       
  2016         /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
       
  2017            for all tags "p" allowing PCDATA */
       
  2018         for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
       
  2019             if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
       
  2020                 return(0);
       
  2021             }
       
  2022         }
       
  2023     }
       
  2024     return(1);
       
  2025 }
       
  2026 #endif  /* defined(LIBXML_HTML_ENABLED */
       
  2027 
       
  2028 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
       
  2029 
       
  2030 /**
       
  2031  * htmlErrMemory:
       
  2032  * @param ctxt an HTML parser context
       
  2033  * @param extra extra informations
       
  2034  *
       
  2035  * Handle a redefinition of attribute error
       
  2036  */
       
  2037 static void
       
  2038 htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
       
  2039 {
       
  2040     if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
       
  2041         (ctxt->instate == XML_PARSER_EOF))
       
  2042     return;
       
  2043     if (ctxt != NULL) {
       
  2044         ctxt->errNo = XML_ERR_NO_MEMORY;
       
  2045         ctxt->instate = XML_PARSER_EOF;
       
  2046         ctxt->disableSAX = 1;
       
  2047     }
       
  2048     if (extra)
       
  2049         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
       
  2050                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
       
  2051                         NULL, NULL, 0, 0,
       
  2052                         "Memory allocation failed : %s\n", extra);
       
  2053     else
       
  2054         __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
       
  2055                         XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
       
  2056                         NULL, NULL, 0, 0, "Memory allocation failed\n");
       
  2057 }
       
  2058 
       
  2059 /**
       
  2060  * htmlNewDocNoDtD:
       
  2061  * @param URI URI for the dtd, or NULL
       
  2062  * @param ExternalID the external ID of the DTD, or NULL
       
  2063  *
       
  2064  * Creates a new HTML document without a DTD node if URI and ExternalID
       
  2065  * are NULL
       
  2066  *
       
  2067  * Returns a new document, do not initialize the DTD if not provided
       
  2068  */
       
  2069 XMLPUBFUNEXPORT htmlDocPtr
       
  2070 htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
       
  2071     xmlDocPtr cur;
       
  2072 
       
  2073     /*
       
  2074      * Allocate a new document and fill the fields.
       
  2075      */
       
  2076     cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
       
  2077     if (cur == NULL) {
       
  2078         htmlErrMemory(NULL, "HTML document creation failed\n");
       
  2079         return(NULL);
       
  2080     }
       
  2081     memset(cur, 0, sizeof(xmlDoc));
       
  2082 
       
  2083     cur->type = XML_HTML_DOCUMENT_NODE;
       
  2084 #ifdef XE_ENABLE_GS_CACHING
       
  2085     cur->cachedGs = xmlGetGlobalState();
       
  2086 #endif
       
  2087 
       
  2088     //cur->version = NULL;
       
  2089     //cur->intSubset = NULL;
       
  2090     cur->doc = cur;
       
  2091     //cur->name = NULL;
       
  2092     //cur->children = NULL;
       
  2093     //cur->extSubset = NULL;
       
  2094     //cur->oldNs = NULL;
       
  2095     //cur->encoding = NULL;
       
  2096     cur->standalone = 1;
       
  2097     //cur->compression = 0;
       
  2098     //cur->ids = NULL;
       
  2099     //cur->refs = NULL;
       
  2100     //cur->_private = NULL;
       
  2101 
       
  2102     if (ExternalID || URI)
       
  2103         xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
       
  2104 
       
  2105     return(cur);
       
  2106 }
       
  2107 
       
  2108 /**
       
  2109  * htmlNewDoc:
       
  2110  * @param URI URI for the dtd, or NULL
       
  2111  * @param ExternalID the external ID of the DTD, or NULL
       
  2112  *
       
  2113  * Creates a new HTML document
       
  2114  *
       
  2115  * Returns a new document
       
  2116  */
       
  2117 XMLPUBFUNEXPORT htmlDocPtr
       
  2118 htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
       
  2119     if ((URI == NULL) && (ExternalID == NULL))
       
  2120         return(htmlNewDocNoDtD(
       
  2121                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
       
  2122                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
       
  2123 
       
  2124     return(htmlNewDocNoDtD(URI, ExternalID));
       
  2125 }
       
  2126 
       
  2127 /**
       
  2128  * htmlTagLookup:
       
  2129  * @param tag The tag name in lowercase
       
  2130  *
       
  2131  * Lookup the HTML tag in the ElementTable
       
  2132  *
       
  2133  * Returns the related htmlElemDescPtr or NULL if not found.
       
  2134  */
       
  2135 XMLPUBFUNEXPORT const htmlElemDesc *
       
  2136 htmlTagLookup(const xmlChar *tag) {
       
  2137     unsigned int i;
       
  2138 
       
  2139     for (i = 0; i < (sizeof(html40ElementTable) /
       
  2140                      sizeof(html40ElementTable[0]));i++) {
       
  2141         if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
       
  2142         return((htmlElemDescPtr) &html40ElementTable[i]);
       
  2143     }
       
  2144     return(NULL);
       
  2145 }
       
  2146 
       
  2147 #endif /* defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT) */
       
  2148 
       
  2149 #if defined(LIBXML_HTML_ENABLED)
       
  2150 
       
  2151 /************************************************************************
       
  2152  *                                                                      *
       
  2153  *                      The parser itself                               *
       
  2154  *      Relates to http://www.w3.org/TR/html40                          *
       
  2155  *                                                                      *
       
  2156  ************************************************************************/
       
  2157 
       
  2158 /************************************************************************
       
  2159  *                                                                      *
       
  2160  *                      The parser itself                               *
       
  2161  *                                                                      *
       
  2162  ************************************************************************/
       
  2163 
       
  2164 static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
       
  2165 
       
  2166 /**
       
  2167  * htmlParseHTMLName:
       
  2168  * @param ctxt an HTML parser context
       
  2169  *
       
  2170  * parse an HTML tag or attribute name, note that we convert it to lowercase
       
  2171  * since HTML names are not case-sensitive.
       
  2172  *
       
  2173  * Returns the Tag Name parsed or NULL
       
  2174  */
       
  2175 
       
  2176 static const xmlChar *
       
  2177 htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
       
  2178     int i = 0;
       
  2179     xmlChar loc[HTML_PARSER_BUFFER_SIZE];
       
  2180 
       
  2181     if (!IS_LETTER_CH(CUR) && (CUR != '_') &&
       
  2182         (CUR != ':')) return(NULL);
       
  2183 
       
  2184     while ((i < HTML_PARSER_BUFFER_SIZE) &&
       
  2185            ((IS_LETTER_CH(CUR)) || (IS_DIGIT_CH(CUR)) ||
       
  2186            (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
       
  2187         if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
       
  2188         else loc[i] = CUR;
       
  2189         i++;
       
  2190 
       
  2191         NEXT;
       
  2192     }
       
  2193 
       
  2194     return(xmlDictLookup(ctxt->dict, loc, i));
       
  2195 }
       
  2196 
       
  2197 /**
       
  2198  * htmlParseName:
       
  2199  * @param ctxt an HTML parser context
       
  2200  *
       
  2201  * parse an HTML name, this routine is case sensitive.
       
  2202  *
       
  2203  * Returns the Name parsed or NULL
       
  2204  */
       
  2205 
       
  2206 static const xmlChar *
       
  2207 htmlParseName(htmlParserCtxtPtr ctxt) {
       
  2208     const xmlChar *in;
       
  2209     const xmlChar *ret;
       
  2210     int count = 0;
       
  2211 
       
  2212     GROW;
       
  2213 
       
  2214     /*
       
  2215      * Accelerator for simple ASCII names
       
  2216      */
       
  2217     in = ctxt->input->cur;
       
  2218     if (((*in >= 0x61) && (*in <= 0x7A)) ||
       
  2219         ((*in >= 0x41) && (*in <= 0x5A)) ||
       
  2220         (*in == '_') || (*in == ':')) {
       
  2221         in++;
       
  2222         while (((*in >= 0x61) && (*in <= 0x7A)) ||
       
  2223                ((*in >= 0x41) && (*in <= 0x5A)) ||
       
  2224                ((*in >= 0x30) && (*in <= 0x39)) ||
       
  2225                (*in == '_') || (*in == '-') ||
       
  2226                (*in == ':') || (*in == '.'))
       
  2227             in++;
       
  2228         if ((*in > 0) && (*in < 0x80)) {
       
  2229             count = in - ctxt->input->cur;
       
  2230             ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
       
  2231             ctxt->input->cur = in;
       
  2232             ctxt->nbChars += count;
       
  2233             ctxt->input->col += count;
       
  2234             return(ret);
       
  2235         }
       
  2236     }
       
  2237     return(htmlParseNameComplex(ctxt));
       
  2238 }
       
  2239 
       
  2240 static const xmlChar *
       
  2241 htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
       
  2242     int len = 0, l;
       
  2243     int c;
       
  2244     int count = 0;
       
  2245 
       
  2246     /*
       
  2247      * Handler for more complex cases
       
  2248      */
       
  2249     GROW;
       
  2250     c = CUR_CHAR(l);
       
  2251     if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
       
  2252         (!IS_LETTER(c) && (c != '_') &&
       
  2253          (c != ':'))) {
       
  2254         return(NULL);
       
  2255     }
       
  2256 
       
  2257     while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
       
  2258            ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
       
  2259             (c == '.') || (c == '-') ||
       
  2260             (c == '_') || (c == ':') ||
       
  2261             (IS_COMBINING(c)) ||
       
  2262             (IS_EXTENDER(c)))) {
       
  2263         if (count++ > 100) {
       
  2264             count = 0;
       
  2265             GROW;
       
  2266         }
       
  2267         len += l;
       
  2268         NEXTL(l);
       
  2269         c = CUR_CHAR(l);
       
  2270     }
       
  2271     return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
       
  2272 }
       
  2273 
       
  2274 
       
  2275 /**
       
  2276  * htmlParseHTMLAttribute:
       
  2277  * @param ctxt an HTML parser context
       
  2278  * @param stop a char stop value
       
  2279  *
       
  2280  * parse an HTML attribute value till the stop (quote), if
       
  2281  * stop is 0 then it stops at the first space
       
  2282  *
       
  2283  * Returns the attribute parsed or NULL
       
  2284  */
       
  2285 
       
  2286 static xmlChar *
       
  2287 htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
       
  2288     xmlChar *buffer = NULL;
       
  2289     int buffer_size = 0;
       
  2290     xmlChar *out = NULL;
       
  2291     const xmlChar *name = NULL;
       
  2292     const xmlChar *cur = NULL;
       
  2293     const htmlEntityDesc * ent;
       
  2294 
       
  2295     /*
       
  2296      * allocate a translation buffer.
       
  2297      */
       
  2298     buffer_size = HTML_PARSER_BUFFER_SIZE;
       
  2299     buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
       
  2300     if (buffer == NULL) {
       
  2301         htmlErrMemory(ctxt, "buffer allocation failed\n");
       
  2302         return(NULL);
       
  2303     }
       
  2304     out = buffer;
       
  2305 
       
  2306     /*
       
  2307      * Ok loop until we reach one of the ending chars
       
  2308      */
       
  2309     while ((CUR != 0) && (CUR != stop)) {
       
  2310         if ((stop == 0) && (CUR == '>')) break;
       
  2311         if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
       
  2312         if (CUR == '&') {
       
  2313             if (NXT(1) == '#') {
       
  2314                 unsigned int c;
       
  2315                 int bits;
       
  2316 
       
  2317                 c = htmlParseCharRef(ctxt);
       
  2318                 if      (c <    0x80)
       
  2319                         { *out++  = c;                bits= -6; }
       
  2320                 else if (c <   0x800)
       
  2321                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
       
  2322                 else if (c < 0x10000)
       
  2323                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
       
  2324                 else
       
  2325                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
       
  2326 
       
  2327                 for ( ; bits >= 0; bits-= 6) {
       
  2328                     *out++  = ((c >> bits) & 0x3F) | 0x80;
       
  2329                 }
       
  2330 
       
  2331                 if (out - buffer > buffer_size - 100) {
       
  2332                         int indx = out - buffer;
       
  2333 
       
  2334                         growBuffer(buffer);
       
  2335                         out = &buffer[indx];
       
  2336                 }
       
  2337             } else {
       
  2338                 ent = htmlParseEntityRef(ctxt, &name);
       
  2339                 if (name == NULL) {
       
  2340                     *out++ = '&';
       
  2341                     if (out - buffer > buffer_size - 100) {
       
  2342                         int indx = out - buffer;
       
  2343 
       
  2344                         growBuffer(buffer);
       
  2345                         out = &buffer[indx];
       
  2346                     }
       
  2347                 } else if (ent == NULL) {
       
  2348                     *out++ = '&';
       
  2349                     cur = name;
       
  2350                     while (*cur != 0) {
       
  2351                         if (out - buffer > buffer_size - 100) {
       
  2352                             int indx = out - buffer;
       
  2353 
       
  2354                             growBuffer(buffer);
       
  2355                             out = &buffer[indx];
       
  2356                         }
       
  2357                         *out++ = *cur++;
       
  2358                     }
       
  2359                 } else {
       
  2360                     unsigned int c;
       
  2361                     int bits;
       
  2362 
       
  2363                     if (out - buffer > buffer_size - 100) {
       
  2364                         int indx = out - buffer;
       
  2365 
       
  2366                         growBuffer(buffer);
       
  2367                         out = &buffer[indx];
       
  2368                     }
       
  2369                     c = (xmlChar)ent->value;
       
  2370                     if      (c <    0x80)
       
  2371                         { *out++  = c;                bits= -6; }
       
  2372                     else if (c <   0x800)
       
  2373                         { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
       
  2374                     else if (c < 0x10000)
       
  2375                         { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
       
  2376                     else
       
  2377                         { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
       
  2378 
       
  2379                     for ( ; bits >= 0; bits-= 6) {
       
  2380                         *out++  = ((c >> bits) & 0x3F) | 0x80;
       
  2381                     }
       
  2382                 }
       
  2383             }
       
  2384         } else {
       
  2385             unsigned int c;
       
  2386             int bits, l;
       
  2387 
       
  2388             if (out - buffer > buffer_size - 100) {
       
  2389                 int indx = out - buffer;
       
  2390 
       
  2391                 growBuffer(buffer);
       
  2392                 out = &buffer[indx];
       
  2393             }
       
  2394             c = CUR_CHAR(l);
       
  2395             if      (c <    0x80)
       
  2396                     { *out++  = c;                bits= -6; }
       
  2397             else if (c <   0x800)
       
  2398                     { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
       
  2399             else if (c < 0x10000)
       
  2400                     { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
       
  2401             else
       
  2402                     { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
       
  2403 
       
  2404             for ( ; bits >= 0; bits-= 6) {
       
  2405                 *out++  = ((c >> bits) & 0x3F) | 0x80;
       
  2406             }
       
  2407             NEXT;
       
  2408         }
       
  2409     }
       
  2410     *out++ = 0;
       
  2411     return(buffer);
       
  2412 }
       
  2413 
       
  2414 /**
       
  2415  * htmlParseEntityRef:
       
  2416  * @param ctxt an HTML parser context
       
  2417  * @param str location to store the entity name
       
  2418  *
       
  2419  * parse an HTML ENTITY references
       
  2420  *
       
  2421  * [68] EntityRef ::= '&' Name ';'
       
  2422  *
       
  2423  * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
       
  2424  *         if non-NULL *str will have to be freed by the caller.
       
  2425  */
       
  2426 const htmlEntityDesc *
       
  2427 htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
       
  2428     const xmlChar *name;
       
  2429     const htmlEntityDesc * ent = NULL;
       
  2430     *str = NULL;
       
  2431 
       
  2432     if (CUR == '&') {
       
  2433         NEXT;
       
  2434         name = htmlParseName(ctxt);
       
  2435         if (name == NULL) {
       
  2436             htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
       
  2437                          "htmlParseEntityRef: no name\n", NULL, NULL);
       
  2438         } else {
       
  2439             GROW;
       
  2440             if (CUR == ';') {
       
  2441                 *str = name;
       
  2442 
       
  2443                 /*
       
  2444                  * Lookup the entity in the table.
       
  2445                  */
       
  2446                 ent = htmlEntityLookup(name);
       
  2447                 if (ent != NULL) /* OK that's ugly !!! */
       
  2448                     NEXT;
       
  2449             } else {
       
  2450                 htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
       
  2451                              "htmlParseEntityRef: expecting ';'\n",
       
  2452                              NULL, NULL);
       
  2453                 *str = name;
       
  2454             }
       
  2455         }
       
  2456     }
       
  2457     return(ent);
       
  2458 }
       
  2459 
       
  2460 /**
       
  2461  * htmlParseAttValue:
       
  2462  * @param ctxt an HTML parser context
       
  2463  *
       
  2464  * parse a value for an attribute
       
  2465  * Note: the parser won't do substitution of entities here, this
       
  2466  * will be handled later in xmlStringGetNodeList, unless it was
       
  2467  * asked for ctxt->replaceEntities != 0
       
  2468  *
       
  2469  * Returns the AttValue parsed or NULL.
       
  2470  */
       
  2471 
       
  2472 static xmlChar *
       
  2473 htmlParseAttValue(htmlParserCtxtPtr ctxt) {
       
  2474     xmlChar *ret = NULL;
       
  2475 
       
  2476     if (CUR == '"') {
       
  2477         NEXT;
       
  2478         ret = htmlParseHTMLAttribute(ctxt, '"');
       
  2479         if (CUR != '"') {
       
  2480             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
       
  2481                          "AttValue: \" expected\n", NULL, NULL);
       
  2482         } else
       
  2483             NEXT;
       
  2484     } else if (CUR == '\'') {
       
  2485         NEXT;
       
  2486         ret = htmlParseHTMLAttribute(ctxt, '\'');
       
  2487         if (CUR != '\'') {
       
  2488             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
       
  2489                          "AttValue: ' expected\n", NULL, NULL);
       
  2490         } else
       
  2491             NEXT;
       
  2492     } else {
       
  2493         /*
       
  2494          * That's an HTMLism, the attribute value may not be quoted
       
  2495          */
       
  2496         ret = htmlParseHTMLAttribute(ctxt, 0);
       
  2497         if (ret == NULL) {
       
  2498             htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
       
  2499                          "AttValue: no value found\n", NULL, NULL);
       
  2500         }
       
  2501     }
       
  2502     return(ret);
       
  2503 }
       
  2504 
       
  2505 /**
       
  2506  * htmlParseSystemLiteral:
       
  2507  * @param ctxt an HTML parser context
       
  2508  *
       
  2509  * parse an HTML Literal
       
  2510  *
       
  2511  * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
       
  2512  *
       
  2513  * Returns the SystemLiteral parsed or NULL
       
  2514  */
       
  2515 
       
  2516 static xmlChar *
       
  2517 htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
       
  2518     const xmlChar *q;
       
  2519     xmlChar *ret = NULL;
       
  2520 
       
  2521     if (CUR == '"') {
       
  2522         NEXT;
       
  2523         q = CUR_PTR;
       
  2524         while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
       
  2525             NEXT;
       
  2526         if (!IS_CHAR_CH(CUR)) {
       
  2527             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
       
  2528                          "Unfinished SystemLiteral\n", NULL, NULL);
       
  2529         } else {
       
  2530             ret = xmlStrndup(q, CUR_PTR - q);
       
  2531             NEXT;
       
  2532         }
       
  2533     } else if (CUR == '\'') {
       
  2534         NEXT;
       
  2535         q = CUR_PTR;
       
  2536         while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
       
  2537             NEXT;
       
  2538         if (!IS_CHAR_CH(CUR)) {
       
  2539             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
       
  2540                          "Unfinished SystemLiteral\n", NULL, NULL);
       
  2541         } else {
       
  2542             ret = xmlStrndup(q, CUR_PTR - q);
       
  2543             NEXT;
       
  2544         }
       
  2545     } else {
       
  2546         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
       
  2547                      " or ' expected\n", NULL, NULL);
       
  2548     }
       
  2549 
       
  2550     return(ret);
       
  2551 }
       
  2552 
       
  2553 /**
       
  2554  * htmlParsePubidLiteral:
       
  2555  * @param ctxt an HTML parser context
       
  2556  *
       
  2557  * parse an HTML public literal
       
  2558  *
       
  2559  * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
       
  2560  *
       
  2561  * Returns the PubidLiteral parsed or NULL.
       
  2562  */
       
  2563 
       
  2564 static xmlChar *
       
  2565 htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
       
  2566     const xmlChar *q;
       
  2567     xmlChar *ret = NULL;
       
  2568     /*
       
  2569      * Name ::= (Letter | '_') (NameChar)*
       
  2570      */
       
  2571     if (CUR == '"') {
       
  2572         NEXT;
       
  2573         q = CUR_PTR;
       
  2574         while (IS_PUBIDCHAR_CH(CUR)) NEXT;
       
  2575         if (CUR != '"') {
       
  2576             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
       
  2577                          "Unfinished PubidLiteral\n", NULL, NULL);
       
  2578         } else {
       
  2579             ret = xmlStrndup(q, CUR_PTR - q);
       
  2580             NEXT;
       
  2581         }
       
  2582     } else if (CUR == '\'') {
       
  2583         NEXT;
       
  2584         q = CUR_PTR;
       
  2585         while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
       
  2586             NEXT;
       
  2587         if (CUR != '\'') {
       
  2588             htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
       
  2589                          "Unfinished PubidLiteral\n", NULL, NULL);
       
  2590         } else {
       
  2591             ret = xmlStrndup(q, CUR_PTR - q);
       
  2592             NEXT;
       
  2593         }
       
  2594     } else {
       
  2595         htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
       
  2596                      "PubidLiteral \" or ' expected\n", NULL, NULL);
       
  2597     }
       
  2598 
       
  2599     return(ret);
       
  2600 }
       
  2601 
       
  2602 /**
       
  2603  * htmlParseScript:
       
  2604  * @param ctxt an HTML parser context
       
  2605  *
       
  2606  * parse the content of an HTML SCRIPT or STYLE element
       
  2607  * http://www.w3.org/TR/html4/sgml/dtd.html#Script
       
  2608  * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
       
  2609  * http://www.w3.org/TR/html4/types.html#type-script
       
  2610  * http://www.w3.org/TR/html4/types.html#h-6.15
       
  2611  * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
       
  2612  *
       
  2613  * Script data ( %Script; in the DTD) can be the content of the SCRIPT
       
  2614  * element and the value of intrinsic event attributes. User agents must
       
  2615  * not evaluate script data as HTML markup but instead must pass it on as
       
  2616  * data to a script engine.
       
  2617  * NOTES:
       
  2618  * - The content is passed like CDATA
       
  2619  * - the attributes for style and scripting "onXXX" are also described
       
  2620  *   as CDATA but SGML allows entities references in attributes so their
       
  2621  *   processing is identical as other attributes
       
  2622  */
       
  2623 static void
       
  2624 htmlParseScript(htmlParserCtxtPtr ctxt) {
       
  2625     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
       
  2626     int nbchar = 0;
       
  2627     xmlChar cur;
       
  2628 
       
  2629     SHRINK;
       
  2630     cur = CUR;
       
  2631     while (IS_CHAR_CH(cur)) {
       
  2632         if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
       
  2633             (NXT(3) == '-')) {
       
  2634             if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
       
  2635                 if (ctxt->sax->cdataBlock!= NULL) {
       
  2636                     /*
       
  2637                      * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
       
  2638                      */
       
  2639                     ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
       
  2640                 } else if (ctxt->sax->characters != NULL) {
       
  2641                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
       
  2642                 }
       
  2643             }
       
  2644             nbchar = 0;
       
  2645             htmlParseComment(ctxt);
       
  2646             cur = CUR;
       
  2647             continue;
       
  2648         } else if ((cur == '<') && (NXT(1) == '/')) {
       
  2649             /*
       
  2650              * One should break here, the specification is clear:
       
  2651              * Authors should therefore escape "</" within the content.
       
  2652              * Escape mechanisms are specific to each scripting or
       
  2653              * style sheet language.
       
  2654              */
       
  2655             if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
       
  2656                 ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
       
  2657                 break; /* while */
       
  2658         }
       
  2659         buf[nbchar++] = cur;
       
  2660         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
       
  2661             if (ctxt->sax->cdataBlock!= NULL) {
       
  2662                 /*
       
  2663                  * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
       
  2664                  */
       
  2665                 ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
       
  2666             } else if (ctxt->sax->characters != NULL) {
       
  2667                 ctxt->sax->characters(ctxt->userData, buf, nbchar);
       
  2668             }
       
  2669             nbchar = 0;
       
  2670         }
       
  2671         NEXT;
       
  2672         cur = CUR;
       
  2673     }
       
  2674     if (!(IS_CHAR_CH(cur))) {
       
  2675         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
       
  2676                         "Invalid char in CDATA 0x%X\n", cur);
       
  2677         NEXT;
       
  2678     }
       
  2679 
       
  2680     if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
       
  2681         if (ctxt->sax->cdataBlock!= NULL) {
       
  2682             /*
       
  2683              * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
       
  2684              */
       
  2685             ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
       
  2686         } else if (ctxt->sax->characters != NULL) {
       
  2687             ctxt->sax->characters(ctxt->userData, buf, nbchar);
       
  2688         }
       
  2689     }
       
  2690 }
       
  2691 
       
  2692 
       
  2693 /**
       
  2694  * htmlParseCharData:
       
  2695  * @param ctxt an HTML parser context
       
  2696  *
       
  2697  * parse a CharData section.
       
  2698  * if we are within a CDATA section ']]>' marks an end of section.
       
  2699  *
       
  2700  * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
       
  2701  */
       
  2702 
       
  2703 static void
       
  2704 htmlParseCharData(htmlParserCtxtPtr ctxt) {
       
  2705     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
       
  2706     int nbchar = 0;
       
  2707     int cur, l;
       
  2708 
       
  2709     SHRINK;
       
  2710     cur = CUR_CHAR(l);
       
  2711     while (((cur != '<') || (ctxt->token == '<')) &&
       
  2712            ((cur != '&') || (ctxt->token == '&')) &&
       
  2713            (IS_CHAR(cur))) {
       
  2714         COPY_BUF(l,buf,nbchar,cur);
       
  2715         if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
       
  2716             /*
       
  2717              * Ok the segment is to be consumed as chars.
       
  2718              */
       
  2719             if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
       
  2720                 if (areBlanks(ctxt, buf, nbchar)) {
       
  2721                     if (ctxt->sax->ignorableWhitespace != NULL)
       
  2722                         ctxt->sax->ignorableWhitespace(ctxt->userData,
       
  2723                                                        buf, nbchar);
       
  2724                 } else {
       
  2725                     htmlCheckParagraph(ctxt);
       
  2726                     if (ctxt->sax->characters != NULL)
       
  2727                         ctxt->sax->characters(ctxt->userData, buf, nbchar);
       
  2728                 }
       
  2729             }
       
  2730             nbchar = 0;
       
  2731         }
       
  2732         NEXTL(l);
       
  2733         cur = CUR_CHAR(l);
       
  2734         if (cur == 0) {
       
  2735             SHRINK;
       
  2736             GROW;
       
  2737             cur = CUR_CHAR(l);
       
  2738         }
       
  2739     }
       
  2740     if (nbchar != 0) {
       
  2741         /*
       
  2742          * Ok the segment is to be consumed as chars.
       
  2743          */
       
  2744         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
       
  2745             if (areBlanks(ctxt, buf, nbchar)) {
       
  2746                 if (ctxt->sax->ignorableWhitespace != NULL)
       
  2747                     ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
       
  2748             } else {
       
  2749                 htmlCheckParagraph(ctxt);
       
  2750                 if (ctxt->sax->characters != NULL)
       
  2751                     ctxt->sax->characters(ctxt->userData, buf, nbchar);
       
  2752             }
       
  2753         }
       
  2754     } else {
       
  2755         /*
       
  2756          * Loop detection
       
  2757          */
       
  2758         if (cur == 0)
       
  2759             ctxt->instate = XML_PARSER_EOF;
       
  2760     }
       
  2761 }
       
  2762 
       
  2763 /**
       
  2764  * htmlParseExternalID:
       
  2765  * @param ctxt an HTML parser context
       
  2766  * @param publicID a xmlChar** receiving PubidLiteral
       
  2767  *
       
  2768  * Parse an External ID or a Public ID
       
  2769  *
       
  2770  * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
       
  2771  *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
       
  2772  *
       
  2773  * [83] PublicID ::= 'PUBLIC' S PubidLiteral
       
  2774  *
       
  2775  * Returns the function returns SystemLiteral and in the second
       
  2776  *                case publicID receives PubidLiteral, is strict is off
       
  2777  *                it is possible to return NULL and have publicID set.
       
  2778  */
       
  2779 
       
  2780 static xmlChar *
       
  2781 htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
       
  2782     xmlChar *URI = NULL;
       
  2783 
       
  2784     if ((UPPER == 'S') && (UPP(1) == 'Y') &&
       
  2785          (UPP(2) == 'S') && (UPP(3) == 'T') &&
       
  2786          (UPP(4) == 'E') && (UPP(5) == 'M')) {
       
  2787         SKIP(6);
       
  2788         if (!IS_BLANK_CH(CUR)) {
       
  2789             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
       
  2790                          "Space required after 'SYSTEM'\n", NULL, NULL);
       
  2791         }
       
  2792         SKIP_BLANKS;
       
  2793         URI = htmlParseSystemLiteral(ctxt);
       
  2794         if (URI == NULL) {
       
  2795             htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
       
  2796                          "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
       
  2797         }
       
  2798     } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
       
  2799                (UPP(2) == 'B') && (UPP(3) == 'L') &&
       
  2800                (UPP(4) == 'I') && (UPP(5) == 'C')) {
       
  2801         SKIP(6);
       
  2802         if (!IS_BLANK_CH(CUR)) {
       
  2803             htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
       
  2804                          "Space required after 'PUBLIC'\n", NULL, NULL);
       
  2805         }
       
  2806         SKIP_BLANKS;
       
  2807         *publicID = htmlParsePubidLiteral(ctxt);
       
  2808         if (*publicID == NULL) {
       
  2809             htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
       
  2810                          "htmlParseExternalID: PUBLIC, no Public Identifier\n",
       
  2811                          NULL, NULL);
       
  2812         }
       
  2813         SKIP_BLANKS;
       
  2814         if ((CUR == '"') || (CUR == '\'')) {
       
  2815             URI = htmlParseSystemLiteral(ctxt);
       
  2816         }
       
  2817     }
       
  2818     return(URI);
       
  2819 }
       
  2820 
       
  2821 /**
       
  2822  * htmlParseComment:
       
  2823  * @param ctxt an HTML parser context
       
  2824  *
       
  2825  * Parse an XML (SGML) comment <!-- .... -->
       
  2826  *
       
  2827  * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
       
  2828  */
       
  2829 static void
       
  2830 htmlParseComment(htmlParserCtxtPtr ctxt)
       
  2831 {
       
  2832     xmlChar* buf = NULL;
       
  2833     int len;
       
  2834     int size = HTML_PARSER_BUFFER_SIZE;
       
  2835     int q, ql;
       
  2836     int r, rl;
       
  2837     int cur, l;
       
  2838     xmlParserInputState state;
       
  2839 
       
  2840     /*
       
  2841      * Check that there is a comment right here.
       
  2842      */
       
  2843     if ((RAW != '<') || (NXT(1) != '!') ||
       
  2844         (NXT(2) != '-') || (NXT(3) != '-')) return;
       
  2845 
       
  2846     state = ctxt->instate;
       
  2847     ctxt->instate = XML_PARSER_COMMENT;
       
  2848     SHRINK;
       
  2849     SKIP(4);
       
  2850     buf = (xmlChar*) xmlMallocAtomic(size * sizeof(xmlChar));
       
  2851     if (!buf)
       
  2852         goto OOM_exit;
       
  2853     // Now we must free 'buf' before returning
       
  2854     q = CUR_CHAR(ql);
       
  2855     NEXTL(ql);
       
  2856     r = CUR_CHAR(rl);
       
  2857     NEXTL(rl);
       
  2858     cur = CUR_CHAR(l);
       
  2859     len = 0;
       
  2860     while (IS_CHAR(cur) &&
       
  2861           ((cur != '>') || (r != '-') || (q != '-')))
       
  2862     {
       
  2863         if (len + 5 >= size)
       
  2864         {   // DONE: Fix xmlRealloc
       
  2865             void* tmp;
       
  2866             size *= 2;
       
  2867             tmp = xmlRealloc(buf, size * sizeof(xmlChar));
       
  2868             if (!tmp)
       
  2869             {
       
  2870 OOM:
       
  2871                 xmlFree(buf);
       
  2872 OOM_exit:
       
  2873                 htmlErrMemory(ctxt, "buffer allocation failed\n");
       
  2874                 ctxt->instate = state;
       
  2875                 return;
       
  2876             }
       
  2877             buf = (xmlChar*) tmp;
       
  2878         }
       
  2879         COPY_BUF(ql,buf,len,q);
       
  2880         q = r;
       
  2881         ql = rl;
       
  2882         r = cur;
       
  2883         rl = l;
       
  2884         NEXTL(l);
       
  2885         cur = CUR_CHAR(l);
       
  2886         if (cur == 0) {
       
  2887             SHRINK;
       
  2888             GROW;
       
  2889             cur = CUR_CHAR(l);
       
  2890         }
       
  2891     } // end of "while good character and not the end of comment (-->)"
       
  2892 
       
  2893     buf[len] = 0;
       
  2894     if (!IS_CHAR(cur)) {
       
  2895         htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
       
  2896                  "Comment not terminated \n<!--%.50s\n", buf, NULL);
       
  2897         xmlFree(buf);
       
  2898     } else {
       
  2899         NEXT;
       
  2900         if (ctxt->sax           &&
       
  2901             ctxt->sax->comment  &&
       
  2902             !ctxt->disableSAX)
       
  2903         {
       
  2904             ctxt->sax->comment(ctxt->userData, buf);
       
  2905         }
       
  2906     }
       
  2907     xmlFree(buf);
       
  2908     ctxt->instate = state;
       
  2909 }
       
  2910 
       
  2911 /**
       
  2912  * htmlParseCharRef:
       
  2913  * @param ctxt an HTML parser context
       
  2914  *
       
  2915  * parse Reference declarations
       
  2916  *
       
  2917  * [66] CharRef ::= '&#' [0-9]+ ';' |
       
  2918  *                  '&#x' [0-9a-fA-F]+ ';'
       
  2919  *
       
  2920  * Returns the value parsed (as an int)
       
  2921  */
       
  2922 int
       
  2923 htmlParseCharRef(htmlParserCtxtPtr ctxt) {
       
  2924     int val = 0;
       
  2925 
       
  2926     if ((CUR == '&') && (NXT(1) == '#') &&
       
  2927         ((NXT(2) == 'x') || NXT(2) == 'X')) {
       
  2928         SKIP(3);
       
  2929         while (CUR != ';') {
       
  2930             if ((CUR >= '0') && (CUR <= '9'))
       
  2931                 val = val * 16 + (CUR - '0');
       
  2932             else if ((CUR >= 'a') && (CUR <= 'f'))
       
  2933                 val = val * 16 + (CUR - 'a') + 10;
       
  2934             else if ((CUR >= 'A') && (CUR <= 'F'))
       
  2935                 val = val * 16 + (CUR - 'A') + 10;
       
  2936             else {
       
  2937                 htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
       
  2938                              "htmlParseCharRef: invalid hexadecimal value\n",
       
  2939                              NULL, NULL);
       
  2940                 return(0);
       
  2941             }
       
  2942             NEXT;
       
  2943         }
       
  2944         if (CUR == ';')
       
  2945             NEXT;
       
  2946     } else if  ((CUR == '&') && (NXT(1) == '#')) {
       
  2947         SKIP(2);
       
  2948         while (CUR != ';') {
       
  2949             if ((CUR >= '0') && (CUR <= '9'))
       
  2950                 val = val * 10 + (CUR - '0');
       
  2951             else {
       
  2952                 htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
       
  2953                              "htmlParseCharRef: invalid decimal value\n",
       
  2954                              NULL, NULL);
       
  2955                 return(0);
       
  2956             }
       
  2957             NEXT;
       
  2958         }
       
  2959         if (CUR == ';')
       
  2960             NEXT;
       
  2961     } else {
       
  2962         htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
       
  2963                      "htmlParseCharRef: invalid value\n", NULL, NULL);
       
  2964     }
       
  2965     /*
       
  2966      * Check the value IS_CHAR ...
       
  2967      */
       
  2968     if (IS_CHAR(val)) {
       
  2969         return(val);
       
  2970     } else {
       
  2971         htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
       
  2972                         "htmlParseCharRef: invalid xmlChar value %d\n",
       
  2973                         val);
       
  2974     }
       
  2975     return(0);
       
  2976 }
       
  2977 
       
  2978 
       
  2979 /**
       
  2980  * htmlParseDocTypeDecl:
       
  2981  * @param ctxt an HTML parser context
       
  2982  *
       
  2983  * parse a DOCTYPE declaration
       
  2984  *
       
  2985  * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
       
  2986  *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
       
  2987  */
       
  2988 
       
  2989 static void
       
  2990 htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
       
  2991     const xmlChar *name;
       
  2992     xmlChar *ExternalID = NULL;
       
  2993     xmlChar *URI = NULL;
       
  2994 
       
  2995     /*
       
  2996      * We know that '<!DOCTYPE' has been detected.
       
  2997      */
       
  2998     SKIP(9);
       
  2999 
       
  3000     SKIP_BLANKS;
       
  3001 
       
  3002     /*
       
  3003      * Parse the DOCTYPE name.
       
  3004      */
       
  3005     name = htmlParseName(ctxt);
       
  3006     if (name == NULL) {
       
  3007         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
       
  3008                      "htmlParseDocTypeDecl : no DOCTYPE name !\n",
       
  3009                      NULL, NULL);
       
  3010     }
       
  3011     /*
       
  3012      * Check that upper(name) == "HTML" !!!!!!!!!!!!!
       
  3013      */
       
  3014 
       
  3015     SKIP_BLANKS;
       
  3016 
       
  3017     /*
       
  3018      * Check for SystemID and ExternalID
       
  3019      */
       
  3020     URI = htmlParseExternalID(ctxt, &ExternalID);
       
  3021     SKIP_BLANKS;
       
  3022 
       
  3023     /*
       
  3024      * We should be at the end of the DOCTYPE declaration.
       
  3025      */
       
  3026     if (CUR != '>') {
       
  3027         htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
       
  3028                      "DOCTYPE improperly terminated\n", NULL, NULL);
       
  3029         /* We shouldn't try to resynchronize ... */
       
  3030     }
       
  3031     NEXT;
       
  3032 
       
  3033     /*
       
  3034      * Create or update the document accordingly to the DOCTYPE
       
  3035      */
       
  3036     if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
       
  3037         (!ctxt->disableSAX))
       
  3038         ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
       
  3039 
       
  3040     /*
       
  3041      * Cleanup, since we don't use all those identifiers
       
  3042      */
       
  3043     if (URI != NULL) xmlFree(URI);
       
  3044     if (ExternalID != NULL) xmlFree(ExternalID);
       
  3045 }
       
  3046 
       
  3047 /**
       
  3048  * htmlParseAttribute:
       
  3049  * @param ctxt an HTML parser context
       
  3050  * @param value a xmlChar ** used to store the value of the attribute
       
  3051  *
       
  3052  * parse an attribute
       
  3053  *
       
  3054  * [41] Attribute ::= Name Eq AttValue
       
  3055  *
       
  3056  * [25] Eq ::= S? '=' S?
       
  3057  *
       
  3058  * With namespace:
       
  3059  *
       
  3060  * [NS 11] Attribute ::= QName Eq AttValue
       
  3061  *
       
  3062  * Also the case QName == xmlns:??? is handled independently as a namespace
       
  3063  * definition.
       
  3064  *
       
  3065  * Returns the attribute name, and the value in *value.
       
  3066  */
       
  3067 
       
  3068 static const xmlChar *
       
  3069 htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
       
  3070     const xmlChar *name;
       
  3071     xmlChar *val = NULL;
       
  3072 
       
  3073     *value = NULL;
       
  3074     name = htmlParseHTMLName(ctxt);
       
  3075     if (name == NULL) {
       
  3076         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
       
  3077                      "error parsing attribute name\n", NULL, NULL);
       
  3078         return(NULL);
       
  3079     }
       
  3080 
       
  3081     /*
       
  3082      * read the value
       
  3083      */
       
  3084     SKIP_BLANKS;
       
  3085     if (CUR == '=') {
       
  3086         NEXT;
       
  3087         SKIP_BLANKS;
       
  3088         val = htmlParseAttValue(ctxt);
       
  3089         /******
       
  3090     } else {
       
  3091         
       
  3092         if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
       
  3093             ctxt->sax->warning(ctxt->userData,
       
  3094                "No value for attribute %s\n", name); */
       
  3095     }
       
  3096 
       
  3097     *value = val;
       
  3098     return(name);
       
  3099 }
       
  3100 
       
  3101 /**
       
  3102  * htmlCheckEncoding:
       
  3103  * @param ctxt an HTML parser context
       
  3104  * @param attvalue the attribute value
       
  3105  *
       
  3106  * Checks an http-equiv attribute from a Meta tag to detect
       
  3107  * the encoding
       
  3108  * If a new encoding is detected the parser is switched to decode
       
  3109  * it and pass UTF8
       
  3110  */
       
  3111 static void
       
  3112 htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
       
  3113     const xmlChar *encoding;
       
  3114 
       
  3115     if ((ctxt == NULL) || (attvalue == NULL))
       
  3116         return;
       
  3117 
       
  3118     /* do not change encoding */
       
  3119     if (ctxt->input->encoding != NULL)
       
  3120         return;
       
  3121 
       
  3122     encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
       
  3123     if (encoding != NULL) {
       
  3124         encoding += 8;
       
  3125     } else {
       
  3126         encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
       
  3127         if (encoding != NULL)
       
  3128             encoding += 9;
       
  3129     }
       
  3130     if (encoding != NULL) {
       
  3131         xmlCharEncoding enc;
       
  3132         xmlCharEncodingHandlerPtr handler;
       
  3133 
       
  3134         while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
       
  3135 
       
  3136         if (ctxt->input->encoding != NULL)
       
  3137             xmlFree((xmlChar *) ctxt->input->encoding);
       
  3138         ctxt->input->encoding = xmlStrdup(encoding);
       
  3139 
       
  3140         enc = xmlParseCharEncoding((const char *) encoding);
       
  3141         /*
       
  3142          * registered set of known encodings
       
  3143          */
       
  3144         if (enc != XML_CHAR_ENCODING_ERROR) {
       
  3145             xmlSwitchEncoding(ctxt, enc);
       
  3146             ctxt->charset = XML_CHAR_ENCODING_UTF8;
       
  3147         } else {
       
  3148             /*
       
  3149              * fallback for unknown encodings
       
  3150              */
       
  3151             handler = xmlFindCharEncodingHandler((const char *) encoding);
       
  3152             if (handler != NULL) {
       
  3153                 xmlSwitchToEncoding(ctxt, handler);
       
  3154                 ctxt->charset = XML_CHAR_ENCODING_UTF8;
       
  3155             } else {
       
  3156                 ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
       
  3157             }
       
  3158         }
       
  3159 
       
  3160         if ((ctxt->input->buf != NULL) &&
       
  3161             (ctxt->input->buf->encoder != NULL) &&
       
  3162             (ctxt->input->buf->raw != NULL) &&
       
  3163             (ctxt->input->buf->buffer != NULL)) {
       
  3164             int nbchars;
       
  3165             int processed;
       
  3166 
       
  3167             /*
       
  3168              * convert as much as possible to the parser reading buffer.
       
  3169              */
       
  3170             processed = ctxt->input->cur - ctxt->input->base;
       
  3171             xmlBufferShrink(ctxt->input->buf->buffer, processed);
       
  3172             nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
       
  3173                                        ctxt->input->buf->buffer,
       
  3174                                        ctxt->input->buf->raw);
       
  3175             if (nbchars < 0) {
       
  3176                 htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
       
  3177                              "htmlCheckEncoding: encoder error\n",
       
  3178                              NULL, NULL);
       
  3179             }
       
  3180             ctxt->input->base =
       
  3181             ctxt->input->cur = ctxt->input->buf->buffer->content;
       
  3182         }
       
  3183     }
       
  3184 }
       
  3185 
       
  3186 /**
       
  3187  * htmlCheckMeta:
       
  3188  * @param ctxt an HTML parser context
       
  3189  * @param atts the attributes values
       
  3190  *
       
  3191  * Checks an attributes from a Meta tag
       
  3192  */
       
  3193 static void
       
  3194 htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
       
  3195     int i;
       
  3196     const xmlChar *att, *value;
       
  3197     int http = 0;
       
  3198     const xmlChar *content = NULL;
       
  3199 
       
  3200     if ((ctxt == NULL) || (atts == NULL))
       
  3201         return;
       
  3202 
       
  3203     i = 0;
       
  3204     att = atts[i++];
       
  3205     while (att != NULL) {
       
  3206         value = atts[i++];
       
  3207         if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
       
  3208          && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
       
  3209             http = 1;
       
  3210         else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
       
  3211             content = value;
       
  3212         att = atts[i++];
       
  3213     }
       
  3214     if ((http) && (content != NULL))
       
  3215         htmlCheckEncoding(ctxt, content);
       
  3216 
       
  3217 }
       
  3218 
       
  3219 /**
       
  3220  * htmlParseStartTag:
       
  3221  * @param ctxt an HTML parser context
       
  3222  *
       
  3223  * parse a start of tag either for rule element or
       
  3224  * EmptyElement. In both case we don't parse the tag closing chars.
       
  3225  *
       
  3226  * [40] STag ::= '<' Name (S Attribute)* S? '>'
       
  3227  *
       
  3228  * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
       
  3229  *
       
  3230  * With namespace:
       
  3231  *
       
  3232  * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
       
  3233  *
       
  3234  * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
       
  3235  *
       
  3236  */
       
  3237 
       
  3238 static void
       
  3239 htmlParseStartTag(htmlParserCtxtPtr ctxt) {
       
  3240     const xmlChar *name;
       
  3241     const xmlChar *attname;
       
  3242     xmlChar *attvalue;
       
  3243     const xmlChar **atts = ctxt->atts;
       
  3244     int nbatts = 0;
       
  3245     int maxatts = ctxt->maxatts;
       
  3246     int meta = 0;
       
  3247     int i;
       
  3248 
       
  3249     if (CUR != '<') return;
       
  3250     NEXT;
       
  3251 
       
  3252     GROW;
       
  3253     name = htmlParseHTMLName(ctxt);
       
  3254     if (name == NULL) {
       
  3255         htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
       
  3256                      "htmlParseStartTag: invalid element name\n",
       
  3257                      NULL, NULL);
       
  3258         /* Dump the bogus tag like browsers do */
       
  3259         while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
       
  3260             NEXT;
       
  3261         return;
       
  3262     }
       
  3263     if (xmlStrEqual(name, BAD_CAST"meta"))
       
  3264         meta = 1;
       
  3265 
       
  3266     /*
       
  3267      * Check for auto-closure of HTML elements.
       
  3268      */
       
  3269     htmlAutoClose(ctxt, name);
       
  3270 
       
  3271     /*
       
  3272      * Check for implied HTML elements.
       
  3273      */
       
  3274     htmlCheckImplied(ctxt, name);
       
  3275 
       
  3276     /*
       
  3277      * Avoid html at any level > 0, head at any level != 1
       
  3278      * or any attempt to recurse body
       
  3279      */
       
  3280     if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
       
  3281         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
       
  3282                      "htmlParseStartTag: misplaced <html> tag\n",
       
  3283                      name, NULL);
       
  3284         return;
       
  3285     }
       
  3286     if ((ctxt->nameNr != 1) &&
       
  3287         (xmlStrEqual(name, BAD_CAST"head"))) {
       
  3288         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
       
  3289                      "htmlParseStartTag: misplaced <head> tag\n",
       
  3290                      name, NULL);
       
  3291         return;
       
  3292     }
       
  3293     if (xmlStrEqual(name, BAD_CAST"body")) {
       
  3294         int indx;
       
  3295         for (indx = 0;indx < ctxt->nameNr;indx++) {
       
  3296             if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
       
  3297                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
       
  3298                              "htmlParseStartTag: misplaced <body> tag\n",
       
  3299                              name, NULL);
       
  3300                 while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
       
  3301                     NEXT;
       
  3302                 return;
       
  3303             }
       
  3304         }
       
  3305     }
       
  3306 
       
  3307     /*
       
  3308      * Now parse the attributes, it ends up with the ending
       
  3309      *
       
  3310      * (S Attribute)* S?
       
  3311      */
       
  3312     SKIP_BLANKS;
       
  3313     while ((IS_CHAR_CH(CUR)) &&
       
  3314            (CUR != '>') &&
       
  3315            ((CUR != '/') || (NXT(1) != '>'))) {
       
  3316         long cons = ctxt->nbChars;
       
  3317 
       
  3318         GROW;
       
  3319         attname = htmlParseAttribute(ctxt, &attvalue);
       
  3320         if (attname != NULL) {
       
  3321 
       
  3322             /*
       
  3323              * Well formedness requires at most one declaration of an attribute
       
  3324              */
       
  3325             for (i = 0; i < nbatts;i += 2) {
       
  3326                 if (xmlStrEqual(atts[i], attname)) {
       
  3327                     htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
       
  3328                                  "Attribute %s redefined\n", attname, NULL);
       
  3329                     if (attvalue != NULL)
       
  3330                         xmlFree(attvalue);
       
  3331                     goto failed;
       
  3332                 }
       
  3333             }
       
  3334 
       
  3335             /*
       
  3336              * Add the pair to atts
       
  3337              */
       
  3338             if (atts == NULL) {
       
  3339                 maxatts = 22; /* allow for 10 attrs by default */
       
  3340                 atts = (const xmlChar **)
       
  3341                        xmlMalloc(maxatts * sizeof(xmlChar *));
       
  3342                 if (atts == NULL) {
       
  3343                     htmlErrMemory(ctxt, NULL);
       
  3344                     if (attvalue != NULL)
       
  3345                         xmlFree(attvalue);
       
  3346                     goto failed;
       
  3347                 }
       
  3348                 ctxt->atts = atts;
       
  3349                 ctxt->maxatts = maxatts;
       
  3350             } else if (nbatts + 4 > maxatts) {
       
  3351                 const xmlChar **n;
       
  3352 
       
  3353                 maxatts *= 2;
       
  3354                 n = (const xmlChar **) xmlRealloc((void *) atts,
       
  3355                                              maxatts * sizeof(const xmlChar *));
       
  3356                 if (n == NULL) {
       
  3357                     htmlErrMemory(ctxt, NULL);
       
  3358                     if (attvalue != NULL)
       
  3359                         xmlFree(attvalue);
       
  3360                     goto failed;
       
  3361                 }
       
  3362                 atts = n;
       
  3363                 ctxt->atts = atts;
       
  3364                 ctxt->maxatts = maxatts;
       
  3365             }
       
  3366             atts[nbatts++] = attname;
       
  3367             atts[nbatts++] = attvalue;
       
  3368             atts[nbatts] = NULL;
       
  3369             atts[nbatts + 1] = NULL;
       
  3370         }
       
  3371         else {
       
  3372             if (attvalue != NULL)
       
  3373                 xmlFree(attvalue);
       
  3374             /* Dump the bogus attribute string up to the next blank or
       
  3375              * the end of the tag. */
       
  3376             while ((IS_CHAR_CH(CUR)) &&
       
  3377                    !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
       
  3378                    ((CUR != '/') || (NXT(1) != '>')))
       
  3379                 NEXT;
       
  3380         }
       
  3381 
       
  3382 failed:
       
  3383         SKIP_BLANKS;
       
  3384         if (cons == ctxt->nbChars) {
       
  3385             htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  3386                          "htmlParseStartTag: problem parsing attributes\n",
       
  3387                          NULL, NULL);
       
  3388             break;
       
  3389         }
       
  3390     }
       
  3391 
       
  3392     /*
       
  3393      * Handle specific association to the META tag
       
  3394      */
       
  3395     if (meta)
       
  3396         htmlCheckMeta(ctxt, atts);
       
  3397 
       
  3398     /*
       
  3399      * SAX: Start of Element !
       
  3400      */
       
  3401     htmlnamePush(ctxt, name);
       
  3402     if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
       
  3403         if (nbatts != 0)
       
  3404             ctxt->sax->startElement(ctxt->userData, name, atts);
       
  3405         else
       
  3406             ctxt->sax->startElement(ctxt->userData, name, NULL);
       
  3407     }
       
  3408 
       
  3409     if (atts != NULL) {
       
  3410         for (i = 1;i < nbatts;i += 2) {
       
  3411             if (atts[i] != NULL)
       
  3412                 xmlFree((xmlChar *) atts[i]);
       
  3413         }
       
  3414     }
       
  3415 }
       
  3416 
       
  3417 /**
       
  3418  * htmlParseEndTag:
       
  3419  * @param ctxt an HTML parser context
       
  3420  *
       
  3421  * parse an end of tag
       
  3422  *
       
  3423  * [42] ETag ::= '</' Name S? '>'
       
  3424  *
       
  3425  * With namespace
       
  3426  *
       
  3427  * [NS 9] ETag ::= '</' QName S? '>'
       
  3428  *
       
  3429  * Returns 1 if the current level should be closed.
       
  3430  */
       
  3431 
       
  3432 static int
       
  3433 htmlParseEndTag(htmlParserCtxtPtr ctxt)
       
  3434 {
       
  3435     const xmlChar *name;
       
  3436     const xmlChar *oldname;
       
  3437     int i, ret;
       
  3438 
       
  3439     if ((CUR != '<') || (NXT(1) != '/')) {
       
  3440         htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
       
  3441                      "htmlParseEndTag: '</' not found\n", NULL, NULL);
       
  3442         return (0);
       
  3443     }
       
  3444     SKIP(2);
       
  3445 
       
  3446     name = htmlParseHTMLName(ctxt);
       
  3447     if (name == NULL)
       
  3448         return (0);
       
  3449 
       
  3450     /*
       
  3451      * We should definitely be at the ending "S? '>'" part
       
  3452      */
       
  3453     SKIP_BLANKS;
       
  3454     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
       
  3455         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
       
  3456                      "End tag : expected '>'\n", NULL, NULL);
       
  3457     } else
       
  3458         NEXT;
       
  3459 
       
  3460     /*
       
  3461      * If the name read is not one of the element in the parsing stack
       
  3462      * then return, it's just an error.
       
  3463      */
       
  3464     for (i = (ctxt->nameNr - 1); i >= 0; i--) {
       
  3465         if (xmlStrEqual(name, ctxt->nameTab[i]))
       
  3466             break;
       
  3467     }
       
  3468     if (i < 0) {
       
  3469         htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
       
  3470                      "Unexpected end tag : %s\n", name, NULL);
       
  3471         return (0);
       
  3472     }
       
  3473 
       
  3474 
       
  3475     /*
       
  3476      * Check for auto-closure of HTML elements.
       
  3477      */
       
  3478 
       
  3479     htmlAutoCloseOnClose(ctxt, name);
       
  3480 
       
  3481     /*
       
  3482      * Well formedness constraints, opening and closing must match.
       
  3483      * With the exception that the autoclose may have popped stuff out
       
  3484      * of the stack.
       
  3485      */
       
  3486     if (!xmlStrEqual(name, ctxt->name)) {
       
  3487         if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
       
  3488             htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
       
  3489                          "Opening and ending tag mismatch: %s and %s\n",
       
  3490                          name, ctxt->name);
       
  3491         }
       
  3492     }
       
  3493 
       
  3494     /*
       
  3495      * SAX: End of Tag
       
  3496      */
       
  3497     oldname = ctxt->name;
       
  3498     if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
       
  3499         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  3500             ctxt->sax->endElement(ctxt->userData, name);
       
  3501         htmlnamePop(ctxt);
       
  3502         ret = 1;
       
  3503     } else {
       
  3504         ret = 0;
       
  3505     }
       
  3506 
       
  3507     return (ret);
       
  3508 }
       
  3509 
       
  3510 
       
  3511 /**
       
  3512  * htmlParseReference:
       
  3513  * @param ctxt an HTML parser context
       
  3514  *
       
  3515  * parse and handle entity references in content,
       
  3516  * this will end-up in a call to character() since this is either a
       
  3517  * CharRef, or a predefined entity.
       
  3518  */
       
  3519 static void
       
  3520 htmlParseReference(htmlParserCtxtPtr ctxt) {
       
  3521     const htmlEntityDesc * ent;
       
  3522     xmlChar out[6];
       
  3523     const xmlChar *name;
       
  3524     if (CUR != '&') return;
       
  3525 
       
  3526     if (NXT(1) == '#') {
       
  3527         unsigned int c;
       
  3528         int bits, i = 0;
       
  3529 
       
  3530         c = htmlParseCharRef(ctxt);
       
  3531         if (c == 0)
       
  3532             return;
       
  3533 
       
  3534         if      (c <    0x80) { out[i++]= c;                bits= -6; }
       
  3535         else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
       
  3536         else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
       
  3537         else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
       
  3538 
       
  3539         for ( ; bits >= 0; bits-= 6) {
       
  3540             out[i++]= ((c >> bits) & 0x3F) | 0x80;
       
  3541         }
       
  3542         out[i] = 0;
       
  3543 
       
  3544         htmlCheckParagraph(ctxt);
       
  3545         if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
       
  3546             ctxt->sax->characters(ctxt->userData, out, i);
       
  3547     } else {
       
  3548         ent = htmlParseEntityRef(ctxt, &name);
       
  3549         if (name == NULL) {
       
  3550             htmlCheckParagraph(ctxt);
       
  3551             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
       
  3552                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
       
  3553             return;
       
  3554         }
       
  3555         if ((ent == NULL) || !(ent->value > 0)) {
       
  3556             htmlCheckParagraph(ctxt);
       
  3557             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
       
  3558                 ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
       
  3559                 ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
       
  3560                 /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
       
  3561             }
       
  3562         } else {
       
  3563             unsigned int c;
       
  3564             int bits, i = 0;
       
  3565 
       
  3566             c = ent->value;
       
  3567             if      (c <    0x80)
       
  3568                     { out[i++]= c;                bits= -6; }
       
  3569             else if (c <   0x800)
       
  3570                     { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
       
  3571             else if (c < 0x10000)
       
  3572                     { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
       
  3573             else
       
  3574                     { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
       
  3575 
       
  3576             for ( ; bits >= 0; bits-= 6) {
       
  3577                 out[i++]= ((c >> bits) & 0x3F) | 0x80;
       
  3578             }
       
  3579             out[i] = 0;
       
  3580 
       
  3581             htmlCheckParagraph(ctxt);
       
  3582             if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
       
  3583                 ctxt->sax->characters(ctxt->userData, out, i);
       
  3584         }
       
  3585     }
       
  3586 }
       
  3587 
       
  3588 /**
       
  3589  * htmlParseContent:
       
  3590  * @param ctxt an HTML parser context
       
  3591  * @param name the node name
       
  3592  *
       
  3593  * Parse a content: comment, sub-element, reference or text.
       
  3594  *
       
  3595  */
       
  3596 
       
  3597 static void
       
  3598 htmlParseContent(htmlParserCtxtPtr ctxt) {
       
  3599     xmlChar *currentNode;
       
  3600     int depth;
       
  3601 
       
  3602     currentNode = xmlStrdup(ctxt->name);
       
  3603     depth = ctxt->nameNr;
       
  3604     while (1) {
       
  3605         long cons = ctxt->nbChars;
       
  3606 
       
  3607         GROW;
       
  3608         /*
       
  3609          * Our tag or one of it's parent or children is ending.
       
  3610          */
       
  3611         if ((CUR == '<') && (NXT(1) == '/')) {
       
  3612             if (htmlParseEndTag(ctxt) &&
       
  3613                 ((currentNode != NULL) || (ctxt->nameNr == 0))) {
       
  3614                 if (currentNode != NULL)
       
  3615                     xmlFree(currentNode);
       
  3616                 return;
       
  3617             }
       
  3618             continue; /* while */
       
  3619         }
       
  3620 
       
  3621         /*
       
  3622          * Has this node been popped out during parsing of
       
  3623          * the next element
       
  3624          */
       
  3625         if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
       
  3626             (!xmlStrEqual(currentNode, ctxt->name)))
       
  3627              {
       
  3628             if (currentNode != NULL) xmlFree(currentNode);
       
  3629             return;
       
  3630         }
       
  3631 
       
  3632         if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
       
  3633             (xmlStrEqual(currentNode, BAD_CAST"style")))) {
       
  3634             /*
       
  3635              * Handle SCRIPT/STYLE separately
       
  3636              */
       
  3637             htmlParseScript(ctxt);
       
  3638         } else {
       
  3639             /*
       
  3640              * Sometimes DOCTYPE arrives in the middle of the document
       
  3641              */
       
  3642             if ((CUR == '<') && (NXT(1) == '!') &&
       
  3643                 (UPP(2) == 'D') && (UPP(3) == 'O') &&
       
  3644                 (UPP(4) == 'C') && (UPP(5) == 'T') &&
       
  3645                 (UPP(6) == 'Y') && (UPP(7) == 'P') &&
       
  3646                 (UPP(8) == 'E')) {
       
  3647                 htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
       
  3648                              "Misplaced DOCTYPE declaration\n",
       
  3649                              BAD_CAST "DOCTYPE" , NULL);
       
  3650                 htmlParseDocTypeDecl(ctxt);
       
  3651             }
       
  3652 
       
  3653             /*
       
  3654              * First case :  a comment
       
  3655              */
       
  3656             if ((CUR == '<') && (NXT(1) == '!') &&
       
  3657                 (NXT(2) == '-') && (NXT(3) == '-')) {
       
  3658                 htmlParseComment(ctxt);
       
  3659             }
       
  3660 
       
  3661             /*
       
  3662              * Second case :  a sub-element.
       
  3663              */
       
  3664             else if (CUR == '<') {
       
  3665                 htmlParseElement(ctxt);
       
  3666             }
       
  3667 
       
  3668             /*
       
  3669              * Third case : a reference. If if has not been resolved,
       
  3670              *    parsing returns it's Name, create the node
       
  3671              */
       
  3672             else if (CUR == '&') {
       
  3673                 htmlParseReference(ctxt);
       
  3674             }
       
  3675 
       
  3676             /*
       
  3677              * Fourth : end of the resource
       
  3678              */
       
  3679             else if (CUR == 0) {
       
  3680                 htmlAutoCloseOnEnd(ctxt);
       
  3681                 break;
       
  3682             }
       
  3683 
       
  3684             /*
       
  3685              * Last case, text. Note that References are handled directly.
       
  3686              */
       
  3687             else {
       
  3688                 htmlParseCharData(ctxt);
       
  3689             }
       
  3690 
       
  3691             if (cons == ctxt->nbChars) {
       
  3692                 if (ctxt->node != NULL) {
       
  3693                     htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  3694                                  "detected an error in element content\n",
       
  3695                                  NULL, NULL);
       
  3696                 }
       
  3697                 break;
       
  3698             }
       
  3699         }
       
  3700         GROW;
       
  3701     }
       
  3702     if (currentNode != NULL) xmlFree(currentNode);
       
  3703 }
       
  3704 
       
  3705 /**
       
  3706  * htmlParseElement:
       
  3707  * @param ctxt an HTML parser context
       
  3708  *
       
  3709  * parse an HTML element, this is highly recursive
       
  3710  *
       
  3711  * [39] element ::= EmptyElemTag | STag content ETag
       
  3712  *
       
  3713  * [41] Attribute ::= Name Eq AttValue
       
  3714  */
       
  3715 
       
  3716 void
       
  3717 htmlParseElement(htmlParserCtxtPtr ctxt) {
       
  3718     const xmlChar *name;
       
  3719     xmlChar *currentNode = NULL;
       
  3720     const htmlElemDesc * info;
       
  3721     htmlParserNodeInfo node_info;
       
  3722     const xmlChar *oldname;
       
  3723     int depth = ctxt->nameNr;
       
  3724     const xmlChar *oldptr;
       
  3725 
       
  3726     /* Capture start position */
       
  3727     if (ctxt->record_info) {
       
  3728         node_info.begin_pos = ctxt->input->consumed +
       
  3729                           (CUR_PTR - ctxt->input->base);
       
  3730         node_info.begin_line = ctxt->input->line;
       
  3731     }
       
  3732 
       
  3733     oldname = ctxt->name;
       
  3734     htmlParseStartTag(ctxt);
       
  3735     name = ctxt->name;
       
  3736     if (((depth == ctxt->nameNr) && (xmlStrEqual(oldname, ctxt->name))) ||
       
  3737         (name == NULL)) {
       
  3738         if (CUR == '>')
       
  3739             NEXT;
       
  3740         return;
       
  3741     }
       
  3742 
       
  3743     /*
       
  3744      * Lookup the info for that element.
       
  3745      */
       
  3746     info = htmlTagLookup(name);
       
  3747     if (info == NULL) {
       
  3748         htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
       
  3749                      "Tag %s invalid\n", name, NULL);
       
  3750     }
       
  3751 
       
  3752     /*
       
  3753      * Check for an Empty Element labeled the XML/SGML way
       
  3754      */
       
  3755     if ((CUR == '/') && (NXT(1) == '>')) {
       
  3756         SKIP(2);
       
  3757         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  3758             ctxt->sax->endElement(ctxt->userData, name);
       
  3759         htmlnamePop(ctxt);
       
  3760         return;
       
  3761     }
       
  3762 
       
  3763     if (CUR == '>') {
       
  3764         NEXT;
       
  3765     } else {
       
  3766         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
       
  3767                      "Couldn't find end of Start Tag %s\n", name, NULL);
       
  3768 
       
  3769         /*
       
  3770          * end of parsing of this node.
       
  3771          */
       
  3772         if (xmlStrEqual(name, ctxt->name)) {
       
  3773             nodePop(ctxt);
       
  3774             htmlnamePop(ctxt);
       
  3775         }
       
  3776 
       
  3777         /*
       
  3778          * Capture end position and add node
       
  3779          */
       
  3780         if ( currentNode != NULL && ctxt->record_info ) {
       
  3781            node_info.end_pos = ctxt->input->consumed +
       
  3782                               (CUR_PTR - ctxt->input->base);
       
  3783            node_info.end_line = ctxt->input->line;
       
  3784            node_info.node = ctxt->node;
       
  3785            xmlParserAddNodeInfo(ctxt, &node_info);
       
  3786         }
       
  3787         return;
       
  3788     }
       
  3789 
       
  3790     /*
       
  3791      * Check for an Empty Element from DTD definition
       
  3792      */
       
  3793     if ((info != NULL) && (info->empty)) {
       
  3794         if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  3795             ctxt->sax->endElement(ctxt->userData, name);
       
  3796         htmlnamePop(ctxt);
       
  3797         return;
       
  3798     }
       
  3799 
       
  3800     /*
       
  3801      * Parse the content of the element:
       
  3802      */
       
  3803     currentNode = xmlStrdup(ctxt->name);
       
  3804     depth = ctxt->nameNr;
       
  3805     while (IS_CHAR_CH(CUR)) {
       
  3806         oldptr = ctxt->input->cur;
       
  3807         htmlParseContent(ctxt);
       
  3808         if (oldptr==ctxt->input->cur) break;
       
  3809         if (ctxt->nameNr < depth) break;
       
  3810     }
       
  3811 
       
  3812     /*
       
  3813      * Capture end position and add node
       
  3814      */
       
  3815     if ( currentNode != NULL && ctxt->record_info ) {
       
  3816        node_info.end_pos = ctxt->input->consumed +
       
  3817                           (CUR_PTR - ctxt->input->base);
       
  3818        node_info.end_line = ctxt->input->line;
       
  3819        node_info.node = ctxt->node;
       
  3820        xmlParserAddNodeInfo(ctxt, &node_info);
       
  3821     }
       
  3822     if (!IS_CHAR_CH(CUR)) {
       
  3823         htmlAutoCloseOnEnd(ctxt);
       
  3824     }
       
  3825 
       
  3826     if (currentNode != NULL)
       
  3827         xmlFree(currentNode);
       
  3828 }
       
  3829 
       
  3830 /**
       
  3831  * htmlParseDocument:
       
  3832  * @param ctxt an HTML parser context
       
  3833  *
       
  3834  * parse an HTML document (and build a tree if using the standard SAX
       
  3835  * interface).
       
  3836  *
       
  3837  * Returns 0, -1 in case of error. the parser context is augmented
       
  3838  *                as a result of the parsing.
       
  3839  */
       
  3840 
       
  3841 int
       
  3842 htmlParseDocument(htmlParserCtxtPtr ctxt) {
       
  3843     xmlDtdPtr dtd;
       
  3844 
       
  3845     xmlInitParser();
       
  3846 
       
  3847     htmlDefaultSAXHandlerInit();
       
  3848     ctxt->html = 1;
       
  3849 
       
  3850     GROW;
       
  3851     /*
       
  3852      * SAX: beginning of the document processing.
       
  3853      */
       
  3854     if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
       
  3855         ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
       
  3856 
       
  3857     /*
       
  3858      * Wipe out everything which is before the first '<'
       
  3859      */
       
  3860     SKIP_BLANKS;
       
  3861     if (CUR == 0) {
       
  3862         htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
       
  3863                      "Document is empty\n", NULL, NULL);
       
  3864     }
       
  3865 
       
  3866     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
       
  3867         ctxt->sax->startDocument(ctxt->userData);
       
  3868 
       
  3869 
       
  3870     /*
       
  3871      * Parse possible comments before any content
       
  3872      */
       
  3873     while ((CUR == '<') && (NXT(1) == '!') &&
       
  3874            (NXT(2) == '-') && (NXT(3) == '-')) {
       
  3875         htmlParseComment(ctxt);
       
  3876         SKIP_BLANKS;
       
  3877     }
       
  3878 
       
  3879 
       
  3880     /*
       
  3881      * Then possibly doc type declaration(s) and more Misc
       
  3882      * (doctypedecl Misc*)?
       
  3883      */
       
  3884     if ((CUR == '<') && (NXT(1) == '!') &&
       
  3885         (UPP(2) == 'D') && (UPP(3) == 'O') &&
       
  3886         (UPP(4) == 'C') && (UPP(5) == 'T') &&
       
  3887         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
       
  3888         (UPP(8) == 'E')) {
       
  3889         htmlParseDocTypeDecl(ctxt);
       
  3890     }
       
  3891     SKIP_BLANKS;
       
  3892 
       
  3893     /*
       
  3894      * Parse possible comments before any content
       
  3895      */
       
  3896     while ((CUR == '<') && (NXT(1) == '!') &&
       
  3897            (NXT(2) == '-') && (NXT(3) == '-')) {
       
  3898         htmlParseComment(ctxt);
       
  3899         SKIP_BLANKS;
       
  3900     }
       
  3901 
       
  3902     /*
       
  3903      * Time to start parsing the tree itself
       
  3904      */
       
  3905     htmlParseContent(ctxt);
       
  3906 
       
  3907     /*
       
  3908      * autoclose
       
  3909      */
       
  3910     if (CUR == 0)
       
  3911         htmlAutoCloseOnEnd(ctxt);
       
  3912 
       
  3913 
       
  3914     /*
       
  3915      * SAX: end of the document processing.
       
  3916      */
       
  3917     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
       
  3918         ctxt->sax->endDocument(ctxt->userData);
       
  3919 
       
  3920     if (ctxt->myDoc != NULL) {
       
  3921         dtd = xmlGetIntSubset(ctxt->myDoc);
       
  3922         if (dtd == NULL)
       
  3923             ctxt->myDoc->intSubset =
       
  3924                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
       
  3925                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
       
  3926                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
       
  3927     }
       
  3928     if (! ctxt->wellFormed) return(-1);
       
  3929     return(0);
       
  3930 }
       
  3931 
       
  3932 
       
  3933 /************************************************************************
       
  3934  *                                                                      *
       
  3935  *                      Parser contexts handling                        *
       
  3936  *                                                                      *
       
  3937  ************************************************************************/
       
  3938 
       
  3939 /**
       
  3940  * htmlInitParserCtxt:
       
  3941  * @param ctxt an HTML parser context
       
  3942  *
       
  3943  * Initialize a parser context
       
  3944  *
       
  3945  * Returns 0 in case of success and -1 in case of error
       
  3946  */
       
  3947 
       
  3948 static int
       
  3949 htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
       
  3950 {
       
  3951     htmlSAXHandler *sax;
       
  3952 
       
  3953     if (ctxt == NULL) return(-1);
       
  3954     memset(ctxt, 0, sizeof(htmlParserCtxt));
       
  3955     // NOTE: All assignments  ctxt->XX = 0; were commented as unnecessary
       
  3956     ctxt->dict = xmlDictCreate();
       
  3957     if (ctxt->dict == NULL) {
       
  3958         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
       
  3959         return(-1);
       
  3960     }
       
  3961     sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
       
  3962     if (sax == NULL) {
       
  3963         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
       
  3964         return(-1);
       
  3965     }
       
  3966     else
       
  3967         memset(sax, 0, sizeof(htmlSAXHandler));
       
  3968 
       
  3969     /* Allocate the Input stack */
       
  3970     ctxt->inputTab = (htmlParserInputPtr *)
       
  3971                       xmlMalloc(5 * sizeof(htmlParserInputPtr));
       
  3972     if (ctxt->inputTab == NULL) {
       
  3973         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
       
  3974     //ctxt->inputNr = 0;
       
  3975     //ctxt->inputMax = 0;
       
  3976     //ctxt->input = NULL;
       
  3977         return(-1);
       
  3978     }
       
  3979     //ctxt->inputNr = 0;
       
  3980     ctxt->inputMax = 5;
       
  3981     //ctxt->input = NULL;
       
  3982     //ctxt->version = NULL;
       
  3983     //ctxt->encoding = NULL;
       
  3984     ctxt->standalone = -1;
       
  3985     ctxt->instate = XML_PARSER_START;
       
  3986 
       
  3987     /* Allocate the Node stack */
       
  3988     ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
       
  3989     if (ctxt->nodeTab == NULL) {
       
  3990         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
       
  3991     //ctxt->nodeNr = 0;
       
  3992     //ctxt->nodeMax = 0;
       
  3993     //ctxt->node = NULL;
       
  3994     //ctxt->inputNr = 0;
       
  3995     //ctxt->inputMax = 0;
       
  3996     //ctxt->input = NULL;
       
  3997         return(-1);
       
  3998     }
       
  3999     //ctxt->nodeNr = 0;
       
  4000     ctxt->nodeMax = 10;
       
  4001     //ctxt->node = NULL;
       
  4002 
       
  4003     /* Allocate the Name stack */
       
  4004     ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
       
  4005     if (ctxt->nameTab == NULL) {
       
  4006         htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
       
  4007     //ctxt->nameNr = 0;
       
  4008     ctxt->nameMax = 10;
       
  4009     //ctxt->name = NULL;
       
  4010     //ctxt->nodeNr = 0;
       
  4011     //ctxt->nodeMax = 0;
       
  4012     //ctxt->node = NULL;
       
  4013     //ctxt->inputNr = 0;
       
  4014     //ctxt->inputMax = 0;
       
  4015     ctxt->input = NULL;
       
  4016     return(-1);
       
  4017     }
       
  4018     //ctxt->nameNr = 0;
       
  4019     ctxt->nameMax = 10;
       
  4020     //ctxt->name = NULL;
       
  4021 
       
  4022     if (sax == NULL)
       
  4023         ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
       
  4024     else {
       
  4025         ctxt->sax = sax;
       
  4026         memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
       
  4027     }
       
  4028     ctxt->userData = ctxt;
       
  4029     //ctxt->myDoc = NULL;
       
  4030     ctxt->wellFormed = 1;
       
  4031     //ctxt->replaceEntities = 0;
       
  4032     #ifdef LIBXML_ENABLE_NODE_LINEINFO
       
  4033     ctxt->linenumbers = xmlLineNumbersDefaultValue;
       
  4034     #endif
       
  4035     ctxt->html = 1;
       
  4036     ctxt->vctxt.userData = ctxt;
       
  4037     ctxt->vctxt.error = xmlParserValidityError;
       
  4038     ctxt->vctxt.warning = xmlParserValidityWarning;
       
  4039     //ctxt->record_info = 0;
       
  4040     //ctxt->validate = 0;
       
  4041     //ctxt->nbChars = 0;
       
  4042     //ctxt->checkIndex = 0;
       
  4043     //ctxt->catalogs = NULL;
       
  4044     xmlInitNodeInfoSeq(&ctxt->node_seq);
       
  4045     return(0);
       
  4046 }
       
  4047 
       
  4048 /**
       
  4049  * htmlFreeParserCtxt:
       
  4050  * @param ctxt an HTML parser context
       
  4051  *
       
  4052  * Free all the memory used by a parser context. However the parsed
       
  4053  * document in ctxt->myDoc is not freed.
       
  4054  */
       
  4055 
       
  4056 void
       
  4057 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
       
  4058 {
       
  4059     xmlFreeParserCtxt(ctxt);
       
  4060 }
       
  4061 
       
  4062 /**
       
  4063  * htmlNewParserCtxt:
       
  4064  *
       
  4065  * Allocate and initialize a new parser context.
       
  4066  *
       
  4067  * Returns the xmlParserCtxtPtr or NULL
       
  4068  */
       
  4069 
       
  4070 static htmlParserCtxtPtr
       
  4071 htmlNewParserCtxt(void)
       
  4072 {
       
  4073     xmlParserCtxtPtr ctxt;
       
  4074 
       
  4075     ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
       
  4076     if (ctxt == NULL) {
       
  4077         htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
       
  4078         return(NULL);
       
  4079     }
       
  4080     memset(ctxt, 0, sizeof(xmlParserCtxt));
       
  4081 #ifdef XE_ENABLE_GS_CACHING
       
  4082     ctxt->cachedGs = xmlGetGlobalState();
       
  4083 #endif
       
  4084 
       
  4085     if (htmlInitParserCtxt(ctxt) < 0) {
       
  4086         htmlFreeParserCtxt(ctxt);
       
  4087         return(NULL);
       
  4088     }
       
  4089     return(ctxt);
       
  4090 }
       
  4091 
       
  4092 /**
       
  4093  * htmlCreateMemoryParserCtxt:
       
  4094  * @param buffer a pointer to a char array
       
  4095  * @param size the size of the array
       
  4096  *
       
  4097  * Create a parser context for an HTML in-memory document.
       
  4098  *
       
  4099  * Returns the new parser context or NULL
       
  4100  */
       
  4101 htmlParserCtxtPtr
       
  4102 htmlCreateMemoryParserCtxt(const char *buffer, int size) {
       
  4103 	
       
  4104     xmlParserCtxtPtr ctxt;
       
  4105     xmlParserInputPtr input;
       
  4106     xmlParserInputBufferPtr buf;
       
  4107 
       
  4108     if (buffer == NULL)
       
  4109         return(NULL);
       
  4110     if (size <= 0)
       
  4111         return(NULL);
       
  4112 
       
  4113     ctxt = htmlNewParserCtxt();
       
  4114     if (ctxt == NULL)
       
  4115         return(NULL);
       
  4116 
       
  4117     buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
       
  4118     if (buf == NULL) return(NULL);
       
  4119 
       
  4120     input = xmlNewInputStream(ctxt);
       
  4121     if (input == NULL) {
       
  4122         xmlFreeParserCtxt(ctxt);
       
  4123         return(NULL);
       
  4124     }
       
  4125 
       
  4126     input->filename = NULL;
       
  4127     input->buf = buf;
       
  4128     input->base = input->buf->buffer->content;
       
  4129     input->cur = input->buf->buffer->content;
       
  4130     input->end = &input->buf->buffer->content[input->buf->buffer->use];
       
  4131 
       
  4132     inputPush(ctxt, input);
       
  4133     return(ctxt);
       
  4134 }
       
  4135 
       
  4136 /**
       
  4137  * htmlCreateDocParserCtxt:
       
  4138  * @param cur a pointer to an array of xmlChar
       
  4139  * @param encoding a free form C string describing the HTML document encoding, or NULL
       
  4140  *
       
  4141  * Create a parser context for an HTML document.
       
  4142  *
       
  4143  
       
  4144  *
       
  4145  * Returns the new parser context or NULL
       
  4146  */
       
  4147 static htmlParserCtxtPtr
       
  4148 htmlCreateDocParserCtxt(xmlChar *cur, const char *encoding ATTRIBUTE_UNUSED) {
       
  4149     int len;
       
  4150     htmlParserCtxtPtr ctxt;
       
  4151 
       
  4152     if (cur == NULL)
       
  4153         return(NULL);
       
  4154     len = xmlStrlen(cur);
       
  4155     ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
       
  4156 
       
  4157     if (encoding != NULL) {
       
  4158         xmlCharEncoding enc;
       
  4159         xmlCharEncodingHandlerPtr handler;
       
  4160 
       
  4161         if (ctxt->input->encoding != NULL)
       
  4162             xmlFree((xmlChar *) ctxt->input->encoding);
       
  4163         ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
       
  4164 
       
  4165         enc = xmlParseCharEncoding(encoding);
       
  4166         /*
       
  4167          * registered set of known encodings
       
  4168          */
       
  4169         if (enc != XML_CHAR_ENCODING_ERROR) {
       
  4170             xmlSwitchEncoding(ctxt, enc);
       
  4171             if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
       
  4172                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
       
  4173                              "Unsupported encoding %s\n",
       
  4174                              (const xmlChar *) encoding, NULL);
       
  4175             }
       
  4176         } else {
       
  4177             /*
       
  4178              * fallback for unknown encodings
       
  4179              */
       
  4180             handler = xmlFindCharEncodingHandler((const char *) encoding);
       
  4181             if (handler != NULL) {
       
  4182                 xmlSwitchToEncoding(ctxt, handler);
       
  4183             } else {
       
  4184                 htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
       
  4185                              "Unsupported encoding %s\n",
       
  4186                              (const xmlChar *) encoding, NULL);
       
  4187             }
       
  4188         }
       
  4189     }
       
  4190     return(ctxt);
       
  4191 }
       
  4192 
       
  4193 #ifdef LIBXML_PUSH_ENABLED
       
  4194 /************************************************************************
       
  4195  *                                                                      *
       
  4196  *              Progressive parsing interfaces                          *
       
  4197  *                                                                      *
       
  4198  ************************************************************************/
       
  4199 
       
  4200 /**
       
  4201  * htmlParseLookupSequence:
       
  4202  * @param ctxt an HTML parser context
       
  4203  * @param first the first char to lookup
       
  4204  * @param next the next char to lookup or zero
       
  4205  * @param third the next char to lookup or zero
       
  4206  * @param comment flag to force checking inside comments
       
  4207  *
       
  4208  * Try to find if a sequence (first, next, third) or  just (first next) or
       
  4209  * (first) is available in the input stream.
       
  4210  * This function has a side effect of (possibly) incrementing ctxt->checkIndex
       
  4211  * to avoid rescanning sequences of bytes, it DOES change the state of the
       
  4212  * parser, do not use liberally.
       
  4213  * This is basically similar to xmlParseLookupSequence()
       
  4214  *
       
  4215  * Returns the index to the current parsing point if the full sequence
       
  4216  *      is available, -1 otherwise.
       
  4217  */
       
  4218 static int
       
  4219 htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
       
  4220                         xmlChar next, xmlChar third, int iscomment) {
       
  4221     int base, len;
       
  4222     htmlParserInputPtr in;
       
  4223     const xmlChar *buf;
       
  4224     int incomment = 0;
       
  4225 
       
  4226     in = ctxt->input;
       
  4227     if (in == NULL) return(-1);
       
  4228     base = in->cur - in->base;
       
  4229     if (base < 0) return(-1);
       
  4230     if (ctxt->checkIndex > base)
       
  4231         base = ctxt->checkIndex;
       
  4232     if (in->buf == NULL) {
       
  4233         buf = in->base;
       
  4234         len = in->length;
       
  4235     } else {
       
  4236         buf = in->buf->buffer->content;
       
  4237         len = in->buf->buffer->use;
       
  4238     }
       
  4239     /* take into account the sequence length */
       
  4240     if (third) len -= 2;
       
  4241     else if (next) len --;
       
  4242     for (;base < len;base++) {
       
  4243         if (!incomment && (base + 4 < len) && !iscomment) {
       
  4244             if ((buf[base] == '<') && (buf[base + 1] == '!') &&
       
  4245                 (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
       
  4246                 incomment = 1;
       
  4247                 /* do not increment past <! - some people use <!--> */
       
  4248                 base += 2;
       
  4249             }
       
  4250         }
       
  4251         if (incomment) {
       
  4252             if (base + 3 > len)
       
  4253                 return(-1);
       
  4254             if ((buf[base] == '-') && (buf[base + 1] == '-') &&
       
  4255                 (buf[base + 2] == '>')) {
       
  4256                 incomment = 0;
       
  4257                 base += 2;
       
  4258             }
       
  4259             continue;
       
  4260         }
       
  4261         if (buf[base] == first) {
       
  4262             if (third != 0) {
       
  4263                 if ((buf[base + 1] != next) ||
       
  4264                     (buf[base + 2] != third)) continue;
       
  4265             } else if (next != 0) {
       
  4266                 if (buf[base + 1] != next) continue;
       
  4267             }
       
  4268             ctxt->checkIndex = 0;
       
  4269 #ifdef DEBUG_PUSH
       
  4270             if (next == 0)
       
  4271                 xmlGenericError(xmlGenericErrorContext,
       
  4272                         "HPP: lookup '%c' found at %d\n",
       
  4273                         first, base);
       
  4274             else if (third == 0)
       
  4275                 xmlGenericError(xmlGenericErrorContext,
       
  4276                         "HPP: lookup '%c%c' found at %d\n",
       
  4277                         first, next, base);
       
  4278             else
       
  4279                 xmlGenericError(xmlGenericErrorContext,
       
  4280                         "HPP: lookup '%c%c%c' found at %d\n",
       
  4281                         first, next, third, base);
       
  4282 #endif
       
  4283             return(base - (in->cur - in->base));
       
  4284         }
       
  4285     }
       
  4286     ctxt->checkIndex = base;
       
  4287 #ifdef DEBUG_PUSH
       
  4288     if (next == 0)
       
  4289         xmlGenericError(xmlGenericErrorContext,
       
  4290                 "HPP: lookup '%c' failed\n", first);
       
  4291     else if (third == 0)
       
  4292         xmlGenericError(xmlGenericErrorContext,
       
  4293                 "HPP: lookup '%c%c' failed\n", first, next);
       
  4294     else
       
  4295         xmlGenericError(xmlGenericErrorContext,
       
  4296                 "HPP: lookup '%c%c%c' failed\n", first, next, third);
       
  4297 #endif
       
  4298     return(-1);
       
  4299 }
       
  4300 
       
  4301 /**
       
  4302  * htmlParseTryOrFinish:
       
  4303  * @param ctxt an HTML parser context
       
  4304  * @param terminate last chunk indicator
       
  4305  *
       
  4306  * Try to progress on parsing
       
  4307  *
       
  4308  * Returns zero if no parsing was possible
       
  4309  */
       
  4310 static int
       
  4311 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
       
  4312     int ret = 0;
       
  4313     htmlParserInputPtr in;
       
  4314     int avail = 0;
       
  4315     xmlChar cur, next;
       
  4316 
       
  4317 #ifdef DEBUG_PUSH
       
  4318     switch (ctxt->instate) {
       
  4319         case XML_PARSER_EOF:
       
  4320             xmlGenericError(xmlGenericErrorContext,
       
  4321                     "HPP: try EOF\n"); break;
       
  4322         case XML_PARSER_START:
       
  4323             xmlGenericError(xmlGenericErrorContext,
       
  4324                     "HPP: try START\n"); break;
       
  4325         case XML_PARSER_MISC:
       
  4326             xmlGenericError(xmlGenericErrorContext,
       
  4327                     "HPP: try MISC\n");break;
       
  4328         case XML_PARSER_COMMENT:
       
  4329             xmlGenericError(xmlGenericErrorContext,
       
  4330                     "HPP: try COMMENT\n");break;
       
  4331         case XML_PARSER_PROLOG:
       
  4332             xmlGenericError(xmlGenericErrorContext,
       
  4333                     "HPP: try PROLOG\n");break;
       
  4334         case XML_PARSER_START_TAG:
       
  4335             xmlGenericError(xmlGenericErrorContext,
       
  4336                     "HPP: try START_TAG\n");break;
       
  4337         case XML_PARSER_CONTENT:
       
  4338             xmlGenericError(xmlGenericErrorContext,
       
  4339                     "HPP: try CONTENT\n");break;
       
  4340         case XML_PARSER_CDATA_SECTION:
       
  4341             xmlGenericError(xmlGenericErrorContext,
       
  4342                     "HPP: try CDATA_SECTION\n");break;
       
  4343         case XML_PARSER_END_TAG:
       
  4344             xmlGenericError(xmlGenericErrorContext,
       
  4345                     "HPP: try END_TAG\n");break;
       
  4346         case XML_PARSER_ENTITY_DECL:
       
  4347             xmlGenericError(xmlGenericErrorContext,
       
  4348                     "HPP: try ENTITY_DECL\n");break;
       
  4349         case XML_PARSER_ENTITY_VALUE:
       
  4350             xmlGenericError(xmlGenericErrorContext,
       
  4351                     "HPP: try ENTITY_VALUE\n");break;
       
  4352         case XML_PARSER_ATTRIBUTE_VALUE:
       
  4353             xmlGenericError(xmlGenericErrorContext,
       
  4354                     "HPP: try ATTRIBUTE_VALUE\n");break;
       
  4355         case XML_PARSER_DTD:
       
  4356             xmlGenericError(xmlGenericErrorContext,
       
  4357                     "HPP: try DTD\n");break;
       
  4358         case XML_PARSER_EPILOG:
       
  4359             xmlGenericError(xmlGenericErrorContext,
       
  4360                     "HPP: try EPILOG\n");break;
       
  4361         case XML_PARSER_PI:
       
  4362             xmlGenericError(xmlGenericErrorContext,
       
  4363                     "HPP: try PI\n");break;
       
  4364         case XML_PARSER_SYSTEM_LITERAL:
       
  4365             xmlGenericError(xmlGenericErrorContext,
       
  4366                     "HPP: try SYSTEM_LITERAL\n");break;
       
  4367     }
       
  4368 #endif
       
  4369 
       
  4370     while (1) {
       
  4371 
       
  4372         in = ctxt->input;
       
  4373         if (in == NULL) break;
       
  4374         if (in->buf == NULL)
       
  4375             avail = in->length - (in->cur - in->base);
       
  4376         else
       
  4377             avail = in->buf->buffer->use - (in->cur - in->base);
       
  4378         if ((avail == 0) && (terminate)) {
       
  4379             htmlAutoCloseOnEnd(ctxt);
       
  4380             if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
       
  4381                 /*
       
  4382                  * SAX: end of the document processing.
       
  4383                  */
       
  4384                 ctxt->instate = XML_PARSER_EOF;
       
  4385                 if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
       
  4386                     ctxt->sax->endDocument(ctxt->userData);
       
  4387             }
       
  4388         }
       
  4389         if (avail < 1)
       
  4390             goto done;
       
  4391         cur = in->cur[0];
       
  4392         if (cur == 0) {
       
  4393             SKIP(1);
       
  4394             continue;
       
  4395         }
       
  4396 
       
  4397         switch (ctxt->instate) {
       
  4398             case XML_PARSER_EOF:
       
  4399                 /*
       
  4400                  * Document parsing is done !
       
  4401                  */
       
  4402                 goto done;
       
  4403             case XML_PARSER_START:
       
  4404                 /*
       
  4405                  * Very first chars read from the document flow.
       
  4406                  */
       
  4407                 cur = in->cur[0];
       
  4408                 if (IS_BLANK_CH(cur)) {
       
  4409                     SKIP_BLANKS;
       
  4410                     if (in->buf == NULL)
       
  4411                         avail = in->length - (in->cur - in->base);
       
  4412                     else
       
  4413                         avail = in->buf->buffer->use - (in->cur - in->base);
       
  4414                 }
       
  4415                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
       
  4416                     ctxt->sax->setDocumentLocator(ctxt->userData,
       
  4417                                                   &xmlDefaultSAXLocator);
       
  4418                 if ((ctxt->sax) && (ctxt->sax->startDocument) &&
       
  4419                     (!ctxt->disableSAX))
       
  4420                     ctxt->sax->startDocument(ctxt->userData);
       
  4421 
       
  4422                 cur = in->cur[0];
       
  4423                 next = in->cur[1];
       
  4424                 if ((cur == '<') && (next == '!') &&
       
  4425                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
       
  4426                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
       
  4427                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
       
  4428                     (UPP(8) == 'E')) {
       
  4429                     if ((!terminate) &&
       
  4430                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
       
  4431                         goto done;
       
  4432 #ifdef DEBUG_PUSH
       
  4433                     xmlGenericError(xmlGenericErrorContext,
       
  4434                             "HPP: Parsing internal subset\n");
       
  4435 #endif
       
  4436                     htmlParseDocTypeDecl(ctxt);
       
  4437                     ctxt->instate = XML_PARSER_PROLOG;
       
  4438 #ifdef DEBUG_PUSH
       
  4439                     xmlGenericError(xmlGenericErrorContext,
       
  4440                             "HPP: entering PROLOG\n");
       
  4441 #endif
       
  4442                 } else {
       
  4443                     ctxt->instate = XML_PARSER_MISC;
       
  4444                 }
       
  4445 #ifdef DEBUG_PUSH
       
  4446                 xmlGenericError(xmlGenericErrorContext,
       
  4447                         "HPP: entering MISC\n");
       
  4448 #endif
       
  4449                 break;
       
  4450             case XML_PARSER_MISC:
       
  4451                 SKIP_BLANKS;
       
  4452                 if (in->buf == NULL)
       
  4453                     avail = in->length - (in->cur - in->base);
       
  4454                 else
       
  4455                     avail = in->buf->buffer->use - (in->cur - in->base);
       
  4456                 if (avail < 2)
       
  4457                     goto done;
       
  4458                 cur = in->cur[0];
       
  4459                 next = in->cur[1];
       
  4460                 if ((cur == '<') && (next == '!') &&
       
  4461                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
       
  4462                     if ((!terminate) &&
       
  4463                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
       
  4464                         goto done;
       
  4465 #ifdef DEBUG_PUSH
       
  4466                     xmlGenericError(xmlGenericErrorContext,
       
  4467                             "HPP: Parsing Comment\n");
       
  4468 #endif
       
  4469                     htmlParseComment(ctxt);
       
  4470                     ctxt->instate = XML_PARSER_MISC;
       
  4471                 } else if ((cur == '<') && (next == '!') &&
       
  4472                     (UPP(2) == 'D') && (UPP(3) == 'O') &&
       
  4473                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
       
  4474                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
       
  4475                     (UPP(8) == 'E')) {
       
  4476                     if ((!terminate) &&
       
  4477                         (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
       
  4478                         goto done;
       
  4479 #ifdef DEBUG_PUSH
       
  4480                     xmlGenericError(xmlGenericErrorContext,
       
  4481                             "HPP: Parsing internal subset\n");
       
  4482 #endif
       
  4483                     htmlParseDocTypeDecl(ctxt);
       
  4484                     ctxt->instate = XML_PARSER_PROLOG;
       
  4485 #ifdef DEBUG_PUSH
       
  4486                     xmlGenericError(xmlGenericErrorContext,
       
  4487                             "HPP: entering PROLOG\n");
       
  4488 #endif
       
  4489                 } else if ((cur == '<') && (next == '!') &&
       
  4490                            (avail < 9)) {
       
  4491                     goto done;
       
  4492                 } else {
       
  4493                     ctxt->instate = XML_PARSER_START_TAG;
       
  4494 #ifdef DEBUG_PUSH
       
  4495                     xmlGenericError(xmlGenericErrorContext,
       
  4496                             "HPP: entering START_TAG\n");
       
  4497 #endif
       
  4498                 }
       
  4499                 break;
       
  4500             case XML_PARSER_PROLOG:
       
  4501                 SKIP_BLANKS;
       
  4502                 if (in->buf == NULL)
       
  4503                     avail = in->length - (in->cur - in->base);
       
  4504                 else
       
  4505                     avail = in->buf->buffer->use - (in->cur - in->base);
       
  4506                 if (avail < 2)
       
  4507                     goto done;
       
  4508                 cur = in->cur[0];
       
  4509                 next = in->cur[1];
       
  4510                 if ((cur == '<') && (next == '!') &&
       
  4511                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
       
  4512                     if ((!terminate) &&
       
  4513                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
       
  4514                         goto done;
       
  4515 #ifdef DEBUG_PUSH
       
  4516                     xmlGenericError(xmlGenericErrorContext,
       
  4517                             "HPP: Parsing Comment\n");
       
  4518 #endif
       
  4519                     htmlParseComment(ctxt);
       
  4520                     ctxt->instate = XML_PARSER_PROLOG;
       
  4521                 } else if ((cur == '<') && (next == '!') &&
       
  4522                            (avail < 4)) {
       
  4523                     goto done;
       
  4524                 } else {
       
  4525                     ctxt->instate = XML_PARSER_START_TAG;
       
  4526 #ifdef DEBUG_PUSH
       
  4527                     xmlGenericError(xmlGenericErrorContext,
       
  4528                             "HPP: entering START_TAG\n");
       
  4529 #endif
       
  4530                 }
       
  4531                 break;
       
  4532             case XML_PARSER_EPILOG:
       
  4533                 if (in->buf == NULL)
       
  4534                     avail = in->length - (in->cur - in->base);
       
  4535                 else
       
  4536                     avail = in->buf->buffer->use - (in->cur - in->base);
       
  4537                 if (avail < 1)
       
  4538                     goto done;
       
  4539                 cur = in->cur[0];
       
  4540                 if (IS_BLANK_CH(cur)) {
       
  4541                     htmlParseCharData(ctxt);
       
  4542                     goto done;
       
  4543                 }
       
  4544                 if (avail < 2)
       
  4545                     goto done;
       
  4546                 next = in->cur[1];
       
  4547                 if ((cur == '<') && (next == '!') &&
       
  4548                     (in->cur[2] == '-') && (in->cur[3] == '-')) {
       
  4549                     if ((!terminate) &&
       
  4550                         (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
       
  4551                         goto done;
       
  4552 #ifdef DEBUG_PUSH
       
  4553                     xmlGenericError(xmlGenericErrorContext,
       
  4554                             "HPP: Parsing Comment\n");
       
  4555 #endif
       
  4556                     htmlParseComment(ctxt);
       
  4557                     ctxt->instate = XML_PARSER_EPILOG;
       
  4558                 } else if ((cur == '<') && (next == '!') &&
       
  4559                            (avail < 4)) {
       
  4560                     goto done;
       
  4561                 } else {
       
  4562                     ctxt->errNo = XML_ERR_DOCUMENT_END;
       
  4563                     ctxt->wellFormed = 0;
       
  4564                     ctxt->instate = XML_PARSER_EOF;
       
  4565 #ifdef DEBUG_PUSH
       
  4566                     xmlGenericError(xmlGenericErrorContext,
       
  4567                             "HPP: entering EOF\n");
       
  4568 #endif
       
  4569                     if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
       
  4570                         ctxt->sax->endDocument(ctxt->userData);
       
  4571                     goto done;
       
  4572                 }
       
  4573                 break;
       
  4574             case XML_PARSER_START_TAG: {
       
  4575                 const xmlChar *name, *oldname;
       
  4576                 int depth = ctxt->nameNr;
       
  4577                 const htmlElemDesc * info;
       
  4578 
       
  4579                 if (avail < 2)
       
  4580                     goto done;
       
  4581                 cur = in->cur[0];
       
  4582                 if (cur != '<') {
       
  4583                     ctxt->instate = XML_PARSER_CONTENT;
       
  4584 #ifdef DEBUG_PUSH
       
  4585                     xmlGenericError(xmlGenericErrorContext,
       
  4586                             "HPP: entering CONTENT\n");
       
  4587 #endif
       
  4588                     break;
       
  4589                 }
       
  4590                 if (in->cur[1] == '/') {
       
  4591                     ctxt->instate = XML_PARSER_END_TAG;
       
  4592                     ctxt->checkIndex = 0;
       
  4593 #ifdef DEBUG_PUSH
       
  4594                     xmlGenericError(xmlGenericErrorContext,
       
  4595                             "HPP: entering END_TAG\n");
       
  4596 #endif
       
  4597                     break;
       
  4598                 }
       
  4599                 if ((!terminate) &&
       
  4600                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
       
  4601                     goto done;
       
  4602 
       
  4603                 oldname = ctxt->name;
       
  4604                 htmlParseStartTag(ctxt);
       
  4605                 name = ctxt->name;
       
  4606                 if (((depth == ctxt->nameNr) &&
       
  4607                      (xmlStrEqual(oldname, ctxt->name))) ||
       
  4608                     (name == NULL)) {
       
  4609                     if (CUR == '>')
       
  4610                         NEXT;
       
  4611                     break;
       
  4612                 }
       
  4613 
       
  4614                 /*
       
  4615                  * Lookup the info for that element.
       
  4616                  */
       
  4617                 info = htmlTagLookup(name);
       
  4618                 if (info == NULL) {
       
  4619                     htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
       
  4620                                  "Tag %s invalid\n", name, NULL);
       
  4621                 }
       
  4622 
       
  4623                 /*
       
  4624                  * Check for an Empty Element labeled the XML/SGML way
       
  4625                  */
       
  4626                 if ((CUR == '/') && (NXT(1) == '>')) {
       
  4627                     SKIP(2);
       
  4628                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  4629                         ctxt->sax->endElement(ctxt->userData, name);
       
  4630                     oldname = htmlnamePop(ctxt);
       
  4631                     ctxt->instate = XML_PARSER_CONTENT;
       
  4632 #ifdef DEBUG_PUSH
       
  4633                     xmlGenericError(xmlGenericErrorContext,
       
  4634                             "HPP: entering CONTENT\n");
       
  4635 #endif
       
  4636                     break;
       
  4637                 }
       
  4638 
       
  4639                 if (CUR == '>') {
       
  4640                     NEXT;
       
  4641                 } else {
       
  4642                     htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
       
  4643                                  "Couldn't find end of Start Tag %s\n",
       
  4644                                  name, NULL);
       
  4645 
       
  4646                     /*
       
  4647                      * end of parsing of this node.
       
  4648                      */
       
  4649                     if (xmlStrEqual(name, ctxt->name)) {
       
  4650                         nodePop(ctxt);
       
  4651                         oldname = htmlnamePop(ctxt);
       
  4652                     }
       
  4653 
       
  4654                     ctxt->instate = XML_PARSER_CONTENT;
       
  4655 #ifdef DEBUG_PUSH
       
  4656                     xmlGenericError(xmlGenericErrorContext,
       
  4657                             "HPP: entering CONTENT\n");
       
  4658 #endif
       
  4659                     break;
       
  4660                 }
       
  4661 
       
  4662                 /*
       
  4663                  * Check for an Empty Element from DTD definition
       
  4664                  */
       
  4665                 if ((info != NULL) && (info->empty)) {
       
  4666                     if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
       
  4667                         ctxt->sax->endElement(ctxt->userData, name);
       
  4668                     oldname = htmlnamePop(ctxt);
       
  4669                 }
       
  4670                 ctxt->instate = XML_PARSER_CONTENT;
       
  4671 #ifdef DEBUG_PUSH
       
  4672                 xmlGenericError(xmlGenericErrorContext,
       
  4673                         "HPP: entering CONTENT\n");
       
  4674 #endif
       
  4675                 break;
       
  4676             }
       
  4677             case XML_PARSER_CONTENT: {
       
  4678                 long cons;
       
  4679                 /*
       
  4680                  * Handle preparsed entities and charRef
       
  4681                  */
       
  4682                 if (ctxt->token != 0) {
       
  4683                     xmlChar chr[2] = { 0 , 0 } ;
       
  4684 
       
  4685                     chr[0] = (xmlChar) ctxt->token;
       
  4686                     htmlCheckParagraph(ctxt);
       
  4687                     if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
       
  4688                         ctxt->sax->characters(ctxt->userData, chr, 1);
       
  4689                     ctxt->token = 0;
       
  4690                     ctxt->checkIndex = 0;
       
  4691                 }
       
  4692                 if ((avail == 1) && (terminate)) {
       
  4693                     cur = in->cur[0];
       
  4694                     if ((cur != '<') && (cur != '&')) {
       
  4695                         if (ctxt->sax != NULL) {
       
  4696                             if (IS_BLANK_CH(cur)) {
       
  4697                                 if (ctxt->sax->ignorableWhitespace != NULL)
       
  4698                                     ctxt->sax->ignorableWhitespace(
       
  4699                                             ctxt->userData, &cur, 1);
       
  4700                             } else {
       
  4701                                 htmlCheckParagraph(ctxt);
       
  4702                                 if (ctxt->sax->characters != NULL)
       
  4703                                     ctxt->sax->characters(
       
  4704                                             ctxt->userData, &cur, 1);
       
  4705                             }
       
  4706                         }
       
  4707                         ctxt->token = 0;
       
  4708                         ctxt->checkIndex = 0;
       
  4709                         in->cur++;
       
  4710                         break;
       
  4711                     }
       
  4712                 }
       
  4713                 if (avail < 2)
       
  4714                     goto done;
       
  4715                 cur = in->cur[0];
       
  4716                 next = in->cur[1];
       
  4717                 cons = ctxt->nbChars;
       
  4718                 if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
       
  4719                     (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
       
  4720                     /*
       
  4721                      * Handle SCRIPT/STYLE separately
       
  4722                      */
       
  4723                     if ((!terminate) &&
       
  4724                         (htmlParseLookupSequence(ctxt, '<', '/', 0, 0) < 0))
       
  4725                         goto done;
       
  4726                     htmlParseScript(ctxt);
       
  4727                     if ((cur == '<') && (next == '/')) {
       
  4728                         ctxt->instate = XML_PARSER_END_TAG;
       
  4729                         ctxt->checkIndex = 0;
       
  4730 #ifdef DEBUG_PUSH
       
  4731                         xmlGenericError(xmlGenericErrorContext,
       
  4732                                 "HPP: entering END_TAG\n");
       
  4733 #endif
       
  4734                         break;
       
  4735                     }
       
  4736                 } else {
       
  4737                     /*
       
  4738                      * Sometimes DOCTYPE arrives in the middle of the document
       
  4739                      */
       
  4740                     if ((cur == '<') && (next == '!') &&
       
  4741                         (UPP(2) == 'D') && (UPP(3) == 'O') &&
       
  4742                         (UPP(4) == 'C') && (UPP(5) == 'T') &&
       
  4743                         (UPP(6) == 'Y') && (UPP(7) == 'P') &&
       
  4744                         (UPP(8) == 'E')) {
       
  4745                         if ((!terminate) &&
       
  4746                             (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
       
  4747                             goto done;
       
  4748                         htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
       
  4749                                      "Misplaced DOCTYPE declaration\n",
       
  4750                                      BAD_CAST "DOCTYPE" , NULL);
       
  4751                         htmlParseDocTypeDecl(ctxt);
       
  4752                     } else if ((cur == '<') && (next == '!') &&
       
  4753                         (in->cur[2] == '-') && (in->cur[3] == '-')) {
       
  4754                         if ((!terminate) &&
       
  4755                             (htmlParseLookupSequence(
       
  4756                                         ctxt, '-', '-', '>', 1) < 0))
       
  4757                             goto done;
       
  4758 #ifdef DEBUG_PUSH
       
  4759                         xmlGenericError(xmlGenericErrorContext,
       
  4760                                 "HPP: Parsing Comment\n");
       
  4761 #endif
       
  4762                         htmlParseComment(ctxt);
       
  4763                         ctxt->instate = XML_PARSER_CONTENT;
       
  4764                     } else if ((cur == '<') && (next == '!') && (avail < 4)) {
       
  4765                         goto done;
       
  4766                     } else if ((cur == '<') && (next == '/')) {
       
  4767                         ctxt->instate = XML_PARSER_END_TAG;
       
  4768                         ctxt->checkIndex = 0;
       
  4769 #ifdef DEBUG_PUSH
       
  4770                         xmlGenericError(xmlGenericErrorContext,
       
  4771                                 "HPP: entering END_TAG\n");
       
  4772 #endif
       
  4773                         break;
       
  4774                     } else if (cur == '<') {
       
  4775                         ctxt->instate = XML_PARSER_START_TAG;
       
  4776                         ctxt->checkIndex = 0;
       
  4777 #ifdef DEBUG_PUSH
       
  4778                         xmlGenericError(xmlGenericErrorContext,
       
  4779                                 "HPP: entering START_TAG\n");
       
  4780 #endif
       
  4781                         break;
       
  4782                     } else if (cur == '&') {
       
  4783                         if ((!terminate) &&
       
  4784                             (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
       
  4785                             goto done;
       
  4786 #ifdef DEBUG_PUSH
       
  4787                         xmlGenericError(xmlGenericErrorContext,
       
  4788                                 "HPP: Parsing Reference\n");
       
  4789 #endif
       
  4790                         
       
  4791                         htmlParseReference(ctxt);
       
  4792                     } else {
       
  4793                         /*
       
  4794                          * check that the text sequence is complete
       
  4795                          * before handing out the data to the parser
       
  4796                          * to avoid problems with erroneous end of
       
  4797                          * data detection.
       
  4798                          */
       
  4799                         if ((!terminate) &&
       
  4800                             (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
       
  4801                             goto done;
       
  4802                         ctxt->checkIndex = 0;
       
  4803 #ifdef DEBUG_PUSH
       
  4804                         xmlGenericError(xmlGenericErrorContext,
       
  4805                                 "HPP: Parsing char data\n");
       
  4806 #endif
       
  4807                         htmlParseCharData(ctxt);
       
  4808                     }
       
  4809                 }
       
  4810                 if (cons == ctxt->nbChars) {
       
  4811                     if (ctxt->node != NULL) {
       
  4812                         htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4813                                      "detected an error in element content\n",
       
  4814                                      NULL, NULL);
       
  4815                     }
       
  4816                     NEXT;
       
  4817                     break;
       
  4818                 }
       
  4819 
       
  4820                 break;
       
  4821             }
       
  4822             case XML_PARSER_END_TAG:
       
  4823                 if (avail < 2)
       
  4824                     goto done;
       
  4825                 if ((!terminate) &&
       
  4826                     (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
       
  4827                     goto done;
       
  4828                 htmlParseEndTag(ctxt);
       
  4829                 if (ctxt->nameNr == 0) {
       
  4830                     ctxt->instate = XML_PARSER_EPILOG;
       
  4831                 } else {
       
  4832                     ctxt->instate = XML_PARSER_CONTENT;
       
  4833                 }
       
  4834                 ctxt->checkIndex = 0;
       
  4835 #ifdef DEBUG_PUSH
       
  4836                 xmlGenericError(xmlGenericErrorContext,
       
  4837                         "HPP: entering CONTENT\n");
       
  4838 #endif
       
  4839                 break;
       
  4840             case XML_PARSER_CDATA_SECTION:
       
  4841                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4842                         "HPP: internal error, state == CDATA\n",
       
  4843                              NULL, NULL);
       
  4844                 ctxt->instate = XML_PARSER_CONTENT;
       
  4845                 ctxt->checkIndex = 0;
       
  4846 #ifdef DEBUG_PUSH
       
  4847                 xmlGenericError(xmlGenericErrorContext,
       
  4848                         "HPP: entering CONTENT\n");
       
  4849 #endif
       
  4850                 break;
       
  4851             case XML_PARSER_DTD:
       
  4852                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4853                         "HPP: internal error, state == DTD\n",
       
  4854                              NULL, NULL);
       
  4855                 ctxt->instate = XML_PARSER_CONTENT;
       
  4856                 ctxt->checkIndex = 0;
       
  4857 #ifdef DEBUG_PUSH
       
  4858                 xmlGenericError(xmlGenericErrorContext,
       
  4859                         "HPP: entering CONTENT\n");
       
  4860 #endif
       
  4861                 break;
       
  4862             case XML_PARSER_COMMENT:
       
  4863                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4864                         "HPP: internal error, state == COMMENT\n",
       
  4865                              NULL, NULL);
       
  4866                 ctxt->instate = XML_PARSER_CONTENT;
       
  4867                 ctxt->checkIndex = 0;
       
  4868 #ifdef DEBUG_PUSH
       
  4869                 xmlGenericError(xmlGenericErrorContext,
       
  4870                         "HPP: entering CONTENT\n");
       
  4871 #endif
       
  4872                 break;
       
  4873             case XML_PARSER_PI:
       
  4874                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4875                         "HPP: internal error, state == PI\n",
       
  4876                              NULL, NULL);
       
  4877                 ctxt->instate = XML_PARSER_CONTENT;
       
  4878                 ctxt->checkIndex = 0;
       
  4879 #ifdef DEBUG_PUSH
       
  4880                 xmlGenericError(xmlGenericErrorContext,
       
  4881                         "HPP: entering CONTENT\n");
       
  4882 #endif
       
  4883                 break;
       
  4884             case XML_PARSER_ENTITY_DECL:
       
  4885                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4886                         "HPP: internal error, state == ENTITY_DECL\n",
       
  4887                              NULL, NULL);
       
  4888                 ctxt->instate = XML_PARSER_CONTENT;
       
  4889                 ctxt->checkIndex = 0;
       
  4890 #ifdef DEBUG_PUSH
       
  4891                 xmlGenericError(xmlGenericErrorContext,
       
  4892                         "HPP: entering CONTENT\n");
       
  4893 #endif
       
  4894                 break;
       
  4895             case XML_PARSER_ENTITY_VALUE:
       
  4896                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4897                         "HPP: internal error, state == ENTITY_VALUE\n",
       
  4898                              NULL, NULL);
       
  4899                 ctxt->instate = XML_PARSER_CONTENT;
       
  4900                 ctxt->checkIndex = 0;
       
  4901 #ifdef DEBUG_PUSH
       
  4902                 xmlGenericError(xmlGenericErrorContext,
       
  4903                         "HPP: entering DTD\n");
       
  4904 #endif
       
  4905                 break;
       
  4906             case XML_PARSER_ATTRIBUTE_VALUE:
       
  4907                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4908                         "HPP: internal error, state == ATTRIBUTE_VALUE\n",
       
  4909                              NULL, NULL);
       
  4910                 ctxt->instate = XML_PARSER_START_TAG;
       
  4911                 ctxt->checkIndex = 0;
       
  4912 #ifdef DEBUG_PUSH
       
  4913                 xmlGenericError(xmlGenericErrorContext,
       
  4914                         "HPP: entering START_TAG\n");
       
  4915 #endif
       
  4916                 break;
       
  4917             case XML_PARSER_SYSTEM_LITERAL:
       
  4918                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4919                     "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
       
  4920                              NULL, NULL);
       
  4921                 ctxt->instate = XML_PARSER_CONTENT;
       
  4922                 ctxt->checkIndex = 0;
       
  4923 #ifdef DEBUG_PUSH
       
  4924                 xmlGenericError(xmlGenericErrorContext,
       
  4925                         "HPP: entering CONTENT\n");
       
  4926 #endif
       
  4927                 break;
       
  4928             case XML_PARSER_IGNORE:
       
  4929                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4930                         "HPP: internal error, state == XML_PARSER_IGNORE\n",
       
  4931                              NULL, NULL);
       
  4932                 ctxt->instate = XML_PARSER_CONTENT;
       
  4933                 ctxt->checkIndex = 0;
       
  4934 #ifdef DEBUG_PUSH
       
  4935                 xmlGenericError(xmlGenericErrorContext,
       
  4936                         "HPP: entering CONTENT\n");
       
  4937 #endif
       
  4938                 break;
       
  4939             case XML_PARSER_PUBLIC_LITERAL:
       
  4940                 htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
       
  4941                         "HPP: internal error, state == XML_PARSER_LITERAL\n",
       
  4942                              NULL, NULL);
       
  4943                 ctxt->instate = XML_PARSER_CONTENT;
       
  4944                 ctxt->checkIndex = 0;
       
  4945 #ifdef DEBUG_PUSH
       
  4946                 xmlGenericError(xmlGenericErrorContext,
       
  4947                         "HPP: entering CONTENT\n");
       
  4948 #endif
       
  4949                 break;
       
  4950 
       
  4951         }
       
  4952     }
       
  4953 done:
       
  4954     if ((avail == 0) && (terminate)) {
       
  4955         htmlAutoCloseOnEnd(ctxt);
       
  4956         if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
       
  4957             /*
       
  4958              * SAX: end of the document processing.
       
  4959              */
       
  4960             ctxt->instate = XML_PARSER_EOF;
       
  4961             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
       
  4962                 ctxt->sax->endDocument(ctxt->userData);
       
  4963         }
       
  4964     }
       
  4965     if ((ctxt->myDoc != NULL) &&
       
  4966         ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
       
  4967          (ctxt->instate == XML_PARSER_EPILOG))) {
       
  4968         xmlDtdPtr dtd;
       
  4969         dtd = xmlGetIntSubset(ctxt->myDoc);
       
  4970         if (dtd == NULL)
       
  4971             ctxt->myDoc->intSubset =
       
  4972                 xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
       
  4973                     BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
       
  4974                     BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
       
  4975     }
       
  4976 #ifdef DEBUG_PUSH
       
  4977     xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
       
  4978 #endif
       
  4979     return(ret);
       
  4980 }
       
  4981 
       
  4982 /**
       
  4983  * htmlParseChunk:
       
  4984  * @param ctxt an HTML parser context
       
  4985  * @param chunk an char array
       
  4986  * @param size the size in byte of the chunk
       
  4987  * @param terminate last chunk indicator
       
  4988  *
       
  4989  * Parse a Chunk of memory
       
  4990  *
       
  4991  * Returns zero if no error, the xmlParserErrors otherwise.
       
  4992  */
       
  4993 int
       
  4994 htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
       
  4995               int terminate) {
       
  4996     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
       
  4997         (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
       
  4998         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
       
  4999         int cur = ctxt->input->cur - ctxt->input->base;
       
  5000 
       
  5001         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
       
  5002         ctxt->input->base = ctxt->input->buf->buffer->content + base;
       
  5003         ctxt->input->cur = ctxt->input->base + cur;
       
  5004 #ifdef DEBUG_PUSH
       
  5005         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
       
  5006 #endif
       
  5007 
       
  5008 #if 0
       
  5009         if ((terminate) || (ctxt->input->buf->buffer->use > 80))
       
  5010             htmlParseTryOrFinish(ctxt, terminate);
       
  5011 #endif
       
  5012     } else if (ctxt->instate != XML_PARSER_EOF) {
       
  5013         if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
       
  5014             xmlParserInputBufferPtr in = ctxt->input->buf;
       
  5015             if ((in->encoder != NULL) && (in->buffer != NULL) &&
       
  5016                     (in->raw != NULL)) {
       
  5017                 int nbchars;
       
  5018 
       
  5019                 nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
       
  5020                 if (nbchars < 0) {
       
  5021                     htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
       
  5022                                  "encoder error\n", NULL, NULL);
       
  5023                     return(XML_ERR_INVALID_ENCODING);
       
  5024                 }
       
  5025             }
       
  5026         }
       
  5027     }
       
  5028     htmlParseTryOrFinish(ctxt, terminate);
       
  5029     if (terminate) {
       
  5030         if ((ctxt->instate != XML_PARSER_EOF) &&
       
  5031             (ctxt->instate != XML_PARSER_EPILOG) &&
       
  5032             (ctxt->instate != XML_PARSER_MISC)) {
       
  5033             ctxt->errNo = XML_ERR_DOCUMENT_END;
       
  5034             ctxt->wellFormed = 0;
       
  5035         }
       
  5036         if (ctxt->instate != XML_PARSER_EOF) {
       
  5037             if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
       
  5038                 ctxt->sax->endDocument(ctxt->userData);
       
  5039         }
       
  5040         ctxt->instate = XML_PARSER_EOF;
       
  5041     }
       
  5042     return((xmlParserErrors) ctxt->errNo);
       
  5043 }
       
  5044 #endif /* LIBXML_PUSH_ENABLED */
       
  5045 
       
  5046 /************************************************************************
       
  5047  *                                                                      *
       
  5048  *                      User entry points                               *
       
  5049  *                                                                      *
       
  5050  ************************************************************************/
       
  5051 
       
  5052 /**
       
  5053  * htmlCreatePushParserCtxt:
       
  5054  * @param sax a SAX handler
       
  5055  * @param user_data The user data returned on SAX callbacks
       
  5056  * @param chunk a pointer to an array of chars
       
  5057  * @param size number of chars in the array
       
  5058  * @param filename an optional file name or URI
       
  5059  * @param enc an optional encoding
       
  5060  *
       
  5061  * Create a parser context for using the HTML parser in push mode
       
  5062  * The value of filename is used for fetching external entities
       
  5063  * and error/warning reports.
       
  5064  *
       
  5065  * Returns the new parser context or NULL
       
  5066  */
       
  5067 htmlParserCtxtPtr
       
  5068 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
       
  5069                          const char *chunk, int size, const char *filename,
       
  5070                          xmlCharEncoding enc) {
       
  5071     htmlParserCtxtPtr ctxt;
       
  5072     htmlParserInputPtr inputStream;
       
  5073     xmlParserInputBufferPtr buf;
       
  5074 
       
  5075     xmlInitParser();
       
  5076 
       
  5077     buf = xmlAllocParserInputBuffer(enc);
       
  5078     if (buf == NULL) return(NULL);
       
  5079 
       
  5080     ctxt = htmlNewParserCtxt();
       
  5081     if (ctxt == NULL) {
       
  5082         xmlFreeParserInputBuffer(buf);
       
  5083         return(NULL);
       
  5084     }
       
  5085     if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
       
  5086         ctxt->charset=XML_CHAR_ENCODING_UTF8;
       
  5087     if (sax != NULL) {
       
  5088         if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
       
  5089             xmlFree(ctxt->sax);
       
  5090         ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
       
  5091         if (ctxt->sax == NULL) {
       
  5092             xmlFree(buf);
       
  5093             xmlFree(ctxt);
       
  5094             return(NULL);
       
  5095         }
       
  5096         memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
       
  5097         if (user_data != NULL)
       
  5098             ctxt->userData = user_data;
       
  5099     }
       
  5100     if (filename == NULL) {
       
  5101         ctxt->directory = NULL;
       
  5102     } else {
       
  5103         ctxt->directory = xmlParserGetDirectory(filename);
       
  5104     }
       
  5105 
       
  5106     inputStream = htmlNewInputStream(ctxt);
       
  5107     if (inputStream == NULL) {
       
  5108         xmlFreeParserCtxt(ctxt);
       
  5109         xmlFree(buf);
       
  5110         return(NULL);
       
  5111     }
       
  5112 
       
  5113     if (filename == NULL)
       
  5114         inputStream->filename = NULL;
       
  5115     else
       
  5116         inputStream->filename = (char *)
       
  5117             xmlCanonicPath((const xmlChar *) filename);
       
  5118     inputStream->buf = buf;
       
  5119     inputStream->base = inputStream->buf->buffer->content;
       
  5120     inputStream->cur = inputStream->buf->buffer->content;
       
  5121     inputStream->end =
       
  5122         &inputStream->buf->buffer->content[inputStream->buf->buffer->use];
       
  5123 
       
  5124     inputPush(ctxt, inputStream);
       
  5125 
       
  5126     if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
       
  5127         (ctxt->input->buf != NULL))  {
       
  5128         int base = ctxt->input->base - ctxt->input->buf->buffer->content;
       
  5129         int cur = ctxt->input->cur - ctxt->input->base;
       
  5130 
       
  5131         xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
       
  5132 
       
  5133         ctxt->input->base = ctxt->input->buf->buffer->content + base;
       
  5134         ctxt->input->cur = ctxt->input->base + cur;
       
  5135         ctxt->input->end =
       
  5136             &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
       
  5137 #ifdef DEBUG_PUSH
       
  5138         xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
       
  5139 #endif
       
  5140     }
       
  5141 
       
  5142     return(ctxt);
       
  5143 }
       
  5144 
       
  5145 /**
       
  5146  * htmlSAXParseDoc:
       
  5147  * @param cur a pointer to an array of xmlChar
       
  5148  * @param encoding a free form C string describing the HTML document encoding, or NULL
       
  5149  * @param sax the SAX handler block
       
  5150  * @param userData if using SAX, this pointer will be provided on callbacks.
       
  5151  *
       
  5152  * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
       
  5153  * to handle parse events. If sax is NULL, fallback to the default DOM
       
  5154  * behavior and return a tree.
       
  5155  *
       
  5156  * Returns the resulting document tree unless SAX is NULL or the document is
       
  5157  *     not well formed.
       
  5158  */
       
  5159 
       
  5160 htmlDocPtr
       
  5161 htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
       
  5162     htmlDocPtr ret;
       
  5163     htmlParserCtxtPtr ctxt;
       
  5164 
       
  5165     xmlInitParser();
       
  5166 
       
  5167     if (cur == NULL) return(NULL);
       
  5168 
       
  5169 
       
  5170     ctxt = htmlCreateDocParserCtxt(cur, encoding);
       
  5171     if (ctxt == NULL) return(NULL);
       
  5172     if (sax != NULL) {
       
  5173         if (ctxt->sax != NULL) xmlFree (ctxt->sax);
       
  5174         ctxt->sax = sax;
       
  5175         ctxt->userData = userData;
       
  5176     }
       
  5177 
       
  5178     htmlParseDocument(ctxt);
       
  5179     ret = ctxt->myDoc;
       
  5180     if (sax != NULL) {
       
  5181         ctxt->sax = NULL;
       
  5182         ctxt->userData = NULL;
       
  5183     }
       
  5184     htmlFreeParserCtxt(ctxt);
       
  5185 
       
  5186     return(ret);
       
  5187 }
       
  5188 
       
  5189 /**
       
  5190  * htmlParseDoc:
       
  5191  * @param cur a pointer to an array of xmlChar
       
  5192  * @param encoding a free form C string describing the HTML document encoding, or NULL
       
  5193  *
       
  5194  * parse an HTML in-memory document and build a tree.
       
  5195  *
       
  5196  * Returns the resulting document tree
       
  5197  */
       
  5198 
       
  5199 htmlDocPtr
       
  5200 htmlParseDoc(xmlChar *cur, const char *encoding) {
       
  5201     return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
       
  5202 }
       
  5203 
       
  5204 
       
  5205 /**
       
  5206  * htmlCreateFileParserCtxt:
       
  5207  * @param filename the filename
       
  5208  * @param encoding a free form C string describing the HTML document encoding, or NULL
       
  5209  *
       
  5210  * Create a parser context for a file content.
       
  5211  * Automatic support for ZLIB/Compress compressed document is provided
       
  5212  * by default if found at compile-time.
       
  5213  *
       
  5214  * Returns the new parser context or NULL
       
  5215  */
       
  5216 htmlParserCtxtPtr
       
  5217 htmlCreateFileParserCtxt(const char *filename, const char *encoding)
       
  5218 {
       
  5219     htmlParserCtxtPtr ctxt;
       
  5220     htmlParserInputPtr inputStream;
       
  5221     char *canonicFilename;
       
  5222     /* htmlCharEncoding enc; */
       
  5223     xmlChar *content, *content_line = (xmlChar *) "charset=";
       
  5224 
       
  5225     ctxt = htmlNewParserCtxt();
       
  5226     if (ctxt == NULL) {
       
  5227         return(NULL);
       
  5228     }
       
  5229     canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
       
  5230     if (canonicFilename == NULL) {
       
  5231 #ifdef LIBXML_SAX1_ENABLED
       
  5232         if (xmlDefaultSAXHandler.error != NULL) {
       
  5233             xmlDefaultSAXHandler.error(NULL, "out of memory\n");
       
  5234         }
       
  5235 #endif
       
  5236         xmlFreeParserCtxt(ctxt);
       
  5237         return(NULL);
       
  5238     }
       
  5239 
       
  5240     inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
       
  5241     xmlFree(canonicFilename);
       
  5242     if (inputStream == NULL) {
       
  5243         xmlFreeParserCtxt(ctxt);
       
  5244         return(NULL);
       
  5245     }
       
  5246 
       
  5247     inputPush(ctxt, inputStream);
       
  5248 
       
  5249     /* set encoding */
       
  5250     if (encoding) {
       
  5251         content = (xmlChar*)xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
       
  5252         if (content) {
       
  5253             strcpy ((char *)content, (char *)content_line);
       
  5254             strcat ((char *)content, (char *)encoding);
       
  5255             htmlCheckEncoding (ctxt, content);
       
  5256             xmlFree (content);
       
  5257         }
       
  5258     }
       
  5259 
       
  5260     return(ctxt);
       
  5261 }
       
  5262 
       
  5263 /**
       
  5264  * htmlSAXParseFile:
       
  5265  * @param filename the filename
       
  5266  * @param encoding a free form C string describing the HTML document encoding, or NULL
       
  5267  * @param sax the SAX handler block
       
  5268  * @param userData if using SAX, this pointer will be provided on callbacks.
       
  5269  *
       
  5270  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
       
  5271  * compressed document is provided by default if found at compile-time.
       
  5272  * It use the given SAX function block to handle the parsing callback.
       
  5273  * If sax is NULL, fallback to the default DOM tree building routines.
       
  5274  *
       
  5275  * Returns the resulting document tree unless SAX is NULL or the document is
       
  5276  *     not well formed.
       
  5277  */
       
  5278 
       
  5279 htmlDocPtr
       
  5280 htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
       
  5281                  void *userData) {
       
  5282     htmlDocPtr ret;
       
  5283     htmlParserCtxtPtr ctxt;
       
  5284     htmlSAXHandlerPtr oldsax = NULL;
       
  5285 
       
  5286     xmlInitParser();
       
  5287 
       
  5288     ctxt = htmlCreateFileParserCtxt(filename, encoding);
       
  5289     if (ctxt == NULL) return(NULL);
       
  5290     if (sax != NULL) {
       
  5291         oldsax = ctxt->sax;
       
  5292         ctxt->sax = sax;
       
  5293         ctxt->userData = userData;
       
  5294     }
       
  5295 
       
  5296     htmlParseDocument(ctxt);
       
  5297 
       
  5298     ret = ctxt->myDoc;
       
  5299     if (sax != NULL) {
       
  5300         ctxt->sax = oldsax;
       
  5301         ctxt->userData = NULL;
       
  5302     }
       
  5303     htmlFreeParserCtxt(ctxt);
       
  5304 
       
  5305     return(ret);
       
  5306 }
       
  5307 
       
  5308 /**
       
  5309  * htmlParseFile:
       
  5310  * @param filename the filename
       
  5311  * @param encoding a free form C string describing the HTML document encoding, or NULL
       
  5312  *
       
  5313  * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
       
  5314  * compressed document is provided by default if found at compile-time.
       
  5315  *
       
  5316  * Returns the resulting document tree
       
  5317  */
       
  5318 
       
  5319 htmlDocPtr
       
  5320 htmlParseFile(const char *filename, const char *encoding) {
       
  5321     return(htmlSAXParseFile(filename, encoding, NULL, NULL));
       
  5322 }
       
  5323 
       
  5324 /**
       
  5325  * htmlHandleOmittedElem:
       
  5326  * @param val int 0 or 1
       
  5327  *
       
  5328  * Set and return the previous value for handling HTML omitted tags.
       
  5329  *
       
  5330  * Returns the last value for 0 for no handling, 1 for auto insertion.
       
  5331  */
       
  5332 
       
  5333 int
       
  5334 htmlHandleOmittedElem(int val) {
       
  5335     int old = htmlOmittedDefaultValue;
       
  5336 
       
  5337         
       
  5338     return(old);
       
  5339 }
       
  5340 
       
  5341 /**
       
  5342  * htmlElementAllowedHere:
       
  5343  * @param parent HTML parent element
       
  5344  * @param elt HTML element
       
  5345  *
       
  5346  * Checks whether an HTML element may be a direct child of a parent element.
       
  5347  * Note - doesn't check for deprecated elements
       
  5348  *
       
  5349  * Returns 1 if allowed; 0 otherwise.
       
  5350  */
       
  5351 int
       
  5352 htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
       
  5353   const char** p ;
       
  5354 
       
  5355   if ( ! elt || ! parent || ! parent->subelts )
       
  5356         return 0 ;
       
  5357 
       
  5358   for ( p = parent->subelts; *p; ++p )
       
  5359     if ( !xmlStrcmp((const xmlChar *)*p, elt) )
       
  5360       return 1 ;
       
  5361 
       
  5362   return 0 ;
       
  5363 }
       
  5364 /**
       
  5365  * htmlElementStatusHere:
       
  5366  * @param parent HTML parent element
       
  5367  * @param elt HTML element
       
  5368  *
       
  5369  * Checks whether an HTML element may be a direct child of a parent element.
       
  5370  * and if so whether it is valid or deprecated.
       
  5371  *
       
  5372  * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
       
  5373  */
       
  5374 htmlStatus
       
  5375 htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
       
  5376   if ( ! parent || ! elt )
       
  5377     return HTML_INVALID ;
       
  5378   if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
       
  5379     return HTML_INVALID ;
       
  5380 
       
  5381   return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
       
  5382 }
       
  5383 /**
       
  5384  * htmlAttrAllowed:
       
  5385  * @param elt HTML element
       
  5386  * @param attr HTML attribute
       
  5387  * @param legacy whether to allow deprecated attributes
       
  5388  *
       
  5389  * Checks whether an attribute is valid for an element
       
  5390  * Has full knowledge of Required and Deprecated attributes
       
  5391  *
       
  5392  * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
       
  5393  */
       
  5394 htmlStatus
       
  5395 htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
       
  5396   const char** p ;
       
  5397 
       
  5398   if ( !elt || ! attr )
       
  5399         return HTML_INVALID ;
       
  5400 
       
  5401   if ( elt->attrs_req )
       
  5402     for ( p = elt->attrs_req; *p; ++p)
       
  5403       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
       
  5404         return HTML_REQUIRED ;
       
  5405 
       
  5406   if ( elt->attrs_opt )
       
  5407     for ( p = elt->attrs_opt; *p; ++p)
       
  5408       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
       
  5409         return HTML_VALID ;
       
  5410 
       
  5411   if ( legacy && elt->attrs_depr )
       
  5412     for ( p = elt->attrs_depr; *p; ++p)
       
  5413       if ( !xmlStrcmp((const xmlChar*)*p, attr) )
       
  5414         return HTML_DEPRECATED ;
       
  5415 
       
  5416   return HTML_INVALID ;
       
  5417 }
       
  5418 /**
       
  5419  * htmlNodeStatus:
       
  5420  * @param node an htmlNodePtr in a tree
       
  5421  * @param legacy whether to allow deprecated elements (YES is faster here
       
  5422  *      for Element nodes)
       
  5423  *
       
  5424  * Checks whether the tree node is valid.  Experimental (the author
       
  5425  *     only uses the HTML enhancements in a SAX parser)
       
  5426  *
       
  5427  * Return: for Element nodes, a return from htmlElementAllowedHere (if
       
  5428  *      legacy allowed) or htmlElementStatusHere (otherwise).
       
  5429  *      for Attribute nodes, a return from htmlAttrAllowed
       
  5430  *      for other nodes, HTML_NA (no checks performed)
       
  5431  */
       
  5432 htmlStatus
       
  5433 htmlNodeStatus(const htmlNodePtr node, int legacy) {
       
  5434   if ( ! node )
       
  5435     return HTML_INVALID ;
       
  5436 
       
  5437   switch ( node->type ) {
       
  5438     case XML_ELEMENT_NODE:
       
  5439       return legacy
       
  5440         ? ( htmlElementAllowedHere (
       
  5441                 htmlTagLookup(node->parent->name) , node->name
       
  5442                 ) ? HTML_VALID : HTML_INVALID )
       
  5443         : htmlElementStatusHere(
       
  5444                 htmlTagLookup(node->parent->name) ,
       
  5445                 htmlTagLookup(node->name) )
       
  5446         ;
       
  5447     case XML_ATTRIBUTE_NODE:
       
  5448       return htmlAttrAllowed(
       
  5449         htmlTagLookup(node->parent->name) , node->name, legacy) ;
       
  5450     default: return HTML_NA ;
       
  5451   }
       
  5452 }
       
  5453 /************************************************************************
       
  5454  *                                                                      *
       
  5455  *      New set (2.6.0) of simpler and more flexible APIs               *
       
  5456  *                                                                      *
       
  5457  ************************************************************************/
       
  5458 /**
       
  5459  * DICT_FREE:
       
  5460  * @param str a string
       
  5461  *
       
  5462  * Free a string if it is not owned by the "dict" dictionnary in the
       
  5463  * current scope
       
  5464  */
       
  5465 #define DICT_FREE(str)                                          \
       
  5466         if ((str) && ((!dict) ||                                \
       
  5467             (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))  \
       
  5468             xmlFree((char *)(str));
       
  5469 
       
  5470 /**
       
  5471  * htmlCtxtReset:
       
  5472  * @param ctxt an HTML parser context
       
  5473  *
       
  5474  * Reset a parser context
       
  5475  */
       
  5476 void
       
  5477 htmlCtxtReset(htmlParserCtxtPtr ctxt)
       
  5478 {
       
  5479     xmlParserInputPtr input;
       
  5480     xmlDictPtr dict = ctxt->dict;
       
  5481 
       
  5482     while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
       
  5483         xmlFreeInputStream(input);
       
  5484     }
       
  5485     ctxt->inputNr = 0;
       
  5486     ctxt->input = NULL;
       
  5487 
       
  5488     ctxt->spaceNr = 0;
       
  5489     ctxt->spaceTab[0] = -1;
       
  5490     ctxt->space = &ctxt->spaceTab[0];
       
  5491 
       
  5492 
       
  5493     ctxt->nodeNr = 0;
       
  5494     ctxt->node = NULL;
       
  5495 
       
  5496     ctxt->nameNr = 0;
       
  5497     ctxt->name = NULL;
       
  5498 
       
  5499     DICT_FREE(ctxt->version);
       
  5500     ctxt->version = NULL;
       
  5501     DICT_FREE(ctxt->encoding);
       
  5502     ctxt->encoding = NULL;
       
  5503     DICT_FREE(ctxt->directory);
       
  5504     ctxt->directory = NULL;
       
  5505     DICT_FREE(ctxt->extSubURI);
       
  5506     ctxt->extSubURI = NULL;
       
  5507     DICT_FREE(ctxt->extSubSystem);
       
  5508     ctxt->extSubSystem = NULL;
       
  5509     if (ctxt->myDoc != NULL)
       
  5510         xmlFreeDoc(ctxt->myDoc);
       
  5511     ctxt->myDoc = NULL;
       
  5512 
       
  5513     ctxt->standalone = -1;
       
  5514     ctxt->hasExternalSubset = 0;
       
  5515     ctxt->hasPErefs = 0;
       
  5516     ctxt->html = 1;
       
  5517     ctxt->external = 0;
       
  5518     ctxt->instate = XML_PARSER_START;
       
  5519     ctxt->token = 0;
       
  5520 
       
  5521     ctxt->wellFormed = 1;
       
  5522     ctxt->nsWellFormed = 1;
       
  5523     ctxt->valid = 1;
       
  5524     ctxt->vctxt.userData = ctxt;
       
  5525     ctxt->vctxt.error = xmlParserValidityError;
       
  5526     ctxt->vctxt.warning = xmlParserValidityWarning;
       
  5527     ctxt->record_info = 0;
       
  5528     ctxt->nbChars = 0;
       
  5529     ctxt->checkIndex = 0;
       
  5530     ctxt->inSubset = 0;
       
  5531     ctxt->errNo = XML_ERR_OK;
       
  5532     ctxt->depth = 0;
       
  5533     ctxt->charset = XML_CHAR_ENCODING_UTF8;
       
  5534     ctxt->catalogs = NULL;
       
  5535     xmlInitNodeInfoSeq(&ctxt->node_seq);
       
  5536 
       
  5537     if (ctxt->attsDefault != NULL) {
       
  5538         xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
       
  5539         ctxt->attsDefault = NULL;
       
  5540     }
       
  5541     if (ctxt->attsSpecial != NULL) {
       
  5542         xmlHashFree(ctxt->attsSpecial, NULL);
       
  5543         ctxt->attsSpecial = NULL;
       
  5544     }
       
  5545 }
       
  5546 
       
  5547 /**
       
  5548  * htmlCtxtUseOptions:
       
  5549  * @param ctxt an HTML parser context
       
  5550  * @param options a combination of htmlParserOption(s)
       
  5551  *
       
  5552  * Applies the options to the parser context
       
  5553  *
       
  5554  * Returns 0 in case of success, the set of unknown or unimplemented options
       
  5555  *         in case of error.
       
  5556  */
       
  5557 int
       
  5558 htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
       
  5559 {
       
  5560     if (options & HTML_PARSE_NOWARNING) {
       
  5561         ctxt->sax->warning = NULL;
       
  5562         ctxt->vctxt.warning = NULL;
       
  5563         options -= XML_PARSE_NOWARNING;
       
  5564         ctxt->options |= XML_PARSE_NOWARNING;
       
  5565     }
       
  5566     if (options & HTML_PARSE_NOERROR) {
       
  5567         ctxt->sax->error = NULL;
       
  5568         ctxt->vctxt.error = NULL;
       
  5569         ctxt->sax->fatalError = NULL;
       
  5570         options -= XML_PARSE_NOERROR;
       
  5571         ctxt->options |= XML_PARSE_NOERROR;
       
  5572     }
       
  5573     if (options & HTML_PARSE_PEDANTIC) {
       
  5574         ctxt->pedantic = 1;
       
  5575         options -= XML_PARSE_PEDANTIC;
       
  5576         ctxt->options |= XML_PARSE_PEDANTIC;
       
  5577     } else
       
  5578         ctxt->pedantic = 0;
       
  5579     if (options & XML_PARSE_NOBLANKS) {
       
  5580         ctxt->keepBlanks = 0;
       
  5581         ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
       
  5582         options -= XML_PARSE_NOBLANKS;
       
  5583         ctxt->options |= XML_PARSE_NOBLANKS;
       
  5584     } else
       
  5585         ctxt->keepBlanks = 1;
       
  5586     ctxt->dictNames = 0;
       
  5587     return (options);
       
  5588 }
       
  5589 
       
  5590 /**
       
  5591  * htmlDoRead:
       
  5592  * @param ctxt an HTML parser context
       
  5593  * @param URL the base URL to use for the document
       
  5594  * @param encoding the document encoding, or NULL
       
  5595  * @param options a combination of htmlParserOption(s)
       
  5596  * @param reuse keep the context for reuse
       
  5597  *
       
  5598  * Common front-end for the htmlRead functions
       
  5599  *
       
  5600  * Returns the resulting document tree or NULL
       
  5601  */
       
  5602 static htmlDocPtr
       
  5603 htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
       
  5604           int options, int reuse)
       
  5605 {
       
  5606     htmlDocPtr ret;
       
  5607 
       
  5608     htmlCtxtUseOptions(ctxt, options);
       
  5609     ctxt->html = 1;
       
  5610     if (encoding != NULL) {
       
  5611         xmlCharEncodingHandlerPtr hdlr;
       
  5612 
       
  5613         hdlr = xmlFindCharEncodingHandler(encoding);
       
  5614         if (hdlr != NULL)
       
  5615             xmlSwitchToEncoding(ctxt, hdlr);
       
  5616     }
       
  5617     if ((URL != NULL) && (ctxt->input != NULL) &&
       
  5618         (ctxt->input->filename == NULL))
       
  5619         ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
       
  5620     htmlParseDocument(ctxt);
       
  5621     ret = ctxt->myDoc;
       
  5622     ctxt->myDoc = NULL;
       
  5623     if (!reuse) {
       
  5624         if ((ctxt->dictNames) &&
       
  5625             (ret != NULL) &&
       
  5626             (ret->dict == ctxt->dict))
       
  5627             ctxt->dict = NULL;
       
  5628         xmlFreeParserCtxt(ctxt);
       
  5629     }
       
  5630     return (ret);
       
  5631 }
       
  5632 
       
  5633 /**
       
  5634  * htmlReadDoc:
       
  5635  * @param cur a pointer to a zero terminated string
       
  5636  * @param URL the base URL to use for the document
       
  5637  * @param encoding the document encoding, or NULL
       
  5638  * @param options a combination of htmlParserOption(s)
       
  5639  *
       
  5640  * parse an XML in-memory document and build a tree.
       
  5641  *
       
  5642  * Returns the resulting document tree
       
  5643  */
       
  5644 htmlDocPtr
       
  5645 htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
       
  5646 {
       
  5647     htmlParserCtxtPtr ctxt;
       
  5648 
       
  5649     if (cur == NULL)
       
  5650         return (NULL);
       
  5651 
       
  5652     ctxt = xmlCreateDocParserCtxt(cur, sizeof(cur));
       
  5653     if (ctxt == NULL)
       
  5654         return (NULL);
       
  5655     return (htmlDoRead(ctxt, URL, encoding, options, 0));
       
  5656 }
       
  5657 
       
  5658 /**
       
  5659  * htmlReadFile:
       
  5660  * @param filename a file or URL
       
  5661  * @param encoding the document encoding, or NULL
       
  5662  * @param options a combination of htmlParserOption(s)
       
  5663  *
       
  5664  * parse an XML file from the filesystem or the network.
       
  5665  *
       
  5666  * Returns the resulting document tree
       
  5667  */
       
  5668 htmlDocPtr
       
  5669 htmlReadFile(const char *filename, const char *encoding, int options)
       
  5670 {
       
  5671     htmlParserCtxtPtr ctxt;
       
  5672 
       
  5673     ctxt = htmlCreateFileParserCtxt(filename, encoding);
       
  5674     if (ctxt == NULL)
       
  5675         return (NULL);
       
  5676     return (htmlDoRead(ctxt, NULL, NULL, options, 0));
       
  5677 }
       
  5678 
       
  5679 /**
       
  5680  * htmlReadMemory:
       
  5681  * @param buffer a pointer to a char array
       
  5682  * @param size the size of the array
       
  5683  * @param URL the base URL to use for the document
       
  5684  * @param encoding the document encoding, or NULL
       
  5685  * @param options a combination of htmlParserOption(s)
       
  5686  *
       
  5687  * parse an XML in-memory document and build a tree.
       
  5688  *
       
  5689  * Returns the resulting document tree
       
  5690  */
       
  5691 htmlDocPtr
       
  5692 htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
       
  5693 {
       
  5694     htmlParserCtxtPtr ctxt;
       
  5695 
       
  5696     ctxt = xmlCreateMemoryParserCtxt(buffer, size);
       
  5697     if (ctxt == NULL)
       
  5698         return (NULL);
       
  5699     return (htmlDoRead(ctxt, URL, encoding, options, 0));
       
  5700 }
       
  5701 
       
  5702 /**
       
  5703  * htmlReadFd:
       
  5704  * @param fd an open file descriptor
       
  5705  * @param URL the base URL to use for the document
       
  5706  * @param encoding the document encoding, or NULL
       
  5707  * @param options a combination of htmlParserOption(s)
       
  5708  *
       
  5709  * parse an XML from a file descriptor and build a tree.
       
  5710  *
       
  5711  * Returns the resulting document tree
       
  5712  */
       
  5713 htmlDocPtr
       
  5714 htmlReadFd(int fd, const char *URL, const char *encoding, int options)
       
  5715 {
       
  5716     htmlParserCtxtPtr ctxt;
       
  5717     xmlParserInputBufferPtr input;
       
  5718     xmlParserInputPtr stream;
       
  5719 
       
  5720     if (fd < 0)
       
  5721         return (NULL);
       
  5722 
       
  5723     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
       
  5724     if (input == NULL)
       
  5725         return (NULL);
       
  5726     ctxt = xmlNewParserCtxt();
       
  5727     if (ctxt == NULL) {
       
  5728         xmlFreeParserInputBuffer(input);
       
  5729         return (NULL);
       
  5730     }
       
  5731     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
       
  5732     if (stream == NULL) {
       
  5733         xmlFreeParserInputBuffer(input);
       
  5734         xmlFreeParserCtxt(ctxt);
       
  5735         return (NULL);
       
  5736     }
       
  5737     inputPush(ctxt, stream);
       
  5738     return (htmlDoRead(ctxt, URL, encoding, options, 0));
       
  5739 }
       
  5740 
       
  5741 /**
       
  5742  * htmlReadIO:
       
  5743  * @param ioread an I/O read function
       
  5744  * @param ioclose an I/O close function
       
  5745  * @param ioctx an I/O handler
       
  5746  * @param URL the base URL to use for the document
       
  5747  * @param encoding the document encoding, or NULL
       
  5748  * @param options a combination of htmlParserOption(s)
       
  5749  *
       
  5750  * parse an HTML document from I/O functions and source and build a tree.
       
  5751  *
       
  5752  * Returns the resulting document tree
       
  5753  */
       
  5754 htmlDocPtr
       
  5755 htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
       
  5756           void *ioctx, const char *URL, const char *encoding, int options)
       
  5757 {
       
  5758     htmlParserCtxtPtr ctxt;
       
  5759     xmlParserInputBufferPtr input;
       
  5760     xmlParserInputPtr stream;
       
  5761 
       
  5762     if (ioread == NULL)
       
  5763         return (NULL);
       
  5764 
       
  5765     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
       
  5766                                          XML_CHAR_ENCODING_NONE);
       
  5767     if (input == NULL)
       
  5768         return (NULL);
       
  5769     ctxt = xmlNewParserCtxt();
       
  5770     if (ctxt == NULL) {
       
  5771         xmlFreeParserInputBuffer(input);
       
  5772         return (NULL);
       
  5773     }
       
  5774     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
       
  5775     if (stream == NULL) {
       
  5776         xmlFreeParserInputBuffer(input);
       
  5777         xmlFreeParserCtxt(ctxt);
       
  5778         return (NULL);
       
  5779     }
       
  5780     inputPush(ctxt, stream);
       
  5781     return (htmlDoRead(ctxt, URL, encoding, options, 0));
       
  5782 }
       
  5783 
       
  5784 /**
       
  5785  * htmlCtxtReadDoc:
       
  5786  * @param ctxt an HTML parser context
       
  5787  * @param cur a pointer to a zero terminated string
       
  5788  * @param URL the base URL to use for the document
       
  5789  * @param encoding the document encoding, or NULL
       
  5790  * @param options a combination of htmlParserOption(s)
       
  5791  *
       
  5792  * parse an XML in-memory document and build a tree.
       
  5793  * This reuses the existing ctxt parser context
       
  5794  *
       
  5795  * Returns the resulting document tree
       
  5796  */
       
  5797 htmlDocPtr
       
  5798 htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
       
  5799                const char *URL, const char *encoding, int options)
       
  5800 {
       
  5801     xmlParserInputPtr stream;
       
  5802 
       
  5803     if (cur == NULL)
       
  5804         return (NULL);
       
  5805     if (ctxt == NULL)
       
  5806         return (NULL);
       
  5807 
       
  5808     htmlCtxtReset(ctxt);
       
  5809 
       
  5810     stream = xmlNewStringInputStream(ctxt, cur);
       
  5811     if (stream == NULL) {
       
  5812         return (NULL);
       
  5813     }
       
  5814     inputPush(ctxt, stream);
       
  5815     return (htmlDoRead(ctxt, URL, encoding, options, 1));
       
  5816 }
       
  5817 
       
  5818 /**
       
  5819  * htmlCtxtReadFile:
       
  5820  * @param ctxt an HTML parser context
       
  5821  * @param filename a file or URL
       
  5822  * @param encoding the document encoding, or NULL
       
  5823  * @param options a combination of htmlParserOption(s)
       
  5824  *
       
  5825  * parse an XML file from the filesystem or the network.
       
  5826  * This reuses the existing ctxt parser context
       
  5827  *
       
  5828  * Returns the resulting document tree
       
  5829  */
       
  5830 htmlDocPtr
       
  5831 htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
       
  5832                 const char *encoding, int options)
       
  5833 {
       
  5834     xmlParserInputPtr stream;
       
  5835 
       
  5836     if (filename == NULL)
       
  5837         return (NULL);
       
  5838     if (ctxt == NULL)
       
  5839         return (NULL);
       
  5840 
       
  5841     htmlCtxtReset(ctxt);
       
  5842 
       
  5843     stream = xmlNewInputFromFile(ctxt, filename);
       
  5844     if (stream == NULL) {
       
  5845         return (NULL);
       
  5846     }
       
  5847     inputPush(ctxt, stream);
       
  5848     return (htmlDoRead(ctxt, NULL, encoding, options, 1));
       
  5849 }
       
  5850 
       
  5851 /**
       
  5852  * htmlCtxtReadMemory:
       
  5853  * @param ctxt an HTML parser context
       
  5854  * @param buffer a pointer to a char array
       
  5855  * @param size the size of the array
       
  5856  * @param URL the base URL to use for the document
       
  5857  * @param encoding the document encoding, or NULL
       
  5858  * @param options a combination of htmlParserOption(s)
       
  5859  *
       
  5860  * parse an XML in-memory document and build a tree.
       
  5861  * This reuses the existing ctxt parser context
       
  5862  *
       
  5863  * Returns the resulting document tree
       
  5864  */
       
  5865 htmlDocPtr
       
  5866 htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
       
  5867                   const char *URL, const char *encoding, int options)
       
  5868 {
       
  5869     xmlParserInputBufferPtr input;
       
  5870     xmlParserInputPtr stream;
       
  5871 
       
  5872     if (ctxt == NULL)
       
  5873         return (NULL);
       
  5874     if (buffer == NULL)
       
  5875         return (NULL);
       
  5876 
       
  5877     htmlCtxtReset(ctxt);
       
  5878 
       
  5879     input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
       
  5880     if (input == NULL) {
       
  5881         return(NULL);
       
  5882     }
       
  5883 
       
  5884     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
       
  5885     if (stream == NULL) {
       
  5886         xmlFreeParserInputBuffer(input);
       
  5887         return(NULL);
       
  5888     }
       
  5889 
       
  5890     inputPush(ctxt, stream);
       
  5891     return (htmlDoRead(ctxt, URL, encoding, options, 1));
       
  5892 }
       
  5893 
       
  5894 /**
       
  5895  * htmlCtxtReadFd:
       
  5896  * @param ctxt an HTML parser context
       
  5897  * @param fd an open file descriptor
       
  5898  * @param URL the base URL to use for the document
       
  5899  * @param encoding the document encoding, or NULL
       
  5900  * @param options a combination of htmlParserOption(s)
       
  5901  *
       
  5902  * parse an XML from a file descriptor and build a tree.
       
  5903  * This reuses the existing ctxt parser context
       
  5904  *
       
  5905  * Returns the resulting document tree
       
  5906  */
       
  5907 htmlDocPtr
       
  5908 htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
       
  5909               const char *URL, const char *encoding, int options)
       
  5910 {
       
  5911     xmlParserInputBufferPtr input;
       
  5912     xmlParserInputPtr stream;
       
  5913 
       
  5914     if (fd < 0)
       
  5915         return (NULL);
       
  5916     if (ctxt == NULL)
       
  5917         return (NULL);
       
  5918 
       
  5919     htmlCtxtReset(ctxt);
       
  5920 
       
  5921 
       
  5922     input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
       
  5923     if (input == NULL)
       
  5924         return (NULL);
       
  5925     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
       
  5926     if (stream == NULL) {
       
  5927         xmlFreeParserInputBuffer(input);
       
  5928         return (NULL);
       
  5929     }
       
  5930     inputPush(ctxt, stream);
       
  5931     return (htmlDoRead(ctxt, URL, encoding, options, 1));
       
  5932 }
       
  5933 
       
  5934 /**
       
  5935  * htmlCtxtReadIO:
       
  5936  * @param ctxt an HTML parser context
       
  5937  * @param ioread an I/O read function
       
  5938  * @param ioclose an I/O close function
       
  5939  * @param ioctx an I/O handler
       
  5940  * @param URL the base URL to use for the document
       
  5941  * @param encoding the document encoding, or NULL
       
  5942  * @param options a combination of htmlParserOption(s)
       
  5943  *
       
  5944  * parse an HTML document from I/O functions and source and build a tree.
       
  5945  * This reuses the existing ctxt parser context
       
  5946  *
       
  5947  * Returns the resulting document tree
       
  5948  */
       
  5949 htmlDocPtr
       
  5950 htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
       
  5951               xmlInputCloseCallback ioclose, void *ioctx,
       
  5952               const char *URL,
       
  5953               const char *encoding, int options)
       
  5954 {
       
  5955     xmlParserInputBufferPtr input;
       
  5956     xmlParserInputPtr stream;
       
  5957 
       
  5958     if (ioread == NULL)
       
  5959         return (NULL);
       
  5960     if (ctxt == NULL)
       
  5961         return (NULL);
       
  5962 
       
  5963     htmlCtxtReset(ctxt);
       
  5964 
       
  5965     input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
       
  5966                                          XML_CHAR_ENCODING_NONE);
       
  5967     if (input == NULL)
       
  5968         return (NULL);
       
  5969     stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
       
  5970     if (stream == NULL) {
       
  5971         xmlFreeParserInputBuffer(input);
       
  5972         return (NULL);
       
  5973     }
       
  5974     inputPush(ctxt, stream);
       
  5975     return (htmlDoRead(ctxt, URL, encoding, options, 1));
       
  5976 }
       
  5977 
       
  5978 #endif /* LIBXML_HTML_ENABLED */