xml/libxml2libs/inc/libxml2/libxml2_htmlparser.h
changeset 0 e35f40988205
equal deleted inserted replaced
-1:000000000000 0:e35f40988205
       
     1 /*
       
     2  * Summary: interface for an HTML 4.0 non-verifying parser
       
     3  * Description: this module implements an HTML 4.0 non-verifying parser
       
     4  *              with API compatible with the XML parser ones. It should
       
     5  *              be able to parse "real world" HTML, even if severely
       
     6  *              broken from a specification point of view.
       
     7  *
       
     8  * Copy: See Copyright for the status of this software.
       
     9  *
       
    10  * Author: Daniel Veillard
       
    11  * Portion Copyright © 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
       
    12  */
       
    13 
       
    14 /** @file
       
    15 @publishedAll
       
    16 @released
       
    17 */
       
    18 
       
    19 #ifndef HTML_PARSER_H
       
    20 #define HTML_PARSER_H
       
    21 
       
    22 #include <stdapis/libxml2/libxml2_parser.h>
       
    23 
       
    24 #ifdef __cplusplus
       
    25 extern "C" {
       
    26 #endif
       
    27 
       
    28 /*
       
    29  * Most of the back-end structures from XML and HTML are shared.
       
    30  */
       
    31 typedef xmlParserCtxt htmlParserCtxt;
       
    32 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
       
    33 typedef xmlParserNodeInfo htmlParserNodeInfo;
       
    34 typedef xmlSAXHandler htmlSAXHandler;
       
    35 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
       
    36 typedef xmlParserInput htmlParserInput;
       
    37 typedef xmlParserInputPtr htmlParserInputPtr;
       
    38 typedef xmlDocPtr htmlDocPtr;
       
    39 typedef xmlNodePtr htmlNodePtr;
       
    40 
       
    41 /*
       
    42  * Internal description of an HTML element, representing HTML 4.01
       
    43  * and XHTML 1.0 (which share the same structure).
       
    44  */
       
    45 typedef struct _htmlElemDesc htmlElemDesc;
       
    46 typedef htmlElemDesc *htmlElemDescPtr;
       
    47 struct _htmlElemDesc {
       
    48     const char *name;   /* The tag name */
       
    49     char startTag;      /* Whether the start tag can be implied */
       
    50     char endTag;        /* Whether the end tag can be implied */
       
    51     char saveEndTag;    /* Whether the end tag should be saved */
       
    52     char empty;         /* Is this an empty element ? */
       
    53     char depr;          /* Is this a deprecated element ? */
       
    54     char dtd;           /* 1: only in Loose DTD, 2: only Frameset one */
       
    55     char isinline;      /* is this a block 0 or inline 1 element */
       
    56     const char *desc;   /* the description */
       
    57 
       
    58 /* NRK Jan.2003
       
    59  * New fields encapsulating HTML structure
       
    60  *
       
    61  * Bugs:
       
    62  *      This is a very limited representation.  It fails to tell us when
       
    63  *      an element *requires* subelements (we only have whether they're
       
    64  *      allowed or not), and it doesn't tell us where CDATA and PCDATA
       
    65  *      are allowed.  Some element relationships are not fully represented:
       
    66  *      these are flagged with the word MODIFIER
       
    67  */
       
    68     const char** subelts;               /* allowed sub-elements of this element */
       
    69     const char* defaultsubelt;  /* subelement for suggested auto-repair
       
    70                                            if necessary or NULL */
       
    71     const char** attrs_opt;             /* Optional Attributes */
       
    72     const char** attrs_depr;            /* Additional deprecated attributes */
       
    73     const char** attrs_req;             /* Required attributes */
       
    74 };
       
    75 
       
    76 /*
       
    77  * Internal description of an HTML entity.
       
    78  */
       
    79 typedef struct _htmlEntityDesc htmlEntityDesc;
       
    80 typedef htmlEntityDesc *htmlEntityDescPtr;
       
    81 struct _htmlEntityDesc {
       
    82     unsigned int value; /* the UNICODE value for the character */
       
    83     const char *name;   /* The entity name */
       
    84     const char *desc;   /* the description */
       
    85 };
       
    86 
       
    87 #if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
       
    88 /*
       
    89  * There is only few public functions.
       
    90  */
       
    91 XMLPUBFUN const htmlElemDesc * XMLCALL
       
    92                         htmlTagLookup   (const xmlChar *tag);
       
    93 
       
    94 #endif /* LIBXML_HTML_ENABLED || XMLENGINE_XSLT ) */
       
    95 
       
    96 #ifdef LIBXML_HTML_ENABLED
       
    97 
       
    98 XMLPUBFUN const htmlEntityDesc * XMLCALL
       
    99                         htmlEntityLookup(const xmlChar *name);
       
   100 XMLPUBFUN const htmlEntityDesc * XMLCALL
       
   101                         htmlEntityValueLookup(unsigned int value);
       
   102 
       
   103 XMLPUBFUN int XMLCALL
       
   104                         htmlIsAutoClosed(htmlDocPtr doc,
       
   105                                          htmlNodePtr elem);
       
   106 XMLPUBFUN int XMLCALL
       
   107                         htmlAutoCloseTag(htmlDocPtr doc,
       
   108                                          const xmlChar *name,
       
   109                                          htmlNodePtr elem);
       
   110 XMLPUBFUN const htmlEntityDesc * XMLCALL
       
   111                         htmlParseEntityRef(htmlParserCtxtPtr ctxt,
       
   112                                          const xmlChar **str);
       
   113 XMLPUBFUN int XMLCALL
       
   114                         htmlParseCharRef(htmlParserCtxtPtr ctxt);
       
   115 XMLPUBFUN void XMLCALL
       
   116                         htmlParseElement(htmlParserCtxtPtr ctxt);
       
   117 
       
   118 XMLPUBFUN htmlParserCtxtPtr XMLCALL
       
   119                         htmlCreateMemoryParserCtxt(const char *buffer,
       
   120                                                    int size);
       
   121 
       
   122 XMLPUBFUN int XMLCALL
       
   123                         htmlParseDocument(htmlParserCtxtPtr ctxt);
       
   124 XMLPUBFUN htmlDocPtr XMLCALL
       
   125                         htmlSAXParseDoc (xmlChar *cur,
       
   126                                          const char *encoding,
       
   127                                          htmlSAXHandlerPtr sax,
       
   128                                          void *userData);
       
   129 XMLPUBFUN htmlDocPtr XMLCALL
       
   130                         htmlParseDoc    (xmlChar *cur,
       
   131                                          const char *encoding);
       
   132 XMLPUBFUN htmlDocPtr XMLCALL
       
   133                         htmlSAXParseFile(const char *filename,
       
   134                                          const char *encoding,
       
   135                                          htmlSAXHandlerPtr sax,
       
   136                                          void *userData);
       
   137 XMLPUBFUN htmlDocPtr XMLCALL
       
   138                         htmlParseFile   (const char *filename,
       
   139                                          const char *encoding);
       
   140 XMLPUBFUN int XMLCALL
       
   141                         UTF8ToHtml      (unsigned char *out,
       
   142                                          int *outlen,
       
   143                                          const unsigned char *in,
       
   144                                          int *inlen);
       
   145 XMLPUBFUN int XMLCALL
       
   146                         htmlEncodeEntities(unsigned char *out,
       
   147                                          int *outlen,
       
   148                                          const unsigned char *in,
       
   149                                          int *inlen, int quoteChar);
       
   150 XMLPUBFUN int XMLCALL
       
   151                         htmlIsScriptAttribute(const xmlChar *name);
       
   152 XMLPUBFUN int XMLCALL
       
   153                         htmlHandleOmittedElem(int val);
       
   154 
       
   155 #ifdef LIBXML_PUSH_ENABLED
       
   156 /**
       
   157  * Interfaces for the Push mode.
       
   158  */
       
   159 XMLPUBFUN htmlParserCtxtPtr XMLCALL
       
   160                         htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
       
   161                                                  void *user_data,
       
   162                                                  const char *chunk,
       
   163                                                  int size,
       
   164                                                  const char *filename,
       
   165                                                  xmlCharEncoding enc);
       
   166 XMLPUBFUN int XMLCALL
       
   167                         htmlParseChunk          (htmlParserCtxtPtr ctxt,
       
   168                                                  const char *chunk,
       
   169                                                  int size,
       
   170                                                  int terminate);
       
   171 #endif /* LIBXML_PUSH_ENABLED */
       
   172 
       
   173 XMLPUBFUN void XMLCALL
       
   174                         htmlFreeParserCtxt      (htmlParserCtxtPtr ctxt);
       
   175 
       
   176 /*
       
   177  * New set of simpler/more flexible APIs
       
   178  */
       
   179 /**
       
   180  * xmlParserOption:
       
   181  *
       
   182  * This is the set of XML parser options that can be passed down
       
   183  * to the xmlReadDoc() and similar calls.
       
   184  */
       
   185 typedef enum {
       
   186     HTML_PARSE_NOERROR  = 1<<5, /* suppress error reports */
       
   187     HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
       
   188     HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
       
   189     HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
       
   190     HTML_PARSE_NONET    = 1<<11 /* Forbid network access */
       
   191 } htmlParserOption;
       
   192 
       
   193 XMLPUBFUN void XMLCALL
       
   194                 htmlCtxtReset           (htmlParserCtxtPtr ctxt);
       
   195 XMLPUBFUN int XMLCALL
       
   196                 htmlCtxtUseOptions      (htmlParserCtxtPtr ctxt,
       
   197                                          int options);
       
   198 XMLPUBFUN htmlDocPtr XMLCALL
       
   199                 htmlReadDoc             (const xmlChar *cur,
       
   200                                          const char *URL,
       
   201                                          const char *encoding,
       
   202                                          int options);
       
   203 XMLPUBFUN htmlDocPtr XMLCALL
       
   204                 htmlReadFile            (const char *URL,
       
   205                                          const char *encoding,
       
   206                                          int options);
       
   207 XMLPUBFUN htmlDocPtr XMLCALL
       
   208                 htmlReadMemory          (const char *buffer,
       
   209                                          int size,
       
   210                                          const char *URL,
       
   211                                          const char *encoding,
       
   212                                          int options);
       
   213 XMLPUBFUN htmlDocPtr XMLCALL
       
   214                 htmlReadFd              (int fd,
       
   215                                          const char *URL,
       
   216                                          const char *encoding,
       
   217                                          int options);
       
   218 XMLPUBFUN htmlDocPtr XMLCALL
       
   219                 htmlReadIO              (xmlInputReadCallback ioread,
       
   220                                          xmlInputCloseCallback ioclose,
       
   221                                          void *ioctx,
       
   222                                          const char *URL,
       
   223                                          const char *encoding,
       
   224                                          int options);
       
   225 XMLPUBFUN htmlDocPtr XMLCALL
       
   226                 htmlCtxtReadDoc         (xmlParserCtxtPtr ctxt,
       
   227                                          const xmlChar *cur,
       
   228                                          const char *URL,
       
   229                                          const char *encoding,
       
   230                                          int options);
       
   231 XMLPUBFUN htmlDocPtr XMLCALL
       
   232                 htmlCtxtReadFile                (xmlParserCtxtPtr ctxt,
       
   233                                          const char *filename,
       
   234                                          const char *encoding,
       
   235                                          int options);
       
   236 XMLPUBFUN htmlDocPtr XMLCALL
       
   237                 htmlCtxtReadMemory              (xmlParserCtxtPtr ctxt,
       
   238                                          const char *buffer,
       
   239                                          int size,
       
   240                                          const char *URL,
       
   241                                          const char *encoding,
       
   242                                          int options);
       
   243 XMLPUBFUN htmlDocPtr XMLCALL
       
   244                 htmlCtxtReadFd          (xmlParserCtxtPtr ctxt,
       
   245                                          int fd,
       
   246                                          const char *URL,
       
   247                                          const char *encoding,
       
   248                                          int options);
       
   249 XMLPUBFUN htmlDocPtr XMLCALL
       
   250                 htmlCtxtReadIO          (xmlParserCtxtPtr ctxt,
       
   251                                          xmlInputReadCallback ioread,
       
   252                                          xmlInputCloseCallback ioclose,
       
   253                                          void *ioctx,
       
   254                                          const char *URL,
       
   255                                          const char *encoding,
       
   256                                          int options);
       
   257 
       
   258 /* NRK/Jan2003: further knowledge of HTML structure
       
   259  */
       
   260 typedef enum {
       
   261   HTML_NA = 0 ,         /* something we don't check at all */
       
   262   HTML_INVALID = 0x1 ,
       
   263   HTML_DEPRECATED = 0x2 ,
       
   264   HTML_VALID = 0x4 ,
       
   265   HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */
       
   266 } htmlStatus ;
       
   267 
       
   268 /* Using htmlElemDesc rather than name here, to emphasise the fact
       
   269    that otherwise there's a lookup overhead
       
   270 */
       
   271 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
       
   272 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
       
   273 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
       
   274 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
       
   275 /**
       
   276  * htmlDefaultSubelement:
       
   277  * @param elt HTML element
       
   278  *
       
   279  * Returns the default subelement for this element
       
   280  */
       
   281 #define htmlDefaultSubelement(elt) elt->defaultsubelt
       
   282 /**
       
   283  * htmlElementAllowedHereDesc:
       
   284  * @param parent HTML parent element
       
   285  * @param elt HTML element
       
   286  *
       
   287  * Checks whether an HTML element description may be a
       
   288  * direct child of the specified element.
       
   289  *
       
   290  * Returns 1 if allowed; 0 otherwise.
       
   291  */
       
   292 #define htmlElementAllowedHereDesc(parent,elt) \
       
   293         htmlElementAllowedHere((parent), (elt)->name)
       
   294 /**
       
   295  * htmlRequiredAttrs:
       
   296  * @param elt HTML element
       
   297  *
       
   298  * Returns the attributes required for the specified element.
       
   299  */
       
   300 #define htmlRequiredAttrs(elt) (elt)->attrs_req
       
   301 
       
   302 
       
   303 #endif /* LIBXML_HTML_ENABLED */
       
   304 
       
   305 #ifdef __cplusplus
       
   306 }
       
   307 #endif
       
   308 
       
   309 #endif /* HTML_PARSER_H */