|
1 /* |
|
2 * Summary: interface for an HTML 4.0 non-verifying parser |
|
3 * Description: this module implements an HTML 4.0 non-verifying parser |
|
4 * with API compatible with the XML parser ones. It should |
|
5 * be able to parse "real world" HTML, even if severely |
|
6 * broken from a specification point of view. |
|
7 * |
|
8 * Copy: See Copyright for the status of this software. |
|
9 * |
|
10 * Author: Daniel Veillard |
|
11 * Portion Copyright © 2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. |
|
12 */ |
|
13 |
|
14 /** @file |
|
15 @publishedAll |
|
16 @released |
|
17 */ |
|
18 |
|
19 #ifndef HTML_PARSER_H |
|
20 #define HTML_PARSER_H |
|
21 |
|
22 #include <stdapis/libxml2/libxml2_parser.h> |
|
23 |
|
24 #ifdef __cplusplus |
|
25 extern "C" { |
|
26 #endif |
|
27 |
|
28 /* |
|
29 * Most of the back-end structures from XML and HTML are shared. |
|
30 */ |
|
31 typedef xmlParserCtxt htmlParserCtxt; |
|
32 typedef xmlParserCtxtPtr htmlParserCtxtPtr; |
|
33 typedef xmlParserNodeInfo htmlParserNodeInfo; |
|
34 typedef xmlSAXHandler htmlSAXHandler; |
|
35 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; |
|
36 typedef xmlParserInput htmlParserInput; |
|
37 typedef xmlParserInputPtr htmlParserInputPtr; |
|
38 typedef xmlDocPtr htmlDocPtr; |
|
39 typedef xmlNodePtr htmlNodePtr; |
|
40 |
|
/*
 * Internal description of an HTML element, representing HTML 4.01
 * and XHTML 1.0 (which share the same structure).
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;           /* The tag name */
    char startTag;              /* Whether the start tag can be implied */
    char endTag;                /* Whether the end tag can be implied */
    char saveEndTag;            /* Whether the end tag should be saved */
    char empty;                 /* Is this an empty element ? */
    char depr;                  /* Is this a deprecated element ? */
    char dtd;                   /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;              /* is this a block 0 or inline 1 element */
    const char *desc;           /* the description */

    /* NRK Jan.2003
     * New fields encapsulating HTML structure
     *
     * Bugs:
     *   This is a very limited representation. It fails to tell us when
     *   an element *requires* subelements (we only have whether they're
     *   allowed or not), and it doesn't tell us where CDATA and PCDATA
     *   are allowed. Some element relationships are not fully represented:
     *   these are flagged with the word MODIFIER
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char *defaultsubelt;  /* subelement for suggested auto-repair
                                   if necessary or NULL */
    const char **attrs_opt;     /* Optional Attributes */
    const char **attrs_depr;    /* Additional deprecated attributes */
    const char **attrs_req;     /* Required attributes */
};
|
75 |
|
/*
 * Internal description of an HTML entity.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;     /* the UNICODE value for the character */
    const char *name;       /* The entity name */
    const char *desc;       /* the description */
};
|
86 |
|
#if defined(LIBXML_HTML_ENABLED) || defined(XMLENGINE_XSLT)
/*
 * There are only a few public functions.
 *
 * htmlTagLookup is declared when either the HTML parser or the XSLT
 * engine is enabled; it takes a tag name and returns a pointer to a
 * read-only element description.
 */
XMLPUBFUN const htmlElemDesc * XMLCALL
            htmlTagLookup(const xmlChar *tag);

#endif /* LIBXML_HTML_ENABLED || XMLENGINE_XSLT */
|
95 |
|
96 #ifdef LIBXML_HTML_ENABLED |
|
97 |
|
98 XMLPUBFUN const htmlEntityDesc * XMLCALL |
|
99 htmlEntityLookup(const xmlChar *name); |
|
100 XMLPUBFUN const htmlEntityDesc * XMLCALL |
|
101 htmlEntityValueLookup(unsigned int value); |
|
102 |
|
103 XMLPUBFUN int XMLCALL |
|
104 htmlIsAutoClosed(htmlDocPtr doc, |
|
105 htmlNodePtr elem); |
|
106 XMLPUBFUN int XMLCALL |
|
107 htmlAutoCloseTag(htmlDocPtr doc, |
|
108 const xmlChar *name, |
|
109 htmlNodePtr elem); |
|
110 XMLPUBFUN const htmlEntityDesc * XMLCALL |
|
111 htmlParseEntityRef(htmlParserCtxtPtr ctxt, |
|
112 const xmlChar **str); |
|
113 XMLPUBFUN int XMLCALL |
|
114 htmlParseCharRef(htmlParserCtxtPtr ctxt); |
|
115 XMLPUBFUN void XMLCALL |
|
116 htmlParseElement(htmlParserCtxtPtr ctxt); |
|
117 |
|
118 XMLPUBFUN htmlParserCtxtPtr XMLCALL |
|
119 htmlCreateMemoryParserCtxt(const char *buffer, |
|
120 int size); |
|
121 |
|
122 XMLPUBFUN int XMLCALL |
|
123 htmlParseDocument(htmlParserCtxtPtr ctxt); |
|
124 XMLPUBFUN htmlDocPtr XMLCALL |
|
125 htmlSAXParseDoc (xmlChar *cur, |
|
126 const char *encoding, |
|
127 htmlSAXHandlerPtr sax, |
|
128 void *userData); |
|
129 XMLPUBFUN htmlDocPtr XMLCALL |
|
130 htmlParseDoc (xmlChar *cur, |
|
131 const char *encoding); |
|
132 XMLPUBFUN htmlDocPtr XMLCALL |
|
133 htmlSAXParseFile(const char *filename, |
|
134 const char *encoding, |
|
135 htmlSAXHandlerPtr sax, |
|
136 void *userData); |
|
137 XMLPUBFUN htmlDocPtr XMLCALL |
|
138 htmlParseFile (const char *filename, |
|
139 const char *encoding); |
|
140 XMLPUBFUN int XMLCALL |
|
141 UTF8ToHtml (unsigned char *out, |
|
142 int *outlen, |
|
143 const unsigned char *in, |
|
144 int *inlen); |
|
145 XMLPUBFUN int XMLCALL |
|
146 htmlEncodeEntities(unsigned char *out, |
|
147 int *outlen, |
|
148 const unsigned char *in, |
|
149 int *inlen, int quoteChar); |
|
150 XMLPUBFUN int XMLCALL |
|
151 htmlIsScriptAttribute(const xmlChar *name); |
|
152 XMLPUBFUN int XMLCALL |
|
153 htmlHandleOmittedElem(int val); |
|
154 |
|
#ifdef LIBXML_PUSH_ENABLED
/**
 * Interfaces for the Push mode.
 *
 * These are only declared when LIBXML_PUSH_ENABLED is defined.
 * A context is created with htmlCreatePushParserCtxt() and input is
 * then supplied in pieces through htmlParseChunk(); `size` is the
 * length of `chunk` and `terminate` is passed with each chunk.
 */
XMLPUBFUN htmlParserCtxtPtr XMLCALL
            htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                                     void *user_data,
                                     const char *chunk,
                                     int size,
                                     const char *filename,
                                     xmlCharEncoding enc);
XMLPUBFUN int XMLCALL
            htmlParseChunk(htmlParserCtxtPtr ctxt,
                           const char *chunk,
                           int size,
                           int terminate);
#endif /* LIBXML_PUSH_ENABLED */
|
172 |
|
173 XMLPUBFUN void XMLCALL |
|
174 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); |
|
175 |
|
/*
 * New set of simpler/more flexible APIs
 */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls. Each option is a distinct
 * bit, so several options may be OR-ed together into the `options`
 * argument.
 */
typedef enum {
    HTML_PARSE_NOERROR   = 1<<5,    /* suppress error reports */
    HTML_PARSE_NOWARNING = 1<<6,    /* suppress warning reports */
    HTML_PARSE_PEDANTIC  = 1<<7,    /* pedantic error reporting */
    HTML_PARSE_NOBLANKS  = 1<<8,    /* remove blank nodes */
    HTML_PARSE_NONET     = 1<<11    /* Forbid network access */
} htmlParserOption;
|
192 |
|
193 XMLPUBFUN void XMLCALL |
|
194 htmlCtxtReset (htmlParserCtxtPtr ctxt); |
|
195 XMLPUBFUN int XMLCALL |
|
196 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, |
|
197 int options); |
|
198 XMLPUBFUN htmlDocPtr XMLCALL |
|
199 htmlReadDoc (const xmlChar *cur, |
|
200 const char *URL, |
|
201 const char *encoding, |
|
202 int options); |
|
203 XMLPUBFUN htmlDocPtr XMLCALL |
|
204 htmlReadFile (const char *URL, |
|
205 const char *encoding, |
|
206 int options); |
|
207 XMLPUBFUN htmlDocPtr XMLCALL |
|
208 htmlReadMemory (const char *buffer, |
|
209 int size, |
|
210 const char *URL, |
|
211 const char *encoding, |
|
212 int options); |
|
213 XMLPUBFUN htmlDocPtr XMLCALL |
|
214 htmlReadFd (int fd, |
|
215 const char *URL, |
|
216 const char *encoding, |
|
217 int options); |
|
218 XMLPUBFUN htmlDocPtr XMLCALL |
|
219 htmlReadIO (xmlInputReadCallback ioread, |
|
220 xmlInputCloseCallback ioclose, |
|
221 void *ioctx, |
|
222 const char *URL, |
|
223 const char *encoding, |
|
224 int options); |
|
225 XMLPUBFUN htmlDocPtr XMLCALL |
|
226 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, |
|
227 const xmlChar *cur, |
|
228 const char *URL, |
|
229 const char *encoding, |
|
230 int options); |
|
231 XMLPUBFUN htmlDocPtr XMLCALL |
|
232 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, |
|
233 const char *filename, |
|
234 const char *encoding, |
|
235 int options); |
|
236 XMLPUBFUN htmlDocPtr XMLCALL |
|
237 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, |
|
238 const char *buffer, |
|
239 int size, |
|
240 const char *URL, |
|
241 const char *encoding, |
|
242 int options); |
|
243 XMLPUBFUN htmlDocPtr XMLCALL |
|
244 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, |
|
245 int fd, |
|
246 const char *URL, |
|
247 const char *encoding, |
|
248 int options); |
|
249 XMLPUBFUN htmlDocPtr XMLCALL |
|
250 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, |
|
251 xmlInputReadCallback ioread, |
|
252 xmlInputCloseCallback ioclose, |
|
253 void *ioctx, |
|
254 const char *URL, |
|
255 const char *encoding, |
|
256 int options); |
|
257 |
|
/* NRK/Jan2003: further knowledge of HTML structure
 *
 * The values are bit flags. HTML_REQUIRED (0xc) deliberately contains
 * the HTML_VALID bit (0x4), so a "valid" test written as
 * (status & HTML_VALID) also succeeds for required items.
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;
|
267 |
|
268 /* Using htmlElemDesc rather than name here, to emphasise the fact |
|
269 that otherwise there's a lookup overhead |
|
270 */ |
|
271 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; |
|
272 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; |
|
273 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; |
|
274 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; |
|
/**
 * htmlDefaultSubelement:
 * @param elt HTML element
 *
 * Returns the default subelement for this element
 *
 * Both the argument and the whole expansion are parenthesised so that
 * the macro behaves like a function call for any pointer expression
 * (for example `p + 1` or `cond ? a : b`). The argument is evaluated
 * exactly once.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
|
/**
 * htmlElementAllowedHereDesc:
 * @param parent HTML parent element
 * @param elt HTML element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.
 *
 * Thin wrapper that forwards to htmlElementAllowedHere(), passing
 * the parent description and the child's tag name (elt->name).
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
    htmlElementAllowedHere((parent), (elt)->name)
|
/**
 * htmlRequiredAttrs:
 * @param elt HTML element
 *
 * Returns the attributes required for the specified element.
 *
 * The expansion is fully parenthesised so it can be used safely
 * inside any larger expression; the argument is evaluated once.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
|
301 |
|
302 |
|
303 #endif /* LIBXML_HTML_ENABLED */ |
|
304 |
|
305 #ifdef __cplusplus |
|
306 } |
|
307 #endif |
|
308 |
|
309 #endif /* HTML_PARSER_H */ |