/*
 * Summary: interface for an HTML 4.0 non-verifying parser
 * Description: this module implements an HTML 4.0 non-verifying parser
 *              with an API compatible with the XML parser ones. It should
 *              be able to parse "real world" HTML, even if severely
 *              broken from a specification point of view.
 *
 * Copy: See Copyright for the status of this software.
 *
 * Author: Daniel Veillard
 */

13 #ifndef __HTML_PARSER_H__ |
|
14 #define __HTML_PARSER_H__ |
|
15 #include <libxml/xmlversion.h> |
|
16 #include <libxml/parser.h> |
|
17 |
|
18 #ifdef LIBXML_HTML_ENABLED |
|
19 |
|
20 #ifdef __cplusplus |
|
21 extern "C" { |
|
22 #endif |
|
23 |
|
24 /* |
|
25 * Most of the back-end structures from XML and HTML are shared. |
|
26 */ |
|
27 typedef xmlParserCtxt htmlParserCtxt; |
|
28 typedef xmlParserCtxtPtr htmlParserCtxtPtr; |
|
29 typedef xmlParserNodeInfo htmlParserNodeInfo; |
|
30 typedef xmlSAXHandler htmlSAXHandler; |
|
31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; |
|
32 typedef xmlParserInput htmlParserInput; |
|
33 typedef xmlParserInputPtr htmlParserInputPtr; |
|
34 typedef xmlDocPtr htmlDocPtr; |
|
35 typedef xmlNodePtr htmlNodePtr; |
|
36 |
|
/*
 * Internal description of an HTML element, representing HTML 4.01
 * and XHTML 1.0 (which share the same structure).
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;           /* The tag name */
    char startTag;              /* Whether the start tag can be implied */
    char endTag;                /* Whether the end tag can be implied */
    char saveEndTag;            /* Whether the end tag should be saved */
    char empty;                 /* Is this an empty element ? */
    char depr;                  /* Is this a deprecated element ? */
    char dtd;                   /* 1: only in Loose DTD, 2: only Frameset one */
    char isinline;              /* is this a block 0 or inline 1 element */
    const char *desc;           /* the description */

    /* NRK Jan.2003
     * New fields encapsulating HTML structure
     *
     * Bugs:
     *   This is a very limited representation.  It fails to tell us when
     *   an element *requires* subelements (we only have whether they're
     *   allowed or not), and it doesn't tell us where CDATA and PCDATA
     *   are allowed.  Some element relationships are not fully represented:
     *   these are flagged with the word MODIFIER
     *
     * NOTE(review): the string lists below are presumably NULL-terminated
     * arrays -- confirm against the tables that populate them.
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char *defaultsubelt;  /* subelement for suggested auto-repair
                                   if necessary or NULL */
    const char **attrs_opt;     /* Optional Attributes */
    const char **attrs_depr;    /* Additional deprecated attributes */
    const char **attrs_req;     /* Required attributes */
};

/*
 * Internal description of an HTML entity.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int value;         /* the UNICODE value for the character */
    const char *name;           /* The entity name */
    const char *desc;           /* the description */
};

83 /* |
|
84 * There is only few public functions. |
|
85 */ |
|
86 XMLPUBFUN const htmlElemDesc * XMLCALL |
|
87 htmlTagLookup (const xmlChar *tag); |
|
88 XMLPUBFUN const htmlEntityDesc * XMLCALL |
|
89 htmlEntityLookup(const xmlChar *name); |
|
90 XMLPUBFUN const htmlEntityDesc * XMLCALL |
|
91 htmlEntityValueLookup(unsigned int value); |
|
92 |
|
93 XMLPUBFUN int XMLCALL |
|
94 htmlIsAutoClosed(htmlDocPtr doc, |
|
95 htmlNodePtr elem); |
|
96 XMLPUBFUN int XMLCALL |
|
97 htmlAutoCloseTag(htmlDocPtr doc, |
|
98 const xmlChar *name, |
|
99 htmlNodePtr elem); |
|
100 XMLPUBFUN const htmlEntityDesc * XMLCALL |
|
101 htmlParseEntityRef(htmlParserCtxtPtr ctxt, |
|
102 const xmlChar **str); |
|
103 XMLPUBFUN int XMLCALL |
|
104 htmlParseCharRef(htmlParserCtxtPtr ctxt); |
|
105 XMLPUBFUN void XMLCALL |
|
106 htmlParseElement(htmlParserCtxtPtr ctxt); |
|
107 |
|
108 XMLPUBFUN htmlParserCtxtPtr XMLCALL |
|
109 htmlCreateMemoryParserCtxt(const char *buffer, |
|
110 int size); |
|
111 |
|
112 XMLPUBFUN int XMLCALL |
|
113 htmlParseDocument(htmlParserCtxtPtr ctxt); |
|
114 XMLPUBFUN htmlDocPtr XMLCALL |
|
115 htmlSAXParseDoc (xmlChar *cur, |
|
116 const char *encoding, |
|
117 htmlSAXHandlerPtr sax, |
|
118 void *userData); |
|
119 XMLPUBFUN htmlDocPtr XMLCALL |
|
120 htmlParseDoc (xmlChar *cur, |
|
121 const char *encoding); |
|
122 XMLPUBFUN htmlDocPtr XMLCALL |
|
123 htmlSAXParseFile(const char *filename, |
|
124 const char *encoding, |
|
125 htmlSAXHandlerPtr sax, |
|
126 void *userData); |
|
127 XMLPUBFUN htmlDocPtr XMLCALL |
|
128 htmlParseFile (const char *filename, |
|
129 const char *encoding); |
|
130 XMLPUBFUN int XMLCALL |
|
131 UTF8ToHtml (unsigned char *out, |
|
132 int *outlen, |
|
133 const unsigned char *in, |
|
134 int *inlen); |
|
135 XMLPUBFUN int XMLCALL |
|
136 htmlEncodeEntities(unsigned char *out, |
|
137 int *outlen, |
|
138 const unsigned char *in, |
|
139 int *inlen, int quoteChar); |
|
140 XMLPUBFUN int XMLCALL |
|
141 htmlIsScriptAttribute(const xmlChar *name); |
|
142 XMLPUBFUN int XMLCALL |
|
143 htmlHandleOmittedElem(int val); |
|
144 |
|
#ifdef LIBXML_PUSH_ENABLED
/*
 * Interfaces for the Push mode.
 * Only declared when LIBXML_PUSH_ENABLED is defined.
 */
XMLPUBFUN htmlParserCtxtPtr XMLCALL
        htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                                 void *user_data,
                                 const char *chunk,
                                 int size,
                                 const char *filename,
                                 xmlCharEncoding enc);
XMLPUBFUN int XMLCALL
        htmlParseChunk(htmlParserCtxtPtr ctxt,
                       const char *chunk,
                       int size,
                       int terminate);
#endif /* LIBXML_PUSH_ENABLED */

163 XMLPUBFUN void XMLCALL |
|
164 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); |
|
165 |
|
/*
 * New set of simpler/more flexible APIs
 */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 *
 * Each option is a distinct single bit, so values can be OR-ed together.
 * The bit positions are not contiguous.
 * NOTE(review): the gaps presumably keep these values aligned with the
 * XML parser's option bits -- confirm against parser.h.
 */
typedef enum {
    HTML_PARSE_RECOVER   = 1<<0,  /* Relaxed parsing */
    HTML_PARSE_NOERROR   = 1<<5,  /* suppress error reports */
    HTML_PARSE_NOWARNING = 1<<6,  /* suppress warning reports */
    HTML_PARSE_PEDANTIC  = 1<<7,  /* pedantic error reporting */
    HTML_PARSE_NOBLANKS  = 1<<8,  /* remove blank nodes */
    HTML_PARSE_NONET     = 1<<11, /* Forbid network access */
    HTML_PARSE_COMPACT   = 1<<16  /* compact small text nodes */
} htmlParserOption;

185 XMLPUBFUN void XMLCALL |
|
186 htmlCtxtReset (htmlParserCtxtPtr ctxt); |
|
187 XMLPUBFUN int XMLCALL |
|
188 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, |
|
189 int options); |
|
190 XMLPUBFUN htmlDocPtr XMLCALL |
|
191 htmlReadDoc (const xmlChar *cur, |
|
192 const char *URL, |
|
193 const char *encoding, |
|
194 int options); |
|
195 XMLPUBFUN htmlDocPtr XMLCALL |
|
196 htmlReadFile (const char *URL, |
|
197 const char *encoding, |
|
198 int options); |
|
199 XMLPUBFUN htmlDocPtr XMLCALL |
|
200 htmlReadMemory (const char *buffer, |
|
201 int size, |
|
202 const char *URL, |
|
203 const char *encoding, |
|
204 int options); |
|
205 XMLPUBFUN htmlDocPtr XMLCALL |
|
206 htmlReadFd (int fd, |
|
207 const char *URL, |
|
208 const char *encoding, |
|
209 int options); |
|
210 XMLPUBFUN htmlDocPtr XMLCALL |
|
211 htmlReadIO (xmlInputReadCallback ioread, |
|
212 xmlInputCloseCallback ioclose, |
|
213 void *ioctx, |
|
214 const char *URL, |
|
215 const char *encoding, |
|
216 int options); |
|
217 XMLPUBFUN htmlDocPtr XMLCALL |
|
218 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, |
|
219 const xmlChar *cur, |
|
220 const char *URL, |
|
221 const char *encoding, |
|
222 int options); |
|
223 XMLPUBFUN htmlDocPtr XMLCALL |
|
224 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, |
|
225 const char *filename, |
|
226 const char *encoding, |
|
227 int options); |
|
228 XMLPUBFUN htmlDocPtr XMLCALL |
|
229 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, |
|
230 const char *buffer, |
|
231 int size, |
|
232 const char *URL, |
|
233 const char *encoding, |
|
234 int options); |
|
235 XMLPUBFUN htmlDocPtr XMLCALL |
|
236 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, |
|
237 int fd, |
|
238 const char *URL, |
|
239 const char *encoding, |
|
240 int options); |
|
241 XMLPUBFUN htmlDocPtr XMLCALL |
|
242 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, |
|
243 xmlInputReadCallback ioread, |
|
244 xmlInputCloseCallback ioclose, |
|
245 void *ioctx, |
|
246 const char *URL, |
|
247 const char *encoding, |
|
248 int options); |
|
249 |
|
/* NRK/Jan2003: further knowledge of HTML structure
 *
 * HTML_INVALID, HTML_DEPRECATED and HTML_VALID are single bits.
 * HTML_REQUIRED is 0x8 | HTML_VALID, so testing a status with
 * ( & HTML_VALID ) is also true for required items.
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;

260 /* Using htmlElemDesc rather than name here, to emphasise the fact |
|
261 that otherwise there's a lookup overhead |
|
262 */ |
|
263 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; |
|
264 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; |
|
265 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; |
|
266 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; |
|
/**
 * htmlDefaultSubelement:
 * @elt: HTML element (pointer to an element description)
 *
 * Returns the default subelement for this element
 *
 * The argument and the whole expansion are parenthesised so that
 * expression arguments such as (table + i) expand correctly.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.
 * Forwards to htmlElementAllowedHere() using the name field of @elt.
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
        htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: HTML element (pointer to an element description)
 *
 * Returns the attributes required for the specified element.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)


295 #ifdef __cplusplus |
|
296 } |
|
297 #endif |
|
298 |
|
299 #endif /* LIBXML_HTML_ENABLED */ |
|
300 #endif /* __HTML_PARSER_H__ */ |