|
1 // Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of "Eclipse Public License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // |
|
15 |
|
16 #ifndef __XMLCONSTANTS_H__ |
|
17 #define __XMLCONSTANTS_H__ |
|
18 |
|
19 #include <e32base.h> |
|
20 |
|
/**
This file describes useful XML constants.

The UTF-8 character representation protocol is described here.

num of bytes| Bits used in encoding | Bit representation

1 | 7 | 0vvvvvvv

2 | 11 | 110vvvvv 10vvvvvv

3 | 16 | 1110vvvv 10vvvvvv 10vvvvvv

4 | 21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv

5 | 26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv

6 | 31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv

Every XML entity that is not accompanied by external encoding information and is not
in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, in which the
first characters must be '<?xml'. Because of this, any conforming processor can detect,
after two to four octets of input, which of the following cases apply. In reading this
list, it may help to know that in UCS-4, '<' is "0x0000003C" and '?' is "0x0000003F",
and the Byte Order Mark (BOM) required of UTF-16 data streams is "#xFEFF".

The notation ## is used to denote any byte value, except that two consecutive ##s
cannot both be 00.

ASCII characters are represented as ASCII values.
@file
*/
|
53 |
|
54 /** |
|
55 Bit mask to capture the utf-8 single byte header character encoding. |
|
56 @publishedAll |
|
57 */ |
|
58 const TUint KUTF8SingleHeaderMask = 0x80; // 10000000 |
|
59 |
|
60 |
|
61 /** |
|
62 Bit mask to capture the utf-8 double byte header character encoding. |
|
63 @publishedAll |
|
64 */ |
|
65 const TUint KUTF8DoubleHeaderMask = 0xE0; // 11100000 |
|
66 |
|
67 /** |
|
68 Bit mask to capture the utf-8 triple byte header character encoding. |
|
69 @publishedAll |
|
70 */ |
|
71 const TUint KUTF8TripleHeaderMask = 0xF0; // 11110000 |
|
72 |
|
73 /** |
|
74 Bit mask to capture the utf-8 quadruple byte header character encoding. |
|
75 @publishedAll |
|
76 */ |
|
77 const TUint KUTF8QuadrupleHeaderMask = 0xF8; // 11111000 |
|
78 |
|
79 /** |
|
80 Bit mask to capture the utf-8 quinary byte header character encoding. |
|
81 @publishedAll |
|
82 */ |
|
83 const TUint KUTF8QuinaryHeaderMask = 0xFC; // 11111100 |
|
84 |
|
85 /** |
|
86 Bit mask to capture the utf-8 senary byte header character encoding. |
|
87 @publishedAll |
|
88 */ |
|
89 const TUint KUTF8SenaryHeaderMask = 0xFE; // 11111110 |
|
90 |
|
91 /** |
|
92 The utf-8 bit pattern describing a single byte header character encoding. |
|
93 @publishedAll |
|
94 */ |
|
95 const TUint KUTF8SingleByteHeader = 0x00; // 00000000 |
|
96 |
|
97 /** |
|
98 The utf-8 bit pattern describing a double byte header character encoding. |
|
99 @publishedAll |
|
100 */ |
|
101 const TUint KUTF8DoubleByteHeader = 0xC0; // 11000000 |
|
102 |
|
103 /** |
|
104 The utf-8 bit pattern describing a triple byte header character encoding. |
|
105 @publishedAll |
|
106 */ |
|
107 const TUint KUTF8TripleByteHeader = 0xE0; // 11100000 |
|
108 |
|
109 /** |
|
110 The utf-8 bit pattern describing a quadruple byte header character encoding. |
|
111 @publishedAll |
|
112 */ |
|
113 const TUint KUTF8QuadrupleByteHeader = 0xF0; // 11110000 |
|
114 |
|
115 /** |
|
116 The utf-8 bit pattern describing a quinary byte header character encoding. |
|
117 @publishedAll |
|
118 */ |
|
119 const TUint KUTF8QuinaryByteHeader = 0xF8; // 11111000 |
|
120 |
|
121 /** |
|
122 The utf-8 bit pattern describing a senary byte header character encoding. |
|
123 @publishedAll |
|
124 */ |
|
125 const TUint KUTF8SenaryByteHeader = 0xFC; // 11111100 |
|
126 |
|
127 /** |
|
128 The byte count of a utf-8 single byte character encoding. |
|
129 @publishedAll |
|
130 */ |
|
131 const TInt KUTF8SingleByteCount = 1; |
|
132 |
|
133 /** |
|
134 The byte count of a utf-8 double byte character encoding. |
|
135 @publishedAll |
|
136 */ |
|
137 const TInt KUTF8DoubleByteCount = 2; |
|
138 |
|
139 /** |
|
140 The byte count of a utf-8 triple byte character encoding. |
|
141 @publishedAll |
|
142 */ |
|
143 const TInt KUTF8TripleByteCount = 3; |
|
144 |
|
145 /** |
|
146 The byte count of a utf-8 quadruple byte character encoding. |
|
147 @publishedAll |
|
148 */ |
|
149 const TInt KUTF8QuadrupleByteCount = 4; |
|
150 |
|
151 /** |
|
152 The byte count of a utf-8 quinary byte character encoding. |
|
153 @publishedAll |
|
154 */ |
|
155 const TInt KUTF8QuinaryByteCount = 5; |
|
156 |
|
157 /** |
|
158 The byte count of a utf-8 senary byte character encoding. |
|
159 @publishedAll |
|
160 */ |
|
161 const TInt KUTF8SenaryByteCount = 6; |
|
162 |
|
163 /** |
|
164 The byte count required to encode '<?xml'. |
|
165 @publishedAll |
|
166 */ |
|
167 const TInt KEncodingByteCount = 4; |
|
168 |
|
169 |
|
170 /** |
|
171 The encoding text to search for that describes the encoding of an xml document. |
|
172 @publishedAll |
|
173 */ |
|
174 _LIT8(KEncodingTxt, "encoding=\""); |
|
175 |
|
176 /** |
|
177 The end tag (>) symbol used in xml to close the scope of and element. |
|
178 @publishedAll |
|
179 */ |
|
180 const TUint8 KXMLEndTag = '>'; |
|
181 |
|
182 /** |
|
183 The quotation (") symbol used in xml. |
|
184 @publishedAll |
|
185 */ |
|
186 const TUint8 KQuotation = '\"'; |
|
187 |
|
188 |
|
enum TParseMode
/**
Lists enumerations used to describe one or more Parse modes. Every mode occupies its own
bit, so several modes can be held in a single value. Users can set this information
via the SetParseMode method on the RXmlParser object.
@see RXmlParser
@publishedAll
*/
	{
	/**
	This enumeration when set specifies the conversion of elements and attributes to lowercase.
	This can be used for case-insensitive HTML so that a tag can be matched to a static
	string in the string pool.
	@see RStringPool
	*/
	EParseModeConvertTagsToLowerCase	= 0x0001,

	/**
	This enumeration when set reports an error when unrecognised tags are found.
	*/
	EParseModeErrorOnUnrecognisedTags	= 0x0002,

	/**
	This enumeration when set reports unrecognised tags.
	*/
	EParseModeReportUnrecognisedTags	= 0x0004,

	/**
	This enumeration when set reports the namespace.
	*/
	EParseModeReportNamespaces			= 0x0008,

	/**
	This enumeration when set reports the namespace prefix.
	*/
	EParseModeReportNamespacePrefixes	= 0x0010,

	/**
	This enumeration when set sends all content data for an element in one chunk.
	*/
	EParseModeSendFullContentInOneChunk	= 0x0020,

	/**
	This enumeration when set reports namespace mappings via the OnStartPrefixMapping &
	OnEndPrefixMapping methods.
	@see MMarkupCallback
	*/
	EParseModeReportNamespaceMapping	= 0x0040,

	/**
	This enumeration when set describes the data in the specified encoding, otherwise
	it is specified in utf-8.
	*/
	EParseModeRawContent				= 0x0080,

	/**
	This enumeration when set states that all string comparisons be non-folded.
	Fold is defined as: The removal of differences between characters that are deemed
	unimportant for the purposes of inexact or case-insensitive matching.
	As well as ignoring differences of case, folding ignores any accent on a character.
	*/
	EParseModeStrict					= 0x0100,

	/**
	This enumeration is a mask that covers every mode defined above (currently 0x01FF).
	It is built from the individual modes, so any new mode must be added to this
	expression as well.
	*/
	EParseModeAllMask =
		EParseModeConvertTagsToLowerCase |
		EParseModeErrorOnUnrecognisedTags |
		EParseModeReportUnrecognisedTags |
		EParseModeReportNamespaces |
		EParseModeReportNamespacePrefixes |
		EParseModeSendFullContentInOneChunk |
		EParseModeReportNamespaceMapping |
		EParseModeRawContent |
		EParseModeStrict
	// No comma after the last enumerator: a trailing comma is ill-formed before C++11.
	};
|
259 |
|
260 |
|
enum TEncoding
/**
Lists enumerations used to describe the encoding of an xml document, as recognised from
the first four bytes of the document.
The first line of an xml document generally has the encoding described,
however, the data up to this description is specified in the encoding. The actual
description is described in ASCII.
In the byte signatures below, ## denotes any byte value, except that two consecutive
##s cannot both be 00.
@publishedAll
*/
	{
	// With BOM (Byte Order Mark):

	/**
	The document starts with the BOM bytes 00 00 FE FF.
	Possible encodings include: UCS-4, big-endian machine (1234 order).
	*/
	EEncodingUCS_4BEBOM = 0,

	/**
	The document starts with the BOM bytes FF FE 00 00.
	Possible encodings include: UCS-4, little-endian machine (4321 order).
	*/
	EEncodingUCS_4LEBOM,

	/**
	The document starts with the BOM bytes 00 00 FF FE.
	Possible encodings include: UCS-4, unusual octet order (2143).
	*/
	EEncodingUCS_4UO1BOM,

	/**
	The document starts with the BOM bytes FE FF 00 00.
	Possible encodings include: UCS-4, unusual octet order (3412).
	*/
	EEncodingUCS_4UO2BOM,

	/**
	The document starts with the BOM bytes FE FF ## ##.
	Possible encodings include: UTF-16, big-endian.
	*/
	EEncodingUTF_16BEBOM,

	/**
	The document starts with the BOM bytes FF FE ## ##.
	Possible encodings include: UTF-16, little-endian.
	*/
	EEncodingUTF_16LEBOM,

	/**
	The document starts with the BOM bytes EF BB BF ##.
	Possible encodings include: UTF-8.
	*/
	EEncodingUTF_8BOM,


	// Without a Byte Order Mark:

	/**
	The document has no BOM and starts with the bytes 00 00 00 3C, i.e. '<' held in a
	32-bit code unit in big-endian (1234) order.
	Possible encodings include: UCS-4 or other encoding with a 32-bit code unit
	and ASCII characters encoded as ASCII values. The encoding declaration must be read
	to determine which of UCS-4 or other supported 32-bit encodings applies.
	*/
	EEncodingUCS_4BE,

	/**
	The document has no BOM and starts with the bytes 3C 00 00 00, i.e. '<' held in a
	32-bit code unit in little-endian (4321) order.
	Possible encodings include: UCS-4 or other encoding with a 32-bit code unit
	and ASCII characters encoded as ASCII values. The encoding declaration must be read
	to determine which of UCS-4 or other supported 32-bit encodings applies.
	*/
	EEncodingUCS_4LE,

	/**
	The document has no BOM and starts with the bytes 00 00 3C 00, i.e. '<' held in a
	32-bit code unit in the unusual byte order 2143.
	Possible encodings include: UCS-4 or other encoding with a 32-bit code unit
	and ASCII characters encoded as ASCII values. The encoding declaration must be read
	to determine which of UCS-4 or other supported 32-bit encodings applies.
	Note: the name uses "BO1" where the matching BOM enumerator uses "UO1"; the published
	name is kept for compatibility.
	*/
	EEncodingUCS_4BO1,

	/**
	The document has no BOM and starts with the bytes 00 3C 00 00, i.e. '<' held in a
	32-bit code unit in the unusual byte order 3412.
	Possible encodings include: UCS-4 or other encoding with a 32-bit code unit
	and ASCII characters encoded as ASCII values. The encoding declaration must be read
	to determine which of UCS-4 or other supported 32-bit encodings applies.
	Note: the name uses "BO2" where the matching BOM enumerator uses "UO2"; the published
	name is kept for compatibility.
	*/
	EEncodingUCS_4BO2,

	/**
	The document has no BOM and starts with the bytes 00 3C 00 3F.
	Possible encodings include: UTF-16BE or big-endian ISO-10646-UCS-2 or other encoding
	with a 16-bit code unit in big-endian order and ASCII characters encoded as ASCII
	values (the encoding declaration must be read to determine which).
	*/
	EEncodingUTF_16BE,

	/**
	The document has no BOM and starts with the bytes 3C 00 3F 00.
	Possible encodings include: UTF-16LE or little-endian ISO-10646-UCS-2 or other encoding
	with a 16-bit code unit in little-endian order and ASCII characters encoded as ASCII
	values (the encoding declaration must be read to determine which).
	*/
	EEncodingUTF_16LE,

	/**
	The document has no BOM and starts with the bytes 3C 3F 78 6D.
	Possible encodings include: UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS,
	EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the
	characters of ASCII have their normal positions, width, and values; the actual encoding
	declaration must be read to detect which of these applies, but since all of these
	encodings use the same bit patterns for the relevant ASCII characters, the encoding
	declaration itself may be read reliably.
	*/
	EEncodingUTF_8,

	/**
	The document has no BOM and starts with the bytes 4C 6F A7 94.
	Possible encodings include: EBCDIC (in some flavor; the full encoding declaration must
	be read to tell which code page is in use).
	*/
	EEncodingEBCDIC,

	/**
	The document has no BOM and starts with any other combination of bytes.
	Possible encodings include: Other UTF-8 without an encoding declaration, or else the data
	stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary,
	or enclosed in a wrapper of some kind.
	*/
	EEncodingOTHER
	// No comma after the last enumerator: a trailing comma is ill-formed before C++11.
	};
|
398 |
|
399 #endif // __XMLCONSTANTS_H__ |