xml/xmlexpatparser/src/xmlconstants.h
changeset 0 e35f40988205
equal deleted inserted replaced
-1:000000000000 0:e35f40988205
       
     1 // Copyright (c) 2003-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     2 // All rights reserved.
       
     3 // This component and the accompanying materials are made available
       
     4 // under the terms of "Eclipse Public License v1.0"
       
     5 // which accompanies this distribution, and is available
       
     6 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     7 //
       
     8 // Initial Contributors:
       
     9 // Nokia Corporation - initial contribution.
       
    10 //
       
    11 // Contributors:
       
    12 //
       
    13 // Description:
       
    14 //
       
    15 
       
    16 #ifndef __XMLCONSTANTS_H__
       
    17 #define __XMLCONSTANTS_H__
       
    18 
       
    19 #include <e32base.h>
       
    20 
       
    21 /**
       
    22 This file describes useful XML constants.
       
    23 
       
    24 The UTF-8 character representation protocol is described here.
       
    25 
       
    26 num of bytes| Bits used in encoding	| Bit representation
       
    27 
       
    28           1 |                     7 | 0vvvvvvv
       
    29 
       
    30           2 |                    11 | 110vvvvv 10vvvvvv
       
    31 
       
    32           3 |                    16 | 1110vvvv 10vvvvvv 10vvvvvv
       
    33 
       
    34           4 |                    21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
       
    35 
       
    36           5 |                    26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
       
    37 
       
    38           6 |                    31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
       
    39 
       
Because each XML entity that is not accompanied by external encoding information and
is not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, in which
the first characters must be '<?xml', any conforming processor can detect, after two to
four octets of input, which of the following cases apply. In reading this list, it
may help to know that in UCS-4, '<' is "0x0000003C" and '?' is "0x0000003F", and the
Byte Order Mark (BOM) required of UTF-16 data streams is "#xFEFF".
       
    46 
       
    47 The notation ## is used to denote any byte value except that two consecutive ##s 
       
    48 cannot be both 00.
       
    49 
       
    50 ASCII characters are represented as ASCII values
       
    51 @file
       
    52 */
       
    53 
       
    54 /**
       
    55 Bit mask to capture the utf-8 single byte header character encoding.
       
    56 @publishedAll
       
    57 */
       
    58 const TUint KUTF8SingleHeaderMask		= 0x80; // 10000000
       
    59 
       
    60 
       
    61 /**
       
    62 Bit mask to capture the utf-8 double byte header character encoding.
       
    63 @publishedAll
       
    64 */
       
    65 const TUint KUTF8DoubleHeaderMask		= 0xE0; // 11100000
       
    66 
       
    67 /**
       
    68 Bit mask to capture the utf-8 triple byte header character encoding.
       
    69 @publishedAll
       
    70 */
       
    71 const TUint KUTF8TripleHeaderMask		= 0xF0; // 11110000
       
    72 
       
    73 /**
       
    74 Bit mask to capture the utf-8 quadruple byte header character encoding.
       
    75 @publishedAll
       
    76 */
       
    77 const TUint KUTF8QuadrupleHeaderMask	= 0xF8; // 11111000
       
    78 
       
    79 /**
       
    80 Bit mask to capture the utf-8 quinary byte header character encoding.
       
    81 @publishedAll
       
    82 */
       
    83 const TUint KUTF8QuinaryHeaderMask		= 0xFC; // 11111100
       
    84 
       
    85 /**
       
    86 Bit mask to capture the utf-8 senary byte header character encoding.
       
    87 @publishedAll
       
    88 */
       
    89 const TUint KUTF8SenaryHeaderMask		= 0xFE; // 11111110
       
    90 
       
    91 /**
       
    92 The utf-8 bit pattern describing a single byte header character encoding.
       
    93 @publishedAll
       
    94 */
       
    95 const TUint KUTF8SingleByteHeader		= 0x00; // 00000000
       
    96 
       
    97 /**
       
    98 The utf-8 bit pattern describing a double byte header character encoding.
       
    99 @publishedAll
       
   100 */
       
   101 const TUint KUTF8DoubleByteHeader		= 0xC0; // 11000000
       
   102 
       
   103 /**
       
   104 The utf-8 bit pattern describing a triple byte header character encoding.
       
   105 @publishedAll
       
   106 */
       
   107 const TUint KUTF8TripleByteHeader		= 0xE0; // 11100000
       
   108 
       
   109 /**
       
   110 The utf-8 bit pattern describing a quadruple byte header character encoding.
       
   111 @publishedAll
       
   112 */
       
   113 const TUint KUTF8QuadrupleByteHeader	= 0xF0; // 11110000
       
   114 
       
   115 /**
       
   116 The utf-8 bit pattern describing a quinary byte header character encoding.
       
   117 @publishedAll
       
   118 */
       
   119 const TUint KUTF8QuinaryByteHeader		= 0xF8; // 11111000
       
   120 
       
   121 /**
       
   122 The utf-8 bit pattern describing a senary byte header character encoding.
       
   123 @publishedAll
       
   124 */
       
   125 const TUint KUTF8SenaryByteHeader		= 0xFC; // 11111100
       
   126 
       
   127 /**
       
   128 The byte count of a utf-8 single byte character encoding.
       
   129 @publishedAll
       
   130 */
       
   131 const TInt KUTF8SingleByteCount			= 1; 
       
   132 
       
   133 /**
       
   134 The byte count of a utf-8 double byte character encoding.
       
   135 @publishedAll
       
   136 */
       
   137 const TInt KUTF8DoubleByteCount			= 2; 
       
   138 
       
   139 /**
       
   140 The byte count of a utf-8 triple byte character encoding.
       
   141 @publishedAll
       
   142 */
       
   143 const TInt KUTF8TripleByteCount			= 3; 
       
   144 
       
   145 /**
       
   146 The byte count of a utf-8 quadruple byte character encoding.
       
   147 @publishedAll
       
   148 */
       
   149 const TInt KUTF8QuadrupleByteCount		= 4; 
       
   150 
       
   151 /**
       
   152 The byte count of a utf-8 quinary byte character encoding.
       
   153 @publishedAll
       
   154 */
       
   155 const TInt KUTF8QuinaryByteCount		= 5; 
       
   156 
       
   157 /**
       
   158 The byte count of a utf-8 senary byte character encoding.
       
   159 @publishedAll
       
   160 */
       
   161 const TInt KUTF8SenaryByteCount			= 6;
       
   162 
       
   163 /**
       
   164 The byte count required to encode '<?xml'.
       
   165 @publishedAll
       
   166 */
       
   167 const TInt KEncodingByteCount			= 4;
       
   168 
       
   169 
       
   170 /**
       
   171 The encoding text to search for that describes the encoding of an xml document.
       
   172 @publishedAll
       
   173 */
       
   174 _LIT8(KEncodingTxt, "encoding=\"");
       
   175 
       
   176 /**
       
   177 The end tag (>) symbol used in xml to close the scope of and element.
       
   178 @publishedAll
       
   179 */
       
   180 const TUint8 KXMLEndTag					= '>';
       
   181 
       
   182 /**
       
   183 The quotation (") symbol used in xml.
       
   184 @publishedAll
       
   185 */
       
   186 const TUint8 KQuotation					= '\"';
       
   187 
       
   188 
       
   189 enum TParseMode 
       
   190 /**
       
   191 Lists enumerations used to describe one or more Parse modes. Users can set this information
       
   192 via the SetParseMode method on the RXmlParser object.
       
   193 @see RXmlParser
       
   194 @publishedAll
       
   195 */
       
   196 	{
       
   197 
       
   198 /**
       
   199 This enumeration when set specifies the convertion of elements and attributes to lowercase. 
       
   200 This can be used for case-insensitive HTML so that a tag can be matched to a static 
       
   201 string in the string pool.
       
   202 @see RStringPool
       
   203 */
       
   204 	EParseModeConvertTagsToLowerCase	= 0x0001,
       
   205 
       
   206 /**
       
   207 This enumeration when set reports an error when unrecognised tags are found.
       
   208 */
       
   209 	EParseModeErrorOnUnrecognisedTags	= 0x0002,
       
   210 
       
   211 /**
       
   212 This enumeration when set reports unrecognised tags.
       
   213 */
       
   214 	EParseModeReportUnrecognisedTags	= 0x0004,
       
   215 
       
   216 /**
       
   217 This enumeration when set reports the namespace.
       
   218 */
       
   219 	EParseModeReportNamespaces			= 0x0008,
       
   220 
       
   221 /**
       
   222 This enumeration when set reports the namespace prefix.
       
   223 */
       
   224 	EParseModeReportNamespacePrefixes	= 0x0010,
       
   225 
       
   226 /**
       
   227 This enumeration when set sends all content data for an element in one chunk.
       
   228 */
       
   229 	EParseModeSendFullContentInOneChunk	= 0x0020,
       
   230 
       
   231 /**
       
   232 This enumeration when set reports namespace mappings via the OnStartPrefixMapping & 
       
   233 OnEndPrefixMapping methods.
       
   234 @see MMarkupCallback
       
   235 */
       
   236 	EParseModeReportNamespaceMapping	= 0x0040,
       
   237 
       
   238 /**
       
   239 This enumeration when set describes the data in the specified encoding, otherwise
       
   240 it is specified in utf-8.
       
   241 */
       
   242 	EParseModeRawContent                = 0x0080,
       
   243 
       
   244 /**
       
   245 This enumeration when set states that all string comparisons be non-folded.
       
   246 Fold is defined as: The removal of differences between characters that are deemed 
       
   247 unimportant for the purposes of inexact or case-insensitive matching. 
       
   248 As well as ignoring differences of case, folding ignores any accent on a character.
       
   249 */
       
   250 	EParseModeStrict					= 0x0100,
       
   251 
       
   252 /**
       
   253 This enumeration is a mask that covers the total enumerations thus far, and as
       
   254 such should be updated to reflect any new enumerations added.
       
   255 */
       
   256 	EParseModeAllMask                   = 0x01FF,
       
   257 
       
   258 	};
       
   259 
       
   260 
       
   261 enum TEncoding
       
   262 /**
       
   263 Lists enumerations used to describe the encoding of an xml document. 
       
   264 The first line of an xml document generally has the encoding described,
       
   265 however, the data upto this description is specified in the encoding. The actual 
       
   266 description is described in ASCII.
       
   267 @publishedAll
       
   268 */
       
   269 	{
       
   270 
       
   271 // With BOM (Byte Order Mark):
       
   272 
       
   273 /**
       
   274 This enumeration represents a BOM subset with the following values 00 00 FE FF.
       
   275 Posible encodings include: UCS-4, big-endian machine (1234 order).
       
   276 */
       
   277 	EEncodingUCS_4BEBOM = 0,
       
   278 
       
   279 /**
       
   280 This enumeration represents a BOM subset with the following values FF FE 00 00.
       
   281 Posible encodings include: UCS-4, little-endian machine (4321 order).
       
   282 */
       
   283 	EEncodingUCS_4LEBOM,
       
   284 	
       
   285 /**
       
   286 This enumeration represents a BOM subset with the following values 00 00 FF FE.
       
   287 Posible encodings include: UCS-4, unusual octet order (2143).
       
   288 */
       
   289 	EEncodingUCS_4UO1BOM,
       
   290 
       
   291 /**
       
   292 This enumeration represents a BOM subset with the following values FE FF 00 00.
       
   293 Posible encodings include: UCS-4, unusual octet order (3412).
       
   294 */
       
   295 	EEncodingUCS_4UO2BOM,
       
   296 
       
   297 /**
       
   298 This enumeration represents a BOM subset with the following values FE FF ## ##.
       
   299 Posible encodings include: UTF-16, big-endian.
       
   300 */
       
   301 	EEncodingUTF_16BEBOM,
       
   302 
       
   303 /**
       
   304 This enumeration represents a BOM subset with the following values FF FE ## ##.
       
   305 Posible encodings include: UTF-16, little-endian.
       
   306 */
       
   307 	EEncodingUTF_16LEBOM,
       
   308 
       
   309 /**
       
   310 This enumeration represents a BOM subset with the following values EF BB BF ##.
       
   311 Posible encodings include: UTF-8.
       
   312 */
       
   313 	EEncodingUTF_8BOM,
       
   314 
       
   315 
       
   316 // Without a Byte Order Mark:
       
   317 
       
   318 /**
       
   319 This enumeration represents a non BOM subset with the following values 00 00 00 3C.
       
   320 Posible encodings include: UCS-4 or other encoding with a 32-bit code unit
       
   321 and ASCII characters encoded as ASCII values, in respectively big-endian (1234), 
       
   322 little-endian (4321) and two unusual byte orders (2143 and 3412). The encoding 
       
   323 declaration must be read to determine which of UCS-4 or other supported 32-bit encodings applies.
       
   324 */
       
   325 	EEncodingUCS_4BE,
       
   326 
       
   327 /**
       
   328 This enumeration represents a non BOM subset with the following values 3C 00 00 00.
       
   329 Posible encodings include: UCS-4 or other encoding with a 32-bit code unit
       
   330 and ASCII characters encoded as ASCII values, in respectively big-endian (1234), 
       
   331 little-endian (4321) and two unusual byte orders (2143 and 3412). The encoding 
       
   332 declaration must be read to determine which of UCS-4 or other supported 32-bit encodings applies.
       
   333 */
       
   334 	EEncodingUCS_4LE,
       
   335 
       
   336 /**
       
   337 This enumeration represents a non BOM subset with the following values 00 00 3C 00.
       
   338 Posible encodings include: UCS-4 or other encoding with a 32-bit code unit
       
   339 and ASCII characters encoded as ASCII values, in respectively big-endian (1234), 
       
   340 little-endian (4321) and two unusual byte orders (2143 and 3412). The encoding 
       
   341 declaration must be read to determine which of UCS-4 or other supported 32-bit encodings applies.
       
   342 */
       
   343 	EEncodingUCS_4BO1,
       
   344 
       
   345 /**
       
   346 This enumeration represents a non BOM subset with the following values 00 3C 00 00 
       
   347 Posible encodings include: UCS-4 or other encoding with a 32-bit code unit
       
   348 and ASCII characters encoded as ASCII values, in respectively big-endian (1234), 
       
   349 little-endian (4321) and two unusual byte orders (2143 and 3412). The encoding 
       
   350 declaration must be read to determine which of UCS-4 or other supported 32-bit encodings applies. 
       
   351 */
       
   352 	EEncodingUCS_4BO2,
       
   353 
       
   354 /**
       
   355 This enumeration represents a non BOM subset with the following values 00 3C 00 3F.
       
   356 Posible encodings include: UTF-16BE or big-endian ISO-10646-UCS-2 or other encoding
       
   357 with a 16-bit code unit in big-endian order and ASCII characters encoded as ASCII 
       
   358 values (the encoding declaration must be read to determine which).
       
   359 */
       
   360 	EEncodingUTF_16BE,
       
   361 	
       
   362 /**
       
   363 This enumeration represents a non BOM subset with the following values 3C 00 3F 00.
       
   364 Posible encodings include: UTF-16LE or little-endian ISO-10646-UCS-2 or other encoding
       
   365 with a 16-bit code unit in little-endian order and ASCII characters encoded as ASCII 
       
   366 values (the encoding declaration must be read to determine which).
       
   367 */
       
   368 	EEncodingUTF_16LE,
       
   369 
       
   370 /**
       
   371 This enumeration represents a non BOM subset with the following values 3C 3F 78 6D.
       
   372 Posible encodings include: UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, 
       
   373 EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the 
       
   374 characters of ASCII have their normal positions, width, and values; the actual encoding
       
   375 declaration must be read to detect which of these applies, but since all of these 
       
   376 encodings use the same bit patterns for the relevant ASCII characters, the encoding 
       
   377 declaration itself may be read reliably.
       
   378 */
       
   379 	EEncodingUTF_8,
       
   380 
       
   381 /**
       
   382 This enumeration represents a non BOM subset with the following values 4C 6F A7 94.
       
   383 Posible encodings include: EBCDIC (in some flavor; the full encoding declaration must 
       
   384 be read to tell which code page is in use).
       
   385 */
       
   386 
       
   387 	EEncodingEBCDIC,
       
   388 
       
   389 /**
       
   390 This enumeration represents a non BOM subset with a combination of other values.
       
   391 Posible encodings include: Other UTF-8 without an encoding declaration, or else the data 
       
   392 stream is mislabeled (lacking a required encoding declaration), corrupt, fragmentary, 
       
   393 or enclosed in a wrapper of some kind.
       
   394 */
       
   395 	EEncodingOTHER,
       
   396 
       
   397 	};
       
   398 
       
   399 #endif // __XMLCONSTANTS_H__