xml/cxmllibrary/src/xmlp/src/XMLReader.c
branchRCL_3
changeset 20 889504eac4fb
equal deleted inserted replaced
19:6bcc0aa4be39 20:889504eac4fb
       
     1 /*
       
     2 * Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of the License "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 /*
       
    20 This module provides a text (character) stream, pointers into the stream
       
    21 and operations on segments of the stream as though they were strings.
       
    22 The goal is to isolate the client from stream buffers, cross buffer
       
    23 issues and some character set encoding concerns.
       
    24 
       
    25 This particular version is for input from a single buffer.
       
    26 */
       
    27 
       
    28 #include "cxml_internal.h"
       
    29 #include <xml/cxml/nw_string_char.h>
       
    30 #include <xml/cxml/nw_xmlp_xmlreader.h>
       
    31 
       
    32 static
       
    33 NW_Status_t
       
    34 NW_XML_Reader_ReadAsciiChar(NW_Uint32 c, NW_Uint32* pReturnChar)
       
    35 {
       
    36     /* This looks a bit weird but the idea is to force the conversion
       
    37     of the ASCII character through the same function that is used
       
    38     to read a character from the text.  This will impose the same conversion
       
    39     limitations and the same result encoding. */
       
    40     NW_Int32 byteCount;
       
    41     NW_Uint8 buf[2];
       
    42     NW_Ucs2 c_ucs2;
       
    43     buf[0] = (NW_Uint8)(c & 0xff);
       
    44     buf[1] = 0;
       
    45     /* should only use this function for ASCII */
       
    46     if (c > 127) {
       
    47         return NW_STAT_FAILURE;
       
    48     }
       
    49     /* call it UTF-8 because ASCII doesn't work with NW_String_readChar()
       
    50     at the moment */
       
    51     byteCount = NW_String_readChar((NW_Byte*)buf, &c_ucs2, HTTP_utf_8);
       
    52     if (byteCount != 1) {
       
    53         return NW_STAT_FAILURE;
       
    54     }
       
    55     *pReturnChar = c_ucs2;
       
    56     return NW_STAT_SUCCESS;
       
    57 }
       
    58 
       
    59 /* assumes this is just a handoff of the buffer (i.e., won't make a copy) */
       
    60 EXPORT_C NW_Status_t
       
    61 NW_XML_Reader_InitFromBuffer(NW_XML_Reader_t* pT, NW_Uint32 length, unsigned char* pBuf)
       
    62 {
       
    63     pT->encoding = 0;
       
    64     pT->endianness = NW_NATIVE_ENDIAN;
       
    65     pT->index = 0;
       
    66     pT->charIndex = 0;
       
    67     pT->lineColumn.crCount = 0;
       
    68     pT->lineColumn.lfCount = 0;
       
    69     pT->lineColumn.charsSinceLastCR = 0;
       
    70     pT->lineColumn.charsSinceLastLF = 0;
       
    71     pT->end = 0;
       
    72     pT->length = length;
       
    73     pT->pBuf = pBuf;
       
    74     return NW_STAT_SUCCESS;
       
    75 }
       
    76 
       
    77 EXPORT_C NW_Status_t
       
    78 NW_XML_Reader_DataAddressFromBuffer(NW_XML_Reader_t* pT,
       
    79                                  NW_Uint32 start, NW_Uint32* length,
       
    80                                  unsigned char** ppData)
       
    81 {
       
    82     NW_ASSERT(start < pT->length);
       
    83     NW_ASSERT(*length <= pT->length);
       
    84     NW_ASSERT((start + *length) <= pT->length);
       
    85     *ppData = NULL;
       
    86     if (start < pT->length) {
       
    87         *ppData = pT->pBuf + start;
       
    88         *length = (((start + *length) <= pT->length) ?
       
    89                    *length : (pT->length - start));
       
    90         return NW_STAT_SUCCESS;
       
    91     }
       
    92     return NW_STAT_FAILURE;
       
    93 }
       
    94 
       
    95 /* peekOrAdvance: first arg "advance": peek = 0, advance = 1 */
       
    96 static
       
    97 NW_Status_t
       
    98 NW_XML_Reader_PeekOrAdvanceOffset(NW_Bool advance, NW_XML_Reader_t* pT,
       
    99                                NW_Uint32 offsetCharCount, NW_Uint32* pC)
       
   100 {
       
   101     NW_Ucs2 c_ucs2;
       
   102     NW_Uint32 i;
       
   103     NW_Uint32 charCount = 0;
       
   104     NW_Int32 byteCount = 0;
       
   105     NW_Uint32 crCount = 0;
       
   106     NW_Uint32 lfCount = 0;
       
   107     NW_Uint32 charsPastCR = 0;
       
   108     NW_Uint32 charsPastLF = 0;
       
   109     NW_Bool resetPastCR = 0;
       
   110     NW_Bool resetPastLF = 0;
       
   111 
       
   112     NW_ASSERT(!(advance && !offsetCharCount)); 
       
   113 
       
   114     if (pT->end) {
       
   115         return NW_STAT_FAILURE;
       
   116     }
       
   117     for (i = pT->index; i < pT->length; i += (NW_Uint32)byteCount) {
       
   118         NW_ASSERT(charCount <= offsetCharCount);
       
   119 
       
   120         /* It is assumed that this func returns UNICODE code points. */
       
   121         byteCount = NW_String_readChar((NW_Byte*)&(pT->pBuf[i]),
       
   122                                        &c_ucs2, pT->encoding);
       
   123         *pC = c_ucs2;
       
   124         if (byteCount == -1) {
       
   125             return NW_STAT_FAILURE;
       
   126         }
       
   127         if (charCount == offsetCharCount) {
       
   128             /* This catches NW_String_readChar() reading past buffer end
       
   129             and can be removed when the readChar function does proper
       
   130             error checking. */
       
   131             if ((i + (NW_Uint32)byteCount) > pT->length) {
       
   132                 return NW_STAT_FAILURE;
       
   133             }
       
   134             break;
       
   135         }
       
   136         charCount++;
       
   137         charsPastCR++;
       
   138         charsPastLF++;
       
   139         if (c_ucs2 == 0xd /* CR */) {
       
   140             crCount++;
       
   141             resetPastCR = 1;
       
   142             charsPastCR = 0;
       
   143         } else if (c_ucs2 == 0xa /* LF */) {
       
   144             lfCount++;
       
   145             resetPastLF = 1;
       
   146             charsPastLF = 0;
       
   147         }
       
   148     }
       
   149     if (i >= pT->length) {
       
   150         pT->end = 1;
       
   151     }
       
   152     /* This catches NW_String_readChar() reading past buffer end and can be
       
   153     removed when the readChar function does proper error checking. */
       
   154     if (i > pT->length) {
       
   155         return NW_STAT_FAILURE;
       
   156     }
       
   157     if (advance) {
       
   158         pT->index = i;
       
   159         pT->charIndex += charCount;
       
   160         pT->lineColumn.crCount += crCount;
       
   161         pT->lineColumn.lfCount += lfCount;
       
   162         if (resetPastCR) {
       
   163             pT->lineColumn.charsSinceLastCR = charsPastCR;
       
   164         } else {
       
   165             pT->lineColumn.charsSinceLastCR += charsPastCR;
       
   166         }
       
   167         if (resetPastLF) {
       
   168             pT->lineColumn.charsSinceLastLF = charsPastLF;
       
   169         } else {
       
   170             pT->lineColumn.charsSinceLastLF += charsPastLF;
       
   171         }
       
   172     }
       
   173     return NW_STAT_SUCCESS;
       
   174 }
       
   175 
       
   176 EXPORT_C NW_Status_t
       
   177 NW_XML_Reader_PeekOffset(NW_XML_Reader_t* pT, NW_Uint32 nChars, NW_Uint32* pC)
       
   178 {
       
   179     return NW_XML_Reader_PeekOrAdvanceOffset(0, pT, nChars, pC);
       
   180 }
       
   181 
       
   182 EXPORT_C NW_Status_t
       
   183 NW_XML_Reader_AdvanceOffset(NW_XML_Reader_t* pT, NW_Uint32 nChars)
       
   184 {
       
   185     NW_Uint32 c;
       
   186     return NW_XML_Reader_PeekOrAdvanceOffset(1, pT, nChars, &c);
       
   187 }
       
   188 
       
   189 EXPORT_C 
       
   190 void NW_XML_Reader_GetPosition(NW_XML_Reader_t* pT, NW_Uint32* pByteIndex,
       
   191                             NW_Uint32* pCharIndex,
       
   192                             NW_XML_Reader_LineColumn_t* pLineColumn)
       
   193 {
       
   194     *pByteIndex = pT->index;
       
   195     *pCharIndex = pT->charIndex;
       
   196     pLineColumn->crCount = pT->lineColumn.crCount;
       
   197     pLineColumn->lfCount = pT->lineColumn.lfCount;
       
   198     pLineColumn->charsSinceLastCR = pT->lineColumn.charsSinceLastCR;
       
   199     pLineColumn->charsSinceLastLF = pT->lineColumn.charsSinceLastLF;
       
   200 }
       
   201 
       
   202 /* Note: Setting the position (similar to seeking in a file) is in general
       
   203 not possible without reading the characters (usually reading forward) because
       
   204 character encoding may use a variable numbers of bytes per character. This is
       
   205 here so that if you have defined a valid interval, then you can reposition to
       
   206 the beginning of the interval. Setting to the position to a bad value will
       
   207 not always be caught immediately. Don't forget to also save and set line
       
   208 and column with position. */
       
   209 EXPORT_C void
       
   210 NW_XML_Reader_SetPosition(NW_XML_Reader_t* pT, NW_Uint32 byteIndex,
       
   211                        NW_Uint32 charIndex,
       
   212                        const NW_XML_Reader_LineColumn_t* pLineColumn)
       
   213 {
       
   214     pT->index = byteIndex;
       
   215     pT->charIndex = charIndex;
       
   216     pT->lineColumn.crCount = pLineColumn->crCount;
       
   217     pT->lineColumn.lfCount = pLineColumn->lfCount;
       
   218     pT->lineColumn.charsSinceLastCR = pLineColumn->charsSinceLastCR;
       
   219     pT->lineColumn.charsSinceLastLF = pLineColumn->charsSinceLastLF;
       
   220 }
       
   221 
       
   222 /*
       
   223 Reader Interval Functions
       
   224 */
       
   225 
       
   226 EXPORT_C void
       
   227 NW_XML_Reader_Interval_Start(NW_XML_Reader_Interval_t* pI, NW_XML_Reader_t* pT)
       
   228 {
       
   229     /* set both start and stop for safety in later use */
       
   230     pI->start = pI->stop = pT->index;
       
   231     pI->charStart = pI->charStop = pT->charIndex;
       
   232 }
       
   233 
       
   234 EXPORT_C void
       
   235 NW_XML_Reader_Interval_Stop(NW_XML_Reader_Interval_t* pI, NW_XML_Reader_t* pT)
       
   236 {
       
   237     pI->stop = pT->index;
       
   238     pI->charStop = pT->charIndex;
       
   239 }
       
   240 
       
   241 /* BEGIN GENERIC Reader CHARACTER AND STRING FUNCTIONS */
       
   242 
       
   243 /* pMatch is 1 if ASCII character c matches Reader char in its encoding */
       
   244 EXPORT_C NW_Status_t
       
   245 NW_XML_Reader_AsciiCharMatch(NW_XML_Reader_t* pT, NW_Uint32 asciiC, NW_Uint32* pMatch)
       
   246 {
       
   247     NW_Uint32 c_text, c_ascii;
       
   248     NW_Status_t s = NW_XML_Reader_Peek(pT, &c_text);
       
   249     *pMatch = 0;
       
   250     if (NW_STAT_IS_SUCCESS(s)) {
       
   251         s = NW_XML_Reader_ReadAsciiChar(asciiC, &c_ascii);
       
   252         if (NW_STAT_IS_SUCCESS(s)) {
       
   253             *pMatch = (c_text == c_ascii);
       
   254         }
       
   255     }
       
   256     return s;
       
   257 }
       
   258 
       
   259 /* pMatch is 1 if ASCII string matches Reader sequence in its encoding */
       
   260 EXPORT_C NW_Status_t
       
   261 NW_XML_Reader_AsciiStringMatch(NW_XML_Reader_t* pT, NW_Uint32 length, const NW_Uint8* pString,
       
   262                             NW_Uint32* pMatch)
       
   263 {
       
   264     NW_Uint32 c_text, c_ascii;
       
   265     NW_Uint32 i;
       
   266     NW_Status_t s = NW_STAT_SUCCESS;
       
   267     *pMatch = 0;
       
   268     NW_ASSERT(length);
       
   269     for (i = 0; i < length; i++) {
       
   270         s = NW_XML_Reader_PeekOffset(pT, i, &c_text);
       
   271         if (NW_STAT_IS_FAILURE(s)) {
       
   272             break;
       
   273         }
       
   274         s = NW_XML_Reader_ReadAsciiChar(pString[i], &c_ascii);
       
   275         if (NW_STAT_IS_FAILURE(s)) {
       
   276             break;
       
   277         }
       
   278         if (c_text != c_ascii) {
       
   279             break;
       
   280         }
       
   281     }
       
   282     if (i == length) {
       
   283         *pMatch = 1;
       
   284     }
       
   285     return s;
       
   286 }
       
   287 
       
   288 /* Note: For XML, whitespace is only ASCII 0x20 (space),
       
   289 0x09 (tab), 0x0d (CR), 0x0a (LF).  The base test used here,
       
   290 CXML_Str_Isspace(), includes two other forms of whitespace. */
       
   291 EXPORT_C NW_Status_t
       
   292 NW_XML_Reader_SkipSpace(NW_XML_Reader_t* pT)
       
   293 {
       
   294     NW_Uint32 c;
       
   295     NW_Status_t s = NW_STAT_SUCCESS;
       
   296     for (;;) {
       
   297         s = NW_XML_Reader_Peek(pT, &c);
       
   298         if (NW_STAT_IS_FAILURE(s)) {
       
   299             break;
       
   300         }
       
   301         if (c > 0xffff) { /* validate casting */
       
   302             break;
       
   303         }
       
   304         if (!CXML_Str_Isspace((NW_Ucs2)(c & 0xffff))) {
       
   305             break;
       
   306         }
       
   307         s = NW_XML_Reader_Advance(pT);
       
   308         if (NW_STAT_IS_FAILURE(s)) {
       
   309             break;
       
   310         }
       
   311         if (pT->end){
       
   312          /* At the end so break */
       
   313             break;
       
   314           }
       
   315     }
       
   316     return s;
       
   317 }
       
   318 
       
   319 /* Note: For XML, whitespace is only ASCII 0x20 (space),
       
   320 0x09 (tab), 0x0d (CR), 0x0a (LF).  The base test used here,
       
   321 CXML_Str_Isspace(), includes two other forms of whitespace. */
       
   322 EXPORT_C NW_Status_t
       
   323 NW_XML_Reader_IsSpace(NW_XML_Reader_t* pT, NW_Uint32* pMatch)
       
   324 {
       
   325     NW_Uint32 c;
       
   326     NW_Status_t s;
       
   327 
       
   328     *pMatch = 0;
       
   329     s  = NW_XML_Reader_Peek(pT, &c);
       
   330     if (NW_STAT_IS_FAILURE(s)) {
       
   331         return s;
       
   332     }
       
   333     if (c > 0xffff) { /* validate casting */
       
   334         return NW_STAT_FAILURE;
       
   335     }
       
   336     if (CXML_Str_Isspace((NW_Ucs2)(c & 0xffff))) {
       
   337         *pMatch = 1;
       
   338     }
       
   339     return NW_STAT_SUCCESS;
       
   340 }
       
   341 
       
   342 EXPORT_C NW_Status_t
       
   343 NW_XML_Reader_IsLetter(NW_XML_Reader_t* pT, NW_Uint32* pMatch)
       
   344 {
       
   345     NW_Uint32 c;
       
   346     NW_Status_t s;
       
   347 
       
   348     *pMatch = 0;
       
   349     s = NW_XML_Reader_Peek(pT, &c);
       
   350     if (NW_STAT_IS_FAILURE(s)) {
       
   351         return s;
       
   352     }
       
   353     /* This is an approximation to what XML charaters are "letter".
       
   354     Everything above the 8-bit range is considered to be a "letter".*/
       
   355     if (c >= 0x41 && c <= 0x5a) {
       
   356         *pMatch = 1;
       
   357     }
       
   358     else if (c >= 0x61 && c <= 0x7a) {
       
   359         *pMatch = 1;
       
   360     }
       
   361     else if (c >= 0xc0 && c <= 0xd6) {
       
   362         *pMatch = 1;
       
   363     }
       
   364     else if (c >= 0xd8 && c <= 0xf6) {
       
   365         *pMatch = 1;
       
   366     }
       
   367     else if (c >= 0xf8) {/* letters become anything above 0xf8 */
       
   368         *pMatch = 1;
       
   369     }
       
   370     return NW_STAT_SUCCESS;
       
   371 }
       
   372 
       
   373 /* Note: For XML, digits include not only the ASCII digits but
       
   374 other language forms of digits.  The base test used here,
       
   375 CXML_Str_Isdigit() only tests for ASCII digits. */
       
   376 EXPORT_C NW_Status_t
       
   377 NW_XML_Reader_IsDigit(NW_XML_Reader_t* pT, NW_Uint32* pMatch)
       
   378 {
       
   379     NW_Uint32 c;
       
   380     NW_Status_t s;
       
   381 
       
   382     *pMatch = 0;
       
   383     s = NW_XML_Reader_Peek(pT, &c);
       
   384     if (NW_STAT_IS_FAILURE(s)) {
       
   385         return s;
       
   386     }
       
   387     if (c > 0xffff) {/* validate casting */
       
   388         return NW_STAT_SUCCESS;
       
   389     }
       
   390     if (CXML_Str_Isdigit((NW_Ucs2)(c & 0xffff))) {
       
   391         *pMatch = 1;
       
   392     }
       
   393     return NW_STAT_SUCCESS;
       
   394 }
       
   395 
       
   396 /* Returns an estimate of the current line and column position in the text.
       
   397 It is an estimate because it has to guess at what the intended line ending
       
   398 sequence is using a count of CR and LF characters.  Line and Column indices
       
   399 are 1-based not 0-based. */
       
   400 EXPORT_C void
       
   401 NW_XML_Reader_GetLineColumn(NW_XML_Reader_t* pT, NW_Uint32* pLine,
       
   402                          NW_Uint32* pColumn)
       
   403 {
       
   404     NW_Uint32 crCount, lfCount, charsSinceCR, charsSinceLF;
       
   405     crCount = pT->lineColumn.crCount;
       
   406     lfCount = pT->lineColumn.lfCount;
       
   407     charsSinceCR = pT->lineColumn.charsSinceLastCR;
       
   408     charsSinceLF = pT->lineColumn.charsSinceLastLF;
       
   409     if (crCount == lfCount) {
       
   410         /* assume CR, LF, DOS style */
       
   411         /* use a bias in favor of CR followed by LF
       
   412         which will give the correct column for DOS */
       
   413         *pLine = lfCount + 1;
       
   414         *pColumn = charsSinceLF + 1;
       
   415     } else if (lfCount == 0) {
       
   416         /* assume CR only, Unix style */
       
   417         *pLine = crCount + 1;
       
   418         *pColumn = charsSinceCR + 1;
       
   419     } else if (crCount == 0) {
       
   420         /* assume LF only, Mac style */
       
   421         *pLine = lfCount + 1;
       
   422         *pColumn = charsSinceLF + 1;
       
   423     } else {
       
   424         /* an unclear situation so use
       
   425         thresholds on the ratio to guess */
       
   426         NW_Uint32 ratio;
       
   427         ratio = ((crCount * 100) / lfCount);
       
   428         if (ratio > 300) {/* more than 3 to 1 crCount to lfCount */
       
   429             /* assume CR only, Unix style */
       
   430             *pLine = crCount + 1;
       
   431             *pColumn = charsSinceCR + 1;
       
   432         } else if (ratio < 33) {/* less than 1 to 3 crCount to lfCount */
       
   433             /* assume LF only, Mac style */
       
   434             *pLine = lfCount + 1;
       
   435             *pColumn = charsSinceLF + 1;
       
   436         } else {
       
   437             /* assume CR, LF, DOS style */
       
   438             /* use a bias in favor of CR, LF sequence (DOS style)
       
   439             which will give the correct column */
       
   440             *pLine = lfCount + 1;
       
   441             *pColumn = charsSinceLF + 1;
       
   442         }
       
   443     }
       
   444 }
       
   445