diff -r 000000000000 -r dd21522fd290 webengine/wmlengine/src/htmlp/src/htmlp_lexer.c --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/webengine/wmlengine/src/htmlp/src/htmlp_lexer.c Mon Mar 30 12:54:55 2009 +0300 @@ -0,0 +1,476 @@ +/* +* Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of the License "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + + +/* + +Lexer for HTML parser. + +*/ +#ifndef FEA_RME_NOHTMLPARSER + +#include "nwx_defs.h" +#include "nw_htmlp_lexer.h" +#include "nw_string_char.h" +#include "BrsrStatusCodes.h" + +/* Does not copy the buffer */ +TBrowserStatusCode NW_HTMLP_Lexer_InitFromBuffer(NW_HTMLP_Lexer_t* pL, + NW_Uint32 byteCount, + NW_Uint8* pBuf, + NW_HTMLP_ElementTableIndex_t elementCount, + NW_HTMLP_ElementDescriptionConst_t* pElementDictionary) +{ + if ((byteCount == 0) || (pBuf == NULL)) { + return KBrsrFailure; + } + pL->encoding = 0; + pL->endianness = NW_NATIVE_ENDIAN; + pL->readPosition = 0; + pL->charPosition = 0; + pL->lineColumn.crCount = 0; + pL->lineColumn.lfCount = 0; + pL->lineColumn.charsSinceLastCR = 0; + pL->lineColumn.charsSinceLastLF = 0; + pL->end = NW_FALSE; + pL->byteCount = byteCount; + pL->pBuf = pBuf; + pL->elementCount = elementCount; + pL->pElementDictionary = pElementDictionary; + return KBrsrSuccess; +} + +/* Returns a pointer in *ppData into pBuf at some byte position +and byte count, byte count is truncated to fit in pBuf if required. */ +TBrowserStatusCode NW_HTMLP_Lexer_DataAddressFromBuffer(NW_HTMLP_Lexer_t* pL, + NW_Uint32 byteIndex, + NW_Uint32* pByteCount, + NW_Uint8** ppData) +{ + NW_ASSERT(byteIndex < pL->byteCount); + NW_ASSERT(*pByteCount <= pL->byteCount); + NW_ASSERT((byteIndex + *pByteCount) <= pL->byteCount); + *ppData = NULL; + if (byteIndex < pL->byteCount) { + *ppData = pL->pBuf + byteIndex; + if ((byteIndex + *pByteCount) > pL->byteCount) { + *pByteCount = pL->byteCount - byteIndex; + } + return KBrsrSuccess; + } + return KBrsrFailure; +} + +/* peekOrAdvance: first arg "advance": peek = NW_FALSE, advance = NW_TRUE */ +static +TBrowserStatusCode NW_HTMLP_Lexer_PeekOrAdvanceOffset(NW_Bool advance, + NW_HTMLP_Lexer_t* pL, + NW_Uint32 offsetCharCount, + NW_Uint32* pC, + NW_Bool* pEOF) +{ + NW_Uint32 i; + NW_Uint32 charCount = 0; + NW_Int32 byteCount = 0; + NW_Uint32 crCount = 0; + NW_Uint32 lfCount = 0; + NW_Uint32 charsPastCR = 0; + NW_Uint32 charsPastLF = 0; + NW_Ucs2 c_ucs2; + NW_Bool resetPastCR = 0; + NW_Bool resetPastLF = 0; + + /* it makes no sense to advance by 0 */ + NW_ASSERT((advance == NW_FALSE) || (offsetCharCount > 0)); + + if (NW_HTMLP_Lexer_AtEnd(pL)) { + *pEOF = NW_TRUE; + return KBrsrSuccess; + } + *pEOF = NW_FALSE; + i = pL->readPosition; + do { + /* It is assumed that this func returns UNICODE code points. */ + byteCount = NW_String_readChar(&(pL->pBuf[i]), + &c_ucs2, pL->encoding); + /* This catches NW_String_readChar() reading past end of buffer + and can be removed when the readChar function does proper + error checking (requires passing in buf length). */ + if ((i + byteCount) > pL->byteCount) { + return KBrsrFailure; + } + *pC = c_ucs2; + if (byteCount == -1) { + return KBrsrFailure; + } + if (charCount == offsetCharCount) { + break; + } + charCount++; + charsPastCR++; + charsPastLF++; + if (c_ucs2 == 0xd /* CR */) { + crCount++; + resetPastCR = 1; + charsPastCR = 0; + } else if (c_ucs2 == 0xa /* LF */) { + lfCount++; + resetPastLF = 1; + charsPastLF = 0; + } + i += byteCount; + if (pL->encoding == HTTP_iso_10646_ucs_2 && + i == (pL->byteCount - 1)) + { + *pEOF = NW_TRUE; + break; + } + if (pL->encoding == HTTP_utf_8 && + (i + 3) > pL->byteCount) + { + *pEOF = NW_TRUE; + break; + } + } while (i < pL->byteCount); + if (i >= pL->byteCount) { + *pEOF = NW_TRUE; + } + if (advance == NW_TRUE) { + if (*pEOF == NW_TRUE) { + pL->readPosition = pL->byteCount; + pL->end = NW_TRUE; + } else { + pL->readPosition = i; + } + pL->charPosition += charCount; + pL->lineColumn.crCount += crCount; + pL->lineColumn.lfCount += lfCount; + if (resetPastCR) { + pL->lineColumn.charsSinceLastCR = charsPastCR; + } else { + pL->lineColumn.charsSinceLastCR += charsPastCR; + } + if (resetPastLF) { + pL->lineColumn.charsSinceLastLF = charsPastLF; + } else { + pL->lineColumn.charsSinceLastLF += charsPastLF; + } + } + return KBrsrSuccess; +} + +TBrowserStatusCode NW_HTMLP_Lexer_PeekOffset(NW_HTMLP_Lexer_t* pL, + NW_Uint32 offsetCharCount, + NW_Uint32* pC, + NW_Bool* pEOF) +{ + return NW_HTMLP_Lexer_PeekOrAdvanceOffset(NW_FALSE, pL, + offsetCharCount, pC, pEOF); +} + +TBrowserStatusCode NW_HTMLP_Lexer_AdvanceOffset(NW_HTMLP_Lexer_t* pL, + NW_Uint32 offsetCharCount) +{ + NW_Uint32 c; + NW_Bool eof; + return NW_HTMLP_Lexer_PeekOrAdvanceOffset(NW_TRUE, pL, + offsetCharCount, &c, &eof); +} + +void NW_HTMLP_Lexer_GetPosition(NW_HTMLP_Lexer_t* pL, NW_HTMLP_Lexer_Position_t* pPosition) +{ + pPosition->readPosition = pL->readPosition; + pPosition->charPosition = pL->charPosition; + pPosition->crCount = pL->lineColumn.crCount; + pPosition->lfCount = pL->lineColumn.lfCount; + pPosition->charsSinceLastCR = pL->lineColumn.charsSinceLastCR; + pPosition->charsSinceLastLF = pL->lineColumn.charsSinceLastLF; + pPosition->end = pL->end; +} + +/* Note: Setting the position (similar to seeking in a file) is in general +not possible without reading the characters (usually reading forward) because +character encoding may use a variable numbers of bytes per character. This is +here so that if you have defined a valid interval, then you can reposition to +the beginning of the interval. Setting to the position to a bad value will +not always be caught immediately. Don't forget to also save and set line +and column with position. */ +TBrowserStatusCode NW_HTMLP_Lexer_SetPosition(NW_HTMLP_Lexer_t* pL, NW_HTMLP_Lexer_Position_t* pPosition) +{ + if ((pPosition->readPosition > pL->byteCount) + || (pPosition->end && (pPosition->readPosition != pL->byteCount))) { + return KBrsrFailure; + } + pL->readPosition = pPosition->readPosition; + pL->charPosition = pPosition->charPosition; + pL->lineColumn.crCount = pPosition->crCount; + pL->lineColumn.lfCount = pPosition->lfCount; + pL->lineColumn.charsSinceLastCR = pPosition->charsSinceLastCR; + pL->lineColumn.charsSinceLastLF = pPosition->charsSinceLastLF; + pL->end = pPosition->end; + + return KBrsrSuccess; +} + +TBrowserStatusCode NW_HTMLP_Lexer_IsSpace(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch) +{ + NW_Uint32 c; + TBrowserStatusCode e; + NW_Bool eof; + + *pMatch = NW_FALSE; + e = NW_HTMLP_Lexer_Peek(pL, &c, &eof); + if (BRSR_STAT_IS_FAILURE(e)) { + return e; + } + if (eof == NW_TRUE) { + return KBrsrSuccess; + } + /* "space" is defined in HTML to be the following codepoints: + 0x20 (space), 0x9 (tab), 0xc (form feed), 0x200b (zero-width space), + 0xd (cr), 0xa (lf) */ + if ((c == 0x20U) || (c == 0x9U) || (c == 0xcU) + || (c== 0x200bU) || (c == 0xdU) || (c == 0xaU)) { + *pMatch = NW_TRUE; + } + return KBrsrSuccess; +} + +TBrowserStatusCode NW_HTMLP_Lexer_IsCRLF(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch) +{ + NW_Uint32 c; + TBrowserStatusCode e; + NW_Bool eof; + + *pMatch = NW_FALSE; + e = NW_HTMLP_Lexer_Peek(pL, &c, &eof); + if (BRSR_STAT_IS_FAILURE(e)) { + return e; + } + if (eof == NW_TRUE) { + return KBrsrSuccess; + } + /* 0xd (CR), 0xa (LF) */ + if ((c == 0xdU) || (c == 0xaU)) { + *pMatch = NW_TRUE; + } + return KBrsrSuccess; +} + +/* on return: *pMatch == NW_TRUE if character is in [a-zA-Z] */ +TBrowserStatusCode NW_HTMLP_Lexer_IsAsciiLetter(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch) +{ + NW_Uint32 c; + TBrowserStatusCode e; + NW_Bool eof; + + *pMatch = NW_FALSE; + e = NW_HTMLP_Lexer_Peek(pL, &c, &eof); + if (BRSR_STAT_IS_FAILURE(e)) { + return e; + } + if (eof == NW_TRUE) { + return KBrsrSuccess; + } + if ( ( (c >= (NW_Uint32)'a') && (c <= (NW_Uint32)'z') ) + || ( (c >= (NW_Uint32)'A') && (c <= (NW_Uint32)'Z') ) ) { + *pMatch = NW_TRUE; + } + return KBrsrSuccess; +} + +/* on return: *pMatch == NW_TRUE if character is in [0-9] */ +TBrowserStatusCode NW_HTMLP_Lexer_IsAsciiDigit(NW_HTMLP_Lexer_t* pL, NW_Bool* pMatch) +{ + NW_Uint32 c; + TBrowserStatusCode e; + NW_Bool eof; + + *pMatch = NW_FALSE; + e = NW_HTMLP_Lexer_Peek(pL, &c, &eof); + if (BRSR_STAT_IS_FAILURE(e)) { + return e; + } + if (eof == NW_TRUE) { + return KBrsrSuccess; + } + if ((c >= (NW_Uint32)'0') && (c <= (NW_Uint32)'9')) { + *pMatch = NW_TRUE; + } + return KBrsrSuccess; +} + +/* *pMatch is NW_TRUE iff ASCII string matches the text in its encoding */ +TBrowserStatusCode NW_HTMLP_Lexer_AsciiCharCompare(NW_HTMLP_Lexer_t* pL, + NW_Uint8 asciiChar, + NW_Bool* pMatch) +{ + NW_Uint32 c_text; + TBrowserStatusCode e; + NW_Bool eof; + + *pMatch = NW_FALSE; + e = NW_HTMLP_Lexer_Peek(pL, &c_text, &eof); + if (BRSR_STAT_IS_FAILURE(e)) { + return e; + } + if (eof == NW_TRUE) { + return KBrsrSuccess; + } + if (c_text == asciiChar) { + *pMatch = NW_TRUE; + } + return KBrsrSuccess; +} + +/* +on entry: no assumptions +on return: If matched string, then *pMatch == NW_TRUE. +.........: If did not match string, then *pMatch == NW_FALSE. +.........: In either case, lexer read position is unchanged +eof handling: if encounters EOF while attempting operation then returns +............: *pMatch == NW_FALSE and KBrsrSuccess, and +............: lexer read position is unchanged +on error return: return value is not KBrsrSuccess, *pMatch == NW_FALSE +...............: and lexer read position is unchanged +*/ +TBrowserStatusCode NW_HTMLP_Lexer_AsciiStringCompare(NW_HTMLP_Lexer_t* pL, + NW_Uint32 asciiCharCount, + const NW_Uint8* pString, + NW_Bool* pMatch) +{ + return NW_HTMLP_Lexer_AsciiStringCompareCase(pL, + asciiCharCount, + pString, + NW_TRUE, + pMatch); +} + +/* perform the same functionality as NW_HTMLP_Lexer_AsciiStringCompareCase + except for the ability to perform both case insensitive check and + case sensitive check +*/ +TBrowserStatusCode NW_HTMLP_Lexer_AsciiStringCompareCase(NW_HTMLP_Lexer_t* pL, + NW_Uint32 asciiCharCount, + const NW_Uint8* pString, + NW_Bool CaseSensitive, + NW_Bool* pMatch) +{ + NW_Uint32 c_text; + NW_Uint32 i; + TBrowserStatusCode e = KBrsrSuccess; + NW_Bool eof; + + *pMatch = NW_FALSE; + NW_ASSERT(asciiCharCount); + for (i = 0; i < asciiCharCount; i++) { + + e = NW_HTMLP_Lexer_PeekOffset(pL, i, &c_text, &eof); + if (BRSR_STAT_IS_FAILURE(e)) { + break; + } + if (eof == NW_TRUE) { + break; + } + if (c_text != pString[i]) { + if (!CaseSensitive) + { + if (c_text + 'A' - 'a' == pString[i]) //Small case to Upper + continue; + if(c_text + 'a' - 'A' == pString[i]) //Upper case to small + continue; + } + break; + } + } + if (i == asciiCharCount) { + *pMatch = NW_TRUE; + } + return e; +} + +/* Sets start, stop, charStart, charStop to current read position. */ +void NW_HTMLP_Interval_Start(NW_HTMLP_Interval_t* pI, NW_HTMLP_Lexer_t* pL) +{ + /* set both start and stop for safety in later use */ + pI->start = pI->stop = pL->readPosition; + pI->charStart = pI->charStop = pL->charPosition; +} + +/* Sets stop to current reader position. */ +void NW_HTMLP_Interval_Stop(NW_HTMLP_Interval_t* pI, NW_HTMLP_Lexer_t* pL) +{ + pI->stop = pL->readPosition; + pI->charStop = pL->charPosition; +} + +/* Returns an estimate of the current line and column position in the text. +It is an estimate because it has to guess at what the intended line ending +sequence is using a count of CR and LF characters. Line and Column indices +are 1-based not 0-based. */ +void +NW_HTMLP_Lexer_GetLineColumn(NW_HTMLP_Lexer_t* pT, NW_Uint32* pLine, + NW_Uint32* pColumn) +{ + NW_Uint32 crCount, lfCount, charsSinceCR, charsSinceLF; + crCount = pT->lineColumn.crCount; + lfCount = pT->lineColumn.lfCount; + charsSinceCR = pT->lineColumn.charsSinceLastCR; + charsSinceLF = pT->lineColumn.charsSinceLastLF; + if (crCount == lfCount) { + /* assume CR, LF, DOS style */ + /* use a bias in favor of CR followed by LF + which will give the correct column for DOS */ + *pLine = lfCount + 1; + *pColumn = charsSinceLF + 1; + } else if (lfCount == 0) { + /* assume CR only, Unix style */ + *pLine = crCount + 1; + *pColumn = charsSinceCR + 1; + } else if (crCount == 0) { + /* assume LF only, Mac style */ + *pLine = lfCount + 1; + *pColumn = charsSinceLF + 1; + } else { + /* an unclear situation so use + thresholds on the ratio to guess */ + NW_Uint32 ratio; + ratio = ((crCount * 100) / lfCount); + if (ratio > 300) {/* more than 3 to 1 crCount to lfCount */ + /* assume CR only, Unix style */ + *pLine = crCount + 1; + *pColumn = charsSinceCR + 1; + } else if (ratio < 33) {/* less than 1 to 3 crCount to lfCount */ + /* assume LF only, Mac style */ + *pLine = lfCount + 1; + *pColumn = charsSinceLF + 1; + } else { + /* assume CR, LF, DOS style */ + /* use a bias in favor of CR, LF sequence (DOS style) + which will give the correct column */ + *pLine = lfCount + 1; + *pColumn = charsSinceLF + 1; + } + } +} +#else + +void FeaRmeNoHTMLParser_htmlp_lexer(){ + int i = 0; + i+=1; +} +#endif /* FEA_RME_NOHTMLPARSER */