diff -r d4d56f5e7c55 -r 65456528cac2 searchengine/oss/loc/analysis/src/cjkanalyzer.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/searchengine/oss/loc/analysis/src/cjkanalyzer.cpp Fri Oct 15 12:09:28 2010 +0530 @@ -0,0 +1,213 @@ +/* +* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies). +* All rights reserved. +* This component and the accompanying materials are made available +* under the terms of "Eclipse Public License v1.0" +* which accompanies this distribution, and is available +* at the URL "http://www.eclipse.org/legal/epl-v10.html". +* +* Initial Contributors: +* Nokia Corporation - initial contribution. +* +* Contributors: +* +* Description: +* +*/ + +#include "CLucene/StdHeader.h" +#include "CJKAnalyzer.h" + +CL_NS_DEF2(analysis,cjk) +CL_NS_USE(analysis) +CL_NS_USE(util) + + +const TCHAR* CJKTokenizer::tokenTypeSingle = _T("single"); +const TCHAR* CJKTokenizer::tokenTypeDouble = _T("double"); + +CJKTokenizer::CJKTokenizer(Reader* in): + Tokenizer(in) +{ + tokenType = Token::defaultType; + offset = 0; + bufferIndex = 0; + dataLen = 0; + preIsTokened = false; + ignoreSurrogates = true; +} + +bool CJKTokenizer::next(Token* token){ + while (true) { + /** how many character(s) has been stored in buffer */ + int32_t length = 0; + + /** the position used to create Token */ + int32_t start = offset; + + while (true) { + /** current character */ + clunichar c; + int charlen = 1; + + offset++; + + if (bufferIndex >= dataLen) { + dataLen = input->read(ioBuffer); + bufferIndex = 0; + } + + if (dataLen == -1) { + if (length > 0) { + if (preIsTokened == true) { + length = 0; + preIsTokened = false; + } else { + offset--; + } + break; + } else { + offset--; + return false; + } + } else { + //get current character + c = ioBuffer[bufferIndex++]; + } + + //to support surrogates, we'll need to convert the incoming utf16 into + //ucs4(c variable). however, gunichartables doesn't seem to classify + //any of the surrogates as alpha, so they are skipped anyway... + //so for now we just convert to ucs4 so that we dont corrupt the input. + if ( c >= 0xd800 || c <= 0xdfff ){ + clunichar c2 = ioBuffer[bufferIndex]; + if ( c2 >= 0xdc00 && c2 <= 0xdfff ){ + bufferIndex++; + offset++; + charlen=2; + + c = (((c & 0x03ffL) << 10) | ((c2 & 0x03ffL) << 0)) + 0x00010000L; + } + } + + //if the current character is ASCII or Extend ASCII + if ((c <= 0xFF) //is BASIC_LATIN + || (c>=0xFF00 && c<=0xFFEF) //ascii >0x74 cast to unsigned... + ) { + if (c >= 0xFF00) { + //todo: test this... only happens on platforms where char is signed, i think... + /** convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */ + c -= 0xFEE0; + } + + // if the current character is a letter or "_" "+" "#" + if (_istalnum(c) || ((c == '_') || (c == '+') || (c == '#')) ) { + if (length == 0) { + // "javaC1C2C3C4linux"
+ // ^--: the current character begin to token the ASCII + // letter + start = offset - 1; + } else if (tokenType == tokenTypeDouble) { + // "javaC1C2C3C4linux"
+ // ^--: the previous non-ASCII + // : the current character + offset-=charlen; + bufferIndex-=charlen; + tokenType = tokenTypeSingle; + + if (preIsTokened == true) { + // there is only one non-ASCII has been stored + length = 0; + preIsTokened = false; + + break; + } else { + break; + } + } + + // store the LowerCase(c) in the buffer + buffer[length++] = _totlower((TCHAR)c); + tokenType = tokenTypeSingle; + + // break the procedure if buffer overflowed! + if (length == LUCENE_MAX_WORD_LEN) { + break; + } + } else if (length > 0) { + if (preIsTokened == true) { + length = 0; + preIsTokened = false; + } else { + break; + } + } + } else { + // non-ASCII letter, eg."C1C2C3C4" + if ( _istalpha(c) || (!ignoreSurrogates && c >= 0x10000) ) { + if (length == 0) { + start = offset - 1; + + if ( c < 0x00010000L ) + buffer[length++] = (TCHAR)c; + else{ + clunichar ucs4 = c - 0x00010000L; + buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800; + buffer[length++] = (TCHAR)((ucs4 >> 0) & 0x3ff) | 0xdc00; + } + + tokenType = tokenTypeDouble; + } else { + if (tokenType == tokenTypeSingle) { + offset-=charlen; + bufferIndex-=charlen; + + //return the previous ASCII characters + break; + } else { + if ( c < 0x00010000L ) + buffer[length++] = (TCHAR)c; + else{ + clunichar ucs4 = c - 0x00010000L; + buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800; + buffer[length++] = (TCHAR)((ucs4 >> 0) & 0x3ff) | 0xdc00; + } + tokenType = tokenTypeDouble; + + if (length >= 2) { + offset-=charlen; + bufferIndex-=charlen; + preIsTokened = true; + + break; + } + } + } + } else if (length > 0) { + if (preIsTokened == true) { + // empty the buffer + length = 0; + preIsTokened = false; + } else { + break; + } + } + } + } + if (length > 0) { + buffer[length]='\0'; + token->set(buffer,start, start+length, tokenType); + return true; + } else if (dataLen == -1) { + offset--; + return false; + } + } +} + +TokenStream* CJKAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) { + return new CJKTokenizer(reader); +} + + +CL_NS_END2