searchengine/oss/loc/analysis/src/cjkanalyzer.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#include "CLucene/StdHeader.h"
#include "CJKAnalyzer.h"

CL_NS_DEF2(analysis,cjk)
CL_NS_USE(analysis)
CL_NS_USE(util)


/**
 * Token type assigned by next() to tokens built from ASCII (and full-width
 * ASCII) letters, digits and '_', '+', '#'.
 * next() compares the current token type against this pointer with ==, so
 * the pointer itself, not only the string contents, is what identifies the type.
 */
const TCHAR* CJKTokenizer::tokenTypeSingle = _T("single");
/**
 * Token type assigned by next() to tokens built from non-ASCII alphabetic
 * characters (e.g. CJK text). Compared by pointer in next(), like
 * tokenTypeSingle.
 */
const TCHAR* CJKTokenizer::tokenTypeDouble = _T("double");

/**
 * Creates a tokenizer that reads from the given reader.
 * All reading state starts at zero, so the first call to next() fetches
 * data from the reader before any character is examined.
 *
 * @param in reader handed to the Tokenizer base class as the input
 */
CJKTokenizer::CJKTokenizer(Reader* in):
	Tokenizer(in)
{
	// Nothing has been read yet: no data in the I/O buffer, position 0.
	dataLen = 0;
	bufferIndex = 0;
	offset = 0;

	// No token has been produced yet.
	preIsTokened = false;
	tokenType = Token::defaultType;

	// While true, next() accepts code points >= 0x10000 only when
	// _istalpha() reports them as alphabetic.
	ignoreSurrogates = true;
}

/**
 * Reads the next token from the input.
 *
 * Two kinds of tokens are produced:
 *  - tokenTypeSingle: a run of ASCII letters, digits and '_', '+', '#'
 *    (full-width forms 0xFF00-0xFFEF are first mapped down to ASCII),
 *    stored lower-cased.
 *  - tokenTypeDouble: two consecutive non-ASCII alphabetic characters.
 *    The second character of each pair is pushed back and re-read as the
 *    first character of the following token, so "C1C2C3" yields "C1C2" and
 *    "C2C3". preIsTokened records that the character now at the start of the
 *    buffer was already emitted in the previous pair, so that it is dropped
 *    rather than emitted again on its own when no partner follows.
 *
 * @param token receives the token text, its start and end offsets and its type
 * @return true when a token was stored in token, false at end of input
 */
bool CJKTokenizer::next(Token* token){
	while (true) {
		/** how many character(s) have been stored in buffer */
		int32_t length = 0;
	
		/** the position used to create the Token */
		int32_t start = offset;
	
		while (true) {
			/** current character */
			clunichar c;
			/** number of UTF-16 units consumed for the current character */
			int charlen = 1;
	
			offset++;
	
			// refill the I/O buffer once all of its data has been consumed
			if (bufferIndex >= dataLen) {
				dataLen = input->read(ioBuffer);
				bufferIndex = 0;
			}
	
			// NOTE(review): only -1 is treated as end of input; confirm the
			// reader never returns 0 or another negative value here.
			if (dataLen == -1) {
				if (length > 0) {
					if (preIsTokened == true) {
						// the pending character was already emitted in the
						// previous pair: drop it
						length = 0;
						preIsTokened = false;
					} else {
						offset--;
					}
					break;
				} else {
					offset--;
					return false;
				}
			} else {
				//get current character
				c = ioBuffer[bufferIndex++];
			}
	
			//to support surrogates, we'll need to convert the incoming utf16 into
			//ucs4(c variable). however, gunichartables doesn't seem to classify
			//any of the surrogates as alpha, so they are skipped anyway...
			//so for now we just convert to ucs4 so that we dont corrupt the input.
			//
			//Only a high surrogate (0xD800-0xDBFF) can start a pair, and the low
			//surrogate is only looked at when it lies inside the data that was
			//read. A pair split across two reads is therefore not combined.
			if ( c >= 0xd800 && c <= 0xdbff && bufferIndex < dataLen ){
				clunichar c2 = ioBuffer[bufferIndex];
				if ( c2 >= 0xdc00 && c2 <= 0xdfff ){
					bufferIndex++;
					offset++;
					charlen=2;
	
					c = (((c & 0x03ffL) << 10) | ((c2 & 0x03ffL) <<  0)) + 0x00010000L;
				}
			}
	
			//if the current character is ASCII or Extend ASCII
			if ((c <= 0xFF) //is BASIC_LATIN
				|| (c>=0xFF00 && c<=0xFFEF) //ascii >0x74 cast to unsigned...
			   ) {
				if (c >= 0xFF00) {
					//todo: test this... only happens on platforms where char is signed, i think...
					/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
					c -= 0xFEE0;
				}
	
				// if the current character is a letter or "_" "+" "#"
				if (_istalnum(c) || ((c == '_') || (c == '+') || (c == '#')) ) {
					if (length == 0) {
						// "javaC1C2C3C4linux" <br>
						//      ^--: the current character begin to token the ASCII
						// letter
						start = offset - 1;
					} else if (tokenType == tokenTypeDouble) {
						// "javaC1C2C3C4linux" <br>
						//              ^--: the previous non-ASCII
						// : the current character
						// push the ASCII character back; it starts the next token
						offset-=charlen;
						bufferIndex-=charlen;
						tokenType = tokenTypeSingle;
	
						if (preIsTokened == true) {
							// the only stored non-ASCII character was already
							// emitted in the previous pair: drop it
							length = 0;
							preIsTokened = false;
						}
						break;
					}
	
					// store the LowerCase(c) in the buffer
					buffer[length++] = _totlower((TCHAR)c);
					tokenType = tokenTypeSingle;
	
					// break the procedure if buffer overflowed!
					if (length == LUCENE_MAX_WORD_LEN) {
						break;
					}
				} else if (length > 0) {
					// a separator ends the current token
					if (preIsTokened == true) {
						length = 0;
						preIsTokened = false;
					} else {
						break;
					}
				}
			} else {
				// non-ASCII letter, eg."C1C2C3C4"
				if ( _istalpha(c) || (!ignoreSurrogates && c >= 0x10000) ) {
					if (length == 0) {
						// the token starts at the first UTF-16 unit of this
						// character, which is two units back for a surrogate pair
						start = offset - charlen;
						
						if ( c < 0x00010000L )
							buffer[length++] = (TCHAR)c;
						else{
							// store the code point back as a surrogate pair
							clunichar ucs4 = c - 0x00010000L;
							buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
							buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
						}
	
						tokenType = tokenTypeDouble;
					} else {
						if (tokenType == tokenTypeSingle) {
							// push the character back; it starts the next token
							offset-=charlen;
							bufferIndex-=charlen;
	
							//return the previous ASCII characters
							break;
						} else {
							if ( c < 0x00010000L )
								buffer[length++] = (TCHAR)c;
							else{
								// store the code point back as a surrogate pair
								clunichar ucs4 = c - 0x00010000L;
								buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
								buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
							}
							tokenType = tokenTypeDouble;
	
							if (length >= 2) {
								// the pair is complete: push its second character
								// back so that it also starts the next pair
								offset-=charlen;
								bufferIndex-=charlen;
								preIsTokened = true;
	
								break;
							}
						}
					}
				} else if (length > 0) {
					if (preIsTokened == true) {
						// empty the buffer
						length = 0;
						preIsTokened = false;
					} else {
						break;
					}
				}
			}
		}
		if (length > 0) {
			// NOTE(review): length can equal LUCENE_MAX_WORD_LEN here; confirm that
			// buffer is declared with room for the terminator.
			buffer[length]='\0';
			token->set(buffer,start, start+length, tokenType);
			return true; 
		} else if (dataLen == -1) {
			// end of input reached after a pending character was dropped
			offset--; 
			return false;
		}
	}
}

/**
 * Builds the token stream for a field.
 *
 * @param fieldName name of the field; not used by this analyzer
 * @param reader    source of the text to tokenize
 * @return a CJKTokenizer allocated with new that reads from reader
 */
TokenStream* CJKAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
	CJKTokenizer* tokenizer = new CJKTokenizer(reader);
	return tokenizer;
}


CL_NS_END2