searchengine/oss/loc/analysis/src/cjkanalyzer.cpp
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
hgs
parents:
diff changeset
    18
#include "CLucene/StdHeader.h"
hgs
parents:
diff changeset
    19
#include "CJKAnalyzer.h"
hgs
parents:
diff changeset
    20
hgs
parents:
diff changeset
    21
CL_NS_DEF2(analysis,cjk)
hgs
parents:
diff changeset
    22
CL_NS_USE(analysis)
hgs
parents:
diff changeset
    23
CL_NS_USE(util)
hgs
parents:
diff changeset
    24
hgs
parents:
diff changeset
    25
hgs
parents:
diff changeset
    26
/** Token type string that next() assigns to tokens built from the ASCII / full-width Latin branch. */
const TCHAR* CJKTokenizer::tokenTypeSingle = _T("single");
hgs
parents:
diff changeset
    27
/** Token type string that next() assigns to tokens built from the non-ASCII (e.g. CJK) branch. */
const TCHAR* CJKTokenizer::tokenTypeDouble = _T("double");
hgs
parents:
diff changeset
    28
hgs
parents:
diff changeset
    29
/**
 * Builds a tokenizer over the given reader and resets all scanning state.
 *
 * @param in reader handed to the Tokenizer base class; next() pulls its
 *           characters through input->read().
 */
CJKTokenizer::CJKTokenizer(Reader* in):
	Tokenizer(in)
{
	// Position bookkeeping: nothing has been read or consumed yet, so the
	// first call to next() will trigger a read from the input.
	dataLen = 0;
	bufferIndex = 0;
	offset = 0;

	// Token classification state.
	tokenType = Token::defaultType;
	preIsTokened = false;

	// Characters >= 0x10000 are only tokenized when _istalpha() accepts them.
	ignoreSurrogates = true;
}
hgs
parents:
diff changeset
    39
hgs
parents:
diff changeset
    40
bool CJKTokenizer::next(Token* token){
hgs
parents:
diff changeset
    41
	while (true) {
hgs
parents:
diff changeset
    42
		/** how many character(s) has been stored in buffer */
hgs
parents:
diff changeset
    43
		int32_t length = 0;
hgs
parents:
diff changeset
    44
	
hgs
parents:
diff changeset
    45
		/** the position used to create Token */
hgs
parents:
diff changeset
    46
		int32_t start = offset;
hgs
parents:
diff changeset
    47
	
hgs
parents:
diff changeset
    48
		while (true) {
hgs
parents:
diff changeset
    49
			/** current character */
hgs
parents:
diff changeset
    50
			clunichar c;
hgs
parents:
diff changeset
    51
			int charlen = 1;
hgs
parents:
diff changeset
    52
	
hgs
parents:
diff changeset
    53
			offset++;
hgs
parents:
diff changeset
    54
	
hgs
parents:
diff changeset
    55
			if (bufferIndex >= dataLen) {
hgs
parents:
diff changeset
    56
				dataLen = input->read(ioBuffer);
hgs
parents:
diff changeset
    57
				bufferIndex = 0;
hgs
parents:
diff changeset
    58
			}
hgs
parents:
diff changeset
    59
	
hgs
parents:
diff changeset
    60
			if (dataLen == -1) {
hgs
parents:
diff changeset
    61
				if (length > 0) {
hgs
parents:
diff changeset
    62
					if (preIsTokened == true) {
hgs
parents:
diff changeset
    63
						length = 0;
hgs
parents:
diff changeset
    64
						preIsTokened = false;
hgs
parents:
diff changeset
    65
					} else {
hgs
parents:
diff changeset
    66
						offset--;
hgs
parents:
diff changeset
    67
					}
hgs
parents:
diff changeset
    68
					break;
hgs
parents:
diff changeset
    69
				} else {
hgs
parents:
diff changeset
    70
					offset--;
hgs
parents:
diff changeset
    71
					return false;
hgs
parents:
diff changeset
    72
				}
hgs
parents:
diff changeset
    73
			} else {
hgs
parents:
diff changeset
    74
				//get current character
hgs
parents:
diff changeset
    75
				c = ioBuffer[bufferIndex++];
hgs
parents:
diff changeset
    76
			}
hgs
parents:
diff changeset
    77
	
hgs
parents:
diff changeset
    78
			//to support surrogates, we'll need to convert the incoming utf16 into
hgs
parents:
diff changeset
    79
			//ucs4(c variable). however, gunichartables doesn't seem to classify
hgs
parents:
diff changeset
    80
			//any of the surrogates as alpha, so they are skipped anyway...
hgs
parents:
diff changeset
    81
			//so for now we just convert to ucs4 so that we dont corrupt the input.
hgs
parents:
diff changeset
    82
			if ( c >= 0xd800 || c <= 0xdfff ){
hgs
parents:
diff changeset
    83
				clunichar c2 = ioBuffer[bufferIndex];
hgs
parents:
diff changeset
    84
				if ( c2 >= 0xdc00 && c2 <= 0xdfff ){
hgs
parents:
diff changeset
    85
					bufferIndex++;
hgs
parents:
diff changeset
    86
					offset++;
hgs
parents:
diff changeset
    87
					charlen=2;
hgs
parents:
diff changeset
    88
	
hgs
parents:
diff changeset
    89
					c = (((c & 0x03ffL) << 10) | ((c2 & 0x03ffL) <<  0)) + 0x00010000L;
hgs
parents:
diff changeset
    90
				}
hgs
parents:
diff changeset
    91
			}
hgs
parents:
diff changeset
    92
	
hgs
parents:
diff changeset
    93
			//if the current character is ASCII or Extend ASCII
hgs
parents:
diff changeset
    94
			if ((c <= 0xFF) //is BASIC_LATIN
hgs
parents:
diff changeset
    95
				|| (c>=0xFF00 && c<=0xFFEF) //ascii >0x74 cast to unsigned...
hgs
parents:
diff changeset
    96
			   ) {
hgs
parents:
diff changeset
    97
				if (c >= 0xFF00) {
hgs
parents:
diff changeset
    98
					//todo: test this... only happens on platforms where char is signed, i think...
hgs
parents:
diff changeset
    99
					/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
hgs
parents:
diff changeset
   100
					c -= 0xFEE0;
hgs
parents:
diff changeset
   101
				}
hgs
parents:
diff changeset
   102
	
hgs
parents:
diff changeset
   103
				// if the current character is a letter or "_" "+" "#"
hgs
parents:
diff changeset
   104
				if (_istalnum(c) || ((c == '_') || (c == '+') || (c == '#')) ) {
hgs
parents:
diff changeset
   105
					if (length == 0) {
hgs
parents:
diff changeset
   106
						// "javaC1C2C3C4linux" <br>
hgs
parents:
diff changeset
   107
						//      ^--: the current character begin to token the ASCII
hgs
parents:
diff changeset
   108
						// letter
hgs
parents:
diff changeset
   109
						start = offset - 1;
hgs
parents:
diff changeset
   110
					} else if (tokenType == tokenTypeDouble) {
hgs
parents:
diff changeset
   111
						// "javaC1C2C3C4linux" <br>
hgs
parents:
diff changeset
   112
						//              ^--: the previous non-ASCII
hgs
parents:
diff changeset
   113
						// : the current character
hgs
parents:
diff changeset
   114
						offset-=charlen;
hgs
parents:
diff changeset
   115
						bufferIndex-=charlen;
hgs
parents:
diff changeset
   116
						tokenType = tokenTypeSingle;
hgs
parents:
diff changeset
   117
	
hgs
parents:
diff changeset
   118
						if (preIsTokened == true) {
hgs
parents:
diff changeset
   119
							// there is only one non-ASCII has been stored
hgs
parents:
diff changeset
   120
							length = 0;
hgs
parents:
diff changeset
   121
							preIsTokened = false;
hgs
parents:
diff changeset
   122
	
hgs
parents:
diff changeset
   123
							break;
hgs
parents:
diff changeset
   124
						} else {
hgs
parents:
diff changeset
   125
							break;
hgs
parents:
diff changeset
   126
						}
hgs
parents:
diff changeset
   127
					}
hgs
parents:
diff changeset
   128
	
hgs
parents:
diff changeset
   129
					// store the LowerCase(c) in the buffer
hgs
parents:
diff changeset
   130
					buffer[length++] = _totlower((TCHAR)c);
hgs
parents:
diff changeset
   131
					tokenType = tokenTypeSingle;
hgs
parents:
diff changeset
   132
	
hgs
parents:
diff changeset
   133
					// break the procedure if buffer overflowed!
hgs
parents:
diff changeset
   134
					if (length == LUCENE_MAX_WORD_LEN) {
hgs
parents:
diff changeset
   135
						break;
hgs
parents:
diff changeset
   136
					}
hgs
parents:
diff changeset
   137
				} else if (length > 0) {
hgs
parents:
diff changeset
   138
					if (preIsTokened == true) {
hgs
parents:
diff changeset
   139
						length = 0;
hgs
parents:
diff changeset
   140
						preIsTokened = false;
hgs
parents:
diff changeset
   141
					} else {
hgs
parents:
diff changeset
   142
						break;
hgs
parents:
diff changeset
   143
					}
hgs
parents:
diff changeset
   144
				}
hgs
parents:
diff changeset
   145
			} else {
hgs
parents:
diff changeset
   146
				// non-ASCII letter, eg."C1C2C3C4"
hgs
parents:
diff changeset
   147
				if ( _istalpha(c) || (!ignoreSurrogates && c >= 0x10000) ) {
hgs
parents:
diff changeset
   148
					if (length == 0) {
hgs
parents:
diff changeset
   149
						start = offset - 1;
hgs
parents:
diff changeset
   150
						
hgs
parents:
diff changeset
   151
						if ( c < 0x00010000L )
hgs
parents:
diff changeset
   152
							buffer[length++] = (TCHAR)c;
hgs
parents:
diff changeset
   153
						else{
hgs
parents:
diff changeset
   154
							clunichar ucs4 = c - 0x00010000L;
hgs
parents:
diff changeset
   155
							buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
hgs
parents:
diff changeset
   156
							buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
hgs
parents:
diff changeset
   157
						}
hgs
parents:
diff changeset
   158
	
hgs
parents:
diff changeset
   159
						tokenType = tokenTypeDouble;
hgs
parents:
diff changeset
   160
					} else {
hgs
parents:
diff changeset
   161
						if (tokenType == tokenTypeSingle) {
hgs
parents:
diff changeset
   162
							offset-=charlen;
hgs
parents:
diff changeset
   163
							bufferIndex-=charlen;
hgs
parents:
diff changeset
   164
	
hgs
parents:
diff changeset
   165
							//return the previous ASCII characters
hgs
parents:
diff changeset
   166
							break;
hgs
parents:
diff changeset
   167
						} else {
hgs
parents:
diff changeset
   168
							if ( c < 0x00010000L )
hgs
parents:
diff changeset
   169
								buffer[length++] = (TCHAR)c;
hgs
parents:
diff changeset
   170
							else{
hgs
parents:
diff changeset
   171
								clunichar ucs4 = c - 0x00010000L;
hgs
parents:
diff changeset
   172
								buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
hgs
parents:
diff changeset
   173
								buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
hgs
parents:
diff changeset
   174
							}
hgs
parents:
diff changeset
   175
							tokenType = tokenTypeDouble;
hgs
parents:
diff changeset
   176
	
hgs
parents:
diff changeset
   177
							if (length >= 2) {
hgs
parents:
diff changeset
   178
								offset-=charlen;
hgs
parents:
diff changeset
   179
								bufferIndex-=charlen;
hgs
parents:
diff changeset
   180
								preIsTokened = true;
hgs
parents:
diff changeset
   181
	
hgs
parents:
diff changeset
   182
								break;
hgs
parents:
diff changeset
   183
							}
hgs
parents:
diff changeset
   184
						}
hgs
parents:
diff changeset
   185
					}
hgs
parents:
diff changeset
   186
				} else if (length > 0) {
hgs
parents:
diff changeset
   187
					if (preIsTokened == true) {
hgs
parents:
diff changeset
   188
						// empty the buffer
hgs
parents:
diff changeset
   189
						length = 0;
hgs
parents:
diff changeset
   190
						preIsTokened = false;
hgs
parents:
diff changeset
   191
					} else {
hgs
parents:
diff changeset
   192
						break;
hgs
parents:
diff changeset
   193
					}
hgs
parents:
diff changeset
   194
				}
hgs
parents:
diff changeset
   195
			}
hgs
parents:
diff changeset
   196
		}
hgs
parents:
diff changeset
   197
		if (length > 0) {
hgs
parents:
diff changeset
   198
			buffer[length]='\0';
hgs
parents:
diff changeset
   199
			token->set(buffer,start, start+length, tokenType);
hgs
parents:
diff changeset
   200
			return true; 
hgs
parents:
diff changeset
   201
		} else if (dataLen == -1) {
hgs
parents:
diff changeset
   202
			offset--; 
hgs
parents:
diff changeset
   203
			return false;
hgs
parents:
diff changeset
   204
		}
hgs
parents:
diff changeset
   205
	}
hgs
parents:
diff changeset
   206
}
hgs
parents:
diff changeset
   207
hgs
parents:
diff changeset
   208
/**
 * Creates the token stream used to analyze a field.
 *
 * @param fieldName name of the field being analyzed; not consulted here.
 * @param reader    source of the text, passed to the new CJKTokenizer.
 * @return a newly allocated CJKTokenizer reading from reader.
 */
TokenStream* CJKAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
	TokenStream* stream = new CJKTokenizer(reader);
	return stream;
}
hgs
parents:
diff changeset
   211
hgs
parents:
diff changeset
   212
hgs
parents:
diff changeset
   213
CL_NS_END2