searchengine/oss/loc/analysis/src/cjkanalyzer.cpp
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 #include "CLucene/StdHeader.h"
       
    19 #include "CJKAnalyzer.h"
       
    20 
       
    21 CL_NS_DEF2(analysis,cjk)
       
    22 CL_NS_USE(analysis)
       
    23 CL_NS_USE(util)
       
    24 
       
    25 
       
    26 const TCHAR* CJKTokenizer::tokenTypeSingle = _T("single");
       
    27 const TCHAR* CJKTokenizer::tokenTypeDouble = _T("double");
       
    28 
       
    29 CJKTokenizer::CJKTokenizer(Reader* in):
       
    30 	Tokenizer(in)
       
    31 {
       
    32 	tokenType = Token::defaultType;
       
    33 	offset = 0;
       
    34 	bufferIndex = 0;
       
    35 	dataLen = 0;
       
    36 	preIsTokened = false;
       
    37 	ignoreSurrogates = true;
       
    38 }
       
    39 
       
    40 bool CJKTokenizer::next(Token* token){
       
    41 	while (true) {
       
    42 		/** how many character(s) has been stored in buffer */
       
    43 		int32_t length = 0;
       
    44 	
       
    45 		/** the position used to create Token */
       
    46 		int32_t start = offset;
       
    47 	
       
    48 		while (true) {
       
    49 			/** current character */
       
    50 			clunichar c;
       
    51 			int charlen = 1;
       
    52 	
       
    53 			offset++;
       
    54 	
       
    55 			if (bufferIndex >= dataLen) {
       
    56 				dataLen = input->read(ioBuffer);
       
    57 				bufferIndex = 0;
       
    58 			}
       
    59 	
       
    60 			if (dataLen == -1) {
       
    61 				if (length > 0) {
       
    62 					if (preIsTokened == true) {
       
    63 						length = 0;
       
    64 						preIsTokened = false;
       
    65 					} else {
       
    66 						offset--;
       
    67 					}
       
    68 					break;
       
    69 				} else {
       
    70 					offset--;
       
    71 					return false;
       
    72 				}
       
    73 			} else {
       
    74 				//get current character
       
    75 				c = ioBuffer[bufferIndex++];
       
    76 			}
       
    77 	
       
    78 			//to support surrogates, we'll need to convert the incoming utf16 into
       
    79 			//ucs4(c variable). however, gunichartables doesn't seem to classify
       
    80 			//any of the surrogates as alpha, so they are skipped anyway...
       
    81 			//so for now we just convert to ucs4 so that we dont corrupt the input.
       
    82 			if ( c >= 0xd800 || c <= 0xdfff ){
       
    83 				clunichar c2 = ioBuffer[bufferIndex];
       
    84 				if ( c2 >= 0xdc00 && c2 <= 0xdfff ){
       
    85 					bufferIndex++;
       
    86 					offset++;
       
    87 					charlen=2;
       
    88 	
       
    89 					c = (((c & 0x03ffL) << 10) | ((c2 & 0x03ffL) <<  0)) + 0x00010000L;
       
    90 				}
       
    91 			}
       
    92 	
       
    93 			//if the current character is ASCII or Extend ASCII
       
    94 			if ((c <= 0xFF) //is BASIC_LATIN
       
    95 				|| (c>=0xFF00 && c<=0xFFEF) //ascii >0x74 cast to unsigned...
       
    96 			   ) {
       
    97 				if (c >= 0xFF00) {
       
    98 					//todo: test this... only happens on platforms where char is signed, i think...
       
    99 					/** convert  HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
       
   100 					c -= 0xFEE0;
       
   101 				}
       
   102 	
       
   103 				// if the current character is a letter or "_" "+" "#"
       
   104 				if (_istalnum(c) || ((c == '_') || (c == '+') || (c == '#')) ) {
       
   105 					if (length == 0) {
       
   106 						// "javaC1C2C3C4linux" <br>
       
   107 						//      ^--: the current character begin to token the ASCII
       
   108 						// letter
       
   109 						start = offset - 1;
       
   110 					} else if (tokenType == tokenTypeDouble) {
       
   111 						// "javaC1C2C3C4linux" <br>
       
   112 						//              ^--: the previous non-ASCII
       
   113 						// : the current character
       
   114 						offset-=charlen;
       
   115 						bufferIndex-=charlen;
       
   116 						tokenType = tokenTypeSingle;
       
   117 	
       
   118 						if (preIsTokened == true) {
       
   119 							// there is only one non-ASCII has been stored
       
   120 							length = 0;
       
   121 							preIsTokened = false;
       
   122 	
       
   123 							break;
       
   124 						} else {
       
   125 							break;
       
   126 						}
       
   127 					}
       
   128 	
       
   129 					// store the LowerCase(c) in the buffer
       
   130 					buffer[length++] = _totlower((TCHAR)c);
       
   131 					tokenType = tokenTypeSingle;
       
   132 	
       
   133 					// break the procedure if buffer overflowed!
       
   134 					if (length == LUCENE_MAX_WORD_LEN) {
       
   135 						break;
       
   136 					}
       
   137 				} else if (length > 0) {
       
   138 					if (preIsTokened == true) {
       
   139 						length = 0;
       
   140 						preIsTokened = false;
       
   141 					} else {
       
   142 						break;
       
   143 					}
       
   144 				}
       
   145 			} else {
       
   146 				// non-ASCII letter, eg."C1C2C3C4"
       
   147 				if ( _istalpha(c) || (!ignoreSurrogates && c >= 0x10000) ) {
       
   148 					if (length == 0) {
       
   149 						start = offset - 1;
       
   150 						
       
   151 						if ( c < 0x00010000L )
       
   152 							buffer[length++] = (TCHAR)c;
       
   153 						else{
       
   154 							clunichar ucs4 = c - 0x00010000L;
       
   155 							buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
       
   156 							buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
       
   157 						}
       
   158 	
       
   159 						tokenType = tokenTypeDouble;
       
   160 					} else {
       
   161 						if (tokenType == tokenTypeSingle) {
       
   162 							offset-=charlen;
       
   163 							bufferIndex-=charlen;
       
   164 	
       
   165 							//return the previous ASCII characters
       
   166 							break;
       
   167 						} else {
       
   168 							if ( c < 0x00010000L )
       
   169 								buffer[length++] = (TCHAR)c;
       
   170 							else{
       
   171 								clunichar ucs4 = c - 0x00010000L;
       
   172 								buffer[length++] = (TCHAR)((ucs4 >> 10) & 0x3ff) | 0xd800;
       
   173 								buffer[length++] = (TCHAR)((ucs4 >>  0) & 0x3ff) | 0xdc00;
       
   174 							}
       
   175 							tokenType = tokenTypeDouble;
       
   176 	
       
   177 							if (length >= 2) {
       
   178 								offset-=charlen;
       
   179 								bufferIndex-=charlen;
       
   180 								preIsTokened = true;
       
   181 	
       
   182 								break;
       
   183 							}
       
   184 						}
       
   185 					}
       
   186 				} else if (length > 0) {
       
   187 					if (preIsTokened == true) {
       
   188 						// empty the buffer
       
   189 						length = 0;
       
   190 						preIsTokened = false;
       
   191 					} else {
       
   192 						break;
       
   193 					}
       
   194 				}
       
   195 			}
       
   196 		}
       
   197 		if (length > 0) {
       
   198 			buffer[length]='\0';
       
   199 			token->set(buffer,start, start+length, tokenType);
       
   200 			return true; 
       
   201 		} else if (dataLen == -1) {
       
   202 			offset--; 
       
   203 			return false;
       
   204 		}
       
   205 	}
       
   206 }
       
   207 
       
   208 TokenStream* CJKAnalyzer::tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
       
   209 	return new CJKTokenizer(reader);
       
   210 }
       
   211 
       
   212 
       
   213 CL_NS_END2