--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/public/thaianalysis.h Fri Oct 15 12:09:28 2010 +0530
@@ -0,0 +1,94 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: Thai language analysis - dictionary setup, ThaiWordFilter and ThaiAnalyzer
+*
+*/
+
+#ifndef THAIWORDFILTER_H_
+#define THAIWORDFILTER_H_
+
+#include <memory>
+
+#include "cpixstrtools.h"
+
+#include "Clucene.h"
+
+namespace analysis
+{
+ // Forward declarations
+ class BreakIterator;
+
+
+ /**
+ * Tells Thai analysis where the compiled Thai dictionary is located
+ * in the file system (thaiDataFile). The dictionary is loaded when
+ * Thai analysis is used for the first time, and it is then kept in
+ * memory until shutdown (see ShutdownThaiAnalysis()).
+ */
+ void InitThaiAnalysis(const char* thaiDataFile);
+
+ /**
+ * Releases the Thai dictionary if it has been loaded into memory.
+ */
+ void ShutdownThaiAnalysis();
+
+ /**
+ * Dictionary-based token filter. Reads tokens from the wrapped token
+ * stream; when a token beginning with a Thai character is encountered,
+ * the token is broken into a number of Thai word tokens, using the
+ * dictionary and a longest-matching algorithm.
+ */
+ class ThaiWordFilter : public lucene::analysis::TokenFilter
+ {
+ public:
+ // input: stream to read from. NOTE(review): deleteTs presumably passes ownership of input - confirm
+ ThaiWordFilter( lucene::analysis::TokenStream* input, bool deleteTs );
+ // Declared here, defined out of line; BreakIterator is only forward declared in this header
+ ~ThaiWordFilter();
+ // Fetches the next token into 'token'. NOTE(review): presumably returns false at end of stream - confirm
+ bool next(lucene::analysis::Token* token);
+
+ private:
+ // Owned break iterator. NOTE(review): presumably locates the Thai word boundaries - confirm in the .cpp
+ std::auto_ptr<BreakIterator> breaks_;
+ // NOTE(review): presumably holds the Thai token currently being split into words - confirm in the .cpp
+ lucene::analysis::Token thaiToken_;
+
+ };
+
+ /**
+ * Analyzer for the Thai language. Preliminary tokenization is done
+ * with StandardAnalyzer; ThaiWordFilter then splits the Thai sentence
+ * tokens into a number of Thai word tokens.
+ */
+ class ThaiAnalyzer : public lucene::analysis::Analyzer
+ {
+ public:
+ // Default constructor; takes no configuration arguments
+ ThaiAnalyzer();
+
+ public:
+ // Returns a token stream over 'reader' for field 'fieldName'; see the class description for the pipeline
+ lucene::analysis::TokenStream* tokenStream(const wchar_t* fieldName,
+ lucene::util::Reader* reader);
+
+ private:
+ // Stop word set. NOTE(review): how it is filled and used is not visible in this header - see the .cpp
+ CL_NS(util)::CLSetList<const TCHAR*> stopWords_;
+
+ };
+
+}
+
+#endif /* THAIWORDFILTER_H_ */