searchengine/cpix/cpix/src/analyzer.cpp
changeset 1 6f2c1c46032b
parent 0 671dee74050a
child 2 6c1a2771f4b7
--- a/searchengine/cpix/cpix/src/analyzer.cpp	Mon Apr 19 14:40:16 2010 +0300
+++ b/searchengine/cpix/cpix/src/analyzer.cpp	Mon May 03 13:33:22 2010 +0300
@@ -38,6 +38,7 @@
 
 #include "indevicecfg.h" 
 
+#include "initparams.h"
 namespace
 {
     const char AGGR_NONFILEREADERPROXY_ERR[] 
@@ -50,6 +51,44 @@
 
 namespace Cpix {
 
+	/**
+	 * Creates a prefix generating filter on top of the given token stream.
+	 *
+	 * @param in              token stream that supplies the tokens to be
+	 *                        turned into prefixes
+	 * @param deleteTS        forwarded unchanged to the TokenFilter base
+	 *                        class together with 'in' (presumably tells
+	 *                        whether the filter deletes 'in' - confirm
+	 *                        against TokenFilter)
+	 * @param maxPrefixLength upper bound for the length of the prefixes
+	 *                        produced by next()
+	 */
+	PrefixGenerator::PrefixGenerator(lucene::analysis::TokenStream* in,
+									 bool deleteTS,
+									 size_t maxPrefixLength)
+		: TokenFilter(in, deleteTS),
+		  token_(),
+		  prefixLength_(0), // zero makes the first next() call read from the input
+		  maxPrefixLength_(maxPrefixLength)
+	{
+	}
+	
+	
+	/**
+	 * Destructor. Its body performs no work of its own.
+	 */
+	PrefixGenerator::~PrefixGenerator()
+	{
+	}
+
+	
+	/**
+	 * Produces the next prefix token.
+	 *
+	 * For every token read from the input, the prefixes of its text are
+	 * emitted one per call, starting with the longest one (at most
+	 * maxPrefixLength_ characters) and ending with the one character
+	 * prefix. The first prefix of an input token carries the position
+	 * increment held by the buffered token after reading it from the
+	 * input; all shorter prefixes of the same input token are emitted with
+	 * position increment 0. Every prefix carries the offsets and the type
+	 * of the buffered input token.
+	 *
+	 * Input tokens for which the computed prefix length is zero are
+	 * skipped.
+	 *
+	 * @param token receives the generated prefix token
+	 * @return true if a prefix was stored into 'token', false once the
+	 *         input has no more tokens
+	 */
+	bool PrefixGenerator::next(lucene::analysis::Token* token) {
+		// Assume that we are still working on the previously read input
+		// token. This is overridden below if a new input token is fetched.
+		token_.setPositionIncrement(0);
+
+		// No prefixes pending: fetch input tokens until one yields a prefix
+		while (prefixLength_ == 0) {
+			token_.setPositionIncrement(1); // default position increment
+			const bool gotToken = input->next(&token_);
+			if (!gotToken) {
+				return false;
+			}
+			prefixLength_ = std::min(token_.termTextLength(), maxPrefixLength_);
+		}
+
+		// Shorten the buffered token to the prefix that is emitted now
+		const std::wstring prefix =
+			std::wstring(token_.termText()).substr(0, prefixLength_);
+		token_.setText(prefix.c_str());
+
+		// Hand a copy of the buffered token to the caller. The position
+		// increment is transferred separately, after set().
+		token->set(token_.termText(),
+				   token_.startOffset(),
+				   token_.endOffset(),
+				   token_.type());
+		token->setPositionIncrement(token_.getPositionIncrement());
+
+		// The next call emits a prefix that is one character shorter
+		--prefixLength_;
+		return true;
+	}
 
     AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer, 
                                                          DocumentFieldIterator* fields) 
@@ -136,6 +175,12 @@
                                                                        lucene::util::Reader * reader) {
         if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
             return new AggregateFieldTokenStream( analyzer_, document_.fields()); 
+        } else if ( wcscmp( fieldName, LCPIX_DEFAULT_PREFIX_FIELD ) == 0 ) {
+            return
+				new PrefixGenerator(
+					new AggregateFieldTokenStream( analyzer_, document_.fields()),
+					true,
+					OPTIMIZED_PREFIX_MAX_LENGTH);
         } else {
             return analyzer_.tokenStream( fieldName, reader ); 
         }
@@ -428,6 +473,34 @@
         int min_, max_;
         std::auto_ptr<TokenStreamFactory> factory_; 
     };
+    
+    /**
+     * Specialized PrefixGenerator factory is needed, because PrefixGenerator
+     * requires the max prefix size. 
+     *
+     * The invokation must carry exactly one integer literal parameter, 
+     * which is used as the maximum prefix length. Negative values are 
+     * rejected, because the length is handed to PrefixGenerator as a 
+     * size_t and a negative value would turn into a huge maximum length.
+     */
+    template<>
+    class FilterFactory<PrefixGenerator> : public TokenStreamFactory 
+    {
+    public:
+        /**
+         * @param invokation the parsed filter invokation; its only 
+         *                   parameter is the maximum prefix length
+         * @param factory    factory of the token stream to be filtered; 
+         *                   taken over by this object
+         * @throws a CPix exception (via THROW_CPIXEXC), if the parameter 
+         *         list is not a single integer literal or if the integer 
+         *         is negative
+         */
+        FilterFactory(const Invokation& invokation, 
+                      auto_ptr<TokenStreamFactory> factory) 
+            : maxPrefixLength_(0),
+              factory_(factory) {
+            using namespace Cpt::Parser;
+
+            // Resolve the parameter once; stays null unless there is 
+            // exactly one parameter and it is an integer literal
+            IntegerLit* lengthLit = 0;
+            if (invokation.params().size() == 1) {
+                lengthLit = dynamic_cast<IntegerLit*>(invokation.params()[0]);
+            }
+            if (!lengthLit) {
+                THROW_CPIXEXC("Prefix generator takes exactly one integer parameter");
+            }
+
+            maxPrefixLength_ = lengthLit->value();
+            if (maxPrefixLength_ < 0) {
+                THROW_CPIXEXC("Prefix generator's maximum prefix length must not be negative");
+            }
+        }
+
+        /**
+         * Creates the token stream of the wrapped factory and decorates it 
+         * with a PrefixGenerator using the configured maximum prefix 
+         * length. 
+         */
+        virtual lucene::analysis::TokenStream* tokenStream(const TCHAR          * fieldName, 
+                                                           lucene::util::Reader * reader) {
+            // maxPrefixLength_ is known to be non-negative, see constructor
+            return _CLNEW PrefixGenerator(factory_->tokenStream(fieldName, reader), 
+                                          true, 
+                                          static_cast<size_t>(maxPrefixLength_)); 
+        }
+    private: 
+        int maxPrefixLength_;
+        std::auto_ptr<TokenStreamFactory> factory_; 
+    };
+
 
     typedef auto_ptr<TokenStreamFactory> (*TokenizerFactoryCreator)(const Invokation& invokation);
     typedef auto_ptr<TokenStreamFactory> (*FilterFactoryCreator)(const Invokation& invokation, 
@@ -507,6 +580,7 @@
         {CPIX_FILTER_STOP, 		FilterFactoryCtor<lucene::analysis::StopFilter>::create},
         {CPIX_FILTER_STEM, 		FilterFactoryCtor<lucene::analysis::SnowballFilter>::create},
         {CPIX_FILTER_LENGTH, 	FilterFactoryCtor<lucene::analysis::LengthFilter>::create},
+        {CPIX_FILTER_PREFIXES, 	FilterFactoryCtor<PrefixGenerator>::create},
 
 // 		TODO: Add more Filters