searchengine/cpix/cpix/src/prefixqueryparser.cpp
changeset 8 6547bf8ca13a
child 14 8bd192d47aaa
equal deleted inserted replaced
7:a5fbfefd615f 8:6547bf8ca13a
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 #include "CLucene.h"
       
    20 
       
    21 #include "cpixmaindefs.h"
       
    22 
       
    23 // internal libs
       
    24 #include "cpixparsetools.h"
       
    25 
       
    26 // internal
       
    27 #include "analyzer.h"
       
    28 
       
    29 #include "prefixqueryparser.h"
       
    30 
       
    31 #include "cpixanalyzer.h"
       
    32 #include "cluceneext.h"
       
    33 
       
    34 #include "tinyunicode.h"
       
    35 
       
    36 #include "cpixexc.h"
       
    37 
       
    38 namespace Cpix {
       
    39 	
       
    40 	using namespace lucene::analysis; 
       
    41 	using namespace lucene::search; 
       
    42 	using namespace lucene::document; 
       
    43 	using namespace lucene::util; 
       
    44 	using lucene::index::Term; 
       
    45 	using namespace std; 
       
    46 
       
    47 	namespace {
       
    48 	
       
    49 		/**
       
    50 		 * Small optimization to avoid creating extra boolean queries
       
    51 		 */
       
    52 		class QueryConstructor {
       
    53 			
       
    54 		public: 
       
    55 			QueryConstructor() : q_(), bq_(0) {}
       
    56 			
       
    57 			auto_ptr<Query> operator()() {
       
    58 				return q_; 
       
    59 			}
       
    60 			void add(auto_ptr<Query> q) {
       
    61 				if ( q.get() ) {
       
    62 					if ( bq_ ) {
       
    63 						bq_->add( q.release(), true, true, false ); 
       
    64 					} else {
       
    65 						if ( q_.get() ) {
       
    66 							auto_ptr<BooleanQuery> bq( new BooleanQuery() );
       
    67 							bq_ = bq.get();
       
    68 							bq_->add( q_.release(), true, true, false ); 
       
    69 							bq_->add( q.release(), true, true, false ); 
       
    70 							q_.reset( bq.release() ); 
       
    71 						} else {
       
    72 							q_ = q;  
       
    73 						}
       
    74 					}
       
    75 				}
       
    76 			}
       
    77 			inline void add(Query* q) {
       
    78 				add( auto_ptr<Query>( q ) );
       
    79 			}
       
    80 	
       
    81 		private: 
       
    82 			
       
    83 			auto_ptr<Query> q_; 
       
    84 			BooleanQuery* bq_; 
       
    85 			
       
    86 		};
       
    87 		
       
    88 		/**
       
    89 		 * TokenStream interface with one modification: 
       
    90 		 *   * Ability to check if returned token was last one in the stream 
       
    91 		 */
       
    92 		class HasNextTokenStream {
       
    93 			
       
    94 			public:
       
    95 			
       
    96 				HasNextTokenStream(TokenStream* tokens)
       
    97 				:   i_(true), 
       
    98 					next_(),
       
    99 					buf_(),
       
   100 					tokens_( tokens ){
       
   101 					next_ = tokens_->next(&buf_[0]);
       
   102 				}
       
   103 		
       
   104 				inline Token& next() {
       
   105 					next_ = tokens_->next(&buf_[i_]); 
       
   106 					i_ = !i_;
       
   107 					return buf_[i_]; 
       
   108 				}
       
   109 			
       
   110 				inline bool hasNext() {
       
   111 					return next_; 
       
   112 				}
       
   113 				
       
   114 			private:
       
   115 				bool i_, next_; 
       
   116 				Token buf_[2]; 
       
   117 				auto_ptr<TokenStream> tokens_; 
       
   118 		};
       
   119 				
       
   120 	
       
   121 	}
       
   122 	
       
   123 	PrefixQueryParser::PrefixQueryParser(const wchar_t* field) 
       
   124 	: field_(field) {}
       
   125 		
       
   126 	PrefixQueryParser::~PrefixQueryParser() {}
       
   127 	
       
   128 	auto_ptr<Query> PrefixQueryParser::parse(const wchar_t* query) {
       
   129 		Cpt::Lex::WhitespaceSplitter split(query);
       
   130 		QueryConstructor ret;
       
   131 		while ( split ) {
       
   132 			ret.add( toQuery( split++ ) ); 
       
   133 		}	
       
   134 		return ret(); 
       
   135 	}
       
   136 	
       
   137 	const wchar_t* PrefixQueryParser::getField() const {
       
   138 		return field_.c_str(); 
       
   139 	}
       
   140 	
       
   141 	void PrefixQueryParser::setDefaultOperator(cpix_QP_Operator op) {
       
   142 		THROW_CPIXEXC("Prefix query parser does not support setting the default operator.");  
       
   143 	}
       
   144 
       
   145 	bool PrefixQueryParser::usePrefixFor(lucene::analysis::Token& token) {
       
   146 		return !analysis::unicode::IsCjk(token.termText()[0]);
       
   147 	}
       
   148 
       
   149 	auto_ptr<Query> 
       
   150 		PrefixQueryParser::toQuery(Cpt::Lex::Token word) {
       
   151 		Analyzer& preAnalyzer( Analysis::getPrefixAnalyzer() ); 
       
   152 		StringReader reader( word.begin(), word.length() );
       
   153 		HasNextTokenStream tokens(
       
   154 			preAnalyzer.tokenStream( field_.c_str(), 
       
   155 									 &reader ) );
       
   156 
       
   157 		QueryConstructor ret; 
       
   158 		
       
   159 		while ( tokens.hasNext() ) {
       
   160 			lucene::analysis::Token& token = tokens.next();
       
   161 			
       
   162 			if ( usePrefixFor(token) ) {
       
   163 				if (!tokens.hasNext()) {
       
   164 					// Turn only last token of this word into prefix query
       
   165 					ret.add(
       
   166 						_CLNEW PrefixQuery( freeref( _CLNEW Term( field_.c_str(), 
       
   167 								                                  token.termText() ) ) ) );  
       
   168 				} else {
       
   169 					// Others tokens can be normal term queries
       
   170 					ret.add( 
       
   171 						_CLNEW TermQuery( freeref( _CLNEW Term( field_.c_str(), 
       
   172 															    token.termText() ) ) ) );  
       
   173 				}
       
   174 			} else {
       
   175 				Analyzer& termAnalyzer = Analysis::getQueryAnalyzer();
       
   176 				StringReader reader( token.termText(), token.termTextLength() );
       
   177 				HasNextTokenStream tokens(
       
   178 					termAnalyzer.tokenStream( field_.c_str(), 
       
   179 											  &reader ) );
       
   180 				
       
   181 				Token& first = tokens.next();
       
   182 				if (tokens.hasNext()) { // more than one
       
   183 					auto_ptr<PhraseQuery> phrase( _CLNEW PhraseQuery() );
       
   184 					phrase->add( freeref( _CLNEW Term( field_.c_str(), 
       
   185 													   first.termText() ) ) ); 
       
   186 					while (tokens.hasNext()) {
       
   187 						phrase->add( freeref( _CLNEW Term( field_.c_str(), 
       
   188 														   tokens.next().termText() ) ) ); 
       
   189 					}
       
   190 					ret.add( std::auto_ptr<Query>( phrase.release() ) ); 
       
   191 				} else {
       
   192 					ret.add( 
       
   193 					        _CLNEW TermQuery( freeref( _CLNEW Term( field_.c_str(), 
       
   194 													                first.termText() ) ) ) );
       
   195 				}
       
   196 			}
       
   197 		}
       
   198 		return ret(); 
       
   199 	}
       
   200 
       
   201 }