searchengine/cpix/cpix/src/prefixqueryparser.cpp
author hgs
Wed, 25 Aug 2010 13:17:41 +0530
changeset 16 2729d20a0010
parent 14 8bd192d47aaa
permissions -rw-r--r--
201033
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
8
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
hgs
parents:
diff changeset
    18
hgs
parents:
diff changeset
    19
#include "CLucene.h"
hgs
parents:
diff changeset
    20
hgs
parents:
diff changeset
    21
#include "cpixmaindefs.h"
hgs
parents:
diff changeset
    22
hgs
parents:
diff changeset
    23
// internal libs
hgs
parents:
diff changeset
    24
#include "cpixparsetools.h"
hgs
parents:
diff changeset
    25
hgs
parents:
diff changeset
    26
// internal
hgs
parents:
diff changeset
    27
#include "analyzer.h"
hgs
parents:
diff changeset
    28
hgs
parents:
diff changeset
    29
#include "prefixqueryparser.h"
hgs
parents:
diff changeset
    30
hgs
parents:
diff changeset
    31
#include "cpixanalyzer.h"
hgs
parents:
diff changeset
    32
#include "cluceneext.h"
hgs
parents:
diff changeset
    33
hgs
parents:
diff changeset
    34
#include "tinyunicode.h"
hgs
parents:
diff changeset
    35
hgs
parents:
diff changeset
    36
#include "cpixexc.h"
hgs
parents:
diff changeset
    37
hgs
parents:
diff changeset
    38
namespace Cpix {
hgs
parents:
diff changeset
    39
	
hgs
parents:
diff changeset
    40
	using namespace lucene::analysis; 
hgs
parents:
diff changeset
    41
	using namespace lucene::search; 
hgs
parents:
diff changeset
    42
	using namespace lucene::document; 
hgs
parents:
diff changeset
    43
	using namespace lucene::util; 
hgs
parents:
diff changeset
    44
	using lucene::index::Term; 
hgs
parents:
diff changeset
    45
	using namespace std; 
hgs
parents:
diff changeset
    46
hgs
parents:
diff changeset
    47
	namespace {
hgs
parents:
diff changeset
    48
	
hgs
parents:
diff changeset
    49
		/**
hgs
parents:
diff changeset
    50
		 * Small optimization to avoid creating extra boolean queries
hgs
parents:
diff changeset
    51
		 */
hgs
parents:
diff changeset
    52
		class QueryConstructor {
hgs
parents:
diff changeset
    53
			
hgs
parents:
diff changeset
    54
		public: 
hgs
parents:
diff changeset
    55
			QueryConstructor() : q_(), bq_(0) {}
hgs
parents:
diff changeset
    56
			
hgs
parents:
diff changeset
    57
			auto_ptr<Query> operator()() {
hgs
parents:
diff changeset
    58
				return q_; 
hgs
parents:
diff changeset
    59
			}
hgs
parents:
diff changeset
    60
			void add(auto_ptr<Query> q) {
hgs
parents:
diff changeset
    61
				if ( q.get() ) {
hgs
parents:
diff changeset
    62
					if ( bq_ ) {
16
hgs
parents: 14
diff changeset
    63
						bq_->add( q.release(), true, false, false ); 
8
hgs
parents:
diff changeset
    64
					} else {
hgs
parents:
diff changeset
    65
						if ( q_.get() ) {
hgs
parents:
diff changeset
    66
							auto_ptr<BooleanQuery> bq( new BooleanQuery() );
hgs
parents:
diff changeset
    67
							bq_ = bq.get();
14
hgs
parents: 8
diff changeset
    68
							bq_->add( q_.release(), true, false, false ); 
hgs
parents: 8
diff changeset
    69
							bq_->add( q.release(), true, false, false ); 
8
hgs
parents:
diff changeset
    70
							q_.reset( bq.release() ); 
hgs
parents:
diff changeset
    71
						} else {
hgs
parents:
diff changeset
    72
							q_ = q;  
hgs
parents:
diff changeset
    73
						}
hgs
parents:
diff changeset
    74
					}
hgs
parents:
diff changeset
    75
				}
hgs
parents:
diff changeset
    76
			}
hgs
parents:
diff changeset
    77
			inline void add(Query* q) {
hgs
parents:
diff changeset
    78
				add( auto_ptr<Query>( q ) );
hgs
parents:
diff changeset
    79
			}
hgs
parents:
diff changeset
    80
	
hgs
parents:
diff changeset
    81
		private: 
hgs
parents:
diff changeset
    82
			
hgs
parents:
diff changeset
    83
			auto_ptr<Query> q_; 
hgs
parents:
diff changeset
    84
			BooleanQuery* bq_; 
hgs
parents:
diff changeset
    85
			
hgs
parents:
diff changeset
    86
		};
hgs
parents:
diff changeset
    87
		
hgs
parents:
diff changeset
    88
		/**
hgs
parents:
diff changeset
    89
		 * TokenStream interface with one modification: 
hgs
parents:
diff changeset
    90
		 *   * Ability to check if returned token was last one in the stream 
hgs
parents:
diff changeset
    91
		 */
hgs
parents:
diff changeset
    92
		class HasNextTokenStream {
hgs
parents:
diff changeset
    93
			
hgs
parents:
diff changeset
    94
			public:
hgs
parents:
diff changeset
    95
			
hgs
parents:
diff changeset
    96
				HasNextTokenStream(TokenStream* tokens)
hgs
parents:
diff changeset
    97
				:   i_(true), 
hgs
parents:
diff changeset
    98
					next_(),
hgs
parents:
diff changeset
    99
					buf_(),
hgs
parents:
diff changeset
   100
					tokens_( tokens ){
hgs
parents:
diff changeset
   101
					next_ = tokens_->next(&buf_[0]);
hgs
parents:
diff changeset
   102
				}
hgs
parents:
diff changeset
   103
		
hgs
parents:
diff changeset
   104
				inline Token& next() {
hgs
parents:
diff changeset
   105
					next_ = tokens_->next(&buf_[i_]); 
hgs
parents:
diff changeset
   106
					i_ = !i_;
hgs
parents:
diff changeset
   107
					return buf_[i_]; 
hgs
parents:
diff changeset
   108
				}
hgs
parents:
diff changeset
   109
			
hgs
parents:
diff changeset
   110
				inline bool hasNext() {
hgs
parents:
diff changeset
   111
					return next_; 
hgs
parents:
diff changeset
   112
				}
hgs
parents:
diff changeset
   113
				
hgs
parents:
diff changeset
   114
			private:
hgs
parents:
diff changeset
   115
				bool i_, next_; 
hgs
parents:
diff changeset
   116
				Token buf_[2]; 
hgs
parents:
diff changeset
   117
				auto_ptr<TokenStream> tokens_; 
hgs
parents:
diff changeset
   118
		};
hgs
parents:
diff changeset
   119
				
hgs
parents:
diff changeset
   120
	
hgs
parents:
diff changeset
   121
	}
hgs
parents:
diff changeset
   122
	
hgs
parents:
diff changeset
   123
	/**
	 * Creates a parser whose queries all target the given field. The field 
	 * name is stored in field_. 
	 */
	PrefixQueryParser::PrefixQueryParser(const wchar_t* field) 
		: field_( field ) 
	{
	}
hgs
parents:
diff changeset
   125
		
hgs
parents:
diff changeset
   126
	/**
	 * Nothing to release explicitly. 
	 */
	PrefixQueryParser::~PrefixQueryParser() 
	{
	}
hgs
parents:
diff changeset
   127
	
hgs
parents:
diff changeset
   128
	/**
	 * Splits the query string at whitespace, turns each word into a query 
	 * with toQuery() and combines the results through QueryConstructor. 
	 * The returned pointer is null, if no word produced a query. 
	 */
	auto_ptr<Query> PrefixQueryParser::parse(const wchar_t* query) {
		Cpt::Lex::WhitespaceSplitter words( query );
		QueryConstructor combined;
		for ( ; words; ) {
			Cpt::Lex::Token word = words++; 
			combined.add( toQuery( word ) ); 
		}
		return combined(); 
	}
hgs
parents:
diff changeset
   136
	
hgs
parents:
diff changeset
   137
	const wchar_t* PrefixQueryParser::getField() const {
hgs
parents:
diff changeset
   138
		return field_.c_str(); 
hgs
parents:
diff changeset
   139
	}
hgs
parents:
diff changeset
   140
	
hgs
parents:
diff changeset
   141
	/**
	 * Not supported by this parser: the call always ends in THROW_CPIXEXC
	 * and the requested operator is ignored. 
	 */
	void PrefixQueryParser::setDefaultOperator(cpix_QP_Operator op) {
		(void) op; // intentionally unused
		THROW_CPIXEXC("Prefix query parser does not support setting the default operator.");  
	}
hgs
parents:
diff changeset
   144
hgs
parents:
diff changeset
   145
	/**
	 * Decides how a token is turned into a query: true unless the first 
	 * character of the token text is classified as CJK by 
	 * analysis::unicode::IsCjk. 
	 */
	bool PrefixQueryParser::usePrefixFor(lucene::analysis::Token& token) {
		const bool startsWithCjk = analysis::unicode::IsCjk( token.termText()[0] );
		return !startsWithCjk;
	}
hgs
parents:
diff changeset
   148
hgs
parents:
diff changeset
   149
	/**
	 * Turns one whitespace separated word of the user query into a query. 
	 * 
	 * The word is tokenized with the prefix analyzer. For every resulting 
	 * token:
	 *   * if usePrefixFor() accepts it, the token becomes a PrefixQuery 
	 *     when it is the last token of the word, a TermQuery otherwise
	 *   * otherwise (token starts with a CJK character) the token text is 
	 *     tokenized again with the query analyzer; several resulting tokens
	 *     form a PhraseQuery, a single one forms a TermQuery, none at all 
	 *     contributes nothing
	 * 
	 * @param word the word to convert
	 * @return the combined query, or a null pointer if the word produced 
	 *         no tokens
	 */
	auto_ptr<Query> 
		PrefixQueryParser::toQuery(Cpt::Lex::Token word) {
		Analyzer& preAnalyzer( Analysis::getPrefixAnalyzer() ); 
		StringReader reader( word.begin(), word.length() );
		HasNextTokenStream tokens(
			preAnalyzer.tokenStream( field_.c_str(), 
									 &reader ) );

		QueryConstructor ret; 
		
		while ( tokens.hasNext() ) {
			lucene::analysis::Token& token = tokens.next();
			
			if ( usePrefixFor(token) ) {
				if (!tokens.hasNext()) {
					// Turn only last token of this word into prefix query
					ret.add(
						_CLNEW PrefixQuery( freeref( _CLNEW Term( field_.c_str(), 
								                                  token.termText() ) ) ) );  
				} else {
					// Other tokens can be normal term queries
					ret.add( 
						_CLNEW TermQuery( freeref( _CLNEW Term( field_.c_str(), 
															    token.termText() ) ) ) );  
				}
			} else {
				// Separate names for the nested reader / stream: the outer
				// 'reader' and 'tokens' are still in use by this loop
				Analyzer& termAnalyzer = Analysis::getQueryAnalyzer();
				StringReader termReader( token.termText(), token.termTextLength() );
				HasNextTokenStream termTokens(
					termAnalyzer.tokenStream( field_.c_str(), 
											  &termReader ) );
				
				if (!termTokens.hasNext()) {
					// The query analyzer produced no token at all. Calling 
					// next() now would hand back a never filled Token and 
					// lead to a query for a bogus term, so skip it.
					continue; 
				}
				
				Token& first = termTokens.next();
				if (termTokens.hasNext()) { // more than one
					auto_ptr<PhraseQuery> phrase( _CLNEW PhraseQuery() );
					// 'first' has to be consumed before the loop: its buffer
					// is reused by the second call to next()
					phrase->add( freeref( _CLNEW Term( field_.c_str(), 
													   first.termText() ) ) ); 
					while (termTokens.hasNext()) {
						phrase->add( freeref( _CLNEW Term( field_.c_str(), 
														   termTokens.next().termText() ) ) ); 
					}
					ret.add( std::auto_ptr<Query>( phrase.release() ) ); 
				} else {
					ret.add( 
					        _CLNEW TermQuery( freeref( _CLNEW Term( field_.c_str(), 
													                first.termText() ) ) ) );
				}
			}
		}
		return ret(); 
	}
hgs
parents:
diff changeset
   200
hgs
parents:
diff changeset
   201
}