searchengine/oss/cl/clucene/src/clucene/search/hits.cpp
changeset 21 2c484ac32ef0
parent 18 3e1f76dd2722
equal deleted inserted replaced
18:3e1f76dd2722 21:2c484ac32ef0
    10 #include "clucene/document/document.h"
    10 #include "clucene/document/document.h"
    11 #include "clucene/index/indexreader.h"
    11 #include "clucene/index/indexreader.h"
    12 #include "filter.h"
    12 #include "filter.h"
    13 #include "clucene/search/searchheader.h"
    13 #include "clucene/search/searchheader.h"
    14 //#ifdef USE_HIGHLIGHTER 
    14 //#ifdef USE_HIGHLIGHTER 
    15 
       
    16 #include "CLucene/highlighter/QueryTermExtractor.h"
    15 #include "CLucene/highlighter/QueryTermExtractor.h"
    17 #include "CLucene/highlighter/QueryScorer.h"
    16 #include "CLucene/highlighter/QueryScorer.h"
    18 #include "CLucene/highlighter/Highlighter.h"
    17 #include "CLucene/highlighter/Highlighter.h"
    19 #include "CLucene/highlighter/SimpleHTMLFormatter.h"
    18 #include "CLucene/highlighter/SimpleHTMLFormatter.h"
    20 #include "CLucene/analysis/standard/StandardAnalyzer.h"
    19 #include "CLucene/analysis/standard/StandardAnalyzer.h"
    21 #include "clucene/search/prefixquery.h"
    20 #include "clucene/search/prefixquery.h"
    22 
    21 
    23 // internal libs
    22 #include "prefixfilter.h"
    24 #include "cpixparsetools.h"
    23 #include "koreananalyzer.h"
    25 
    24 
    26 //#endif
    25 //#endif
    27 
    26 
    28 CL_NS_USE(document)
    27 CL_NS_USE(document)
    29 CL_NS_USE(util)
    28 CL_NS_USE(util)
    54 
    53 
    55 
    54 
    56 	Hits::Hits(Searcher* s, Query* q, Filter* f, const Sort* _sort):
    55 	Hits::Hits(Searcher* s, Query* q, Filter* f, const Sort* _sort):
    57 		query(q), searcher(s), filter(f), sort(_sort)
    56 		query(q), searcher(s), filter(f), sort(_sort)
    58 //#ifdef USE_HIGHLIGHTER
    57 //#ifdef USE_HIGHLIGHTER
    59 		, hl_frag(20)
    58 		, hl_frag(15)		
       
    59 #if defined (__SYMBIAN32__)		
       
    60     ,lang(User::Language())
       
    61 #endif    
    60 //#endif		
    62 //#endif		
    61 	{
    63 	{
    62 	//Func - Constructor
    64 	//Func - Constructor
    63 	//Pre  - s contains a valid reference to a searcher s
    65 	//Pre  - s contains a valid reference to a searcher s
    64 	//       q contains a valid reference to a Query
    66 	//       q contains a valid reference to a Query
    79 
    81 
    80 	}
    82 	}
    81 	int32_t Hits::length() const {
    83 	int32_t Hits::length() const {
    82 		return _length;
    84 		return _length;
    83 	}
    85 	}
    84 
    86 	
       
    87  void Hits::getHighlightedText(CL_NS(document)::Document* document)
       
    88         {
       
    89 /* TODO :: Important consideration for getting locale
       
    90  * Highlighting is based on the locale, the current implementation is 
       
    91  * only for symbian devices, this dependency should be complete before 
       
    92  * porting to any other OS. so all code is under symbian macro.
       
    93  * 
       
    94  */
       
    95 #if defined (__SYMBIAN32__)
       
    96         TCHAR* result = NULL;
       
    97         CL_NS2(search,highlight)::QueryScorer hl_scorer(query);
       
    98         CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer);
       
    99         highlighter.setTextFragmenter(&hl_frag);
       
   100 
       
   101         const TCHAR* fieldtxt = document->get(LCPIX_HL_EXCERPT_FIELD);
       
   102 
       
   103         if(fieldtxt)
       
   104             {
       
   105             StringReader strreader(fieldtxt);
       
   106 
       
   107             switch(lang)
       
   108                 {
       
   109                 case ELangEnglish:
       
   110                 case ELangCanadianEnglish:
       
   111                 case ELangInternationalEnglish:
       
   112                 case ELangSouthAfricanEnglish:
       
   113                     {
       
   114                     CL_NS(analysis)::TokenStream* tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardTokenizer(&strreader);
       
   115                     tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardFilter(tokenstream,true);
       
   116                     tokenstream = _CLNEW CL_NS(analysis)::LowerCaseFilter(tokenstream,true);
       
   117                     result = highlighter.getBestFragments(tokenstream, fieldtxt, 2, L"...");
       
   118                     break;
       
   119                     }
       
   120                 case ELangFrench:
       
   121                 case ELangSwissFrench:
       
   122                 case ELangBelgianFrench:
       
   123                 case ELangInternationalFrench:
       
   124                 case ELangCanadianFrench:
       
   125                     {
       
   126                     ::analysis::FrenchAnalyzer hl_analyzer;
       
   127                     lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader);
       
   128                     result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"...");
       
   129                     break;
       
   130                     }
       
   131                 case ELangHebrew:
       
   132                     {
       
   133                     ::analysis::HebrewAnalyzer hl_analyzer;
       
   134                     lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader);
       
   135                     result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"...");
       
   136                     break;
       
   137                     }
       
   138                 case ELangTaiwanChinese:
       
   139                 case ELangHongKongChinese:
       
   140                 case ELangPrcChinese:
       
   141                 case ELangJapanese:
       
   142                 case ELangKorean:
       
   143                     {
       
   144                     ::analysis::CjkNGramTokenizer hl_analyzer(&strreader,1);
       
   145                     lucene::analysis::TokenStream * ts1 = &hl_analyzer;
       
   146                     result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"...");
       
   147                     break;
       
   148                     }
       
   149                 case ELangNone:
       
   150                 default:
       
   151                     {
       
   152                     CL_NS(analysis)::TokenStream* tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardTokenizer(&strreader);
       
   153                     tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardFilter(tokenstream,true);
       
   154                     tokenstream = _CLNEW CL_NS(analysis)::LowerCaseFilter(tokenstream,true);
       
   155                     result = highlighter.getBestFragments(tokenstream, fieldtxt, 2, L"...");
       
   156                     }
       
   157                 }
       
   158 
       
   159             if (result != NULL && *((int*)result) != 0x00)
       
   160                 {
       
   161                 document->removeField( LCPIX_HL_EXCERPT_FIELD );
       
   162                 document->add(*_CLNEW Field(LCPIX_HL_EXCERPT_FIELD,
       
   163                                 result, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO));
       
   164                 result = NULL;
       
   165                 }
       
   166             }
       
   167 
       
   168         const TCHAR* fieldtxt2 = document->get(LCPIX_EXCERPT_FIELD);
       
   169 
       
   170         if(fieldtxt2 )
       
   171             {
       
   172             StringReader strreader2(fieldtxt2);
       
   173             switch(lang)
       
   174                 {
       
   175                 case ELangEnglish:
       
   176                 case ELangCanadianEnglish:
       
   177                 case ELangInternationalEnglish:
       
   178                 case ELangSouthAfricanEnglish:
       
   179                     {
       
   180                     CL_NS2(analysis,standard)::StandardAnalyzer hl_analyzer;
       
   181                     lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
       
   182                     result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
       
   183                     break;
       
   184                     }
       
   185                 case ELangFrench:
       
   186                 case ELangSwissFrench:
       
   187                 case ELangBelgianFrench:
       
   188                 case ELangInternationalFrench:
       
   189                 case ELangCanadianFrench:
       
   190                     {
       
   191                     ::analysis::FrenchAnalyzer hl_analyzer;
       
   192                     lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
       
   193                     result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
       
   194                     break;
       
   195                     }
       
   196                 case ELangHebrew:
       
   197                     {
       
   198                     ::analysis::HebrewAnalyzer hl_analyzer;
       
   199                     lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
       
   200                     result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
       
   201                     break;
       
   202                     }
       
   203                 case ELangTaiwanChinese:
       
   204                 case ELangHongKongChinese:
       
   205                 case ELangPrcChinese:
       
   206                 case ELangJapanese:
       
   207                 case ELangKorean:
       
   208                     {
       
   209                     ::analysis::CjkNGramTokenizer hl_analyzer(&strreader2,1);
       
   210                     lucene::analysis::TokenStream * ts1 = &hl_analyzer;
       
   211                     result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
       
   212                     break;
       
   213                     }
       
   214                 case ELangNone:
       
   215                 default:
       
   216                     {
       
   217                     CL_NS2(analysis,standard)::StandardAnalyzer hl_analyzer;
       
   218                     lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
       
   219                     result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
       
   220                     }
       
   221                 }
       
   222             if (result != NULL && *((int*)result) != 0x00)
       
   223                 {
       
   224                 document->removeField( LCPIX_EXCERPT_FIELD );
       
   225                 document->add(*_CLNEW Field(LCPIX_EXCERPT_FIELD,
       
   226                                 result, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO));
       
   227                 }
       
   228             }
       
   229 #endif
       
   230 
       
   231         }
       
   232 	
    85 	Document& Hits::doc(const int32_t n){
   233 	Document& Hits::doc(const int32_t n){
    86 		HitDoc* hitDoc = getHitDoc(n);
   234 		HitDoc* hitDoc = getHitDoc(n);
    87 
   235 
    88 		// Update LRU cache of documents
   236 		// Update LRU cache of documents
    89 		remove(hitDoc);				  // remove from list, if there
   237 		remove(hitDoc);				  // remove from list, if there
    98 
   246 
    99 		if (hitDoc->doc == NULL){
   247 		if (hitDoc->doc == NULL){
   100 			hitDoc->doc = _CLNEW Document;
   248 			hitDoc->doc = _CLNEW Document;
   101 			searcher->doc(hitDoc->id, hitDoc->doc);	  // cache miss: read document
   249 			searcher->doc(hitDoc->id, hitDoc->doc);	  // cache miss: read document
   102 //#ifdef USE_HIGHLIGHTER
   250 //#ifdef USE_HIGHLIGHTER
   103 
       
   104             CL_NS(document)::Document* document = hitDoc->doc;
   251             CL_NS(document)::Document* document = hitDoc->doc;
   105             
   252             getHighlightedText(document);
   106 	            TCHAR* result = NULL;
   253 //#endif
   107 	            Query* rwquery[2];
   254          
   108 	            searcher->getrewritten(hitDoc->id, query, rwquery);
       
   109 	            
       
   110 	            const TCHAR* firstlnHLtxt = document->get(LCPIX_HL_EXCERPT_FIELD);
       
   111 	            
       
   112 	            if(firstlnHLtxt && rwquery[1])
       
   113 	              {
       
   114 	                CL_NS2(search,highlight)::QueryScorer hl_scorer(rwquery[1]);
       
   115 	
       
   116 	                CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer);
       
   117 	
       
   118 	                highlighter.setTextFragmenter(&hl_frag);
       
   119 	
       
   120 	                wstring hlText;
       
   121 	                
       
   122 	                StringReader strreader(firstlnHLtxt);
       
   123 	
       
   124 	                lucene::analysis::TokenStream * tokenStream = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader);
       
   125 	
       
   126 	                result = highlighter.getBestFragments(tokenStream, firstlnHLtxt, 2,L"...");
       
   127 	                
       
   128 	                if (result != NULL && *((int*)result) != 0x00)
       
   129 	                    {
       
   130 	                    hlText.append(result);
       
   131 	                    
       
   132 	                    document->removeField( LCPIX_HL_EXCERPT_FIELD );
       
   133 	
       
   134 	                    document->add(*_CLNEW Field(LCPIX_HL_EXCERPT_FIELD,
       
   135 	                                    hlText.c_str(), lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO));
       
   136 	                    }
       
   137 	                
       
   138 	              }
       
   139 	            
       
   140 	            const TCHAR* text = document->get(LCPIX_EXCERPT_FIELD);
       
   141 	
       
   142 	            if(text && rwquery[1])
       
   143 	              { 
       
   144 	                CL_NS2(search,highlight)::QueryScorer hl_scorer(rwquery[1]);
       
   145 	
       
   146 	                CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer);
       
   147 	
       
   148 	                highlighter.setTextFragmenter(&hl_frag);
       
   149 	
       
   150 	                wstring hlText;
       
   151 	                
       
   152 	                StringReader strreader(text);
       
   153 	
       
   154 	                lucene::analysis::TokenStream * tokenStream = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader);
       
   155 	
       
   156 	                result = highlighter.getBestFragments(tokenStream, text, 2,L"...");
       
   157 	               
       
   158 	                if (result != NULL && *((int*)result) != 0x00)
       
   159 	                    {
       
   160 	                    hlText.append(result);
       
   161 	                    
       
   162 	                    document->removeField( LCPIX_EXCERPT_FIELD );
       
   163 	
       
   164 	                    document->add(*_CLNEW Field(LCPIX_EXCERPT_FIELD,
       
   165 	                                    hlText.c_str(), lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO));
       
   166 	                    }
       
   167 	                } 
       
   168 //#endif            
       
   169 		}
   255 		}
   170 
   256 
   171 		return *hitDoc->doc;
   257 		return *hitDoc->doc;
   172 	}
   258 	}
   173 
   259