diff -r 3e1f76dd2722 -r 2c484ac32ef0 searchengine/oss/cl/clucene/src/clucene/search/hits.cpp --- a/searchengine/oss/cl/clucene/src/clucene/search/hits.cpp Thu Sep 02 21:37:32 2010 +0300 +++ b/searchengine/oss/cl/clucene/src/clucene/search/hits.cpp Fri Sep 17 08:35:54 2010 +0300 @@ -12,7 +12,6 @@ #include "filter.h" #include "clucene/search/searchheader.h" //#ifdef USE_HIGHLIGHTER - #include "CLucene/highlighter/QueryTermExtractor.h" #include "CLucene/highlighter/QueryScorer.h" #include "CLucene/highlighter/Highlighter.h" @@ -20,8 +19,8 @@ #include "CLucene/analysis/standard/StandardAnalyzer.h" #include "clucene/search/prefixquery.h" -// internal libs -#include "cpixparsetools.h" +#include "prefixfilter.h" +#include "koreananalyzer.h" //#endif @@ -56,7 +55,10 @@ Hits::Hits(Searcher* s, Query* q, Filter* f, const Sort* _sort): query(q), searcher(s), filter(f), sort(_sort) //#ifdef USE_HIGHLIGHTER - , hl_frag(20) + , hl_frag(15) +#if defined (__SYMBIAN32__) + ,lang(User::Language()) +#endif //#endif { //Func - Constructor @@ -81,7 +83,153 @@ int32_t Hits::length() const { return _length; } + + void Hits::getHighlightedText(CL_NS(document)::Document* document) + { +/* TODO :: Important consideration for getting locale + * Highlighting is based on the locale, the current implementation is + * only for symbian devices, this dependency should be complete before + * porting to any other OS. so all code is under symbian macro. + * + */ +#if defined (__SYMBIAN32__) + TCHAR* result = NULL; + CL_NS2(search,highlight)::QueryScorer hl_scorer(query); + CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer); + highlighter.setTextFragmenter(&hl_frag); + const TCHAR* fieldtxt = document->get(LCPIX_HL_EXCERPT_FIELD); + + if(fieldtxt) + { + StringReader strreader(fieldtxt); + + switch(lang) + { + case ELangEnglish: + case ELangCanadianEnglish: + case ELangInternationalEnglish: + case ELangSouthAfricanEnglish: + { + CL_NS(analysis)::TokenStream* tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardTokenizer(&strreader); + tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardFilter(tokenstream,true); + tokenstream = _CLNEW CL_NS(analysis)::LowerCaseFilter(tokenstream,true); + result = highlighter.getBestFragments(tokenstream, fieldtxt, 2, L"..."); + break; + } + case ELangFrench: + case ELangSwissFrench: + case ELangBelgianFrench: + case ELangInternationalFrench: + case ELangCanadianFrench: + { + ::analysis::FrenchAnalyzer hl_analyzer; + lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader); + result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"..."); + break; + } + case ELangHebrew: + { + ::analysis::HebrewAnalyzer hl_analyzer; + lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader); + result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"..."); + break; + } + case ELangTaiwanChinese: + case ELangHongKongChinese: + case ELangPrcChinese: + case ELangJapanese: + case ELangKorean: + { + ::analysis::CjkNGramTokenizer hl_analyzer(&strreader,1); + lucene::analysis::TokenStream * ts1 = &hl_analyzer; + result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"..."); + break; + } + case ELangNone: + default: + { + CL_NS(analysis)::TokenStream* tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardTokenizer(&strreader); + tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardFilter(tokenstream,true); + tokenstream = _CLNEW CL_NS(analysis)::LowerCaseFilter(tokenstream,true); + result = highlighter.getBestFragments(tokenstream, fieldtxt, 2, L"..."); + } + } + + if (result != NULL && *((int*)result) != 0x00) + { + document->removeField( LCPIX_HL_EXCERPT_FIELD ); + document->add(*_CLNEW Field(LCPIX_HL_EXCERPT_FIELD, + result, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO)); + result = NULL; + } + } + + const TCHAR* fieldtxt2 = document->get(LCPIX_EXCERPT_FIELD); + + if(fieldtxt2 ) + { + StringReader strreader2(fieldtxt2); + switch(lang) + { + case ELangEnglish: + case ELangCanadianEnglish: + case ELangInternationalEnglish: + case ELangSouthAfricanEnglish: + { + CL_NS2(analysis,standard)::StandardAnalyzer hl_analyzer; + lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2); + result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"..."); + break; + } + case ELangFrench: + case ELangSwissFrench: + case ELangBelgianFrench: + case ELangInternationalFrench: + case ELangCanadianFrench: + { + ::analysis::FrenchAnalyzer hl_analyzer; + lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2); + result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"..."); + break; + } + case ELangHebrew: + { + ::analysis::HebrewAnalyzer hl_analyzer; + lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2); + result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"..."); + break; + } + case ELangTaiwanChinese: + case ELangHongKongChinese: + case ELangPrcChinese: + case ELangJapanese: + case ELangKorean: + { + ::analysis::CjkNGramTokenizer hl_analyzer(&strreader2,1); + lucene::analysis::TokenStream * ts1 = &hl_analyzer; + result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"..."); + break; + } + case ELangNone: + default: + { + CL_NS2(analysis,standard)::StandardAnalyzer hl_analyzer; + lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2); + result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"..."); + } + } + if (result != NULL && *((int*)result) != 0x00) + { + document->removeField( LCPIX_EXCERPT_FIELD ); + document->add(*_CLNEW Field(LCPIX_EXCERPT_FIELD, + result, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO)); + } + } +#endif + + } + Document& Hits::doc(const int32_t n){ HitDoc* hitDoc = getHitDoc(n); @@ -100,72 +248,10 @@ hitDoc->doc = _CLNEW Document; searcher->doc(hitDoc->id, hitDoc->doc); // cache miss: read document //#ifdef USE_HIGHLIGHTER - CL_NS(document)::Document* document = hitDoc->doc; - - TCHAR* result = NULL; - Query* rwquery[2]; - searcher->getrewritten(hitDoc->id, query, rwquery); - - const TCHAR* firstlnHLtxt = document->get(LCPIX_HL_EXCERPT_FIELD); - - if(firstlnHLtxt && rwquery[1]) - { - CL_NS2(search,highlight)::QueryScorer hl_scorer(rwquery[1]); - - CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer); - - highlighter.setTextFragmenter(&hl_frag); - - wstring hlText; - - StringReader strreader(firstlnHLtxt); - - lucene::analysis::TokenStream * tokenStream = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader); - - result = highlighter.getBestFragments(tokenStream, firstlnHLtxt, 2,L"..."); - - if (result != NULL && *((int*)result) != 0x00) - { - hlText.append(result); - - document->removeField( LCPIX_HL_EXCERPT_FIELD ); - - document->add(*_CLNEW Field(LCPIX_HL_EXCERPT_FIELD, - hlText.c_str(), lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO)); - } - - } - - const TCHAR* text = document->get(LCPIX_EXCERPT_FIELD); - - if(text && rwquery[1]) - { - CL_NS2(search,highlight)::QueryScorer hl_scorer(rwquery[1]); - - CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer); - - highlighter.setTextFragmenter(&hl_frag); - - wstring hlText; - - StringReader strreader(text); - - lucene::analysis::TokenStream * tokenStream = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader); - - result = highlighter.getBestFragments(tokenStream, text, 2,L"..."); - - if (result != NULL && *((int*)result) != 0x00) - { - hlText.append(result); - - document->removeField( LCPIX_EXCERPT_FIELD ); - - document->add(*_CLNEW Field(LCPIX_EXCERPT_FIELD, - hlText.c_str(), lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO)); - } - } -//#endif + getHighlightedText(document); +//#endif + } return *hitDoc->doc;