searchengine/oss/cl/clucene/src/clucene/search/hits.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Fri, 17 Sep 2010 08:35:54 +0300
changeset 21 2c484ac32ef0
parent 18 3e1f76dd2722
permissions -rw-r--r--
Revision: 201035 Kit: 201037

/*------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#include "clucene/stdheader.h"

#include "searchheader.h"
#include "clucene/document/document.h"
#include "clucene/index/indexreader.h"
#include "filter.h"
#include "clucene/search/searchheader.h"
//#ifdef USE_HIGHLIGHTER 
#include "CLucene/highlighter/QueryTermExtractor.h"
#include "CLucene/highlighter/QueryScorer.h"
#include "CLucene/highlighter/Highlighter.h"
#include "CLucene/highlighter/SimpleHTMLFormatter.h"
#include "CLucene/analysis/standard/StandardAnalyzer.h"
#include "clucene/search/prefixquery.h"

#include "prefixfilter.h"
#include "koreananalyzer.h"

//#endif

CL_NS_USE(document)
CL_NS_USE(util)
CL_NS_USE(index)

CL_NS_DEF(search)

	HitDoc::HitDoc(const float_t s, const int32_t i)
	{
	//Func - Constructor
	//Pre  - true
	//Post - score holds s and id holds i; the entry has no neighbours
	//       (next and prev are NULL) and no document attached (doc is NULL)

		// Values supplied by the caller
		id    = i;
		score = s;

		// Nothing loaded or linked yet
		doc   = NULL;
		prev  = NULL;
		next  = NULL;
	}

	HitDoc::~HitDoc()
	{
	//Func - Destructor
	//Pre  - true
	//Post - The document attached to this entry (if any) has been released
	//       through _CLDELETE and the instance has been destroyed

		_CLDELETE(doc);
	}


	Hits::Hits(Searcher* s, Query* q, Filter* f, const Sort* _sort):
		query(q), searcher(s), filter(f), sort(_sort)
//#ifdef USE_HIGHLIGHTER
		, hl_frag(15)		
#if defined (__SYMBIAN32__)		
    ,lang(User::Language())
#endif    
//#endif		
	{
	//Func - Constructor; stores the search parameters and runs the first search
	//Pre  - s contains a valid reference to a searcher
	//       q contains a valid reference to a Query
	//       f is NULL or contains a pointer to a filter
	//       _sort is NULL or contains a pointer to a Sort
	//Post - The instance has been created, the document cache list is empty
	//       and the first batch of hits has been requested

		// Empty document cache list, allowed to hold up to 200 entries
		maxDocs = 200;
		numDocs = 0;
		last    = NULL;
		first   = NULL;

		_length = 0;

		// getMoreDocs doubles the count it is given, so passing 50 asks the
		// searcher for 100 hits initially
		getMoreDocs(50);
	}

	Hits::~Hits()
	{
	//Func - Destructor
	//Pre  - true
	//Post - The instance has been destroyed; no explicit cleanup is done here
	}
	int32_t Hits::length() const
	{
	//Func - Returns the hit count recorded by the most recent getMoreDocs call
	//Pre  - true
	//Post - The value of _length has been returned

		return _length;
	}
	
 void Hits::getHighlightedText(CL_NS(document)::Document* document)
        {
        //Func - Replaces the excerpt fields of a hit document with highlighted
        //       fragments computed for the current query
        //Pre  - document is a valid pointer (it is dereferenced without a check
        //       when __SYMBIAN32__ is defined)
        //Post - For each of LCPIX_HL_EXCERPT_FIELD and LCPIX_EXCERPT_FIELD that
        //       is present in document and for which the highlighter returned a
        //       non-empty string, the field has been removed and re-added
        //       (STORE_YES | INDEX_NO) with that string as its value.
        //       When __SYMBIAN32__ is not defined nothing is done.

/* TODO :: Important consideration for getting the locale
 * Highlighting is based on the locale. The current implementation exists
 * only for Symbian devices; this dependency should be resolved before
 * porting to any other OS, so all code is under the Symbian macro.
 */
#if defined (__SYMBIAN32__)
        TCHAR* result = NULL;
        CL_NS2(search,highlight)::QueryScorer hl_scorer(query);
        CL_NS2(search,highlight)::Highlighter highlighter(&hl_formatter, &hl_scorer);
        highlighter.setTextFragmenter(&hl_frag);

        // NOTE(review): the buffers returned by getBestFragments and the token
        // streams created below are never released in this function. Whether
        // that leaks depends on the ownership rules of Highlighter, Field and
        // the analyzers, which are not visible here - confirm.

        // ---- First pass: LCPIX_HL_EXCERPT_FIELD -------------------------------
        const TCHAR* fieldtxt = document->get(LCPIX_HL_EXCERPT_FIELD);

        if(fieldtxt)
            {
            StringReader strreader(fieldtxt);

            // Pick the token stream from the device language captured in lang.
            switch(lang)
                {
                case ELangEnglish:
                case ELangCanadianEnglish:
                case ELangInternationalEnglish:
                case ELangSouthAfricanEnglish:
                    {
                    // Hand-built chain: tokenizer -> standard filter -> lower-case filter
                    CL_NS(analysis)::TokenStream* tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardTokenizer(&strreader);
                    tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardFilter(tokenstream,true);
                    tokenstream = _CLNEW CL_NS(analysis)::LowerCaseFilter(tokenstream,true);
                    result = highlighter.getBestFragments(tokenstream, fieldtxt, 2, L"...");
                    break;
                    }
                case ELangFrench:
                case ELangSwissFrench:
                case ELangBelgianFrench:
                case ELangInternationalFrench:
                case ELangCanadianFrench:
                    {
                    ::analysis::FrenchAnalyzer hl_analyzer;
                    lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader);
                    result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"...");
                    break;
                    }
                case ELangHebrew:
                    {
                    ::analysis::HebrewAnalyzer hl_analyzer;
                    lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_HL_EXCERPT_FIELD, &strreader);
                    result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"...");
                    break;
                    }
                case ELangTaiwanChinese:
                case ELangHongKongChinese:
                case ELangPrcChinese:
                case ELangJapanese:
                case ELangKorean:
                    {
                    // The tokenizer lives on the stack and is used directly as the stream.
                    ::analysis::CjkNGramTokenizer hl_analyzer(&strreader,1);
                    lucene::analysis::TokenStream * ts1 = &hl_analyzer;
                    result = highlighter.getBestFragments(ts1, fieldtxt, 2, L"...");
                    break;
                    }
                case ELangNone:
                default:
                    {
                    // Same chain as the English case.
                    CL_NS(analysis)::TokenStream* tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardTokenizer(&strreader);
                    tokenstream = _CLNEW CL_NS2(analysis,standard)::StandardFilter(tokenstream,true);
                    tokenstream = _CLNEW CL_NS(analysis)::LowerCaseFilter(tokenstream,true);
                    result = highlighter.getBestFragments(tokenstream, fieldtxt, 2, L"...");
                    }
                }

            // Only replace the field when the highlighter produced some text.
            // Test the first TCHAR itself: reading the buffer through an int*
            // would look at sizeof(int) bytes and can run past the terminator
            // of an empty string when TCHAR is narrower than int.
            if (result != NULL && result[0] != 0)
                {
                document->removeField( LCPIX_HL_EXCERPT_FIELD );
                document->add(*_CLNEW Field(LCPIX_HL_EXCERPT_FIELD,
                                result, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO));
                result = NULL;
                }
            }

        // ---- Second pass: LCPIX_EXCERPT_FIELD ---------------------------------
        const TCHAR* fieldtxt2 = document->get(LCPIX_EXCERPT_FIELD);

        if(fieldtxt2 )
            {
            StringReader strreader2(fieldtxt2);
            switch(lang)
                {
                case ELangEnglish:
                case ELangCanadianEnglish:
                case ELangInternationalEnglish:
                case ELangSouthAfricanEnglish:
                    {
                    // Unlike the first pass, this one goes through StandardAnalyzer.
                    CL_NS2(analysis,standard)::StandardAnalyzer hl_analyzer;
                    lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
                    result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
                    break;
                    }
                case ELangFrench:
                case ELangSwissFrench:
                case ELangBelgianFrench:
                case ELangInternationalFrench:
                case ELangCanadianFrench:
                    {
                    ::analysis::FrenchAnalyzer hl_analyzer;
                    lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
                    result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
                    break;
                    }
                case ELangHebrew:
                    {
                    ::analysis::HebrewAnalyzer hl_analyzer;
                    lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
                    result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
                    break;
                    }
                case ELangTaiwanChinese:
                case ELangHongKongChinese:
                case ELangPrcChinese:
                case ELangJapanese:
                case ELangKorean:
                    {
                    ::analysis::CjkNGramTokenizer hl_analyzer(&strreader2,1);
                    lucene::analysis::TokenStream * ts1 = &hl_analyzer;
                    result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
                    break;
                    }
                case ELangNone:
                default:
                    {
                    CL_NS2(analysis,standard)::StandardAnalyzer hl_analyzer;
                    lucene::analysis::TokenStream * ts1 = hl_analyzer.tokenStream(LCPIX_EXCERPT_FIELD, &strreader2);
                    result = highlighter.getBestFragments(ts1, fieldtxt2, 2, L"...");
                    }
                }

            // Same non-empty test as in the first pass.
            if (result != NULL && result[0] != 0)
                {
                document->removeField( LCPIX_EXCERPT_FIELD );
                document->add(*_CLNEW Field(LCPIX_EXCERPT_FIELD,
                                result, lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_NO));
                }
            }
#endif

        }
	
	Document& Hits::doc(const int32_t n){
	//Func - Returns the document of the n-th hit, loading it on a cache miss
	//Pre  - n is a valid hit number (getHitDoc throws otherwise)
	//Post - The hit is at the front of the cache list and a reference to its
	//       document has been returned

		HitDoc* const entry = getHitDoc(n);

		// Move the entry to the front of the list; remove() leaves the list
		// untouched when the entry has no document yet.
		remove(entry);
		addToFront(entry);

		// Over capacity: unlink the entry at the tail and drop its document.
		if (maxDocs < numDocs) {
			HitDoc* const evicted = last;
			remove(evicted);

			_CLDELETE( evicted->doc );
			evicted->doc = NULL;
		}

		// Cache hit: the document is already loaded.
		if (entry->doc != NULL)
			return *entry->doc;

		// Cache miss: read the document from the searcher, then rewrite its
		// excerpt fields with highlighted text.
		entry->doc = _CLNEW Document;
		searcher->doc(entry->id, entry->doc);
//#ifdef USE_HIGHLIGHTER
		getHighlightedText(entry->doc);
//#endif

		return *entry->doc;
	}

	int32_t Hits::id (const int32_t n){
	//Func - Returns the id stored for the n-th hit
	//Pre  - n is a valid hit number (getHitDoc throws otherwise)
	//Post - The id of the n-th hit has been returned

		HitDoc* const entry = getHitDoc(n);
		return entry->id;
	}

    float_t Hits::score(const int32_t n){
	//Func - Returns the score stored for the n-th hit
	//Pre  - n is a valid hit number (getHitDoc throws otherwise)
	//Post - The score of the n-th hit has been returned

		HitDoc* const entry = getHitDoc(n);
		return entry->score;
	}

	void Hits::getMoreDocs(const size_t m){
		size_t _min = m;
		{
			size_t nHits = hitDocs.size();
			if ( nHits > _min)
				_min = nHits;
		}

		size_t n = _min * 2;				  // double # retrieved
		TopDocs* topDocs = NULL;
		if ( sort==NULL )
			topDocs = (TopDocs*)((Searchable*)searcher)->_search(query, filter, n);
		else
			topDocs = (TopDocs*)((Searchable*)searcher)->_search(query, filter, n, sort);
		_length = topDocs->totalHits;
		ScoreDoc* scoreDocs = topDocs->scoreDocs;
		int32_t scoreDocsLength = topDocs->scoreDocsLength;

		float_t scoreNorm = 1.0f;
		//Check that scoreDocs is a valid pointer before using it
		if (scoreDocs != NULL){
			if (_length > 0 && scoreDocs[0].score > 1.0f){
				scoreNorm = 1.0f / scoreDocs[0].score;
			}

			int32_t end = scoreDocsLength < _length ? scoreDocsLength : _length;
			for (int32_t i = hitDocs.size(); i < end; i++) {
				hitDocs.push_back(_CLNEW HitDoc(scoreDocs[i].score*scoreNorm, scoreDocs[i].doc));
			}
		}

		_CLDELETE(topDocs);
	}

	HitDoc* Hits::getHitDoc(const size_t n){
	//Func - Returns the cached entry for the n-th hit, fetching more hits from
	//       the searcher when n lies beyond what is cached
	//Pre  - true
	//Post - If n >= _length a CL_ERR_IndexOutOfBounds error has been thrown,
	//       otherwise hitDocs[n] has been returned

		if (n >= _length){
			TCHAR buf[100];
			// n is a size_t while the format string uses %d, so convert it to
			// int explicitly to keep the argument type in step with the
			// conversion specifier.
			_sntprintf(buf, 100,_T("Not a valid hit number: %d"), static_cast<int>(n));
			_CLTHROWT(CL_ERR_IndexOutOfBounds, buf );
		}

		// Not cached yet: getMoreDocs requests at least 2*n hits.
		if (n >= hitDocs.size())
			getMoreDocs(n);

		return hitDocs[n];
	}

	void Hits::addToFront(HitDoc* hitDoc) {
	//Func - Inserts hitDoc at the front of the cache list
	//Pre  - hitDoc is a valid pointer
	//Post - first points to hitDoc, the previous first entry (if any) follows
	//       it, and numDocs has been incremented

		HitDoc* const oldFirst = first;

		if (oldFirst != NULL)
			oldFirst->prev = hitDoc;	// link the former head back to the new one
		else
			last = hitDoc;			// list was empty: new entry is also the tail

		hitDoc->next = oldFirst;
		first = hitDoc;
		hitDoc->prev = NULL;

		++numDocs;
	}

	void Hits::remove(const HitDoc* hitDoc) {	  // remove from cache
		if (hitDoc->doc == NULL)			  // it's not in the list
			return;					  // abort

		if (hitDoc->next == NULL)
			last = hitDoc->prev;
		else
			hitDoc->next->prev = hitDoc->prev;

		if (hitDoc->prev == NULL)
			first = hitDoc->next;
		else
			hitDoc->prev->next = hitDoc->next;

		numDocs--;
	}
CL_NS_END