searchengine/cpix/cpix/src/analyzer.cpp
author hgs
Mon, 28 Jun 2010 10:34:53 +0530
changeset 8 6547bf8ca13a
parent 7 a5fbfefd615f
child 19 e3c09e9691e0
permissions -rw-r--r--
201025

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

// general utilities
#include "wchar.h"
#include <string>
#include <vector>
#include <sstream>
#include <iostream>
#include <fstream>
#include <algorithm>

// clucene
#include "CLucene.h"

// support
#include "cpixparsetools.h"
#include "cpixfstools.h"

// internal
#include "analyzer.h"
#include "cpixanalyzer.h"
#include "cpixexc.h"
#include "document.h"
#include "cluceneext.h"
#include "indevicecfg.h"
#include "initparams.h"
#include "thaianalysis.h"

#include "analyzerexp.h"
#include "customanalyzer.h"
#include "common/cpixlog.h"

namespace
{
    const char AGGR_NONFILEREADERPROXY_ERR[] 
    = "Aggregated reader field should be FileReaderProxy instance";

    const char AGGR_STREAMREADER_ERR[] 
    = "Aggregating streamValue-fields not implemented";
    
    const char THAI_LANGUAGE_FILE[] 
    = "thaidict.sm";

    const char ANALYZER_FILE[]
    = "analyzer.loc";

    const wchar_t DEFAULT_ANALYZER_CONFIG[]
        = L"default";

    const wchar_t QUERY_ANALYZER_CONFIG[]
        = L"query";

    const wchar_t PREFIX_ANALYZER_CONFIG[]
        = L"prefix";

//    const wchar_t CPIX_ANALYZER_FALLBACK[]
//    = CPIX_ANALYZER_STANDARD;
//
//    const wchar_t CPIX_PREFIX_ANALYZER_FALLBACK[]
//    = CPIX_TOKENIZER_LETTER L">" CPIX_FILTER_LOWERCASE;

    
}


namespace Cpix {

	
Analysis* Analysis::theInstance_ = NULL; 

	void Analysis::init(InitParams& ip) {
		// Init thai analysis with thai dictionary
		std::string thai( Cpt::appendpath(ip.getResourceDir(),
										  THAI_LANGUAGE_FILE) );
		
		if ( Cpt::filesize( thai.c_str() ) ) {
			analysis::InitThaiAnalysis(thai.c_str());
		} else {
			logMsg(CPIX_LL_WARNING,
				   "Thai dictionary could not be found. Thai analysis will NOT work.");
		}
	
		// Setup the analysis instance
		theInstance_ = new Analysis(ip);
	}
	
	Analysis::Analysis(InitParams& ip) 
	:	defaultAnalyzer_(),
	 	queryAnalyzer_(), 
		prefixAnalyzer_() {
		
		auto_ptr<AnalyzerExp::Piping> p = parse( Cpt::appendpath( ip.getResourceDir(), ANALYZER_FILE ) );
		
		defaultAnalyzer_.reset( new CustomAnalyzer( *p, DEFAULT_ANALYZER_CONFIG ) ); 
		queryAnalyzer_.reset( new CustomAnalyzer( *p, QUERY_ANALYZER_CONFIG ) ); 
		prefixAnalyzer_.reset( new CustomAnalyzer( *p, PREFIX_ANALYZER_CONFIG ) ); 
	}
	
	auto_ptr<AnalyzerExp::Piping> Analysis::parse(std::string path) {
		std::wifstream in(path.c_str());
		auto_ptr<AnalyzerExp::Piping> ret; 
		if ( in ) {
		
			// Reserve constant size buffer and populate it with definition
			//
			int filesize = Cpt::filesize(path.c_str()); 
			Cpt::auto_array<wchar_t> buf( new wchar_t[filesize+1] );
			in.read(buf.get(), filesize);
			buf.get()[filesize] = '\0'; 
			if ( !in.fail() ) {
				try {
					ret = AnalyzerExp::ParsePiping( buf.get() );
				} catch (...) {}
			} 
			in.close();
		} 
		
		if ( !ret.get() ) { 
			THROW_CPIXEXC("Analyzer definition not found. %s could not be opened. ", path.c_str()); 
		}
		return ret; 
	}
	
	void Analysis::shutdown() {
		analysis::ShutdownThaiAnalysis(); 
		delete theInstance_;
		theInstance_ = NULL; 
	}

	lucene::analysis::Analyzer& Analysis::getDefaultAnalyzer() {
		// TODO: Assert( theInstance_ );
		return *theInstance_->defaultAnalyzer_; 
	}

	lucene::analysis::Analyzer& Analysis::getQueryAnalyzer() {
		// TODO: Assert( theInstance_ );
		return *theInstance_->queryAnalyzer_; 
	}

	lucene::analysis::Analyzer& Analysis::getPrefixAnalyzer() {
		// TODO: Assert( theInstance_ );
		return *theInstance_->prefixAnalyzer_; 
	}

	PrefixGenerator::PrefixGenerator(
		lucene::analysis::TokenStream* in, 
		bool deleteTS, 
		size_t maxPrefixLength) 
	: 	TokenFilter(in, deleteTS),
	  	token_(), 
	  	prefixLength_(0),
	  	maxPrefixLength_(maxPrefixLength) {}
	
	
	PrefixGenerator::~PrefixGenerator() {
	}

	
	bool PrefixGenerator::next(lucene::analysis::Token* token) {
		token_.setPositionIncrement(0); 

		while (prefixLength_ == 0) {
			token_.setPositionIncrement(1); // default position increment
			if (!input->next(&token_)) {
				return false;
			}
			prefixLength_ = std::min(token_.termTextLength(), maxPrefixLength_);
		}
			
		// Clip token
		std::wstring clipped; 
		clipped = token_.termText();
		token_.setText(clipped.substr(0, prefixLength_).c_str());
		
		// Copy
		token->set(token_.termText(), token_.startOffset(), token_.endOffset(), token_.type());
		token->setPositionIncrement(token_.getPositionIncrement());
		
		// Reduce prefixLength_
		prefixLength_--;
		return true; 
	}

    AggregateFieldTokenStream::AggregateFieldTokenStream(lucene::analysis::Analyzer& analyzer, 
                                                         DocumentFieldIterator* fields) 
	: stream_(), analyzer_( analyzer ), reader_(), fields_( fields ) {
        getNextStream(); 
    }
													   
    AggregateFieldTokenStream::~AggregateFieldTokenStream() {
        _CLDELETE( stream_ ); 
        delete fields_; 
    }
	
    bool AggregateFieldTokenStream::next(lucene::analysis::Token* token) {
        while ( stream_ ) {
            if ( stream_->next( token ) ) {
                return true;
            }
            getNextStream();
        }
        return false;
    }
		
    void AggregateFieldTokenStream::close() {
        if (stream_) stream_->close(); 
        _CLDELETE( stream_ ); 
        _CLDELETE( reader_ ); 
    }
	
    void AggregateFieldTokenStream::getNextStream()
    {
        using namespace lucene::document;
        using namespace lucene::util; 
	
        if ( stream_ ) stream_->close(); 
        _CLDELETE( stream_ ); 
        _CLDELETE( reader_ );
		
        Field* field = 0; 
        while (*fields_ && field == NULL)
            {
                field = (*fields_)++;
	
                if (!field->isAggregated()) 
                    {
						field = 0;
                    }
            }
        if (field) {
            if (field->stringValue() != NULL)
                {
                    reader_ = _CLNEW CL_NS(util)::StringReader(field->stringValue(),_tcslen(field->stringValue()),false);
                }
            else if (field->native().readerValue() != NULL)
                {
                    Reader* r = field->native().readerValue();
                    FileReaderProxy
                        * frp = 
                        dynamic_cast<FileReaderProxy*>(r);
                    if (frp == NULL)
                        {
                            _CLTHROWA(CL_ERR_IO, AGGR_NONFILEREADERPROXY_ERR);
                        }
                    else
                        {
                            reader_ = frp->clone();
                        }
                }
            else
                {
                    _CLTHROWA(CL_ERR_IO, AGGR_STREAMREADER_ERR);
                }
            if(field->isFreeText())
                {
                    using namespace lucene::analysis;
                    stream_ = _CLNEW standard::StandardTokenizer(reader_);
                    stream_ = _CLNEW standard::StandardFilter(stream_,true);
                    stream_ = _CLNEW LowerCaseFilter(stream_,true);
                }
            else
            stream_ = analyzer_.tokenStream( field->name(), reader_ ); 
        }
    }
	
		
    AggregateFieldAnalyzer::AggregateFieldAnalyzer(Cpix::Document& document, 
                                                   lucene::analysis::Analyzer& analyzer) 
	:	analyzer_(analyzer), document_(document)
    {
    }
	
    lucene::analysis::TokenStream* AggregateFieldAnalyzer::tokenStream(const TCHAR     * fieldName, 
                                                                       lucene::util::Reader * reader) {
        if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
            return new AggregateFieldTokenStream( analyzer_, document_.fields()); 
        } else if ( wcscmp( fieldName, LCPIX_DEFAULT_PREFIX_FIELD ) == 0 ) {
            return
				new PrefixGenerator(
					new AggregateFieldTokenStream( analyzer_, document_.fields()),
					true,
					OPTIMIZED_PREFIX_MAX_LENGTH);
        } else {
            return analyzer_.tokenStream( fieldName, reader ); 
        }
    }
	
    SystemAnalyzer::SystemAnalyzer(lucene::analysis::Analyzer* analyzer) : analyzer_(analyzer) {} 
    SystemAnalyzer::~SystemAnalyzer() { _CLDELETE(analyzer_); }
	
    lucene::analysis::TokenStream* SystemAnalyzer::tokenStream(const TCHAR      	* fieldName, 
                                                               lucene::util::Reader * reader) {
        using namespace lucene::analysis; 
        if ( wcscmp( fieldName, LCPIX_DEFAULT_FIELD ) == 0 ) {
            // Use standard analyzer without stop filter for this task
            TokenStream* ret = _CLNEW standard::StandardTokenizer(reader);
            ret = _CLNEW standard::StandardFilter(ret,true);
            ret = _CLNEW LowerCaseFilter(ret,true);
            return ret;
        } else if (wcscmp( fieldName, LCPIX_DOCUID_FIELD) == 0){
            // Use standard analyzer without stop filter for this task
            return  _CLNEW KeywordTokenizer(reader);
        } else if (wcscmp( fieldName, LCPIX_APPCLASS_FIELD )  == 0){
            // Use standard analyzer without stop filter for this task
            TokenStream* ret = _CLNEW WhitespaceTokenizer(reader);
            ret = _CLNEW LowerCaseFilter(ret,true);
            return ret;
        } else if (wcscmp( fieldName, LCPIX_MIMETYPE_FIELD ) == 0) {
            TokenStream* ret = _CLNEW KeywordTokenizer(reader);
            return ret;
        } else {
            return analyzer_->tokenStream( fieldName, reader ); 
        }									 
    }
}