searchengine/cpix/cpix/src/fileparser/textfileparser.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Mon, 03 May 2010 13:33:22 +0300
changeset 1 6f2c1c46032b
parent 0 671dee74050a
permissions -rw-r--r--
Revision: 201015 Kit: 201018

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
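* Description: Parser for plain text (.txt) files: recognizes them by extension and fills in the contents field and excerpt of a document.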
*
*/

#include <wchar.h>
#include <string.h>

#include <fstream>
#include <string>

#include "document.h"
#include "cpixdoc.h"
#include "fileparser/fileparser.h" // removeStandardFields()

#include "CLucene.h"

#include "indevicecfg.h"

#include "cpixidxdb.h"
#include "cluceneext.h"

namespace
{
    // File name suffixes that isTextFile() accepts. Only these two exact
    // spellings are listed; both have the same length.
    const char EXTENSION[]       = ".txt";
    const char EXTENSION_UPPER[] = ".TXT";

    // Encoding name handed to the readers that open text files in this file.
    const char DEFAULT_ENCODING[] = "UTF-8";
    
    /**
     * Reads one line of text from reader into line.
     *
     * Characters are appended to line until a line break ('\n' or '\r')
     * is consumed, the end of the stream is reached (read() yields -1), or
     * line has grown beyond 500 characters. The line break itself is not
     * stored. Each of '\n' and '\r' ends a line on its own, so a "\r\n"
     * pair produces an additional empty line.
     *
     * @param reader source of characters
     * @param line   receives the characters read; its previous content is
     *               discarded
     * @return 1 if a line (possibly empty or truncated) was produced,
     *         0 if the end of the stream was hit before any character
     *         was read
     */
    int clgetline(lucene::util::Reader& reader, std::wstring& line)
    {
        // Longest line returned in one piece; the remainder of a longer
        // line is delivered by the following calls.
        const std::wstring::size_type lengthLimit = 500;

        line.clear();

        for (;;)
        {
            const int ch = reader.read();

            if (ch == -1)
            {
                // End of stream: report success only if something was read.
                return line.empty() ? 0 : 1;
            }

            if (ch == '\n' || ch == '\r')
            {
                return 1;
            }

            line.push_back(static_cast<wchar_t>(ch));

            if (line.size() > lengthLimit)
            {
                return 1;
            }
        }
    }

    /**
     * Builds an excerpt from the beginning of a text file.
     *
     * The file is read line by line (see clgetline) as DEFAULT_ENCODING
     * text and every line is passed to cpix_getExcerptOfWText, which
     * advances dst and updates the remaining word and buffer budgets.
     * Reading stops when either budget is used up or the file ends.
     *
     * dst is set to an empty string up front, so it holds a valid
     * string even when the file cannot be opened or contains no lines.
     *
     * @param dst      destination buffer; nothing is done if it is NULL
     * @param path     path of the file to read
     * @param maxWords maximum number of words to put into the excerpt
     * @param bufSize  capacity of dst, counted in wchar_t elements
     */
    void getExcerptOfFile(wchar_t       * dst,
                          const char    * path,
                          size_t          maxWords,
                          size_t          bufSize)
    {
        using namespace lucene::util;

        // Without room for at least the terminator there is nothing to do.
        if (dst == NULL || bufSize == 0)
            {
                return;
            }

        // Callers read dst as a string afterwards; make sure it is one
        // even if not a single line gets processed below.
        dst[0] = L'\0';

        // Lucene reader can do UTF-8 magic, so let's use it
        FileReader file( path, DEFAULT_ENCODING );

        if ( file.reader->getStatus() != jstreams::Ok )
            {
                return;
            }

        cpix_EPIState
            epiState;
        cpix_init_EPIState(&epiState);

        std::wstring
            line;

        while (bufSize > 0 && maxWords > 0 && clgetline(file, line))
            {
                dst = cpix_getExcerptOfWText(dst,
                                             line.c_str(),
                                             &maxWords,
                                             &bufSize,
                                             &epiState);
            }
    }

}


namespace Cpix
{

    /**
     * Tells whether path names a text file, judged by its suffix only.
     *
     * The comparison is exact: the path must end with EXTENSION or
     * EXTENSION_UPPER; other spellings such as ".Txt" are not accepted.
     *
     * @param path null-terminated file path; NULL yields false
     * @return true if path ends with one of the two extensions
     */
    bool isTextFile(const char * path)
    {
        if (path == NULL)
            {
            return false;
            }

        const size_t
            pathLength = strlen(path);

        // EXTENSION and EXTENSION_UPPER have the same length.
        const size_t
            extLength = strlen(EXTENSION);

        // A path shorter than the extension cannot end with it. Without
        // this guard the suffix pointer computed below would point before
        // the start of path.
        if (pathLength < extLength)
            {
            return false;
            }

        const char
            * ext = path + (pathLength - extLength);

        return (strcmp(EXTENSION, ext) == 0)
            || (strcmp(EXTENSION_UPPER, ext) == 0);
    }

    
    /**
     * Fills doc with the indexable data of the text file at path.
     *
     * In order, this function
     *  - removes the standard fields from doc,
     *  - adds a tokenized, non-stored CONTENTS_FIELD whose value is read
     *    from the file as DEFAULT_ENCODING text,
     *  - sets the excerpt: the first words of the file if the file has a
     *    positive size, otherwise the value of LCPIX_DOCUID_FIELD,
     *  - sets the application class and the mime type,
     *  - runs GenericFileProcessor on the document.
     *
     * @param doc  document to fill in
     * @param path path of the text file
     */
    void processTextFile(Cpix::Document * doc,
                         const char     * path)
    {
        using namespace lucene::util;
        using namespace Cpix;

        // remove these fields before creating new values for them.
        removeStandardFields(doc);

        // The auto_ptr guards the new field until doc->add() has returned
        // and is released afterwards (presumably the document owns the
        // field from then on - confirm against Document::add).
        std::auto_ptr<Field>
            newField(new Field(CONTENTS_FIELD,
                               new FileReaderProxy(path,
                                                   DEFAULT_ENCODING),
                               cpix_STORE_NO | cpix_INDEX_TOKENIZED));
        doc->add(newField.get());
        newField.release();

        /* Determine the file size. It stays -1 if the file cannot be
         * opened or is not seekable. If the size is not positive (empty
         * or unreadable file) the path name contained in "_docuid" is
         * used as excerpt.
         */
        long long size = -1;
        FILE* textFilePtr = fopen(path, "r");
        if (textFilePtr != NULL) {
            if (fseek(textFilePtr, 0, SEEK_END) == 0) {
                size = ftell(textFilePtr);
            }
            fclose(textFilePtr);
        }

        if (size > 0) {
            wchar_t excerpt[512];
            // Start from an empty string so that setExcerpt never reads
            // uninitialized memory when no excerpt text gets produced.
            excerpt[0] = L'\0';
            getExcerptOfFile(excerpt,
                             path,
                             10, // max words
                             sizeof(excerpt) / sizeof(wchar_t));
            doc->setExcerpt(excerpt);
        } else {
            // For an empty (or unreadable) file set the path as excerpt
            doc->setExcerpt(doc->get(LCPIX_DOCUID_FIELD));
        }

        doc->setAppClass(CONTENTAPPCLASS);
        doc->setMimeType(LTEXTFILE_MIMETYPE);
        // Always perform generic file processing
        GenericFileProcessor(doc,path);


        // TODO should we index dates (last modified)?
    }

}