FCL/sf/mw/searchsrv: searchengine/cpix/cpix/src/fileparser/fileparser.cpp@cf5c74390b98


/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#include "cpixstrtools.h"

#include "ifieldfilter.h"
#include "fileparser/fileparser.h"
#include "cpixexc.h"

#include "indevicecfg.h"
#include "document.h"
#include "cpixdoc.h"
#include "common/cpixlog.h"

namespace
{

    /*************************************************************
     *
     * These are the signatures for the "interface methods" of the
     * content-specific file parsers.
     *
     */

    /**
     * This "content-specific file parser method signature" is for a
     * function capable of telling whether a given path matches the
     * file type that a particular file parser can parse.
     */
    typedef bool (*FileTypeMatcher)(const char *);

    /**
     * Once a path has been matched by a particular file parser, this
     * processor will be invoked. The parser must create file type
     * specific fields on the given document based on the file at the
     * given path.
     *
     *  (a) Any common fields (like extension, file base name, etc)
     *      are added by the general file parser, so file-type
     *      specific file parsers need not bother.
     *
     *  (b) All file type specific file parsers MUST define the
     *      appclass, mimetype and excerpt fields. Function
     *      removeStandardFields() must be called to remove the original
     *      values defined by cpixdoc.cpp:initializeDoc.
     *
     *  (c) These flags MUST be used for the three mandatory fields:
     *
     *      appclass  : Field::STORE_YES | Field::INDEX_TOKENIZED
     *
     *      mime type : Field::STORE_YES | Field::INDEX_UNTOKENIZED
     *
     *      excerpt   : Field::STORE_YES | Field::INDEX_NO
     */
    typedef void (*FileTypeProcessor)(Cpix::Document *,
                                      const char                 *);

}


namespace Cpix
{

    /************************************************************
     *
     * These are the file type specific parser "interface methods",
     * that is, functions that are only declared here and come from
     * different "modules" (the source file is named above each pair).
     *
     * Every pair consists of a matcher (isXxxFile), which has the
     * FileTypeMatcher signature, and a processor (processXxxFile),
     * which has the FileTypeProcessor signature.
     *
     */


    // from textfileparser.cpp
    bool isTextFile(const char * path);
    void processTextFile(Cpix::Document * doc,
                         const char     * path);

    // from jpegfileparser.cpp
    // (declared, but its entry in fileProcessorDefs is inside an
    // "#if 0" section)
    bool isJpegFile(const char * path);
    void processJpegFile(Cpix::Document * doc,
                         const char     * path);

    // from mp3fileparser.cpp
    // (declared, but its entry in fileProcessorDefs is inside an
    // "#if 0" section)
    bool isMp3File(const char * path);
    void processMp3File(Cpix::Document * doc,
                        const char     * path);
    
    // from PDFfileparser.cpp
    bool isPdfFile(const char * path);
    void processPdfFile(Cpix::Document * doc,
                        const char     * path);

    // from unrecognizedfileparser.cpp
    bool isUnrecognizedFile(const char * path);
    void processUnrecognizedFile(Cpix::Document * doc,
                                 const char     * path);
}



namespace Cpix
{

    /**
     * Adds the fields that are common to all files - full name, base
     * name and extension - to the given document. All three values
     * are derived from the given path.
     *
     * @param doc the document the three fields are added to
     *
     * @param path the path of the file; only the part following the
     * last '/' or '\' is looked at
     */
    void GenericFileProcessor(Cpix::Document 		 * doc,
                              const char                 * path)
    {
        using namespace std;
        using namespace Cpix;
        using namespace Cpt;

        // wide character version of path - presumably auto_array
        // takes care of the multibyte to wide character conversion
        auto_array<wchar_t>
            widePath(path);

        // Starts out as the complete path, and is then cut down to
        // the part after the last directory separator. If path got
        // here, then it is a file, so its last character cannot
        // possibly be a directory separator.
        wstring
            fileName(widePath.get());

        const wstring::size_type
            sepPos = fileName.find_last_of(L"/\\");

        if (sepPos != wstring::npos)
            {
                fileName.erase(0, sepPos + 1);
            }

        const wstring::size_type
            dotPos = fileName.rfind(L'.');

        // The character count is clamped to the length of the
        // string, so without any dot (dotPos is npos) the base name
        // is the complete file name.
        const wstring
            stem(fileName, 0, dotPos);

        // stays empty if there is no dot or nothing follows the dot
        wstring
            suffix;

        if (dotPos != wstring::npos)
            {
                suffix = fileName.substr(dotPos + 1);
            }

        // Each field is guarded by an auto_ptr until add() has
        // returned; ownership is given up only after that.
        auto_ptr<Field>
            fullNameField(new Field(LFULLNAME_FIELD,
                                    fileName.c_str(),
                                    cpix_STORE_YES
                                    | cpix_INDEX_TOKENIZED
                                    | cpix_AGGREGATE_YES
                                    | cpix_FREE_TEXT));
        doc->add(fullNameField.get());
        fullNameField.release();

        auto_ptr<Field>
            baseNameField(new Field(LBASENAME_FIELD,
                                    stem.c_str(),
                                    cpix_STORE_NO
                                    | cpix_INDEX_TOKENIZED
                                    | cpix_AGGREGATE_YES
                                    | cpix_FREE_TEXT));
        doc->add(baseNameField.get());
        baseNameField.release();

        auto_ptr<Field>
            extensionField(new Field(LEXTENSION_FIELD,
                                     suffix.c_str(),
                                     cpix_STORE_YES
                                     | cpix_INDEX_TOKENIZED
                                     | cpix_AGGREGATE_YES
                                     | cpix_FREE_TEXT));
        doc->add(extensionField.get());
        extensionField.release();
    }

}



namespace
{

    /**
     * One entry of the file processor table: a matcher together with
     * the processor that is to be used for the paths the matcher
     * accepts.
     */
    struct FileProcessorDef
    {
        // tells whether a path is of the file type of this entry
        FileTypeMatcher   matcher_;
        // creates the file type specific fields for a matching path
        FileTypeProcessor processor_;
    };

    
    /**
     * The table of the known file type specific parsers. It is
     * searched front to back by GetFileTypeProcessor(), so the order
     * of the entries matters. The entry with the NULL matcher
     * terminates the table.
     */
    const FileProcessorDef fileProcessorDefs[] = {
        {
            Cpix::isTextFile,
            Cpix::processTextFile
        },
/* 
 * Since all the media files are harvested from MDS, there is no need for the JPEG and MP3 parsers in cpix.
 * The source code of the JPEG and MP3 parsers has not been removed, so that it can be used again if we
 * ever revert to the CPIX parsers.
*/        
        
#if 0  
        {
            Cpix::isJpegFile,
            Cpix::processJpegFile
        },

        {
            Cpix::isMp3File,
            Cpix::processMp3File
        },
#endif
		{
            Cpix::isPdfFile,
            Cpix::processPdfFile
        },
 
        // TODO
        // enter more file processor definitions here

        
        // This last file parser accepts ALL types, and only defines a
        // generic appclass (FILEAPPCLASS). So any specific type
        // parser must be added before this "catch-all" clause.
        {
            Cpix::isUnrecognizedFile,
            Cpix::processUnrecognizedFile
        },

        // end of list
        { NULL,
          NULL
        }
    };


    /**
     * Looks up the processor to use for the given path.
     *
     * The entries of fileProcessorDefs are tried in table order, and
     * the search stops at the first entry whose matcher accepts the
     * path.
     *
     * @param path the path that is handed to the matchers
     *
     * @return the processor of the first entry whose matcher accepts
     * the path, or NULL if the terminating entry (NULL matcher) is
     * reached without any matcher accepting the path
     */
    FileTypeProcessor GetFileTypeProcessor(const char * path)
    {
        for (const FileProcessorDef * def = fileProcessorDefs;
             def->matcher_ != NULL;
             ++def)
            {
                if (def->matcher_(path))
                    {
                        return def->processor_;
                    }
            }

        // ran into the terminating entry
        return NULL;
    }


}


namespace Cpix
{

    /**
     * Removes the excerpt, appclass and mimetype fields from the
     * given document, in this order. The file type specific parsers
     * are expected to call this before defining their own values for
     * these three fields.
     *
     * @param doc the document to remove the fields from
     */
    void removeStandardFields(Cpix::Document * doc)
    {
        Cpix::Document
            & target = *doc;

        target.removeField(LCPIX_EXCERPT_FIELD);
        target.removeField(LCPIX_APPCLASS_FIELD);
        target.removeField(LCPIX_MIMETYPE_FIELD);
    }
    
    /**
     * Checks whether the file at the given path is to be parsed,
     * judging by the extension of the file only.
     *
     * This is an allow list: only the extensions "pdf" and "txt" are
     * accepted. Since the media files are harvested from the MDS
     * server, the JPEG and MP3 extensions are not on the list.
     *
     * The extension is whatever follows the last '.' of the file
     * name, and the file name is whatever follows the last '/' or
     * '\' of the path. A dot in a directory name is therefore not
     * mistaken for the start of an extension.
     *
     * NOTE(review): the comparison is case sensitive, so "a.PDF" is
     * not accepted - confirm that this is intended.
     *
     * @param wPathName the path of the file, may be NULL
     *
     * @return true if the extension is on the list; false otherwise,
     * which includes a NULL or empty path and a file name without an
     * extension
     */
    bool isFileAllowedToParse(const wchar_t  * wPathName)
    {
        // the extensions (without the dot) that are parsed
        static const wchar_t * const parsedList[] = {
            L"pdf",
            L"txt"
        };
        const std::wstring::size_type
            parsedCount = sizeof(parsedList) / sizeof(parsedList[0]);

        // a wstring must not be constructed from a NULL pointer
        if (wPathName == NULL)
            {
                return false;
            }

        const std::wstring
            wPathStr(wPathName);

        const std::wstring::size_type
            lastSep = wPathStr.find_last_of(L"/\\");

        // if path got here, then it is a file, therefore the last
        // character of path cannot possibly be a directory separator
        const std::wstring
            wFullName(lastSep == std::wstring::npos
                      ? wPathStr
                      : wPathStr.substr(lastSep + 1));

        const std::wstring::size_type
            lastDot = wFullName.find_last_of(L'.');

        if (lastDot == std::wstring::npos)
            {
                // no extension at all (also covers the empty path)
                return false;
            }

        // empty if the dot is the last character of the file name,
        // in which case nothing on the list can match
        const std::wstring
            wExtension(wFullName.substr(lastDot + 1));

        for (std::wstring::size_type i = 0; i < parsedCount; ++i)
            {
                if (wExtension == parsedList[i])
                    {
                        return true;
                    }
            }

        return false;
    }

    /**
     * Runs the file type specific parser for the file that the given
     * document stands for. The doc uid field of the document is used
     * as the path of the file.
     *
     * @param doc the document to add the file type specific fields to
     *
     * @return true if the file was processed without an exception
     * (so that doc is indexed) - this includes the case when no
     * processor was found for the path. False if the document has no
     * doc uid field, if the file is not allowed to be parsed, or if
     * an exception was caught while processing.
     */
    bool FileParser::process(Cpix::Document * doc)
    {
        const wchar_t
            * uid = doc->get(LCPIX_DOCUID_FIELD);

        if (uid == NULL)
            {
                logMsg(CPIX_LL_WARNING,
                       "Corrupt document - no doc uid field");
                return false;
            }

        if (!isFileAllowedToParse(uid))
            {
                return false;
            }

        // Created outside of the try block, because the exception
        // handlers need the path for their log messages.
        Cpt::auto_array<char>
            narrowPath(uid);

        bool
            processed = false;

        try
            {
                FileTypeProcessor
                    handler = GetFileTypeProcessor(narrowPath.get());

                if (handler != NULL)
                    {
                        (*handler)(doc,
                                   narrowPath.get());
                    }
                else
                    {
                        logMsg(CPIX_LL_WARNING,
                               "File %s: unrecognized type, not parsing file type specifics.\n",
                               narrowPath.get());
                    }

                // return true so that doc is indexed
                processed = true;
            }
        catch (std::exception & exc)
            {
                logMsg(CPIX_LL_ERROR,
                       "File %s: parser error: %s\n",
                       narrowPath.get(),
                       exc.what());
            }
        catch (...)
            {
                logMsg(CPIX_LL_ERROR,
                       "File %s: unknown parse error\n",
                       narrowPath.get());
            }

        return processed;
    }
    

    /**
     * The destructor body is empty on purpose, there is nothing to
     * clean up here.
     */
    FileParser::~FileParser()
    {
    }


}
author	Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
	Wed, 18 Aug 2010 10:53:26 +0300
changeset 15	cf5c74390b98
parent 9	d575fd691cf9
permissions	-rw-r--r--