searchengine/cpix/cpix/src/fileparser/pdffileparser.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Mon, 03 May 2010 13:33:22 +0300
changeset 1 6f2c1c46032b
parent 0 671dee74050a
child 2 6c1a2771f4b7
permissions -rw-r--r--
Revision: 201015 Kit: 201018

/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description: 
*
*/

#include <wchar.h>
#include <string.h>

#include <fstream>
#include <string>

#include <zlib.h>
#include <wchar.h>

#include <memory.h>
#include <string.h>
#include <ctype.h>
#include <string>
#include <vector>
#include <iostream>

#include <fcntl.h>


#include "document.h"
#include "cpixdoc.h"
#include "fileparser/fileparser.h" // removeStandardFields()

#include "CLucene.h"

#include "indevicecfg.h"

#include "cpixidxdb.h"
#include "cluceneext.h"
#include "cpixstrtools.h"

namespace
{
    // Plain-text file suffixes in lower and upper case.
    // NOTE(review): neither constant is referenced in the visible part of this
    // file (the PDF suffixes are defined locally in isPdfFile()) - they look
    // like leftovers from a text-file parser; confirm before removing.
    const char EXTENSION[]       = ".txt";
    const char EXTENSION_UPPER[] = ".TXT";

    // Encoding name passed to the Lucene FileReader in getExcerptOfFile().
    const char DEFAULT_ENCODING[] = "UTF-8";
    
    /**
     * Reads the next line from 'reader' into 'line' (previous content is
     * discarded). A line ends at '\n', at '\r', at end of input, or as
     * soon as it has grown beyond 500 characters; the terminator itself
     * is not stored.
     *
     * @return 1 if a line (possibly empty) was produced, 0 if the end of
     *         the input was hit before any character was read.
     */
    int clgetline(lucene::util::Reader& reader, std::wstring& line) 
        {
        line.clear();

        for (;;)
            {
            const int ch = reader.read();

            if (ch == -1) // EOF: only a non-empty remainder counts as a line
                return line.empty() ? 0 : 1;

            if (ch == '\n' || ch == '\r') // line break
                return 1;

            line.push_back(static_cast<wchar_t>(ch));

            // cap the line length so one huge line cannot be read in whole
            if (line.size() > 500)
                return 1;
            }
        }

    void getExcerptOfFile(wchar_t       * dst,
                          const char    * path,
                          size_t          maxWords,
                          size_t          bufSize)
    {
        using namespace std;
        using namespace lucene::util;
                
        // Lucene reader can do UTF-8 magic, so let's use it
        FileReader file( path, DEFAULT_ENCODING ); 
        
        if ( file.reader->getStatus() == jstreams::Ok ) 
            {
                cpix_EPIState
                    epiState;
                cpix_init_EPIState(&epiState);
        
                wstring
                    line;
        
                while (bufSize > 0 && maxWords > 0 && clgetline(file, line))
                    {
                        dst = cpix_getExcerptOfWText(dst,
                                                     line.c_str(),
                                                     &maxWords,
                                                     &bufSize,
                                                     &epiState);
                    }
            }
    }

}

using namespace std;
using namespace Cpt;

namespace Cpix
    {
    
    // Size of the sliding window of most recently read characters.
    // ProcessOutput() keeps the window, seen2() looks for two-letter
    // operators at its tail, and ExtractNumber() parses a number out of it.
#define oldchar 15
    
    /**
     * Finds the first occurrence of the NUL-terminated string 'search'
     * inside the first 'buffersize' bytes of 'buffer'. The buffer is
     * treated as raw bytes, so embedded NUL bytes are fine.
     *
     * Only complete occurrences that lie entirely inside the buffer are
     * reported; this includes an occurrence that ends exactly on the last
     * byte, and no byte at or beyond buffer + buffersize is ever read.
     *
     * @param buffer     bytes to search in
     * @param search     NUL-terminated pattern
     * @param buffersize number of valid bytes in 'buffer'
     * @return offset of the first match, or -1 if there is none (also for
     *         a NULL argument or a pattern longer than the buffer)
     */
    ssize_t FindStringInBuffer (char* buffer, char* search, size_t buffersize)
                    {
                    if (buffer == 0 || search == 0)
                        return -1;

                    const size_t len = strlen(search);
                    if (len > buffersize)
                        return -1;

                    // last offset at which the whole pattern still fits
                    const size_t lastStart = buffersize - len;
                    for (size_t offset = 0; offset <= lastStart; ++offset)
                        {
                        if (memcmp(buffer + offset, search, len) == 0)
                            return static_cast<ssize_t>(offset);
                        }
                    return -1;
                    }
    
    
         //Tell whether a two-character operator (e.g. BT) has just gone by:
         //the window 'recent' (oldchar characters long) must end in
         //<separator> search[0] search[1] <separator>, where a separator
         //is a space, CR or LF.
         bool seen2(const char* search, char* recent)
             {
             if (recent[oldchar-3] != search[0])
                 return false;
             if (recent[oldchar-2] != search[1])
                 return false;

             const char trailing = recent[oldchar-1];
             if (trailing != ' ' && trailing != 0x0d && trailing != 0x0a)
                 return false;

             const char leading = recent[oldchar-4];
             return leading == ' ' || leading == 0x0d || leading == 0x0a;
             }

         //Convert a recent set of characters into a number if there is one.
         //Otherwise return -1:
         float ExtractNumber(const char* search, int lastcharoffset)
             {
             int i = lastcharoffset;
             while (i>0 && search[i]==' ') i--;
             while (i>0 && (isdigit(search[i]) || search[i]=='.')) i--;
             float flt=-1.0;
             char buffer[oldchar+5];
             memset(buffer,0,sizeof(buffer));
             strncpy(buffer, search+i+1, lastcharoffset-i);
             if (buffer[0] && sscanf(buffer, "%f", &flt))
                 {
                 return flt;
                 }
             return -1.0;
             }
         
         //Build the base name of the temporary files used for 'path':
         //"c:\temp\pdf\" followed by the alphanumeric characters of 'path'
         //(everything else - separators, dots, spaces - is dropped).
         //
         //tempFileName must point to at least 254 bytes; the whole buffer is
         //zeroed first. At most 127 characters of 'path' are used so that the
         //result plus the suffixes appended by the callers always fits; longer
         //paths are truncated (two very long paths sharing their first 127
         //alphanumerics therefore map to the same temp name).
         //
         //Returns the number of characters taken from 'path'.
         int getTempFileName(const char *path, char *tempFileName)
            {
                const size_t TempNameBufLen = 254;
                const int MaxStemLen = 127;

                if (tempFileName == 0)
                    return 0;

                memset(tempFileName,0,TempNameBufLen);
                strcpy(tempFileName,"c:\\temp\\pdf\\");

                if (path == 0)
                    return 0;

                char *out = tempFileName + strlen(tempFileName);
                int retVal = 0;

                for (const char *p = path; *p != '\0' && retVal < MaxStemLen; ++p)
                    {
                        // isalnum() is only defined for unsigned char values and EOF
                        if (isalnum(static_cast<unsigned char>(*p)))
                            {
                              *out++ = *p;
                              retVal ++;
                            }
                    }
                *out = '\0';
                return retVal;
            }


         //This method processes an uncompressed Adobe (text) object and extracts text.
         //
         //Each call reads the next chunk of up to 1024 bytes from fileI, scans it
         //for text objects (BT ... ET) and appends the printable ASCII characters
         //found inside (...) strings to outBuf, with single spaces where the
         //positioning data suggests a gap.
         //
         //  fileI   stream to read from
         //  outBuf  NUL-terminated buffer the text is appended to. Its size is
         //          not passed in; the caller in this file supplies a zeroed
         //          1024-byte buffer, which is enough for one chunk.
         //
         //Returns the length of the string in outBuf, or -1 when there is nothing
         //more to read (or an argument is NULL).
         //
         //Note: the scanner state is local to a call, so a text object that
         //straddles two chunks is only picked up again at the next BT.
         int ProcessOutput( FILE *fileI, char *outBuf)
             {
             const int BufLen = 1024;
             char output[BufLen];

             if (!fileI || !outBuf)
                 return -1;

             //Stop only when nothing was read. Testing feof() here would throw
             //away the final, shorter chunk (and with it every stream smaller
             //than BufLen); the next call reads 0 bytes and reports -1.
             const size_t len = fread(output, 1, BufLen, fileI);
             if (len == 0)
                 return -1;

             //Are we currently inside a text object?
             bool intextobject = false;

             //Is the next character literal (e.g. \\ to get a \ character or \( to get ( ):
             bool nextliteral = false;

             //() Bracket nesting level. Text appears inside ()
             int rbdepth = 0;

             //Keep previous chars to get extract numbers etc.:
             char oc[oldchar];
             memset(oc, ' ', sizeof(oc));

             //Append position; keeps whatever the caller already put in outBuf.
             char *out = outBuf + strlen(outBuf);

             for (size_t i=0; i<len; i++)
                 {
                 const char c = output[i];
                 if (intextobject)
                     {
                     if (rbdepth==0 && seen2("TD", oc))
                         {
                         //Positioning: any offset other than exactly 1 yields a gap.
                         const float num = ExtractNumber(oc,oldchar-5);
                         if (num>1.0 || num<1.0)
                             {
                             *out++ = ' ';
                             }
                         }
                     if (rbdepth==0 && seen2("ET", oc))
                         {
                         //End of a text object: leave text mode and separate
                         //it from whatever follows.
                         intextobject = false;
                         *out++ = ' ';
                         }
                     else if (c=='(' && rbdepth==0 && !nextliteral)
                         {
                         //Start outputting text!
                         rbdepth=1;
                         //A large number in front of ( calls for a gap. The value
                         //is truncated to an integer before the comparison.
                         const int num = static_cast<int>(ExtractNumber(oc,oldchar-1));
                         if (num>100)
                             {
                             *out++ = ' ';
                             }
                         }
                     else if (c==')' && rbdepth==1 && !nextliteral)
                         {
                         //Stop outputting text
                         rbdepth=0;
                         }
                     else if (rbdepth==1)
                         {
                         //Just a normal text character:
                         if (c=='\\' && !nextliteral)
                             {
                             //Only print out next character no matter what. Do not interpret.
                             nextliteral = true;
                             }
                         else
                             {
                             nextliteral = false;
                             //Only printable ASCII is kept.
                             if (c>=' ' && c<='~')
                                 {
                                 *out++ = c;
                                 }
                             }
                         }
                     }
                 //Store the recent characters for when we have to go back for a number:
                 memmove(oc, oc+1, oldchar-1);
                 oc[oldchar-1]=c;
                 if (!intextobject && seen2("BT", oc))
                     {
                     //Start of a text object:
                     intextobject = true;
                     }
                 }

             *out = '\0';
             return static_cast<int>(out - outBuf);
             }


    /* Decompress from file source to file dest until stream ends or EOF.
     inf() returns Z_OK on success, Z_MEM_ERROR if memory could not be
     allocated for processing, Z_DATA_ERROR if the deflate data is
     invalid or incomplete, Z_VERSION_ERROR if the version of zlib.h and
     the version of the library linked do not match, or Z_ERRNO if there
     is an error reading or writing the files. */
    int inf(FILE *source, FILE *dest)
        {
        int ret;
        unsigned have;
        z_stream strm;
        unsigned char in[CHUNK];
        unsigned char out[CHUNK];

        /* allocate inflate state */
        strm.zalloc = Z_NULL;
        strm.zfree = Z_NULL;
        strm.opaque = Z_NULL;
        strm.avail_in = 0;
        strm.next_in = Z_NULL;
        ret = inflateInit(&strm);
        if (ret != Z_OK)
        return ret;

        /* decompress until deflate stream ends or end of file */
        do
            {
            strm.avail_in = fread(in, 1, CHUNK, source);
            if (ferror(source))
                {
                (void) inflateEnd(&strm);
                return Z_ERRNO;
                }
            if (strm.avail_in == 0)
            break;
            strm.next_in = in;

            /* run inflate() on input until output buffer not full */
            do
                {
                strm.avail_out = CHUNK;
                strm.next_out = out;
                ret = inflate(&strm, Z_FINISH); //Z_NO_FLUSH);
                assert(ret != Z_STREAM_ERROR); /* state not clobbered */
                switch (ret)
                    {
                    case Z_NEED_DICT:
                    ret = Z_DATA_ERROR; /* and fall through */
                    case Z_DATA_ERROR:
                    case Z_MEM_ERROR:
                    (void) inflateEnd(&strm);
                    return ret;
                    }
                have = CHUNK - strm.avail_out;
                if (fwrite(out, 1, have, dest) != have || ferror(dest))
                    {
                    (void) inflateEnd(&strm);
                    return Z_ERRNO;
                    }
                }
            while (strm.avail_out == 0);

            /* done when inflate() says it's done */
            }
        while (ret != Z_STREAM_END);

        /* clean up and return */
        (void) inflateEnd(&strm);
        return ret == Z_STREAM_END ? Z_OK : Z_DATA_ERROR;
        }

    /* Print a description of the zlib / i/o error code 'ret' to stderr,
     prefixed with "zpipe: ". For Z_ERRNO the error flags of stdin and
     stdout are consulted to pick the message(s); codes not listed here
     only produce the prefix. */
    void zerr(int ret)
        {
        fputs("zpipe: ", stderr);

        if (ret == Z_ERRNO)
            {
            if (ferror(stdin))
                {
                fputs("error reading stdin\n", stderr);
                }
            if (ferror(stdout))
                {
                fputs("error writing stdout\n", stderr);
                }
            }
        else if (ret == Z_STREAM_ERROR)
            {
            fputs("invalid compression level\n", stderr);
            }
        else if (ret == Z_DATA_ERROR)
            {
            fputs("invalid or incomplete deflate data\n", stderr);
            }
        else if (ret == Z_MEM_ERROR)
            {
            fputs("out of memory\n", stderr);
            }
        else if (ret == Z_VERSION_ERROR)
            {
            fputs("zlib version mismatch!\n", stderr);
            }
        }
    
    //Step back over the end-of-line marker (CR LF or LF) that directly
    //precedes the "endstream" keyword found at offset streamEnd, without
    //ever looking in front of the buffer. Returns the adjusted offset.
    static int skipEolBeforeEndstream(const char* buffer, int streamEnd)
        {
        if (streamEnd >= 2 && buffer[streamEnd-2]==0x0d && buffer[streamEnd-1]==0x0a)
            return streamEnd - 2;
        if (streamEnd >= 1 && buffer[streamEnd-1]==0x0a)
            return streamEnd - 1;
        return streamEnd;
        }

    //Copy the raw bytes of the next "stream ... endstream" section of the PDF
    //file fileI into the temporary file "<temp name>_compressedbin.data",
    //reading fileI in chunks of 1024 bytes starting at its current position.
    //On return fileI is positioned at the end of the consumed section.
    //
    //Returns 1 when a section was copied, -1 when the end of fileI was
    //reached (the chunk that hits end-of-file is not examined), on a read
    //error, or when an argument is NULL / a resource cannot be obtained.
    int createCompressedStream(FILE *fileI, const char *path)
        {
        const int BufLen = 1024;
        char tempFile[254];
        //FindStringInBuffer() takes non-const strings, so keep the keywords
        //in writable arrays instead of passing string literals.
        char streamKeyword[] = "stream";
        char endStreamKeyword[] = "endstream";
        bool hasStreamData = true;
        bool hasStreamStarted = false;

        if (!fileI || !path || feof(fileI))
            {
            return -1;
            }

        //One spare, always-zero byte so that peeking one character past the
        //last byte read (CR LF check below) stays inside the allocation.
        char* buffer = (char *)malloc(sizeof( char ) * (BufLen + 1));
        if (!buffer)
            {
            return -1;
            }

        getTempFileName(path,tempFile);
        strcat(tempFile,"_compressedbin.data");

        FILE* pdfReaderI = fopen(tempFile,"w+b");
        if (!pdfReaderI)
            {
            free (buffer);
            return -1;
            }

        while(hasStreamData)
            {
            memset(buffer,0, BufLen + 1);

            /*
             * !!!!!!!!!!!!!!!!!!!!!!!!!!!   CAUTION !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
             * A keyword ("stream" / "endstream") may be cut in half by the chunk
             * boundary. Such a split keyword is not recognised, so the affected
             * stream is not indexed. Handling it needs more file pointer movement
             * and character comparison and is not implemented.
             */
            const size_t actualRead = fread(buffer, 1, BufLen,fileI);

            //actualRead == 0 without EOF means a read error; stop instead of
            //spinning on it.
            if(feof(fileI) || actualRead == 0)
                {
                fclose(pdfReaderI);
                free (buffer);
                return -1;
                }

            int streamStart = FindStringInBuffer (buffer, streamKeyword, actualRead);
            int streamEnd = FindStringInBuffer (buffer, endStreamKeyword, actualRead);

            //Do not mistake the "stream" inside "endstream" for a start.
            if (streamEnd>= 0 && streamStart> 0 && (streamStart - streamEnd) == 3)
                {
                streamStart = -1;
                }

            if ((streamStart> 0) && (hasStreamStarted == false ))
                {
                if (static_cast<size_t>(streamStart) + 7 > actualRead)
                    {
                    //The keyword is the very last thing in this chunk. Step
                    //back to one byte before it and read again, so that the
                    //keyword and the data behind it arrive in one chunk.
                    fseek ( fileI , -static_cast<long>(actualRead - streamStart + 1) , SEEK_CUR );
                    continue;
                    }
                //Skip to beginning of the data stream:
                streamStart += 6;
                if (buffer[streamStart]==0x0d && buffer[streamStart+1]==0x0a) streamStart+=2;
                else if (buffer[streamStart]==0x0a) streamStart++;

                hasStreamStarted = true;

                int bytesToWrite;
                if (streamEnd> 0)
                    {
                    //Start and end are in the same chunk.
                    streamEnd = skipEolBeforeEndstream(buffer, streamEnd);
                    //NOTE(review): the +1 copies one byte beyond the data;
                    //kept as in the original - confirm whether intended.
                    bytesToWrite = streamEnd - streamStart+1;

                    //Reposition fileI just behind the "endstream" keyword.
                    fseek (fileI , -static_cast<long>(actualRead - streamEnd - 9) , SEEK_CUR );

                    hasStreamStarted = false;
                    hasStreamData = false;
                    }
                else
                    {
                    bytesToWrite = static_cast<int>(actualRead) - streamStart;
                    }

                //An empty stream whose end-of-line marker is shared between
                //the two keywords gives a negative count; write nothing then.
                if (bytesToWrite > 0)
                    {
                    fwrite(buffer + streamStart, 1,bytesToWrite, pdfReaderI);
                    }
                }
            else if (hasStreamStarted)
                {
                int bytesToWrite;
                if (streamEnd>= 0)
                    {
                    streamEnd = skipEolBeforeEndstream(buffer, streamEnd);
                    bytesToWrite = streamEnd;
                    hasStreamStarted = false;
                    hasStreamData = false;

                    //Reposition fileI just behind the "endstream" keyword.
                    fseek (fileI , -static_cast<long>(actualRead - streamEnd - 9) , SEEK_CUR );
                    }
                else
                    {
                    //Whole chunk belongs to the stream.
                    bytesToWrite = static_cast<int>(actualRead);
                    }

                if (bytesToWrite > 0)
                    {
                    fwrite(buffer, 1,bytesToWrite, pdfReaderI);
                    }
                }
            }

        fclose(pdfReaderI);

        free (buffer);
        return 1;
        }

    //Extract the next stream of the PDF file inFile and inflate it:
    //createCompressedStream() copies the raw stream bytes to
    //"<temp name>_compressedbin.data", inf() inflates that file into
    //"<temp name>_uncompressedbin.data", and the compressed temp file is
    //removed again.
    //
    //Returns -1 when createCompressedStream() reports -1 or when one of the
    //temp files cannot be opened; otherwise the value returned by
    //createCompressedStream(). The result of inf() is deliberately not
    //reflected in the return value.
    int extractAStream(FILE *inFile, const char *path)
        {
        char compressedName[254];
        char uncompressedName[254];

        int retf = createCompressedStream(inFile, path);

        if (retf == -1 )
            {
            return retf;
            }

        getTempFileName(path,uncompressedName);
        strcat(uncompressedName,"_uncompressedbin.data");

        getTempFileName(path,compressedName);
        strcat(compressedName,"_compressedbin.data");

        FILE* UncompressedFile = fopen(uncompressedName,"w+b");
        FILE* CompressedFile = fopen(compressedName,"r+b");

        if(CompressedFile && UncompressedFile )
            {
            //Best effort: a stream that does not inflate simply yields
            //whatever output was produced before the failure.
            (void) inf(CompressedFile,UncompressedFile);
            }
        else
            {
            retf = -1;
            }

        //fclose() must not be handed a NULL stream.
        if (UncompressedFile)
            {
            fclose(UncompressedFile);
            }
        if (CompressedFile)
            {
            fclose(CompressedFile);
            }
        remove(compressedName);
        return retf;

        }
  

    
    //Convert the PDF file at 'path' to plain text.
    //
    //The streams of the PDF are extracted one after the other with
    //extractAStream(); the inflated content of each is run through
    //ProcessOutput() and the text found is appended to "<temp name>.txt"
    //(see getTempFileName()). The intermediate "_uncompressedbin.data" and
    //"_compressedbin.data" temp files are removed before returning; the
    //.txt file is left for the caller.
    //
    //Returns the number of bytes written to the .txt file, or -1 when the
    //PDF or the .txt file cannot be opened or memory runs out.
    int32_t convertPDFToText(const char *path)
                 {
                 const size_t OutBufLen = 1024;
                 char textName[254];
                 char uncompressedName[254];
                 char compressedName[254];

                 if (path == 0)
                     {
                         return -1;
                     }

                 _mkdir("c:\\temp");
                 _mkdir("c:\\temp\\pdf");

                 FILE *file = fopen(path,"rb");
                 if (file == 0)
                     {
                         //Nothing was opened, so there is nothing to close.
                         return -1;
                     }

                 getTempFileName(path,textName);
                 strcat(textName,".txt");

                 getTempFileName(path,uncompressedName);
                 strcat(uncompressedName,"_uncompressedbin.data");

                 getTempFileName(path,compressedName);
                 strcat(compressedName,"_compressedbin.data");

                 FILE *fileO = fopen(textName,"w");
                 char *outBuf = (char *) malloc (sizeof(char)*OutBufLen);
                 int32_t nwritten = -1;

                 if (fileO && outBuf)
                     {
                     nwritten = 0;

                     //One iteration per stream; extractAStream() reports -1
                     //once the end of the PDF has been reached.
                     while (extractAStream(file,path) != -1)
                         {
                         FILE *unCompressedFp = fopen(uncompressedName,"r+b");
                         if (!unCompressedFp)
                             {
                             break;
                             }

                         //Pull the text out of the inflated stream, chunk by chunk.
                         int retVal = 0;
                         while(retVal != -1)
                             {
                             memset(outBuf,0,OutBufLen);
                             retVal = ProcessOutput(unCompressedFp,outBuf);
                             if(retVal> 0)
                                 {
                                 nwritten += static_cast<int32_t>(
                                     fwrite(outBuf, 1,retVal, fileO));
                                 fflush(fileO);
                                 }
                             }
                         fclose(unCompressedFp);
                         }
                     }

                 if (fileO)
                     {
                     fclose(fileO);
                     }
                 free(outBuf);
                 fclose(file);

                 remove(uncompressedName);
                 remove(compressedName);
                 return nwritten;
                 }



    //Tell whether 'path' names a PDF file, i.e. whether it ends in ".pdf"
    //or ".PDF" (mixed-case spellings such as ".Pdf" are not recognised).
    //A NULL path or one shorter than the suffix is not a PDF file.
    bool isPdfFile(const char * path)
        {
        const char PDF_EXTENSION[] = ".pdf";
        const char PDF_EXTENSION_UPPER[] = ".PDF";
        const size_t extLength = sizeof(PDF_EXTENSION) - 1;

        if (path == 0)
            {
            return false;
            }

        //Guard against paths shorter than the suffix; without it the
        //pointer below would end up in front of the string.
        const size_t length = strlen(path);
        if (length < extLength)
            {
            return false;
            }

        const char * ext = path + (length - extLength);

        return (strcmp(PDF_EXTENSION, ext) == 0)
            || (strcmp(PDF_EXTENSION_UPPER, ext) == 0);
        }
    
   
    //Fill 'doc' with the indexable content of the PDF file at 'path'.
    //
    //The PDF is converted to "<temp name>.txt" by convertPDFToText(). If
    //that file has content, a tokenized, non-stored CONTENTS_FIELD reading
    //from it is added to the document and an excerpt of up to 10 words is
    //set. Otherwise the excerpt is taken from the document's
    //LCPIX_DOCUID_FIELD value and the temp file is removed. App class and
    //MIME type are set in both cases before GenericFileProcessor() runs.
    void processPdfFile(Cpix::Document * doc, const char * path)
        {
        using namespace lucene::util;
        using namespace Cpix;

        const char DEFAULT_ENCODING[] = "UTF-8";
        char tempFile[254];

        //Zero-filled, released automatically on every exit path.
        std::vector<wchar_t> excerpt(MAX_EXCERPT_LENGTH);

        convertPDFToText(path);

        // remove these fields before creating new values for them.
        removeStandardFields(doc);

        getTempFileName(path,tempFile);
        strcat(tempFile,".txt");

        // determine the size of the converted text; a file that cannot be
        // opened or measured counts as empty
        long size = 0;
        FILE *fp = fopen(tempFile,"r");
        if (fp)
            {
            if (fseek(fp, 0, SEEK_END) == 0)
                {
                size = ftell(fp);
                }
            fclose(fp);
            }

        if(size > 0)
            {
            std::auto_ptr<Field> newField(new Field(CONTENTS_FIELD,
                new FileReaderProxy(tempFile, DEFAULT_ENCODING),
                cpix_STORE_NO | cpix_INDEX_TOKENIZED));

            // ownership is given up right after the document has the field
            doc->add(newField.get());
            newField.release();

            // the capacity is the element count of the buffer, not the
            // size of a pointer to it
            getExcerptOfFile(&excerpt[0],
                             tempFile,
                             10, // max words
                             excerpt.size());
            doc->setExcerpt(&excerpt[0]);
            }
        else
            {
            //For empty file setting the path as excerpt
            doc->setExcerpt(doc->get(LCPIX_DOCUID_FIELD));

            remove(tempFile);
            }

        doc->setAppClass(CONTENTAPPCLASS);
        doc->setMimeType(LPDFFILE_MIMETYPE);
        GenericFileProcessor(doc,path);
        }

    }