searchengine/cpix/cpix/src/fileparser/pdffileparser.cpp
changeset 1 6f2c1c46032b
parent 0 671dee74050a
child 2 6c1a2771f4b7
equal deleted inserted replaced
0:671dee74050a 1:6f2c1c46032b
    43 #include "indevicecfg.h"
    43 #include "indevicecfg.h"
    44 
    44 
    45 #include "cpixidxdb.h"
    45 #include "cpixidxdb.h"
    46 #include "cluceneext.h"
    46 #include "cluceneext.h"
    47 #include "cpixstrtools.h"
    47 #include "cpixstrtools.h"
       
    48 
       
    49 namespace
       
    50 {
       
    51     const char EXTENSION[]       = ".txt";
       
    52     const char EXTENSION_UPPER[] = ".TXT";
       
    53 
       
    54     const char DEFAULT_ENCODING[] = "UTF-8";
       
    55     
       
    56     /**
       
    57      * Returns 1 on success, 0 on eof. 
       
    58      */
       
    59     int clgetline(lucene::util::Reader& reader, std::wstring& line) 
       
    60         {
       
    61         line = L""; 
       
    62         
       
    63         // read line 
       
    64         while (true) 
       
    65             {
       
    66                 int c = reader.read(); 
       
    67                 switch (c) {
       
    68                     case -1: // EOF
       
    69                         return line.length() > 0; 
       
    70                     case '\n': // line break
       
    71                     case '\r': // line break
       
    72                         return 1;
       
    73                     default:
       
    74                         line += static_cast<wchar_t>(c);
       
    75                         if  (line.length() > 500)
       
    76                             return 1;
       
    77                 }
       
    78             }
       
    79         }
       
    80 
       
    81     void getExcerptOfFile(wchar_t       * dst,
       
    82                           const char    * path,
       
    83                           size_t          maxWords,
       
    84                           size_t          bufSize)
       
    85     {
       
    86         using namespace std;
       
    87         using namespace lucene::util;
       
    88                 
       
    89         // Lucene reader can do UTF-8 magic, so let's use it
       
    90         FileReader file( path, DEFAULT_ENCODING ); 
       
    91         
       
    92         if ( file.reader->getStatus() == jstreams::Ok ) 
       
    93             {
       
    94                 cpix_EPIState
       
    95                     epiState;
       
    96                 cpix_init_EPIState(&epiState);
       
    97         
       
    98                 wstring
       
    99                     line;
       
   100         
       
   101                 while (bufSize > 0 && maxWords > 0 && clgetline(file, line))
       
   102                     {
       
   103                         dst = cpix_getExcerptOfWText(dst,
       
   104                                                      line.c_str(),
       
   105                                                      &maxWords,
       
   106                                                      &bufSize,
       
   107                                                      &epiState);
       
   108                     }
       
   109             }
       
   110     }
       
   111 
       
   112 }
    48 
   113 
    49 using namespace std;
   114 using namespace std;
    50 using namespace Cpt;
   115 using namespace Cpt;
    51 
   116 
    52 namespace Cpix
   117 namespace Cpix
   631         using namespace Cpix;
   696         using namespace Cpix;
   632 
   697 
   633         const char DEFAULT_ENCODING[] = "UTF-8";
   698         const char DEFAULT_ENCODING[] = "UTF-8";
   634         char tempFile[254];
   699         char tempFile[254];
   635         FILE *fp;
   700         FILE *fp;
       
   701 		wchar_t* excerpt = new wchar_t[MAX_EXCERPT_LENGTH];
   636         
   702         
   637         convertPDFToText(path);
   703         convertPDFToText(path);
   638 
   704 
   639         // remove these fields before creating new values for them.
   705         // remove these fields before creating new values for them.
   640         removeStandardFields(doc);
   706         removeStandardFields(doc);
   656                 new FileReaderProxy(tempFile, DEFAULT_ENCODING), cpix_STORE_NO
   722                 new FileReaderProxy(tempFile, DEFAULT_ENCODING), cpix_STORE_NO
   657                         | cpix_INDEX_TOKENIZED));
   723                         | cpix_INDEX_TOKENIZED));
   658         
   724         
   659                 doc->add(newField.get());
   725                 doc->add(newField.get());
   660                 newField.release();
   726                 newField.release();
       
   727 				        getExcerptOfFile(excerpt,
       
   728                         tempFile,
       
   729                         10, // max words
       
   730                         sizeof(excerpt) / sizeof(wchar_t));
       
   731 						doc->setExcerpt(excerpt);
       
   732 						
   661             }
   733             }
   662         else
   734         else
   663            {
   735            {
       
   736                //For empty file setting the path as excerpt 
       
   737                doc->setExcerpt(doc->get(LCPIX_DOCUID_FIELD));
       
   738                
   664                getTempFileName(path,tempFile);
   739                getTempFileName(path,tempFile);
   665                strcat(tempFile,".txt");
   740                strcat(tempFile,".txt");
   666                remove(tempFile);
   741                remove(tempFile);
   667            
   742            
   668            }
   743            }
   669 
   744 
   670         doc->setAppClass(PDFAPPCLASS);
   745         doc->setAppClass(CONTENTAPPCLASS);
   671         doc->setMimeType(LPDFFILE_MIMETYPE);
   746         doc->setMimeType(LPDFFILE_MIMETYPE);
       
   747         delete excerpt;
   672         GenericFileProcessor(doc,path);
   748         GenericFileProcessor(doc,path);
   673         }
   749         }
   674 
   750 
   675     }
   751     }
       
   752 
       
   753