searchengine/cpix/cpix/src/fileparser/pdffileparser.cpp
changeset 2 6c1a2771f4b7
parent 1 6f2c1c46032b
--- a/searchengine/cpix/cpix/src/fileparser/pdffileparser.cpp	Mon May 03 13:33:22 2010 +0300
+++ b/searchengine/cpix/cpix/src/fileparser/pdffileparser.cpp	Fri May 14 16:57:37 2010 +0300
@@ -48,69 +48,43 @@
 
 namespace
 {
-    const char EXTENSION[]       = ".txt";
-    const char EXTENSION_UPPER[] = ".TXT";
-
-    const char DEFAULT_ENCODING[] = "UTF-8";
-    
     /**
      * Returns 1 on success, 0 on eof. 
      */
-    int clgetline(lucene::util::Reader& reader, std::wstring& line) 
+    int getPDFExcerpt(const char* filePath,std::wstring& line) // fills line from the start of the file at filePath; stops at a line break, EOF, the 10th space, or the length cap
         {
-        line = L""; 
+        line = L""; // discard whatever the caller left in line
+        int wordCount = 0; // counts ' ' characters seen so far, used as a word count
         
         // read line 
+        FILE *fp = fopen(filePath,"rb"); // NOTE(review): the result is not checked; if the open fails, a null FILE* reaches fgetc below
         while (true) 
             {
-                int c = reader.read(); 
+                int c = fgetc(fp); // one byte per iteration; NOTE(review): the default branch widens it to wchar_t as-is, no multi-byte decoding is done here - confirm the input is single-byte text
                 switch (c) {
                     case -1: // EOF
+                        fclose(fp); // every return path closes fp itself
                         return line.length() > 0; 
                     case '\n': // line break
                     case '\r': // line break
+                        fclose(fp); // only the text before the first line break is kept
                         return 1;
                     default:
                         line += static_cast<wchar_t>(c);
-                        if  (line.length() > 500)
-                            return 1;
+                        if ( c == ' ') // the space has already been appended to line above
+                            wordCount ++;
+                        
+                        if  ((line.length() > MAX_EXCERPT_LENGTH) ||  wordCount == 10 ) // stop once the excerpt is longer than MAX_EXCERPT_LENGTH or 10 spaces were read
+                            {
+                                fclose(fp);
+                                return 1;
+                            }
                 }
             }
         }
 
-    void getExcerptOfFile(wchar_t       * dst,
-                          const char    * path,
-                          size_t          maxWords,
-                          size_t          bufSize)
-    {
-        using namespace std;
-        using namespace lucene::util;
-                
-        // Lucene reader can do UTF-8 magic, so let's use it
-        FileReader file( path, DEFAULT_ENCODING ); 
-        
-        if ( file.reader->getStatus() == jstreams::Ok ) 
-            {
-                cpix_EPIState
-                    epiState;
-                cpix_init_EPIState(&epiState);
-        
-                wstring
-                    line;
-        
-                while (bufSize > 0 && maxWords > 0 && clgetline(file, line))
-                    {
-                        dst = cpix_getExcerptOfWText(dst,
-                                                     line.c_str(),
-                                                     &maxWords,
-                                                     &bufSize,
-                                                     &epiState);
-                    }
-            }
     }
 
-}
-
 using namespace std;
 using namespace Cpt;
 
@@ -139,7 +113,7 @@
                             }
                         if (fnd) return buffer - buffer0;
                         buffer = buffer + 1;
-                        if (buffer - buffer0 + len >= buffersize) return -1;
+                        if (buffer - buffer0 + len > buffersize) return -1;
                         }
                     return -1;
                     }
@@ -441,6 +415,7 @@
         bool hasStreamData = true;
         bool hasStreamStarted = false;
 
+
         char* writePointer;
         int bytesToWrite = 0;
         FILE* pdfReaderI;
@@ -451,7 +426,7 @@
             free (buffer);
             return -1;
             }
-        
+
         getTempFileName(path,tempFile);
         strcat(tempFile,"_compressedbin.data");
 
@@ -468,7 +443,7 @@
                  * Chances are there half of the word "stream" may get read to the buffer.
                  * if it happens, that particular two stream wont get index.
                  * Didnt implement it as of now. Because the logic requires lot of file pointer movement
-				 * and character comparison.
+				 * and character comparison.
                  */
 
               
@@ -489,7 +464,7 @@
 
                     {
                     if ((streamStart - streamEnd) == 3)
-                    streamStart = -1;
+                        streamStart = -1;
                     }
 
                 if ((streamStart> 0) && (hasStreamStarted == false ))
@@ -525,7 +500,9 @@
                         }
                     else
                     bytesToWrite = actualRead-streamStart;
-                    fwrite(writePointer, 1,bytesToWrite, pdfReaderI);
+                    
+                    if(bytesToWrite >  0)
+                        fwrite(writePointer, 1,bytesToWrite, pdfReaderI);
 
                     }
                 else if (hasStreamStarted)
@@ -543,14 +520,16 @@
                         }
                     else
                     bytesToWrite = actualRead;
-
-                    fwrite(buffer, 1,bytesToWrite, pdfReaderI);
+                    
+                    if(bytesToWrite >  0)
+                        fwrite(buffer, 1,bytesToWrite, pdfReaderI);
 
                     }
 
                 }
             }
-        fclose(pdfReaderI);
+        if (pdfReaderI)
+            fclose(pdfReaderI); // coverty 121614
 
         free (buffer);
         return 1;
@@ -584,8 +563,10 @@
         else
             retf = -1;
         
-        fclose(UncompressedFile);
-        fclose(CompressedFile);
+        if (UncompressedFile) // coverty 
+            fclose(UncompressedFile);
+        if  (CompressedFile)
+            fclose(CompressedFile); // coverty
         remove(tempFile);
         return retf;
 
@@ -611,8 +592,8 @@
                 // memset(start,0,space);
                  if (file == 0)
                      {
-                         fclose(file);
-                         file = 0;
+
+                         free(outBuf);
                          return -1;
                      }
                  
@@ -635,10 +616,12 @@
                          if(ret == -1)
                              {
                              fclose(file);
-                             fclose(unCompressedFp);
-                             fclose(fileO);
                              free(outBuf);
                              file = 0;
+                             if (unCompressedFp)
+                                 fclose( unCompressedFp );
+                             if(fileO)
+                                 fclose( fileO );
                              remove(tempFile);
                              getTempFileName(path,tempFile);
                              strcat(tempFile,"_compressedbin.data");
@@ -657,7 +640,8 @@
                                  }
                              }
                          retVal = 0;
-                         fclose(unCompressedFp);
+                         if(unCompressedFp)
+                             fclose(unCompressedFp);
                    }
                  fclose(fileO);
                  remove(tempFile);
@@ -698,8 +682,8 @@
         const char DEFAULT_ENCODING[] = "UTF-8";
         char tempFile[254];
         FILE *fp;
-		wchar_t* excerpt = new wchar_t[MAX_EXCERPT_LENGTH];
-        
+		//wchar_t excerpt [MAX_EXCERPT_LENGTH];
+        wstring excerpt;
         convertPDFToText(path);
 
         // remove these fields before creating new values for them.
@@ -724,12 +708,8 @@
         
                 doc->add(newField.get());
                 newField.release();
-				        getExcerptOfFile(excerpt,
-                        tempFile,
-                        10, // max words
-                        sizeof(excerpt) / sizeof(wchar_t));
-						doc->setExcerpt(excerpt);
-						
+                getPDFExcerpt(tempFile,excerpt);
+                doc->setExcerpt(excerpt.c_str());
             }
         else
            {
@@ -744,7 +724,6 @@
 
         doc->setAppClass(CONTENTAPPCLASS);
         doc->setMimeType(LPDFFILE_MIMETYPE);
-        delete excerpt;
         GenericFileProcessor(doc,path);
         }