43 #include "indevicecfg.h" |
43 #include "indevicecfg.h" |
44 |
44 |
45 #include "cpixidxdb.h" |
45 #include "cpixidxdb.h" |
46 #include "cluceneext.h" |
46 #include "cluceneext.h" |
47 #include "cpixstrtools.h" |
47 #include "cpixstrtools.h" |
|
48 |
|
49 namespace |
|
50 { |
|
// Lower-case ".txt" suffix. NOTE(review): not referenced in the visible part of this file - presumably used elsewhere; confirm.
51 const char EXTENSION[] = ".txt"; |
|
// Upper-case ".TXT" suffix. NOTE(review): not referenced in the visible part of this file - presumably used elsewhere; confirm.
52 const char EXTENSION_UPPER[] = ".TXT"; |
|
53 |
|
// Encoding name passed to lucene::util::FileReader when getExcerptOfFile opens a file.
54 const char DEFAULT_ENCODING[] = "UTF-8"; |
|
55 |
|
56 /** |
|
57 * Returns 1 on success, 0 on eof. |
|
58 */ |
|
59 int clgetline(lucene::util::Reader& reader, std::wstring& line) |
|
60 { |
|
61 line = L""; |
|
62 |
|
63 // read line |
|
64 while (true) |
|
65 { |
|
66 int c = reader.read(); |
|
67 switch (c) { |
|
68 case -1: // EOF |
|
69 return line.length() > 0; |
|
70 case '\n': // line break |
|
71 case '\r': // line break |
|
72 return 1; |
|
73 default: |
|
74 line += static_cast<wchar_t>(c); |
|
75 if (line.length() > 500) |
|
76 return 1; |
|
77 } |
|
78 } |
|
79 } |
|
80 |
|
81 void getExcerptOfFile(wchar_t * dst, |
|
82 const char * path, |
|
83 size_t maxWords, |
|
84 size_t bufSize) |
|
85 { |
|
86 using namespace std; |
|
87 using namespace lucene::util; |
|
88 |
|
89 // Lucene reader can do UTF-8 magic, so let's use it |
|
90 FileReader file( path, DEFAULT_ENCODING ); |
|
91 |
|
92 if ( file.reader->getStatus() == jstreams::Ok ) |
|
93 { |
|
94 cpix_EPIState |
|
95 epiState; |
|
96 cpix_init_EPIState(&epiState); |
|
97 |
|
98 wstring |
|
99 line; |
|
100 |
|
101 while (bufSize > 0 && maxWords > 0 && clgetline(file, line)) |
|
102 { |
|
103 dst = cpix_getExcerptOfWText(dst, |
|
104 line.c_str(), |
|
105 &maxWords, |
|
106 &bufSize, |
|
107 &epiState); |
|
108 } |
|
109 } |
|
110 } |
|
111 |
|
112 } |
48 |
113 |
49 using namespace std; |
114 using namespace std; |
50 using namespace Cpt; |
115 using namespace Cpt; |
51 |
116 |
52 namespace Cpix |
117 namespace Cpix |
631 using namespace Cpix; |
696 using namespace Cpix; |
632 |
697 |
633 const char DEFAULT_ENCODING[] = "UTF-8"; |
698 const char DEFAULT_ENCODING[] = "UTF-8"; |
634 char tempFile[254]; |
699 char tempFile[254]; |
635 FILE *fp; |
700 FILE *fp; |
|
701 wchar_t* excerpt = new wchar_t[MAX_EXCERPT_LENGTH]; |
636 |
702 |
637 convertPDFToText(path); |
703 convertPDFToText(path); |
638 |
704 |
639 // remove these fields before creating new values for them. |
705 // remove these fields before creating new values for them. |
640 removeStandardFields(doc); |
706 removeStandardFields(doc); |
656 new FileReaderProxy(tempFile, DEFAULT_ENCODING), cpix_STORE_NO |
722 new FileReaderProxy(tempFile, DEFAULT_ENCODING), cpix_STORE_NO |
657 | cpix_INDEX_TOKENIZED)); |
723 | cpix_INDEX_TOKENIZED)); |
658 |
724 |
659 doc->add(newField.get()); |
725 doc->add(newField.get()); |
660 newField.release(); |
726 newField.release(); |
|
727 getExcerptOfFile(excerpt, |
|
728 tempFile, |
|
729 10, // max words |
|
730 sizeof(excerpt) / sizeof(wchar_t)); |
|
731 doc->setExcerpt(excerpt); |
|
732 |
661 } |
733 } |
662 else |
734 else |
663 { |
735 { |
|
736 //For empty file setting the path as excerpt |
|
737 doc->setExcerpt(doc->get(LCPIX_DOCUID_FIELD)); |
|
738 |
664 getTempFileName(path,tempFile); |
739 getTempFileName(path,tempFile); |
665 strcat(tempFile,".txt"); |
740 strcat(tempFile,".txt"); |
666 remove(tempFile); |
741 remove(tempFile); |
667 |
742 |
668 } |
743 } |
669 |
744 |
670 doc->setAppClass(PDFAPPCLASS); |
745 doc->setAppClass(CONTENTAPPCLASS); |
671 doc->setMimeType(LPDFFILE_MIMETYPE); |
746 doc->setMimeType(LPDFFILE_MIMETYPE); |
|
747 delete excerpt; |
672 GenericFileProcessor(doc,path); |
748 GenericFileProcessor(doc,path); |
673 } |
749 } |
674 |
750 |
675 } |
751 } |
|
752 |
|
753 |