author | Alex Gilkes <alex.gilkes@nokia.com> |
Wed, 28 Oct 2009 14:39:48 +0000 | |
changeset 1 | be27ed110b50 |
permissions | -rw-r--r-- |
1
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
1 |
# -*- coding: utf-8 -*- |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
2 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
3 |
sphinx.search |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
4 |
~~~~~~~~~~~~~ |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
5 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
6 |
Create a search index for offline search. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
7 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
8 |
:copyright: 2007-2008 by Armin Ronacher. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
9 |
:license: BSD. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
10 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
11 |
import re |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
12 |
import cPickle as pickle |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
13 |
from cStringIO import StringIO |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
14 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
15 |
from docutils.nodes import Text, NodeVisitor |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
16 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
17 |
from sphinx.util.stemmer import PorterStemmer |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
18 |
from sphinx.util import jsdump, rpartition |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
19 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
20 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
21 |
word_re = re.compile(r'\w+(?u)') |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
22 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
23 |
stopwords = set(""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
24 |
a and are as at |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
25 |
be but by |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
26 |
for |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
27 |
if in into is it |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
28 |
near no not |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
29 |
of on or |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
30 |
such |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
31 |
that the their then there these they this to |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
32 |
was will with |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
33 |
""".split()) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
34 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
35 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
36 |
class _JavaScriptIndex(object): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
37 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
38 |
The search index as javascript file that calls a function |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
39 |
on the documentation search object to register the index. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
40 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
41 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
42 |
PREFIX = 'Search.setIndex(' |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
43 |
SUFFIX = ')' |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
44 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
45 |
def dumps(self, data): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
46 |
return self.PREFIX + jsdump.dumps(data) + self.SUFFIX |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
47 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
48 |
def loads(self, s): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
49 |
data = s[len(self.PREFIX):-len(self.SUFFIX)] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
50 |
if not data or not s.startswith(self.PREFIX) or not \ |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
51 |
s.endswith(self.SUFFIX): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
52 |
raise ValueError('invalid data') |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
53 |
return jsdump.loads(data) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
54 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
55 |
def dump(self, data, f): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
56 |
f.write(self.dumps(data)) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
57 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
58 |
def load(self, f): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
59 |
return self.loads(f.read()) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
60 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
61 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
62 |
js_index = _JavaScriptIndex() |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
63 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
64 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
65 |
class Stemmer(PorterStemmer): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
66 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
67 |
All those porter stemmer implementations look hideous. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
68 |
make at least the stem method nicer. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
69 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
70 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
71 |
def stem(self, word): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
72 |
word = word.lower() |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
73 |
return PorterStemmer.stem(self, word, 0, len(word) - 1) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
74 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
75 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
76 |
class WordCollector(NodeVisitor): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
77 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
78 |
A special visitor that collects words for the `IndexBuilder`. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
79 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
80 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
81 |
def __init__(self, document): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
82 |
NodeVisitor.__init__(self, document) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
83 |
self.found_words = [] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
84 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
85 |
def dispatch_visit(self, node): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
86 |
if node.__class__ is Text: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
87 |
self.found_words.extend(word_re.findall(node.astext())) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
88 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
89 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
90 |
class IndexBuilder(object): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
91 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
92 |
Helper class that creates a searchindex based on the doctrees |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
93 |
passed to the `feed` method. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
94 |
""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
95 |
formats = { |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
96 |
'jsdump': jsdump, |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
97 |
'pickle': pickle |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
98 |
} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
99 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
100 |
def __init__(self, env): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
101 |
self.env = env |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
102 |
self._stemmer = Stemmer() |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
103 |
# filename -> title |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
104 |
self._titles = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
105 |
# stemmed word -> set(filenames) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
106 |
self._mapping = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
107 |
# desctypes -> index |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
108 |
self._desctypes = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
109 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
110 |
def load(self, stream, format): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
111 |
"""Reconstruct from frozen data.""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
112 |
if isinstance(format, basestring): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
113 |
format = self.formats[format] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
114 |
frozen = format.load(stream) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
115 |
# if an old index is present, we treat it as not existing. |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
116 |
if not isinstance(frozen, dict): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
117 |
raise ValueError('old format') |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
118 |
index2fn = frozen['filenames'] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
119 |
self._titles = dict(zip(index2fn, frozen['titles'])) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
120 |
self._mapping = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
121 |
for k, v in frozen['terms'].iteritems(): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
122 |
if isinstance(v, int): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
123 |
self._mapping[k] = set([index2fn[v]]) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
124 |
else: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
125 |
self._mapping[k] = set(index2fn[i] for i in v) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
126 |
# no need to load keywords/desctypes |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
127 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
128 |
def dump(self, stream, format): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
129 |
"""Dump the frozen index to a stream.""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
130 |
if isinstance(format, basestring): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
131 |
format = self.formats[format] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
132 |
format.dump(self.freeze(), stream) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
133 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
134 |
def get_modules(self, fn2index): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
135 |
rv = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
136 |
for name, (doc, _, _, _) in self.env.modules.iteritems(): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
137 |
rv[name] = fn2index[doc] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
138 |
return rv |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
139 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
140 |
def get_descrefs(self, fn2index): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
141 |
rv = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
142 |
dt = self._desctypes |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
143 |
for fullname, (doc, desctype) in self.env.descrefs.iteritems(): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
144 |
prefix, name = rpartition(fullname, '.') |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
145 |
pdict = rv.setdefault(prefix, {}) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
146 |
try: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
147 |
i = dt[desctype] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
148 |
except KeyError: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
149 |
i = len(dt) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
150 |
dt[desctype] = i |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
151 |
pdict[name] = (fn2index[doc], i) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
152 |
return rv |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
153 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
154 |
def get_terms(self, fn2index): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
155 |
rv = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
156 |
for k, v in self._mapping.iteritems(): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
157 |
if len(v) == 1: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
158 |
fn, = v |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
159 |
rv[k] = fn2index[fn] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
160 |
else: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
161 |
rv[k] = [fn2index[fn] for fn in v] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
162 |
return rv |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
163 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
164 |
def freeze(self): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
165 |
"""Create a usable data structure for serializing.""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
166 |
filenames = self._titles.keys() |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
167 |
titles = self._titles.values() |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
168 |
fn2index = dict((f, i) for (i, f) in enumerate(filenames)) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
169 |
return dict( |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
170 |
filenames=filenames, |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
171 |
titles=titles, |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
172 |
terms=self.get_terms(fn2index), |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
173 |
descrefs=self.get_descrefs(fn2index), |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
174 |
modules=self.get_modules(fn2index), |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
175 |
desctypes=dict((v, k) for (k, v) in self._desctypes.items()), |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
176 |
) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
177 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
178 |
def prune(self, filenames): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
179 |
"""Remove data for all filenames not in the list.""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
180 |
new_titles = {} |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
181 |
for filename in filenames: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
182 |
if filename in self._titles: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
183 |
new_titles[filename] = self._titles[filename] |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
184 |
self._titles = new_titles |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
185 |
for wordnames in self._mapping.itervalues(): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
186 |
wordnames.intersection_update(filenames) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
187 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
188 |
def feed(self, filename, title, doctree): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
189 |
"""Feed a doctree to the index.""" |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
190 |
self._titles[filename] = title |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
191 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
192 |
visitor = WordCollector(doctree) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
193 |
doctree.walk(visitor) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
194 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
195 |
def add_term(word, prefix='', stem=self._stemmer.stem): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
196 |
word = stem(word) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
197 |
if len(word) < 3 or word in stopwords or word.isdigit(): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
198 |
return |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
199 |
self._mapping.setdefault(prefix + word, set()).add(filename) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
200 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
201 |
for word in word_re.findall(title): |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
202 |
add_term(word) |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
203 |
|
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
204 |
for word in visitor.found_words: |
be27ed110b50
Bringing in Helium, imaker and cmaker
Alex Gilkes <alex.gilkes@nokia.com>
parents:
diff
changeset
|
205 |
add_term(word) |