|
1 # -*- coding: utf-8 -*- |
|
2 """ |
|
3 sphinx.search |
|
4 ~~~~~~~~~~~~~ |
|
5 |
|
6 Create a search index for offline search. |
|
7 |
|
8 :copyright: 2007-2008 by Armin Ronacher. |
|
9 :license: BSD. |
|
10 """ |
|
11 import re |
|
12 import cPickle as pickle |
|
13 from cStringIO import StringIO |
|
14 |
|
15 from docutils.nodes import Text, NodeVisitor |
|
16 |
|
17 from sphinx.util.stemmer import PorterStemmer |
|
18 from sphinx.util import jsdump, rpartition |
|
19 |
|
20 |
|
# Tokenizer used to split titles and text into words; the (?u) flag makes
# \w Unicode-aware so non-ASCII letters count as word characters.
word_re = re.compile(r'\w+(?u)')

# Common English words excluded from the index: they occur in nearly every
# document and carry no discriminating power for search.
stopwords = set("""
a and are as at
be but by
for
if in into is it
near no not
of on or
such
that the their then there these they this to
was will with
""".split())
|
34 |
|
35 |
|
36 class _JavaScriptIndex(object): |
|
37 """ |
|
38 The search index as javascript file that calls a function |
|
39 on the documentation search object to register the index. |
|
40 """ |
|
41 |
|
42 PREFIX = 'Search.setIndex(' |
|
43 SUFFIX = ')' |
|
44 |
|
45 def dumps(self, data): |
|
46 return self.PREFIX + jsdump.dumps(data) + self.SUFFIX |
|
47 |
|
48 def loads(self, s): |
|
49 data = s[len(self.PREFIX):-len(self.SUFFIX)] |
|
50 if not data or not s.startswith(self.PREFIX) or not \ |
|
51 s.endswith(self.SUFFIX): |
|
52 raise ValueError('invalid data') |
|
53 return jsdump.loads(data) |
|
54 |
|
55 def dump(self, data, f): |
|
56 f.write(self.dumps(data)) |
|
57 |
|
58 def load(self, f): |
|
59 return self.loads(f.read()) |
|
60 |
|
61 |
|
# Shared singleton instance; presumably used module-like (dump/load) by the
# builder when writing the JavaScript search index -- callers not visible here.
js_index = _JavaScriptIndex()
|
63 |
|
64 |
|
class Stemmer(PorterStemmer):
    """
    Convenience wrapper around the Porter stemmer: hides the awkward
    index-based interface behind a simple one-argument `stem` method.
    """

    def stem(self, word):
        # The base implementation expects a lowercase word together with
        # the first and last character indices of the region to stem.
        lowered = word.lower()
        return PorterStemmer.stem(self, lowered, 0, len(lowered) - 1)
|
74 |
|
75 |
|
class WordCollector(NodeVisitor):
    """
    Doctree visitor that gathers all words from a document's Text nodes
    for the `IndexBuilder`.
    """

    def __init__(self, document):
        NodeVisitor.__init__(self, document)
        # Words in document order; duplicates are kept.
        self.found_words = []

    def dispatch_visit(self, node):
        # Exact class check (not isinstance), so Text subclasses are
        # deliberately skipped; other node types contribute no words.
        if node.__class__ is Text:
            self.found_words += word_re.findall(node.astext())
|
88 |
|
89 |
|
class IndexBuilder(object):
    """
    Helper class that creates a searchindex based on the doctrees
    passed to the `feed` method.
    """
    # Serialization backends accepted (by name) in dump()/load(); both
    # modules expose pickle-style dump/load functions.
    formats = {
        'jsdump': jsdump,
        'pickle': pickle
    }

    def __init__(self, env):
        # build environment; expected to provide the `modules` and
        # `descrefs` mappings read by get_modules()/get_descrefs()
        self.env = env
        self._stemmer = Stemmer()
        # filename -> title
        self._titles = {}
        # stemmed word -> set(filenames)
        self._mapping = {}
        # desctypes -> index
        self._desctypes = {}

    def load(self, stream, format):
        """Reconstruct from frozen data."""
        # `format` may be a key of `formats` or a module-like object
        # with a load() function
        if isinstance(format, basestring):
            format = self.formats[format]
        frozen = format.load(stream)
        # if an old index is present, we treat it as not existing.
        if not isinstance(frozen, dict):
            raise ValueError('old format')
        index2fn = frozen['filenames']
        self._titles = dict(zip(index2fn, frozen['titles']))
        self._mapping = {}
        for k, v in frozen['terms'].iteritems():
            if isinstance(v, int):
                # freeze()/get_terms() store single-file terms as a bare
                # file index; expand back to a one-element filename set
                self._mapping[k] = set([index2fn[v]])
            else:
                self._mapping[k] = set(index2fn[i] for i in v)
        # no need to load keywords/desctypes

    def dump(self, stream, format):
        """Dump the frozen index to a stream."""
        if isinstance(format, basestring):
            format = self.formats[format]
        format.dump(self.freeze(), stream)

    def get_modules(self, fn2index):
        """Return a dict mapping module name -> index of the file that
        documents it, with indices taken from *fn2index*."""
        rv = {}
        for name, (doc, _, _, _) in self.env.modules.iteritems():
            rv[name] = fn2index[doc]
        return rv

    def get_descrefs(self, fn2index):
        """Return a nested dict: prefix -> name -> (file index, desctype
        index).  Desctype indices are assigned on first sight and
        accumulated in self._desctypes for freeze() to invert."""
        rv = {}
        dt = self._desctypes
        for fullname, (doc, desctype) in self.env.descrefs.iteritems():
            # split "pkg.mod.attr" into prefix "pkg.mod" and name "attr"
            prefix, name = rpartition(fullname, '.')
            pdict = rv.setdefault(prefix, {})
            try:
                i = dt[desctype]
            except KeyError:
                # first occurrence of this desctype: assign next index
                i = len(dt)
                dt[desctype] = i
            pdict[name] = (fn2index[doc], i)
        return rv

    def get_terms(self, fn2index):
        """Return a dict mapping stemmed term -> file index, or -> list
        of file indices; a term found in exactly one file is stored as a
        bare int (load() undoes this compaction)."""
        rv = {}
        for k, v in self._mapping.iteritems():
            if len(v) == 1:
                fn, = v
                rv[k] = fn2index[fn]
            else:
                rv[k] = [fn2index[fn] for fn in v]
        return rv

    def freeze(self):
        """Create a usable data structure for serializing."""
        # keys() and values() are taken back-to-back with no mutation in
        # between, so the two lists correspond index-for-index
        filenames = self._titles.keys()
        titles = self._titles.values()
        fn2index = dict((f, i) for (i, f) in enumerate(filenames))
        return dict(
            filenames=filenames,
            titles=titles,
            terms=self.get_terms(fn2index),
            descrefs=self.get_descrefs(fn2index),
            modules=self.get_modules(fn2index),
            # inverted here: index -> desctype name
            desctypes=dict((v, k) for (k, v) in self._desctypes.items()),
        )

    def prune(self, filenames):
        """Remove data for all filenames not in the list."""
        new_titles = {}
        for filename in filenames:
            if filename in self._titles:
                new_titles[filename] = self._titles[filename]
        self._titles = new_titles
        # drop pruned filenames from every term's filename set
        for wordnames in self._mapping.itervalues():
            wordnames.intersection_update(filenames)

    def feed(self, filename, title, doctree):
        """Feed a doctree to the index."""
        self._titles[filename] = title

        visitor = WordCollector(doctree)
        doctree.walk(visitor)

        def add_term(word, prefix='', stem=self._stemmer.stem):
            # very short words and pure numbers are not worth indexing;
            # NOTE(review): the stopword test runs on the *stemmed* form,
            # so stemmed-away stopwords may slip through -- confirm intent
            word = stem(word)
            if len(word) < 3 or word in stopwords or word.isdigit():
                return
            self._mapping.setdefault(prefix + word, set()).add(filename)

        # title words are indexed as well as body words
        for word in word_re.findall(title):
            add_term(word)

        for word in visitor.found_words:
            add_term(word)