buildframework/helium/external/python/lib/common/Sphinx-0.5.1-py2.5.egg/sphinx/search.py
changeset 179 d8ac696cc51f
equal deleted inserted replaced
1:be27ed110b50 179:d8ac696cc51f
       
     1 # -*- coding: utf-8 -*-
       
     2 """
       
     3     sphinx.search
       
     4     ~~~~~~~~~~~~~
       
     5 
       
     6     Create a search index for offline search.
       
     7 
       
     8     :copyright: 2007-2008 by Armin Ronacher.
       
     9     :license: BSD.
       
    10 """
       
    11 import re
       
    12 import cPickle as pickle
       
    13 from cStringIO import StringIO
       
    14 
       
    15 from docutils.nodes import Text, NodeVisitor
       
    16 
       
    17 from sphinx.util.stemmer import PorterStemmer
       
    18 from sphinx.util import jsdump, rpartition
       
    19 
       
    20 
       
    21 word_re = re.compile(r'\w+(?u)')
       
    22 
       
    23 stopwords = set("""
       
    24 a  and  are  as  at
       
    25 be  but  by
       
    26 for
       
    27 if  in  into  is  it
       
    28 near  no  not
       
    29 of  on  or
       
    30 such
       
    31 that  the  their  then  there  these  they  this  to
       
    32 was  will  with
       
    33 """.split())
       
    34 
       
    35 
       
    36 class _JavaScriptIndex(object):
       
    37     """
       
    38     The search index as javascript file that calls a function
       
    39     on the documentation search object to register the index.
       
    40     """
       
    41 
       
    42     PREFIX = 'Search.setIndex('
       
    43     SUFFIX = ')'
       
    44 
       
    45     def dumps(self, data):
       
    46         return self.PREFIX + jsdump.dumps(data) + self.SUFFIX
       
    47 
       
    48     def loads(self, s):
       
    49         data = s[len(self.PREFIX):-len(self.SUFFIX)]
       
    50         if not data or not s.startswith(self.PREFIX) or not \
       
    51            s.endswith(self.SUFFIX):
       
    52             raise ValueError('invalid data')
       
    53         return jsdump.loads(data)
       
    54 
       
    55     def dump(self, data, f):
       
    56         f.write(self.dumps(data))
       
    57 
       
    58     def load(self, f):
       
    59         return self.loads(f.read())
       
    60 
       
    61 
       
    62 js_index = _JavaScriptIndex()
       
    63 
       
    64 
       
    65 class Stemmer(PorterStemmer):
       
    66     """
       
    67     All those porter stemmer implementations look hideous.
       
    68     make at least the stem method nicer.
       
    69     """
       
    70 
       
    71     def stem(self, word):
       
    72         word = word.lower()
       
    73         return PorterStemmer.stem(self, word, 0, len(word) - 1)
       
    74 
       
    75 
       
    76 class WordCollector(NodeVisitor):
       
    77     """
       
    78     A special visitor that collects words for the `IndexBuilder`.
       
    79     """
       
    80 
       
    81     def __init__(self, document):
       
    82         NodeVisitor.__init__(self, document)
       
    83         self.found_words = []
       
    84 
       
    85     def dispatch_visit(self, node):
       
    86         if node.__class__ is Text:
       
    87             self.found_words.extend(word_re.findall(node.astext()))
       
    88 
       
    89 
       
    90 class IndexBuilder(object):
       
    91     """
       
    92     Helper class that creates a searchindex based on the doctrees
       
    93     passed to the `feed` method.
       
    94     """
       
    95     formats = {
       
    96         'jsdump':   jsdump,
       
    97         'pickle':   pickle
       
    98     }
       
    99 
       
   100     def __init__(self, env):
       
   101         self.env = env
       
   102         self._stemmer = Stemmer()
       
   103         # filename -> title
       
   104         self._titles = {}
       
   105         # stemmed word -> set(filenames)
       
   106         self._mapping = {}
       
   107         # desctypes -> index
       
   108         self._desctypes = {}
       
   109 
       
   110     def load(self, stream, format):
       
   111         """Reconstruct from frozen data."""
       
   112         if isinstance(format, basestring):
       
   113             format = self.formats[format]
       
   114         frozen = format.load(stream)
       
   115         # if an old index is present, we treat it as not existing.
       
   116         if not isinstance(frozen, dict):
       
   117             raise ValueError('old format')
       
   118         index2fn = frozen['filenames']
       
   119         self._titles = dict(zip(index2fn, frozen['titles']))
       
   120         self._mapping = {}
       
   121         for k, v in frozen['terms'].iteritems():
       
   122             if isinstance(v, int):
       
   123                 self._mapping[k] = set([index2fn[v]])
       
   124             else:
       
   125                 self._mapping[k] = set(index2fn[i] for i in v)
       
   126         # no need to load keywords/desctypes
       
   127 
       
   128     def dump(self, stream, format):
       
   129         """Dump the frozen index to a stream."""
       
   130         if isinstance(format, basestring):
       
   131             format = self.formats[format]
       
   132         format.dump(self.freeze(), stream)
       
   133 
       
   134     def get_modules(self, fn2index):
       
   135         rv = {}
       
   136         for name, (doc, _, _, _) in self.env.modules.iteritems():
       
   137             rv[name] = fn2index[doc]
       
   138         return rv
       
   139 
       
   140     def get_descrefs(self, fn2index):
       
   141         rv = {}
       
   142         dt = self._desctypes
       
   143         for fullname, (doc, desctype) in self.env.descrefs.iteritems():
       
   144             prefix, name = rpartition(fullname, '.')
       
   145             pdict = rv.setdefault(prefix, {})
       
   146             try:
       
   147                 i = dt[desctype]
       
   148             except KeyError:
       
   149                 i = len(dt)
       
   150                 dt[desctype] = i
       
   151             pdict[name] = (fn2index[doc], i)
       
   152         return rv
       
   153 
       
   154     def get_terms(self, fn2index):
       
   155         rv = {}
       
   156         for k, v in self._mapping.iteritems():
       
   157             if len(v) == 1:
       
   158                 fn, = v
       
   159                 rv[k] = fn2index[fn]
       
   160             else:
       
   161                 rv[k] = [fn2index[fn] for fn in v]
       
   162         return rv
       
   163 
       
   164     def freeze(self):
       
   165         """Create a usable data structure for serializing."""
       
   166         filenames = self._titles.keys()
       
   167         titles = self._titles.values()
       
   168         fn2index = dict((f, i) for (i, f) in enumerate(filenames))
       
   169         return dict(
       
   170             filenames=filenames,
       
   171             titles=titles,
       
   172             terms=self.get_terms(fn2index),
       
   173             descrefs=self.get_descrefs(fn2index),
       
   174             modules=self.get_modules(fn2index),
       
   175             desctypes=dict((v, k) for (k, v) in self._desctypes.items()),
       
   176         )
       
   177 
       
   178     def prune(self, filenames):
       
   179         """Remove data for all filenames not in the list."""
       
   180         new_titles = {}
       
   181         for filename in filenames:
       
   182             if filename in self._titles:
       
   183                 new_titles[filename] = self._titles[filename]
       
   184         self._titles = new_titles
       
   185         for wordnames in self._mapping.itervalues():
       
   186             wordnames.intersection_update(filenames)
       
   187 
       
   188     def feed(self, filename, title, doctree):
       
   189         """Feed a doctree to the index."""
       
   190         self._titles[filename] = title
       
   191 
       
   192         visitor = WordCollector(doctree)
       
   193         doctree.walk(visitor)
       
   194 
       
   195         def add_term(word, prefix='', stem=self._stemmer.stem):
       
   196             word = stem(word)
       
   197             if len(word) < 3 or word in stopwords or word.isdigit():
       
   198                 return
       
   199             self._mapping.setdefault(prefix + word, set()).add(filename)
       
   200 
       
   201         for word in word_re.findall(title):
       
   202             add_term(word)
       
   203 
       
   204         for word in visitor.found_words:
       
   205             add_term(word)