buildframework/helium/external/python/lib/common/Sphinx-0.5.1-py2.5.egg/sphinx/util/stemmer.py
changeset 179 d8ac696cc51f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/buildframework/helium/external/python/lib/common/Sphinx-0.5.1-py2.5.egg/sphinx/util/stemmer.py	Wed Dec 23 19:29:07 2009 +0200
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+"""
+    sphinx.util.stemmer
+    ~~~~~~~~~~~~~~~~~~~
+
+    Porter Stemming Algorithm
+
+    This is the Porter stemming algorithm, ported to Python from the
+    version coded up in ANSI C by the author. It may be regarded
+    as canonical, in that it follows the algorithm presented in
+
+    Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+    no. 3, pp 130-137,
+
+    only differing from it at the points marked --DEPARTURE-- below.
+
+    See also http://www.tartarus.org/~martin/PorterStemmer
+
+    The algorithm as described in the paper could be exactly replicated
+    by adjusting the points of DEPARTURE, but this is barely necessary,
+    because (a) the points of DEPARTURE are definitely improvements, and
+    (b) no encoding of the Porter stemmer I have seen is anything like
+    as exact as this version, even with the points of DEPARTURE!
+
+    Release 1: January 2001
+
+    :copyright: 2001 by Vivake Gupta <v@nano.com>.
+    :license: Public Domain ("can be used free of charge for any purpose").
+"""
+
+class PorterStemmer(object):
+    """Porter (1980) suffix-stripping stemmer for lower case English words.
+
+    b is a buffer holding a word to be stemmed. The letters are in b[k0],
+    b[k0+1] ... ending at b[k]. k is readjusted as the stemming progresses,
+    and j is a general offset into b, usually the end of the stem in front
+    of the suffix most recently matched by ends().
+
+    Note that only lower case sequences are stemmed. Forcing to lower case
+    should be done before stem(...) is called.
+
+    Typical use is one call per word, e.g.
+
+        PorterStemmer().stem("relational")    # gives "relat"
+    """
+
+    # Suffix tables for step2() to step4(), keyed by the letter each step
+    # dispatches on: the last-but-one letter of the word for steps 2 and 4,
+    # the last letter for step 3. Within a key the order matters, because
+    # only the first suffix that matches is acted upon.
+    _STEP2 = {
+        'a': (("ational", "ate"), ("tional", "tion")),
+        'c': (("enci", "ence"), ("anci", "ance")),
+        'e': (("izer", "ize"),),
+        # --DEPARTURE-- To match the published algorithm, replace
+        # ("bli", "ble") with ("abli", "able").
+        'l': (("bli", "ble"), ("alli", "al"), ("entli", "ent"),
+              ("eli", "e"), ("ousli", "ous")),
+        'o': (("ization", "ize"), ("ation", "ate"), ("ator", "ate")),
+        's': (("alism", "al"), ("iveness", "ive"), ("fulness", "ful"),
+              ("ousness", "ous")),
+        't': (("aliti", "al"), ("iviti", "ive"), ("biliti", "ble")),
+        # --DEPARTURE-- To match the published algorithm, delete this entry.
+        'g': (("logi", "log"),),
+    }
+    _STEP3 = {
+        'e': (("icate", "ic"), ("ative", ""), ("alize", "al")),
+        'i': (("iciti", "ic"),),
+        'l': (("ical", "ic"), ("ful", "")),
+        's': (("ness", ""),),
+    }
+    # step4() only removes suffixes, so its table holds no replacements.
+    _STEP4 = {
+        'a': ("al",),
+        'c': ("ance", "ence"),
+        'e': ("er",),
+        'i': ("ic",),
+        'l': ("able", "ible"),
+        'n': ("ant", "ement", "ment", "ent"),
+        'o': ("ion", "ou"),  # "ion" only counts after s or t, see step4()
+        's': ("ism",),
+        't': ("ate", "iti"),
+        'u': ("ous",),
+        'v': ("ive",),
+        'z': ("ize",),
+    }
+
+    def __init__(self):
+        """Start with an empty buffer.
+
+        stem() loads the word to work on and resets k0 and k each time, so
+        one instance can be reused for any number of words.
+        """
+        self.b = ""  # buffer for word to be stemmed
+        self.k = 0   # offset of the last letter of the word in b
+        self.k0 = 0  # offset of the first letter of the word in b
+        self.j = 0   # j is a general offset into the string
+
+    def cons(self, i):
+        """cons(i) is TRUE <=> b[i] is a consonant.
+
+        a, e, i, o and u are vowels. y is a consonant at the start of the
+        word (i == k0) or after a vowel, and a vowel after a consonant.
+        """
+        # The status of a y is the opposite of the letter before it, so walk
+        # back to the start of a run of y's and flip once per y passed. Done
+        # with a loop rather than recursion so that a very long run of y's
+        # cannot exceed the interpreter's recursion limit.
+        start = i
+        while start > self.k0 and self.b[start] == 'y':
+            start -= 1
+        consonant = self.b[start] not in ('a', 'e', 'i', 'o', 'u')
+        if (i - start) % 2:
+            return not consonant
+        return consonant
+
+    def m(self):
+        """m() measures the number of consonant sequences between k0 and j.
+        if c is a consonant sequence and v a vowel sequence, and <..>
+        indicates arbitrary presence,
+
+           <c><v>       gives 0
+           <c>vc<v>     gives 1
+           <c>vcvc<v>   gives 2
+           <c>vcvcvc<v> gives 3
+           ....
+        """
+        n = 0
+        i = self.k0
+        end = self.j
+        # skip the optional leading consonant sequence
+        while i <= end and self.cons(i):
+            i += 1
+        while True:
+            # a vowel sequence ...
+            while i <= end and not self.cons(i):
+                i += 1
+            if i > end:
+                return n
+            # ... followed by a consonant sequence is one more "vc"
+            n += 1
+            while i <= end and self.cons(i):
+                i += 1
+            if i > end:
+                return n
+
+    def vowelinstem(self):
+        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
+        for i in range(self.k0, self.j + 1):
+            if not self.cons(i):
+                return True
+        return False
+
+    def doublec(self, j):
+        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
+        if j < self.k0 + 1:
+            return False
+        return self.b[j] == self.b[j - 1] and self.cons(j)
+
+    def cvc(self, i):
+        """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel -
+        consonant and also if the second c is not w,x or y. this is used
+        when trying to restore an e at the end of a short word, e.g.
+
+           cav(e), lov(e), hop(e), crim(e), but
+           snow, box, tray.
+        """
+        if i < self.k0 + 2:
+            return False
+        if not self.cons(i) or self.cons(i - 1) or not self.cons(i - 2):
+            return False
+        return self.b[i] not in ('w', 'x', 'y')
+
+    def ends(self, s):
+        """ends(s) is TRUE <=> k0,...k ends with the string s.
+
+        On a match j is set so that the stem in front of s is k0,...j;
+        otherwise j is left as it was.
+        """
+        length = len(s)
+        if s[length - 1] != self.b[self.k]:  # tiny speed-up
+            return False
+        if length > self.k - self.k0 + 1:
+            return False
+        if self.b[self.k - length + 1:self.k + 1] != s:
+            return False
+        self.j = self.k - length
+        return True
+
+    def setto(self, s):
+        """setto(s) sets (j+1),...k to the characters in the string s,
+        readjusting k.
+        """
+        length = len(s)
+        self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
+        self.k = self.j + length
+
+    def r(self, s):
+        """r(s) replaces the suffix matched by ends() with s, but only if
+        the stem in front of it has m() > 0.
+        """
+        if self.m() > 0:
+            self.setto(s)
+
+    def _replace_first(self, table, key):
+        """Try the (suffix, replacement) pairs in table[key] in order and
+        hand the replacement for the first suffix that matches to r().
+        """
+        for suffix, replacement in table.get(key, ()):
+            if self.ends(suffix):
+                self.r(replacement)
+                return
+
+    def step1ab(self):
+        """step1ab() gets rid of plurals and -ed or -ing. e.g.
+
+           caresses  ->  caress
+           ponies    ->  poni
+           ties      ->  ti
+           caress    ->  caress
+           cats      ->  cat
+
+           feed      ->  feed
+           agreed    ->  agree
+           disabled  ->  disable
+
+           matting   ->  mat
+           mating    ->  mate
+           meeting   ->  meet
+           milling   ->  mill
+           messing   ->  mess
+
+           meetings  ->  meet
+        """
+        if self.b[self.k] == 's':
+            if self.ends("sses"):
+                self.k -= 2
+            elif self.ends("ies"):
+                self.setto("i")
+            elif self.b[self.k - 1] != 's':
+                self.k -= 1
+        if self.ends("eed"):
+            if self.m() > 0:
+                self.k -= 1
+        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
+            self.k = self.j
+            # restore the e that the -ed/-ing had displaced
+            for suffix, restored in (("at", "ate"), ("bl", "ble"),
+                                     ("iz", "ize")):
+                if self.ends(suffix):
+                    self.setto(restored)
+                    return
+            if self.doublec(self.k):
+                # undouble the consonant, except for ll, ss and zz
+                self.k -= 1
+                if self.b[self.k] in ('l', 's', 'z'):
+                    self.k += 1
+            elif self.m() == 1 and self.cvc(self.k):
+                self.setto("e")
+
+    def step1c(self):
+        """step1c() turns terminal y to i when there is another vowel in
+        the stem.
+        """
+        if self.ends("y") and self.vowelinstem():
+            # strings are immutable, so rebuild b with the y at k replaced
+            self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]
+
+    def step2(self):
+        """step2() maps double suffices to single ones.
+        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
+        string before the suffix must give m() > 0.
+        """
+        self._replace_first(self._STEP2, self.b[self.k - 1])
+
+    def step3(self):
+        """step3() deals with -ic-, -full, -ness etc. similar strategy to
+        step2.
+        """
+        self._replace_first(self._STEP3, self.b[self.k])
+
+    def step4(self):
+        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>.
+
+        Only the first matching suffix is considered, and it is removed
+        only if the stem in front of it has m() > 1.
+        """
+        for suffix in self._STEP4.get(self.b[self.k - 1], ()):
+            if self.ends(suffix):
+                break
+        else:
+            return
+        # -ion is only taken off when the stem in front of it ends in s or t
+        if suffix == "ion" and not (self.j >= self.k0
+                                    and self.b[self.j] in ('s', 't')):
+            return
+        if self.m() > 1:
+            self.k = self.j
+
+    def step5(self):
+        """step5() removes a final -e if m() > 1, and changes -ll to -l if
+        m() > 1.
+        """
+        self.j = self.k
+        if self.b[self.k] == 'e':
+            a = self.m()
+            if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
+                self.k -= 1
+        # m() is still measured up to the j set above, i.e. including a
+        # final e that has just been dropped
+        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
+            self.k -= 1
+
+    def stem(self, p, i=0, j=None):
+        """Return the stem of the word held in p[i] ... p[j] inclusive.
+
+        p is the string containing the word, i the offset of its first
+        letter and j the offset of its last letter; j defaults to the last
+        character of p, so stem(word) stems a whole string. The word is
+        expected to be lower case already. The result is a new string
+        holding only the stemmed word, without whatever surrounds it in p;
+        words of one or two letters come back unchanged. p itself is never
+        modified.
+        """
+        if j is None:
+            j = len(p) - 1
+        self.b = p
+        self.k = j
+        self.k0 = i
+
+        # --DEPARTURE--
+        # With this test, strings of length 1 or 2 don't go through the
+        # stemming process, although no mention is made of this in the
+        # published algorithm. Remove the test to match the published
+        # algorithm.
+        if self.k > self.k0 + 1:
+            self.step1ab()
+            self.step1c()
+            self.step2()
+            self.step3()
+            self.step4()
+            self.step5()
+        return self.b[self.k0:self.k + 1]