--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/buildframework/helium/external/python/lib/common/Sphinx-0.5.1-py2.5.egg/sphinx/util/stemmer.py Wed Dec 23 19:29:07 2009 +0200
@@ -0,0 +1,343 @@
+# -*- coding: utf-8 -*-
+"""
+ sphinx.util.stemmer
+ ~~~~~~~~~~~~~~~~~~~
+
+ Porter Stemming Algorithm
+
+ This is the Porter stemming algorithm, ported to Python from the
+ version coded up in ANSI C by the author. It may be be regarded
+ as canonical, in that it follows the algorithm presented in
+
+ Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
+ no. 3, pp 130-137,
+
+ only differing from it at the points maked --DEPARTURE-- below.
+
+ See also http://www.tartarus.org/~martin/PorterStemmer
+
+ The algorithm as described in the paper could be exactly replicated
+ by adjusting the points of DEPARTURE, but this is barely necessary,
+ because (a) the points of DEPARTURE are definitely improvements, and
+ (b) no encoding of the Porter stemmer I have seen is anything like
+ as exact as this version, even with the points of DEPARTURE!
+
+ Release 1: January 2001
+
+ :copyright: 2001 by Vivake Gupta <v@nano.com>.
+ :license: Public Domain ("can be used free of charge for any purpose").
+"""
+
+class PorterStemmer(object):
+
+ def __init__(self):
+ """The main part of the stemming algorithm starts here.
+ b is a buffer holding a word to be stemmed. The letters are in b[k0],
+ b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
+ readjusted downwards as the stemming progresses. Zero termination is
+ not in fact used in the algorithm.
+
+ Note that only lower case sequences are stemmed. Forcing to lower case
+ should be done before stem(...) is called.
+ """
+
+ self.b = "" # buffer for word to be stemmed
+ self.k = 0
+ self.k0 = 0
+ self.j = 0 # j is a general offset into the string
+
+ def cons(self, i):
+ """cons(i) is TRUE <=> b[i] is a consonant."""
+ if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
+ or self.b[i] == 'o' or self.b[i] == 'u':
+ return 0
+ if self.b[i] == 'y':
+ if i == self.k0:
+ return 1
+ else:
+ return (not self.cons(i - 1))
+ return 1
+
+ def m(self):
+ """m() measures the number of consonant sequences between k0 and j.
+ if c is a consonant sequence and v a vowel sequence, and <..>
+ indicates arbitrary presence,
+
+ <c><v> gives 0
+ <c>vc<v> gives 1
+ <c>vcvc<v> gives 2
+ <c>vcvcvc<v> gives 3
+ ....
+ """
+ n = 0
+ i = self.k0
+ while 1:
+ if i > self.j:
+ return n
+ if not self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+ while 1:
+ while 1:
+ if i > self.j:
+ return n
+ if self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+ n = n + 1
+ while 1:
+ if i > self.j:
+ return n
+ if not self.cons(i):
+ break
+ i = i + 1
+ i = i + 1
+
+ def vowelinstem(self):
+ """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
+ for i in range(self.k0, self.j + 1):
+ if not self.cons(i):
+ return 1
+ return 0
+
+ def doublec(self, j):
+ """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
+ if j < (self.k0 + 1):
+ return 0
+ if (self.b[j] != self.b[j-1]):
+ return 0
+ return self.cons(j)
+
+ def cvc(self, i):
+ """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
+ and also if the second c is not w,x or y. this is used when trying to
+ restore an e at the end of a short e.g.
+
+ cav(e), lov(e), hop(e), crim(e), but
+ snow, box, tray.
+ """
+ if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
+ return 0
+ ch = self.b[i]
+ if ch == 'w' or ch == 'x' or ch == 'y':
+ return 0
+ return 1
+
+ def ends(self, s):
+ """ends(s) is TRUE <=> k0,...k ends with the string s."""
+ length = len(s)
+ if s[length - 1] != self.b[self.k]: # tiny speed-up
+ return 0
+ if length > (self.k - self.k0 + 1):
+ return 0
+ if self.b[self.k-length+1:self.k+1] != s:
+ return 0
+ self.j = self.k - length
+ return 1
+
+ def setto(self, s):
+ """setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
+ length = len(s)
+ self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
+ self.k = self.j + length
+
+ def r(self, s):
+ """r(s) is used further down."""
+ if self.m() > 0:
+ self.setto(s)
+
+ def step1ab(self):
+ """step1ab() gets rid of plurals and -ed or -ing. e.g.
+
+ caresses -> caress
+ ponies -> poni
+ ties -> ti
+ caress -> caress
+ cats -> cat
+
+ feed -> feed
+ agreed -> agree
+ disabled -> disable
+
+ matting -> mat
+ mating -> mate
+ meeting -> meet
+ milling -> mill
+ messing -> mess
+
+ meetings -> meet
+ """
+ if self.b[self.k] == 's':
+ if self.ends("sses"):
+ self.k = self.k - 2
+ elif self.ends("ies"):
+ self.setto("i")
+ elif self.b[self.k - 1] != 's':
+ self.k = self.k - 1
+ if self.ends("eed"):
+ if self.m() > 0:
+ self.k = self.k - 1
+ elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
+ self.k = self.j
+ if self.ends("at"): self.setto("ate")
+ elif self.ends("bl"): self.setto("ble")
+ elif self.ends("iz"): self.setto("ize")
+ elif self.doublec(self.k):
+ self.k = self.k - 1
+ ch = self.b[self.k]
+ if ch == 'l' or ch == 's' or ch == 'z':
+ self.k = self.k + 1
+ elif (self.m() == 1 and self.cvc(self.k)):
+ self.setto("e")
+
+ def step1c(self):
+ """step1c() turns terminal y to i when there is another vowel in the stem."""
+ if (self.ends("y") and self.vowelinstem()):
+ self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
+
+ def step2(self):
+ """step2() maps double suffices to single ones.
+ so -ization ( = -ize plus -ation) maps to -ize etc. note that the
+ string before the suffix must give m() > 0.
+ """
+ if self.b[self.k - 1] == 'a':
+ if self.ends("ational"): self.r("ate")
+ elif self.ends("tional"): self.r("tion")
+ elif self.b[self.k - 1] == 'c':
+ if self.ends("enci"): self.r("ence")
+ elif self.ends("anci"): self.r("ance")
+ elif self.b[self.k - 1] == 'e':
+ if self.ends("izer"): self.r("ize")
+ elif self.b[self.k - 1] == 'l':
+ if self.ends("bli"): self.r("ble") # --DEPARTURE--
+ # To match the published algorithm, replace this phrase with
+ # if self.ends("abli"): self.r("able")
+ elif self.ends("alli"): self.r("al")
+ elif self.ends("entli"): self.r("ent")
+ elif self.ends("eli"): self.r("e")
+ elif self.ends("ousli"): self.r("ous")
+ elif self.b[self.k - 1] == 'o':
+ if self.ends("ization"): self.r("ize")
+ elif self.ends("ation"): self.r("ate")
+ elif self.ends("ator"): self.r("ate")
+ elif self.b[self.k - 1] == 's':
+ if self.ends("alism"): self.r("al")
+ elif self.ends("iveness"): self.r("ive")
+ elif self.ends("fulness"): self.r("ful")
+ elif self.ends("ousness"): self.r("ous")
+ elif self.b[self.k - 1] == 't':
+ if self.ends("aliti"): self.r("al")
+ elif self.ends("iviti"): self.r("ive")
+ elif self.ends("biliti"): self.r("ble")
+ elif self.b[self.k - 1] == 'g': # --DEPARTURE--
+ if self.ends("logi"): self.r("log")
+ # To match the published algorithm, delete this phrase
+
+ def step3(self):
+ """step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
+ if self.b[self.k] == 'e':
+ if self.ends("icate"): self.r("ic")
+ elif self.ends("ative"): self.r("")
+ elif self.ends("alize"): self.r("al")
+ elif self.b[self.k] == 'i':
+ if self.ends("iciti"): self.r("ic")
+ elif self.b[self.k] == 'l':
+ if self.ends("ical"): self.r("ic")
+ elif self.ends("ful"): self.r("")
+ elif self.b[self.k] == 's':
+ if self.ends("ness"): self.r("")
+
+ def step4(self):
+ """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
+ if self.b[self.k - 1] == 'a':
+ if self.ends("al"): pass
+ else: return
+ elif self.b[self.k - 1] == 'c':
+ if self.ends("ance"): pass
+ elif self.ends("ence"): pass
+ else: return
+ elif self.b[self.k - 1] == 'e':
+ if self.ends("er"): pass
+ else: return
+ elif self.b[self.k - 1] == 'i':
+ if self.ends("ic"): pass
+ else: return
+ elif self.b[self.k - 1] == 'l':
+ if self.ends("able"): pass
+ elif self.ends("ible"): pass
+ else: return
+ elif self.b[self.k - 1] == 'n':
+ if self.ends("ant"): pass
+ elif self.ends("ement"): pass
+ elif self.ends("ment"): pass
+ elif self.ends("ent"): pass
+ else: return
+ elif self.b[self.k - 1] == 'o':
+ if self.ends("ion") and (self.b[self.j] == 's' \
+ or self.b[self.j] == 't'): pass
+ elif self.ends("ou"): pass
+ # takes care of -ous
+ else: return
+ elif self.b[self.k - 1] == 's':
+ if self.ends("ism"): pass
+ else: return
+ elif self.b[self.k - 1] == 't':
+ if self.ends("ate"): pass
+ elif self.ends("iti"): pass
+ else: return
+ elif self.b[self.k - 1] == 'u':
+ if self.ends("ous"): pass
+ else: return
+ elif self.b[self.k - 1] == 'v':
+ if self.ends("ive"): pass
+ else: return
+ elif self.b[self.k - 1] == 'z':
+ if self.ends("ize"): pass
+ else: return
+ else:
+ return
+ if self.m() > 1:
+ self.k = self.j
+
+ def step5(self):
+ """step5() removes a final -e if m() > 1, and changes -ll to -l if
+ m() > 1.
+ """
+ self.j = self.k
+ if self.b[self.k] == 'e':
+ a = self.m()
+ if a > 1 or (a == 1 and not self.cvc(self.k-1)):
+ self.k = self.k - 1
+ if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
+ self.k = self.k -1
+
+ def stem(self, p, i, j):
+ """In stem(p,i,j), p is a char pointer, and the string to be stemmed
+ is from p[i] to p[j] inclusive. Typically i is zero and j is the
+ offset to the last character of a string, (p[j+1] == '\0'). The
+ stemmer adjusts the characters p[i] ... p[j] and returns the new
+ end-point of the string, k. Stemming never increases word length, so
+ i <= k <= j. To turn the stemmer into a module, declare 'stem' as
+ extern, and delete the remainder of this file.
+ """
+ # copy the parameters into statics
+ self.b = p
+ self.k = j
+ self.k0 = i
+ if self.k <= self.k0 + 1:
+ return self.b # --DEPARTURE--
+
+ # With this line, strings of length 1 or 2 don't go through the
+ # stemming process, although no mention is made of this in the
+ # published algorithm. Remove the line to match the published
+ # algorithm.
+
+ self.step1ab()
+ self.step1c()
+ self.step2()
+ self.step3()
+ self.step4()
+ self.step5()
+ return self.b[self.k0:self.k+1]