179
|
1 |
# -*- coding: utf-8 -*-
|
|
2 |
"""
|
|
3 |
sphinx.util.stemmer
|
|
4 |
~~~~~~~~~~~~~~~~~~~
|
|
5 |
|
|
6 |
Porter Stemming Algorithm
|
|
7 |
|
|
8 |
This is the Porter stemming algorithm, ported to Python from the
|
|
9 |
version coded up in ANSI C by the author. It may be be regarded
|
|
10 |
as canonical, in that it follows the algorithm presented in
|
|
11 |
|
|
12 |
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
|
13 |
no. 3, pp 130-137,
|
|
14 |
|
|
15 |
only differing from it at the points maked --DEPARTURE-- below.
|
|
16 |
|
|
17 |
See also http://www.tartarus.org/~martin/PorterStemmer
|
|
18 |
|
|
19 |
The algorithm as described in the paper could be exactly replicated
|
|
20 |
by adjusting the points of DEPARTURE, but this is barely necessary,
|
|
21 |
because (a) the points of DEPARTURE are definitely improvements, and
|
|
22 |
(b) no encoding of the Porter stemmer I have seen is anything like
|
|
23 |
as exact as this version, even with the points of DEPARTURE!
|
|
24 |
|
|
25 |
Release 1: January 2001
|
|
26 |
|
|
27 |
:copyright: 2001 by Vivake Gupta <v@nano.com>.
|
|
28 |
:license: Public Domain ("can be used free of charge for any purpose").
|
|
29 |
"""
|
|
30 |
|
|
31 |
class PorterStemmer(object):
|
|
32 |
|
|
33 |
def __init__(self):
|
|
34 |
"""The main part of the stemming algorithm starts here.
|
|
35 |
b is a buffer holding a word to be stemmed. The letters are in b[k0],
|
|
36 |
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
|
|
37 |
readjusted downwards as the stemming progresses. Zero termination is
|
|
38 |
not in fact used in the algorithm.
|
|
39 |
|
|
40 |
Note that only lower case sequences are stemmed. Forcing to lower case
|
|
41 |
should be done before stem(...) is called.
|
|
42 |
"""
|
|
43 |
|
|
44 |
self.b = "" # buffer for word to be stemmed
|
|
45 |
self.k = 0
|
|
46 |
self.k0 = 0
|
|
47 |
self.j = 0 # j is a general offset into the string
|
|
48 |
|
|
49 |
def cons(self, i):
|
|
50 |
"""cons(i) is TRUE <=> b[i] is a consonant."""
|
|
51 |
if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
|
|
52 |
or self.b[i] == 'o' or self.b[i] == 'u':
|
|
53 |
return 0
|
|
54 |
if self.b[i] == 'y':
|
|
55 |
if i == self.k0:
|
|
56 |
return 1
|
|
57 |
else:
|
|
58 |
return (not self.cons(i - 1))
|
|
59 |
return 1
|
|
60 |
|
|
61 |
def m(self):
|
|
62 |
"""m() measures the number of consonant sequences between k0 and j.
|
|
63 |
if c is a consonant sequence and v a vowel sequence, and <..>
|
|
64 |
indicates arbitrary presence,
|
|
65 |
|
|
66 |
<c><v> gives 0
|
|
67 |
<c>vc<v> gives 1
|
|
68 |
<c>vcvc<v> gives 2
|
|
69 |
<c>vcvcvc<v> gives 3
|
|
70 |
....
|
|
71 |
"""
|
|
72 |
n = 0
|
|
73 |
i = self.k0
|
|
74 |
while 1:
|
|
75 |
if i > self.j:
|
|
76 |
return n
|
|
77 |
if not self.cons(i):
|
|
78 |
break
|
|
79 |
i = i + 1
|
|
80 |
i = i + 1
|
|
81 |
while 1:
|
|
82 |
while 1:
|
|
83 |
if i > self.j:
|
|
84 |
return n
|
|
85 |
if self.cons(i):
|
|
86 |
break
|
|
87 |
i = i + 1
|
|
88 |
i = i + 1
|
|
89 |
n = n + 1
|
|
90 |
while 1:
|
|
91 |
if i > self.j:
|
|
92 |
return n
|
|
93 |
if not self.cons(i):
|
|
94 |
break
|
|
95 |
i = i + 1
|
|
96 |
i = i + 1
|
|
97 |
|
|
98 |
def vowelinstem(self):
|
|
99 |
"""vowelinstem() is TRUE <=> k0,...j contains a vowel"""
|
|
100 |
for i in range(self.k0, self.j + 1):
|
|
101 |
if not self.cons(i):
|
|
102 |
return 1
|
|
103 |
return 0
|
|
104 |
|
|
105 |
def doublec(self, j):
|
|
106 |
"""doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
|
|
107 |
if j < (self.k0 + 1):
|
|
108 |
return 0
|
|
109 |
if (self.b[j] != self.b[j-1]):
|
|
110 |
return 0
|
|
111 |
return self.cons(j)
|
|
112 |
|
|
113 |
def cvc(self, i):
|
|
114 |
"""cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant
|
|
115 |
and also if the second c is not w,x or y. this is used when trying to
|
|
116 |
restore an e at the end of a short e.g.
|
|
117 |
|
|
118 |
cav(e), lov(e), hop(e), crim(e), but
|
|
119 |
snow, box, tray.
|
|
120 |
"""
|
|
121 |
if i < (self.k0 + 2) or not self.cons(i) or self.cons(i-1) or not self.cons(i-2):
|
|
122 |
return 0
|
|
123 |
ch = self.b[i]
|
|
124 |
if ch == 'w' or ch == 'x' or ch == 'y':
|
|
125 |
return 0
|
|
126 |
return 1
|
|
127 |
|
|
128 |
def ends(self, s):
|
|
129 |
"""ends(s) is TRUE <=> k0,...k ends with the string s."""
|
|
130 |
length = len(s)
|
|
131 |
if s[length - 1] != self.b[self.k]: # tiny speed-up
|
|
132 |
return 0
|
|
133 |
if length > (self.k - self.k0 + 1):
|
|
134 |
return 0
|
|
135 |
if self.b[self.k-length+1:self.k+1] != s:
|
|
136 |
return 0
|
|
137 |
self.j = self.k - length
|
|
138 |
return 1
|
|
139 |
|
|
140 |
def setto(self, s):
|
|
141 |
"""setto(s) sets (j+1),...k to the characters in the string s, readjusting k."""
|
|
142 |
length = len(s)
|
|
143 |
self.b = self.b[:self.j+1] + s + self.b[self.j+length+1:]
|
|
144 |
self.k = self.j + length
|
|
145 |
|
|
146 |
def r(self, s):
|
|
147 |
"""r(s) is used further down."""
|
|
148 |
if self.m() > 0:
|
|
149 |
self.setto(s)
|
|
150 |
|
|
151 |
def step1ab(self):
|
|
152 |
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
|
|
153 |
|
|
154 |
caresses -> caress
|
|
155 |
ponies -> poni
|
|
156 |
ties -> ti
|
|
157 |
caress -> caress
|
|
158 |
cats -> cat
|
|
159 |
|
|
160 |
feed -> feed
|
|
161 |
agreed -> agree
|
|
162 |
disabled -> disable
|
|
163 |
|
|
164 |
matting -> mat
|
|
165 |
mating -> mate
|
|
166 |
meeting -> meet
|
|
167 |
milling -> mill
|
|
168 |
messing -> mess
|
|
169 |
|
|
170 |
meetings -> meet
|
|
171 |
"""
|
|
172 |
if self.b[self.k] == 's':
|
|
173 |
if self.ends("sses"):
|
|
174 |
self.k = self.k - 2
|
|
175 |
elif self.ends("ies"):
|
|
176 |
self.setto("i")
|
|
177 |
elif self.b[self.k - 1] != 's':
|
|
178 |
self.k = self.k - 1
|
|
179 |
if self.ends("eed"):
|
|
180 |
if self.m() > 0:
|
|
181 |
self.k = self.k - 1
|
|
182 |
elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
|
|
183 |
self.k = self.j
|
|
184 |
if self.ends("at"): self.setto("ate")
|
|
185 |
elif self.ends("bl"): self.setto("ble")
|
|
186 |
elif self.ends("iz"): self.setto("ize")
|
|
187 |
elif self.doublec(self.k):
|
|
188 |
self.k = self.k - 1
|
|
189 |
ch = self.b[self.k]
|
|
190 |
if ch == 'l' or ch == 's' or ch == 'z':
|
|
191 |
self.k = self.k + 1
|
|
192 |
elif (self.m() == 1 and self.cvc(self.k)):
|
|
193 |
self.setto("e")
|
|
194 |
|
|
195 |
def step1c(self):
|
|
196 |
"""step1c() turns terminal y to i when there is another vowel in the stem."""
|
|
197 |
if (self.ends("y") and self.vowelinstem()):
|
|
198 |
self.b = self.b[:self.k] + 'i' + self.b[self.k+1:]
|
|
199 |
|
|
200 |
def step2(self):
|
|
201 |
"""step2() maps double suffices to single ones.
|
|
202 |
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
|
|
203 |
string before the suffix must give m() > 0.
|
|
204 |
"""
|
|
205 |
if self.b[self.k - 1] == 'a':
|
|
206 |
if self.ends("ational"): self.r("ate")
|
|
207 |
elif self.ends("tional"): self.r("tion")
|
|
208 |
elif self.b[self.k - 1] == 'c':
|
|
209 |
if self.ends("enci"): self.r("ence")
|
|
210 |
elif self.ends("anci"): self.r("ance")
|
|
211 |
elif self.b[self.k - 1] == 'e':
|
|
212 |
if self.ends("izer"): self.r("ize")
|
|
213 |
elif self.b[self.k - 1] == 'l':
|
|
214 |
if self.ends("bli"): self.r("ble") # --DEPARTURE--
|
|
215 |
# To match the published algorithm, replace this phrase with
|
|
216 |
# if self.ends("abli"): self.r("able")
|
|
217 |
elif self.ends("alli"): self.r("al")
|
|
218 |
elif self.ends("entli"): self.r("ent")
|
|
219 |
elif self.ends("eli"): self.r("e")
|
|
220 |
elif self.ends("ousli"): self.r("ous")
|
|
221 |
elif self.b[self.k - 1] == 'o':
|
|
222 |
if self.ends("ization"): self.r("ize")
|
|
223 |
elif self.ends("ation"): self.r("ate")
|
|
224 |
elif self.ends("ator"): self.r("ate")
|
|
225 |
elif self.b[self.k - 1] == 's':
|
|
226 |
if self.ends("alism"): self.r("al")
|
|
227 |
elif self.ends("iveness"): self.r("ive")
|
|
228 |
elif self.ends("fulness"): self.r("ful")
|
|
229 |
elif self.ends("ousness"): self.r("ous")
|
|
230 |
elif self.b[self.k - 1] == 't':
|
|
231 |
if self.ends("aliti"): self.r("al")
|
|
232 |
elif self.ends("iviti"): self.r("ive")
|
|
233 |
elif self.ends("biliti"): self.r("ble")
|
|
234 |
elif self.b[self.k - 1] == 'g': # --DEPARTURE--
|
|
235 |
if self.ends("logi"): self.r("log")
|
|
236 |
# To match the published algorithm, delete this phrase
|
|
237 |
|
|
238 |
def step3(self):
|
|
239 |
"""step3() dels with -ic-, -full, -ness etc. similar strategy to step2."""
|
|
240 |
if self.b[self.k] == 'e':
|
|
241 |
if self.ends("icate"): self.r("ic")
|
|
242 |
elif self.ends("ative"): self.r("")
|
|
243 |
elif self.ends("alize"): self.r("al")
|
|
244 |
elif self.b[self.k] == 'i':
|
|
245 |
if self.ends("iciti"): self.r("ic")
|
|
246 |
elif self.b[self.k] == 'l':
|
|
247 |
if self.ends("ical"): self.r("ic")
|
|
248 |
elif self.ends("ful"): self.r("")
|
|
249 |
elif self.b[self.k] == 's':
|
|
250 |
if self.ends("ness"): self.r("")
|
|
251 |
|
|
252 |
def step4(self):
|
|
253 |
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
|
|
254 |
if self.b[self.k - 1] == 'a':
|
|
255 |
if self.ends("al"): pass
|
|
256 |
else: return
|
|
257 |
elif self.b[self.k - 1] == 'c':
|
|
258 |
if self.ends("ance"): pass
|
|
259 |
elif self.ends("ence"): pass
|
|
260 |
else: return
|
|
261 |
elif self.b[self.k - 1] == 'e':
|
|
262 |
if self.ends("er"): pass
|
|
263 |
else: return
|
|
264 |
elif self.b[self.k - 1] == 'i':
|
|
265 |
if self.ends("ic"): pass
|
|
266 |
else: return
|
|
267 |
elif self.b[self.k - 1] == 'l':
|
|
268 |
if self.ends("able"): pass
|
|
269 |
elif self.ends("ible"): pass
|
|
270 |
else: return
|
|
271 |
elif self.b[self.k - 1] == 'n':
|
|
272 |
if self.ends("ant"): pass
|
|
273 |
elif self.ends("ement"): pass
|
|
274 |
elif self.ends("ment"): pass
|
|
275 |
elif self.ends("ent"): pass
|
|
276 |
else: return
|
|
277 |
elif self.b[self.k - 1] == 'o':
|
|
278 |
if self.ends("ion") and (self.b[self.j] == 's' \
|
|
279 |
or self.b[self.j] == 't'): pass
|
|
280 |
elif self.ends("ou"): pass
|
|
281 |
# takes care of -ous
|
|
282 |
else: return
|
|
283 |
elif self.b[self.k - 1] == 's':
|
|
284 |
if self.ends("ism"): pass
|
|
285 |
else: return
|
|
286 |
elif self.b[self.k - 1] == 't':
|
|
287 |
if self.ends("ate"): pass
|
|
288 |
elif self.ends("iti"): pass
|
|
289 |
else: return
|
|
290 |
elif self.b[self.k - 1] == 'u':
|
|
291 |
if self.ends("ous"): pass
|
|
292 |
else: return
|
|
293 |
elif self.b[self.k - 1] == 'v':
|
|
294 |
if self.ends("ive"): pass
|
|
295 |
else: return
|
|
296 |
elif self.b[self.k - 1] == 'z':
|
|
297 |
if self.ends("ize"): pass
|
|
298 |
else: return
|
|
299 |
else:
|
|
300 |
return
|
|
301 |
if self.m() > 1:
|
|
302 |
self.k = self.j
|
|
303 |
|
|
304 |
def step5(self):
|
|
305 |
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
|
|
306 |
m() > 1.
|
|
307 |
"""
|
|
308 |
self.j = self.k
|
|
309 |
if self.b[self.k] == 'e':
|
|
310 |
a = self.m()
|
|
311 |
if a > 1 or (a == 1 and not self.cvc(self.k-1)):
|
|
312 |
self.k = self.k - 1
|
|
313 |
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
|
|
314 |
self.k = self.k -1
|
|
315 |
|
|
316 |
def stem(self, p, i, j):
|
|
317 |
"""In stem(p,i,j), p is a char pointer, and the string to be stemmed
|
|
318 |
is from p[i] to p[j] inclusive. Typically i is zero and j is the
|
|
319 |
offset to the last character of a string, (p[j+1] == '\0'). The
|
|
320 |
stemmer adjusts the characters p[i] ... p[j] and returns the new
|
|
321 |
end-point of the string, k. Stemming never increases word length, so
|
|
322 |
i <= k <= j. To turn the stemmer into a module, declare 'stem' as
|
|
323 |
extern, and delete the remainder of this file.
|
|
324 |
"""
|
|
325 |
# copy the parameters into statics
|
|
326 |
self.b = p
|
|
327 |
self.k = j
|
|
328 |
self.k0 = i
|
|
329 |
if self.k <= self.k0 + 1:
|
|
330 |
return self.b # --DEPARTURE--
|
|
331 |
|
|
332 |
# With this line, strings of length 1 or 2 don't go through the
|
|
333 |
# stemming process, although no mention is made of this in the
|
|
334 |
# published algorithm. Remove the line to match the published
|
|
335 |
# algorithm.
|
|
336 |
|
|
337 |
self.step1ab()
|
|
338 |
self.step1c()
|
|
339 |
self.step2()
|
|
340 |
self.step3()
|
|
341 |
self.step4()
|
|
342 |
self.step5()
|
|
343 |
return self.b[self.k0:self.k+1]
|