0
|
1 |
# A simple and limited RSS feed parser used in the RSS reader example.
|
|
2 |
|
|
3 |
# Copyright (c) 2005 Nokia Corporation
|
|
4 |
#
|
|
5 |
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
6 |
# you may not use this file except in compliance with the License.
|
|
7 |
# You may obtain a copy of the License at
|
|
8 |
#
|
|
9 |
# http://www.apache.org/licenses/LICENSE-2.0
|
|
10 |
#
|
|
11 |
# Unless required by applicable law or agreed to in writing, software
|
|
12 |
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
13 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14 |
# See the License for the specific language governing permissions and
|
|
15 |
# limitations under the License.
|
|
16 |
|
|
17 |
|
|
18 |
import re
|
|
19 |
import urllib
|
|
20 |
|
|
21 |
def parse(url):
    """Fetch the RSS document at *url* and parse it.

    url -- address of the feed, passed unchanged to urllib.urlopen().

    Returns the feed dictionary produced by parse_feed() for the
    downloaded data.
    """
    connection = urllib.urlopen(url)
    try:
        data = connection.read()
    finally:
        # Always release the underlying connection, even if the read fails
        # part-way, instead of waiting for garbage collection to do it.
        connection.close()
    return parse_feed(data)
|
|
23 |
|
|
24 |
def parse_feed(text):
    """Parse the text of an RSS document into a feed dictionary.

    This is a deliberately small and tolerant scanner, not a real XML
    parser: the input is split on '<' and every chunk is treated as one
    tag followed by the character data up to the next tag.

    text -- the document, either as a byte string (decoded as latin-1)
            or as already-decoded text.

    Returns a dictionary with the keys:
      'entries' -- a list with one dictionary per <item>, holding
                   whichever of 'title', 'link' and 'summary' the item
                   supplied;
      'title'   -- the title found directly under <channel>; the key is
                   absent when the document has no such title.

    Titles have newlines, tabs and carriage returns replaced by spaces
    and are stripped; summaries have every character entity replaced by
    '?'.

    Known limitations: character data that follows a closing or
    self-closing tag is dropped, and '<!...>' constructs (comments,
    DOCTYPE, CDATA markers) contribute only the text after their '>'.
    """
    assumed_encoding = 'latin-1'
    try:
        text_type = unicode     # Python 2
    except NameError:
        text_type = str         # Python 3: str is already decoded text
    # Decode once at the boundary.  Previously every extracted value was
    # decoded separately, which raised TypeError when the caller handed in
    # text that was already decoded.
    if not isinstance(text, text_type):
        text = text_type(text, assumed_encoding)
    empty = text[:0]            # empty string of the decoded text type

    feed = {}
    items = []
    # One-element lists serve as mutable cells, so the nested handlers can
    # replace the value without rebinding a name in this scope.
    currentitem = [{}]
    textbuffer = [[]]
    tagpath = []                # stack of the currently open tag names
    tagre = re.compile('([^ \n\t]+)(.*)>(.*)', re.S)

    def clean_entities(value):
        # Replace every character entity (&amp;, &#123;, ...) by '?'.
        return re.sub('&[#0-9a-z]+;', '?', value)

    def clean_lf(value):
        # Turn line breaks and tabs into plain spaces.
        return re.sub('[\n\t\r]', ' ', value)

    def write(value):
        textbuffer[0].append(value)

    def gettext():
        # Return the text collected so far and start a fresh buffer.
        collected = empty.join(textbuffer[0])
        textbuffer[0] = []
        return collected

    def within(tag):
        return tag in tagpath

    def parentis(tag):
        # Handlers for closing tags run after the tag was popped, so the
        # top of the stack is the parent.  Slicing instead of indexing
        # keeps this safe when the stack is empty (tag at document root).
        return tagpath[-1:] == [tag]

    def start_default(tag, content):
        write(content)

    def end_default(tag, content):
        pass

    def tag_default(tag, content):
        pass

    def end_a(tag, content):
        write('LINK(%s)' % gettext())

    def start_item(tag, content):
        gettext()               # discard text collected before the item
        write(content)
        currentitem[0] = {}

    def end_item(tag, content):
        items.append(currentitem[0])
        currentitem[0] = {}

    def end_link(tag, content):
        if within('item'):
            currentitem[0]['link'] = gettext()

    def end_description(tag, content):
        if within('item'):
            currentitem[0]['summary'] = clean_entities(gettext())

    def end_title(tag, content):
        title = clean_lf(gettext()).strip()
        if within('item'):
            currentitem[0]['title'] = title
        elif parentis('channel'):
            feed['title'] = title

    # Explicit dispatch table: '<kind>_<tagname>' -> handler.  Tags without
    # an entry fall back to the generic handler of their kind.
    handlers = {
        'end_a': end_a,
        'start_item': start_item,
        'end_item': end_item,
        'end_link': end_link,
        'end_description': end_description,
        'end_title': end_title,
    }

    for chunk in text.split('<'):
        m = tagre.match(chunk)
        if not m:
            continue            # Malformed chunk, just ignore.
        (tag, attributes, content) = m.groups()
        if tag.startswith('?'):
            continue            # XML declaration / processing instruction.
        if tag.startswith('!'):
            # Comment, DOCTYPE or CDATA marker.  These are never closed by
            # an end tag, so they must not be pushed on the tag stack; the
            # trailing text is still collected as before.
            write(content)
            continue
        if tag.startswith('/'):
            tagname = tag[1:]
            kind = 'end_'
            generic_handler = end_default
            # Unbalanced tags are tolerated: pop whatever is open, and
            # ignore a closing tag when nothing is open at all.
            if tagpath:
                del tagpath[-1]
        elif tag.endswith('/') or attributes.rstrip().endswith('/'):
            # Self-closing tag.  When the tag carries attributes the '/'
            # ends up in the attribute part (<enclosure url="..."/>).
            if tag.endswith('/'):
                tagname = tag[0:-1]
            else:
                tagname = tag
            kind = 'tag_'
            generic_handler = tag_default
        else:
            tagname = tag
            kind = 'start_'
            generic_handler = start_default
            tagpath.append(tagname)
        handlers.get(kind + tagname, generic_handler)(tagname, content)

    feed['entries'] = items
    return feed
|
|
98 |
|