|
1 # A simple and limited RSS feed parser used in the RSS reader example. |
|
2 |
|
3 # Copyright (c) 2005 Nokia Corporation |
|
4 # |
|
5 # Licensed under the Apache License, Version 2.0 (the "License"); |
|
6 # you may not use this file except in compliance with the License. |
|
7 # You may obtain a copy of the License at |
|
8 # |
|
9 # http://www.apache.org/licenses/LICENSE-2.0 |
|
10 # |
|
11 # Unless required by applicable law or agreed to in writing, software |
|
12 # distributed under the License is distributed on an "AS IS" BASIS, |
|
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
14 # See the License for the specific language governing permissions and |
|
15 # limitations under the License. |
|
16 |
|
17 |
|
18 import re |
|
19 import urllib |
|
20 |
|
def parse(url):
    """Download the RSS document at *url* and return it parsed.

    url -- address of the feed, opened with urllib.urlopen().

    Returns the dictionary built by parse_feed() from the downloaded data.
    Errors raised while opening or reading the URL are not caught here.
    """
    stream = urllib.urlopen(url)
    try:
        data = stream.read()
    finally:
        # Always release the connection, even when the read fails.
        stream.close()
    return parse_feed(data)
|
23 |
|
def parse_feed(text):
    """Parse RSS markup in *text* and return the feed as a dictionary.

    This is a deliberately small tag scanner, not a real XML parser: the
    input is split at every '<' and each piece is treated as one tag plus
    the character data that follows it.

    text -- the feed source.  Byte strings are decoded as latin-1 when
            their text is collected; text that is already unicode is
            used unchanged.

    Returns a dictionary with:
      'title'   -- text of a <title> whose parent is <channel>; only
                   present when such a title was seen
      'entries' -- list with one dictionary per <item>, in document
                   order, holding whichever of 'title', 'link' and
                   'summary' (from <description>) the item supplied

    Character entities in summaries are replaced with '?'; newlines,
    tabs and carriage returns in titles are replaced with spaces.
    Pieces that do not look like a tag, and closing tags that have no
    open element to close, are ignored.
    """
    try:
        unicode_type = unicode
    except NameError:
        # No separate unicode type here: str already is unicode text.
        unicode_type = str

    assumed_encoding = 'latin-1'
    tagre = re.compile('([^ \n\t]+)(.*)>(.*)', re.S)

    feed = {}
    items = []
    # One-element lists let the nested functions below replace the
    # current item / text buffer, since they cannot rebind outer names.
    currentitem = [{}]
    textbuffer = [[]]
    tagpath = []            # names of the currently open tags, outermost first

    def clean_entities(s):
        return re.sub('&[#0-9a-z]+;', '?', s)

    def clean_lf(s):
        return re.sub('[\n\t\r]', ' ', s)

    def write(chunk):
        textbuffer[0].append(chunk)

    def gettext():
        # Return everything buffered so far as unicode and empty the buffer.
        chunks = textbuffer[0]
        textbuffer[0] = []
        decoded = []
        for chunk in chunks:
            # end_a() writes back text that is already decoded, so only
            # byte strings are decoded; decoding unicode a second time
            # is an error on Python 2.
            if not isinstance(chunk, unicode_type):
                chunk = unicode_type(chunk, assumed_encoding)
            decoded.append(chunk)
        return u''.join(decoded)

    def current_tag():
        # Innermost open tag, or None when no element is open.
        if tagpath:
            return tagpath[-1]
        return None

    def within(tag):
        return tag in tagpath

    def parentis(tag):
        # Handlers for closing tags run after the tag has been popped,
        # so for them the innermost open tag is the parent.
        return current_tag() == tag

    # Generic handlers, used for every tag without a specific one below.
    def start_default(tag, content):
        write(content)

    def end_default(tag, content):
        pass

    def tag_default(tag, content):
        pass

    # Tag-specific handlers.
    def end_a(tag, content):
        write('LINK(%s)' % gettext())

    def start_item(tag, content):
        gettext()           # discard text collected before the item
        write(content)
        currentitem[0] = {}

    def end_item(tag, content):
        items.append(currentitem[0])
        currentitem[0] = {}

    def end_link(tag, content):
        if within('item'):
            currentitem[0]['link'] = gettext()

    def end_description(tag, content):
        if within('item'):
            currentitem[0]['summary'] = clean_entities(gettext())

    def end_title(tag, content):
        title = clean_lf(gettext()).strip()
        if within('item'):
            currentitem[0]['title'] = title
        elif parentis('channel'):
            feed['title'] = title

    # Explicit dispatch table, keyed the same way the handler names are
    # built in the loop below ('start_', 'end_' or 'tag_' plus tag name).
    handlers = {
        'end_a': end_a,
        'start_item': start_item,
        'end_item': end_item,
        'end_link': end_link,
        'end_description': end_description,
        'end_title': end_title,
    }

    for piece in text.split('<'):
        m = tagre.match(piece)
        if not m:
            continue        # Malformed piece (no tag found), just ignore.
        (tag, attributes, content) = m.groups()
        if tag.startswith('?'):
            continue
        if tag.startswith('/'):
            if not tagpath:
                continue    # Closing tag with nothing open: ignore it.
            tagname = tag[1:]
            handler = 'end_%s' % tagname
            generic_handler = end_default
            # Unbalanced tags are not detected: the innermost open tag
            # is closed whatever its name is.
            del tagpath[-1]
        elif tag.endswith('/') or attributes.endswith('/'):
            # Empty-element tag, written either <name/> or
            # <name attr="..." />.  It has no closing tag, so it must
            # not be pushed on tagpath.
            if tag.endswith('/'):
                tagname = tag[:-1]
            else:
                tagname = tag
            handler = 'tag_%s' % tagname
            generic_handler = tag_default
        else:
            tagname = tag
            handler = 'start_%s' % tagname
            generic_handler = start_default
            tagpath.append(tagname)
        handlers.get(handler, generic_handler)(tagname, content)

    feed['entries'] = items
    return feed
|
98 |