src/extras/examples/dumbfeedparser.py
changeset 0 ca70ae20a155
equal deleted inserted replaced
-1:000000000000 0:ca70ae20a155
       
     1 # A simple and limited RSS feed parser used in the RSS reader example.
       
     2 
       
     3 # Copyright (c) 2005 Nokia Corporation
       
     4 #
       
     5 # Licensed under the Apache License, Version 2.0 (the "License");
       
     6 # you may not use this file except in compliance with the License.
       
     7 # You may obtain a copy of the License at
       
     8 #
       
     9 #     http://www.apache.org/licenses/LICENSE-2.0
       
    10 #
       
    11 # Unless required by applicable law or agreed to in writing, software
       
    12 # distributed under the License is distributed on an "AS IS" BASIS,
       
    13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
       
    14 # See the License for the specific language governing permissions and
       
    15 # limitations under the License.
       
    16 
       
    17 
       
    18 import re
       
    19 import urllib
       
    20 
       
    21 def parse(url):
       
    22     return parse_feed(urllib.urlopen(url).read())
       
    23 
       
    24 def parse_feed(text):
       
    25     feed={}
       
    26     items=[]
       
    27     currentitem=[{}]
       
    28 
       
    29     def clean_entities(text): return re.sub('&[#0-9a-z]+;','?',text)
       
    30     def clean_lf(text): return re.sub('[\n\t\r]',' ',text)
       
    31 
       
    32     def end_a(tag,content): write('LINK(%s)'%gettext())
       
    33     def start_item(tag,content):
       
    34         gettext()
       
    35         write(content)
       
    36         currentitem[0]={}
       
    37     def end_item(tag,content):
       
    38         items.append(currentitem[0])
       
    39         currentitem[0]={}
       
    40     def end_link(tag,content):
       
    41         if within('item'):
       
    42             currentitem[0]['link']=gettext()
       
    43     def end_description(tag,content):
       
    44         if within('item'):
       
    45             currentitem[0]['summary']=clean_entities(gettext())
       
    46     def end_title(tag,content):
       
    47         text=clean_lf(gettext()).strip()
       
    48         if within('item'):
       
    49             currentitem[0]['title']=text
       
    50         elif parentis('channel'):
       
    51             feed['title']=text
       
    52             
       
    53     tagre=re.compile('([^ \n\t]+)(.*)>(.*)',re.S)
       
    54     tagpath=[]
       
    55     textbuffer=[[]]
       
    56     assumed_encoding='latin-1'
       
    57     lines=text.split('<')
       
    58     def start_default(tag,content): write(content)
       
    59     def end_default(tag,content): pass
       
    60     def tag_default(tag,content): pass
       
    61     def write(text): textbuffer[0].append(text)
       
    62     def gettext():
       
    63         text=''.join(textbuffer[0])
       
    64         textbuffer[0]=[]
       
    65         return unicode(text,assumed_encoding)
       
    66     def current_tag(): return tagpath[-1]
       
    67     def current_path(): return '/'.join(tagpath)
       
    68     def within(tag): return tag in tagpath
       
    69     def parentis(tag): return current_tag()==tag
       
    70     for k in lines:
       
    71         m=tagre.match(k)
       
    72         if m:
       
    73             (tag,attributes,content)=m.groups()
       
    74             if tag.startswith('?'):
       
    75                 continue
       
    76             if tag.startswith('/'):
       
    77                 tagname=tag[1:]
       
    78                 handler='end_%s'%tagname
       
    79                 generic_handler=end_default
       
    80                 if current_tag() != tagname:
       
    81                     pass # Unbalanced tags, just ignore for now.
       
    82                 del tagpath[-1]
       
    83             elif tag.endswith('/'):
       
    84                 tagname=tag[0:-1]
       
    85                 handler='tag_%s'%tagname
       
    86                 generic_handler=tag_default
       
    87             else:
       
    88                 tagname=tag
       
    89                 handler='start_%s'%tagname
       
    90                 generic_handler=start_default
       
    91                 tagpath.append(tagname)
       
    92             locals().get(handler,generic_handler)(tagname,content)
       
    93         else:
       
    94             pass # Malformed line, just ignore.
       
    95         
       
    96     feed['entries']=items
       
    97     return feed
       
    98