1275
1265
def _end_expirationdate(self):
    '''Pop the 'expired' element and save its parsed form as 'expired_parsed'.'''
    expired = self.pop('expired')
    self._save('expired_parsed', _parse_date(expired))
1278
# geospatial location, or "where", from georss.org
1280
def _start_georssgeom(self, attrsD):
    '''Shared start handler for the simple GeoRSS geometries: push 'geometry'.'''
    self.push('geometry', 0)

# The point, line, polygon and box start tags all use the handler above.
_start_georss_point = _start_georssgeom
_start_georss_line = _start_georssgeom
_start_georss_polygon = _start_georssgeom
_start_georss_box = _start_georssgeom
1287
def _save_where(self, geometry):
    '''Store geometry under the 'where' key of the current context.

    geometry is wrapped in a FeedParserDict; any value already stored
    under 'where' is replaced.
    '''
    # NOTE(review): the GeoRSS parsers are documented to return None on a
    # parsing error -- confirm FeedParserDict(None) is acceptable here.
    context = self._getContext()
    # A plain assignment is enough: priming the key with setdefault() first
    # has no lasting effect because the value is overwritten immediately.
    context['where'] = FeedParserDict(geometry)
1292
def _end_georss_point(self):
    '''Pop 'geometry', parse it as a GeoRSS point and save the result.'''
    raw = self.pop('geometry')
    self._save_where(_parse_georss_point(raw))
1296
def _end_georss_line(self):
    '''Pop 'geometry', parse it as a GeoRSS line and save the result.'''
    raw = self.pop('geometry')
    self._save_where(_parse_georss_line(raw))
1300
def _end_georss_polygon(self):
    '''Pop 'geometry', parse it as a GeoRSS polygon and save the result.'''
    self._save_where(_parse_georss_polygon(self.pop('geometry')))
1305
def _end_georss_box(self):
    '''Pop 'geometry', parse it as a GeoRSS box and save the result.'''
    raw = self.pop('geometry')
    self._save_where(_parse_georss_box(raw))
1309
def _start_where(self, attrsD):
    '''Start handler for a "where" element: push 'where'.'''
    self.push('where', 0)

# georss:where is handled exactly like a plain where element.
_start_georss_where = _start_where
1313
def _start_gml_point(self, attrsD):
    '''Start a gml:Point: set self.ingeometry to 'point', then push 'geometry'.'''
    self.ingeometry = 'point'
    self.push('geometry', 0)
1317
def _start_gml_linestring(self, attrsD):
    '''Start a gml:LineString: set self.ingeometry to 'linestring', then push 'geometry'.'''
    self.ingeometry = 'linestring'
    self.push('geometry', 0)
1321
def _start_gml_polygon(self, attrsD):
    '''Start a gml:Polygon: push 'geometry' (self.ingeometry is left unchanged).'''
    self.push('geometry', 0)
1324
def _start_gml_exterior(self, attrsD):
    '''Start a gml:exterior: push 'geometry' (self.ingeometry is left unchanged).'''
    self.push('geometry', 0)
1327
def _start_gml_linearring(self, attrsD):
    '''Start a gml:LinearRing: set self.ingeometry to 'polygon', then push 'geometry'.'''
    self.ingeometry = 'polygon'
    self.push('geometry', 0)
1331
def _start_gml_pos(self, attrsD):
1334
def _end_gml_pos(self):
    '''Pop 'pos', parse it as a point and save the result.'''
    self._save_where(_parse_georss_point(self.pop('pos')))
1339
def _start_gml_poslist(self, attrsD):
1342
def _end_gml_poslist(self):
    '''Pop 'pos', parse it according to self.ingeometry and save the result.'''
    raw = self.pop('pos')
    parsed = _parse_poslist(raw, self.ingeometry)
    self._save_where(parsed)
1346
def _end_geom(self):
1348
self.pop('geometry')
1349
_end_gml_point = _end_geom
1350
_end_gml_linestring = _end_geom
1351
_end_gml_linearring = _end_geom
1352
_end_gml_exterior = _end_geom
1353
_end_gml_polygon = _end_geom
1355
def _end_where(self):
1357
_end_georss_where = _end_where
1361
1268
def _start_cc_license(self, attrsD):
1362
1269
context = self._getContext()
1363
1270
value = self._getAttribute(attrsD, 'rdf:resource')
3425
3336
data = doctype_pattern.sub(replacement, head) + data
3427
3338
return version, data, dict(replacement and safe_pattern.findall(replacement))
3429
# GeoRSS geometry parsers. Each returns a dict with 'type' and 'coordinates'
3430
# keys, or None in the case of a parsing error
3432
def _parse_poslist(value, geom_type):
3433
if geom_type == 'linestring':
3434
return _parse_georss_line(value)
3435
elif geom_type == 'polygon':
3436
ring = _parse_georss_line(value)
3437
return {'type': 'Polygon', 'coordinates': (ring['coordinates'],)}
3439
raise ValueError, "unsupported geometry type: %s" % geom_type
3441
# Point coordinates are a 2-tuple (lon, lat)
3442
def _parse_georss_point(value):
3444
lat, lon = value.replace(',', ' ').split()
3445
return {'type': 'Point', 'coordinates': (float(lon), float(lat))}
3446
except Exception, e:
3448
sys.stderr.write('_parse_georss_point raised %s\n' % (handler.__name__, repr(e)))
3452
# Line coordinates are a tuple of 2-tuples ((lon0, lat0), ... (lonN, latN))
3453
def _parse_georss_line(value):
3455
latlons = value.replace(',', ' ').split()
3457
for i in range(0, len(latlons), 2):
3458
lat = float(latlons[i])
3459
lon = float(latlons[i+1])
3460
coords.append((lon, lat))
3461
return {'type': 'LineString', 'coordinates': tuple(coords)}
3462
except Exception, e:
3464
sys.stderr.write('_parse_georss_line raised %s\n' % repr(e))
3468
# Polygon coordinates are a tuple of closed LineString tuples. The first item
3469
# in the tuple is the exterior ring. Subsequent items are interior rings, but
3470
# georss:polygon elements usually have no interior rings.
3471
def _parse_georss_polygon(value):
3473
latlons = value.replace(',', ' ').split()
3475
for i in range(0, len(latlons), 2):
3476
lat = float(latlons[i])
3477
lon = float(latlons[i+1])
3478
coords.append((lon, lat))
3479
return {'type': 'Polygon', 'coordinates': (tuple(coords),)}
3480
except Exception, e:
3482
sys.stderr.write('_parse_georss_polygon raised %s\n' % repr(e))
3486
# Box coordinates are a 2-tuple of 2-tuples ((lon_ll, lat_ll), (lon_ur, lat_ur))
3487
def _parse_georss_box(value):
3489
vals = [float(x) for x in value.replace(',', ' ').split()]
3490
return {'type': 'Box', 'coordinates': ((vals[1], vals[0]), (vals[3], vals[2]))}
3491
except Exception, e:
3493
sys.stderr.write('_parse_georss_box raised %s\n' % repr(e))
3497
# end geospatial parsers
3499
3340
def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]):
3500
3341
'''Parse a feed from a URL, file, stream, or string'''
3501
3342
result = FeedParserDict()
3767
3610
for url in urls:
3768
3611
results = parse(url, etag=options.etag, modified=options.modified, agent=options.agent, referrer=options.referrer)
3769
3612
serializer(results).write(sys.stdout)
3772
#1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements,
3773
# added Simon Fell's test suite
3774
#1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections
3776
# JD - use inchannel to watch out for image and textinput elements which can
3777
# also contain title, link, and description elements
3778
# JD - check for isPermaLink='false' attribute on guid elements
3779
# JD - replaced openAnything with open_resource supporting ETag and
3780
# If-Modified-Since request headers
3781
# JD - parse now accepts etag, modified, agent, and referrer optional
3783
# JD - modified parse to return a dictionary instead of a tuple so that any
3784
# etag or modified information can be returned and cached by the caller
3785
#2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything
3786
# because of etag/modified, return the old etag/modified to the caller to
3787
# indicate why nothing is being returned
3788
#2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's
3789
# useless. Fixes the problem JD was addressing by adding it.
3790
#2.1 - 11/14/2002 - MAP - added gzip support
3791
#2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent.
3792
# start_admingeneratoragent is an example of how to handle elements with
3793
# only attributes, no content.
3794
#2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify);
3795
# also, make sure we send the User-Agent even if urllib2 isn't available.
3796
# Match any variation of backend.userland.com/rss namespace.
3797
#2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is.
3798
#2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's
3799
# snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed
3801
#2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree);
3802
# removed unnecessary urllib code -- urllib2 should always be available anyway;
3803
# return actual url, status, and full HTTP headers (as result['url'],
3804
# result['status'], and result['headers']) if parsing a remote feed over HTTP --
3805
# this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>;
3806
# added the latest namespace-of-the-week for RSS 2.0
3807
#2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom
3808
# User-Agent (otherwise urllib2 sends two, which confuses some servers)
3809
#2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for
3810
# inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds
3811
#2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or
3812
# textInput, and also to return the character encoding (if specified)
3813
#2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking
3814
# nested divs within content (JohnD); fixed missing sys import (JohanS);
3815
# fixed regular expression to capture XML character encoding (Andrei);
3816
# added support for Atom 0.3-style links; fixed bug with textInput tracking;
3817
# added support for cloud (MartijnP); added support for multiple
3818
# category/dc:subject (MartijnP); normalize content model: 'description' gets
3819
# description (which can come from description, summary, or full content if no
3820
# description), 'content' gets dict of base/language/type/value (which can come
3821
# from content:encoded, xhtml:body, content, or fullitem);
3822
# fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang
3823
# tracking; fixed bug tracking unknown tags; fixed bug tracking content when
3824
# <content> element is not in default namespace (like Pocketsoap feed);
3825
# resolve relative URLs in link, guid, docs, url, comments, wfw:comment,
3826
# wfw:commentRSS; resolve relative URLs within embedded HTML markup in
3827
# description, xhtml:body, content, content:encoded, title, subtitle,
3828
# summary, info, tagline, and copyright; added support for pingback and
3829
# trackback namespaces
3830
#2.7 - 1/5/2004 - MAP - really added support for trackback and pingback
3831
# namespaces, as opposed to 2.6 when I said I did but didn't really;
3832
# sanitize HTML markup within some elements; added mxTidy support (if
3833
# installed) to tidy HTML markup within some elements; fixed indentation
3834
# bug in _parse_date (FazalM); use socket.setdefaulttimeout if available
3835
# (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified',
3836
# 'issued' are parsed into 9-tuple date format and stored in 'created_parsed',
3837
# 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified'
3838
# and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa
3839
#2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory
3840
# leak not closing url opener (JohnD); added dc:publisher support (MarekK);
3841
# added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK)
3842
#2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in
3843
# encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL);
3844
# fixed relative URI processing for guid (skadz); added ICBM support; added
3846
#2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many
3847
# blogspot.com sites); added _debug variable
3848
#2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing
3849
#3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available);
3850
# added several new supported namespaces; fixed bug tracking naked markup in
3851
# description; added support for enclosure; added support for source; re-added
3852
# support for cloud which got dropped somehow; added support for expirationDate
3853
#3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking
3854
# xml:base URI, one for documents that don't define one explicitly and one for
3855
# documents that define an outer and an inner xml:base that goes out of scope
3856
# before the end of the document
3857
#3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level
3858
#3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version']
3859
# will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized;
3860
# added support for creativeCommons:license and cc:license; added support for
3861
# full Atom content model in title, tagline, info, copyright, summary; fixed bug
3862
# with gzip encoding (not always telling server we support it when we do)
3863
#3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail
3864
# (dictionary of 'name', 'url', 'email'); map author to author_detail if author
3865
# contains name + email address
3866
#3.0b8 - 1/28/2004 - MAP - added support for contributor
3867
#3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added
3868
# support for summary
3869
#3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from
3871
#3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain
3872
# dangerous markup; fiddled with decodeEntities (not right); liberalized
3873
# date parsing even further
3874
#3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right);
3875
# added support to Atom 0.2 subtitle; added support for Atom content model
3876
# in copyright; better sanitizing of dangerous HTML elements with end tags
3877
# (script, frameset)
3878
#3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img,
3879
# etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />)
3880
#3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under
3882
#3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS;
3883
# fixed bug capturing author and contributor URL; fixed bug resolving relative
3884
# links in author and contributor URL; fixed bug resolving relative links in
3885
# generator URL; added support for recognizing RSS 1.0; passed Simon Fell's
3886
# namespace tests, and included them permanently in the test suite with his
3887
# permission; fixed namespace handling under Python 2.1
3888
#3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15)
3889
#3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023
3890
#3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei);
3891
# use libxml2 (if available)
3892
#3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author
3893
# name was in parentheses; removed ultra-problematic mxTidy support; patch to
3894
# workaround crash in PyXML/expat when encountering invalid entities
3895
# (MarkMoraes); support for textinput/textInput
3896
#3.0b20 - 4/7/2004 - MAP - added CDF support
3897
#3.0b21 - 4/14/2004 - MAP - added Hot RSS support
3898
#3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in
3899
# results dict; changed results dict to allow getting values with results.key
3900
# as well as results[key]; work around embedded illformed HTML with half
3901
# a DOCTYPE; work around malformed Content-Type header; if character encoding
3902
# is wrong, try several common ones before falling back to regexes (if this
3903
# works, bozo_exception is set to CharacterEncodingOverride); fixed character
3904
# encoding issues in BaseHTMLProcessor by tracking encoding and converting
3905
# from Unicode to raw strings before feeding data to sgmllib.SGMLParser;
3906
# convert each value in results to Unicode (if possible), even if using
3907
# regex-based parsing
3908
#3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain
3909
# high-bit characters in attributes in embedded HTML in description (thanks
3910
# Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in
3911
# FeedParserDict; tweaked FeedParserDict.has_key to return True if asking
3912
# about a mapped key
3913
#3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and
3914
# results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could
3915
# cause the same encoding to be tried twice (even if it failed the first time);
3916
# fixed DOCTYPE stripping when DOCTYPE contained entity declarations;
3917
# better textinput and image tracking in illformed RSS 1.0 feeds
3918
#3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed
3919
# my blink tag tests
3920
#3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that
3921
# failed to parse utf-16 encoded feeds; made source into a FeedParserDict;
3922
# duplicate admin:generatorAgent/@rdf:resource in generator_detail.url;
3923
# added support for image; refactored parse() fallback logic to try other
3924
# encodings if SAX parsing fails (previously it would only try other encodings
3925
# if re-encoding failed); remove unichr madness in normalize_attrs now that
3926
# we're properly tracking encoding in and out of BaseHTMLProcessor; set
3927
# feed.language from root-level xml:lang; set entry.id from rdf:about;
3928
# send Accept header
3929
#3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between
3930
# iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are
3931
# windows-1252); fixed regression that could cause the same encoding to be
3932
# tried twice (even if it failed the first time)
3933
#3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types;
3934
# recover from malformed content-type header parameter with no equals sign
3935
# ('text/xml; charset:iso-8859-1')
3936
#3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities
3937
# to Unicode equivalents in illformed feeds (aaronsw); added and
3938
# passed tests for converting character entities to Unicode equivalents
3939
# in illformed feeds (aaronsw); test for valid parsers when setting
3940
# XML_AVAILABLE; make version and encoding available when server returns
3941
# a 304; add handlers parameter to pass arbitrary urllib2 handlers (like
3942
# digest auth or proxy support); add code to parse username/password
3943
# out of url and send as basic authentication; expose downloading-related
3944
# exceptions in bozo_exception (aaronsw); added __contains__ method to
3945
# FeedParserDict (aaronsw); added publisher_detail (aaronsw)
3946
#3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always
3947
# convert feed to UTF-8 before passing to XML parser; completely revamped
3948
# logic for determining character encoding and attempting XML parsing
3949
# (much faster); increased default timeout to 20 seconds; test for presence
3950
# of Location header on redirects; added tests for many alternate character
3951
# encodings; support various EBCDIC encodings; support UTF-16BE and
3952
# UTF-16LE with or without a BOM; support UTF-8 with a BOM; support
3953
# UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no
3954
# XML parsers are available; added support for 'Content-encoding: deflate';
3955
# send blank 'Accept-encoding: ' header if neither gzip nor zlib modules
3957
#3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure
3958
# problem tracking xml:base and xml:lang if element declares it, child
3959
# doesn't, first grandchild redeclares it, and second grandchild doesn't;
3960
# refactored date parsing; defined public registerDateHandler so callers
3961
# can add support for additional date formats at runtime; added support
3962
# for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added
3963
# zopeCompatibilityHack() which turns FeedParserDict into a regular
3964
# dictionary, required for Zope compatibility, and also makes command-
3965
# line debugging easier because pprint module formats real dictionaries
3966
# better than dictionary-like objects; added NonXMLContentType exception,
3967
# which is stored in bozo_exception when a feed is served with a non-XML
3968
# media type such as 'text/plain'; respect Content-Language as default
3969
# language if no xml:lang is present; cloud dict is now FeedParserDict;
3970
# generator dict is now FeedParserDict; better tracking of xml:lang,
3971
# including support for xml:lang='' to unset the current language;
3972
# recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default
3973
# namespace; don't overwrite final status on redirects (scenarios:
3974
# redirecting to a URL that returns 304, redirecting to a URL that
3975
# redirects to another URL with a different type of redirect); add
3976
# support for HTTP 303 redirects
3977
#4.0 - MAP - support for relative URIs in xml:base attribute; fixed
3978
# encoding issue with mxTidy (phopkins); preliminary support for RFC 3229;
3979
# support for Atom 1.0; support for iTunes extensions; new 'tags' for
3980
# categories/keywords/etc. as array of dict
3981
# {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0
3982
# terminology; parse RFC 822-style dates with no time; lots of other
3984
#4.1 - MAP - removed socket timeout; added support for chardet library
3985
#4.2 - MAP - added support for parsing microformats within content elements:
3986
# currently supports rel-tag (maps to 'tags'), rel-enclosure (maps to
3987
# 'enclosures'), XFN links within content elements (maps to 'xfn'),
3988
# and hCard (parses as vCard); bug [ 1481975 ] Misencoded utf-8/win-1252