| 1 | #!/usr/bin/env python |
|---|
| 2 | """Universal feed parser |
|---|
| 3 | |
|---|
| 4 | Handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds |
|---|
| 5 | |
|---|
| 6 | Visit http://feedparser.org/ for the latest version |
|---|
| 7 | Visit http://feedparser.org/docs/ for the latest documentation |
|---|
| 8 | |
|---|
| 9 | Required: Python 2.1 or later |
|---|
| 10 | Recommended: Python 2.3 or later |
|---|
| 11 | Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/> |
|---|
| 12 | """ |
|---|
| 13 | |
|---|
| 14 | __version__ = "4.1"# + "$Revision: 1.92 $"[11:15] + "-cvs" |
|---|
| 15 | __license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved. |
|---|
| 16 | |
|---|
| 17 | Redistribution and use in source and binary forms, with or without modification, |
|---|
| 18 | are permitted provided that the following conditions are met: |
|---|
| 19 | |
|---|
| 20 | * Redistributions of source code must retain the above copyright notice, |
|---|
| 21 | this list of conditions and the following disclaimer. |
|---|
| 22 | * Redistributions in binary form must reproduce the above copyright notice, |
|---|
| 23 | this list of conditions and the following disclaimer in the documentation |
|---|
| 24 | and/or other materials provided with the distribution. |
|---|
| 25 | |
|---|
| 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 'AS IS' |
|---|
| 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|---|
| 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|---|
| 29 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE |
|---|
| 30 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
|---|
| 31 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
|---|
| 32 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
|---|
| 33 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
|---|
| 34 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
|---|
| 35 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
|---|
| 36 | POSSIBILITY OF SUCH DAMAGE.""" |
|---|
| 37 | __author__ = "Mark Pilgrim <http://diveintomark.org/>" |
|---|
| 38 | __contributors__ = ["Jason Diamond <http://injektilo.org/>", |
|---|
| 39 | "John Beimler <http://john.beimler.org/>", |
|---|
| 40 | "Fazal Majid <http://www.majid.info/mylos/weblog/>", |
|---|
| 41 | "Aaron Swartz <http://aaronsw.com/>", |
|---|
| 42 | "Kevin Marks <http://epeus.blogspot.com/>"] |
|---|
# Debug switch. When true, the parser writes trace messages to sys.stderr and,
# if chardet can be imported, chardet's own debug flag is switched on as well.
_debug = 0

# HTTP "User-Agent" header to send to servers when downloading feeds.
# If you are embedding feedparser in a larger application, you should
# change this to your application name and URL.
USER_AGENT = "UniversalFeedParser/%s +http://feedparser.org/" % __version__

# HTTP "Accept" header to send to servers when downloading feeds.  If you don't
# want to send an Accept header, set this to None.
ACCEPT_HEADER = "application/atom+xml,application/rdf+xml,application/rss+xml,application/x-netcdf,application/xml;q=0.9,text/xml;q=0.2,*/*;q=0.1"

# List of preferred XML parsers, by SAX driver name.  These will be tried first,
# but if they're not installed, Python will keep searching through its own list
# of pre-installed parsers until it finds one that supports everything we need.
PREFERRED_XML_PARSERS = ["drv_libxml2"]

# If you want feedparser to automatically run HTML markup through HTML Tidy, set
# this to 1.  Requires mxTidy <http://www.egenix.com/files/python/mxTidy.html>
# or utidylib <http://utidylib.berlios.de/>.
TIDY_MARKUP = 0

# List of Python interfaces for HTML Tidy, in order of preference.  Only useful
# if TIDY_MARKUP = 1
PREFERRED_TIDY_INTERFACES = ["uTidy", "mxTidy"]
|---|
| 67 | |
|---|
| 68 | # ---------- required modules (should come with any Python distribution) ---------- |
|---|
| 69 | import sgmllib, re, sys, copy, urlparse, time, rfc822, types, cgi, urllib, urllib2 |
|---|
| 70 | try: |
|---|
| 71 | from cStringIO import StringIO as _StringIO |
|---|
| 72 | except: |
|---|
| 73 | from StringIO import StringIO as _StringIO |
|---|
| 74 | |
|---|
| 75 | # ---------- optional modules (feedparser will work without these, but with reduced functionality) ---------- |
|---|
| 76 | |
|---|
| 77 | # gzip is included with most Python distributions, but may not be available if you compiled your own |
|---|
| 78 | try: |
|---|
| 79 | import gzip |
|---|
| 80 | except: |
|---|
| 81 | gzip = None |
|---|
| 82 | try: |
|---|
| 83 | import zlib |
|---|
| 84 | except: |
|---|
| 85 | zlib = None |
|---|
| 86 | |
|---|
| 87 | # If a real XML parser is available, feedparser will attempt to use it. feedparser has |
|---|
| 88 | # been tested with the built-in SAX parser, PyXML, and libxml2. On platforms where the |
|---|
| 89 | # Python distribution does not come with an XML parser (such as Mac OS X 10.2 and some |
|---|
| 90 | # versions of FreeBSD), feedparser will quietly fall back on regex-based parsing. |
|---|
| 91 | try: |
|---|
| 92 | import xml.sax |
|---|
| 93 | xml.sax.make_parser(PREFERRED_XML_PARSERS) # test for valid parsers |
|---|
| 94 | from xml.sax.saxutils import escape as _xmlescape |
|---|
| 95 | _XML_AVAILABLE = 1 |
|---|
| 96 | except: |
|---|
| 97 | _XML_AVAILABLE = 0 |
|---|
| 98 | def _xmlescape(data): |
|---|
| 99 | data = data.replace('&', '&') |
|---|
| 100 | data = data.replace('>', '>') |
|---|
| 101 | data = data.replace('<', '<') |
|---|
| 102 | return data |
|---|
| 103 | |
|---|
| 104 | # base64 support for Atom feeds that contain embedded binary data |
|---|
| 105 | try: |
|---|
| 106 | import base64, binascii |
|---|
| 107 | except: |
|---|
| 108 | base64 = binascii = None |
|---|
| 109 | |
|---|
| 110 | # cjkcodecs and iconv_codec provide support for more character encodings. |
|---|
| 111 | # Both are available from http://cjkpython.i18n.org/ |
|---|
| 112 | try: |
|---|
| 113 | import cjkcodecs.aliases |
|---|
| 114 | except: |
|---|
| 115 | pass |
|---|
| 116 | try: |
|---|
| 117 | import iconv_codec |
|---|
| 118 | except: |
|---|
| 119 | pass |
|---|
| 120 | |
|---|
| 121 | # chardet library auto-detects character encodings |
|---|
| 122 | # Download from http://chardet.feedparser.org/ |
|---|
| 123 | try: |
|---|
| 124 | import chardet |
|---|
| 125 | if _debug: |
|---|
| 126 | import chardet.constants |
|---|
| 127 | chardet.constants._debug = 1 |
|---|
| 128 | except: |
|---|
| 129 | chardet = None |
|---|
| 130 | |
|---|
| 131 | # ---------- don't touch these ---------- |
|---|
class ThingsNobodyCaresAboutButMe(Exception):
    # common base class for the three conditions defined right below
    pass


class CharacterEncodingOverride(ThingsNobodyCaresAboutButMe):
    pass


class CharacterEncodingUnknown(ThingsNobodyCaresAboutButMe):
    pass


class NonXMLContentType(ThingsNobodyCaresAboutButMe):
    pass


class UndeclaredNamespace(Exception):
    # derives from Exception directly, not from ThingsNobodyCaresAboutButMe
    pass
|---|
| 137 | |
|---|
# Replace three of sgmllib's module-level regular expressions with the
# variants this parser wants.
# NOTE(review): presumably these loosen the stock patterns so that XML-style
# names and hexadecimal character references are recognised -- confirm against
# the sgmllib version in use.
# tag names: a letter followed by letters, digits, '-', '_', '.' or ':'
sgmllib.tagfind = re.compile('[a-zA-Z][-_.:a-zA-Z0-9]*')
# start of a markup declaration
sgmllib.special = re.compile('<!')
# character reference: optional 'x' then hex digits; note that the group also
# matches hex digits without the 'x' prefix
sgmllib.charref = re.compile('&#(x?[0-9A-Fa-f]+)[^0-9A-Fa-f]')
|---|
| 141 | |
|---|
# Maps the short version identifiers stored in a parser's `version` attribute
# (for example 'rss090', 'rss10' or 'atom10') to human-readable format names.
# The empty string is the identifier used before any version has been detected.
SUPPORTED_VERSIONS = {'': 'unknown',
                      'rss090': 'RSS 0.90',
                      'rss091n': 'RSS 0.91 (Netscape)',
                      'rss091u': 'RSS 0.91 (Userland)',
                      'rss092': 'RSS 0.92',
                      'rss093': 'RSS 0.93',
                      'rss094': 'RSS 0.94',
                      'rss20': 'RSS 2.0',
                      'rss10': 'RSS 1.0',
                      'rss': 'RSS (unknown version)',
                      'atom01': 'Atom 0.1',
                      'atom02': 'Atom 0.2',
                      'atom03': 'Atom 0.3',
                      'atom10': 'Atom 1.0',
                      'atom': 'Atom (unknown version)',
                      'cdf': 'CDF',
                      'hotrss': 'Hot RSS'
                      }
|---|
| 160 | |
|---|
# Pick the base class for FeedParserDict: the built-in dict type when the
# interpreter has one, otherwise the UserDict class plus a minimal dict().
try:
    UserDict = dict
except NameError:
    # Python 2.1 does not have dict
    from UserDict import UserDict
    def dict(aList):
        # build a plain dictionary from a sequence of (key, value) pairs
        mapping = {}
        for key, value in aList:
            mapping[key] = value
        return mapping
|---|
| 171 | |
|---|
class FeedParserDict(UserDict):
    """Dictionary whose items can also be read and written as attributes and
    looked up under alternative key names.

    `keymap` maps an alternative key to the key the value is really stored
    under; when it maps to a list, a lookup returns the first listed key that
    is present and a store uses the first listed key.  'category' and
    'categories' are virtual, read-only keys computed from the 'tags' item.
    """

    keymap = {'channel': 'feed',
              'items': 'entries',
              'guid': 'id',
              'date': 'updated',
              'date_parsed': 'updated_parsed',
              'description': ['subtitle', 'summary'],
              'url': ['href'],
              'modified': 'updated',
              'modified_parsed': 'updated_parsed',
              'issued': 'published',
              'issued_parsed': 'published_parsed',
              'copyright': 'rights',
              'copyright_detail': 'rights_detail',
              'tagline': 'subtitle',
              'tagline_detail': 'subtitle_detail'}

    def __getitem__(self, key):
        """Return the value for key, honouring virtual keys and keymap.

        Raises KeyError when nothing is stored under key or any of the keys
        it maps to (IndexError for 'category' when 'tags' is an empty list).
        """
        if key == 'category':
            return UserDict.__getitem__(self, 'tags')[0]['term']
        if key == 'categories':
            return [(tag['scheme'], tag['term']) for tag in UserDict.__getitem__(self, 'tags')]
        realkey = self.keymap.get(key, key)
        if type(realkey) == type([]):
            # several candidate targets: the first one present wins, then the
            # literal key itself
            for k in realkey + [key]:
                try:
                    return UserDict.__getitem__(self, k)
                except KeyError:
                    pass
            # report the missing key as KeyError; passing the (unhashable)
            # list on to the underlying mapping would raise TypeError instead
            raise KeyError(key)
        try:
            return UserDict.__getitem__(self, key)
        except KeyError:
            pass
        return UserDict.__getitem__(self, realkey)

    def __setitem__(self, key, value):
        """Store value under the key that keymap designates for key."""
        key = self.keymap.get(key, key)
        if type(key) == type([]):
            key = key[0]
        return UserDict.__setitem__(self, key, value)

    def get(self, key, default=None):
        """Return self[key], or default when that lookup fails."""
        try:
            return self[key]
        except (KeyError, IndexError, TypeError):
            return default

    def setdefault(self, key, value):
        """Store value under key unless the key is already present; return self[key]."""
        if not self.has_key(key):
            self[key] = value
        return self[key]

    def has_key(self, key):
        """Return true if self[key] would succeed.

        The check goes through __getitem__ (so mapped and virtual keys count)
        and deliberately ignores real attributes of the object: names such as
        'keys', 'items' or 'update' are only reported when actually stored.
        """
        try:
            self[key]
        except (KeyError, IndexError, TypeError):
            return False
        return True

    def __getattr__(self, key):
        """Expose stored items as attributes.

        Only called when normal attribute lookup fails.  Names starting with
        an underscore are never looked up as items.  Raises AttributeError
        when the name cannot be resolved.
        """
        try:
            return self.__dict__[key]
        except KeyError:
            pass
        if not key.startswith('_'):
            try:
                return self[key]
            except (KeyError, IndexError, TypeError):
                pass
        raise AttributeError("object has no attribute '%s'" % key)

    def __setattr__(self, key, value):
        """Store public attributes as items.

        Names starting with an underscore, and the name 'data', are kept as
        real instance attributes instead.
        """
        if key.startswith('_') or key == 'data':
            self.__dict__[key] = value
        else:
            return self.__setitem__(key, value)

    def __contains__(self, key):
        """Support `key in d` with the same semantics as has_key()."""
        return self.has_key(key)
|---|
| 246 | |
|---|
def zopeCompatibilityHack():
    """Replace the module-level FeedParserDict with a plain-dict factory.

    After the call, FeedParserDict(aDict) returns an ordinary dictionary
    holding a copy of aDict's items (or an empty one when aDict is omitted
    or empty).
    """
    global FeedParserDict
    del FeedParserDict

    def FeedParserDict(aDict=None):
        plain = {}
        if aDict:
            plain.update(aDict)
        return plain
|---|
| 255 | |
|---|
| 256 | _ebcdic_to_ascii_map = None |
|---|
| 257 | def _ebcdic_to_ascii(s): |
|---|
| 258 | global _ebcdic_to_ascii_map |
|---|
| 259 | if not _ebcdic_to_ascii_map: |
|---|
| 260 | emap = ( |
|---|
| 261 | 0,1,2,3,156,9,134,127,151,141,142,11,12,13,14,15, |
|---|
| 262 | 16,17,18,19,157,133,8,135,24,25,146,143,28,29,30,31, |
|---|
| 263 | 128,129,130,131,132,10,23,27,136,137,138,139,140,5,6,7, |
|---|
| 264 | 144,145,22,147,148,149,150,4,152,153,154,155,20,21,158,26, |
|---|
| 265 | 32,160,161,162,163,164,165,166,167,168,91,46,60,40,43,33, |
|---|
| 266 | 38,169,170,171,172,173,174,175,176,177,93,36,42,41,59,94, |
|---|
| 267 | 45,47,178,179,180,181,182,183,184,185,124,44,37,95,62,63, |
|---|
| 268 | 186,187,188,189,190,191,192,193,194,96,58,35,64,39,61,34, |
|---|
| 269 | 195,97,98,99,100,101,102,103,104,105,196,197,198,199,200,201, |
|---|
| 270 | 202,106,107,108,109,110,111,112,113,114,203,204,205,206,207,208, |
|---|
| 271 | 209,126,115,116,117,118,119,120,121,122,210,211,212,213,214,215, |
|---|
| 272 | 216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231, |
|---|
| 273 | 123,65,66,67,68,69,70,71,72,73,232,233,234,235,236,237, |
|---|
| 274 | 125,74,75,76,77,78,79,80,81,82,238,239,240,241,242,243, |
|---|
| 275 | 92,159,83,84,85,86,87,88,89,90,244,245,246,247,248,249, |
|---|
| 276 | 48,49,50,51,52,53,54,55,56,57,250,251,252,253,254,255 |
|---|
| 277 | ) |
|---|
| 278 | import string |
|---|
| 279 | _ebcdic_to_ascii_map = string.maketrans( \ |
|---|
| 280 | ''.join(map(chr, range(256))), ''.join(map(chr, emap))) |
|---|
| 281 | return s.translate(_ebcdic_to_ascii_map) |
|---|
| 282 | |
|---|
| 283 | _urifixer = re.compile('^([A-Za-z][A-Za-z0-9+-.]*://)(/*)(.*?)') |
|---|
| 284 | def _urljoin(base, uri): |
|---|
| 285 | uri = _urifixer.sub(r'\1\3', uri) |
|---|
| 286 | return urlparse.urljoin(base, uri) |
|---|
| 287 | |
|---|
| 288 | class _FeedParserMixin: |
|---|
| 289 | namespaces = {'': '', |
|---|
| 290 | 'http://backend.userland.com/rss': '', |
|---|
| 291 | 'http://blogs.law.harvard.edu/tech/rss': '', |
|---|
| 292 | 'http://purl.org/rss/1.0/': '', |
|---|
| 293 | 'http://my.netscape.com/rdf/simple/0.9/': '', |
|---|
| 294 | 'http://example.com/newformat#': '', |
|---|
| 295 | 'http://example.com/necho': '', |
|---|
| 296 | 'http://purl.org/echo/': '', |
|---|
| 297 | 'uri/of/echo/namespace#': '', |
|---|
| 298 | 'http://purl.org/pie/': '', |
|---|
| 299 | 'http://purl.org/atom/ns#': '', |
|---|
| 300 | 'http://www.w3.org/2005/Atom': '', |
|---|
| 301 | 'http://purl.org/rss/1.0/modules/rss091#': '', |
|---|
| 302 | |
|---|
| 303 | 'http://webns.net/mvcb/': 'admin', |
|---|
| 304 | 'http://purl.org/rss/1.0/modules/aggregation/': 'ag', |
|---|
| 305 | 'http://purl.org/rss/1.0/modules/annotate/': 'annotate', |
|---|
| 306 | 'http://media.tangent.org/rss/1.0/': 'audio', |
|---|
| 307 | 'http://backend.userland.com/blogChannelModule': 'blogChannel', |
|---|
| 308 | 'http://web.resource.org/cc/': 'cc', |
|---|
| 309 | 'http://backend.userland.com/creativeCommonsRssModule': 'creativeCommons', |
|---|
| 310 | 'http://purl.org/rss/1.0/modules/company': 'co', |
|---|
| 311 | 'http://purl.org/rss/1.0/modules/content/': 'content', |
|---|
| 312 | 'http://my.theinfo.org/changed/1.0/rss/': 'cp', |
|---|
| 313 | 'http://purl.org/dc/elements/1.1/': 'dc', |
|---|
| 314 | 'http://purl.org/dc/terms/': 'dcterms', |
|---|
| 315 | 'http://purl.org/rss/1.0/modules/email/': 'email', |
|---|
| 316 | 'http://purl.org/rss/1.0/modules/event/': 'ev', |
|---|
| 317 | 'http://rssnamespace.org/feedburner/ext/1.0': 'feedburner', |
|---|
| 318 | 'http://freshmeat.net/rss/fm/': 'fm', |
|---|
| 319 | 'http://xmlns.com/foaf/0.1/': 'foaf', |
|---|
| 320 | 'http://www.w3.org/2003/01/geo/wgs84_pos#': 'geo', |
|---|
| 321 | 'http://postneo.com/icbm/': 'icbm', |
|---|
| 322 | 'http://purl.org/rss/1.0/modules/image/': 'image', |
|---|
| 323 | 'http://www.itunes.com/DTDs/PodCast-1.0.dtd': 'itunes', |
|---|
| 324 | 'http://example.com/DTDs/PodCast-1.0.dtd': 'itunes', |
|---|
| 325 | 'http://purl.org/rss/1.0/modules/link/': 'l', |
|---|
| 326 | 'http://search.yahoo.com/mrss': 'media', |
|---|
| 327 | 'http://madskills.com/public/xml/rss/module/pingback/': 'pingback', |
|---|
| 328 | 'http://prismstandard.org/namespaces/1.2/basic/': 'prism', |
|---|
| 329 | 'http://www.w3.org/1999/02/22-rdf-syntax-ns#': 'rdf', |
|---|
| 330 | 'http://www.w3.org/2000/01/rdf-schema#': 'rdfs', |
|---|
| 331 | 'http://purl.org/rss/1.0/modules/reference/': 'ref', |
|---|
| 332 | 'http://purl.org/rss/1.0/modules/richequiv/': 'reqv', |
|---|
| 333 | 'http://purl.org/rss/1.0/modules/search/': 'search', |
|---|
| 334 | 'http://purl.org/rss/1.0/modules/slash/': 'slash', |
|---|
| 335 | 'http://schemas.xmlsoap.org/soap/envelope/': 'soap', |
|---|
| 336 | 'http://purl.org/rss/1.0/modules/servicestatus/': 'ss', |
|---|
| 337 | 'http://hacks.benhammersley.com/rss/streaming/': 'str', |
|---|
| 338 | 'http://purl.org/rss/1.0/modules/subscription/': 'sub', |
|---|
| 339 | 'http://purl.org/rss/1.0/modules/syndication/': 'sy', |
|---|
| 340 | 'http://purl.org/rss/1.0/modules/taxonomy/': 'taxo', |
|---|
| 341 | 'http://purl.org/rss/1.0/modules/threading/': 'thr', |
|---|
| 342 | 'http://purl.org/rss/1.0/modules/textinput/': 'ti', |
|---|
| 343 | 'http://madskills.com/public/xml/rss/module/trackback/':'trackback', |
|---|
| 344 | 'http://wellformedweb.org/commentAPI/': 'wfw', |
|---|
| 345 | 'http://purl.org/rss/1.0/modules/wiki/': 'wiki', |
|---|
| 346 | 'http://www.w3.org/1999/xhtml': 'xhtml', |
|---|
| 347 | 'http://www.w3.org/XML/1998/namespace': 'xml', |
|---|
| 348 | 'http://schemas.pocketsoap.com/rss/myDescModule/': 'szf' |
|---|
| 349 | } |
|---|
| 350 | _matchnamespaces = {} |
|---|
| 351 | |
|---|
| 352 | can_be_relative_uri = ['link', 'id', 'wfw_comment', 'wfw_commentrss', 'docs', 'url', 'href', 'comments', 'license', 'icon', 'logo'] |
|---|
| 353 | can_contain_relative_uris = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] |
|---|
| 354 | can_contain_dangerous_markup = ['content', 'title', 'summary', 'info', 'tagline', 'subtitle', 'copyright', 'rights', 'description'] |
|---|
| 355 | html_types = ['text/html', 'application/xhtml+xml'] |
|---|
| 356 | |
|---|
| 357 | def __init__(self, baseuri=None, baselang=None, encoding='utf-8'): |
|---|
| 358 | if _debug: sys.stderr.write('initializing FeedParser\n') |
|---|
| 359 | if not self._matchnamespaces: |
|---|
| 360 | for k, v in self.namespaces.items(): |
|---|
| 361 | self._matchnamespaces[k.lower()] = v |
|---|
| 362 | self.feeddata = FeedParserDict() # feed-level data |
|---|
| 363 | self.encoding = encoding # character encoding |
|---|
| 364 | self.entries = [] # list of entry-level data |
|---|
| 365 | self.version = '' # feed type/version, see SUPPORTED_VERSIONS |
|---|
| 366 | self.namespacesInUse = {} # dictionary of namespaces defined by the feed |
|---|
| 367 | |
|---|
| 368 | # the following are used internally to track state; |
|---|
| 369 | # this is really out of control and should be refactored |
|---|
| 370 | self.infeed = 0 |
|---|
| 371 | self.inentry = 0 |
|---|
| 372 | self.incontent = 0 |
|---|
| 373 | self.intextinput = 0 |
|---|
| 374 | self.inimage = 0 |
|---|
| 375 | self.inauthor = 0 |
|---|
| 376 | self.incontributor = 0 |
|---|
| 377 | self.inpublisher = 0 |
|---|
| 378 | self.insource = 0 |
|---|
| 379 | self.sourcedata = FeedParserDict() |
|---|
| 380 | self.contentparams = FeedParserDict() |
|---|
| 381 | self._summaryKey = None |
|---|
| 382 | self.namespacemap = {} |
|---|
| 383 | self.elementstack = [] |
|---|
| 384 | self.basestack = [] |
|---|
| 385 | self.langstack = [] |
|---|
| 386 | self.baseuri = baseuri or '' |
|---|
| 387 | self.lang = baselang or None |
|---|
| 388 | if baselang: |
|---|
| 389 | self.feeddata['language'] = baselang |
|---|
| 390 | |
|---|
def unknown_starttag(self, tag, attrs):
    """Handle the start of an element.

    tag   -- element name, possibly 'prefix:name'
    attrs -- sequence of (name, value) pairs

    Updates the xml:base / xml:lang stacks, registers namespace declarations,
    and then either echoes the tag into inline XHTML content or dispatches to
    a '_start_<prefix>_<name>' method, falling back to push().
    """
    if _debug:
        sys.stderr.write('start %s with %s\n' % (tag, attrs))

    # normalize attrs: names are lowercased, and so are the values of
    # 'rel' and 'type'
    normalized = []
    for name, value in attrs:
        name = name.lower()
        if name in ('rel', 'type'):
            value = value.lower() or value
        normalized.append((name, value))
    attrs = normalized
    attrsD = dict(attrs)

    # track xml:base and xml:lang
    declared_base = attrsD.get('xml:base', attrsD.get('base'))
    if not declared_base:
        declared_base = self.baseuri
    self.baseuri = _urljoin(self.baseuri, declared_base)
    lang = attrsD.get('xml:lang', attrsD.get('lang'))
    if lang == '':
        # xml:lang could be explicitly set to '', we need to capture that
        lang = None
    elif lang is None:
        # if no xml:lang is specified, use parent lang
        lang = self.lang
    if lang and tag in ('feed', 'rss', 'rdf:RDF'):
        self.feeddata['language'] = lang
    self.lang = lang
    self.basestack.append(self.baseuri)
    self.langstack.append(lang)

    # track namespaces
    for name, uri in attrs:
        if name.startswith('xmlns:'):
            self.trackNamespace(name[6:], uri)
        elif name == 'xmlns':
            self.trackNamespace(None, uri)

    # track inline content
    if self.incontent:
        params = self.contentparams
        if params.has_key('type') and not params.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            params['type'] = 'application/xhtml+xml'
        if params.get('type') == 'application/xhtml+xml':
            # Note: probably shouldn't simply recreate localname here, but
            # our namespace handling isn't actually 100% correct in cases where
            # the feed redefines the default namespace (which is actually
            # the usual case for inline content, thanks Sam), so here we
            # cheat and just reconstruct the element based on localname
            # because that compensates for the bugs in our namespace handling.
            # This will horribly munge inline content with non-empty qnames,
            # but nobody actually does that, so I'm not fixing it.
            localname = tag.split(':')[-1]
            rendered = ''.join([' %s="%s"' % pair for pair in attrs])
            return self.handle_data('<%s%s>' % (localname, rendered), escape=0)

    # match namespaces
    if ':' in tag:
        prefix, suffix = tag.split(':', 1)
    else:
        prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + '_'
    else:
        # special hack for better tracking of empty textinput/image elements in illformed feeds
        if tag not in ('title', 'link', 'description', 'name'):
            self.intextinput = 0
        if tag not in ('title', 'link', 'description', 'url', 'href', 'width', 'height'):
            self.inimage = 0

    # call special handler (if defined) or default handler; note that an
    # AttributeError raised inside the handler itself also lands in push()
    element = prefix + suffix
    try:
        handler = getattr(self, '_start_' + element)
        return handler(attrsD)
    except AttributeError:
        return self.push(element, 1)
|---|
| 460 | |
|---|
def unknown_endtag(self, tag):
    """Handle the end of an element.

    Dispatches to an '_end_<prefix>_<name>' method (falling back to pop()),
    echoes the closing tag into inline XHTML content when applicable, and
    restores the base URI and language of the enclosing element.
    """
    if _debug:
        sys.stderr.write('end %s\n' % tag)

    # match namespaces
    if ':' in tag:
        prefix, suffix = tag.split(':', 1)
    else:
        prefix, suffix = '', tag
    prefix = self.namespacemap.get(prefix, prefix)
    if prefix:
        prefix = prefix + '_'
    element = prefix + suffix

    # call special handler (if defined) or default handler; an AttributeError
    # raised inside the handler itself also ends up in pop()
    try:
        handler = getattr(self, '_end_' + element)
        handler()
    except AttributeError:
        self.pop(element)

    # track inline content
    if self.incontent:
        params = self.contentparams
        if params.has_key('type') and not params.get('type', 'xml').endswith('xml'):
            # element declared itself as escaped markup, but it isn't really
            params['type'] = 'application/xhtml+xml'
        if params.get('type') == 'application/xhtml+xml':
            localname = tag.split(':')[-1]
            self.handle_data('</%s>' % localname, escape=0)

    # track xml:base and xml:lang going out of scope
    bases = self.basestack
    if bases:
        bases.pop()
        # an empty base on the stack leaves the current base URI untouched
        if bases and bases[-1]:
            self.baseuri = bases[-1]
    langs = self.langstack
    if langs:
        langs.pop()
        if langs: # and (self.langstack[-1] is not None):
            self.lang = langs[-1]
|---|
| 497 | |
|---|
def handle_charref(self, ref):
    """Append the text for a numeric character reference to the open element.

    Called for each character reference, e.g. for '&#160;', ref will be '160'.
    References to the five markup-significant characters (quote, ampersand,
    apostrophe, less-than, greater-than) are kept as references; any other
    is replaced by the UTF-8 encoding of the character.  A reference that is
    not a valid number, or names a code point that cannot be represented, is
    kept as literal reference text instead of aborting the parse, mirroring
    how handle_entityref treats unknown entity names.
    """
    if not self.elementstack: return
    ref = ref.lower()
    if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
        text = '&#%s;' % ref
    else:
        try:
            if ref.startswith('x'):
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        except (ValueError, OverflowError):
            # malformed digits (e.g. hex digits without the 'x' prefix) or a
            # code point outside the supported range
            text = '&#%s;' % ref
    self.elementstack[-1][2].append(text)
|---|
| 511 | |
|---|
def handle_entityref(self, ref):
    """Append the text for a named entity reference to the open element.

    Called for each entity reference, e.g. for '&copy;', ref will be 'copy'.
    The five predefined XML entities and names unknown to htmlentitydefs are
    kept as '&name;'; every other entity becomes the UTF-8 encoding of its
    character.
    """
    if not self.elementstack: return
    if _debug: sys.stderr.write('entering handle_entityref with %s\n' % ref)
    if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
        text = '&%s;' % ref
    else:
        # entity resolution graciously donated by Aaron Swartz
        import htmlentitydefs
        try:
            if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                codepoint = htmlentitydefs.name2codepoint[ref]
            else:
                definition = htmlentitydefs.entitydefs[ref]
                if definition.startswith('&#') and definition.endswith(';'):
                    codepoint = int(definition[2:-1]) # not in latin-1
                else:
                    codepoint = ord(definition)
        except KeyError:
            text = '&%s;' % ref
        else:
            text = unichr(codepoint).encode('utf-8')
    self.elementstack[-1][2].append(text)
|---|
| 532 | |
|---|
def handle_data(self, text, escape=1):
    """Append a block of text to the innermost open element.

    Called for each block of plain text, i.e. outside of any tag and not
    containing any character or entity references.  Text is XML-escaped first
    when escape is true and the current content type is
    'application/xhtml+xml'.  Does nothing when no element is open.
    """
    stack = self.elementstack
    if not stack:
        return
    if escape:
        if self.contentparams.get('type') == 'application/xhtml+xml':
            text = _xmlescape(text)
    pieces = stack[-1][2]
    pieces.append(text)
|---|
| 540 | |
|---|
def handle_comment(self, text):
    """Ignore a comment.

    Called for each comment, e.g. <!-- insert message here -->
    """
    return None
|---|
| 544 | |
|---|
def handle_pi(self, text):
    """Ignore a processing instruction.

    Called for each processing instruction, e.g. <?instruction>
    """
    return None
|---|
| 548 | |
|---|
def handle_decl(self, text):
    """Ignore a markup declaration; the text is discarded."""
    return None
|---|
| 551 | |
|---|
def parse_declaration(self, i):
    """Parse the markup declaration that starts at self.rawdata[i].

    Overrides the internal declaration handler to handle CDATA blocks: the
    content of a '<![CDATA[...]]>' section is escaped and passed to
    handle_data; an unterminated section runs to the end of the buffer.  Any
    other declaration is skipped up to and including the next '>'.

    Returns the index at which parsing should resume, or -1 when a non-CDATA
    declaration has no closing '>' yet.  (Returning find() + 1 in that case
    would yield 0 and send the caller back to the start of the buffer.)
    """
    if _debug: sys.stderr.write('entering parse_declaration\n')
    rawdata = self.rawdata
    if rawdata[i:i+9] == '<![CDATA[':
        k = rawdata.find(']]>', i)
        if k == -1: k = len(rawdata)
        self.handle_data(_xmlescape(rawdata[i+9:k]), 0)
        return k+3
    else:
        k = rawdata.find('>', i)
        if k == -1:
            # incomplete declaration: a negative result tells the scanning
            # loop to stop here instead of resuming at offset 0
            return -1
        return k+1
|---|
| 563 | |
|---|
def mapContentType(self, contentType):
    """Return contentType lowercased, with the shorthands 'text', 'html' and
    'xhtml' expanded to their full MIME types."""
    shorthands = {
        'text': 'text/plain',
        'html': 'text/html',
        'xhtml': 'application/xhtml+xml',
    }
    lowered = contentType.lower()
    return shorthands.get(lowered, lowered)
|---|
| 573 | |
|---|
def trackNamespace(self, prefix, uri):
    """Record a namespace declaration.

    prefix -- declared prefix, or None for the default namespace
    uri    -- namespace URI (compared case-insensitively)

    May set self.version when it is still empty and the URI identifies RSS
    0.90, RSS 1.0 or Atom 1.0.  URIs listed in self._matchnamespaces are
    mapped to their canonical prefix in self.namespacemap; all declarations
    are recorded in self.namespacesInUse.
    """
    loweruri = uri.lower()

    # the first recognised namespace decides the version
    if not self.version:
        if (prefix, loweruri) == (None, 'http://my.netscape.com/rdf/simple/0.9/'):
            self.version = 'rss090'
        elif loweruri == 'http://purl.org/rss/1.0/':
            self.version = 'rss10'
        elif loweruri == 'http://www.w3.org/2005/atom':
            self.version = 'atom10'

    if loweruri.find('backend.userland.com/rss') != -1:
        # match any backend.userland.com namespace
        uri = 'http://backend.userland.com/rss'
        loweruri = uri

    known = self._matchnamespaces
    if known.has_key(loweruri):
        canonical = known[loweruri]
        self.namespacemap[prefix] = canonical
        self.namespacesInUse[canonical] = uri
    else:
        self.namespacesInUse[prefix or ''] = uri
|---|
| 591 | |
|---|
def resolveURI(self, uri):
    """Resolve uri against the current base URI ('' when none is set)."""
    base = self.baseuri
    if not base:
        base = ''
    return _urljoin(base, uri)
|---|
| 594 | |
|---|
def decodeEntities(self, element, data):
    """Return data unchanged; element is not used by this implementation."""
    result = data
    return result
|---|
| 597 | |
|---|
def push(self, element, expectingText):
    """Open a new frame on the element stack.

    Each frame is a list [element, expectingText, pieces], where pieces
    starts out as an empty list.
    """
    frame = [element, expectingText, []]
    self.elementstack.append(frame)
|---|
| 600 | |
|---|
def pop(self, element, stripWhitespace=1):
    """Close the innermost element frame and file its text in the result.

    Returns None when the element stack is empty or its top frame is not
    `element`.  Otherwise the collected text pieces are joined (and stripped
    unless stripWhitespace is false).  Frames pushed with a false
    expectingText flag return that text immediately.  For all other frames
    the text is base64-decoded, URI-resolved, entity-decoded and sanitized
    as applicable, converted to unicode when an encoding is known, stored
    on the current entry or feed/source context, and returned.
    """
    if not self.elementstack: return
    if self.elementstack[-1][0] != element: return

    element, expectingText, pieces = self.elementstack.pop()
    output = ''.join(pieces)
    if stripWhitespace:
        output = output.strip()
    if not expectingText: return output

    # decode base64 content; undecodable data is deliberately kept as-is
    if base64 and self.contentparams.get('base64', 0):
        try:
            output = base64.decodestring(output)
        except (binascii.Error, binascii.Incomplete):
            pass

    # resolve relative URIs
    if (element in self.can_be_relative_uri) and output:
        output = self.resolveURI(output)

    # decode entities within embedded markup
    if not self.contentparams.get('base64', 0):
        output = self.decodeEntities(element, output)

    # remove temporary cruft from contentparams
    for cruft in ('mode', 'base64'):
        try:
            del self.contentparams[cruft]
        except KeyError:
            pass

    # embedded markup: resolve relative URIs first, then sanitize.
    # The content type is looked up once; neither step changes contentparams.
    if self.mapContentType(self.contentparams.get('type', 'text/html')) in self.html_types:
        if element in self.can_contain_relative_uris:
            output = _resolveRelativeURIs(output, self.baseuri, self.encoding)
        if element in self.can_contain_dangerous_markup:
            output = _sanitizeHTML(output, self.encoding)

    if self.encoding and not isinstance(output, type(u'')):
        try:
            output = unicode(output, self.encoding)
        except (UnicodeError, LookupError, TypeError, ValueError):
            # best effort: keep the undecoded string, but no longer
            # swallow unrelated errors such as KeyboardInterrupt
            pass

    # categories/tags/keywords/whatever are handled in _end_category
    if element == 'category':
        return output

    # store output in appropriate place(s)
    if self.inentry and not self.insource:
        if element == 'content':
            self.entries[-1].setdefault(element, [])
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            self.entries[-1][element].append(contentparams)
        elif element == 'link':
            self.entries[-1][element] = output
            if output:
                self.entries[-1]['links'][-1]['href'] = output
        else:
            if element == 'description':
                element = 'summary'
            self.entries[-1][element] = output
            if self.incontent:
                contentparams = copy.deepcopy(self.contentparams)
                contentparams['value'] = output
                self.entries[-1][element + '_detail'] = contentparams
    elif (self.infeed or self.insource) and (not self.intextinput) and (not self.inimage):
        context = self._getContext()
        if element == 'description':
            element = 'subtitle'
        context[element] = output
        if element == 'link':
            context['links'][-1]['href'] = output
        elif self.incontent:
            contentparams = copy.deepcopy(self.contentparams)
            contentparams['value'] = output
            context[element + '_detail'] = contentparams
    return output
|---|
| 689 | |
|---|
def pushContent(self, tag, attrsD, defaultContentType, expectingText):
    """Enter a content-bearing element: rebuild contentparams and push a frame for tag."""
    self.incontent += 1
    declared_type = attrsD.get('type', defaultContentType)
    params = FeedParserDict({
        'type': self.mapContentType(declared_type),
        'language': self.lang,
        'base': self.baseuri})
    # publish the new params before asking _isBase64 about them
    self.contentparams = params
    params['base64'] = self._isBase64(attrsD, params)
    self.push(tag, expectingText)
|---|
| 698 | |
|---|
def popContent(self, tag):
    """Pop tag via pop(), leave one level of content nesting, and empty contentparams."""
    result = self.pop(tag)
    self.incontent = self.incontent - 1
    self.contentparams.clear()
    return result
|---|
| 704 | |
|---|
| 705 | def _mapToStandardPrefix(self, name): |
|---|
| 706 | colonpos = name.find(':') |
|---|
| 707 | if colonpos <> -1: |
|---|
| 708 | prefix = name[:colonpos] |
|---|
| 709 | suffix = name[colonpos+1:] |
|---|
| 710 | prefix = self.namespacemap.get(prefix, prefix) |
|---|
| 711 | name = prefix + ':' + suffix |
|---|
| 712 | return name |
|---|
| 713 | |
|---|
| 714 | def _getAttribute(self, attrsD, name): |
|---|
| 715 | return attrsD.get(self._mapToStandardPrefix(name)) |
|---|
| 716 | |
|---|
| 717 | def _isBase64(self, attrsD, contentparams): |
|---|
| 718 | if attrsD.get('mode', '') == 'base64': |
|---|
| 719 | return 1 |
|---|
| 720 | if self.contentparams['type'].startswith('text/'): |
|---|
| 721 | return 0 |
|---|
| 722 | if self.contentparams['type'].endswith('+xml'): |
|---|
| 723 | return 0 |
|---|
| 724 | if self.contentparams['type'].endswith('/xml'): |
|---|
| 725 | return 0 |
|---|
| 726 | return 1 |
|---|
| 727 | |
|---|
| 728 | def _itsAnHrefDamnIt(self, attrsD): |
|---|
| 729 | href = attrsD.get('url', attrsD.get('uri', attrsD.get('href', None))) |
|---|
| 730 | if href: |
|---|
| 731 | try: |
|---|
| 732 | del attrsD['url'] |
|---|
| 733 | except KeyError: |
|---|
| 734 | pass |
|---|
| 735 | try: |
|---|
| 736 | del attrsD['uri'] |
|---|
| 737 | except KeyError: |
|---|
| 738 | pass |
|---|
| 739 | attrsD['href'] = href |
|---|
| 740 | return attrsD |
|---|
| 741 | |
|---|
| 742 | def _save(self, key, value): |
|---|
| 743 | context = self._getContext() |
|---|
| 744 | context.setdefault(key, value) |
|---|
| 745 | |
|---|
| 746 | def _start_rss(self, attrsD): |
|---|
| 747 | versionmap = {'0.91': 'rss091u', |
|---|
| 748 | '0.92': 'rss092', |
|---|
| 749 | '0.93': 'rss093', |
|---|
| 750 | '0.94': 'rss094'} |
|---|
| 751 | if not self.version: |
|---|
| 752 | attr_version = attrsD.get('version', '') |
|---|
| 753 | version = versionmap.get(attr_version) |
|---|
| 754 | if version: |
|---|
| 755 | self.version = version |
|---|
| 756 | elif attr_version.startswith('2.'): |
|---|
| 757 | self.version = 'rss20' |
|---|
| 758 | else: |
|---|
| 759 | self.version = 'rss' |
|---|
| 760 | |
|---|
| 761 | def _start_dlhottitles(self, attrsD): |
|---|
| 762 | self.version = 'hotrss' |
|---|
| 763 | |
|---|
| 764 | def _start_channel(self, attrsD): |
|---|
| 765 | self.infeed = 1 |
|---|
| 766 | self._cdf_common(attrsD) |
|---|
| 767 | _start_feedinfo = _start_channel |
|---|
| 768 | |
|---|
| 769 | def _cdf_common(self, attrsD): |
|---|
| 770 | if attrsD.has_key('lastmod'): |
|---|
| 771 | self._start_modified({}) |
|---|
| 772 | self.elementstack[-1][-1] = attrsD['lastmod'] |
|---|
| 773 | self._end_modified() |
|---|
| 774 | if attrsD.has_key('href'): |
|---|
| 775 | self._start_link({}) |
|---|
| 776 | self.elementstack[-1][-1] = attrsD['href'] |
|---|
| 777 | self._end_link() |
|---|
| 778 | |
|---|
| 779 | def _start_feed(self, attrsD): |
|---|
| 780 | self.infeed = 1 |
|---|
| 781 | versionmap = {'0.1': 'atom01', |
|---|
| 782 | '0.2': 'atom02', |
|---|
| 783 | '0.3': 'atom03'} |
|---|
| 784 | if not self.version: |
|---|
| 785 | attr_version = attrsD.get('version') |
|---|
| 786 | version = versionmap.get(attr_version) |
|---|
| 787 | if version: |
|---|
| 788 | self.version = version |
|---|
| 789 | else: |
|---|
| 790 | self.version = 'atom' |
|---|
| 791 | |
|---|
| 792 | def _end_channel(self): |
|---|
| 793 | self.infeed = 0 |
|---|
| 794 | _end_feed = _end_channel |
|---|
| 795 | |
|---|
def _start_image(self, attrsD):
    """Begin an <image>: set the flag, push a frame, and ensure the context has an 'image' dict."""
    self.inimage = 1
    self.push('image', 0)
    self._getContext().setdefault('image', FeedParserDict())
|---|
| 801 | |
|---|
| 802 | def _end_image(self): |
|---|
| 803 | self.pop('image') |
|---|
| 804 | self.inimage = 0 |
|---|
| 805 | |
|---|
def _start_textinput(self, attrsD):
    """Begin a <textinput>: set the flag, push a frame, and ensure the context has a 'textinput' dict."""
    self.intextinput = 1
    self.push('textinput', 0)
    self._getContext().setdefault('textinput', FeedParserDict())
_start_textInput = _start_textinput
|---|
| 812 | |
|---|
| 813 | def _end_textinput(self): |
|---|
| 814 | self.pop('textinput') |
|---|
| 815 | self.intextinput = 0 |
|---|
| 816 | _end_textInput = _end_textinput |
|---|
| 817 | |
|---|
| 818 | def _start_author(self, attrsD): |
|---|
| 819 | self.inauthor = 1 |
|---|
| 820 | self.push('author', 1) |
|---|
| 821 | _start_managingeditor = _start_author |
|---|
| 822 | _start_dc_author = _start_author |
|---|
| 823 | _start_dc_creator = _start_author |
|---|
| 824 | _start_itunes_author = _start_author |
|---|
| 825 | |
|---|
| 826 | def _end_author(self): |
|---|
| 827 | self.pop('author') |
|---|
| 828 | self.inauthor = 0 |
|---|
| 829 | self._sync_author_detail() |
|---|
| 830 | _end_managingeditor = _end_author |
|---|
| 831 | _end_dc_author = _end_author |
|---|
| 832 | _end_dc_creator = _end_author |
|---|
| 833 | _end_itunes_author = _end_author |
|---|
| 834 | |
|---|
| 835 | def _start_itunes_owner(self, attrsD): |
|---|
| 836 | self.inpublisher = 1 |
|---|
| 837 | self.push('publisher', 0) |
|---|
| 838 | |
|---|
| 839 | def _end_itunes_owner(self): |
|---|
| 840 | self.pop('publisher') |
|---|
| 841 | self.inpublisher = 0 |
|---|
| 842 | self._sync_author_detail('publisher') |
|---|
| 843 | |
|---|
def _start_contributor(self, attrsD):
    """Begin a <contributor>: append an empty record to context['contributors'] and push a frame."""
    self.incontributor = 1
    context = self._getContext()
    context.setdefault('contributors', [])
    # a fresh, empty record that later name/href/email handlers fill in
    context['contributors'].append(FeedParserDict())
    self.push('contributor', 0)
|---|
| 850 | |
|---|
| 851 | def _end_contributor(self): |
|---|
| 852 | self.pop('contributor') |
|---|
| 853 | self.incontributor = 0 |
|---|
| 854 | |
|---|
def _start_dc_contributor(self, attrsD):
    """Begin <dc:contributor>: append an empty contributor record and push a 'name' frame."""
    self.incontributor = 1
    context = self._getContext()
    context.setdefault('contributors', [])
    context['contributors'].append(FeedParserDict())
    # the frame is called 'name' so that _end_name can close it
    self.push('name', 0)
|---|
| 861 | |
|---|
| 862 | def _end_dc_contributor(self): |
|---|
| 863 | self._end_name() |
|---|
| 864 | self.incontributor = 0 |
|---|
| 865 | |
|---|
| 866 | def _start_name(self, attrsD): |
|---|
| 867 | self.push('name', 0) |
|---|
| 868 | _start_itunes_name = _start_name |
|---|
| 869 | |
|---|
| 870 | def _end_name(self): |
|---|
| 871 | value = self.pop('name') |
|---|
| 872 | if self.inpublisher: |
|---|
| 873 | self._save_author('name', value, 'publisher') |
|---|
| 874 | elif self.inauthor: |
|---|
| 875 | self._save_author('name', value) |
|---|
| 876 | elif self.incontributor: |
|---|
| 877 | self._save_contributor('name', value) |
|---|
| 878 | elif self.intextinput: |
|---|
| 879 | context = self._getContext() |
|---|
| 880 | context['textinput']['name'] = value |
|---|
| 881 | _end_itunes_name = _end_name |
|---|
| 882 | |
|---|
| 883 | def _start_width(self, attrsD): |
|---|
| 884 | self.push('width', 0) |
|---|
| 885 | |
|---|
| 886 | def _end_width(self): |
|---|
| 887 | value = self.pop('width') |
|---|
| 888 | try: |
|---|
| 889 | value = int(value) |
|---|
| 890 | except: |
|---|
| 891 | value = 0 |
|---|
| 892 | if self.inimage: |
|---|
| 893 | context = self._getContext() |
|---|
| 894 | context['image']['width'] = value |
|---|
| 895 | |
|---|
| 896 | def _start_height(self, attrsD): |
|---|
| 897 | self.push('height', 0) |
|---|
| 898 | |
|---|
| 899 | def _end_height(self): |
|---|
| 900 | value = self.pop('height') |
|---|
| 901 | try: |
|---|
| 902 | value = int(value) |
|---|
| 903 | except: |
|---|
| 904 | value = 0 |
|---|
| 905 | if self.inimage: |
|---|
| 906 | context = self._getContext() |
|---|
| 907 | context['image']['height'] = value |
|---|
| 908 | |
|---|
| 909 | def _start_url(self, attrsD): |
|---|
| 910 | self.push('href', 1) |
|---|
| 911 | _start_homepage = _start_url |
|---|
| 912 | _start_uri = _start_url |
|---|
| 913 | |
|---|
| 914 | def _end_url(self): |
|---|
| 915 | value = self.pop('href') |
|---|
| 916 | if self.inauthor: |
|---|
| 917 | self._save_author('href', value) |
|---|
| 918 | elif self.incontributor: |
|---|
| 919 | self._save_contributor('href', value) |
|---|
| 920 | elif self.inimage: |
|---|
| 921 | context = self._getContext() |
|---|
| 922 | context['image']['href'] = value |
|---|
| 923 | elif self.intextinput: |
|---|
| 924 | context = self._getContext() |
|---|
| 925 | context['textinput']['link'] = value |
|---|
| 926 | _end_homepage = _end_url |
|---|
| 927 | _end_uri = _end_url |
|---|
| 928 | |
|---|
| 929 | def _start_email(self, attrsD): |
|---|
| 930 | self.push('email', 0) |
|---|
| 931 | _start_itunes_email = _start_email |
|---|
| 932 | |
|---|
| 933 | def _end_email(self): |
|---|
| 934 | value = self.pop('email') |
|---|
| 935 | if self.inpublisher: |
|---|
| 936 | self._save_author('email', value, 'publisher') |
|---|
| 937 | elif self.inauthor: |
|---|
| 938 | self._save_author('email', value) |
|---|
| 939 | elif self.incontributor: |
|---|
| 940 | self._save_contributor('email', value) |
|---|
| 941 | _end_itunes_email = _end_email |
|---|
| 942 | |
|---|
| 943 | def _getContext(self): |
|---|
| 944 | if self.insource: |
|---|
| 945 | context = self.sourcedata |
|---|
| 946 | elif self.inentry: |
|---|
| 947 | context = self.entries[-1] |
|---|
| 948 | else: |
|---|
| 949 | context = self.feeddata |
|---|
| 950 | return context |
|---|
| 951 | |
|---|
def _save_author(self, key, value, prefix='author'):
    """Store value under key in the context's '<prefix>_detail' dict.

    The detail dict is created on first use.  Afterwards the flat
    context[prefix] string is re-synchronized from it.
    """
    context = self._getContext()
    context.setdefault(prefix + '_detail', FeedParserDict())
    context[prefix + '_detail'][key] = value
    # sync the record just written; the previous code always synced
    # 'author', even when prefix was 'publisher'
    self._sync_author_detail(prefix)
|---|
| 957 | |
|---|
def _save_contributor(self, key, value):
    """Set key on the most recent contributor record, creating the list if it is missing."""
    context = self._getContext()
    # when no list exists yet it is seeded with one empty record
    context.setdefault('contributors', [FeedParserDict()])
    latest = context['contributors'][-1]
    latest[key] = value
|---|
| 962 | |
|---|
| 963 | def _sync_author_detail(self, key='author'): |
|---|
| 964 | context = self._getContext() |
|---|
| 965 | detail = context.get('%s_detail' % key) |
|---|
| 966 | if detail: |
|---|
| 967 | name = detail.get('name') |
|---|
| 968 | email = detail.get('email') |
|---|
| 969 | if name and email: |
|---|
| 970 | context[key] = '%s (%s)' % (name, email) |
|---|
| 971 | elif name: |
|---|
| 972 | context[key] = name |
|---|
| 973 | elif email: |
|---|
| 974 | context[key] = email |
|---|
| 975 | else: |
|---|
| 976 | author = context.get(key) |
|---|
| 977 | if not author: return |
|---|
| 978 | emailmatch = re.search(r'''(([a-zA-Z0-9\_\-\.\+]+)@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.)|(([a-zA-Z0-9\-]+\.)+))([a-zA-Z]{2,4}|[0-9]{1,3})(\]?))''', author) |
|---|
| 979 | if not emailmatch: return |
|---|
| 980 | email = emailmatch.group(0) |
|---|
| 981 | # probably a better way to do the following, but it passes all the tests |
|---|
| 982 | author = author.replace(email, '') |
|---|
| 983 | author = author.replace('()', '') |
|---|
| 984 | author = author.strip() |
|---|
| 985 | if author and (author[0] == '('): |
|---|
| 986 | author = author[1:] |
|---|
| 987 | if author and (author[-1] == ')'): |
|---|
| 988 | author = author[:-1] |
|---|
| 989 | author = author.strip() |
|---|
| 990 | context.setdefault('%s_detail' % key, FeedParserDict()) |
|---|
| 991 | context['%s_detail' % key]['name'] = author |
|---|
| 992 | context['%s_detail' % key]['email'] = email |
|---|
| 993 | |
|---|
| 994 | def _start_subtitle(self, attrsD): |
|---|
| 995 | self.pushContent('subtitle', attrsD, 'text/plain', 1) |
|---|
| 996 | _start_tagline = _start_subtitle |
|---|
| 997 | _start_itunes_subtitle = _start_subtitle |
|---|
| 998 | |
|---|
| 999 | def _end_subtitle(self): |
|---|
| 1000 | self.popContent('subtitle') |
|---|
| 1001 | _end_tagline = _end_subtitle |
|---|
| 1002 | _end_itunes_subtitle = _end_subtitle |
|---|
| 1003 | |
|---|
| 1004 | def _start_rights(self, attrsD): |
|---|
| 1005 | self.pushContent('rights', attrsD, 'text/plain', 1) |
|---|
| 1006 | _start_dc_rights = _start_rights |
|---|
| 1007 | _start_copyright = _start_rights |
|---|
| 1008 | |
|---|
| 1009 | def _end_rights(self): |
|---|
| 1010 | self.popContent('rights') |
|---|
| 1011 | _end_dc_rights = _end_rights |
|---|
| 1012 | _end_copyright = _end_rights |
|---|
| 1013 | |
|---|
def _start_item(self, attrsD):
    """Open a new entry: append an empty record, enter entry scope, and read item attributes.

    A truthy rdf:about attribute becomes the entry's 'id'; CDF-style
    attributes are then handled by _cdf_common.
    """
    self.entries.append(FeedParserDict())
    self.push('item', 0)
    self.inentry = 1
    self.guidislink = 0
    about = self._getAttribute(attrsD, 'rdf:about')
    if about:
        self._getContext()['id'] = about
    self._cdf_common(attrsD)
_start_entry = _start_item
_start_product = _start_item
|---|
| 1026 | |
|---|
| 1027 | def _end_item(self): |
|---|
| 1028 | self.pop('item') |
|---|
| 1029 | self.inentry = 0 |
|---|
| 1030 | _end_entry = _end_item |
|---|
| 1031 | |
|---|
| 1032 | def _start_dc_language(self, attrsD): |
|---|
| 1033 | self.push('language', 1) |
|---|
| 1034 | _start_language = _start_dc_language |
|---|
| 1035 | |
|---|
| 1036 | def _end_dc_language(self): |
|---|
| 1037 | self.lang = self.pop('language') |
|---|
| 1038 | _end_language = _end_dc_language |
|---|
| 1039 | |
|---|
| 1040 | def _start_dc_publisher(self, attrsD): |
|---|
| 1041 | self.push('publisher', 1) |
|---|
| 1042 | _start_webmaster = _start_dc_publisher |
|---|
| 1043 | |
|---|
| 1044 | def _end_dc_publisher(self): |
|---|
| 1045 | self.pop('publisher') |
|---|
| 1046 | self._sync_author_detail('publisher') |
|---|
| 1047 | _end_webmaster = _end_dc_publisher |
|---|
| 1048 | |
|---|
| 1049 | def _start_published(self, attrsD): |
|---|
| 1050 | self.push('published', 1) |
|---|
| 1051 | _start_dcterms_issued = _start_published |
|---|
| 1052 | _start_issued = _start_published |
|---|
| 1053 | |
|---|
def _end_published(self):
    """Pop the published date and save its parsed form as 'published_parsed'."""
    raw = self.pop('published')
    parsed = _parse_date(raw)
    self._save('published_parsed', parsed)
_end_dcterms_issued = _end_published
_end_issued = _end_published
|---|
| 1059 | |
|---|
| 1060 | def _start_updated(self, attrsD): |
|---|
| 1061 | self.push('updated', 1) |
|---|
| 1062 | _start_modified = _start_updated |
|---|
| 1063 | _start_dcterms_modified = _start_updated |
|---|
| 1064 | _start_pubdate = _start_updated |
|---|
| 1065 | _start_dc_date = _start_updated |
|---|
| 1066 | |
|---|
def _end_updated(self):
    """Pop the updated date and save its parsed form as 'updated_parsed'."""
    self._save('updated_parsed', _parse_date(self.pop('updated')))
_end_modified = _end_updated
_end_dcterms_modified = _end_updated
_end_pubdate = _end_updated
_end_dc_date = _end_updated
|---|
| 1075 | |
|---|
| 1076 | def _start_created(self, attrsD): |
|---|
| 1077 | self.push('created', 1) |
|---|
| 1078 | _start_dcterms_created = _start_created |
|---|
| 1079 | |
|---|
def _end_created(self):
    """Pop the created date and save its parsed form as 'created_parsed'."""
    raw = self.pop('created')
    parsed = _parse_date(raw)
    self._save('created_parsed', parsed)
_end_dcterms_created = _end_created
|---|
| 1084 | |
|---|
| 1085 | def _start_expirationdate(self, attrsD): |
|---|
| 1086 | self.push('expired', 1) |
|---|
| 1087 | |
|---|
def _end_expirationdate(self):
    """Pop the expiration date and save its parsed form as 'expired_parsed'."""
    raw = self.pop('expired')
    parsed = _parse_date(raw)
    self._save('expired_parsed', parsed)
|---|
| 1090 | |
|---|
| 1091 | def _start_cc_license(self, attrsD): |
|---|
| 1092 | self.push('license', 1) |
|---|
| 1093 | value = self._getAttribute(attrsD, 'rdf:resource') |
|---|
| 1094 | if value: |
|---|
| 1095 | self.elementstack[-1][2].append(value) |
|---|
| 1096 | self.pop('license') |
|---|
| 1097 | |
|---|
| 1098 | def _start_creativecommons_license(self, attrsD): |
|---|
| 1099 | self.push('license', 1) |
|---|
| 1100 | |
|---|
| 1101 | def _end_creativecommons_license(self): |
|---|
| 1102 | self.pop('license') |
|---|
| 1103 | |
|---|
| 1104 | def _addTag(self, term, scheme, label): |
|---|
| 1105 | context = self._getContext() |
|---|
| 1106 | tags = context.setdefault('tags', []) |
|---|
| 1107 | if (not term) and (not scheme) and (not label): return |
|---|
| 1108 | value = FeedParserDict({'term': term, 'scheme': scheme, 'label': label}) |
|---|
| 1109 | if value not in tags: |
|---|
| 1110 | tags.append(FeedParserDict({'term': term, 'scheme': scheme, 'label': label})) |
|---|
| 1111 | |
|---|
def _start_category(self, attrsD):
    """Register a tag from the element's attributes and push a 'category' frame.

    The scheme comes from the 'scheme' attribute, falling back to 'domain'.
    """
    if _debug:
        sys.stderr.write('entering _start_category with %s\n' % repr(attrsD))
    scheme = attrsD.get('scheme', attrsD.get('domain'))
    self._addTag(attrsD.get('term'), scheme, attrsD.get('label'))
    self.push('category', 1)
_start_dc_subject = _start_category
_start_keywords = _start_category
|---|
| 1121 | |
|---|
| 1122 | def _end_itunes_keywords(self): |
|---|
| 1123 | for term in self.pop('itunes_keywords').split(): |
|---|
| 1124 | self._addTag(term, 'http://www.itunes.com/', None) |
|---|
| 1125 | |
|---|
| 1126 | def _start_itunes_category(self, attrsD): |
|---|
| 1127 | self._addTag(attrsD.get('text'), 'http://www.itunes.com/', None) |
|---|
| 1128 | self.push('category', 1) |
|---|
| 1129 | |
|---|
| 1130 | def _end_category(self): |
|---|
| 1131 | value = self.pop('category') |
|---|
| 1132 | if not value: return |
|---|
| 1133 | context = self._getContext() |
|---|
| 1134 | tags = context['tags'] |
|---|
| 1135 | if value and len(tags) and not tags[-1]['term']: |
|---|
| 1136 | tags[-1]['term'] = value |
|---|
| 1137 | else: |
|---|
| 1138 | self._addTag(value, None, None) |
|---|
| 1139 | _end_dc_subject = _end_category |
|---|
| 1140 | _end_keywords = _end_category |
|---|
| 1141 | _end_itunes_category = _end_category |
|---|
| 1142 | |
|---|
def _start_cloud(self, attrsD):
    """Store the <cloud> element's attributes as context['cloud']."""
    cloud = FeedParserDict(attrsD)
    self._getContext()['cloud'] = cloud
|---|
| 1145 | |
|---|
def _start_link(self, attrsD):
    """Record a link in context['links'] and decide how its target is obtained.

    rel defaults to 'alternate' and type to 'text/html'; url/uri attributes
    are folded into href, which is resolved against the base URI.  A link
    with rel 'enclosure' is also handed to _start_enclosure.  With an href,
    an alternate link of an HTML type becomes context['link']; without one,
    a 'link' frame is pushed so the element text can supply the target.
    """
    attrsD.setdefault('rel', 'alternate')
    attrsD.setdefault('type', 'text/html')
    attrsD = self._itsAnHrefDamnIt(attrsD)
    if attrsD.has_key('href'):
        attrsD['href'] = self.resolveURI(attrsD['href'])
    expectingText = self.infeed or self.inentry or self.insource
    context = self._getContext()
    context.setdefault('links', [])
    context['links'].append(FeedParserDict(attrsD))
    if attrsD['rel'] == 'enclosure':
        self._start_enclosure(attrsD)
    if not attrsD.has_key('href'):
        self.push('link', expectingText)
        return
    is_alternate = attrsD.get('rel') == 'alternate'
    if is_alternate and (self.mapContentType(attrsD.get('type')) in self.html_types):
        context['link'] = attrsD['href']
_start_producturl = _start_link
|---|
| 1165 | |
|---|
| 1166 | def _end_link(self): |
|---|
| 1167 | value = self.pop('link') |
|---|
| 1168 | context = self._getContext() |
|---|
| 1169 | if self.intextinput: |
|---|
| 1170 | context['textinput']['link'] = value |
|---|
| 1171 | if self.inimage: |
|---|
| 1172 | context['image']['link'] = value |
|---|
| 1173 | _end_producturl = _end_link |
|---|
| 1174 | |
|---|
| 1175 | def _start_guid(self, attrsD): |
|---|
| 1176 | self.guidislink = (attrsD.get('ispermalink', 'true') == 'true') |
|---|
| 1177 | self.push('id', 1) |
|---|
| 1178 | |
|---|
| 1179 | def _end_guid(self): |
|---|
| 1180 | value = self.pop('id') |
|---|
| 1181 | self._save('guidislink', self.guidislink and not self._getContext().has_key('link')) |
|---|
| 1182 | if self.guidislink: |
|---|
| 1183 | # guid acts as link, but only if 'ispermalink' is not present or is 'true', |
|---|
| 1184 | # and only if the item doesn't already have a link element |
|---|
| 1185 | self._save('link', value) |
|---|
| 1186 | |
|---|
| 1187 | def _start_title(self, attrsD): |
|---|
| 1188 | self.pushContent('title', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|---|
| 1189 | _start_dc_title = _start_title |
|---|
| 1190 | _start_media_title = _start_title |
|---|
| 1191 | |
|---|
| 1192 | def _end_title(self): |
|---|
| 1193 | value = self.popContent('title') |
|---|
| 1194 | context = self._getContext() |
|---|
| 1195 | if self.intextinput: |
|---|
| 1196 | context['textinput']['title'] = value |
|---|
| 1197 | elif self.inimage: |
|---|
| 1198 | context['image']['title'] = value |
|---|
| 1199 | _end_dc_title = _end_title |
|---|
| 1200 | _end_media_title = _end_title |
|---|
| 1201 | |
|---|
| 1202 | def _start_description(self, attrsD): |
|---|
| 1203 | context = self._getContext() |
|---|
| 1204 | if context.has_key('summary'): |
|---|
| 1205 | self._summaryKey = 'content' |
|---|
| 1206 | self._start_content(attrsD) |
|---|
| 1207 | else: |
|---|
| 1208 | self.pushContent('description', attrsD, 'text/html', self.infeed or self.inentry or self.insource) |
|---|
| 1209 | |
|---|
| 1210 | def _start_abstract(self, attrsD): |
|---|
| 1211 | self.pushContent('description', attrsD, 'text/plain', self.infeed or self.inentry or self.insource) |
|---|
| 1212 | |
|---|
| 1213 | def _end_description(self): |
|---|
| 1214 | if self._summaryKey == 'content': |
|---|
| 1215 | self._end_content() |
|---|
| 1216 | else: |
|---|
| 1217 | value = self.popContent('description') |
|---|
| 1218 | context = self._getContext() |
|---|
| 1219 | if self.intextinput: |
|---|
| 1220 | context['textinput']['description'] = value |
|---|
| 1221 | elif self.inimage: |
|---|
| 1222 | context['image']['description'] = value |
|---|
| 1223 | self._summaryKey = None |
|---|
| 1224 | _end_abstract = _end_description |
|---|
| 1225 | |
|---|
| 1226 | def _start_info(self, attrsD): |
|---|
| 1227 | self.pushContent('info', attrsD, 'text/plain', 1) |
|---|
| 1228 | _start_feedburner_browserfriendly = _start_info |
|---|
| 1229 | |
|---|
| 1230 | def _end_info(self): |
|---|
| 1231 | self.popContent('info') |
|---|
| 1232 | _end_feedburner_browserfriendly = _end_info |
|---|
| 1233 | |
|---|
def _start_generator(self, attrsD):
    """Store the generator's attributes as 'generator_detail' and push a 'generator' frame.

    When attributes are present, url/uri are folded into href and the href
    is resolved against the base URI before being stored.
    """
    if attrsD:
        attrsD = self._itsAnHrefDamnIt(attrsD)
        if attrsD.has_key('href'):
            resolved = self.resolveURI(attrsD['href'])
            attrsD['href'] = resolved
    detail = FeedParserDict(attrsD)
    self._getContext()['generator_detail'] = detail
    self.push('generator', 1)
|---|
| 1241 | |
|---|
| 1242 | def _end_generator(self): |
|---|
| 1243 | value = self.pop('generator') |
|---|
| 1244 | context = self._getContext() |
|---|
| 1245 | if context.has_key('generator_detail'): |
|---|
| 1246 | context['generator_detail']['name'] = value |
|---|
| 1247 | |
|---|
def _start_admin_generatoragent(self, attrsD):
    """Record a generator whose URL is carried in the rdf:resource attribute.

    A 'generator' frame is pushed, given the attribute value (if any) as
    its text, and popped straight away.  generator_detail is then replaced
    with a dict holding that value as 'href', whether or not it was set.
    """
    self.push('generator', 1)
    resource = self._getAttribute(attrsD, 'rdf:resource')
    if resource:
        frame = self.elementstack[-1]
        frame[2].append(resource)
    self.pop('generator')
    self._getContext()['generator_detail'] = FeedParserDict({'href': resource})
|---|
| 1255 | |
|---|
| 1256 | def _start_admin_errorreportsto(self, attrsD): |
|---|
| 1257 | self.push('errorreportsto', 1) |
|---|
| 1258 | value = self._getAttribute(attrsD, 'rdf:resource') |
|---|
| 1259 | if value: |
|---|
| 1260 | self.elementstack[-1][2].append(value) |
|---|
| 1261 | self.pop('errorreportsto') |
|---|
| 1262 | |
|---|
| 1263 | def _start_summary(self, attrsD): |
|---|
| 1264 | context = self._getContext() |
|---|
| 1265 | if context.has_key('summary'): |
|---|
| 1266 | self._summaryKey = 'content' |
|---|
| 1267 | self._start_content(attrsD) |
|---|
| 1268 | else: |
|---|
| 1269 | self._summaryKey = 'summary' |
|---|
| 1270 | self.pushContent(self._summaryKey, attrsD, 'text/plain', 1) |
|---|
| 1271 | _start_itunes_summary = _start_summary |
|---|
| 1272 | |
|---|
| 1273 | def _end_summary(self): |
|---|
| 1274 | if self._summaryKey == 'content': |
|---|
| 1275 | self._end_content() |
|---|
| 1276 | else: |
|---|
| 1277 | self.popContent(self._summaryKey or 'summary') |
|---|
| 1278 | self._summaryKey = None |
|---|
| 1279 | _end_itunes_summary = _end_summary |
|---|
| 1280 | |
|---|
def _start_enclosure(self, attrsD):
    """Append the enclosure to context['enclosures'], using its href as 'id' if none is set."""
    attrsD = self._itsAnHrefDamnIt(attrsD)
    enclosures = self._getContext().setdefault('enclosures', [])
    enclosures.append(FeedParserDict(attrsD))
    href = attrsD.get('href')
    if not href:
        return
    context = self._getContext()
    if not context.get('id'):
        context['id'] = href
|---|
| 1289 | |
|---|
| 1290 | def _start_source(self, attrsD): |
|---|
| 1291 | self.insource = 1 |
|---|
| 1292 | |
|---|
| 1293 | def _end_source(self): |
|---|
| 1294 | self.insource = 0 |
|---|
| 1295 | self._getContext()['source'] = copy.deepcopy(self.sourcedata) |
|---|
| 1296 | self.sourcedata.clear() |
|---|
| 1297 | |
|---|
| 1298 | def _start_content(self, attrsD): |
|---|
| 1299 | self.pushContent('content', attrsD, 'text/plain', 1) |
|---|
| 1300 | src = attrsD.get('src') |
|---|
| 1301 | if src: |
|---|
| 1302 | self.contentparams['src'] = src |
|---|
| 1303 | self.push('content', 1) |
|---|
| 1304 | |
|---|
| 1305 | def _start_prodlink(self, attrsD): |
|---|
| 1306 | self.pushContent('content', attrsD, 'text/html', 1) |
|---|
| 1307 | |
|---|
| 1308 | def _start_body(self, attrsD): |
|---|
| 1309 | self.pushContent('content', attrsD, 'application/xhtml+xml', 1) |
|---|
| 1310 | _start_xhtml_body = _start_body |
|---|
| 1311 | |
|---|
| 1312 | def _start_content_encoded(self, attrsD): |
|---|
| 1313 | self.pushContent('content', attrsD, 'text/html', 1) |
|---|
| 1314 | _start_fullitem = _start_content_encoded |
|---|
| 1315 | |
|---|
| 1316 | def _end_content(self): |
|---|
| 1317 | copyToDescription = self.mapContentType(self.contentparams.get('type')) in (['text/plain'] + self.html_types) |
|---|
| 1318 | value = self.popContent('content') |
|---|
| 1319 | if copyToDescription: |
|---|
| 1320 | self._save('description', value) |
|---|
| 1321 | _end_body = _end_content |
|---|
| 1322 | _end_xhtml_body = _end_content |
|---|
| 1323 | _end_content_encoded = _end_content |
|---|
| 1324 | _end_fullitem = _end_content |
|---|
| 1325 | _end_prodlink = _end_content |
|---|
| 1326 | |
|---|
def _start_itunes_image(self, attrsD):
    """Push an 'itunes_image' frame and set context['image'] to a dict with the href attribute."""
    self.push('itunes_image', 0)
    image = FeedParserDict({'href': attrsD.get('href')})
    self._getContext()['image'] = image
_start_itunes_link = _start_itunes_image
|---|
| 1331 | |
|---|
| 1332 | def _end_itunes_block(self): |
|---|
| 1333 | value = self.pop('itunes_block', 0) |
|---|
| 1334 | self._getContext()['itunes_block'] = (value == 'yes') and 1 or 0 |
|---|
| 1335 | |
|---|
| 1336 | def _end_itunes_explicit(self): |
|---|
| 1337 | value = self.pop('itunes_explicit', 0) |
|---|
| 1338 | self._getContext()['itunes_explicit'] = (value == 'yes') and 1 or 0 |
|---|
| 1339 | |
|---|
| 1340 | if _XML_AVAILABLE: |
|---|
| 1341 | class _StrictFeedParser(_FeedParserMixin, xml.sax.handler.ContentHandler): |
|---|
def __init__(self, baseuri, baselang, encoding):
    """Initialize both base classes, then set bozo to 0 and exc to None."""
    if _debug:
        sys.stderr.write('trying StrictFeedParser\n')
    xml.sax.handler.ContentHandler.__init__(self)
    _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
    self.exc = None
    self.bozo = 0
|---|
| 1348 | |
|---|
def startPrefixMapping(self, prefix, uri):
    """SAX callback: pass each namespace prefix declaration on to trackNamespace."""
    self.trackNamespace(prefix, uri)
|---|
| 1351 | |
|---|
def startElementNS(self, name, qname, attrs):
    """SAX callback for a namespaced start tag.

    The element's namespace is mapped to a known prefix, falling back to
    the prefix given in qname, and the lower-cased 'prefix:localname' is
    passed to unknown_starttag together with the attributes, keyed both by
    mapped prefix and by qname.

    Raises UndeclaredNamespace when qname carries a prefix that resolves to
    no namespace and is not in namespacesInUse.
    """
    namespace, localname = name
    lowernamespace = str(namespace or '').lower()
    # '!=' replaces the obsolete '<>' spelling
    if lowernamespace.find('backend.userland.com/rss') != -1:
        # match any backend.userland.com namespace
        namespace = 'http://backend.userland.com/rss'
        lowernamespace = namespace
    if qname and qname.find(':') > 0:
        givenprefix = qname.split(':')[0]
    else:
        givenprefix = None
    prefix = self._matchnamespaces.get(lowernamespace, givenprefix)
    if givenprefix and (prefix is None or (prefix == '' and lowernamespace == '')) and not self.namespacesInUse.has_key(givenprefix):
        # call form of raise: parses on every Python version
        raise UndeclaredNamespace("'%s' is not associated with a namespace" % givenprefix)
    if prefix:
        localname = prefix + ':' + localname
    localname = str(localname).lower()
    if _debug: sys.stderr.write('startElementNS: qname = %s, namespace = %s, givenprefix = %s, prefix = %s, attrs = %s, localname = %s\n' % (qname, namespace, givenprefix, prefix, attrs.items(), localname))

    # qname implementation is horribly broken in Python 2.1 (it
    # doesn't report any), and slightly broken in Python 2.2 (it
    # doesn't report the xml: namespace). So we match up namespaces
    # with a known list first, and then possibly override them with
    # the qnames the SAX parser gives us (if indeed it gives us any
    # at all).  Thanks to MatejC for helping me test this and
    # tirelessly telling me that it didn't work yet.
    attrsD = {}
    for (namespace, attrlocalname), attrvalue in attrs._attrs.items():
        lowernamespace = (namespace or '').lower()
        prefix = self._matchnamespaces.get(lowernamespace, '')
        if prefix:
            attrlocalname = prefix + ':' + attrlocalname
        attrsD[str(attrlocalname).lower()] = attrvalue
    for qname in attrs.getQNames():
        attrsD[str(qname).lower()] = attrs.getValueByQName(qname)
    self.unknown_starttag(localname, attrsD.items())
|---|
| 1388 | |
|---|
| 1389 | def characters(self, text): |
|---|
| 1390 | self.handle_data(text) |
|---|
| 1391 | |
|---|
| 1392 | def endElementNS(self, name, qname): |
|---|
| 1393 | namespace, localname = name |
|---|
| 1394 | lowernamespace = str(namespace or '').lower() |
|---|
| 1395 | if qname and qname.find(':') > 0: |
|---|
| 1396 | givenprefix = qname.split(':')[0] |
|---|
| 1397 | else: |
|---|
| 1398 | givenprefix = '' |
|---|
| 1399 | prefix = self._matchnamespaces.get(lowernamespace, givenprefix) |
|---|
| 1400 | if prefix: |
|---|
| 1401 | localname = prefix + ':' + localname |
|---|
| 1402 | localname = str(localname).lower() |
|---|
| 1403 | self.unknown_endtag(localname) |
|---|
| 1404 | |
|---|
| 1405 | def error(self, exc): |
|---|
| 1406 | self.bozo = 1 |
|---|
| 1407 | self.exc = exc |
|---|
| 1408 | |
|---|
| 1409 | def fatalError(self, exc): |
|---|
| 1410 | self.error(exc) |
|---|
| 1411 | raise exc |
|---|
| 1412 | |
|---|
class _BaseHTMLProcessor(sgmllib.SGMLParser):
    '''SGML parser that re-serializes everything it is fed.

    Each handler appends the reconstructed markup to self.pieces; output()
    joins the pieces into one string.  Subclasses override individual
    handlers to filter or rewrite the markup on its way through.
    '''

    # elements that are written as '<tag />' and never get an end tag
    elements_no_end_tag = ['area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
      'img', 'input', 'isindex', 'link', 'meta', 'param']

    def __init__(self, encoding):
        # encoding must be set before SGMLParser.__init__, which calls reset()
        self.encoding = encoding
        if _debug: sys.stderr.write('entering BaseHTMLProcessor, encoding=%s\n' % self.encoding)
        sgmllib.SGMLParser.__init__(self)

    def reset(self):
        '''Discard collected output and reset the underlying parser.'''
        self.pieces = []
        sgmllib.SGMLParser.reset(self)

    def _shorttag_replace(self, match):
        # re.sub callback used by feed(): expand '<tag/>' either to the
        # canonical '<tag />' form or to an explicit open/close pair.
        tag = match.group(1)
        if tag in self.elements_no_end_tag:
            return '<' + tag + ' />'
        else:
            return '<' + tag + '></' + tag + '>'

    def feed(self, data):
        '''Normalize data and pass it to the SGML parser.'''
        # escape any '<!' that does not start a DOCTYPE, a comment or a
        # marked section
        data = re.compile(r'<!((?!DOCTYPE|--|\[))', re.IGNORECASE).sub(r'&lt;!\1', data)
        # bug [ 1399464 ] Bad regexp for _shorttag_replace: the tag name
        # must not contain '<' or whitespace
        data = re.sub(r'<([^<\s]+?)\s*/>', self._shorttag_replace, data)
        # turn numeric references for quotes into the literal characters
        data = data.replace('&#39;', "'")
        data = data.replace('&#34;', '"')
        if self.encoding and type(data) == type(u''):
            data = data.encode(self.encoding)
        sgmllib.SGMLParser.feed(self, data)

    def normalize_attrs(self, attrs):
        '''Return attrs with lowercased names and lowercased rel/type values.'''
        # utility method to be called by descendants
        attrs = [(k.lower(), v) for k, v in attrs]
        attrs = [(k, k in ('rel', 'type') and v.lower() or v) for k, v in attrs]
        return attrs

    def unknown_starttag(self, tag, attrs):
        # called for each start tag
        # attrs is a list of (attr, value) tuples
        # e.g. for <pre class='screen'>, tag='pre', attrs=[('class', 'screen')]
        if _debug: sys.stderr.write('_BaseHTMLProcessor, unknown_starttag, tag=%s\n' % tag)
        uattrs = []
        # thanks to Kevin Marks for this breathtaking hack to deal with (valid) high-bit attribute values in UTF-8 feeds
        for key, value in attrs:
            if type(value) != type(u''):
                value = unicode(value, self.encoding)
            uattrs.append((unicode(key, self.encoding), value))
        strattrs = u''.join([u' %s="%s"' % (key, value) for key, value in uattrs]).encode(self.encoding)
        if tag in self.elements_no_end_tag:
            self.pieces.append('<%(tag)s%(strattrs)s />' % locals())
        else:
            self.pieces.append('<%(tag)s%(strattrs)s>' % locals())

    def unknown_endtag(self, tag):
        # called for each end tag, e.g. for </pre>, tag will be 'pre'
        # Reconstruct the original end tag.
        if tag not in self.elements_no_end_tag:
            self.pieces.append("</%(tag)s>" % locals())

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # Reconstruct the original character reference.
        self.pieces.append('&#%(ref)s;' % locals())

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        # Reconstruct the original entity reference.
        self.pieces.append('&%(ref)s;' % locals())

    def handle_data(self, text):
        # called for each block of plain text, i.e. outside of any tag and
        # not containing any character or entity references
        # Store the original text verbatim.
        if _debug: sys.stderr.write('_BaseHTMLProcessor, handle_text, text=%s\n' % text)
        self.pieces.append(text)

    def handle_comment(self, text):
        # called for each HTML comment, e.g. <!-- insert Javascript code here -->
        # Reconstruct the original comment.
        self.pieces.append('<!--%(text)s-->' % locals())

    def handle_pi(self, text):
        # called for each processing instruction, e.g. <?instruction>
        # Reconstruct original processing instruction.
        self.pieces.append('<?%(text)s>' % locals())

    def handle_decl(self, text):
        # called for the DOCTYPE, if present, e.g.
        # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
        #     "http://www.w3.org/TR/html4/loose.dtd">
        # Reconstruct original DOCTYPE
        self.pieces.append('<!%(text)s>' % locals())

    _new_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9:]*\s*').match
    def _scan_name(self, i, declstartpos):
        # Scan a declaration name starting at offset i of self.rawdata.
        # Returns (lowercased name, end offset), or (None, -1) when the
        # buffer ends before the name is complete or no name is found; in
        # the latter case the raw data is emitted as plain text.
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = self._new_declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.handle_data(rawdata)
            # self.updatepos(declstartpos, i)
            return None, -1

    def output(self):
        '''Return processed HTML as a single string'''
        return ''.join([str(p) for p in self.pieces])
|---|
| 1527 | |
|---|
class _LooseFeedParser(_FeedParserMixin, _BaseHTMLProcessor):
    '''SGML-based feed parser built on _BaseHTMLProcessor.'''

    def __init__(self, baseuri, baselang, encoding):
        sgmllib.SGMLParser.__init__(self)
        _FeedParserMixin.__init__(self, baseuri, baselang, encoding)

    def decodeEntities(self, element, data):
        '''Normalize the five XML entities in data and return the result.

        Decimal and hexadecimal numeric references for <, >, &, " and ' are
        first rewritten to their named forms.  When the current content has
        a declared type that does not end in 'xml', the named forms are
        then decoded to the literal characters.
        '''
        numeric_to_named = (
            ('&#60;', '&lt;'), ('&#x3c;', '&lt;'),
            ('&#62;', '&gt;'), ('&#x3e;', '&gt;'),
            ('&#38;', '&amp;'), ('&#x26;', '&amp;'),
            ('&#34;', '&quot;'), ('&#x22;', '&quot;'),
            ('&#39;', '&apos;'), ('&#x27;', '&apos;'),
        )
        for numeric, named in numeric_to_named:
            data = data.replace(numeric, named)
        if self.contentparams.has_key('type') and not self.contentparams.get('type', 'xml').endswith('xml'):
            # '&amp;' is decoded after '&lt;'/'&gt;' so that an escaped
            # '&amp;lt;' ends up as '&lt;' rather than '<'
            data = data.replace('&lt;', '<')
            data = data.replace('&gt;', '>')
            data = data.replace('&amp;', '&')
            data = data.replace('&quot;', '"')
            data = data.replace('&apos;', "'")
        return data
|---|
| 1551 | |
|---|
class _RelativeURIResolver(_BaseHTMLProcessor):
    '''Re-serializes markup, joining URI-valued attributes with a base URI.'''

    # (element, attribute) pairs whose value is resolved against the base URI
    relative_uris = [('a', 'href'),
                     ('applet', 'codebase'),
                     ('area', 'href'),
                     ('blockquote', 'cite'),
                     ('body', 'background'),
                     ('del', 'cite'),
                     ('form', 'action'),
                     ('frame', 'longdesc'),
                     ('frame', 'src'),
                     ('iframe', 'longdesc'),
                     ('iframe', 'src'),
                     ('head', 'profile'),
                     ('img', 'longdesc'),
                     ('img', 'src'),
                     ('img', 'usemap'),
                     ('input', 'src'),
                     ('input', 'usemap'),
                     ('ins', 'cite'),
                     ('link', 'href'),
                     ('object', 'classid'),
                     ('object', 'codebase'),
                     ('object', 'data'),
                     ('object', 'usemap'),
                     ('q', 'cite'),
                     ('script', 'src')]

    def __init__(self, baseuri, encoding):
        _BaseHTMLProcessor.__init__(self, encoding)
        self.baseuri = baseuri

    def resolveURI(self, uri):
        '''Join uri with the base URI given to the constructor.'''
        return _urljoin(self.baseuri, uri)

    def unknown_starttag(self, tag, attrs):
        '''Emit the start tag with its URI-valued attributes resolved.'''
        rewritten = []
        for key, value in self.normalize_attrs(attrs):
            if (tag, key) in self.relative_uris:
                # fall back to the untouched value when the join yields
                # nothing truthy
                value = self.resolveURI(value) or value
            rewritten.append((key, value))
        _BaseHTMLProcessor.unknown_starttag(self, tag, rewritten)
|---|
| 1590 | |
|---|
def _resolveRelativeURIs(htmlSource, baseURI, encoding):
    '''Run htmlSource through a _RelativeURIResolver for baseURI and return its output.'''
    if _debug:
        sys.stderr.write('entering _resolveRelativeURIs\n')
    resolver = _RelativeURIResolver(baseURI, encoding)
    resolver.feed(htmlSource)
    return resolver.output()
|---|
| 1596 | |
|---|
class _HTMLSanitizer(_BaseHTMLProcessor):
    '''Re-serializes markup, keeping only whitelisted elements and attributes.

    Tags outside acceptable_elements are dropped.  For the elements listed
    in unacceptable_elements_with_end_tag the text between the start and
    end tag is dropped as well.  Processing instructions and declarations
    are always dropped.
    '''

    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', 'big',
      'blockquote', 'br', 'button', 'caption', 'center', 'cite', 'code', 'col',
      'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'fieldset',
      'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img', 'input',
      'ins', 'kbd', 'label', 'legend', 'li', 'map', 'menu', 'ol', 'optgroup',
      'option', 'p', 'pre', 'q', 's', 'samp', 'select', 'small', 'span', 'strike',
      'strong', 'sub', 'sup', 'table', 'tbody', 'td', 'textarea', 'tfoot', 'th',
      'thead', 'tr', 'tt', 'u', 'ul', 'var']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
      'action', 'align', 'alt', 'axis', 'border', 'cellpadding', 'cellspacing',
      'char', 'charoff', 'charset', 'checked', 'cite', 'class', 'clear', 'cols',
      'colspan', 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled',
      'enctype', 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace',
      'id', 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
      'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
      'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
      'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title', 'type',
      'usemap', 'valign', 'value', 'vspace', 'width']

    # elements whose enclosed text must be dropped together with the tags
    unacceptable_elements_with_end_tag = ['script', 'applet']

    def reset(self):
        _BaseHTMLProcessor.reset(self)
        # number of currently open elements whose text is being suppressed
        self.unacceptablestack = 0

    def unknown_starttag(self, tag, attrs):
        '''Emit the start tag only if whitelisted, with whitelisted attributes.'''
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                self.unacceptablestack += 1
            return
        attrs = self.normalize_attrs(attrs)
        attrs = [(key, value) for key, value in attrs if key in self.acceptable_attributes]
        _BaseHTMLProcessor.unknown_starttag(self, tag, attrs)

    def unknown_endtag(self, tag):
        '''Emit the end tag only if whitelisted.'''
        if not tag in self.acceptable_elements:
            if tag in self.unacceptable_elements_with_end_tag:
                # Never go below zero: an unmatched end tag would otherwise
                # leave the counter negative, which suppresses all following
                # text and lets the content of the next suppressed element
                # through once its start tag brings the counter back to 0.
                if self.unacceptablestack > 0:
                    self.unacceptablestack -= 1
            return
        _BaseHTMLProcessor.unknown_endtag(self, tag)

    def handle_pi(self, text):
        # processing instructions are dropped
        pass

    def handle_decl(self, text):
        # declarations are dropped
        pass

    def handle_data(self, text):
        '''Keep text unless it sits inside a suppressed element.'''
        if not self.unacceptablestack:
            _BaseHTMLProcessor.handle_data(self, text)
|---|
| 1649 | |
|---|
def _sanitizeHTML(htmlSource, encoding):
    '''Return htmlSource filtered through _HTMLSanitizer.

    When TIDY_MARKUP is set and one of PREFERRED_TIDY_INTERFACES can be
    imported, the sanitized markup is additionally run through Tidy and
    reduced to the contents of its <body> element.  The result is stripped
    and has '\\r\\n' line ends converted to '\\n'.
    '''
    sanitizer = _HTMLSanitizer(encoding)
    sanitizer.feed(htmlSource)
    data = sanitizer.output()

    # Look through the preferred Tidy interfaces for one that is installed
    # and wrap its API in a common _tidy(markup, **options) function.
    _tidy = None
    if TIDY_MARKUP:
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(markup, **options):
                        return str(_utidy(markup, **options))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(markup, **options):
                        nerrors, nwarnings, markup, errordata = _mxtidy.tidy(markup, **options)
                        return markup
                    break
            except:
                # interface not usable here; try the next one
                pass

    if _tidy:
        # Tidy is handed UTF-8 bytes; convert back afterwards if the input
        # was a unicode string
        was_unicode = type(data) == type(u'')
        if was_unicode:
            data = data.encode('utf-8')
        data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
        if was_unicode:
            data = unicode(data, 'utf-8')
        # keep only what is between the end of the <body ...> tag and </body
        if data.find('<body') != -1:
            data = data.split('<body', 1)[1]
            if data.find('>') != -1:
                data = data.split('>', 1)[1]
        if data.find('</body') != -1:
            data = data.split('</body', 1)[0]

    data = data.strip()
    return data.replace('\r\n', '\n')
|---|
| 1688 | |
|---|
class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
    '''urllib2 handler that hands error responses back to the caller.

    Instead of raising, non-2xx responses are returned as response objects
    carrying a .status attribute.  Redirects are followed when a Location
    header is present, and a failed basic-auth request is retried with
    digest auth.
    '''

    def http_error_default(self, req, fp, code, msg, headers):
        # any 3xx other than 304 (Not Modified) is treated as a redirect
        if 300 <= code < 400 and code != 304:
            return self.http_error_302(req, fp, code, msg, headers)
        infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        infourl.status = code
        return infourl

    def http_error_302(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
        else:
            # nowhere to redirect to: return the response as it is
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    def http_error_301(self, req, fp, code, msg, headers):
        if headers.dict.has_key('location'):
            infourl = urllib2.HTTPRedirectHandler.http_error_301(self, req, fp, code, msg, headers)
        else:
            # nowhere to redirect to: return the response as it is
            infourl = urllib.addinfourl(fp, headers, req.get_full_url())
        if not hasattr(infourl, 'status'):
            infourl.status = code
        return infourl

    http_error_300 = http_error_302
    http_error_303 = http_error_302
    http_error_307 = http_error_302

    def http_error_401(self, req, fp, code, msg, headers):
        # Check if
        # - server requires digest auth, AND
        # - we tried (unsuccessfully) with basic auth, AND
        # - we're using Python 2.3.3 or later (digest auth is irreparably broken in earlier versions)
        # If all conditions hold, parse authentication information
        # out of the Authorization header we sent the first time
        # (for the username and password) and the WWW-Authenticate
        # header the server sent back (for the realm) and retry
        # the request with the appropriate digest auth headers instead.
        # This evil genius hack has been brought to you by Aaron Swartz.
        host = urlparse.urlparse(req.get_full_url())[1]
        # explicit checks rather than assert (asserts vanish under -O), and
        # a tuple comparison rather than comparing version strings
        if sys.version_info >= (2, 3, 3) and base64 is not None:
            try:
                credentials = base64.decodestring(req.headers['Authorization'].split(' ')[1])
                # split on the first ':' only; the password may contain colons
                user, passw = credentials.split(':', 1)
                realm = re.findall('realm="([^"]*)"', headers['WWW-Authenticate'])[0]
                self.add_password(realm, host, user, passw)
                retry = self.http_error_auth_reqed('www-authenticate', host, req, headers)
                self.reset_retry_count()
                return retry
            except:
                # deliberately broad: whatever goes wrong with the retry,
                # fall back to returning the 401 response itself
                pass
        return self.http_error_default(req, fp, code, msg, headers)
|---|
| 1742 | |
|---|
| 1743 | def _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers): |
|---|
| 1744 | """URL, filename, or string --> stream |
|---|
| 1745 | |
|---|
| 1746 | This function lets you define parsers that take any input source |
|---|
| 1747 | (URL, pathname to local or network file, or actual data as a string) |
|---|
| 1748 | and deal with it in a uniform manner. Returned object is guaranteed |
|---|
| 1749 | to have all the basic stdio read methods (read, readline, readlines). |
|---|
| 1750 | Just .close() the object when you're done with it. |
|---|
| 1751 | |
|---|
| 1752 | If the etag argument is supplied, it will be used as the value of an |
|---|
| 1753 | If-None-Match request header. |
|---|
| 1754 | |
|---|
| 1755 | If the modified argument is supplied, it must be a tuple of 9 integers |
|---|
| 1756 | as returned by gmtime() in the standard Python time module. This MUST |
|---|
| 1757 | be in GMT (Greenwich Mean Time). The formatted date/time will be used |
|---|
| 1758 | as the value of an If-Modified-Since request header. |
|---|
| 1759 | |
|---|
| 1760 | If the agent argument is supplied, it will be used as the value of a |
|---|
| 1761 | User-Agent request header. |
|---|
| 1762 | |
|---|
| 1763 | If the referrer argument is supplied, it will be used as the value of a |
|---|
| 1764 | Referer[sic] request header. |
|---|
| 1765 | |
|---|
| 1766 | If handlers is supplied, it is a list of handlers used to build a |
|---|
| 1767 | urllib2 opener. |
|---|
| 1768 | """ |
|---|
| 1769 | |
|---|
| 1770 | if hasattr(url_file_stream_or_string, 'read'): |
|---|
| 1771 | return url_file_stream_or_string |
|---|
| 1772 | |
|---|
| 1773 | if url_file_stream_or_string == '-': |
|---|
| 1774 | return sys.stdin |
|---|
| 1775 | |
|---|
| 1776 | if urlparse.urlparse(url_file_stream_or_string)[0] in ('http', 'https', 'ftp'): |
|---|
| 1777 | if not agent: |
|---|
| 1778 | agent = USER_AGENT |
|---|
| 1779 | # test for inline user:password for basic auth |
|---|
| 1780 | auth = None |
|---|
| 1781 | if base64: |
|---|
| 1782 | urltype, rest = urllib.splittype(url_file_stream_or_string) |
|---|
| 1783 | realhost, rest = urllib.splithost(rest) |
|---|
| 1784 | if realhost: |
|---|
| 1785 | user_passwd, realhost = urllib.splituser(realhost) |
|---|
| 1786 | if user_passwd: |
|---|
| 1787 | url_file_stream_or_string = '%s://%s%s' % (urltype, realhost, rest) |
|---|
| 1788 | auth = base64.encodestring(user_passwd).strip() |
|---|
| 1789 | # try to open with urllib2 (to use optional headers) |
|---|
| 1790 | request = urllib2.Request(url_file_stream_or_string) |
|---|
| 1791 | request.add_header('User-Agent', agent) |
|---|
| 1792 | if etag: |
|---|
| 1793 | request.add_header('If-None-Match', etag) |
|---|
| 1794 | if modified: |
|---|
| 1795 | # format into an RFC 1123-compliant timestamp. We can't use |
|---|
| 1796 | # time.strftime() since the %a and %b directives can be affected |
|---|
| 1797 | # by the current locale, but RFC 2616 states that dates must be |
|---|
| 1798 | # in English. |
|---|
| 1799 | short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] |
|---|
| 1800 | months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] |
|---|
| 1801 | request.add_header('If-Modified-Since', '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])) |
|---|
| 1802 | if referrer: |
|---|
| 1803 | request.add_header('Referer', referrer) |
|---|
| 1804 | if gzip and zlib: |
|---|
| 1805 | request.add_header('Accept-encoding', 'gzip, deflate') |
|---|
| 1806 | elif gzip: |
|---|
| 1807 | request.add_header('Accept-encoding', 'gzip') |
|---|
| 1808 | elif zlib: |
|---|
| 1809 | request.add_header('Accept-encoding', 'deflate') |
|---|
| 1810 | else: |
|---|
| 1811 | request.add_header('Accept-encoding', '') |
|---|
| 1812 | if auth: |
|---|
| 1813 | request.add_header('Authorization', 'Basic %s' % auth) |
|---|
| 1814 | if ACCEPT_HEADER: |
|---|
| 1815 | request.add_header('Accept', ACCEPT_HEADER) |
|---|
| 1816 | request.add_header('A-IM', 'feed') # RFC 3229 support |
|---|
| 1817 | opener = apply(urllib2.build_opener, tuple([_FeedURLHandler()] + handlers)) |
|---|
| 1818 | opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent |
|---|
| 1819 | try: |
|---|
| 1820 | return opener.open(request) |
|---|
| 1821 | finally: |
|---|
| 1822 | opener.close() # JohnD |
|---|
| 1823 | |
|---|
| 1824 | # try to open with native open function (if url_file_stream_or_string is a filename) |
|---|
| 1825 | try: |
|---|
| 1826 | return open(url_file_stream_or_string) |
|---|
| 1827 | except: |
|---|
| 1828 | pass |
|---|
| 1829 | |
|---|
| 1830 | # treat url_file_stream_or_string as string |
|---|
| 1831 | return _StringIO(str(url_file_stream_or_string)) |
|---|
| 1832 | |
|---|
| 1833 | _date_handlers = [] |
|---|
| 1834 | def registerDateHandler(func): |
|---|
| 1835 | '''Register a date handler function (takes string, returns 9-tuple date in GMT)''' |
|---|
| 1836 | _date_handlers.insert(0, func) |
|---|
| 1837 | |
|---|
| 1838 | # ISO-8601 date parsing routines written by Fazal Majid. |
|---|
| 1839 | # The ISO 8601 standard is very convoluted and irregular - a full ISO 8601 |
|---|
| 1840 | # parser is beyond the scope of feedparser and would be a worthwhile addition |
|---|
| 1841 | # to the Python library. |
|---|
| 1842 | # A single regular expression cannot parse ISO 8601 date formats into groups |
|---|
| 1843 | # as the standard is highly irregular (for instance is 030104 2003-01-04 or |
|---|
| 1844 | # 0301-04-01), so we use templates instead. |
|---|
| 1845 | # Please note the order in templates is significant because we need a |
|---|
| 1846 | # greedy match. |
|---|
| 1847 | _iso8601_tmpl = ['YYYY-?MM-?DD', 'YYYY-MM', 'YYYY-?OOO', |
|---|
| 1848 | 'YY-?MM-?DD', 'YY-?OOO', 'YYYY', |
|---|
| 1849 | '-YY-?MM', '-OOO', '-YY', |
|---|
| 1850 | '--MM-?DD', '--MM', |
|---|
| 1851 | '---DD', |
|---|
| 1852 | 'CC', ''] |
|---|
| 1853 | _iso8601_re = [ |
|---|
| 1854 | tmpl.replace( |
|---|
| 1855 | 'YYYY', r'(?P<year>\d{4})').replace( |
|---|
| 1856 | 'YY', r'(?P<year>\d\d)').replace( |
|---|
| 1857 | 'MM', r'(?P<month>[01]\d)').replace( |
|---|
| 1858 | 'DD', r'(?P<day>[0123]\d)').replace( |
|---|
| 1859 | 'OOO', r'(?P<ordinal>[0123]\d\d)').replace( |
|---|
| 1860 | 'CC', r'(?P<century>\d\d$)') |
|---|
| 1861 | + r'(T?(?P<hour>\d{2}):(?P<minute>\d{2})' |
|---|
| 1862 | + r'(:(?P<second>\d{2}))?' |
|---|
| 1863 | + r'(?P<tz>[+-](?P<tzhour>\d{2})(:(?P<tzmin>\d{2}))?|Z)?)?' |
|---|
| 1864 | for tmpl in _iso8601_tmpl] |
|---|
| 1865 | del tmpl |
|---|
| 1866 | _iso8601_matches = [re.compile(regex).match for regex in _iso8601_re] |
|---|
| 1867 | del regex |
|---|
| 1868 | def _parse_date_iso8601(dateString): |
|---|
| 1869 | '''Parse a variety of ISO-8601-compatible formats like 20040105''' |
|---|
| 1870 | m = None |
|---|
| 1871 | for _iso8601_match in _iso8601_matches: |
|---|
| 1872 | m = _iso8601_match(dateString) |
|---|
| 1873 | if m: break |
|---|
| 1874 | if not m: return |
|---|
| 1875 | if m.span() == (0, 0): return |
|---|
| 1876 | params = m.groupdict() |
|---|
| 1877 | ordinal = params.get('ordinal', 0) |
|---|
| 1878 | if ordinal: |
|---|
| 1879 | ordinal = int(ordinal) |
|---|
| 1880 | else: |
|---|
| 1881 | ordinal = 0 |
|---|
| 1882 | year = params.get('year', '--') |
|---|
| 1883 | if not year or year == '--': |
|---|
| 1884 | year = time.gmtime()[0] |
|---|
| 1885 | elif len(year) == 2: |
|---|
| 1886 | # ISO 8601 assumes current century, i.e. 93 -> 2093, NOT 1993 |
|---|
| 1887 | year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|---|
| 1888 | else: |
|---|
| 1889 | year = int(year) |
|---|
| 1890 | month = params.get('month', '-') |
|---|
| 1891 | if not month or month == '-': |
|---|
| 1892 | # ordinals are NOT normalized by mktime, we simulate them |
|---|
| 1893 | # by setting month=1, day=ordinal |
|---|
| 1894 | if ordinal: |
|---|
| 1895 | month = 1 |
|---|
| 1896 | else: |
|---|
| 1897 | month = time.gmtime()[1] |
|---|
| 1898 | month = int(month) |
|---|
| 1899 | day = params.get('day', 0) |
|---|
| 1900 | if not day: |
|---|
| 1901 | # see above |
|---|
| 1902 | if ordinal: |
|---|
| 1903 | day = ordinal |
|---|
| 1904 | elif params.get('century', 0) or \ |
|---|
| 1905 | params.get('year', 0) or params.get('month', 0): |
|---|
| 1906 | day = 1 |
|---|
| 1907 | else: |
|---|
| 1908 | day = time.gmtime()[2] |
|---|
| 1909 | else: |
|---|
| 1910 | day = int(day) |
|---|
| 1911 | # special case of the century - is the first year of the 21st century |
|---|
| 1912 | # 2000 or 2001 ? The debate goes on... |
|---|
| 1913 | if 'century' in params.keys(): |
|---|
| 1914 | year = (int(params['century']) - 1) * 100 + 1 |
|---|
| 1915 | # in ISO 8601 most fields are optional |
|---|
| 1916 | for field in ['hour', 'minute', 'second', 'tzhour', 'tzmin']: |
|---|
| 1917 | if not params.get(field, None): |
|---|
| 1918 | params[field] = 0 |
|---|
| 1919 | hour = int(params.get('hour', 0)) |
|---|
| 1920 | minute = int(params.get('minute', 0)) |
|---|
| 1921 | second = int(params.get('second', 0)) |
|---|
| 1922 | # weekday is normalized by mktime(), we can ignore it |
|---|
| 1923 | weekday = 0 |
|---|
| 1924 | # daylight savings is complex, but not needed for feedparser's purposes |
|---|
| 1925 | # as time zones, if specified, include mention of whether it is active |
|---|
| 1926 | # (e.g. PST vs. PDT, CET). Using -1 is implementation-dependent and |
|---|
| 1927 | # and most implementations have DST bugs |
|---|
| 1928 | daylight_savings_flag = 0 |
|---|
| 1929 | tm = [year, month, day, hour, minute, second, weekday, |
|---|
| 1930 | ordinal, daylight_savings_flag] |
|---|
| 1931 | # ISO 8601 time zone adjustments |
|---|
| 1932 | tz = params.get('tz') |
|---|
| 1933 | if tz and tz != 'Z': |
|---|
| 1934 | if tz[0] == '-': |
|---|
| 1935 | tm[3] += int(params.get('tzhour', 0)) |
|---|
| 1936 | tm[4] += int(params.get('tzmin', 0)) |
|---|
| 1937 | elif tz[0] == '+': |
|---|
| 1938 | tm[3] -= int(params.get('tzhour', 0)) |
|---|
| 1939 | tm[4] -= int(params.get('tzmin', 0)) |
|---|
| 1940 | else: |
|---|
| 1941 | return None |
|---|
| 1942 | # Python's time.mktime() is a wrapper around the ANSI C mktime(3c) |
|---|
| 1943 | # which is guaranteed to normalize d/m/y/h/m/s. |
|---|
| 1944 | # Many implementations have bugs, but we'll pretend they don't. |
|---|
| 1945 | return time.localtime(time.mktime(tm)) |
|---|
| 1946 | registerDateHandler(_parse_date_iso8601) |
|---|
| 1947 | |
|---|
| 1948 | # 8-bit date handling routines written by ytrewq1. |
|---|
| 1949 | _korean_year = u'\ub144' # b3e2 in euc-kr |
|---|
| 1950 | _korean_month = u'\uc6d4' # bff9 in euc-kr |
|---|
| 1951 | _korean_day = u'\uc77c' # c0cf in euc-kr |
|---|
| 1952 | _korean_am = u'\uc624\uc804' # bfc0 c0fc in euc-kr |
|---|
| 1953 | _korean_pm = u'\uc624\ud6c4' # bfc0 c8c4 in euc-kr |
|---|
| 1954 | |
|---|
| 1955 | _korean_onblog_date_re = \ |
|---|
| 1956 | re.compile('(\d{4})%s\s+(\d{2})%s\s+(\d{2})%s\s+(\d{2}):(\d{2}):(\d{2})' % \ |
|---|
| 1957 | (_korean_year, _korean_month, _korean_day)) |
|---|
| 1958 | _korean_nate_date_re = \ |
|---|
| 1959 | re.compile(u'(\d{4})-(\d{2})-(\d{2})\s+(%s|%s)\s+(\d{,2}):(\d{,2}):(\d{,2})' % \ |
|---|
| 1960 | (_korean_am, _korean_pm)) |
|---|
| 1961 | def _parse_date_onblog(dateString): |
|---|
| 1962 | '''Parse a string according to the OnBlog 8-bit date format''' |
|---|
| 1963 | m = _korean_onblog_date_re.match(dateString) |
|---|
| 1964 | if not m: return |
|---|
| 1965 | w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \ |
|---|
| 1966 | {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\ |
|---|
| 1967 | 'hour': m.group(4), 'minute': m.group(5), 'second': m.group(6),\ |
|---|
| 1968 | 'zonediff': '+09:00'} |
|---|
| 1969 | if _debug: sys.stderr.write('OnBlog date parsed as: %s\n' % w3dtfdate) |
|---|
| 1970 | return _parse_date_w3dtf(w3dtfdate) |
|---|
| 1971 | registerDateHandler(_parse_date_onblog) |
|---|
| 1972 | |
|---|
def _parse_date_nate(dateString):
    '''Parse a string according to the Nate 8-bit date format

    Returns whatever _parse_date_w3dtf returns for the rebuilt date, or
    None when dateString does not match or has an empty time field.
    '''
    m = _korean_nate_date_re.match(dateString)
    if not m: return
    ampm = m.group(4)
    hour, minute, second = m.group(5), m.group(6), m.group(7)
    if not (hour and minute and second):
        # the pattern accepts empty time fields (\d{,2}); there is nothing
        # sensible to build from them
        return
    # convert the 12-hour clock to 24 hours: 12 PM stays 12, 12 AM is 0
    hour = int(hour)
    if ampm == _korean_pm:
        if hour < 12:
            hour += 12
    elif hour == 12:
        hour = 0
    # one-digit fields are zero-padded
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % \
                {'year': m.group(1), 'month': m.group(2), 'day': m.group(3),\
                 'hour': '%02d' % hour, 'minute': '%02d' % int(minute),\
                 'second': '%02d' % int(second),\
                 'zonediff': '+09:00'}
    if _debug: sys.stderr.write('Nate date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
|---|
| 1990 | registerDateHandler(_parse_date_nate) |
|---|
| 1991 | |
|---|
# Groups: 1 year, 2 month, 3 day, 4 hour, 5 minute, 6 second,
# 7 optional fractional seconds (matched but not used by the parser below).
_mssql_date_re = \
    re.compile('(\d{4})-(\d{2})-(\d{2})\s+(\d{2}):(\d{2}):(\d{2})(\.\d+)?')
def _parse_date_mssql(dateString):
    '''Parse a string according to the MS SQL date format

    The matched fields are reassembled as a W3DTF string and passed to
    _parse_date_w3dtf, whose result is returned.  Returns None when
    dateString does not match the pattern.
    '''
    found = _mssql_date_re.match(dateString)
    if found is None:
        return
    year, month, day, hour, minute, second = found.group(1, 2, 3, 4, 5, 6)
    # NOTE(review): the pattern carries no time zone, yet +09:00 is assumed
    # here exactly as in the Korean handlers -- confirm this is intended.
    fields = {'zonediff': '+09:00'}
    fields['year'] = year
    fields['month'] = month
    fields['day'] = day
    fields['hour'] = hour
    fields['minute'] = minute
    fields['second'] = second
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s:%(second)s%(zonediff)s' % fields
    if _debug:
        sys.stderr.write('MS SQL date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_mssql)
|---|
| 2005 | |
|---|
# Unicode strings for Greek date strings
# Maps Greek month abbreviations to the English abbreviations used in
# RFC 822 dates.  Several months are listed under more than one spelling.
# The trailing comment on each line gives the key as iso-8859-7 bytes.
_greek_months = \
  { \
    u'\u0399\u03b1\u03bd': u'Jan',       # c9e1ed   in iso-8859-7
    # NOTE(review): the next two keys end in U+03CE (omega with tonos); the
    # usual abbreviation for March would end in rho (U+03C1) -- confirm
    # against real feeds.
    u'\u03a6\u03b5\u03b2': u'Feb',       # d6e5e2   in iso-8859-7
    u'\u039c\u03ac\u03ce': u'Mar',       # ccdcfe   in iso-8859-7
    u'\u039c\u03b1\u03ce': u'Mar',       # cce1fe   in iso-8859-7
    u'\u0391\u03c0\u03c1': u'Apr',       # c1f0f1   in iso-8859-7
    u'\u039c\u03ac\u03b9': u'May',       # ccdce9   in iso-8859-7
    u'\u039c\u03b1\u03ca': u'May',       # cce1fa   in iso-8859-7
    u'\u039c\u03b1\u03b9': u'May',       # cce1e9   in iso-8859-7
    u'\u0399\u03bf\u03cd\u03bd': u'Jun', # c9effded in iso-8859-7
    u'\u0399\u03bf\u03bd': u'Jun',       # c9efed   in iso-8859-7
    u'\u0399\u03bf\u03cd\u03bb': u'Jul', # c9effdeb in iso-8859-7
    # NOTE(review): the byte comment on the next line says f9 (omega, U+03C9)
    # but the key itself uses U+03BF (omicron, byte ef) -- confirm which one
    # was intended.
    u'\u0399\u03bf\u03bb': u'Jul',       # c9f9eb   in iso-8859-7
    u'\u0391\u03cd\u03b3': u'Aug',       # c1fde3   in iso-8859-7
    u'\u0391\u03c5\u03b3': u'Aug',       # c1f5e3   in iso-8859-7
    u'\u03a3\u03b5\u03c0': u'Sep',       # d3e5f0   in iso-8859-7
    u'\u039f\u03ba\u03c4': u'Oct',       # cfeaf4   in iso-8859-7
    u'\u039d\u03bf\u03ad': u'Nov',       # cdefdd   in iso-8859-7
    u'\u039d\u03bf\u03b5': u'Nov',       # cdefe5   in iso-8859-7
    u'\u0394\u03b5\u03ba': u'Dec',       # c4e5ea   in iso-8859-7
  }

# Maps Greek weekday abbreviations to their English RFC 822 equivalents.
_greek_wdays = \
  { \
    u'\u039a\u03c5\u03c1': u'Sun', # caf5f1 in iso-8859-7
    u'\u0394\u03b5\u03c5': u'Mon', # c4e5f5 in iso-8859-7
    u'\u03a4\u03c1\u03b9': u'Tue', # d4f1e9 in iso-8859-7
    u'\u03a4\u03b5\u03c4': u'Wed', # d4e5f4 in iso-8859-7
    u'\u03a0\u03b5\u03bc': u'Thu', # d0e5ec in iso-8859-7
    u'\u03a0\u03b1\u03c1': u'Fri', # d0e1f1 in iso-8859-7
    u'\u03a3\u03b1\u03b2': u'Sat', # d3e1e2 in iso-8859-7
  }

# Layout: '<weekday>, DD <month> YYYY HH:MM:SS <zone>'.
# Groups: 1 weekday, 2 day, 3 month, 4 year, 5 hour, 6 minute, 7 second,
# 8 zone.
_greek_date_format_re = \
    re.compile(u'([^,]+),\s+(\d{2})\s+([^\s]+)\s+(\d{4})\s+(\d{2}):(\d{2}):(\d{2})\s+([^\s]+)')
|---|
| 2043 | |
|---|
def _parse_date_greek(dateString):
    '''Parse a string according to a Greek 8-bit date format.

    The Greek weekday and month abbreviations are translated to English and
    the rebuilt RFC 822 style string is passed to _parse_date_rfc822, whose
    result is returned.

    Returns None when dateString does not match the expected layout or uses
    an abbreviation missing from _greek_wdays / _greek_months.
    '''
    m = _greek_date_format_re.match(dateString)
    if not m: return
    try:
        wday = _greek_wdays[m.group(1)]
        month = _greek_months[m.group(3)]
    except KeyError:
        # unknown abbreviation: not a date this handler understands
        return
    rfc822date = '%(wday)s, %(day)s %(month)s %(year)s %(hour)s:%(minute)s:%(second)s %(zonediff)s' % \
                 {'wday': wday, 'day': m.group(2), 'month': month, 'year': m.group(4),\
                  'hour': m.group(5), 'minute': m.group(6), 'second': m.group(7),\
                  'zonediff': m.group(8)}
    if _debug: sys.stderr.write('Greek date parsed as: %s\n' % rfc822date)
    return _parse_date_rfc822(rfc822date)
registerDateHandler(_parse_date_greek)
|---|
| 2060 | |
|---|
| 2061 | # Unicode strings for Hungarian date strings |
|---|
| 2062 | _hungarian_months = \ |
|---|
| 2063 | { \ |
|---|
| 2064 | u'janu\u00e1r': u'01', # e1 in iso-8859-2 |
|---|
| 2065 | u'febru\u00e1ri': u'02', # e1 in iso-8859-2 |
|---|
| 2066 | u'm\u00e1rcius': u'03', # e1 in iso-8859-2 |
|---|
| 2067 | u'\u00e1prilis': u'04', # e1 in iso-8859-2 |
|---|
| 2068 | u'm\u00e1ujus': u'05', # e1 in iso-8859-2 |
|---|
| 2069 | u'j\u00fanius': u'06', # fa in iso-8859-2 |
|---|
| 2070 | u'j\u00falius': u'07', # fa in iso-8859-2 |
|---|
| 2071 | u'augusztus': u'08', |
|---|
| 2072 | u'szeptember': u'09', |
|---|
| 2073 | u'okt\u00f3ber': u'10', # f3 in iso-8859-2 |
|---|
| 2074 | u'november': u'11', |
|---|
| 2075 | u'december': u'12', |
|---|
| 2076 | } |
|---|
| 2077 | |
|---|
| 2078 | _hungarian_date_format_re = \ |
|---|
| 2079 | re.compile(u'(\d{4})-([^-]+)-(\d{,2})T(\d{,2}):(\d{2})((\+|-)(\d{,2}:\d{2}))') |
|---|
| 2080 | |
|---|
def _parse_date_hungarian(dateString):
    '''Parse a string according to a Hungarian 8-bit date format.

    The month name is translated to its number, one-digit day and hour
    values are zero-padded, and the rebuilt W3DTF string is passed to
    _parse_date_w3dtf, whose result is returned.

    Returns None when dateString does not match the expected layout or the
    month name is missing from _hungarian_months.
    '''
    m = _hungarian_date_format_re.match(dateString)
    if not m: return
    try:
        month = _hungarian_months[m.group(2)]
    except KeyError:
        # unknown month name: not a date this handler understands
        return
    day = m.group(3)
    if len(day) == 1:
        day = '0' + day
    hour = m.group(4)
    if len(hour) == 1:
        hour = '0' + hour
    w3dtfdate = '%(year)s-%(month)s-%(day)sT%(hour)s:%(minute)s%(zonediff)s' % \
                {'year': m.group(1), 'month': month, 'day': day,\
                 'hour': hour, 'minute': m.group(5),\
                 'zonediff': m.group(6)}
    if _debug: sys.stderr.write('Hungarian date parsed as: %s\n' % w3dtfdate)
    return _parse_date_w3dtf(w3dtfdate)
registerDateHandler(_parse_date_hungarian)
|---|
| 2102 | |
|---|
| 2103 | # W3DTF-style date parsing adapted from PyXML xml.utils.iso8601, written by |
|---|
| 2104 | # Drake and licensed under the Python license. Removed all range checking |
|---|
| 2105 | # for month, day, hour, minute, and second, since mktime will normalize |
|---|
| 2106 | # these later |
|---|
| 2107 | def _parse_date_w3dtf(dateString): |
|---|
| 2108 | def __extract_date(m): |
|---|
| 2109 | year = int(m.group('year')) |
|---|
| 2110 | if year < 100: |
|---|
| 2111 | year = 100 * int(time.gmtime()[0] / 100) + int(year) |
|---|
| 2112 | if year < 1000: |
|---|
| 2113 | return 0, 0, 0 |
|---|
| 2114 | julian = m.group('julian') |
|---|
| 2115 | if julian: |
|---|
| 2116 | julian = int(julian) |
|---|
| 2117 | month = julian / 30 + 1 |
|---|
| 2118 | day = julian % 30 + 1 |
|---|
| 2119 | jday = None |
|---|
| 2120 | while jday != julian: |
|---|
| 2121 | t = time.mktime((year, month, day, 0, 0, 0, 0, 0, 0)) |
|---|
| 2122 | jday = time.gmtime(t)[-2] |
|---|
| 2123 | diff = abs(jday - julian) |
|---|
| 2124 | if jday > julian: |
|---|
| 2125 | if diff < day: |
|---|
| 2126 | day = day - diff |
|---|
| 2127 | else: |
|---|
| 2128 | month = month - 1 |
|---|
| 2129 | day = 31 |
|---|
| 2130 | elif jday < julian: |
|---|
| 2131 | if day + diff < 28: |
|---|
| 2132 | day = day + diff |
|---|
| 2133 | else: |
|---|
| 2134 | month = month + 1 |
|---|
| 2135 | return year, month, day |
|---|
| 2136 | month = m.group('month') |
|---|
| 2137 | day = 1 |
|---|
| 2138 | if month is None: |
|---|
| 2139 | month = 1 |
|---|
| 2140 | else: |
|---|
| 2141 | month = int(month) |
|---|
| 2142 | day = m.group('day') |
|---|
| 2143 | if day: |
|---|
| 2144 | day = int(day) |
|---|
| 2145 | else: |
|---|
| 2146 | day = 1 |
|---|
| 2147 | return year, month, day |
|---|
| 2148 | |
|---|
| 2149 | def __extract_time(m): |
|---|
| 2150 | if not m: |
|---|
| 2151 | return 0, 0, 0 |
|---|
| 2152 | hours = m.group('hours') |
|---|
| 2153 | if not hours: |
|---|
| 2154 | return 0, 0, 0 |
|---|
| 2155 | hours = int(hours) |
|---|
| 2156 | minutes = int(m.group('minutes')) |
|---|
| 2157 | seconds = m.group('seconds') |
|---|
| 2158 | if seconds: |
|---|
| 2159 | seconds = int(seconds) |
|---|
| 2160 | else: |
|---|
| 2161 | seconds = 0 |
|---|
| 2162 | return hours, minutes, seconds |
|---|
| 2163 | |
|---|
| 2164 | def __extract_tzd(m): |
|---|
| 2165 | '''Return the Time Zone Designator as an offset in seconds from UTC.''' |
|---|
| 2166 | if not m: |
|---|
| 2167 | return 0 |
|---|
| 2168 | tzd = m.group('tzd') |
|---|
| 2169 | if not tzd: |
|---|
| 2170 | return 0 |
|---|
| 2171 | if tzd == 'Z': |
|---|
| 2172 | return 0 |
|---|
| 2173 | hours = int(m.group('tzdhours')) |
|---|
| 2174 | minutes = m.group('tzdminutes') |
|---|
| 2175 | if minutes: |
|---|
| 2176 | minutes = int(minutes) |
|---|
| 2177 | else: |
|---|
| 2178 | minutes = 0 |
|---|
| 2179 | offset = (hours*60 + minutes) * 60 |
|---|
| 2180 | if tzd[0] == '+': |
|---|
| 2181 | return -offset |
|---|
| 2182 | return offset |
|---|
| 2183 | |
|---|
| 2184 | __date_re = ('(?P<year>\d\d\d\d)' |
|---|
| 2185 | '(?:(?P<dsep>-|)' |
|---|
| 2186 | '(?:(?P<julian>\d\d\d)' |
|---|
| 2187 | '|(?P<month>\d\d)(?:(?P=dsep)(?P<day>\d\d))?))?') |
|---|
| 2188 | __tzd_re = '(?P<tzd>[-+](?P<tzdhours>\d\d)(?::?(?P<tzdminutes>\d\d))|Z)' |
|---|
| 2189 | __tzd_rx = re.compile(__tzd_re) |
|---|
| 2190 | __time_re = ('(?P<hours>\d\d)(?P<tsep>:|)(?P<minutes>\d\d)' |
|---|
| 2191 | '(?:(?P=tsep)(?P<seconds>\d\d(?:[.,]\d+)?))?' |
|---|
| 2192 | + __tzd_re) |
|---|
| 2193 | __datetime_re = '%s(?:T%s)?' % (__date_re, __time_re) |
|---|
| 2194 | __datetime_rx = re.compile(__datetime_re) |
|---|
| 2195 | m = __datetime_rx.match(dateString) |
|---|
| 2196 | if (m is None) or (m.group() != dateString): return |
|---|
| 2197 | gmt = __extract_date(m) + __extract_time(m) + (0, 0, 0) |
|---|
| 2198 | if gmt[0] == 0: return |
|---|
| 2199 | return time.gmtime(time.mktime(gmt) + __extract_tzd(m) - time.timezone) |
|---|
| 2200 | registerDateHandler(_parse_date_w3dtf) |
|---|
| 2201 | |
|---|
def _parse_date_rfc822(dateString):
    '''Parse an RFC822, RFC1123, RFC2822, or asctime-style date

    Returns a time.struct_time in GMT, or None when dateString is empty or
    rfc822.parsedate_tz cannot make sense of it.
    '''
    data = dateString.split()
    if not data:
        # empty or whitespace-only input: nothing to parse
        return None
    # drop a leading weekday ('Thu,' / 'Thu.' / 'thu')
    if data[0][-1] in (',', '.') or data[0].lower() in rfc822._daynames:
        del data[0]
    if len(data) == 4:
        # the time and a '+' zone offset may be glued together; split them,
        # otherwise add an empty zone field
        s = data[3]
        i = s.find('+')
        if i > 0:
            data[3:] = [s[:i], s[i+1:]]
        else:
            data.append('')
        dateString = " ".join(data)
    if len(data) < 5:
        # date without a time: assume midnight GMT
        dateString += ' 00:00:00 GMT'
    tm = rfc822.parsedate_tz(dateString)
    if tm:
        return time.gmtime(rfc822.mktime_tz(tm))
# rfc822.py defines several time zones, but we define some extra ones.
# 'ET' is equivalent to 'EST', etc.
_additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -800}
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
|---|
| 2225 | |
|---|
def _parse_date(dateString):
    '''Parses a variety of date formats into a 9-tuple in GMT

    Each function in _date_handlers is tried in turn.  The first result that
    is non-empty, has nine items and whose items all convert with int() is
    returned.  A handler that raises or returns something else is skipped.
    Returns None when no handler succeeds.
    '''
    for parse_with in _date_handlers:
        try:
            candidate = parse_with(dateString)
            if candidate:
                if len(candidate) != 9:
                    if _debug:
                        sys.stderr.write('date handler function must return 9-tuple\n')
                    raise ValueError
                # every field must be convertible to int; a failure raises
                # and moves on to the next handler
                for field in candidate:
                    int(field)
                return candidate
        except Exception:
            if _debug:
                sys.stderr.write('%s raised %s\n' % (parse_with.__name__, repr(sys.exc_info()[1])))
    return None
|---|
| 2241 | |
|---|
def _getCharacterEncoding(http_headers, xml_data):
    '''Get the character encoding of the XML document

    http_headers is a dictionary
    xml_data is a raw string (not Unicode)

    Returns a 5-tuple: (true_encoding, http_encoding, xml_encoding,
    sniffed_xml_encoding, acceptable_content_type).  The first four are
    strings ('' when unknown); the last is 1 when the HTTP Content-Type is
    an XML media type and 0 otherwise.

    This is so much trickier than it sounds, it's not even funny.
    According to RFC 3023 ('XML Media Types'), if the HTTP Content-Type
    is application/xml, application/*+xml,
    application/xml-external-parsed-entity, or application/xml-dtd,
    the encoding given in the charset parameter of the HTTP Content-Type
    takes precedence over the encoding given in the XML prefix within the
    document, and defaults to 'utf-8' if neither are specified.  But, if
    the HTTP Content-Type is text/xml, text/*+xml, or
    text/xml-external-parsed-entity, the encoding given in the XML prefix
    within the document is ALWAYS IGNORED and only the encoding given in
    the charset parameter of the HTTP Content-Type header should be
    respected, and it defaults to 'us-ascii' if not specified.

    Furthermore, discussion on the atom-syntax mailing list with the
    author of RFC 3023 leads me to the conclusion that any document
    served with a Content-Type of text/* and no charset parameter
    must be treated as us-ascii.  (We now do this.)  And also that it
    must always be flagged as non-well-formed.  (We now do this too.)

    If Content-Type is unspecified (input was local file or non-HTTP source)
    or unrecognized (server just got it totally wrong), then go by the
    encoding given in the XML prefix of the document and default to
    'iso-8859-1' as per the HTTP specification (RFC 2616).

    Then, assuming we didn't find a character encoding in the HTTP headers
    (and the HTTP Content-type allowed us to look in the body), we need
    to sniff the first few bytes of the XML data and try to determine
    whether the encoding is ASCII-compatible.  Section F of the XML
    specification shows the way here:
    http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info

    If the sniffed encoding is not ASCII-compatible, we need to make it
    ASCII compatible so that we can sniff further into the XML declaration
    to find the encoding attribute, which will tell us the true encoding.

    Of course, none of this guarantees that we will be able to parse the
    feed in the declared character encoding (assuming it was declared
    correctly, which many are not).  CJKCodecs and iconv_codec help a lot;
    you should definitely install them if you can.
    http://cjkpython.i18n.org/
    '''

    def _parseHTTPContentType(content_type):
        '''takes HTTP Content-Type header and returns (content type, charset)

        If no charset is specified, returns (content type, '')
        If no content type is specified, returns ('', '')
        Both return parameters are guaranteed to be lowercase strings
        '''
        content_type = content_type or ''
        content_type, params = cgi.parse_header(content_type)
        # Media types and charset names are case-insensitive, and
        # cgi.parse_header leaves their case untouched.  Lowercase both so
        # the comparisons below also recognize e.g. 'Application/XML'.
        charset = params.get('charset', '').replace("'", '')
        return content_type.lower(), charset.lower()

    sniffed_xml_encoding = ''
    xml_encoding = ''
    true_encoding = ''
    http_content_type, http_encoding = _parseHTTPContentType(http_headers.get('content-type'))
    # Must sniff for non-ASCII-compatible character encodings before
    # searching for XML declaration.  This heuristic is defined in
    # section F of the XML specification:
    # http://www.w3.org/TR/REC-xml/#sec-guessing-no-ext-info
    try:
        if xml_data[:4] == '\x4c\x6f\xa7\x94':
            # EBCDIC
            xml_data = _ebcdic_to_ascii(xml_data)
        elif xml_data[:4] == '\x00\x3c\x00\x3f':
            # UTF-16BE
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16BE with BOM
            sniffed_xml_encoding = 'utf-16be'
            xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x3f\x00':
            # UTF-16LE
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
        elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and (xml_data[2:4] != '\x00\x00'):
            # UTF-16LE with BOM
            sniffed_xml_encoding = 'utf-16le'
            xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\x00\x3c':
            # UTF-32BE
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\x3c\x00\x00\x00':
            # UTF-32LE
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
        elif xml_data[:4] == '\x00\x00\xfe\xff':
            # UTF-32BE with BOM
            sniffed_xml_encoding = 'utf-32be'
            xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
        elif xml_data[:4] == '\xff\xfe\x00\x00':
            # UTF-32LE with BOM
            sniffed_xml_encoding = 'utf-32le'
            xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
        elif xml_data[:3] == '\xef\xbb\xbf':
            # UTF-8 with BOM
            sniffed_xml_encoding = 'utf-8'
            xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
        else:
            # ASCII-compatible
            pass
        xml_encoding_match = re.compile('^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
    except:
        # best effort: a failed conversion (unknown codec, undecodable
        # bytes) simply means no XML declaration could be read
        xml_encoding_match = None
    if xml_encoding_match:
        xml_encoding = xml_encoding_match.groups()[0].lower()
        # a declared multi-byte family name is less specific than the byte
        # order that was sniffed, so prefer the sniffed encoding
        if sniffed_xml_encoding and (xml_encoding in ('iso-10646-ucs-2', 'ucs-2', 'csunicode', 'iso-10646-ucs-4', 'ucs-4', 'csucs4', 'utf-16', 'utf-32', 'utf_16', 'utf_32', 'utf16', 'u16')):
            xml_encoding = sniffed_xml_encoding
    acceptable_content_type = 0
    application_content_types = ('application/xml', 'application/xml-dtd', 'application/xml-external-parsed-entity')
    text_content_types = ('text/xml', 'text/xml-external-parsed-entity')
    if (http_content_type in application_content_types) or \
       (http_content_type.startswith('application/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or xml_encoding or 'utf-8'
    elif (http_content_type in text_content_types) or \
         (http_content_type.startswith('text/') and http_content_type.endswith('+xml')):
        acceptable_content_type = 1
        true_encoding = http_encoding or 'us-ascii'
    elif http_content_type.startswith('text/'):
        true_encoding = http_encoding or 'us-ascii'
    elif http_headers and (not http_headers.has_key('content-type')):
        true_encoding = xml_encoding or 'iso-8859-1'
    else:
        true_encoding = xml_encoding or 'utf-8'
    return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
|---|
| 2377 | |
|---|
def _toUTF8(data, encoding):
    '''Re-encode an XML data stream as UTF-8 and rewrite its XML declaration

    data is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already
    encoding is a string recognized by encodings.aliases

    A byte order mark at the start of data is removed and overrides
    encoding.  The result is a UTF-8 byte string whose XML declaration says
    encoding='utf-8'; a declaration is prepended when the document has none.
    Errors raised by unicode() while decoding are not caught here.
    '''
    if _debug: sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)
    # strip Byte Order Mark (if present)
    # A two-byte UTF-16 mark only counts when the following two bytes are not
    # NUL, which leaves the UTF-32LE mark (ff fe 00 00) for the later check.
    maybe_utf16 = (len(data) >= 4) and (data[2:4] != '\x00\x00')
    bom = None
    if maybe_utf16 and (data[:2] == '\xfe\xff'):
        bom = ('utf-16be', 2, 'trying utf-16be instead\n')
    elif maybe_utf16 and (data[:2] == '\xff\xfe'):
        bom = ('utf-16le', 2, 'trying utf-16le instead\n')
    elif data[:3] == '\xef\xbb\xbf':
        bom = ('utf-8', 3, 'trying utf-8 instead\n')
    elif data[:4] == '\x00\x00\xfe\xff':
        bom = ('utf-32be', 4, 'trying utf-32be instead\n')
    elif data[:4] == '\xff\xfe\x00\x00':
        bom = ('utf-32le', 4, 'trying utf-32le instead\n')
    if bom is not None:
        bom_encoding, bom_size, notice = bom
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != bom_encoding:
                sys.stderr.write(notice)
        encoding = bom_encoding
        data = data[bom_size:]
    text = unicode(data, encoding)
    if _debug: sys.stderr.write('successfully converted %s data to unicode\n' % encoding)
    decl_rx = re.compile('^<\?xml[^>]*?>')
    utf8_decl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if decl_rx.search(text):
        text = decl_rx.sub(utf8_decl, text)
    else:
        text = utf8_decl + u'\n' + text
    return text.encode('utf-8')
|---|
| 2430 | |
|---|
| 2431 | def _stripDoctype(data): |
|---|
| 2432 | '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |
|---|
| 2433 | |
|---|
| 2434 | rss_version may be 'rss091n' or None |
|---|
| 2435 | stripped_data is the same XML document, minus the DOCTYPE |
|---|
| 2436 | ''' |
|---|
| 2437 | entity_pattern = re.compile(r'<!ENTITY([^>]*?)>', re.MULTILINE) |
|---|
| 2438 | data = entity_pattern.sub('', data) |
|---|
| 2439 | doctype_pattern = re.compile(r'<!DOCTYPE([^>]*?)>', re.MULTILINE) |
|---|
| 2440 | doctype_results = doctype_pattern.findall(data) |
|---|
| 2441 | doctype = doctype_results and doctype_results[0] or '' |
|---|
| 2442 | if doctype.lower().count('netscape'): |
|---|
| 2443 | version = 'rss091n' |
|---|
| 2444 | else: |
|---|
| 2445 | version = None |
|---|
| 2446 | data = doctype_pattern.sub('', data) |
|---|
| 2447 | return version, data |
|---|
| 2448 | |
|---|
| 2449 | def parse(url_file_stream_or_string, etag=None, modified=None, agent=None, referrer=None, handlers=[]): |
|---|
| 2450 | '''Parse a feed from a URL, file, stream, or string''' |
|---|
| 2451 | result = FeedParserDict() |
|---|
| 2452 | result['feed'] = FeedParserDict() |
|---|
| 2453 | result['entries'] = [] |
|---|
| 2454 | if _XML_AVAILABLE: |
|---|
| 2455 | result['bozo'] = 0 |
|---|
| 2456 | if type(handlers) == types.InstanceType: |
|---|
| 2457 | handlers = [handlers] |
|---|
| 2458 | try: |
|---|
| 2459 | f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers) |
|---|
| 2460 | data = f.read() |
|---|
| 2461 | except Exception, e: |
|---|
| 2462 | result['bozo'] = 1 |
|---|
| 2463 | result['bozo_exception'] = e |
|---|
| 2464 | data = '' |
|---|
| 2465 | f = None |
|---|
| 2466 | |
|---|
| 2467 | # if feed is gzip-compressed, decompress it |
|---|
| 2468 | if f and data and hasattr(f, 'headers'): |
|---|
| 2469 | if gzip and f.headers.get('content-encoding', '') == 'gzip': |
|---|
| 2470 | try: |
|---|
| 2471 | data = gzip.GzipFile(fileobj=_StringIO(data)).read() |
|---|
| 2472 | except Exception, e: |
|---|
| 2473 | # Some feeds claim to be gzipped but they're not, so |
|---|
| 2474 | # we get garbage. Ideally, we should re-request the |
|---|
| 2475 | # feed without the 'Accept-encoding: gzip' header, |
|---|
| 2476 | # but we don't. |
|---|
| 2477 | result['bozo'] = 1 |
|---|
| 2478 | result['bozo_exception'] = e |
|---|
| 2479 | data = '' |
|---|
| 2480 | elif zlib and f.headers.get('content-encoding', '') == 'deflate': |
|---|
| 2481 | try: |
|---|
| 2482 | data = zlib.decompress(data, -zlib.MAX_WBITS) |
|---|
| 2483 | except Exception, e: |
|---|
| 2484 | result['bozo'] = 1 |
|---|
| 2485 | result['bozo_exception'] = e |
|---|
| 2486 | data = '' |
|---|
| 2487 | |
|---|
| 2488 | # save HTTP headers |
|---|
| 2489 | if hasattr(f, 'info'): |
|---|
| 2490 | info = f.info() |
|---|
| 2491 | result['etag'] = info.getheader('ETag') |
|---|
| 2492 | last_modified = info.getheader('Last-Modified') |
|---|
| 2493 | if last_modified: |
|---|
| 2494 | result['modified'] = _parse_date(last_modified) |
|---|
| 2495 | if hasattr(f, 'url'): |
|---|
| 2496 | result['href'] = f.url |
|---|
| 2497 | result['status'] = 200 |
|---|
| 2498 | if hasattr(f, 'status'): |
|---|
| 2499 | result['status'] = f.status |
|---|
| 2500 | if hasattr(f, 'headers'): |
|---|
| 2501 | result['headers'] = f.headers.dict |
|---|
| 2502 | if hasattr(f, 'close'): |
|---|
| 2503 | f.close() |
|---|
| 2504 | |
|---|
| 2505 | # there are four encodings to keep track of: |
|---|
| 2506 | # - http_encoding is the encoding declared in the Content-Type HTTP header |
|---|
| 2507 | # - xml_encoding is the encoding declared in the <?xml declaration |
|---|
| 2508 | # - sniffed_encoding is the encoding sniffed from the first 4 bytes of the XML data |
|---|
| 2509 | # - result['encoding'] is the actual encoding, as per RFC 3023 and a variety of other conflicting specifications |
|---|
| 2510 | http_headers = result.get('headers', {}) |
|---|
| 2511 | result['encoding'], http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type = \ |
|---|
| 2512 | _getCharacterEncoding(http_headers, data) |
|---|
| 2513 | if http_headers and (not acceptable_content_type): |
|---|
| 2514 | if http_headers.has_key('content-type'): |
|---|
| 2515 | bozo_message = '%s is not an XML media type' % http_headers['content-type'] |
|---|
| 2516 | else: |
|---|
| 2517 | bozo_message = 'no Content-type specified' |
|---|
| 2518 | result['bozo'] = 1 |
|---|
| 2519 | result['bozo_exception'] = NonXMLContentType(bozo_message) |
|---|
| 2520 | |
|---|
| 2521 | result['version'], data = _stripDoctype(data) |
|---|
| 2522 | |
|---|
| 2523 | baseuri = http_headers.get('content-location', result.get('href')) |
|---|
| 2524 | baselang = http_headers.get('content-language', None) |
|---|
| 2525 | |
|---|
| 2526 | # if server sent 304, we're done |
|---|
| 2527 | if result.get('status', 0) == 304: |
|---|
| 2528 | result['version'] = '' |
|---|
| 2529 | result['debug_message'] = 'The feed has not changed since you last checked, ' + \ |
|---|
| 2530 | 'so the server sent no data. This is a feature, not a bug!' |
|---|
| 2531 | return result |
|---|
| 2532 | |
|---|
| 2533 | # if there was a problem downloading, we're done |
|---|
| 2534 | if not data: |
|---|
| 2535 | return result |
|---|
| 2536 | |
|---|
| 2537 | # determine character encoding |
|---|
| 2538 | use_strict_parser = 0 |
|---|
| 2539 | known_encoding = 0 |
|---|
| 2540 | tried_encodings = [] |
|---|
| 2541 | # try: HTTP encoding, declared XML encoding, encoding sniffed from BOM |
|---|
| 2542 | for proposed_encoding in (result['encoding'], xml_encoding, sniffed_xml_encoding): |
|---|
| 2543 | if not proposed_encoding: continue |
|---|
| 2544 | if proposed_encoding in tried_encodings: continue |
|---|
| 2545 | tried_encodings.append(proposed_encoding) |
|---|
| 2546 | try: |
|---|
| 2547 | data = _toUTF8(data, proposed_encoding) |
|---|
| 2548 | known_encoding = use_strict_parser = 1 |
|---|
| 2549 | break |
|---|
| 2550 | except: |
|---|
| 2551 | pass |
|---|
| 2552 | # if no luck and we have auto-detection library, try that |
|---|
| 2553 | if (not known_encoding) and chardet: |
|---|
| 2554 | try: |
|---|
| 2555 | proposed_encoding = chardet.detect(data)['encoding'] |
|---|
| 2556 | if proposed_encoding and (proposed_encoding not in tried_encodings): |
|---|
| 2557 | tried_encodings.append(proposed_encoding) |
|---|
| 2558 | data = _toUTF8(data, proposed_encoding) |
|---|
| 2559 | known_encoding = use_strict_parser = 1 |
|---|
| 2560 | except: |
|---|
| 2561 | pass |
|---|
| 2562 | # if still no luck and we haven't tried utf-8 yet, try that |
|---|
| 2563 | if (not known_encoding) and ('utf-8' not in tried_encodings): |
|---|
| 2564 | try: |
|---|
| 2565 | proposed_encoding = 'utf-8' |
|---|
| 2566 | tried_encodings.append(proposed_encoding) |
|---|
| 2567 | data = _toUTF8(data, proposed_encoding) |
|---|
| 2568 | known_encoding = use_strict_parser = 1 |
|---|
| 2569 | except: |
|---|
| 2570 | pass |
|---|
| 2571 | # if still no luck and we haven't tried windows-1252 yet, try that |
|---|
| 2572 | if (not known_encoding) and ('windows-1252' not in tried_encodings): |
|---|
| 2573 | try: |
|---|
| 2574 | proposed_encoding = 'windows-1252' |
|---|
| 2575 | tried_encodings.append(proposed_encoding) |
|---|
| 2576 | data = _toUTF8(data, proposed_encoding) |
|---|
| 2577 | known_encoding = use_strict_parser = 1 |
|---|
| 2578 | except: |
|---|
| 2579 | pass |
|---|
| 2580 | # if still no luck, give up |
|---|
| 2581 | if not known_encoding: |
|---|
| 2582 | result['bozo'] = 1 |
|---|
| 2583 | result['bozo_exception'] = CharacterEncodingUnknown( \ |
|---|
| 2584 | 'document encoding unknown, I tried ' + \ |
|---|
| 2585 | '%s, %s, utf-8, and windows-1252 but nothing worked' % \ |
|---|
| 2586 | (result['encoding'], xml_encoding)) |
|---|
| 2587 | result['encoding'] = '' |
|---|
| 2588 | elif proposed_encoding != result['encoding']: |
|---|
| 2589 | result['bozo'] = 1 |
|---|
| 2590 | result['bozo_exception'] = CharacterEncodingOverride( \ |
|---|
| 2591 | 'documented declared as %s, but parsed as %s' % \ |
|---|
| 2592 | (result['encoding'], proposed_encoding)) |
|---|
| 2593 | result['encoding'] = proposed_encoding |
|---|
| 2594 | |
|---|
| 2595 | if not _XML_AVAILABLE: |
|---|
| 2596 | use_strict_parser = 0 |
|---|
| 2597 | if use_strict_parser: |
|---|
| 2598 | # initialize the SAX parser |
|---|
| 2599 | feedparser = _StrictFeedParser(baseuri, baselang, 'utf-8') |
|---|
| 2600 | saxparser = xml.sax.make_parser(PREFERRED_XML_PARSERS) |
|---|
| 2601 | saxparser.setFeature(xml.sax.handler.feature_namespaces, 1) |
|---|
| 2602 | saxparser.setContentHandler(feedparser) |
|---|
| 2603 | saxparser.setErrorHandler(feedparser) |
|---|
| 2604 | source = xml.sax.xmlreader.InputSource() |
|---|
| 2605 | source.setByteStream(_StringIO(data)) |
|---|
| 2606 | if hasattr(saxparser, '_ns_stack'): |
|---|
| 2607 | # work around bug in built-in SAX parser (doesn't recognize xml: namespace) |
|---|
| 2608 | # PyXML doesn't have this problem, and it doesn't have _ns_stack either |
|---|
| 2609 | saxparser._ns_stack.append({'http://www.w3.org/XML/1998/namespace':'xml'}) |
|---|
| 2610 | try: |
|---|
| 2611 | saxparser.parse(source) |
|---|
| 2612 | except Exception, e: |
|---|
| 2613 | if _debug: |
|---|
| 2614 | import traceback |
|---|
| 2615 | traceback.print_stack() |
|---|
| 2616 | traceback.print_exc() |
|---|
| 2617 | sys.stderr.write('xml parsing failed\n') |
|---|
| 2618 | result['bozo'] = 1 |
|---|
| 2619 | result['bozo_exception'] = feedparser.exc or e |
|---|
| 2620 | use_strict_parser = 0 |
|---|
| 2621 | if not use_strict_parser: |
|---|
| 2622 | feedparser = _LooseFeedParser(baseuri, baselang, known_encoding and 'utf-8' or '') |
|---|
| 2623 | feedparser.feed(data) |
|---|
| 2624 | result['feed'] = feedparser.feeddata |
|---|
| 2625 | result['entries'] = feedparser.entries |
|---|
| 2626 | result['version'] = result['version'] or feedparser.version |
|---|
| 2627 | result['namespaces'] = feedparser.namespacesInUse |
|---|
| 2628 | return result |
|---|
| 2629 | |
|---|
if __name__ == '__main__':
    # Command-line driver: each argument is passed to parse() and the
    # returned result is pretty-printed.
    urls = sys.argv[1:]
    if not urls:
        # no arguments given: show the module documentation and stop
        print(__doc__)
        sys.exit(0)
    # turn FeedParserDict into a regular dictionary so pprint formats it well
    zopeCompatibilityHack()
    from pprint import pprint
    for url in urls:
        print(url)
        print('')
        result = parse(url)
        pprint(result)
        print('')
|---|
| 2644 | |
|---|
| 2645 | #REVISION HISTORY |
|---|
| 2646 | #1.0 - 9/27/2002 - MAP - fixed namespace processing on prefixed RSS 2.0 elements, |
|---|
| 2647 | # added Simon Fell's test suite |
|---|
| 2648 | #1.1 - 9/29/2002 - MAP - fixed infinite loop on incomplete CDATA sections |
|---|
| 2649 | #2.0 - 10/19/2002 |
|---|
| 2650 | # JD - use inchannel to watch out for image and textinput elements which can |
|---|
| 2651 | # also contain title, link, and description elements |
|---|
| 2652 | # JD - check for isPermaLink='false' attribute on guid elements |
|---|
| 2653 | # JD - replaced openAnything with open_resource supporting ETag and |
|---|
| 2654 | # If-Modified-Since request headers |
|---|
| 2655 | # JD - parse now accepts etag, modified, agent, and referrer optional |
|---|
| 2656 | # arguments |
|---|
| 2657 | # JD - modified parse to return a dictionary instead of a tuple so that any |
|---|
| 2658 | # etag or modified information can be returned and cached by the caller |
|---|
| 2659 | #2.0.1 - 10/21/2002 - MAP - changed parse() so that if we don't get anything |
|---|
| 2660 | # because of etag/modified, return the old etag/modified to the caller to |
|---|
| 2661 | # indicate why nothing is being returned |
|---|
| 2662 | #2.0.2 - 10/21/2002 - JB - added the inchannel to the if statement, otherwise it's |
|---|
| 2663 | # useless. Fixes the problem JD was addressing by adding it. |
|---|
| 2664 | #2.1 - 11/14/2002 - MAP - added gzip support |
|---|
| 2665 | #2.2 - 1/27/2003 - MAP - added attribute support, admin:generatorAgent. |
|---|
| 2666 | # start_admingeneratoragent is an example of how to handle elements with |
|---|
| 2667 | # only attributes, no content. |
|---|
| 2668 | #2.3 - 6/11/2003 - MAP - added USER_AGENT for default (if caller doesn't specify); |
|---|
| 2669 | # also, make sure we send the User-Agent even if urllib2 isn't available. |
|---|
| 2670 | # Match any variation of backend.userland.com/rss namespace. |
|---|
| 2671 | #2.3.1 - 6/12/2003 - MAP - if item has both link and guid, return both as-is. |
|---|
| 2672 | #2.4 - 7/9/2003 - MAP - added preliminary Pie/Atom/Echo support based on Sam Ruby's |
|---|
| 2673 | # snapshot of July 1 <http://www.intertwingly.net/blog/1506.html>; changed |
|---|
| 2674 | # project name |
|---|
| 2675 | #2.5 - 7/25/2003 - MAP - changed to Python license (all contributors agree); |
|---|
| 2676 | # removed unnecessary urllib code -- urllib2 should always be available anyway; |
|---|
| 2677 | # return actual url, status, and full HTTP headers (as result['url'], |
|---|
| 2678 | # result['status'], and result['headers']) if parsing a remote feed over HTTP -- |
|---|
| 2679 | # this should pass all the HTTP tests at <http://diveintomark.org/tests/client/http/>; |
|---|
| 2680 | # added the latest namespace-of-the-week for RSS 2.0 |
|---|
| 2681 | #2.5.1 - 7/26/2003 - RMK - clear opener.addheaders so we only send our custom |
|---|
| 2682 | # User-Agent (otherwise urllib2 sends two, which confuses some servers) |
|---|
| 2683 | #2.5.2 - 7/28/2003 - MAP - entity-decode inline xml properly; added support for |
|---|
| 2684 | # inline <xhtml:body> and <xhtml:div> as used in some RSS 2.0 feeds |
|---|
| 2685 | #2.5.3 - 8/6/2003 - TvdV - patch to track whether we're inside an image or |
|---|
| 2686 | # textInput, and also to return the character encoding (if specified) |
|---|
| 2687 | #2.6 - 1/1/2004 - MAP - dc:author support (MarekK); fixed bug tracking |
|---|
| 2688 | # nested divs within content (JohnD); fixed missing sys import (JohanS); |
|---|
| 2689 | # fixed regular expression to capture XML character encoding (Andrei); |
|---|
| 2690 | # added support for Atom 0.3-style links; fixed bug with textInput tracking; |
|---|
| 2691 | # added support for cloud (MartijnP); added support for multiple |
|---|
| 2692 | # category/dc:subject (MartijnP); normalize content model: 'description' gets |
|---|
| 2693 | # description (which can come from description, summary, or full content if no |
|---|
| 2694 | # description), 'content' gets dict of base/language/type/value (which can come |
|---|
| 2695 | # from content:encoded, xhtml:body, content, or fullitem); |
|---|
| 2696 | # fixed bug matching arbitrary Userland namespaces; added xml:base and xml:lang |
|---|
| 2697 | # tracking; fixed bug tracking unknown tags; fixed bug tracking content when |
|---|
| 2698 | # <content> element is not in default namespace (like Pocketsoap feed); |
|---|
| 2699 | # resolve relative URLs in link, guid, docs, url, comments, wfw:comment, |
|---|
| 2700 | # wfw:commentRSS; resolve relative URLs within embedded HTML markup in |
|---|
| 2701 | # description, xhtml:body, content, content:encoded, title, subtitle, |
|---|
| 2702 | # summary, info, tagline, and copyright; added support for pingback and |
|---|
| 2703 | # trackback namespaces |
|---|
| 2704 | #2.7 - 1/5/2004 - MAP - really added support for trackback and pingback |
|---|
| 2705 | # namespaces, as opposed to 2.6 when I said I did but didn't really; |
|---|
| 2706 | # sanitize HTML markup within some elements; added mxTidy support (if |
|---|
| 2707 | # installed) to tidy HTML markup within some elements; fixed indentation |
|---|
| 2708 | # bug in _parse_date (FazalM); use socket.setdefaulttimeout if available |
|---|
| 2709 | # (FazalM); universal date parsing and normalization (FazalM): 'created', 'modified', |
|---|
| 2710 | # 'issued' are parsed into 9-tuple date format and stored in 'created_parsed', |
|---|
| 2711 | # 'modified_parsed', and 'issued_parsed'; 'date' is duplicated in 'modified' |
|---|
| 2712 | # and vice-versa; 'date_parsed' is duplicated in 'modified_parsed' and vice-versa |
|---|
| 2713 | #2.7.1 - 1/9/2004 - MAP - fixed bug handling " and '. fixed memory |
|---|
| 2714 | # leak not closing url opener (JohnD); added dc:publisher support (MarekK); |
|---|
| 2715 | # added admin:errorReportsTo support (MarekK); Python 2.1 dict support (MarekK) |
|---|
| 2716 | #2.7.4 - 1/14/2004 - MAP - added workaround for improperly formed <br/> tags in |
|---|
| 2717 | # encoded HTML (skadz); fixed unicode handling in normalize_attrs (ChrisL); |
|---|
| 2718 | # fixed relative URI processing for guid (skadz); added ICBM support; added |
|---|
| 2719 | # base64 support |
|---|
| 2720 | #2.7.5 - 1/15/2004 - MAP - added workaround for malformed DOCTYPE (seen on many |
|---|
| 2721 | # blogspot.com sites); added _debug variable |
|---|
| 2722 | #2.7.6 - 1/16/2004 - MAP - fixed bug with StringIO importing |
|---|
| 2723 | #3.0b3 - 1/23/2004 - MAP - parse entire feed with real XML parser (if available); |
|---|
| 2724 | # added several new supported namespaces; fixed bug tracking naked markup in |
|---|
| 2725 | # description; added support for enclosure; added support for source; re-added |
|---|
| 2726 | # support for cloud which got dropped somehow; added support for expirationDate |
|---|
| 2727 | #3.0b4 - 1/26/2004 - MAP - fixed xml:lang inheritance; fixed multiple bugs tracking |
|---|
| 2728 | # xml:base URI, one for documents that don't define one explicitly and one for |
|---|
| 2729 | # documents that define an outer and an inner xml:base that goes out of scope |
|---|
| 2730 | # before the end of the document |
|---|
| 2731 | #3.0b5 - 1/26/2004 - MAP - fixed bug parsing multiple links at feed level |
|---|
| 2732 | #3.0b6 - 1/27/2004 - MAP - added feed type and version detection, result['version'] |
|---|
| 2733 | # will be one of SUPPORTED_VERSIONS.keys() or empty string if unrecognized; |
|---|
| 2734 | # added support for creativeCommons:license and cc:license; added support for |
|---|
| 2735 | # full Atom content model in title, tagline, info, copyright, summary; fixed bug |
|---|
| 2736 | # with gzip encoding (not always telling server we support it when we do) |
|---|
| 2737 | #3.0b7 - 1/28/2004 - MAP - support Atom-style author element in author_detail |
|---|
| 2738 | # (dictionary of 'name', 'url', 'email'); map author to author_detail if author |
|---|
| 2739 | # contains name + email address |
|---|
| 2740 | #3.0b8 - 1/28/2004 - MAP - added support for contributor |
|---|
| 2741 | #3.0b9 - 1/29/2004 - MAP - fixed check for presence of dict function; added |
|---|
| 2742 | # support for summary |
|---|
| 2743 | #3.0b10 - 1/31/2004 - MAP - incorporated ISO-8601 date parsing routines from |
|---|
| 2744 | # xml.util.iso8601 |
|---|
| 2745 | #3.0b11 - 2/2/2004 - MAP - added 'rights' to list of elements that can contain |
|---|
| 2746 | # dangerous markup; fiddled with decodeEntities (not right); liberalized |
|---|
| 2747 | # date parsing even further |
|---|
| 2748 | #3.0b12 - 2/6/2004 - MAP - fiddled with decodeEntities (still not right); |
|---|
| 2749 | # added support for Atom 0.2 subtitle; added support for Atom content model |
|---|
| 2750 | # in copyright; better sanitizing of dangerous HTML elements with end tags |
|---|
| 2751 | # (script, frameset) |
|---|
| 2752 | #3.0b13 - 2/8/2004 - MAP - better handling of empty HTML tags (br, hr, img, |
|---|
| 2753 | # etc.) in embedded markup, in either HTML or XHTML form (<br>, <br/>, <br />) |
|---|
| 2754 | #3.0b14 - 2/8/2004 - MAP - fixed CDATA handling in non-wellformed feeds under |
|---|
| 2755 | # Python 2.1 |
|---|
| 2756 | #3.0b15 - 2/11/2004 - MAP - fixed bug resolving relative links in wfw:commentRSS; |
|---|
| 2757 | # fixed bug capturing author and contributor URL; fixed bug resolving relative |
|---|
| 2758 | # links in author and contributor URL; fixed bug resolving relative links in |
|---|
| 2759 | # generator URL; added support for recognizing RSS 1.0; passed Simon Fell's |
|---|
| 2760 | # namespace tests, and included them permanently in the test suite with his |
|---|
| 2761 | # permission; fixed namespace handling under Python 2.1 |
|---|
| 2762 | #3.0b16 - 2/12/2004 - MAP - fixed support for RSS 0.90 (broken in b15) |
|---|
| 2763 | #3.0b17 - 2/13/2004 - MAP - determine character encoding as per RFC 3023 |
|---|
| 2764 | #3.0b18 - 2/17/2004 - MAP - always map description to summary_detail (Andrei); |
|---|
| 2765 | # use libxml2 (if available) |
|---|
| 2766 | #3.0b19 - 3/15/2004 - MAP - fixed bug exploding author information when author |
|---|
| 2767 | # name was in parentheses; removed ultra-problematic mxTidy support; patch to |
|---|
| 2768 | # workaround crash in PyXML/expat when encountering invalid entities |
|---|
| 2769 | # (MarkMoraes); support for textinput/textInput |
|---|
| 2770 | #3.0b20 - 4/7/2004 - MAP - added CDF support |
|---|
| 2771 | #3.0b21 - 4/14/2004 - MAP - added Hot RSS support |
|---|
| 2772 | #3.0b22 - 4/19/2004 - MAP - changed 'channel' to 'feed', 'item' to 'entries' in |
|---|
| 2773 | # results dict; changed results dict to allow getting values with results.key |
|---|
| 2774 | # as well as results[key]; work around embedded illformed HTML with half |
|---|
| 2775 | # a DOCTYPE; work around malformed Content-Type header; if character encoding |
|---|
| 2776 | # is wrong, try several common ones before falling back to regexes (if this |
|---|
| 2777 | # works, bozo_exception is set to CharacterEncodingOverride); fixed character |
|---|
| 2778 | # encoding issues in BaseHTMLProcessor by tracking encoding and converting |
|---|
| 2779 | # from Unicode to raw strings before feeding data to sgmllib.SGMLParser; |
|---|
| 2780 | # convert each value in results to Unicode (if possible), even if using |
|---|
| 2781 | # regex-based parsing |
|---|
| 2782 | #3.0b23 - 4/21/2004 - MAP - fixed UnicodeDecodeError for feeds that contain |
|---|
| 2783 | # high-bit characters in attributes in embedded HTML in description (thanks |
|---|
| 2784 | # Thijs van de Vossen); moved guid, date, and date_parsed to mapped keys in |
|---|
| 2785 | # FeedParserDict; tweaked FeedParserDict.has_key to return True if asking |
|---|
| 2786 | # about a mapped key |
|---|
| 2787 | #3.0fc1 - 4/23/2004 - MAP - made results.entries[0].links[0] and |
|---|
| 2788 | # results.entries[0].enclosures[0] into FeedParserDict; fixed typo that could |
|---|
| 2789 | # cause the same encoding to be tried twice (even if it failed the first time); |
|---|
| 2790 | # fixed DOCTYPE stripping when DOCTYPE contained entity declarations; |
|---|
| 2791 | # better textinput and image tracking in illformed RSS 1.0 feeds |
|---|
| 2792 | #3.0fc2 - 5/10/2004 - MAP - added and passed Sam's amp tests; added and passed |
|---|
| 2793 | # my blink tag tests |
|---|
| 2794 | #3.0fc3 - 6/18/2004 - MAP - fixed bug in _changeEncodingDeclaration that |
|---|
| 2795 | # failed to parse utf-16 encoded feeds; made source into a FeedParserDict; |
|---|
| 2796 | # duplicate admin:generatorAgent/@rdf:resource in generator_detail.url; |
|---|
| 2797 | # added support for image; refactored parse() fallback logic to try other |
|---|
| 2798 | # encodings if SAX parsing fails (previously it would only try other encodings |
|---|
| 2799 | # if re-encoding failed); remove unichr madness in normalize_attrs now that |
|---|
| 2800 | # we're properly tracking encoding in and out of BaseHTMLProcessor; set |
|---|
| 2801 | # feed.language from root-level xml:lang; set entry.id from rdf:about; |
|---|
| 2802 | # send Accept header |
|---|
| 2803 | #3.0 - 6/21/2004 - MAP - don't try iso-8859-1 (can't distinguish between |
|---|
| 2804 | # iso-8859-1 and windows-1252 anyway, and most incorrectly marked feeds are |
|---|
| 2805 | # windows-1252); fixed regression that could cause the same encoding to be |
|---|
| 2806 | # tried twice (even if it failed the first time) |
|---|
| 2807 | #3.0.1 - 6/22/2004 - MAP - default to us-ascii for all text/* content types; |
|---|
| 2808 | # recover from malformed content-type header parameter with no equals sign |
|---|
| 2809 | # ('text/xml; charset:iso-8859-1') |
|---|
| 2810 | #3.1 - 6/28/2004 - MAP - added and passed tests for converting HTML entities |
|---|
| 2811 | # to Unicode equivalents in illformed feeds (aaronsw); added and |
|---|
| 2812 | # passed tests for converting character entities to Unicode equivalents |
|---|
| 2813 | # in illformed feeds (aaronsw); test for valid parsers when setting |
|---|
| 2814 | # XML_AVAILABLE; make version and encoding available when server returns |
|---|
| 2815 | # a 304; add handlers parameter to pass arbitrary urllib2 handlers (like |
|---|
| 2816 | # digest auth or proxy support); add code to parse username/password |
|---|
| 2817 | # out of url and send as basic authentication; expose downloading-related |
|---|
| 2818 | # exceptions in bozo_exception (aaronsw); added __contains__ method to |
|---|
| 2819 | # FeedParserDict (aaronsw); added publisher_detail (aaronsw) |
|---|
| 2820 | #3.2 - 7/3/2004 - MAP - use cjkcodecs and iconv_codec if available; always |
|---|
| 2821 | # convert feed to UTF-8 before passing to XML parser; completely revamped |
|---|
| 2822 | # logic for determining character encoding and attempting XML parsing |
|---|
| 2823 | # (much faster); increased default timeout to 20 seconds; test for presence |
|---|
| 2824 | # of Location header on redirects; added tests for many alternate character |
|---|
| 2825 | # encodings; support various EBCDIC encodings; support UTF-16BE and |
|---|
| 2826 | # UTF-16LE with or without a BOM; support UTF-8 with a BOM; support |
|---|
| 2827 | # UTF-32BE and UTF-32LE with or without a BOM; fixed crashing bug if no |
|---|
| 2828 | # XML parsers are available; added support for 'Content-encoding: deflate'; |
|---|
| 2829 | # send blank 'Accept-encoding: ' header if neither gzip nor zlib modules |
|---|
| 2830 | # are available |
|---|
| 2831 | #3.3 - 7/15/2004 - MAP - optimize EBCDIC to ASCII conversion; fix obscure |
|---|
| 2832 | # problem tracking xml:base and xml:lang if element declares it, child |
|---|
| 2833 | # doesn't, first grandchild redeclares it, and second grandchild doesn't; |
|---|
| 2834 | # refactored date parsing; defined public registerDateHandler so callers |
|---|
| 2835 | # can add support for additional date formats at runtime; added support |
|---|
| 2836 | # for OnBlog, Nate, MSSQL, Greek, and Hungarian dates (ytrewq1); added |
|---|
| 2837 | # zopeCompatibilityHack() which turns FeedParserDict into a regular |
|---|
| 2838 | # dictionary, required for Zope compatibility, and also makes command- |
|---|
| 2839 | # line debugging easier because pprint module formats real dictionaries |
|---|
| 2840 | # better than dictionary-like objects; added NonXMLContentType exception, |
|---|
| 2841 | # which is stored in bozo_exception when a feed is served with a non-XML |
|---|
| 2842 | # media type such as 'text/plain'; respect Content-Language as default |
|---|
| 2843 | # language if no xml:lang is present; cloud dict is now FeedParserDict; |
|---|
| 2844 | # generator dict is now FeedParserDict; better tracking of xml:lang, |
|---|
| 2845 | # including support for xml:lang='' to unset the current language; |
|---|
| 2846 | # recognize RSS 1.0 feeds even when RSS 1.0 namespace is not the default |
|---|
| 2847 | # namespace; don't overwrite final status on redirects (scenarios: |
|---|
| 2848 | # redirecting to a URL that returns 304, redirecting to a URL that |
|---|
| 2849 | # redirects to another URL with a different type of redirect); add |
|---|
| 2850 | # support for HTTP 303 redirects |
|---|
| 2851 | #4.0 - MAP - support for relative URIs in xml:base attribute; fixed |
|---|
| 2852 | # encoding issue with mxTidy (phopkins); preliminary support for RFC 3229; |
|---|
| 2853 | # support for Atom 1.0; support for iTunes extensions; new 'tags' for |
|---|
| 2854 | # categories/keywords/etc. as array of dict |
|---|
| 2855 | # {'term': term, 'scheme': scheme, 'label': label} to match Atom 1.0 |
|---|
| 2856 | # terminology; parse RFC 822-style dates with no time; lots of other |
|---|
| 2857 | # bug fixes |
|---|
| 2858 | #4.1 - MAP - removed socket timeout; added support for chardet library |
|---|