diff options
author | luxagraf@c63593aa-01b0-44d9-8516-4b9c7e931d7f <luxagraf@c63593aa-01b0-44d9-8516-4b9c7e931d7f> | 2010-07-04 00:42:55 +0000 |
---|---|---|
committer | luxagraf@c63593aa-01b0-44d9-8516-4b9c7e931d7f <luxagraf@c63593aa-01b0-44d9-8516-4b9c7e931d7f> | 2010-07-04 00:42:55 +0000 |
commit | 56d590eab871d5c229a718f14709f5704b131d28 (patch) | |
tree | bd53d4e0c4110a40492cbe69e2facfca3771eddf /lib/utilslib | |
parent | a4abbbc5589e948e469849c58b67d179212785d0 (diff) |
fuck up
Diffstat (limited to 'lib/utilslib')
-rw-r--r-- | lib/utilslib/APIClients.py | 104 | ||||
-rw-r--r-- | lib/utilslib/__init__.py | 0 | ||||
-rwxr-xr-x | lib/utilslib/markdown2.py | 1877 | ||||
-rw-r--r-- | lib/utilslib/pydelicious.py | 1045 | ||||
-rw-r--r-- | lib/utilslib/strutils.py | 50 |
5 files changed, 0 insertions, 3076 deletions
diff --git a/lib/utilslib/APIClients.py b/lib/utilslib/APIClients.py deleted file mode 100644 index 24ab97b..0000000 --- a/lib/utilslib/APIClients.py +++ /dev/null @@ -1,104 +0,0 @@ -# APIClients for grabbing data from popular web services -# By Scott Gilbertson -# Copyright is lame, take what you want, except for those portions noted - -# Dependencies: -import sys, urllib -import xml.etree.cElementTree as xml_parser - - -DEBUG = 0 - -""" -base class -- handles GoodReads.com, but works for any rss feed, just send an empty string for anything you don't need -""" -class APIClient: - def __init__(self, base_path, api_key): - self.api_key = api_key - self.base_path = base_path - - def __getattr__(self, method): - def method(_self=self, _method=method, **params): - url = "%s%s?%s&" % (self.base_path, self.api_key, urllib.urlencode(params)) - if DEBUG: print url - data = self.fetch(url) - return data - - return method - - def fetch(self, url): - u = urllib.FancyURLopener(None) - usock = u.open(url) - rawdata = usock.read() - if DEBUG: print rawdata - usock.close() - return xml_parser.fromstring(rawdata) - -""" - Extend APIClient to work with the ma.gnolia.com API - (http://wiki.ma.gnolia.com/Ma.gnolia_API) - Adds some error handling as well -""" -class MagnoliaError(Exception): - def __init__(self, code, message): - self.code = code - self.message = message - - def __str__(self): - return 'Magnolia Error %s: %s' % (self.code, self.message) - - -class MagnoliaClient(APIClient): - def __getattr__(self, method): - def method(_self=self, _method=method, **params): - url = "%s%s?%s&api_key=%s" % (self.base_path, _method, urllib.urlencode(params), self.api_key) - if DEBUG: print url - data = APIClient.fetch(self, url) - return data - return method - - -""" - Extend APIClient to work with the Flickr API - (http://www.flickr.com/services/api/) - Adds error handling as well -""" - -class FlickrError(Exception): - def __init__(self, code, message): - self.code = code - 
self.message = message - - def __str__(self): - return 'Flickr Error %s: %s' % (self.code, self.message) - -class FlickrClient(APIClient): - def __getattr__(self, method): - def method(_self=self, _method=method, **params): - _method = _method.replace("_", ".") - url = "%s?method=%s&%s&api_key=%s" % (self.base_path, _method, urllib.urlencode(params), self.api_key) - if DEBUG: print url - data = APIClient.fetch(self, url) - return data - return method - -class TumblrClient: - def __init__(self, base_path): - self.base_path = base_path - - def __getattr__(self, method): - def method(_self=self, _method=method, **params): - url = "%s" % (self.base_path) - if DEBUG: print url - data = self.fetch(url) - return data - - return method - - def fetch(self, url): - u = urllib.FancyURLopener(None) - usock = u.open(url) - rawdata = usock.read() - if DEBUG: print rawdata - usock.close() - return xml_parser.fromstring(rawdata) diff --git a/lib/utilslib/__init__.py b/lib/utilslib/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/lib/utilslib/__init__.py +++ /dev/null diff --git a/lib/utilslib/markdown2.py b/lib/utilslib/markdown2.py deleted file mode 100755 index d72f414..0000000 --- a/lib/utilslib/markdown2.py +++ /dev/null @@ -1,1877 +0,0 @@ -#!/usr/bin/env python -# Copyright (c) 2007-2008 ActiveState Corp. -# License: MIT (http://www.opensource.org/licenses/mit-license.php) - -r"""A fast and complete Python implementation of Markdown. - -[from http://daringfireball.net/projects/markdown/] -> Markdown is a text-to-HTML filter; it translates an easy-to-read / -> easy-to-write structured text format into HTML. Markdown's text -> format is most similar to that of plain text email, and supports -> features such as headers, *emphasis*, code blocks, blockquotes, and -> links. -> -> Markdown's syntax is designed not as a generic markup language, but -> specifically to serve as a front-end to (X)HTML. 
You can use span-level -> HTML tags anywhere in a Markdown document, and you can use block level -> HTML tags (like <div> and <table> as well). - -Module usage: - - >>> import markdown2 - >>> markdown2.markdown("*boo!*") # or use `html = markdown_path(PATH)` - u'<p><em>boo!</em></p>\n' - - >>> markdowner = Markdown() - >>> markdowner.convert("*boo!*") - u'<p><em>boo!</em></p>\n' - >>> markdowner.convert("**boom!**") - u'<p><strong>boom!</strong></p>\n' - -This implementation of Markdown implements the full "core" syntax plus a -number of extras (e.g., code syntax coloring, footnotes) as described on -<http://code.google.com/p/python-markdown2/wiki/Extras>. -""" - -cmdln_desc = """A fast and complete Python implementation of Markdown, a -text-to-HTML conversion tool for web writers. -""" - -# Dev Notes: -# - There is already a Python markdown processor -# (http://www.freewisdom.org/projects/python-markdown/). -# - Python's regex syntax doesn't have '\z', so I'm using '\Z'. I'm -# not yet sure if there implications with this. Compare 'pydoc sre' -# and 'perldoc perlre'. 
- -__version_info__ = (1, 0, 1, 13) # first three nums match Markdown.pl -__version__ = '1.0.1.13' -__author__ = "Trent Mick" - -import os -import sys -from pprint import pprint -import re -import logging -try: - from hashlib import md5 -except ImportError: - from md5 import md5 -import optparse -from random import random -import codecs - - - -#---- Python version compat - -if sys.version_info[:2] < (2,4): - from sets import Set as set - def reversed(sequence): - for i in sequence[::-1]: - yield i - def _unicode_decode(s, encoding, errors='xmlcharrefreplace'): - return unicode(s, encoding, errors) -else: - def _unicode_decode(s, encoding, errors='strict'): - return s.decode(encoding, errors) - - -#---- globals - -DEBUG = False -log = logging.getLogger("markdown") - -DEFAULT_TAB_WIDTH = 4 - -# Table of hash values for escaped characters: -def _escape_hash(s): - # Lame attempt to avoid possible collision with someone actually - # using the MD5 hexdigest of one of these chars in there text. - # Other ideas: random.random(), uuid.uuid() - #return md5(s).hexdigest() # Markdown.pl effectively does this. 
- return 'md5-'+md5(s).hexdigest() -g_escape_table = dict([(ch, _escape_hash(ch)) - for ch in '\\`*_{}[]()>#+-.!']) - - - -#---- exceptions - -class MarkdownError(Exception): - pass - - - -#---- public api - -def markdown_path(path, encoding="utf-8", - html4tags=False, tab_width=DEFAULT_TAB_WIDTH, - safe_mode=None, extras=None, link_patterns=None, - use_file_vars=False): - text = codecs.open(path, 'r', encoding).read() - return Markdown(html4tags=html4tags, tab_width=tab_width, - safe_mode=safe_mode, extras=extras, - link_patterns=link_patterns, - use_file_vars=use_file_vars).convert(text) - -def markdown(text, html4tags=False, tab_width=DEFAULT_TAB_WIDTH, - safe_mode=None, extras=None, link_patterns=None, - use_file_vars=False): - return Markdown(html4tags=html4tags, tab_width=tab_width, - safe_mode=safe_mode, extras=extras, - link_patterns=link_patterns, - use_file_vars=use_file_vars).convert(text) - -class Markdown(object): - # The dict of "extras" to enable in processing -- a mapping of - # extra name to argument for the extra. Most extras do not have an - # argument, in which case the value is None. - # - # This can be set via (a) subclassing and (b) the constructor - # "extras" argument. 
- extras = None - - urls = None - titles = None - html_blocks = None - html_spans = None - html_removed_text = "[HTML_REMOVED]" # for compat with markdown.py - - # Used to track when we're inside an ordered or unordered list - # (see _ProcessListItems() for details): - list_level = 0 - - _ws_only_line_re = re.compile(r"^[ \t]+$", re.M) - - def __init__(self, html4tags=False, tab_width=4, safe_mode=None, - extras=None, link_patterns=None, use_file_vars=False): - if html4tags: - self.empty_element_suffix = ">" - else: - self.empty_element_suffix = " />" - self.tab_width = tab_width - - # For compatibility with earlier markdown2.py and with - # markdown.py's safe_mode being a boolean, - # safe_mode == True -> "replace" - if safe_mode is True: - self.safe_mode = "replace" - else: - self.safe_mode = safe_mode - - if self.extras is None: - self.extras = {} - elif not isinstance(self.extras, dict): - self.extras = dict([(e, None) for e in self.extras]) - if extras: - if not isinstance(extras, dict): - extras = dict([(e, None) for e in extras]) - self.extras.update(extras) - assert isinstance(self.extras, dict) - self._instance_extras = self.extras.copy() - self.link_patterns = link_patterns - self.use_file_vars = use_file_vars - self._outdent_re = re.compile(r'^(\t|[ ]{1,%d})' % tab_width, re.M) - - def reset(self): - self.urls = {} - self.titles = {} - self.html_blocks = {} - self.html_spans = {} - self.list_level = 0 - self.extras = self._instance_extras.copy() - if "footnotes" in self.extras: - self.footnotes = {} - self.footnote_ids = [] - - def convert(self, text): - """Convert the given text.""" - # Main function. The order in which other subs are called here is - # essential. Link and image substitutions need to happen before - # _EscapeSpecialChars(), so that any *'s or _'s in the <a> - # and <img> tags get encoded. - - # Clear the global hashes. 
If we don't clear these, you get conflicts - # from other articles when generating a page which contains more than - # one article (e.g. an index page that shows the N most recent - # articles): - self.reset() - - if not isinstance(text, unicode): - #TODO: perhaps shouldn't presume UTF-8 for string input? - text = unicode(text, 'utf-8') - - if self.use_file_vars: - # Look for emacs-style file variable hints. - emacs_vars = self._get_emacs_vars(text) - if "markdown-extras" in emacs_vars: - splitter = re.compile("[ ,]+") - for e in splitter.split(emacs_vars["markdown-extras"]): - if '=' in e: - ename, earg = e.split('=', 1) - try: - earg = int(earg) - except ValueError: - pass - else: - ename, earg = e, None - self.extras[ename] = earg - - # Standardize line endings: - text = re.sub("\r\n|\r", "\n", text) - - # Make sure $text ends with a couple of newlines: - text += "\n\n" - - # Convert all tabs to spaces. - text = self._detab(text) - - # Strip any lines consisting only of spaces and tabs. - # This makes subsequent regexen easier to write, because we can - # match consecutive blank lines with /\n+/ instead of something - # contorted like /[ \t]*\n+/ . - text = self._ws_only_line_re.sub("", text) - - if self.safe_mode: - text = self._hash_html_spans(text) - - # Turn block-level HTML blocks into hash entries - text = self._hash_html_blocks(text, raw=True) - - # Strip link definitions, store in hashes. 
- if "footnotes" in self.extras: - # Must do footnotes first because an unlucky footnote defn - # looks like a link defn: - # [^4]: this "looks like a link defn" - text = self._strip_footnote_definitions(text) - text = self._strip_link_definitions(text) - - text = self._run_block_gamut(text) - - text = self._unescape_special_chars(text) - - if "footnotes" in self.extras: - text = self._add_footnotes(text) - - if self.safe_mode: - text = self._unhash_html_spans(text) - - text += "\n" - return text - - _emacs_oneliner_vars_pat = re.compile(r"-\*-\s*([^\r\n]*?)\s*-\*-", re.UNICODE) - # This regular expression is intended to match blocks like this: - # PREFIX Local Variables: SUFFIX - # PREFIX mode: Tcl SUFFIX - # PREFIX End: SUFFIX - # Some notes: - # - "[ \t]" is used instead of "\s" to specifically exclude newlines - # - "(\r\n|\n|\r)" is used instead of "$" because the sre engine does - # not like anything other than Unix-style line terminators. - _emacs_local_vars_pat = re.compile(r"""^ - (?P<prefix>(?:[^\r\n|\n|\r])*?) - [\ \t]*Local\ Variables:[\ \t]* - (?P<suffix>.*?)(?:\r\n|\n|\r) - (?P<content>.*?\1End:) - """, re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE) - - def _get_emacs_vars(self, text): - """Return a dictionary of emacs-style local variables. - - Parsing is done loosely according to this spec (and according to - some in-practice deviations from this): - http://www.gnu.org/software/emacs/manual/html_node/emacs/Specifying-File-Variables.html#Specifying-File-Variables - """ - emacs_vars = {} - SIZE = pow(2, 13) # 8kB - - # Search near the start for a '-*-'-style one-liner of variables. 
- head = text[:SIZE] - if "-*-" in head: - match = self._emacs_oneliner_vars_pat.search(head) - if match: - emacs_vars_str = match.group(1) - assert '\n' not in emacs_vars_str - emacs_var_strs = [s.strip() for s in emacs_vars_str.split(';') - if s.strip()] - if len(emacs_var_strs) == 1 and ':' not in emacs_var_strs[0]: - # While not in the spec, this form is allowed by emacs: - # -*- Tcl -*- - # where the implied "variable" is "mode". This form - # is only allowed if there are no other variables. - emacs_vars["mode"] = emacs_var_strs[0].strip() - else: - for emacs_var_str in emacs_var_strs: - try: - variable, value = emacs_var_str.strip().split(':', 1) - except ValueError: - log.debug("emacs variables error: malformed -*- " - "line: %r", emacs_var_str) - continue - # Lowercase the variable name because Emacs allows "Mode" - # or "mode" or "MoDe", etc. - emacs_vars[variable.lower()] = value.strip() - - tail = text[-SIZE:] - if "Local Variables" in tail: - match = self._emacs_local_vars_pat.search(tail) - if match: - prefix = match.group("prefix") - suffix = match.group("suffix") - lines = match.group("content").splitlines(0) - #print "prefix=%r, suffix=%r, content=%r, lines: %s"\ - # % (prefix, suffix, match.group("content"), lines) - - # Validate the Local Variables block: proper prefix and suffix - # usage. - for i, line in enumerate(lines): - if not line.startswith(prefix): - log.debug("emacs variables error: line '%s' " - "does not use proper prefix '%s'" - % (line, prefix)) - return {} - # Don't validate suffix on last line. Emacs doesn't care, - # neither should we. - if i != len(lines)-1 and not line.endswith(suffix): - log.debug("emacs variables error: line '%s' " - "does not use proper suffix '%s'" - % (line, suffix)) - return {} - - # Parse out one emacs var per line. 
- continued_for = None - for line in lines[:-1]: # no var on the last line ("PREFIX End:") - if prefix: line = line[len(prefix):] # strip prefix - if suffix: line = line[:-len(suffix)] # strip suffix - line = line.strip() - if continued_for: - variable = continued_for - if line.endswith('\\'): - line = line[:-1].rstrip() - else: - continued_for = None - emacs_vars[variable] += ' ' + line - else: - try: - variable, value = line.split(':', 1) - except ValueError: - log.debug("local variables error: missing colon " - "in local variables entry: '%s'" % line) - continue - # Do NOT lowercase the variable name, because Emacs only - # allows "mode" (and not "Mode", "MoDe", etc.) in this block. - value = value.strip() - if value.endswith('\\'): - value = value[:-1].rstrip() - continued_for = variable - else: - continued_for = None - emacs_vars[variable] = value - - # Unquote values. - for var, val in emacs_vars.items(): - if len(val) > 1 and (val.startswith('"') and val.endswith('"') - or val.startswith('"') and val.endswith('"')): - emacs_vars[var] = val[1:-1] - - return emacs_vars - - # Cribbed from a post by Bart Lateur: - # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154> - _detab_re = re.compile(r'(.*?)\t', re.M) - def _detab_sub(self, match): - g1 = match.group(1) - return g1 + (' ' * (self.tab_width - len(g1) % self.tab_width)) - def _detab(self, text): - r"""Remove (leading?) tabs from a file. - - >>> m = Markdown() - >>> m._detab("\tfoo") - ' foo' - >>> m._detab(" \tfoo") - ' foo' - >>> m._detab("\t foo") - ' foo' - >>> m._detab(" foo") - ' foo' - >>> m._detab(" foo\n\tbar\tblam") - ' foo\n bar blam' - """ - if '\t' not in text: - return text - return self._detab_re.subn(self._detab_sub, text)[0] - - _block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del' - _strict_tag_block_re = re.compile(r""" - ( # save in \1 - ^ # start of line (with re.M) - <(%s) # start tag = \2 - \b # word break - (.*\n)*? 
# any number of lines, minimally matching - </\2> # the matching end tag - [ \t]* # trailing spaces/tabs - (?=\n+|\Z) # followed by a newline or end of document - ) - """ % _block_tags_a, - re.X | re.M) - - _block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math' - _liberal_tag_block_re = re.compile(r""" - ( # save in \1 - ^ # start of line (with re.M) - <(%s) # start tag = \2 - \b # word break - (.*\n)*? # any number of lines, minimally matching - .*</\2> # the matching end tag - [ \t]* # trailing spaces/tabs - (?=\n+|\Z) # followed by a newline or end of document - ) - """ % _block_tags_b, - re.X | re.M) - - def _hash_html_block_sub(self, match, raw=False): - html = match.group(1) - if raw and self.safe_mode: - html = self._sanitize_html(html) - key = _hash_text(html) - self.html_blocks[key] = html - return "\n\n" + key + "\n\n" - - def _hash_html_blocks(self, text, raw=False): - """Hashify HTML blocks - - We only want to do this for block-level HTML tags, such as headers, - lists, and tables. That's because we still want to wrap <p>s around - "paragraphs" that are wrapped in non-block-level tags, such as anchors, - phrase emphasis, and spans. The list of tags we're looking for is - hard-coded. - - @param raw {boolean} indicates if these are raw HTML blocks in - the original source. It makes a difference in "safe" mode. - """ - if '<' not in text: - return text - - # Pass `raw` value into our calls to self._hash_html_block_sub. - hash_html_block_sub = _curry(self._hash_html_block_sub, raw=raw) - - # First, look for nested blocks, e.g.: - # <div> - # <div> - # tags for inner block must be indented. - # </div> - # </div> - # - # The outermost tags must start at the left margin for this to match, and - # the inner nested divs must be indented. - # We need to do this before the next, more liberal match, because the next - # match will start at the first `<div>` and stop at the first `</div>`. 
- text = self._strict_tag_block_re.sub(hash_html_block_sub, text) - - # Now match more liberally, simply from `\n<tag>` to `</tag>\n` - text = self._liberal_tag_block_re.sub(hash_html_block_sub, text) - - # Special case just for <hr />. It was easier to make a special - # case than to make the other regex more complicated. - if "<hr" in text: - _hr_tag_re = _hr_tag_re_from_tab_width(self.tab_width) - text = _hr_tag_re.sub(hash_html_block_sub, text) - - # Special case for standalone HTML comments: - if "<!--" in text: - start = 0 - while True: - # Delimiters for next comment block. - try: - start_idx = text.index("<!--", start) - except ValueError, ex: - break - try: - end_idx = text.index("-->", start_idx) + 3 - except ValueError, ex: - break - - # Start position for next comment block search. - start = end_idx - - # Validate whitespace before comment. - if start_idx: - # - Up to `tab_width - 1` spaces before start_idx. - for i in range(self.tab_width - 1): - if text[start_idx - 1] != ' ': - break - start_idx -= 1 - if start_idx == 0: - break - # - Must be preceded by 2 newlines or hit the start of - # the document. - if start_idx == 0: - pass - elif start_idx == 1 and text[0] == '\n': - start_idx = 0 # to match minute detail of Markdown.pl regex - elif text[start_idx-2:start_idx] == '\n\n': - pass - else: - break - - # Validate whitespace after comment. - # - Any number of spaces and tabs. - while end_idx < len(text): - if text[end_idx] not in ' \t': - break - end_idx += 1 - # - Must be following by 2 newlines or hit end of text. - if text[end_idx:end_idx+2] not in ('', '\n', '\n\n'): - continue - - # Escape and hash (must match `_hash_html_block_sub`). 
- html = text[start_idx:end_idx] - if raw and self.safe_mode: - html = self._sanitize_html(html) - key = _hash_text(html) - self.html_blocks[key] = html - text = text[:start_idx] + "\n\n" + key + "\n\n" + text[end_idx:] - - if "xml" in self.extras: - # Treat XML processing instructions and namespaced one-liner - # tags as if they were block HTML tags. E.g., if standalone - # (i.e. are their own paragraph), the following do not get - # wrapped in a <p> tag: - # <?foo bar?> - # - # <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="chapter_1.md"/> - _xml_oneliner_re = _xml_oneliner_re_from_tab_width(self.tab_width) - text = _xml_oneliner_re.sub(hash_html_block_sub, text) - - return text - - def _strip_link_definitions(self, text): - # Strips link definitions from text, stores the URLs and titles in - # hash references. - less_than_tab = self.tab_width - 1 - - # Link defs are in the form: - # [id]: url "optional title" - _link_def_re = re.compile(r""" - ^[ ]{0,%d}\[(.+)\]: # id = \1 - [ \t]* - \n? # maybe *one* newline - [ \t]* - <?(.+?)>? # url = \2 - [ \t]* - (?: - \n? # maybe one newline - [ \t]* - (?<=\s) # lookbehind for whitespace - ['"(] - ([^\n]*) # title = \3 - ['")] - [ \t]* - )? # title is optional - (?:\n+|\Z) - """ % less_than_tab, re.X | re.M | re.U) - return _link_def_re.sub(self._extract_link_def_sub, text) - - def _extract_link_def_sub(self, match): - id, url, title = match.groups() - key = id.lower() # Link IDs are case-insensitive - self.urls[key] = self._encode_amps_and_angles(url) - if title: - self.titles[key] = title.replace('"', '"') - return "" - - def _extract_footnote_def_sub(self, match): - id, text = match.groups() - text = _dedent(text, skip_first_line=not text.startswith('\n')).strip() - normed_id = re.sub(r'\W', '-', id) - # Ensure footnote text ends with a couple newlines (for some - # block gamut matches). 
- self.footnotes[normed_id] = text + "\n\n" - return "" - - def _strip_footnote_definitions(self, text): - """A footnote definition looks like this: - - [^note-id]: Text of the note. - - May include one or more indented paragraphs. - - Where, - - The 'note-id' can be pretty much anything, though typically it - is the number of the footnote. - - The first paragraph may start on the next line, like so: - - [^note-id]: - Text of the note. - """ - less_than_tab = self.tab_width - 1 - footnote_def_re = re.compile(r''' - ^[ ]{0,%d}\[\^(.+)\]: # id = \1 - [ \t]* - ( # footnote text = \2 - # First line need not start with the spaces. - (?:\s*.*\n+) - (?: - (?:[ ]{%d} | \t) # Subsequent lines must be indented. - .*\n+ - )* - ) - # Lookahead for non-space at line-start, or end of doc. - (?:(?=^[ ]{0,%d}\S)|\Z) - ''' % (less_than_tab, self.tab_width, self.tab_width), - re.X | re.M) - return footnote_def_re.sub(self._extract_footnote_def_sub, text) - - - _hr_res = [ - re.compile(r"^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$", re.M), - re.compile(r"^[ ]{0,2}([ ]?\-[ ]?){3,}[ \t]*$", re.M), - re.compile(r"^[ ]{0,2}([ ]?\_[ ]?){3,}[ \t]*$", re.M), - ] - - def _run_block_gamut(self, text): - # These are all the transformations that form block-level - # tags like paragraphs, headers, and list items. - - text = self._do_headers(text) - - # Do Horizontal Rules: - hr = "\n<hr"+self.empty_element_suffix+"\n" - for hr_re in self._hr_res: - text = hr_re.sub(hr, text) - - text = self._do_lists(text) - - if "pyshell" in self.extras: - text = self._prepare_pyshell_blocks(text) - - text = self._do_code_blocks(text) - - text = self._do_block_quotes(text) - - # We already ran _HashHTMLBlocks() before, in Markdown(), but that - # was to escape raw HTML in the original Markdown source. This time, - # we're escaping the markup we've just created, so that we don't wrap - # <p> tags around block-level tags. 
- text = self._hash_html_blocks(text) - - text = self._form_paragraphs(text) - - return text - - def _pyshell_block_sub(self, match): - lines = match.group(0).splitlines(0) - _dedentlines(lines) - indent = ' ' * self.tab_width - s = ('\n' # separate from possible cuddled paragraph - + indent + ('\n'+indent).join(lines) - + '\n\n') - return s - - def _prepare_pyshell_blocks(self, text): - """Ensure that Python interactive shell sessions are put in - code blocks -- even if not properly indented. - """ - if ">>>" not in text: - return text - - less_than_tab = self.tab_width - 1 - _pyshell_block_re = re.compile(r""" - ^([ ]{0,%d})>>>[ ].*\n # first line - ^(\1.*\S+.*\n)* # any number of subsequent lines - ^\n # ends with a blank line - """ % less_than_tab, re.M | re.X) - - return _pyshell_block_re.sub(self._pyshell_block_sub, text) - - def _run_span_gamut(self, text): - # These are all the transformations that occur *within* block-level - # tags like paragraphs, headers, and list items. - - text = self._do_code_spans(text) - - text = self._escape_special_chars(text) - - # Process anchor and image tags. - text = self._do_links(text) - - # Make links out of things like `<http://example.com/>` - # Must come after _do_links(), because you can use < and > - # delimiters in inline links like [this](<url>). - text = self._do_auto_links(text) - - if "link-patterns" in self.extras: - text = self._do_link_patterns(text) - - text = self._encode_amps_and_angles(text) - - text = self._do_italics_and_bold(text) - - # Do hard breaks: - text = re.sub(r" {2,}\n", " <br%s\n" % self.empty_element_suffix, text) - - return text - - # "Sorta" because auto-links are identified as "tag" tokens. - _sorta_html_tokenize_re = re.compile(r""" - ( - # tag - </? 
- (?:\w+) # tag name - (?:\s+(?:[\w-]+:)?[\w-]+=(?:".*?"|'.*?'))* # attributes - \s*/?> - | - # auto-link (e.g., <http://www.activestate.com/>) - <\w+[^>]*> - | - <!--.*?--> # comment - | - <\?.*?\?> # processing instruction - ) - """, re.X) - - def _escape_special_chars(self, text): - # Python markdown note: the HTML tokenization here differs from - # that in Markdown.pl, hence the behaviour for subtle cases can - # differ (I believe the tokenizer here does a better job because - # it isn't susceptible to unmatched '<' and '>' in HTML tags). - # Note, however, that '>' is not allowed in an auto-link URL - # here. - escaped = [] - is_html_markup = False - for token in self._sorta_html_tokenize_re.split(text): - if is_html_markup: - # Within tags/HTML-comments/auto-links, encode * and _ - # so they don't conflict with their use in Markdown for - # italics and strong. We're replacing each such - # character with its corresponding MD5 checksum value; - # this is likely overkill, but it should prevent us from - # colliding with the escape values by accident. - escaped.append(token.replace('*', g_escape_table['*']) - .replace('_', g_escape_table['_'])) - else: - escaped.append(self._encode_backslash_escapes(token)) - is_html_markup = not is_html_markup - return ''.join(escaped) - - def _hash_html_spans(self, text): - # Used for safe_mode. 
- - def _is_auto_link(s): - if ':' in s and self._auto_link_re.match(s): - return True - elif '@' in s and self._auto_email_link_re.match(s): - return True - return False - - tokens = [] - is_html_markup = False - for token in self._sorta_html_tokenize_re.split(text): - if is_html_markup and not _is_auto_link(token): - sanitized = self._sanitize_html(token) - key = _hash_text(sanitized) - self.html_spans[key] = sanitized - tokens.append(key) - else: - tokens.append(token) - is_html_markup = not is_html_markup - return ''.join(tokens) - - def _unhash_html_spans(self, text): - for key, sanitized in self.html_spans.items(): - text = text.replace(key, sanitized) - return text - - def _sanitize_html(self, s): - if self.safe_mode == "replace": - return self.html_removed_text - elif self.safe_mode == "escape": - replacements = [ - ('&', '&'), - ('<', '<'), - ('>', '>'), - ] - for before, after in replacements: - s = s.replace(before, after) - return s - else: - raise MarkdownError("invalid value for 'safe_mode': %r (must be " - "'escape' or 'replace')" % self.safe_mode) - - _tail_of_inline_link_re = re.compile(r''' - # Match tail of: [text](/url/) or [text](/url/ "title") - \( # literal paren - [ \t]* - (?P<url> # \1 - <.*?> - | - .*? - ) - [ \t]* - ( # \2 - (['"]) # quote char = \3 - (?P<title>.*?) - \3 # matching quote - )? # title is optional - \) - ''', re.X | re.S) - _tail_of_reference_link_re = re.compile(r''' - # Match tail of: [text][id] - [ ]? # one optional space - (?:\n[ ]*)? # one optional newline followed by spaces - \[ - (?P<id>.*?) - \] - ''', re.X | re.S) - - def _do_links(self, text): - """Turn Markdown link shortcuts into XHTML <a> and <img> tags. - - This is a combination of Markdown.pl's _DoAnchors() and - _DoImages(). They are done together because that simplified the - approach. It was necessary to use a different approach than - Markdown.pl because of the lack of atomic matching support in - Python's regex engine used in $g_nested_brackets. 
- """ - MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24 - - # `anchor_allowed_pos` is used to support img links inside - # anchors, but not anchors inside anchors. An anchor's start - # pos must be `>= anchor_allowed_pos`. - anchor_allowed_pos = 0 - - curr_pos = 0 - while True: # Handle the next link. - # The next '[' is the start of: - # - an inline anchor: [text](url "title") - # - a reference anchor: [text][id] - # - an inline img: ![text](url "title") - # - a reference img: ![text][id] - # - a footnote ref: [^id] - # (Only if 'footnotes' extra enabled) - # - a footnote defn: [^id]: ... - # (Only if 'footnotes' extra enabled) These have already - # been stripped in _strip_footnote_definitions() so no - # need to watch for them. - # - a link definition: [id]: url "title" - # These have already been stripped in - # _strip_link_definitions() so no need to watch for them. - # - not markup: [...anything else... - try: - start_idx = text.index('[', curr_pos) - except ValueError: - break - text_length = len(text) - - # Find the matching closing ']'. - # Markdown.pl allows *matching* brackets in link text so we - # will here too. Markdown.pl *doesn't* currently allow - # matching brackets in img alt text -- we'll differ in that - # regard. - bracket_depth = 0 - for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, - text_length)): - ch = text[p] - if ch == ']': - bracket_depth -= 1 - if bracket_depth < 0: - break - elif ch == '[': - bracket_depth += 1 - else: - # Closing bracket not found within sentinel length. - # This isn't markup. - curr_pos = start_idx + 1 - continue - link_text = text[start_idx+1:p] - - # Possibly a footnote ref? 
- if "footnotes" in self.extras and link_text.startswith("^"): - normed_id = re.sub(r'\W', '-', link_text[1:]) - if normed_id in self.footnotes: - self.footnote_ids.append(normed_id) - result = '<sup class="footnote-ref" id="fnref-%s">' \ - '<a href="#fn-%s">%s</a></sup>' \ - % (normed_id, normed_id, len(self.footnote_ids)) - text = text[:start_idx] + result + text[p+1:] - else: - # This id isn't defined, leave the markup alone. - curr_pos = p+1 - continue - - # Now determine what this is by the remainder. - p += 1 - if p == text_length: - return text - - # Inline anchor or img? - if text[p] == '(': # attempt at perf improvement - match = self._tail_of_inline_link_re.match(text, p) - if match: - # Handle an inline anchor or img. - is_img = start_idx > 0 and text[start_idx-1] == "!" - if is_img: - start_idx -= 1 - - url, title = match.group("url"), match.group("title") - if url and url[0] == '<': - url = url[1:-1] # '<url>' -> 'url' - # We've got to encode these to avoid conflicting - # with italics/bold. - url = url.replace('*', g_escape_table['*']) \ - .replace('_', g_escape_table['_']) - if title: - title_str = ' title="%s"' \ - % title.replace('*', g_escape_table['*']) \ - .replace('_', g_escape_table['_']) \ - .replace('"', '"') - else: - title_str = '' - if is_img: - result = '<img src="%s" alt="%s"%s%s' \ - % (url, link_text.replace('"', '"'), - title_str, self.empty_element_suffix) - curr_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] - elif start_idx >= anchor_allowed_pos: - result_head = '<a href="%s"%s>' % (url, title_str) - result = '%s%s</a>' % (result_head, link_text) - # <img> allowed from curr_pos on, <a> from - # anchor_allowed_pos on. - curr_pos = start_idx + len(result_head) - anchor_allowed_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] - else: - # Anchor not allowed here. - curr_pos = start_idx + 1 - continue - - # Reference anchor or img? 
- else: - match = self._tail_of_reference_link_re.match(text, p) - if match: - # Handle a reference-style anchor or img. - is_img = start_idx > 0 and text[start_idx-1] == "!" - if is_img: - start_idx -= 1 - link_id = match.group("id").lower() - if not link_id: - link_id = link_text.lower() # for links like [this][] - if link_id in self.urls: - url = self.urls[link_id] - # We've got to encode these to avoid conflicting - # with italics/bold. - url = url.replace('*', g_escape_table['*']) \ - .replace('_', g_escape_table['_']) - title = self.titles.get(link_id) - if title: - title = title.replace('*', g_escape_table['*']) \ - .replace('_', g_escape_table['_']) - title_str = ' title="%s"' % title - else: - title_str = '' - if is_img: - result = '<img src="%s" alt="%s"%s%s' \ - % (url, link_text.replace('"', '"'), - title_str, self.empty_element_suffix) - curr_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] - elif start_idx >= anchor_allowed_pos: - result = '<a href="%s"%s>%s</a>' \ - % (url, title_str, link_text) - result_head = '<a href="%s"%s>' % (url, title_str) - result = '%s%s</a>' % (result_head, link_text) - # <img> allowed from curr_pos on, <a> from - # anchor_allowed_pos on. - curr_pos = start_idx + len(result_head) - anchor_allowed_pos = start_idx + len(result) - text = text[:start_idx] + result + text[match.end():] - else: - # Anchor not allowed here. - curr_pos = start_idx + 1 - else: - # This id isn't defined, leave the markup alone. - curr_pos = match.end() - continue - - # Otherwise, it isn't markup. 
- curr_pos = start_idx + 1 - - return text - - - _setext_h_re = re.compile(r'^(.+)[ \t]*\n(=+|-+)[ \t]*\n+', re.M) - def _setext_h_sub(self, match): - n = {"=": 1, "-": 2}[match.group(2)[0]] - demote_headers = self.extras.get("demote-headers") - if demote_headers: - n = min(n + demote_headers, 6) - return "<h%d>%s</h%d>\n\n" \ - % (n, self._run_span_gamut(match.group(1)), n) - - _atx_h_re = re.compile(r''' - ^(\#{1,6}) # \1 = string of #'s - [ \t]* - (.+?) # \2 = Header text - [ \t]* - (?<!\\) # ensure not an escaped trailing '#' - \#* # optional closing #'s (not counted) - \n+ - ''', re.X | re.M) - def _atx_h_sub(self, match): - n = len(match.group(1)) - demote_headers = self.extras.get("demote-headers") - if demote_headers: - n = min(n + demote_headers, 6) - return "<h%d>%s</h%d>\n\n" \ - % (n, self._run_span_gamut(match.group(2)), n) - - def _do_headers(self, text): - # Setext-style headers: - # Header 1 - # ======== - # - # Header 2 - # -------- - text = self._setext_h_re.sub(self._setext_h_sub, text) - - # atx-style headers: - # # Header 1 - # ## Header 2 - # ## Header 2 with closing hashes ## - # ... - # ###### Header 6 - text = self._atx_h_re.sub(self._atx_h_sub, text) - - return text - - - _marker_ul_chars = '*+-' - _marker_any = r'(?:[%s]|\d+\.)' % _marker_ul_chars - _marker_ul = '(?:[%s])' % _marker_ul_chars - _marker_ol = r'(?:\d+\.)' - - def _list_sub(self, match): - lst = match.group(1) - lst_type = match.group(3) in self._marker_ul_chars and "ul" or "ol" - result = self._process_list_items(lst) - if self.list_level: - return "<%s>\n%s</%s>\n" % (lst_type, result, lst_type) - else: - return "<%s>\n%s</%s>\n\n" % (lst_type, result, lst_type) - - def _do_lists(self, text): - # Form HTML ordered (numbered) and unordered (bulleted) lists. 
- - for marker_pat in (self._marker_ul, self._marker_ol): - # Re-usable pattern to match any entire ul or ol list: - less_than_tab = self.tab_width - 1 - whole_list = r''' - ( # \1 = whole list - ( # \2 - [ ]{0,%d} - (%s) # \3 = first list item marker - [ \t]+ - ) - (?:.+?) - ( # \4 - \Z - | - \n{2,} - (?=\S) - (?! # Negative lookahead for another list item marker - [ \t]* - %s[ \t]+ - ) - ) - ) - ''' % (less_than_tab, marker_pat, marker_pat) - - # We use a different prefix before nested lists than top-level lists. - # See extended comment in _process_list_items(). - # - # Note: There's a bit of duplication here. My original implementation - # created a scalar regex pattern as the conditional result of the test on - # $g_list_level, and then only ran the $text =~ s{...}{...}egmx - # substitution once, using the scalar as the pattern. This worked, - # everywhere except when running under MT on my hosting account at Pair - # Networks. There, this caused all rebuilds to be killed by the reaper (or - # perhaps they crashed, but that seems incredibly unlikely given that the - # same script on the same server ran fine *except* under MT. I've spent - # more time trying to figure out why this is happening than I'd like to - # admit. My only guess, backed up by the fact that this workaround works, - # is that Perl optimizes the substition when it can figure out that the - # pattern will never change, and when this optimization isn't on, we run - # afoul of the reaper. Thus, the slightly redundant code to that uses two - # static s/// patterns rather than one conditional pattern. - - if self.list_level: - sub_list_re = re.compile("^"+whole_list, re.X | re.M | re.S) - text = sub_list_re.sub(self._list_sub, text) - else: - list_re = re.compile(r"(?:(?<=\n\n)|\A\n?)"+whole_list, - re.X | re.M | re.S) - text = list_re.sub(self._list_sub, text) - - return text - - _list_item_re = re.compile(r''' - (\n)? 
# leading line = \1 - (^[ \t]*) # leading whitespace = \2 - (%s) [ \t]+ # list marker = \3 - ((?:.+?) # list item text = \4 - (\n{1,2})) # eols = \5 - (?= \n* (\Z | \2 (%s) [ \t]+)) - ''' % (_marker_any, _marker_any), - re.M | re.X | re.S) - - _last_li_endswith_two_eols = False - def _list_item_sub(self, match): - item = match.group(4) - leading_line = match.group(1) - leading_space = match.group(2) - if leading_line or "\n\n" in item or self._last_li_endswith_two_eols: - item = self._run_block_gamut(self._outdent(item)) - else: - # Recursion for sub-lists: - item = self._do_lists(self._outdent(item)) - if item.endswith('\n'): - item = item[:-1] - item = self._run_span_gamut(item) - self._last_li_endswith_two_eols = (len(match.group(5)) == 2) - return "<li>%s</li>\n" % item - - def _process_list_items(self, list_str): - # Process the contents of a single ordered or unordered list, - # splitting it into individual list items. - - # The $g_list_level global keeps track of when we're inside a list. - # Each time we enter a list, we increment it; when we leave a list, - # we decrement. If it's zero, we're not in a list anymore. - # - # We do this because when we're not inside a list, we want to treat - # something like this: - # - # I recommend upgrading to version - # 8. Oops, now this line is treated - # as a sub-list. - # - # As a single paragraph, despite the fact that the second line starts - # with a digit-period-space sequence. - # - # Whereas when we're inside a list (or sub-list), that line will be - # treated as the start of a sub-list. What a kludge, huh? This is - # an aspect of Markdown's syntax that's hard to parse perfectly - # without resorting to mind-reading. Perhaps the solution is to - # change the syntax rules such that sub-lists must start with a - # starting cardinal number; e.g. "1." or "a.". 
- self.list_level += 1 - self._last_li_endswith_two_eols = False - list_str = list_str.rstrip('\n') + '\n' - list_str = self._list_item_re.sub(self._list_item_sub, list_str) - self.list_level -= 1 - return list_str - - def _get_pygments_lexer(self, lexer_name): - try: - from pygments import lexers, util - except ImportError: - return None - try: - return lexers.get_lexer_by_name(lexer_name) - except util.ClassNotFound: - return None - - def _color_with_pygments(self, codeblock, lexer, **formatter_opts): - import pygments - import pygments.formatters - - class HtmlCodeFormatter(pygments.formatters.HtmlFormatter): - def _wrap_code(self, inner): - """A function for use in a Pygments Formatter which - wraps in <code> tags. - """ - yield 0, "<code>" - for tup in inner: - yield tup - yield 0, "</code>" - - def wrap(self, source, outfile): - """Return the source with a code, pre, and div.""" - return self._wrap_div(self._wrap_pre(self._wrap_code(source))) - - formatter = HtmlCodeFormatter(cssclass="codehilite", **formatter_opts) - return pygments.highlight(codeblock, lexer, formatter) - - def _code_block_sub(self, match): - codeblock = match.group(1) - codeblock = self._outdent(codeblock) - codeblock = self._detab(codeblock) - codeblock = codeblock.lstrip('\n') # trim leading newlines - codeblock = codeblock.rstrip() # trim trailing whitespace - - if "code-color" in self.extras and codeblock.startswith(":::"): - lexer_name, rest = codeblock.split('\n', 1) - lexer_name = lexer_name[3:].strip() - lexer = self._get_pygments_lexer(lexer_name) - codeblock = rest.lstrip("\n") # Remove lexer declaration line. 
- if lexer: - formatter_opts = self.extras['code-color'] or {} - colored = self._color_with_pygments(codeblock, lexer, - **formatter_opts) - return "\n\n%s\n\n" % colored - - codeblock = self._encode_code(codeblock) - return "\n\n<pre><code>%s\n</code></pre>\n\n" % codeblock - - def _do_code_blocks(self, text): - """Process Markdown `<pre><code>` blocks.""" - code_block_re = re.compile(r''' - (?:\n\n|\A) - ( # $1 = the code block -- one or more lines, starting with a space/tab - (?: - (?:[ ]{%d} | \t) # Lines must start with a tab or a tab-width of spaces - .*\n+ - )+ - ) - ((?=^[ ]{0,%d}\S)|\Z) # Lookahead for non-space at line-start, or end of doc - ''' % (self.tab_width, self.tab_width), - re.M | re.X) - - return code_block_re.sub(self._code_block_sub, text) - - - # Rules for a code span: - # - backslash escapes are not interpreted in a code span - # - to include one or or a run of more backticks the delimiters must - # be a longer run of backticks - # - cannot start or end a code span with a backtick; pad with a - # space and that space will be removed in the emitted HTML - # See `test/tm-cases/escapes.text` for a number of edge-case - # examples. - _code_span_re = re.compile(r''' - (?<!\\) - (`+) # \1 = Opening run of ` - (?!`) # See Note A test/tm-cases/escapes.text - (.+?) # \2 = The code block - (?<!`) - \1 # Matching closer - (?!`) - ''', re.X | re.S) - - def _code_span_sub(self, match): - c = match.group(2).strip(" \t") - c = self._encode_code(c) - return "<code>%s</code>" % c - - def _do_code_spans(self, text): - # * Backtick quotes are used for <code></code> spans. - # - # * You can use multiple backticks as the delimiters if you want to - # include literal backticks in the code span. So, this input: - # - # Just type ``foo `bar` baz`` at the prompt. - # - # Will translate to: - # - # <p>Just type <code>foo `bar` baz</code> at the prompt.</p> - # - # There's no arbitrary limit to the number of backticks you - # can use as delimters. 
If you need three consecutive backticks - # in your code, use four for delimiters, etc. - # - # * You can use spaces to get literal backticks at the edges: - # - # ... type `` `bar` `` ... - # - # Turns to: - # - # ... type <code>`bar`</code> ... - return self._code_span_re.sub(self._code_span_sub, text) - - def _encode_code(self, text): - """Encode/escape certain characters inside Markdown code runs. - The point is that in code, these characters are literals, - and lose their special Markdown meanings. - """ - replacements = [ - # Encode all ampersands; HTML entities are not - # entities within a Markdown code span. - ('&', '&'), - # Do the angle bracket song and dance: - ('<', '<'), - ('>', '>'), - # Now, escape characters that are magic in Markdown: - ('*', g_escape_table['*']), - ('_', g_escape_table['_']), - ('{', g_escape_table['{']), - ('}', g_escape_table['}']), - ('[', g_escape_table['[']), - (']', g_escape_table[']']), - ('\\', g_escape_table['\\']), - ] - for before, after in replacements: - text = text.replace(before, after) - return text - - _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]*)(?<=\S)\1", re.S) - _em_re = re.compile(r"(\*|_)(?=\S)(.+?)(?<=\S)\1", re.S) - _code_friendly_strong_re = re.compile(r"\*\*(?=\S)(.+?[*_]*)(?<=\S)\*\*", re.S) - _code_friendly_em_re = re.compile(r"\*(?=\S)(.+?)(?<=\S)\*", re.S) - def _do_italics_and_bold(self, text): - # <strong> must go first: - if "code-friendly" in self.extras: - text = self._code_friendly_strong_re.sub(r"<strong>\1</strong>", text) - text = self._code_friendly_em_re.sub(r"<em>\1</em>", text) - else: - text = self._strong_re.sub(r"<strong>\2</strong>", text) - text = self._em_re.sub(r"<em>\2</em>", text) - return text - - - _block_quote_re = re.compile(r''' - ( # Wrap whole match in \1 - ( - ^[ \t]*>[ \t]? 
# '>' at the start of a line - .+\n # rest of the first line - (.+\n)* # subsequent consecutive lines - \n* # blanks - )+ - ) - ''', re.M | re.X) - _bq_one_level_re = re.compile('^[ \t]*>[ \t]?', re.M); - - _html_pre_block_re = re.compile(r'(\s*<pre>.+?</pre>)', re.S) - def _dedent_two_spaces_sub(self, match): - return re.sub(r'(?m)^ ', '', match.group(1)) - - def _block_quote_sub(self, match): - bq = match.group(1) - bq = self._bq_one_level_re.sub('', bq) # trim one level of quoting - bq = self._ws_only_line_re.sub('', bq) # trim whitespace-only lines - bq = self._run_block_gamut(bq) # recurse - - bq = re.sub('(?m)^', ' ', bq) - # These leading spaces screw with <pre> content, so we need to fix that: - bq = self._html_pre_block_re.sub(self._dedent_two_spaces_sub, bq) - - return "<blockquote>\n%s\n</blockquote>\n\n" % bq - - def _do_block_quotes(self, text): - if '>' not in text: - return text - return self._block_quote_re.sub(self._block_quote_sub, text) - - def _form_paragraphs(self, text): - # Strip leading and trailing lines: - text = text.strip('\n') - - # Wrap <p> tags. - grafs = re.split(r"\n{2,}", text) - for i, graf in enumerate(grafs): - if graf in self.html_blocks: - # Unhashify HTML blocks - grafs[i] = self.html_blocks[graf] - else: - # Wrap <p> tags. 
- graf = self._run_span_gamut(graf) - grafs[i] = "<p>" + graf.lstrip(" \t") + "</p>" - - return "\n\n".join(grafs) - - def _add_footnotes(self, text): - if self.footnotes: - footer = [ - '<div class="footnotes">', - '<hr' + self.empty_element_suffix, - '<ol>', - ] - for i, id in enumerate(self.footnote_ids): - if i != 0: - footer.append('') - footer.append('<li id="fn-%s">' % id) - footer.append(self._run_block_gamut(self.footnotes[id])) - backlink = ('<a href="#fnref-%s" ' - 'class="footnoteBackLink" ' - 'title="Jump back to footnote %d in the text.">' - '↩</a>' % (id, i+1)) - if footer[-1].endswith("</p>"): - footer[-1] = footer[-1][:-len("</p>")] \ - + ' ' + backlink + "</p>" - else: - footer.append("\n<p>%s</p>" % backlink) - footer.append('</li>') - footer.append('</ol>') - footer.append('</div>') - return text + '\n\n' + '\n'.join(footer) - else: - return text - - # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin: - # http://bumppo.net/projects/amputator/ - _ampersand_re = re.compile(r'&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)') - _naked_lt_re = re.compile(r'<(?![a-z/?\$!])', re.I) - _naked_gt_re = re.compile(r'''(?<![a-z?!/'"-])>''', re.I) - - def _encode_amps_and_angles(self, text): - # Smart processing for ampersands and angle brackets that need - # to be encoded. - text = self._ampersand_re.sub('&', text) - - # Encode naked <'s - text = self._naked_lt_re.sub('<', text) - - # Encode naked >'s - # Note: Other markdown implementations (e.g. Markdown.pl, PHP - # Markdown) don't do this. - text = self._naked_gt_re.sub('>', text) - return text - - def _encode_backslash_escapes(self, text): - for ch, escape in g_escape_table.items(): - text = text.replace("\\"+ch, escape) - return text - - _auto_link_re = re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I) - def _auto_link_sub(self, match): - g1 = match.group(1) - return '<a href="%s">%s</a>' % (g1, g1) - - _auto_email_link_re = re.compile(r""" - < - (?:mailto:)? 
- ( - [-.\w]+ - \@ - [-\w]+(\.[-\w]+)*\.[a-z]+ - ) - > - """, re.I | re.X | re.U) - def _auto_email_link_sub(self, match): - return self._encode_email_address( - self._unescape_special_chars(match.group(1))) - - def _do_auto_links(self, text): - text = self._auto_link_re.sub(self._auto_link_sub, text) - text = self._auto_email_link_re.sub(self._auto_email_link_sub, text) - return text - - def _encode_email_address(self, addr): - # Input: an email address, e.g. "foo@example.com" - # - # Output: the email address as a mailto link, with each character - # of the address encoded as either a decimal or hex entity, in - # the hopes of foiling most address harvesting spam bots. E.g.: - # - # <a href="mailto:foo@e - # xample.com">foo - # @example.com</a> - # - # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk - # mailing list: <http://tinyurl.com/yu7ue> - chars = [_xml_encode_email_char_at_random(ch) - for ch in "mailto:" + addr] - # Strip the mailto: from the visible part. - addr = '<a href="%s">%s</a>' \ - % (''.join(chars), ''.join(chars[7:])) - return addr - - def _do_link_patterns(self, text): - """Caveat emptor: there isn't much guarding against link - patterns being formed inside other standard Markdown links, e.g. - inside a [link def][like this]. - - Dev Notes: *Could* consider prefixing regexes with a negative - lookbehind assertion to attempt to guard against this. 
- """ - link_from_hash = {} - for regex, repl in self.link_patterns: - replacements = [] - for match in regex.finditer(text): - if hasattr(repl, "__call__"): - href = repl(match) - else: - href = match.expand(repl) - replacements.append((match.span(), href)) - for (start, end), href in reversed(replacements): - escaped_href = ( - href.replace('"', '"') # b/c of attr quote - # To avoid markdown <em> and <strong>: - .replace('*', g_escape_table['*']) - .replace('_', g_escape_table['_'])) - link = '<a href="%s">%s</a>' % (escaped_href, text[start:end]) - hash = md5(link).hexdigest() - link_from_hash[hash] = link - text = text[:start] + hash + text[end:] - for hash, link in link_from_hash.items(): - text = text.replace(hash, link) - return text - - def _unescape_special_chars(self, text): - # Swap back in all the special characters we've hidden. - for ch, hash in g_escape_table.items(): - text = text.replace(hash, ch) - return text - - def _outdent(self, text): - # Remove one level of line-leading tabs or spaces - return self._outdent_re.sub('', text) - - -class MarkdownWithExtras(Markdown): - """A markdowner class that enables most extras: - - - footnotes - - code-color (only has effect if 'pygments' Python module on path) - - These are not included: - - pyshell (specific to Python-related documenting) - - code-friendly (because it *disables* part of the syntax) - - link-patterns (because you need to specify some actual - link-patterns anyway) - """ - extras = ["footnotes", "code-color"] - - -#---- internal support functions - -# From http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/52549 -def _curry(*args, **kwargs): - function, args = args[0], args[1:] - def result(*rest, **kwrest): - combined = kwargs.copy() - combined.update(kwrest) - return function(*args + rest, **combined) - return result - -# Recipe: regex_from_encoded_pattern (1.0) -def _regex_from_encoded_pattern(s): - """'foo' -> re.compile(re.escape('foo')) - '/foo/' -> re.compile('foo') - '/foo/i' 
-> re.compile('foo', re.I) - """ - if s.startswith('/') and s.rfind('/') != 0: - # Parse it: /PATTERN/FLAGS - idx = s.rfind('/') - pattern, flags_str = s[1:idx], s[idx+1:] - flag_from_char = { - "i": re.IGNORECASE, - "l": re.LOCALE, - "s": re.DOTALL, - "m": re.MULTILINE, - "u": re.UNICODE, - } - flags = 0 - for char in flags_str: - try: - flags |= flag_from_char[char] - except KeyError: - raise ValueError("unsupported regex flag: '%s' in '%s' " - "(must be one of '%s')" - % (char, s, ''.join(flag_from_char.keys()))) - return re.compile(s[1:idx], flags) - else: # not an encoded regex - return re.compile(re.escape(s)) - -# Recipe: dedent (0.1.2) -def _dedentlines(lines, tabsize=8, skip_first_line=False): - """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines - - "lines" is a list of lines to dedent. - "tabsize" is the tab width to use for indent width calculations. - "skip_first_line" is a boolean indicating if the first line should - be skipped for calculating the indent width and for dedenting. - This is sometimes useful for docstrings and similar. - - Same as dedent() except operates on a sequence of lines. Note: the - lines list is modified **in-place**. 
- """ - DEBUG = False - if DEBUG: - print "dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\ - % (tabsize, skip_first_line) - indents = [] - margin = None - for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue - indent = 0 - for ch in line: - if ch == ' ': - indent += 1 - elif ch == '\t': - indent += tabsize - (indent % tabsize) - elif ch in '\r\n': - continue # skip all-whitespace lines - else: - break - else: - continue # skip all-whitespace lines - if DEBUG: print "dedent: indent=%d: %r" % (indent, line) - if margin is None: - margin = indent - else: - margin = min(margin, indent) - if DEBUG: print "dedent: margin=%r" % margin - - if margin is not None and margin > 0: - for i, line in enumerate(lines): - if i == 0 and skip_first_line: continue - removed = 0 - for j, ch in enumerate(line): - if ch == ' ': - removed += 1 - elif ch == '\t': - removed += tabsize - (removed % tabsize) - elif ch in '\r\n': - if DEBUG: print "dedent: %r: EOL -> strip up to EOL" % line - lines[i] = lines[i][j:] - break - else: - raise ValueError("unexpected non-whitespace char %r in " - "line %r while removing %d-space margin" - % (ch, line, margin)) - if DEBUG: - print "dedent: %r: %r -> removed %d/%d"\ - % (line, ch, removed, margin) - if removed == margin: - lines[i] = lines[i][j+1:] - break - elif removed > margin: - lines[i] = ' '*(removed-margin) + lines[i][j+1:] - break - else: - if removed: - lines[i] = lines[i][removed:] - return lines - -def _dedent(text, tabsize=8, skip_first_line=False): - """_dedent(text, tabsize=8, skip_first_line=False) -> dedented text - - "text" is the text to dedent. - "tabsize" is the tab width to use for indent width calculations. - "skip_first_line" is a boolean indicating if the first line should - be skipped for calculating the indent width and for dedenting. - This is sometimes useful for docstrings and similar. 
- - textwrap.dedent(s), but don't expand tabs to spaces - """ - lines = text.splitlines(1) - _dedentlines(lines, tabsize=tabsize, skip_first_line=skip_first_line) - return ''.join(lines) - - -class _memoized(object): - """Decorator that caches a function's return value each time it is called. - If called later with the same arguments, the cached value is returned, and - not re-evaluated. - - http://wiki.python.org/moin/PythonDecoratorLibrary - """ - def __init__(self, func): - self.func = func - self.cache = {} - def __call__(self, *args): - try: - return self.cache[args] - except KeyError: - self.cache[args] = value = self.func(*args) - return value - except TypeError: - # uncachable -- for instance, passing a list as an argument. - # Better to not cache than to blow up entirely. - return self.func(*args) - def __repr__(self): - """Return the function's docstring.""" - return self.func.__doc__ - - -def _xml_oneliner_re_from_tab_width(tab_width): - """Standalone XML processing instruction regex.""" - return re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in $1 - [ ]{0,%d} - (?: - <\?\w+\b\s+.*?\?> # XML processing instruction - | - <\w+:\w+\b\s+.*?/> # namespaced single tag - ) - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - """ % (tab_width - 1), re.X) -_xml_oneliner_re_from_tab_width = _memoized(_xml_oneliner_re_from_tab_width) - -def _hr_tag_re_from_tab_width(tab_width): - return re.compile(r""" - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc - ) - ( # save in \1 - [ ]{0,%d} - <(hr) # start tag = \2 - \b # word break - ([^<>])*? 
# - /?> # the matching end tag - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document - ) - """ % (tab_width - 1), re.X) -_hr_tag_re_from_tab_width = _memoized(_hr_tag_re_from_tab_width) - - -def _xml_encode_email_char_at_random(ch): - r = random() - # Roughly 10% raw, 45% hex, 45% dec. - # '@' *must* be encoded. I [John Gruber] insist. - # Issue 26: '_' must be encoded. - if r > 0.9 and ch not in "@_": - return ch - elif r < 0.45: - # The [1:] is to drop leading '0': 0x63 -> x63 - return '&#%s;' % hex(ord(ch))[1:] - else: - return '&#%s;' % ord(ch) - -def _hash_text(text): - return 'md5:'+md5(text.encode("utf-8")).hexdigest() - - -#---- mainline - -class _NoReflowFormatter(optparse.IndentedHelpFormatter): - """An optparse formatter that does NOT reflow the description.""" - def format_description(self, description): - return description or "" - -def _test(): - import doctest - doctest.testmod() - -def main(argv=None): - if argv is None: - argv = sys.argv - if not logging.root.handlers: - logging.basicConfig() - - usage = "usage: %prog [PATHS...]" - version = "%prog "+__version__ - parser = optparse.OptionParser(prog="markdown2", usage=usage, - version=version, description=cmdln_desc, - formatter=_NoReflowFormatter()) - parser.add_option("-v", "--verbose", dest="log_level", - action="store_const", const=logging.DEBUG, - help="more verbose output") - parser.add_option("--encoding", - help="specify encoding of text content") - parser.add_option("--html4tags", action="store_true", default=False, - help="use HTML 4 style for empty element tags") - parser.add_option("-s", "--safe", metavar="MODE", dest="safe_mode", - help="sanitize literal HTML: 'escape' escapes " - "HTML meta chars, 'replace' replaces with an " - "[HTML_REMOVED] note") - parser.add_option("-x", "--extras", action="append", - help="Turn on specific extra features (not part of " - "the core Markdown spec). 
Supported values: " - "'code-friendly' disables _/__ for emphasis; " - "'code-color' adds code-block syntax coloring; " - "'link-patterns' adds auto-linking based on patterns; " - "'footnotes' adds the footnotes syntax;" - "'xml' passes one-liner processing instructions and namespaced XML tags;" - "'pyshell' to put unindented Python interactive shell sessions in a <code> block.") - parser.add_option("--use-file-vars", - help="Look for and use Emacs-style 'markdown-extras' " - "file var to turn on extras. See " - "<http://code.google.com/p/python-markdown2/wiki/Extras>.") - parser.add_option("--link-patterns-file", - help="path to a link pattern file") - parser.add_option("--self-test", action="store_true", - help="run internal self-tests (some doctests)") - parser.add_option("--compare", action="store_true", - help="run against Markdown.pl as well (for testing)") - parser.set_defaults(log_level=logging.INFO, compare=False, - encoding="utf-8", safe_mode=None, use_file_vars=False) - opts, paths = parser.parse_args() - log.setLevel(opts.log_level) - - if opts.self_test: - return _test() - - if opts.extras: - extras = {} - for s in opts.extras: - splitter = re.compile("[,;: ]+") - for e in splitter.split(s): - if '=' in e: - ename, earg = e.split('=', 1) - try: - earg = int(earg) - except ValueError: - pass - else: - ename, earg = e, None - extras[ename] = earg - else: - extras = None - - if opts.link_patterns_file: - link_patterns = [] - f = open(opts.link_patterns_file) - try: - for i, line in enumerate(f.readlines()): - if not line.strip(): continue - if line.lstrip().startswith("#"): continue - try: - pat, href = line.rstrip().rsplit(None, 1) - except ValueError: - raise MarkdownError("%s:%d: invalid link pattern line: %r" - % (opts.link_patterns_file, i+1, line)) - link_patterns.append( - (_regex_from_encoded_pattern(pat), href)) - finally: - f.close() - else: - link_patterns = None - - from os.path import join, dirname, abspath, exists - markdown_pl = 
join(dirname(dirname(abspath(__file__))), "test", - "Markdown.pl") - for path in paths: - if opts.compare: - print "==== Markdown.pl ====" - perl_cmd = 'perl %s "%s"' % (markdown_pl, path) - o = os.popen(perl_cmd) - perl_html = o.read() - o.close() - sys.stdout.write(perl_html) - print "==== markdown2.py ====" - html = markdown_path(path, encoding=opts.encoding, - html4tags=opts.html4tags, - safe_mode=opts.safe_mode, - extras=extras, link_patterns=link_patterns, - use_file_vars=opts.use_file_vars) - sys.stdout.write( - html.encode(sys.stdout.encoding or "utf-8", 'xmlcharrefreplace')) - if opts.compare: - test_dir = join(dirname(dirname(abspath(__file__))), "test") - if exists(join(test_dir, "test_markdown2.py")): - sys.path.insert(0, test_dir) - from test_markdown2 import norm_html_from_html - norm_html = norm_html_from_html(html) - norm_perl_html = norm_html_from_html(perl_html) - else: - norm_html = html - norm_perl_html = perl_html - print "==== match? %r ====" % (norm_perl_html == norm_html) - - -if __name__ == "__main__": - sys.exit( main(sys.argv) ) - diff --git a/lib/utilslib/pydelicious.py b/lib/utilslib/pydelicious.py deleted file mode 100644 index 8e45843..0000000 --- a/lib/utilslib/pydelicious.py +++ /dev/null @@ -1,1045 +0,0 @@ -"""Library to access del.icio.us data via Python. - -An introduction to the project is given in the README. -pydelicious is released under the BSD license. See license.txt for details -and the copyright holders. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR -ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON -ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -TODO: - - distribute license, readme docs via setup.py? - - automatic release build? -""" -import sys -import os -import time -import datetime -import locale -import httplib -import urllib2 -from urllib import urlencode, quote_plus -from StringIO import StringIO -from pprint import pformat - -v = sys.version_info -if v[0] >= 2 and v[1] >= 5: - from hashlib import md5 -else: - from md5 import md5 - -try: - from elementtree.ElementTree import parse as parse_xml -except ImportError: - # Python 2.5 and higher - from xml.etree.ElementTree import parse as parse_xml - -try: - import feedparser -except ImportError: - print >>sys.stderr, \ - "Feedparser not available, no RSS parsing." - feedparser = None - - -### Static config - -__version__ = '0.5.3' -__author__ = 'Frank Timmermann <regenkind_at_gmx_dot_de>' - # GP: does not respond to emails -__contributors__ = [ - 'Greg Pinero', - 'Berend van Berkum <berend+pydelicious@dotmpe.com>'] -__url__ = 'http://code.google.com/p/pydelicious/' -# Old URL: 'http://deliciouspython.python-hosting.com/' -__author_email__ = "" -__docformat__ = "restructuredtext en" -__description__ = "pydelicious.py allows you to access the web service of " \ - "del.icio.us via it's API through Python." -__long_description__ = "The goal is to design an easy to use and fully " \ - "functional Python interface to del.icio.us." 
# Module-level configuration for the del.icio.us client: API endpoints,
# request throttling/timeout values, text encoding, and the optional
# debugging and proxy settings read from the process environment.

DLCS_OK_MESSAGES = ('done', 'ok')
"Known text values of positive del.icio.us <result/> answers"
DLCS_WAIT_TIME = 4
"Time to wait between API requests"
DLCS_REQUEST_TIMEOUT = 444
"Seconds before socket triggers timeout"
#DLCS_API_REALM = 'del.icio.us API'
DLCS_API_HOST = 'api.del.icio.us'
DLCS_API_PATH = 'v1'
# Base URL of the API, assembled from the host and path above.
DLCS_API = "https://%s/%s" % (DLCS_API_HOST, DLCS_API_PATH)
DLCS_RSS = 'http://del.icio.us/rss/'
DLCS_FEEDS = 'http://feeds.delicious.com/v2/'

PREFERRED_ENCODING = locale.getpreferredencoding()
# XXX: might need to check sys.platform/encoding combinations here, ie
#if sys.platform == 'darwin' || PREFERRED_ENCODING == 'macroman:
#   PREFERRED_ENCODING = 'utf-8'
# Fall back to Latin-1 when the locale reports no preferred encoding.
if not PREFERRED_ENCODING:
    PREFERRED_ENCODING = 'iso-8859-1'

ISO_8601_DATETIME = '%Y-%m-%dT%H:%M:%SZ'

USER_AGENT = 'pydelicious/%s %s' % (__version__, __url__)

# Debug level; overridden by the DLCS_DEBUG environment variable, which
# must hold an integer (int() raises ValueError otherwise).
DEBUG = 0
if 'DLCS_DEBUG' in os.environ:
    DEBUG = int(os.environ['DLCS_DEBUG'])
    if DEBUG:
        print >>sys.stderr, \
            "Set DEBUG to %i from DLCS_DEBUG env." % DEBUG

# Optional proxy taken verbatim from the HTTP_PROXY environment variable.
HTTP_PROXY = None
if 'HTTP_PROXY' in os.environ:
    HTTP_PROXY = os.environ['HTTP_PROXY']
    if DEBUG:
        # HTTP_PROXY is a string, so it must be formatted with %s; the
        # integer conversion %i raises TypeError for a str argument.
        print >>sys.stderr, \
            "Set HTTP_PROXY to %s from env." % HTTP_PROXY

### Timeoutsocket hack taken from FeedParser.py

# timeoutsocket allows feedparser to time out rather than hang forever on ultra-
# slow servers. Python 2.3 now has this functionality available in the standard
# socket library, so under 2.3 you don't need to install anything. But you
# probably should anyway, because the socket module is buggy and timeoutsocket
# is better.
-try: - import timeoutsocket # http://www.timo-tasi.org/python/timeoutsocket.py - timeoutsocket.setDefaultSocketTimeout(DLCS_REQUEST_TIMEOUT) -except ImportError: - import socket - if hasattr(socket, 'setdefaulttimeout'): - socket.setdefaulttimeout(DLCS_REQUEST_TIMEOUT) -if DEBUG: print >>sys.stderr, \ - "Set socket timeout to %s seconds" % DLCS_REQUEST_TIMEOUT - - -### Utility classes - -class _Waiter: - """Waiter makes sure a certain amount of time passes between - successive calls of `Waiter()`. - - Some attributes: - :last: time of last call - :wait: the minimum time needed between calls - :waited: the number of calls throttled - - pydelicious.Waiter is an instance created when the module is loaded. - """ - def __init__(self, wait): - self.wait = wait - self.waited = 0 - self.lastcall = 0; - - def __call__(self): - tt = time.time() - wait = self.wait - - timeago = tt - self.lastcall - - if timeago < wait: - wait = wait - timeago - if DEBUG>0: print >>sys.stderr, "Waiting %s seconds." % wait - time.sleep(wait) - self.waited += 1 - self.lastcall = tt + wait - else: - self.lastcall = tt - -Waiter = _Waiter(DLCS_WAIT_TIME) - - -class PyDeliciousException(Exception): - """Standard pydelicious error""" -class PyDeliciousThrottled(Exception): pass -class PyDeliciousUnauthorized(Exception): pass - -class DeliciousError(Exception): - """Raised when the server responds with a negative answer""" - - @staticmethod - def raiseFor(error_string, path, **params): - if error_string == 'item already exists': - raise DeliciousItemExistsError, params['url'] - else: - raise DeliciousError, "%s, while calling <%s?%s>" % (error_string, - path, urlencode(params)) - -class DeliciousItemExistsError(DeliciousError): - """Raised then adding an already existing post.""" - - -class HTTPErrorHandler(urllib2.HTTPDefaultErrorHandler): - - def http_error_401(self, req, fp, code, msg, headers): - raise PyDeliciousUnauthorized, "Check credentials." 
- - def http_error_503(self, req, fp, code, msg, headers): - # Retry-After? - errmsg = "Try again later." - if 'Retry-After' in headers: - errmsg = "You may try again after %s" % headers['Retry-After'] - raise PyDeliciousThrottled, errmsg - - -### Utility functions - -def dict0(d): - "Removes empty string values from dictionary" - return dict([(k,v) for k,v in d.items() - if v=='' and isinstance(v, basestring)]) - - -def delicious_datetime(str): - """Parse a ISO 8601 formatted string to a Python datetime ... - """ - return datetime.datetime(*time.strptime(str, ISO_8601_DATETIME)[0:6]) - - -def http_request(url, user_agent=USER_AGENT, retry=4, opener=None): - """Retrieve the contents referenced by the URL using urllib2. - - Retries up to four times (default) on exceptions. - """ - request = urllib2.Request(url, headers={'User-Agent':user_agent}) - - if not opener: - opener = urllib2.build_opener() - - # Remember last error - e = None - - # Repeat request on time-out errors - tries = retry; - while tries: - try: - return opener.open(request) - - except urllib2.HTTPError, e: - # reraise unexpected protocol errors as PyDeliciousException - raise PyDeliciousException, "%s" % e - - except urllib2.URLError, e: - # xxx: Ugly check for time-out errors - #if len(e)>0 and 'timed out' in arg[0]: - print >> sys.stderr, "%s, %s tries left." % (e, tries) - Waiter() - tries = tries - 1 - #else: - # tries = None - - # Give up - raise PyDeliciousException, \ - "Unable to retrieve data at '%s', %s" % (url, e) - - -def build_api_opener(host, user, passwd, extra_handlers=() ): - """ - Build a urllib2 style opener with HTTP Basic authorization for one host - and additional error handling. If HTTP_PROXY is set a proxyhandler is also - added. 
- """ - - global DEBUG - - if DEBUG: httplib.HTTPConnection.debuglevel = 1 - - password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() - password_manager.add_password(None, host, user, passwd) - auth_handler = urllib2.HTTPBasicAuthHandler(password_manager) - - extra_handlers += ( HTTPErrorHandler(), ) - if HTTP_PROXY: - extra_handlers += ( urllib2.ProxyHandler( {'http': HTTP_PROXY} ), ) - - return urllib2.build_opener(auth_handler, *extra_handlers) - - -def dlcs_api_opener(user, passwd): - "Build an opener for DLCS_API_HOST, see build_api_opener()" - - return build_api_opener(DLCS_API_HOST, user, passwd) - - -def dlcs_api_request(path, params='', user='', passwd='', throttle=True, - opener=None): - """Retrieve/query a path within the del.icio.us API. - - This implements a minimum interval between calls to avoid - throttling. [#]_ Use param 'throttle' to turn this behaviour off. - - .. [#] http://del.icio.us/help/api/ - """ - if throttle: - Waiter() - - if params: - url = "%s/%s?%s" % (DLCS_API, path, urlencode(params)) - else: - url = "%s/%s" % (DLCS_API, path) - - if DEBUG: print >>sys.stderr, \ - "dlcs_api_request: %s" % url - - if not opener: - opener = dlcs_api_opener(user, passwd) - - fl = http_request(url, opener=opener) - - if DEBUG>2: print >>sys.stderr, \ - pformat(fl.info().headers) - - return fl - - -def dlcs_encode_params(params, usercodec=PREFERRED_ENCODING): - """Turn all param values (int, list, bool) into utf8 encoded strings. 
- """ - - if params: - for key in params.keys(): - if isinstance(params[key], bool): - if params[key]: - params[key] = 'yes' - else: - params[key] = 'no' - - elif isinstance(params[key], int): - params[key] = str(params[key]) - - elif not params[key]: - # strip/ignore empties other than False or 0 - del params[key] - continue - - elif isinstance(params[key], list): - params[key] = " ".join(params[key]) - - elif not isinstance(params[key], unicode): - params[key] = params[key].decode(usercodec) - - assert isinstance(params[key], basestring) - - params = dict([ (k, v.encode('utf8')) - for k, v in params.items() if v]) - - return params - - -def dlcs_parse_xml(data, split_tags=False): - """Parse any del.icio.us XML document and return Python data structure. - - Recognizes all XML document formats as returned by the version 1 API and - translates to a JSON-like data structure (dicts 'n lists). - - Returned instance is always a dictionary. Examples:: - - {'posts': [{'url':'...','hash':'...',},],} - {'tags':['tag1', 'tag2',]} - {'dates': [{'count':'...','date':'...'},], 'tag':'', 'user':'...'} - {'result':(True, "done")} - # etcetera. - """ - # TODO: split_tags is not implemented - - if DEBUG>3: print >>sys.stderr, "dlcs_parse_xml: parsing from ", data - - if not hasattr(data, 'read'): - data = StringIO(data) - - doc = parse_xml(data) - root = doc.getroot() - fmt = root.tag - - # Split up into three cases: Data, Result or Update - if fmt in ('tags', 'posts', 'dates', 'bundles'): - - # Data: expect a list of data elements, 'resources'. - # Use `fmt` (without last 's') to find data elements, elements - # don't have contents, attributes contain all the data we need: - # append to list - elist = [el.attrib for el in doc.findall(fmt[:-1])] - - # Return list in dict, use tagname of rootnode as keyname. - data = {fmt: elist} - - # Root element might have attributes too, append dict. 
- data.update(root.attrib) - - return data - - elif fmt == 'result': - - # Result: answer to operations - if root.attrib.has_key('code'): - msg = root.attrib['code'] - else: - msg = root.text - - # XXX: Return {'result':(True, msg)} for /known/ O.K. messages, - # use (False, msg) otherwise. Move this to DeliciousAPI? - v = msg in DLCS_OK_MESSAGES - return {fmt: (v, msg)} - - elif fmt == 'update': - - # Update: "time" - return {fmt: { - 'time':time.strptime(root.attrib['time'], ISO_8601_DATETIME) }} - - else: - raise PyDeliciousException, "Unknown XML document format '%s'" % fmt - - -def dlcs_rss_request(tag="", popular=0, user="", url=''): - """Parse a RSS request. - - This requests old (now undocumented?) URL paths that still seem to work. - """ - - tag = quote_plus(tag) - user = quote_plus(user) - - if url != '': - # http://del.icio.us/rss/url/efbfb246d886393d48065551434dab54 - url = DLCS_RSS + 'url/%s' % md5(url).hexdigest() - - elif user != '' and tag != '': - url = DLCS_RSS + '%(user)s/%(tag)s' % {'user':user, 'tag':tag} - - elif user != '' and tag == '': - # http://del.icio.us/rss/delpy - url = DLCS_RSS + '%s' % user - - elif popular == 0 and tag == '': - url = DLCS_RSS - - elif popular == 0 and tag != '': - # http://del.icio.us/rss/tag/apple - # http://del.icio.us/rss/tag/web2.0 - url = DLCS_RSS + "tag/%s" % tag - - elif popular == 1 and tag == '': - url = DLCS_RSS + 'popular/' - - elif popular == 1 and tag != '': - url = DLCS_RSS + 'popular/%s' % tag - - if DEBUG: - print 'dlcs_rss_request', url - - rss = http_request(url).read() - - # assert feedparser, "dlcs_rss_request requires feedparser to be installed." 
- if not feedparser: - return rss - - rss = feedparser.parse(rss) - - posts = [] - for e in rss.entries: - if e.has_key("links") and e["links"]!=[] and e["links"][0].has_key("href"): - url = e["links"][0]["href"] - elif e.has_key("link"): - url = e["link"] - elif e.has_key("id"): - url = e["id"] - else: - url = "" - if e.has_key("title"): - description = e['title'] - elif e.has_key("title_detail") and e["title_detail"].has_key("title"): - description = e["title_detail"]['value'] - else: - description = '' - try: tags = e['categories'][0][1] - except: - try: tags = e["category"] - except: tags = "" - if e.has_key("modified"): - dt = e['modified'] - else: - dt = "" - if e.has_key("summary"): - extended = e['summary'] - elif e.has_key("summary_detail"): - e['summary_detail']["value"] - else: - extended = "" - if e.has_key("author"): - user = e['author'] - else: - user = "" - # time = dt ist weist auf ein problem hin - # die benennung der variablen ist nicht einheitlich - # api senden und - # xml bekommen sind zwei verschiedene schuhe :( - posts.append({'url':url, 'description':description, 'tags':tags, - 'dt':dt, 'extended':extended, 'user':user}) - return posts - - -delicious_v2_feeds = { - #"Bookmarks from the hotlist" - '': "%(format)s", - #"Recent bookmarks" - 'recent': "%(format)s/recent", - #"Recent bookmarks by tag" - 'tagged': "%(format)s/tag/%(tags)s", - #"Popular bookmarks" - 'popular': "%(format)s/popular", - #"Popular bookmarks by tag" - 'popular_tagged': "%(format)s/popular/%(tag)s", - #"Recent site alerts (as seen in the top-of-page alert bar on the site)" - 'alerts': "%(format)s/alerts", - #"Bookmarks for a specific user" - 'user': "%(format)s/%(username)s", - #"Bookmarks for a specific user by tag(s)" - 'user_tagged': "%(format)s/%(username)s/%(tags)s", - #"Public summary information about a user (as seen in the network badge)" - 'user_info': "%(format)s/userinfo/%(username)s", - #"A list of all public tags for a user" - 'user_tags': 
"%(format)s/tags/%(username)s", - #"Bookmarks from a user's subscriptions" - 'user_subscription': "%(format)s/subscriptions/%(username)s", - #"Private feed for a user's inbox bookmarks from others" - 'user_inbox': "%(format)s/inbox/%(username)s?private=%(key)s", - #"Bookmarks from members of a user's network" - 'user_network': "%(format)s/network/%(username)s", - #"Bookmarks from members of a user's network by tag" - 'user_network_tagged': "%(format)s/network/%(username)s/%(tags)s", - #"A list of a user's network members" - 'user_network_member': "%(format)s/networkmembers/%(username)s", - #"A list of a user's network fans" - 'user_network_fan': "%(format)s/networkfans/%(username)s", - #"Recent bookmarks for a URL" - 'url': "%(format)s/url/%(urlmd5)s", - #"Summary information about a URL (as seen in the tagometer)" - 'urlinfo': "json/urlinfo/%(urlmd5)s", -} - -def dlcs_feed(name_or_url, url_map=delicious_v2_feeds, count=15, **params): - - """ - Request and parse a feed. See delicious_v2_feeds for available names and - required parameters. Format defaults to json. - """ - -# http://delicious.com/help/feeds -# TODO: plain or fancy - - format = params.setdefault('format', 'json') - if count == 'all': -# TODO: fetch all - print >>sys.stderr, "! Maxcount 100 " - count = 100 - - if name_or_url in url_map: - params['count'] = count - url = DLCS_FEEDS + url_map[name_or_url] % params - - else: - url = name_or_url - - if DEBUG: - print 'dlcs_feed', url - - feed = http_request(url).read() - - if format == 'rss': - if feedparser: - rss = feedparser.parse(feed) - return rss - - else: - return feed - - elif format == 'json': - return feed - - -### Main module class - -class DeliciousAPI: - - """A single-user Python facade to the del.icio.us HTTP API. - - See http://delicious.com/help/api. - - Methods ``request`` and ``request_raw`` represent the core. For all API - paths there are furthermore methods (e.g. 
posts_add for 'posts/all') with - an explicit declaration of parameters and documentation. - """ - - def __init__(self, user, passwd, codec=PREFERRED_ENCODING, - api_request=dlcs_api_request, xml_parser=dlcs_parse_xml, - build_opener=dlcs_api_opener, encode_params=dlcs_encode_params): - - """Initialize access to the API for ``user`` with ``passwd``. - - ``codec`` sets the encoding of the arguments, which defaults to the - users preferred locale. - - The ``api_request`` and ``xml_parser`` parameters by default point to - functions within this package with standard implementations which - request and parse a resource. See ``dlcs_api_request()`` and - ``dlcs_parse_xml()``. - - Parameter ``build_opener`` is a callable that, provided with the - credentials, should build a urllib2 opener for the delicious API server - with HTTP authentication. See ``dlcs_api_opener()`` for the default - implementation. - - ``encode_params`` finally preprocesses API parameters before - they are passed to ``api_request``. - """ - - assert user != "" - self.user = user - self.passwd = passwd - self.codec = codec - - # Implement communication to server and parsing of respons messages: - assert callable(encode_params) - self._encode_params = encode_params - assert callable(build_opener) - self._opener = build_opener(user, passwd) - assert callable(api_request) - self._api_request = api_request - assert callable(xml_parser) - self._parse_response = xml_parser - - ### Core functionality - - def request(self, path, _raw=False, **params): - """Sends a request message to `path` in the API, and parses the results - from XML. Use with ``_raw=True`` or ``call request_raw()`` directly - to get the filehandler and process the response message manually. - - Calls to some paths will return a `result` message, i.e.:: - - <result code="..." 
/> - - or:: - - <result>...</result> - - These should all be parsed to ``{'result':(Boolean, MessageString)}``, - this method raises a ``DeliciousError`` on negative `result` answers. - Positive answers are silently accepted and nothing is returned. - - Using ``_raw=True`` bypasses all parsing and never raises - ``DeliciousError``. - - See ``dlcs_parse_xml()`` and ``self.request_raw()``.""" - - if _raw: - # return answer - return self.request_raw(path, **params) - - else: - params = self._encode_params(params, self.codec) - - # get answer and parse - fl = self._api_request(path, params=params, opener=self._opener) - rs = self._parse_response(fl) - - if type(rs) == dict and 'result' in rs: - if not rs['result'][0]: - # Raise an error for negative 'result' answers - errmsg = "" - if len(rs['result'])>0: - errmsg = rs['result'][1] - DeliciousError.raiseFor(errmsg, path, **params) - - else: - # not out-of-the-oridinary result, OK - return - - return rs - - def request_raw(self, path, **params): - """Calls the path in the API, returns the filehandle. Returned file- - like instances have an ``HTTPMessage`` instance with HTTP header - information available. Use ``filehandle.info()`` or refer to the - ``urllib2.openurl`` documentation. - """ - # see `request()` on how the response can be handled - params = self._encode_params(params, self.codec) - return self._api_request(path, params=params, opener=self._opener) - - ### Explicit declarations of API paths, their parameters and docs - - # Tags - def tags_get(self, **kwds): - """Returns a list of tags and the number of times it is used by the - user. - :: - - <tags> - <tag tag="TagName" count="888"> - """ - return self.request("tags/get", **kwds) - - def tags_delete(self, tag, **kwds): - """Delete an existing tag. - - &tag={TAG} - (required) Tag to delete - """ - return self.request('tags/delete', tag=tag, **kwds) - - def tags_rename(self, old, new, **kwds): - """Rename an existing tag with a new tag name. 
Returns a `result` - message or raises an ``DeliciousError``. See ``self.request()``. - - &old={TAG} - (required) Tag to rename. - &new={TAG} - (required) New tag name. - """ - return self.request("tags/rename", old=old, new=new, **kwds) - - # Posts - def posts_update(self, **kwds): - """Returns the last update time for the user. Use this before calling - `posts_all` to see if the data has changed since the last fetch. - :: - - <update time="CCYY-MM-DDThh:mm:ssZ"> - """ - return self.request("posts/update", **kwds) - - def posts_dates(self, tag="", **kwds): - """Returns a list of dates with the number of posts at each date. - :: - - <dates> - <date date="CCYY-MM-DD" count="888"> - - &tag={TAG} - (optional) Filter by this tag - """ - return self.request("posts/dates", tag=tag, **kwds) - - def posts_get(self, tag="", dt="", url="", hashes=[], meta=True, **kwds): - """Returns posts matching the arguments. If no date or url is given, - most recent date will be used. - :: - - <posts dt="CCYY-MM-DD" tag="..." user="..."> - <post ...> - - &tag={TAG} {TAG} ... {TAG} - (optional) Filter by this/these tag(s). - &dt={CCYY-MM-DDThh:mm:ssZ} - (optional) Filter by this date, defaults to the most recent date on - which bookmarks were saved. - &url={URL} - (optional) Fetch a bookmark for this URL, regardless of date. - &hashes={MD5} {MD5} ... {MD5} - (optional) Fetch multiple bookmarks by one or more URL MD5s - regardless of date. - &meta=yes - (optional) Include change detection signatures on each item in a - 'meta' attribute. Clients wishing to maintain a synchronized local - store of bookmarks should retain the value of this attribute - its - value will change when any significant field of the bookmark - changes. - """ - return self.request("posts/get", tag=tag, dt=dt, url=url, - hashes=hashes, meta=meta, **kwds) - - def posts_recent(self, tag="", count="", **kwds): - """Returns a list of the most recent posts, filtered by argument. - :: - - <posts tag="..." 
user="..."> - <post ...> - - &tag={TAG} - (optional) Filter by this tag. - &count={1..100} - (optional) Number of items to retrieve (Default:15, Maximum:100). - """ - return self.request("posts/recent", tag=tag, count=count, **kwds) - - def posts_all(self, tag="", start=None, results=None, fromdt=None, - todt=None, meta=True, hashes=False, **kwds): - """Returns all posts. Please use sparingly. Call the `posts_update` - method to see if you need to fetch this at all. - :: - - <posts tag="..." user="..." update="CCYY-MM-DDThh:mm:ssZ"> - <post ...> - - &tag - (optional) Filter by this tag. - &start={#} - (optional) Start returning posts this many results into the set. - &results={#} - (optional) Return this many results. - &fromdt={CCYY-MM-DDThh:mm:ssZ} - (optional) Filter for posts on this date or later - &todt={CCYY-MM-DDThh:mm:ssZ} - (optional) Filter for posts on this date or earlier - &meta=yes - (optional) Include change detection signatures on each item in a - 'meta' attribute. Clients wishing to maintain a synchronized local - store of bookmarks should retain the value of this attribute - its - value will change when any significant field of the bookmark - changes. - &hashes - (optional, exclusive) Do not fetch post details but a posts - manifest with url- and meta-hashes. Other options do not apply. - """ - if hashes: - return self.request("posts/all", hashes=hashes, **kwds) - else: - return self.request("posts/all", tag=tag, fromdt=fromdt, todt=todt, - start=start, results=results, meta=meta, **kwds) - - def posts_add(self, url, description, extended="", tags="", dt="", - replace=False, shared=True, **kwds): - """Add a post to del.icio.us. Returns a `result` message or raises an - ``DeliciousError``. See ``self.request()``. - - &url (required) - the url of the item. - &description (required) - the description of the item. - &extended (optional) - notes for the item. - &tags (optional) - tags for the item (space delimited). 
- &dt (optional) - datestamp of the item (format "CCYY-MM-DDThh:mm:ssZ"). - Requires a LITERAL "T" and "Z" like in ISO8601 at - http://www.cl.cam.ac.uk/~mgk25/iso-time.html for example: - "1984-09-01T14:21:31Z" - &replace=no (optional) - don't replace post if given url has already - been posted. - &shared=yes (optional) - wether the item is public. - """ - return self.request("posts/add", url=url, description=description, - extended=extended, tags=tags, dt=dt, - replace=replace, shared=shared, **kwds) - - def posts_delete(self, url, **kwds): - """Delete a post from del.icio.us. Returns a `result` message or - raises an ``DeliciousError``. See ``self.request()``. - - &url (required) - the url of the item. - """ - return self.request("posts/delete", url=url, **kwds) - - # Bundles - def bundles_all(self, **kwds): - """Retrieve user bundles from del.icio.us. - :: - - <bundles> - <bundel name="..." tags=..."> - """ - return self.request("tags/bundles/all", **kwds) - - def bundles_set(self, bundle, tags, **kwds): - """Assign a set of tags to a single bundle, wipes away previous - settings for bundle. Returns a `result` messages or raises an - ``DeliciousError``. See ``self.request()``. - - &bundle (required) - the bundle name. - &tags (required) - list of tags. - """ - if type(tags)==list: - tags = " ".join(tags) - return self.request("tags/bundles/set", bundle=bundle, tags=tags, - **kwds) - - def bundles_delete(self, bundle, **kwds): - """Delete a bundle from del.icio.us. Returns a `result` message or - raises an ``DeliciousError``. See ``self.request()``. - - &bundle (required) - the bundle name. - """ - return self.request("tags/bundles/delete", bundle=bundle, **kwds) - - ### Utils - - # Lookup table for del.icio.us url-path to DeliciousAPI method. 
- paths = { - 'tags/get': 'tags_get', - 'tags/delete': 'tags_delete', - 'tags/rename': 'tags_rename', - 'posts/update': 'posts_update', - 'posts/dates': 'posts_dates', - 'posts/get': 'posts_get', - 'posts/recent': 'posts_recent', - 'posts/all': 'posts_all', - 'posts/add': 'posts_add', - 'posts/delete': 'posts_delete', - 'tags/bundles/all': 'bundles_all', - 'tags/bundles/set': 'bundles_set', - 'tags/bundles/delete': 'bundles_delete', - } - def get_method(self, path): - return getattr(self, self.paths[path]) - - def get_url(self, url): - """Return the del.icio.us url at which the HTML page with posts for - ``url`` can be found. - """ - return "http://del.icio.us/url/?url=%s" % (url,) - - def __repr__(self): - return "DeliciousAPI(%s)" % self.user - - -### Convenience functions on this package - -def apiNew(user, passwd): - "Creates a new DeliciousAPI object, requires user(name) and passwd." - return DeliciousAPI(user=user, passwd=passwd) - -def add(user, passwd, url, description, tags="", extended="", dt=None, - replace=False): - apiNew(user, passwd).posts_add(url=url, description=description, - extended=extended, tags=tags, dt=dt, replace=replace) - -def get(user, passwd, tag="", dt=None, count=0, hashes=[]): - "Returns a list of posts for the user" - posts = apiNew(user, passwd).posts_get( - tag=tag, dt=dt, hashes=hashes)['posts'] - if count: posts = posts[:count] - return posts - -def get_update(user, passwd): - "Returns the last update time for the user." - return apiNew(user, passwd).posts_update()['update']['time'] - -def get_all(user, passwd, tag="", start=0, results=100, fromdt=None, - todt=None): - "Returns a list with all posts. Please use sparingly. See `get_updated`" - return apiNew(user, passwd).posts_all(tag=tag, start=start, - results=results, fromdt=fromdt, todt=todt, meta=True)['posts'] - -def get_tags(user, passwd): - "Returns a list with all tags for user." 
- return apiNew(user=user, passwd=passwd).tags_get()['tags'] - -def delete(user, passwd, url): - "Delete the URL from the del.icio.us account." - apiNew(user, passwd).posts_delete(url=url) - -def rename_tag(user, passwd, oldtag, newtag): - "Rename the tag for the del.icio.us account." - apiNew(user=user, passwd=passwd).tags_rename(old=oldtag, new=newtag) - - -### RSS functions - -def getrss(tag="", popular=0, url='', user=""): - """Get posts from del.icio.us via parsing RSS. - - tag (opt) sort by tag - popular (opt) look for the popular stuff - user (opt) get the posts by a user, this striks popular - url (opt) get the posts by url - """ - return dlcs_rss_request(tag=tag, popular=popular, user=user, url=url) - -def get_userposts(user): - "parse RSS for user" - return getrss(user=user) - -def get_tagposts(tag): - "parse RSS for tag" - return getrss(tag=tag) - -def get_urlposts(url): - "parse RSS for URL" - return getrss(url=url) - -def get_popular(tag=""): - "parse RSS for popular URLS for tag" - return getrss(tag=tag, popular=1) - - -### JSON feeds -# TODO: untested - -def json_posts(user, count=15, tag=None, raw=True): - """ - user - count=### the number of posts you want to get (default is 15, maximum - is 100) - raw a raw JSON object is returned, instead of an object named - Delicious.posts - """ - url = "http://del.icio.us/feeds/json/" + \ - dlcs_encode_params({0:user})[0] - if tag: url += '/'+dlcs_encode_params({0:tag})[0] - - return dlcs_feed(url, count=count, raw=raw) - - -def json_tags(user, atleast, count, sort='alpha', raw=True, callback=None): - """ - user - atleast=### include only tags for which there are at least ### - number of posts. - count=### include ### tags, counting down from the top. - sort={alpha|count} construct the object with tags in alphabetic order - (alpha), or by count of posts (count). - callback=NAME wrap the object definition in a function call NAME(...), - thus invoking that function when the feed is executed. 
- raw a pure JSON object is returned, instead of code that - will construct an object named Delicious.tags. - """ - url = 'http://del.icio.us/feeds/json/tags/' + \ - dlcs_encode_params({0:user})[0] - return dlcs_feed(url, atleast=atleast, count=count, sort=sort, raw=raw, - callback=callback) - - -def json_network(user, raw=True, callback=None): - """ - callback=NAME wrap the object definition in a function call NAME(...) - ?raw a raw JSON object is returned, instead of an object named - Delicious.posts - """ - url = 'http://del.icio.us/feeds/json/network/' + \ - dlcs_encode_params({0:user})[0] - return dlcs_feed(url, raw=raw, callback=callback) - - -def json_fans(user, raw=True, callback=None): - """ - callback=NAME wrap the object definition in a function call NAME(...) - ?raw a pure JSON object is returned, instead of an object named - Delicious. - """ - url = 'http://del.icio.us/feeds/json/fans/' + \ - dlcs_encode_params({0:user})[0] - return dlcs_feed(url, raw=raw, callback=callback) - - -### delicious V2 feeds - -def getfeed(name, **params): - return dlcs_feed(name, **params) - diff --git a/lib/utilslib/strutils.py b/lib/utilslib/strutils.py deleted file mode 100644 index 368d3d8..0000000 --- a/lib/utilslib/strutils.py +++ /dev/null @@ -1,50 +0,0 @@ - -# -# String/unicode conversion utils. -# - -def safestr(s): - """ - Safely corerce *anything* to a string. If the object can't be str'd, an - empty string will be returned. - - You can (and I do) use this for really crappy unicode handling, but it's - a bit like killing a mosquito with a bazooka. - """ - if s is None: - return "" - if isinstance(s, unicode): - return s.encode('ascii', 'xmlcharrefreplace') - else: - try: - return str(s) - except: - return "" - -def safeint(s): - """Like safestr(), but always returns an int. 
Returns 0 on failure.""" - try: - return int(safestr(s)) - except ValueError: - return 0 - - -def convertentity(m): - import htmlentitydefs - """Convert a HTML entity into normal string (ISO-8859-1)""" - if m.group(1)=='#': - try: - return chr(int(m.group(2))) - except ValueError: - return '&#%s;' % m.group(2) - try: - return htmlentitydefs.entitydefs[m.group(2)] - except KeyError: - return '&%s;' % m.group(2) - -def unquotehtml(s): - import re - """Convert a HTML quoted string into normal string (ISO-8859-1). - - Works with &#XX; and with > etc.""" - return re.sub(r'&(#?)(.+?);',convertentity,s) |