Source code for html5charref

"""
html5charref
=============

Python library for escaping/unescaping HTML5 Named Character References.

The standard python library includes the `HTMLParser`_ package
for unescaping HTML named entities and HTML unicode escapes.  Unfortunately,
it doesn't include any of the named character entity references defined in
`HTML5`_.  This library intends to provide a solution for 
escaping/unescaping HTML character references defined in HTML5.

.. _HTMLParser: https://docs.python.org/2/library/htmlparser.html
.. _HTML5: http://dev.w3.org/html5/html-author/charref


Installation
------------

This project is still under development, so you should install it via GitHub
instead of PyPI::

    pip install git+https://github.com/bpabel/html5charref.git



Usage
-------

The main purpose of html5charref is to unescape HTML named entities.  It 
will also handle HTML unicode character escapes.

::

    import html5charref
    html = u'This has &copy; and &lt; and &#xa9; symbols'
    print html5charref.unescape(html)
    # u'This has \u00a9 and < and \u00a9 symbols'


You can also use html5charref to find the HTML5 named entity for a given
unicode character.

::

    import html5charref
    # The copyright character
    print html5charref.escape_char(u'\u00a9')
    # u'&copy;'



Updating Named Entity References
--------------------------------

It is possible that additional named entity references will be
added to the HTML5 spec.  You can update the list maintained by
html5charref using the :func:`update_charrefs` function.  This queries
the latest named entity definitions from the w3 HTML5 site.

::

    import html5charref
    html5charref.update_charrefs()



Licensing
---------

This project is licensed under the `MIT`_ license.

.. _MIT: http://opensource.org/licenses/MIT



API Reference
--------------

"""


import os
import re
import json


__version__ = '0.1.0'

# Page that update_charrefs() downloads to rebuild the reference table.
UPDATE_URL = r'http://dev.w3.org/html5/html-author/charref'
# Location of the JSON cache, relative to the directory of this module.
CACHE_FILENAME = 'data/html5charref.json'

# Lookup tables, filled in by _load_charrefs() on first use:
#   charref_map: named character reference -> unicode character
#   unicode_map: unicode character -> list of named character references
charref_map = None
unicode_map = None


def update_charrefs():
    """
    Update the named entity dictionary from the w3 html5 specification site.

    Downloads ``UPDATE_URL``, reads the rows of the first table on the page
    and rewrites the JSON cache file (``CACHE_FILENAME``, resolved relative
    to the directory of this module) with a mapping of named character
    reference to unicode character.

    :raises requests.HTTPError: if the server answers with an error status.
    """
    # Imported inside the function, so these third-party packages are only
    # needed when this function is actually called.
    import requests
    import BeautifulSoup as bs

    filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            CACHE_FILENAME)

    r = requests.get(UPDATE_URL)
    # Fail with a clear HTTP error instead of trying to parse an error page.
    r.raise_for_status()
    soup = bs.BeautifulSoup(r.text)

    escape_map = dict()
    for row in soup.table.contents:
        if not isinstance(row, bs.Tag):
            continue
        # Second cell: space separated list of names for this character.
        escape_codes = row.contents[1].text.replace('&amp;', '&').split(' ')
        # First cell: the character written as a five digit hex reference.
        match = re.match(r'&#x([a-f0-9]{5});', row.contents[0].text,
                         flags=re.I)
        if match:
            unicode_char = '\\U000{0}'.format(
                match.group(1)).decode('unicode-escape')
            for escape_code in escape_codes:
                escape_map[escape_code] = unicode_char

    # The cache lives in a sub-directory that may not exist yet.
    cache_dir = os.path.dirname(filepath)
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)

    with open(filepath, 'w') as f:
        json.dump(escape_map, f, sort_keys=True, separators=(',', ': '),
                  indent=0)
def _load_charrefs():
    """
    Populate the module-level lookup tables from the JSON cache on disk.

    When the cache file is missing it is first created by calling
    :func:`update_charrefs`.
    """
    global charref_map
    global unicode_map

    module_dir = os.path.dirname(os.path.abspath(__file__))
    cache_path = os.path.join(module_dir, CACHE_FILENAME)
    if not os.path.isfile(cache_path):
        update_charrefs()

    with open(cache_path, 'r') as cache_file:
        charref_map = json.load(cache_file)

    # Build the reverse table as well.  Several named references can point
    # at the same unicode character (e.g. &lsqb; and &lbrack;), so each
    # character maps to a list of names.
    reverse = dict()
    for name in charref_map:
        char = charref_map[name]
        if char not in reverse:
            reverse[char] = []
        reverse[char].append(name)
    unicode_map = reverse
def unescape_charref(charref):
    """
    Look up the unicode character for an HTML5 named character reference.

    The reference is returned unchanged when it is not a known name.
    """
    if charref_map is None:
        # First use: read the cached table from disk.
        _load_charrefs()
    try:
        return charref_map[charref]
    except KeyError:
        return charref
def escape_char(c, named_only=False):
    """
    Return an HTML5 named character reference for the given unicode
    character.

    If no named reference is available, return an html unicode escape
    (``&#xhhhhh;``), or the original character if that cannot be done.
    ASCII characters without a named reference are returned unchanged;
    they are never turned into a unicode escape.

    :param c: The unicode character to escape.
    :param bool named_only: If set to True, will only try to use named
        entities.  If a named entity can't be found, the original
        character will be returned instead of an html unicode escape.

    .. note::

        Because several character references may refer to the same unicode
        point, the returned character reference may not be the one you
        expect.  Use the :func:`escape_char_advanced` function to get a
        list of all named character references for a given unicode point
        and choose the specific one you want.
    """
    if unicode_map is None:
        _load_charrefs()

    charrefs = unicode_map.get(c)
    if charrefs:
        # Prefer an all-lowercase name when there is one, otherwise fall
        # back to the first name on record.
        for charref in charrefs:
            if re.match('&[a-z]+;', charref):
                return charref
        return charrefs[0]

    if named_only:
        return c

    try:
        code_point = ord(c)
    except TypeError:
        # ord() only accepts a single character; anything else (the
        # original code mentions surrogate pairs for high code points)
        # is handed back untouched.
        return c

    # Don't unicode escape ascii chars.
    if code_point < 128:
        return c
    return '&#x{0:05x};'.format(code_point)
def escape_char_advanced(c):
    """
    List every HTML5 named character reference known for a unicode
    character.

    Returns ``None`` when the character has no named reference.
    """
    if unicode_map is None:
        # First use: read the cached table from disk.
        _load_charrefs()
    known = unicode_map
    return known[c] if c in known else None
def unescape(html):
    """
    Return a unicode string with html character entity references and
    html unicode escapes converted to their unicode equivalent.

    This closely matches HTMLParser.unescape(), but supports the HTML5
    named entities.

    Numeric references that cannot be converted to a character (malformed
    digits, or a number too large to be a code point) are left in the text
    unchanged.  Named references are resolved with
    :func:`unescape_charref`.

    :param html: Text that may contain character references.
    :return: The text with the recognised references replaced.
    """
    if '&' not in html:
        return html

    # ``unichr`` only exists on Python 2; fall back to ``chr`` elsewhere.
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def repl(m):
        s = m.group(1)
        if s[0] != "#":
            # Named reference: look up the full ``&name;`` form.
            return unescape_charref('&' + s + ';')

        digits = s[1:]
        try:
            if digits[0] in ('x', 'X'):
                c = int(digits[1:], 16)
            else:
                c = int(digits)
            return to_char(c)
        except (ValueError, OverflowError):
            # ValueError: bad digits or code point out of range.
            # OverflowError: number too large for unichr()/chr().
            # Either way, put the reference back exactly as it was.
            return '&#' + digits + ';'

    return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w+));", repl, html)