diff -Nru html5lib-0.999999999/AUTHORS.rst html5lib-1.0.1/AUTHORS.rst --- html5lib-0.999999999/AUTHORS.rst 2016-07-11 23:24:55.000000000 +0000 +++ html5lib-1.0.1/AUTHORS.rst 2017-12-07 12:25:26.000000000 +0000 @@ -6,6 +6,7 @@ - James Graham - Geoffrey Sneddon - Łukasz Langa +- Will Kahn-Greene Patches and suggestions @@ -23,22 +24,43 @@ - Philip Taylor - Edward Z. Yang - fantasai -- Mike West - Philip Jägenstedt - Ms2ger - Mohammad Taha Jahangir - Andy Wingo -- Juan Carlos Garcia Segovia - Andreas Madsack - Karim Valiev +- Juan Carlos Garcia Segovia +- Mike West - Marc DM +- Simon Sapin +- Michael[tm] Smith +- Ritwik Gupta +- Marc Abramowitz - Tony Lopes - lilbludevil -- Simon Sapin -- Jon Dufresne +- Kevin - Drew Hubl - Austin Kumbera - Jim Baker -- Michael[tm] Smith -- Marc Abramowitz - Jon Dufresne +- Donald Stufft +- Alex Gaynor +- Nik Nyby +- Jakub Wilk +- Sigmund Cherem +- Gabi Davar +- Florian Mounier +- neumond +- Vitalik Verhovodov +- Kovid Goyal +- Adam Chainz +- John Vandenberg +- Eric Amorde +- Benedikt Morbach +- Jonathan Vanasco +- Tom Most +- Ville Skyttä +- Hugo van Kemenade +- Mark Vasilkov + diff -Nru html5lib-0.999999999/CHANGES.rst html5lib-1.0.1/CHANGES.rst --- html5lib-0.999999999/CHANGES.rst 2016-07-15 01:34:57.000000000 +0000 +++ html5lib-1.0.1/CHANGES.rst 2017-12-07 14:09:00.000000000 +0000 @@ -1,6 +1,50 @@ Change Log ---------- +1.0.1 +~~~~~ + +Released on December 7, 2017 + +Breaking changes: + +* Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) +* Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + +Features: + +* Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) +* Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!) +* Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) +* Support Python 3.6. (#333) (Thank you, Jon Dufresne!) +* Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) +* Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!) +* Semver-compliant version number. + +Bug fixes: + +* Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) +* Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) +* Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) +* Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) +* Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) +* Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) +* Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + +1.0 +~~~ + +Released and unreleased on December 7, 2017. Badly packaged release. + + 0.999999999/1.0b10 ~~~~~~~~~~~~~~~~~~ @@ -25,7 +69,7 @@ * Cease supporting DATrie under PyPy. -* **Remove ``PullDOM`` support, as this hasn't ever been properly +* **Remove PullDOM support, as this hasn't ever been properly tested, doesn't entirely work, and as far as I can tell is completely unused by anyone.** @@ -63,7 +107,7 @@ to clarify their status as public.** * **Get rid of the sanitizer package. Merge sanitizer.sanitize into the - sanitizer.htmlsanitizer module and move that to saniziter. This means + sanitizer.htmlsanitizer module and move that to sanitizer. This means anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no code changes.** diff -Nru html5lib-0.999999999/debian/changelog html5lib-1.0.1/debian/changelog --- html5lib-0.999999999/debian/changelog 2016-11-16 14:34:19.000000000 +0000 +++ html5lib-1.0.1/debian/changelog 2018-07-11 08:30:04.000000000 +0000 @@ -1,3 +1,16 @@ +html5lib (1.0.1-1) unstable; urgency=medium + + [ Ondřej Nový ] + * d/control: Set Vcs-* to salsa.debian.org + * d/changelog: Remove trailing whitespaces + * d/control: Remove ancient X-Python-Version field + * d/control: Remove ancient X-Python3-Version field + + [ Alexander GQ Gerasiov ] + * New upstream release (Closes: #895816). + + -- Alexander GQ Gerasiov Wed, 11 Jul 2018 11:30:04 +0300 + html5lib (0.999999999-1) unstable; urgency=medium [ Ondřej Nový ] @@ -95,16 +108,16 @@ * Drop patch system and debian/README.source * Remove debian/examples - not shipped by upstream anymore. * Bump Standards-Version to 3.9.3. - * Do not install tests folder. + * Do not install tests folder. -- Bernd Zeimetz Wed, 09 May 2012 22:28:28 +0200 html5lib (0.90-1) unstable; urgency=low - * New upstream version. - * Updating patch to apply at the new version. + * New upstream version. + * Updating patch to apply at the new version. * Check for the tests directory before running tests. - They're not always included in the source.... + They're not always included in the source.... -- Bernd Zeimetz Sun, 24 Jan 2010 01:28:50 +0100 @@ -134,7 +147,7 @@ [ Bernd Zeimetz ] * debian/README.source: Add file * debian/control: Bump Standards-Version to 3.8.2. - * debian/copyright: Updating debian packaging copyright. + * debian/copyright: Updating debian packaging copyright. -- Bernd Zeimetz Wed, 05 Aug 2009 22:12:43 +0200 diff -Nru html5lib-0.999999999/debian/control html5lib-1.0.1/debian/control --- html5lib-0.999999999/debian/control 2016-11-16 14:34:19.000000000 +0000 +++ html5lib-1.0.1/debian/control 2018-07-11 08:21:52.000000000 +0000 @@ -25,12 +25,10 @@ python3-setuptools, python3-six, python3-webencodings, -Vcs-Git: https://anonscm.debian.org/git/python-modules/packages/html5lib.git -Vcs-Browser: https://anonscm.debian.org/cgit/python-modules/packages/html5lib.git +Vcs-Git: https://salsa.debian.org/python-team/modules/html5lib.git +Vcs-Browser: https://salsa.debian.org/python-team/modules/html5lib Homepage: https://github.com/html5lib/html5lib-python Standards-Version: 3.9.8 -X-Python-Version: >= 2.6 -X-Python3-Version: >= 3.2 Package: python-html5lib Architecture: all diff -Nru html5lib-0.999999999/debian/.git-dpm html5lib-1.0.1/debian/.git-dpm --- html5lib-0.999999999/debian/.git-dpm 2016-11-16 14:34:19.000000000 +0000 +++ html5lib-1.0.1/debian/.git-dpm 2018-07-11 08:21:52.000000000 +0000 @@ -1,11 +1,11 @@ # see git-dpm(1) from git-dpm package -a7d4397597d76c71dd9d232522bad124f0dd68a3 -a7d4397597d76c71dd9d232522bad124f0dd68a3 -a7d4397597d76c71dd9d232522bad124f0dd68a3 -a7d4397597d76c71dd9d232522bad124f0dd68a3 -html5lib_0.999999999.orig.tar.gz -814e7ab8d865c3c0ba96a13fe383c06735329c36 -245488 +18b20c1acbaf76d21f006ff2f6b9960e4745a764 +18b20c1acbaf76d21f006ff2f6b9960e4745a764 +18b20c1acbaf76d21f006ff2f6b9960e4745a764 +18b20c1acbaf76d21f006ff2f6b9960e4745a764 +html5lib_1.0.1.orig.tar.gz +5e1a2c7e18de7d1d0883e223f1733dc6dc796ee2 +252959 debianTag="debian/%e%v" patchedTag="patched/%e%v" upstreamTag="upstream/%e%u" diff -Nru html5lib-0.999999999/html5lib/constants.py html5lib-1.0.1/html5lib/constants.py --- html5lib-0.999999999/html5lib/constants.py 2016-05-22 02:05:51.000000000 +0000 +++ html5lib-1.0.1/html5lib/constants.py 2017-12-07 12:25:26.000000000 +0000 @@ -423,7 +423,7 @@ ]) htmlIntegrationPointElements = frozenset([ - (namespaces["mathml"], "annotaion-xml"), + (namespaces["mathml"], "annotation-xml"), (namespaces["svg"], "foreignObject"), (namespaces["svg"], "desc"), (namespaces["svg"], "title") @@ -588,7 +588,7 @@ ]) booleanAttributes = { - "": frozenset(["irrelevant"]), + "": frozenset(["irrelevant", "itemscope"]), "style": frozenset(["scoped"]), "img": frozenset(["ismap"]), "audio": frozenset(["autoplay", "controls"]), @@ -606,6 +606,7 @@ "input": frozenset(["disabled", "readonly", "required", "autofocus", "checked", "ismap"]), "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]), "output": frozenset(["disabled", "readonly"]), + "iframe": frozenset(["seamless"]), } # entitiesWindows1252 has to be _ordered_ and needs to have an index. It @@ -2938,8 +2939,9 @@ class DataLossWarning(UserWarning): + """Raised when the current tree is unable to represent the input data""" pass -class ReparseException(Exception): +class _ReparseException(Exception): pass diff -Nru html5lib-0.999999999/html5lib/filters/alphabeticalattributes.py html5lib-1.0.1/html5lib/filters/alphabeticalattributes.py --- html5lib-0.999999999/html5lib/filters/alphabeticalattributes.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/filters/alphabeticalattributes.py 2017-12-07 12:25:26.000000000 +0000 @@ -2,19 +2,28 @@ from . import base -try: - from collections import OrderedDict -except ImportError: - from ordereddict import OrderedDict +from collections import OrderedDict + + +def _attr_key(attr): + """Return an appropriate key for an attribute for sorting + + Attributes have a namespace that can be either ``None`` or a string. We + can't compare the two because they're different types, so we convert + ``None`` to an empty string first. + + """ + return (attr[0][0] or ''), attr[0][1] class Filter(base.Filter): + """Alphabetizes attributes for elements""" def __iter__(self): for token in base.Filter.__iter__(self): if token["type"] in ("StartTag", "EmptyTag"): attrs = OrderedDict() for name, value in sorted(token["data"].items(), - key=lambda x: x[0]): + key=_attr_key): attrs[name] = value token["data"] = attrs yield token diff -Nru html5lib-0.999999999/html5lib/filters/inject_meta_charset.py html5lib-1.0.1/html5lib/filters/inject_meta_charset.py --- html5lib-0.999999999/html5lib/filters/inject_meta_charset.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/filters/inject_meta_charset.py 2017-12-07 12:25:26.000000000 +0000 @@ -4,7 +4,15 @@ class Filter(base.Filter): + """Injects ```` tag into head of document""" def __init__(self, source, encoding): + """Creates a Filter + + :arg source: the source token stream + + :arg encoding: the encoding to set + + """ base.Filter.__init__(self, source) self.encoding = encoding diff -Nru html5lib-0.999999999/html5lib/filters/lint.py html5lib-1.0.1/html5lib/filters/lint.py --- html5lib-0.999999999/html5lib/filters/lint.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/filters/lint.py 2017-12-07 12:25:26.000000000 +0000 @@ -10,7 +10,19 @@ class Filter(base.Filter): + """Lints the token stream for errors + + If it finds any errors, it'll raise an ``AssertionError``. + + """ def __init__(self, source, require_matching_tags=True): + """Creates a Filter + + :arg source: the source token stream + + :arg require_matching_tags: whether or not to require matching tags + + """ super(Filter, self).__init__(source) self.require_matching_tags = require_matching_tags diff -Nru html5lib-0.999999999/html5lib/filters/optionaltags.py html5lib-1.0.1/html5lib/filters/optionaltags.py --- html5lib-0.999999999/html5lib/filters/optionaltags.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/filters/optionaltags.py 2017-12-07 12:25:26.000000000 +0000 @@ -4,6 +4,7 @@ class Filter(base.Filter): + """Removes optional tags from the token stream""" def slider(self): previous1 = previous2 = None for token in self.source: diff -Nru html5lib-0.999999999/html5lib/filters/sanitizer.py html5lib-1.0.1/html5lib/filters/sanitizer.py --- html5lib-0.999999999/html5lib/filters/sanitizer.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/filters/sanitizer.py 2017-12-07 12:25:26.000000000 +0000 @@ -705,7 +705,7 @@ class Filter(base.Filter): - """ sanitization of XHTML+MathML+SVG and of inline style attributes.""" + """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes""" def __init__(self, source, allowed_elements=allowed_elements, @@ -718,6 +718,37 @@ attr_val_is_uri=attr_val_is_uri, svg_attr_val_allows_ref=svg_attr_val_allows_ref, svg_allow_local_href=svg_allow_local_href): + """Creates a Filter + + :arg allowed_elements: set of elements to allow--everything else will + be escaped + + :arg allowed_attributes: set of attributes to allow in + elements--everything else will be stripped + + :arg allowed_css_properties: set of CSS properties to allow--everything + else will be stripped + + :arg allowed_css_keywords: set of CSS keywords to allow--everything + else will be stripped + + :arg allowed_svg_properties: set of SVG properties to allow--everything + else will be removed + + :arg allowed_protocols: set of allowed protocols for URIs + + :arg allowed_content_types: set of allowed content types for ``data`` URIs. + + :arg attr_val_is_uri: set of attributes that have URI values--values + that have a scheme not listed in ``allowed_protocols`` are removed + + :arg svg_attr_val_allows_ref: set of SVG attributes that can have + references + + :arg svg_allow_local_href: set of SVG elements that can have local + hrefs--these are removed + + """ super(Filter, self).__init__(source) self.allowed_elements = allowed_elements self.allowed_attributes = allowed_attributes @@ -737,11 +768,11 @@ yield token # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and - # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style - # attributes are parsed, and a restricted set, # specified by - # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through. - # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified - # in ALLOWED_PROTOCOLS are allowed. + # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes + # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and + # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI + # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are + # allowed. # # sanitize_html('') # => <script> do_nasty_stuff() </script> @@ -782,7 +813,7 @@ # characters, nor why we call unescape. I just know it's always been here. # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all # this will do is remove *more* than it otherwise would. - val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\s]+", '', + val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '', unescape(attrs[attr])).lower() # remove replacement characters from unescaped characters val_unescaped = val_unescaped.replace("\ufffd", "") @@ -807,7 +838,7 @@ ' ', unescape(attrs[attr])) if (token["name"] in self.svg_allow_local_href and - (namespaces['xlink'], 'href') in attrs and re.search('^\s*[^#\s].*', + (namespaces['xlink'], 'href') in attrs and re.search(r'^\s*[^#\s].*', attrs[(namespaces['xlink'], 'href')])): del attrs[(namespaces['xlink'], 'href')] if (None, 'style') in attrs: @@ -837,16 +868,16 @@ def sanitize_css(self, style): # disallow urls - style = re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) + style = re.compile(r'url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ', style) # gauntlet - if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): + if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' - if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): + if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return '' clean = [] - for prop, value in re.findall("([-\w]+)\s*:\s*([^:;]*)", style): + for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style): if not value: continue if prop.lower() in self.allowed_css_properties: @@ -855,7 +886,7 @@ 'padding']: for keyword in value.split(): if keyword not in self.allowed_css_keywords and \ - not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa + not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword): # noqa break else: clean.append(prop + ': ' + value + ';') diff -Nru html5lib-0.999999999/html5lib/filters/whitespace.py html5lib-1.0.1/html5lib/filters/whitespace.py --- html5lib-0.999999999/html5lib/filters/whitespace.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/filters/whitespace.py 2017-12-07 12:25:26.000000000 +0000 @@ -10,7 +10,7 @@ class Filter(base.Filter): - + """Collapses whitespace except in pre, textarea, and script elements""" spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) def __iter__(self): diff -Nru html5lib-0.999999999/html5lib/html5parser.py html5lib-1.0.1/html5lib/html5parser.py --- html5lib-0.999999999/html5lib/html5parser.py 2016-07-15 01:34:57.000000000 +0000 +++ html5lib-1.0.1/html5lib/html5parser.py 2017-12-07 12:25:26.000000000 +0000 @@ -1,12 +1,8 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass, viewkeys, PY3 +from six import with_metaclass, viewkeys import types - -try: - from collections import OrderedDict -except ImportError: - from ordereddict import OrderedDict +from collections import OrderedDict from . import _inputstream from . import _tokenizer @@ -24,18 +20,53 @@ adjustForeignAttributes as adjustForeignAttributesMap, adjustMathMLAttributes, adjustSVGAttributes, E, - ReparseException + _ReparseException ) def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): - """Parse a string or file-like object into a tree""" + """Parse an HTML document as a string or file-like object into a tree + + :arg doc: the document to parse as a string or file-like object + + :arg treebuilder: the treebuilder to use when parsing + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :returns: parsed tree + + Example: + + >>> from html5lib.html5parser import parse + >>> parse('

This is a doc

') + + + """ tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parse(doc, **kwargs) def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): + """Parse an HTML fragment as a string or file-like object into a tree + + :arg doc: the fragment to parse as a string or file-like object + + :arg container: the container context to parse the fragment in + + :arg treebuilder: the treebuilder to use when parsing + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :returns: parsed tree + + Example: + + >>> from html5lib.html5libparser import parseFragment + >>> parseFragment('this is a fragment') + + + """ tb = treebuilders.getTreeBuilder(treebuilder) p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) return p.parseFragment(doc, container=container, **kwargs) @@ -54,16 +85,30 @@ class HTMLParser(object): - """HTML parser. Generates a tree structure from a stream of (possibly - malformed) HTML""" + """HTML parser + + Generates a tree structure from a stream of (possibly malformed) HTML. + + """ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): """ - strict - raise an exception when a parse error is encountered + :arg tree: a treebuilder class controlling the type of tree that will be + returned. Built in treebuilders can be accessed through + html5lib.treebuilders.getTreeBuilder(treeType) + + :arg strict: raise an exception when a parse error is encountered + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + :arg debug: whether or not to enable debug mode which logs things + + Example: + + >>> from html5lib.html5parser import HTMLParser + >>> parser = HTMLParser() # generates parser with etree builder + >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict - tree - a treebuilder class controlling the type of tree that will be - returned. Built in treebuilders can be accessed through - html5lib.treebuilders.getTreeBuilder(treeType) """ # Raise an exception on the first error encountered @@ -87,7 +132,7 @@ try: self.mainLoop() - except ReparseException: + except _ReparseException: self.reset() self.mainLoop() @@ -127,9 +172,8 @@ @property def documentEncoding(self): - """The name of the character encoding - that was used to decode the input stream, - or :obj:`None` if that is not determined yet. + """Name of the character encoding that was used to decode the input stream, or + :obj:`None` if that is not determined yet """ if not hasattr(self, 'tokenizer'): @@ -223,14 +267,24 @@ def parse(self, stream, *args, **kwargs): """Parse a HTML document into a well-formed tree - stream - a filelike object or string containing the HTML to be parsed + :arg stream: a file-like object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element). + + :arg scripting: treat noscript elements as if JavaScript was turned on - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) + :returns: parsed tree + + Example: + + >>> from html5lib.html5parser import HTMLParser + >>> parser = HTMLParser() + >>> parser.parse('

This is a doc

') + - scripting - treat noscript elements as if javascript was turned on """ self._parse(stream, False, None, *args, **kwargs) return self.tree.getDocument() @@ -238,17 +292,27 @@ def parseFragment(self, stream, *args, **kwargs): """Parse a HTML fragment into a well-formed tree fragment - container - name of the element we're setting the innerHTML property - if set to None, default to 'div' + :arg container: name of the element we're setting the innerHTML + property if set to None, default to 'div' + + :arg stream: a file-like object or string containing the HTML to be parsed + + The optional encoding parameter must be a string that indicates + the encoding. If specified, that encoding will be used, + regardless of any BOM or later declaration (such as in a meta + element) - stream - a filelike object or string containing the HTML to be parsed + :arg scripting: treat noscript elements as if JavaScript was turned on - The optional encoding parameter must be a string that indicates - the encoding. If specified, that encoding will be used, - regardless of any BOM or later declaration (such as in a meta - element) + :returns: parsed tree + + Example: + + >>> from html5lib.html5libparser import HTMLParser + >>> parser = HTMLParser() + >>> parser.parseFragment('this is a fragment') + - scripting - treat noscript elements as if javascript was turned on """ self._parse(stream, True, *args, **kwargs) return self.tree.getFragment() @@ -262,8 +326,7 @@ raise ParseError(E[errorcode] % datavars) def normalizeToken(self, token): - """ HTML5 specific normalizations to the token stream """ - + # HTML5 specific normalizations to the token stream if token["type"] == tokenTypes["StartTag"]: raw = token["data"] token["data"] = OrderedDict(raw) @@ -331,9 +394,7 @@ self.phase = new_phase def parseRCDataRawtext(self, token, contentType): - """Generic RCDATA/RAWTEXT Parsing algorithm - contentType - RCDATA or RAWTEXT - """ + # Generic RCDATA/RAWTEXT Parsing algorithm assert contentType in ("RAWTEXT", "RCDATA") self.tree.insertElement(token) @@ -2711,10 +2772,7 @@ def adjust_attributes(token, replacements): - if PY3 or _utils.PY27: - needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) - else: - needs_adjustment = frozenset(token['data']) & frozenset(replacements) + needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) if needs_adjustment: token['data'] = OrderedDict((replacements.get(k, k), v) for k, v in token['data'].items()) diff -Nru html5lib-0.999999999/html5lib/_ihatexml.py html5lib-1.0.1/html5lib/_ihatexml.py --- html5lib-0.999999999/html5lib/_ihatexml.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/_ihatexml.py 2017-12-07 12:25:26.000000000 +0000 @@ -180,7 +180,7 @@ nonXmlNameFirstBMPRegexp = re.compile('[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]') # noqa # Simpler things -nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\-\'()+,./:=?;!*#@$_%]") +nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]") class InfosetFilter(object): diff -Nru html5lib-0.999999999/html5lib/__init__.py html5lib-1.0.1/html5lib/__init__.py --- html5lib-0.999999999/html5lib/__init__.py 2016-07-15 01:37:45.000000000 +0000 +++ html5lib-1.0.1/html5lib/__init__.py 2017-12-07 14:07:38.000000000 +0000 @@ -1,14 +1,23 @@ """ -HTML parsing library based on the WHATWG "HTML5" -specification. The parser is designed to be compatible with existing -HTML found in the wild and implements well-defined error recovery that +HTML parsing library based on the `WHATWG HTML specification +`_. The parser is designed to be compatible with +existing HTML found in the wild and implements well-defined error recovery that is largely compatible with modern desktop web browsers. -Example usage: +Example usage:: -import html5lib -f = open("my_document.html") -tree = html5lib.parse(f) + import html5lib + with open("my_document.html", "rb") as f: + tree = html5lib.parse(f) + +For convenience, this module re-exports the following names: + +* :func:`~.html5parser.parse` +* :func:`~.html5parser.parseFragment` +* :class:`~.html5parser.HTMLParser` +* :func:`~.treebuilders.getTreeBuilder` +* :func:`~.treewalkers.getTreeWalker` +* :func:`~.serializer.serialize` """ from __future__ import absolute_import, division, unicode_literals @@ -22,4 +31,5 @@ "getTreeWalker", "serialize"] # this has to be at the top level, see how setup.py parses this -__version__ = "0.999999999" +#: Distribution version number. +__version__ = "1.0.1" diff -Nru html5lib-0.999999999/html5lib/_inputstream.py html5lib-1.0.1/html5lib/_inputstream.py --- html5lib-0.999999999/html5lib/_inputstream.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/_inputstream.py 2017-12-07 12:25:26.000000000 +0000 @@ -9,7 +9,7 @@ import webencodings from .constants import EOF, spaceCharacters, asciiLetters, asciiUppercase -from .constants import ReparseException +from .constants import _ReparseException from . import _utils from io import StringIO @@ -48,7 +48,7 @@ 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF]) -ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]") +ascii_punctuation_re = re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]") # Cache for charsUntil() charsUntilRegEx = {} @@ -530,7 +530,7 @@ self.rawStream.seek(0) self.charEncoding = (newEncoding, "certain") self.reset() - raise ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding)) + raise _ReparseException("Encoding changed from %s to %s" % (self.charEncoding[0], newEncoding)) def detectBOM(self): """Attempts to detect at BOM at the start of the stream. If diff -Nru html5lib-0.999999999/html5lib/serializer.py html5lib-1.0.1/html5lib/serializer.py --- html5lib-0.999999999/html5lib/serializer.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/serializer.py 2017-12-07 12:25:26.000000000 +0000 @@ -68,10 +68,33 @@ else: return xmlcharrefreplace_errors(exc) + register_error("htmlentityreplace", htmlentityreplace_errors) def serialize(input, tree="etree", encoding=None, **serializer_opts): + """Serializes the input token stream using the specified treewalker + + :arg input: the token stream to serialize + + :arg tree: the treewalker to use + + :arg encoding: the encoding to use + + :arg serializer_opts: any options to pass to the + :py:class:`html5lib.serializer.HTMLSerializer` that gets created + + :returns: the tree serialized as a string + + Example: + + >>> from html5lib.html5parser import parse + >>> from html5lib.serializer import serialize + >>> token_stream = parse('

Hi!

') + >>> serialize(token_stream, omit_optional_tags=False) + '

Hi!

' + + """ # XXX: Should we cache this? walker = treewalkers.getTreeWalker(tree) s = HTMLSerializer(**serializer_opts) @@ -110,50 +133,83 @@ "strip_whitespace", "sanitize") def __init__(self, **kwargs): - """Initialize HTMLSerializer. + """Initialize HTMLSerializer + + :arg inject_meta_charset: Whether or not to inject the meta charset. + + Defaults to ``True``. + + :arg quote_attr_values: Whether to quote attribute values that don't + require quoting per legacy browser behavior (``"legacy"``), when + required by the standard (``"spec"``), or always (``"always"``). + + Defaults to ``"legacy"``. + + :arg quote_char: Use given quote character for attribute quoting. + + Defaults to ``"`` which will use double quotes unless attribute + value contains a double quote, in which case single quotes are + used. + + :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute + values. + + Defaults to ``False``. + + :arg escape_rcdata: Whether to escape characters that need to be + escaped within normal elements within rcdata elements such as + style. + + Defaults to ``False``. + + :arg resolve_entities: Whether to resolve named character entities that + appear in the source tree. The XML predefined entities < > + & " ' are unaffected by this setting. + + Defaults to ``True``. + + :arg strip_whitespace: Whether to remove semantically meaningless + whitespace. (This compresses all whitespace to a single space + except within ``pre``.) - Keyword options (default given first unless specified) include: + Defaults to ``False``. - inject_meta_charset=True|False - Whether it insert a meta element to define the character set of the - document. - quote_attr_values="legacy"|"spec"|"always" - Whether to quote attribute values that don't require quoting - per legacy browser behaviour, when required by the standard, or always. - quote_char=u'"'|u"'" - Use given quote character for attribute quoting. Default is to - use double quote unless attribute value contains a double quote, - in which case single quotes are used instead. - escape_lt_in_attrs=False|True - Whether to escape < in attribute values. - escape_rcdata=False|True - Whether to escape characters that need to be escaped within normal - elements within rcdata elements such as style. - resolve_entities=True|False - Whether to resolve named character entities that appear in the - source tree. The XML predefined entities < > & " ' - are unaffected by this setting. - strip_whitespace=False|True - Whether to remove semantically meaningless whitespace. (This - compresses all whitespace to a single space except within pre.) - minimize_boolean_attributes=True|False - Shortens boolean attributes to give just the attribute value, - for example becomes . - use_trailing_solidus=False|True - Includes a close-tag slash at the end of the start tag of void - elements (empty elements whose end tag is forbidden). E.g.
. - space_before_trailing_solidus=True|False - Places a space immediately before the closing slash in a tag - using a trailing solidus. E.g.
. Requires use_trailing_solidus. - sanitize=False|True - Strip all unsafe or unknown constructs from output. - See `html5lib user documentation`_ - omit_optional_tags=True|False - Omit start/end tags that are optional. - alphabetical_attributes=False|True - Reorder attributes to be in alphabetical order. + :arg minimize_boolean_attributes: Shortens boolean attributes to give + just the attribute value, for example:: + + + + becomes:: + + + + Defaults to ``True``. + + :arg use_trailing_solidus: Includes a close-tag slash at the end of the + start tag of void elements (empty elements whose end tag is + forbidden). E.g. ``
``. + + Defaults to ``False``. + + :arg space_before_trailing_solidus: Places a space immediately before + the closing slash in a tag using a trailing solidus. E.g. + ``
``. Requires ``use_trailing_solidus=True``. + + Defaults to ``True``. + + :arg sanitize: Strip all unsafe or unknown constructs from output. + See :py:class:`html5lib.filters.sanitizer.Filter`. + + Defaults to ``False``. + + :arg omit_optional_tags: Omit start/end tags that are optional. + + Defaults to ``True``. + + :arg alphabetical_attributes: Reorder attributes to be in alphabetical order. + + Defaults to ``False``. - .. _html5lib user documentation: http://code.google.com/p/html5lib/wiki/UserDocumentation """ unexpected_args = frozenset(kwargs) - frozenset(self.options) if len(unexpected_args) > 0: @@ -317,6 +373,25 @@ self.serializeError(token["data"]) def render(self, treewalker, encoding=None): + """Serializes the stream from the treewalker into a string + + :arg treewalker: the treewalker to serialize + + :arg encoding: the string encoding to use + + :returns: the serialized tree + + Example: + + >>> from html5lib import parse, getTreeWalker + >>> from html5lib.serializer import HTMLSerializer + >>> token_stream = parse('Hi!') + >>> walker = getTreeWalker('etree') + >>> serializer = HTMLSerializer(omit_optional_tags=False) + >>> serializer.render(walker(token_stream)) + 'Hi!' + + """ if encoding: return b"".join(list(self.serialize(treewalker, encoding))) else: diff -Nru html5lib-0.999999999/html5lib/tests/conftest.py html5lib-1.0.1/html5lib/tests/conftest.py --- html5lib-0.999999999/html5lib/tests/conftest.py 2016-07-11 23:24:55.000000000 +0000 +++ html5lib-1.0.1/html5lib/tests/conftest.py 2017-12-07 12:25:26.000000000 +0000 @@ -1,4 +1,6 @@ +from __future__ import print_function import os.path +import sys import pkg_resources import pytest @@ -15,6 +17,26 @@ _sanitizer_testdata = os.path.join(_dir, "sanitizer-testdata") +def fail_if_missing_pytest_expect(): + """Throws an exception halting pytest if pytest-expect isn't working""" + try: + from pytest_expect import expect # noqa + except ImportError: + header = '*' * 78 + print( + '\n' + + header + '\n' + + 'ERROR: Either pytest-expect or its dependency u-msgpack-python is not\n' + + 'installed. Please install them both before running pytest.\n' + + header + '\n', + file=sys.stderr + ) + raise + + +fail_if_missing_pytest_expect() + + def pytest_configure(config): msgs = [] diff -Nru html5lib-0.999999999/html5lib/tests/test_alphabeticalattributes.py html5lib-1.0.1/html5lib/tests/test_alphabeticalattributes.py --- html5lib-0.999999999/html5lib/tests/test_alphabeticalattributes.py 1970-01-01 00:00:00.000000000 +0000 +++ html5lib-1.0.1/html5lib/tests/test_alphabeticalattributes.py 2017-12-07 12:25:26.000000000 +0000 @@ -0,0 +1,78 @@ +from __future__ import absolute_import, division, unicode_literals + +from collections import OrderedDict + +import pytest + +import html5lib +from html5lib.filters.alphabeticalattributes import Filter +from html5lib.serializer import HTMLSerializer + + +@pytest.mark.parametrize('msg, attrs, expected_attrs', [ + ( + 'no attrs', + {}, + {} + ), + ( + 'one attr', + {(None, 'alt'): 'image'}, + OrderedDict([((None, 'alt'), 'image')]) + ), + ( + 'multiple attrs', + { + (None, 'src'): 'foo', + (None, 'alt'): 'image', + (None, 'style'): 'border: 1px solid black;' + }, + OrderedDict([ + ((None, 'alt'), 'image'), + ((None, 'src'), 'foo'), + ((None, 'style'), 'border: 1px solid black;') + ]) + ), +]) +def test_alphabetizing(msg, attrs, expected_attrs): + tokens = [{'type': 'StartTag', 'name': 'img', 'data': attrs}] + output_tokens = list(Filter(tokens)) + + attrs = output_tokens[0]['data'] + assert attrs == expected_attrs + + +def test_with_different_namespaces(): + tokens = [{ + 'type': 'StartTag', + 'name': 'pattern', + 'data': { + (None, 'id'): 'patt1', + ('http://www.w3.org/1999/xlink', 'href'): '#patt2' + } + }] + output_tokens = list(Filter(tokens)) + + attrs = output_tokens[0]['data'] + assert attrs == OrderedDict([ + ((None, 'id'), 'patt1'), + (('http://www.w3.org/1999/xlink', 'href'), '#patt2') + ]) + + +def test_with_serializer(): + """Verify filter works in the context of everything else""" + parser = html5lib.HTMLParser() + dom = parser.parseFragment('') + walker = html5lib.getTreeWalker('etree') + ser = HTMLSerializer( + alphabetical_attributes=True, + quote_attr_values='always' + ) + + # FIXME(willkg): The "xlink" namespace gets dropped by the serializer. When + # that gets fixed, we can fix this expected result. + assert ( + ser.render(walker(dom)) == + '' + ) diff -Nru html5lib-0.999999999/html5lib/tests/testdata/.git html5lib-1.0.1/html5lib/tests/testdata/.git --- html5lib-0.999999999/html5lib/tests/testdata/.git 2013-04-09 23:43:27.000000000 +0000 +++ html5lib-1.0.1/html5lib/tests/testdata/.git 2017-12-07 13:52:47.000000000 +0000 @@ -1 +1 @@ -gitdir: ../../../.git/modules/testdata +gitdir: ../../../../html5lib-python/.git/worktrees/html5lib-python-b/modules/testdata diff -Nru html5lib-0.999999999/html5lib/tests/test_sanitizer.py html5lib-1.0.1/html5lib/tests/test_sanitizer.py --- html5lib-0.999999999/html5lib/tests/test_sanitizer.py 2016-07-12 01:44:10.000000000 +0000 +++ html5lib-1.0.1/html5lib/tests/test_sanitizer.py 2017-12-07 12:25:26.000000000 +0000 @@ -113,3 +113,15 @@ yield (runSanitizerTest, "test_should_allow_uppercase_%s_uris" % protocol, "foo" % (protocol, rest_of_uri), """foo""" % (protocol, rest_of_uri)) + + +def test_lowercase_color_codes_in_style(): + sanitized = sanitize_html("

") + expected = '

' + assert expected == sanitized + + +def test_uppercase_color_codes_in_style(): + sanitized = sanitize_html("

") + expected = '

' + assert expected == sanitized diff -Nru html5lib-0.999999999/html5lib/treeadapters/genshi.py html5lib-1.0.1/html5lib/treeadapters/genshi.py --- html5lib-0.999999999/html5lib/treeadapters/genshi.py 2016-05-20 14:50:15.000000000 +0000 +++ html5lib-1.0.1/html5lib/treeadapters/genshi.py 2017-12-07 12:25:26.000000000 +0000 @@ -5,6 +5,13 @@ def to_genshi(walker): + """Convert a tree to a genshi tree + + :arg walker: the treewalker to use to walk the tree to convert it + + :returns: generator of genshi nodes + + """ text = [] for token in walker: type = token["type"] diff -Nru html5lib-0.999999999/html5lib/treeadapters/__init__.py html5lib-1.0.1/html5lib/treeadapters/__init__.py --- html5lib-0.999999999/html5lib/treeadapters/__init__.py 2016-05-22 01:09:33.000000000 +0000 +++ html5lib-1.0.1/html5lib/treeadapters/__init__.py 2017-12-07 12:25:26.000000000 +0000 @@ -1,3 +1,21 @@ +"""Tree adapters let you convert from one tree structure to another + +Example: + +.. code-block:: python + + import html5lib + from html5lib.treeadapters import genshi + + doc = 'Hi!' + treebuilder = html5lib.getTreeBuilder('etree') + parser = html5lib.HTMLParser(tree=treebuilder) + tree = parser.parse(doc) + TreeWalker = html5lib.getTreeWalker('etree') + + genshi_tree = genshi.to_genshi(TreeWalker(tree)) + +""" from __future__ import absolute_import, division, unicode_literals from . import sax diff -Nru html5lib-0.999999999/html5lib/treeadapters/sax.py html5lib-1.0.1/html5lib/treeadapters/sax.py --- html5lib-0.999999999/html5lib/treeadapters/sax.py 2015-04-26 02:17:12.000000000 +0000 +++ html5lib-1.0.1/html5lib/treeadapters/sax.py 2017-12-07 12:25:26.000000000 +0000 @@ -11,7 +11,13 @@ def to_sax(walker, handler): - """Call SAX-like content handler based on treewalker walker""" + """Call SAX-like content handler based on treewalker walker + + :arg walker: the treewalker to use to walk the tree to convert it + + :arg handler: SAX handler to use + + """ handler.startDocument() for prefix, namespace in prefix_mapping.items(): handler.startPrefixMapping(prefix, namespace) diff -Nru html5lib-0.999999999/html5lib/treebuilders/base.py html5lib-1.0.1/html5lib/treebuilders/base.py --- html5lib-0.999999999/html5lib/treebuilders/base.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/treebuilders/base.py 2017-12-07 12:25:26.000000000 +0000 @@ -21,22 +21,25 @@ class Node(object): + """Represents an item in the tree""" def __init__(self, name): - """Node representing an item in the tree. - name - The tag name associated with the node - parent - The parent of the current node (or None for the document node) - value - The value of the current node (applies to text nodes and - comments - attributes - a dict holding name, value pairs for attributes of the node - childNodes - a list of child nodes of the current node. This must - include all elements but not necessarily other node types - _flags - A list of miscellaneous flags that can be set on the node + """Creates a Node + + :arg name: The tag name associated with the node + """ + # The tag name assocaited with the node self.name = name + # The parent of the current node (or None for the document node) self.parent = None + # The value of the current node (applies to text nodes and comments) self.value = None + # A dict holding name -> value pairs for attributes of the node self.attributes = {} + # A list of child nodes of the current node. This must include all + # elements but not necessarily other node types. self.childNodes = [] + # A list of miscellaneous flags that can be set on the node. self._flags = [] def __str__(self): @@ -53,23 +56,41 @@ def appendChild(self, node): """Insert node as a child of the current node + + :arg node: the node to insert + """ raise NotImplementedError def insertText(self, data, insertBefore=None): """Insert data as text in the current node, positioned before the start of node insertBefore or to the end of the node's text. + + :arg data: the data to insert + + :arg insertBefore: True if you want to insert the text before the node + and False if you want to insert it after the node + """ raise NotImplementedError def insertBefore(self, node, refNode): """Insert node as a child of the current node, before refNode in the list of child nodes. Raises ValueError if refNode is not a child of - the current node""" + the current node + + :arg node: the node to insert + + :arg refNode: the child node to insert the node before + + """ raise NotImplementedError def removeChild(self, node): """Remove node from the children of the current node + + :arg node: the child node to remove + """ raise NotImplementedError @@ -77,6 +98,9 @@ """Move all the children of the current node to newParent. This is needed so that trees that don't store text as nodes move the text in the correct way + + :arg newParent: the node to move all this node's children to + """ # XXX - should this method be made more general? for child in self.childNodes: @@ -121,10 +145,12 @@ class TreeBuilder(object): """Base treebuilder implementation - documentClass - the class to use for the bottommost node of a document - elementClass - the class to use for HTML Elements - commentClass - the class to use for comments - doctypeClass - the class to use for doctypes + + * documentClass - the class to use for the bottommost node of a document + * elementClass - the class to use for HTML Elements + * commentClass - the class to use for comments + * doctypeClass - the class to use for doctypes + """ # pylint:disable=not-callable @@ -144,6 +170,11 @@ fragmentClass = None def __init__(self, namespaceHTMLElements): + """Create a TreeBuilder + + :arg namespaceHTMLElements: whether or not to namespace HTML elements + + """ if namespaceHTMLElements: self.defaultNamespace = "http://www.w3.org/1999/xhtml" else: @@ -367,11 +398,11 @@ self.generateImpliedEndTags(exclude) def getDocument(self): - "Return the final tree" + """Return the final tree""" return self.document def getFragment(self): - "Return the final fragment" + """Return the final fragment""" # assert self.innerHTML fragment = self.fragmentClass() self.openElements[0].reparentChildren(fragment) @@ -379,5 +410,8 @@ def testSerializer(self, node): """Serialize the subtree of node in the format required by unit tests - node - the node from which to start serializing""" + + :arg node: the node from which to start serializing + + """ raise NotImplementedError diff -Nru html5lib-0.999999999/html5lib/treebuilders/etree_lxml.py html5lib-1.0.1/html5lib/treebuilders/etree_lxml.py --- html5lib-0.999999999/html5lib/treebuilders/etree_lxml.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/treebuilders/etree_lxml.py 2017-12-07 12:25:26.000000000 +0000 @@ -309,7 +309,6 @@ super(TreeBuilder, self).insertComment(data, parent) def insertRoot(self, token): - """Create the document root""" # Because of the way libxml2 works, it doesn't seem to be possible to # alter information like the doctype after the tree has been parsed. # Therefore we need to use the built-in parser to create our initial diff -Nru html5lib-0.999999999/html5lib/treebuilders/__init__.py html5lib-1.0.1/html5lib/treebuilders/__init__.py --- html5lib-0.999999999/html5lib/treebuilders/__init__.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/treebuilders/__init__.py 2017-12-07 12:25:26.000000000 +0000 @@ -1,29 +1,32 @@ -"""A collection of modules for building different kinds of tree from -HTML documents. +"""A collection of modules for building different kinds of trees from HTML +documents. To create a treebuilder for a new type of tree, you need to do implement several things: -1) A set of classes for various types of elements: Document, Doctype, -Comment, Element. These must implement the interface of -_base.treebuilders.Node (although comment nodes have a different -signature for their constructor, see treebuilders.etree.Comment) -Textual content may also be implemented as another node type, or not, as -your tree implementation requires. - -2) A treebuilder object (called TreeBuilder by convention) that -inherits from treebuilders._base.TreeBuilder. This has 4 required attributes: -documentClass - the class to use for the bottommost node of a document -elementClass - the class to use for HTML Elements -commentClass - the class to use for comments -doctypeClass - the class to use for doctypes -It also has one required method: -getDocument - Returns the root node of the complete document tree - -3) If you wish to run the unit tests, you must also create a -testSerializer method on your treebuilder which accepts a node and -returns a string containing Node and its children serialized according -to the format used in the unittests +1. A set of classes for various types of elements: Document, Doctype, Comment, + Element. These must implement the interface of ``base.treebuilders.Node`` + (although comment nodes have a different signature for their constructor, + see ``treebuilders.etree.Comment``) Textual content may also be implemented + as another node type, or not, as your tree implementation requires. + +2. A treebuilder object (called ``TreeBuilder`` by convention) that inherits + from ``treebuilders.base.TreeBuilder``. This has 4 required attributes: + + * ``documentClass`` - the class to use for the bottommost node of a document + * ``elementClass`` - the class to use for HTML Elements + * ``commentClass`` - the class to use for comments + * ``doctypeClass`` - the class to use for doctypes + + It also has one required method: + + * ``getDocument`` - Returns the root node of the complete document tree + +3. If you wish to run the unit tests, you must also create a ``testSerializer`` + method on your treebuilder which accepts a node and returns a string + containing Node and its children serialized according to the format used in + the unittests + """ from __future__ import absolute_import, division, unicode_literals @@ -34,23 +37,32 @@ def getTreeBuilder(treeType, implementation=None, **kwargs): - """Get a TreeBuilder class for various types of tree with built-in support + """Get a TreeBuilder class for various types of trees with built-in support + + :arg treeType: the name of the tree type required (case-insensitive). Supported + values are: + + * "dom" - A generic builder for DOM implementations, defaulting to a + xml.dom.minidom based implementation. + * "etree" - A generic builder for tree implementations exposing an + ElementTree-like interface, defaulting to xml.etree.cElementTree if + available and xml.etree.ElementTree if not. + * "lxml" - A etree-based builder for lxml.etree, handling limitations + of lxml's implementation. + + :arg implementation: (Currently applies to the "etree" and "dom" tree + types). A module implementing the tree type e.g. xml.etree.ElementTree + or xml.etree.cElementTree. + + :arg kwargs: Any additional options to pass to the TreeBuilder when + creating it. + + Example: - treeType - the name of the tree type required (case-insensitive). Supported - values are: + >>> from html5lib.treebuilders import getTreeBuilder + >>> builder = getTreeBuilder('etree') - "dom" - A generic builder for DOM implementations, defaulting to - a xml.dom.minidom based implementation. - "etree" - A generic builder for tree implementations exposing an - ElementTree-like interface, defaulting to - xml.etree.cElementTree if available and - xml.etree.ElementTree if not. - "lxml" - A etree-based builder for lxml.etree, handling - limitations of lxml's implementation. - - implementation - (Currently applies to the "etree" and "dom" tree types). A - module implementing the tree type e.g. - xml.etree.ElementTree or xml.etree.cElementTree.""" + """ treeType = treeType.lower() if treeType not in treeBuilderCache: diff -Nru html5lib-0.999999999/html5lib/treewalkers/base.py html5lib-1.0.1/html5lib/treewalkers/base.py --- html5lib-0.999999999/html5lib/treewalkers/base.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/treewalkers/base.py 2017-12-07 12:25:26.000000000 +0000 @@ -18,16 +18,48 @@ class TreeWalker(object): + """Walks a tree yielding tokens + + Tokens are dicts that all have a ``type`` field specifying the type of the + token. + + """ def __init__(self, tree): + """Creates a TreeWalker + + :arg tree: the tree to walk + + """ self.tree = tree def __iter__(self): raise NotImplementedError def error(self, msg): + """Generates an error token with the given message + + :arg msg: the error message + + :returns: SerializeError token + + """ return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): + """Generates an EmptyTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :arg attrs: the attributes of the element as a dict + + :arg hasChildren: whether or not to yield a SerializationError because + this tag shouldn't have children + + :returns: EmptyTag token + + """ yield {"type": "EmptyTag", "name": name, "namespace": namespace, "data": attrs} @@ -35,17 +67,61 @@ yield self.error("Void element has children") def startTag(self, namespace, name, attrs): + """Generates a StartTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :arg attrs: the attributes of the element as a dict + + :returns: StartTag token + + """ return {"type": "StartTag", "name": name, "namespace": namespace, "data": attrs} def endTag(self, namespace, name): + """Generates an EndTag token + + :arg namespace: the namespace of the token--can be ``None`` + + :arg name: the name of the element + + :returns: EndTag token + + """ return {"type": "EndTag", "name": name, "namespace": namespace} def text(self, data): + """Generates SpaceCharacters and Characters tokens + + Depending on what's in the data, this generates one or more + ``SpaceCharacters`` and ``Characters`` tokens. + + For example: + + >>> from html5lib.treewalkers.base import TreeWalker + >>> # Give it an empty tree just so it instantiates + >>> walker = TreeWalker([]) + >>> list(walker.text('')) + [] + >>> list(walker.text(' ')) + [{u'data': ' ', u'type': u'SpaceCharacters'}] + >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE + [{u'data': ' ', u'type': u'SpaceCharacters'}, + {u'data': u'abc', u'type': u'Characters'}, + {u'data': u' ', u'type': u'SpaceCharacters'}] + + :arg data: the text data + + :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens + + """ data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] @@ -60,18 +136,44 @@ yield {"type": "SpaceCharacters", "data": right} def comment(self, data): + """Generates a Comment token + + :arg data: the comment + + :returns: Comment token + + """ return {"type": "Comment", "data": data} def doctype(self, name, publicId=None, systemId=None): + """Generates a Doctype token + + :arg name: + + :arg publicId: + + :arg systemId: + + :returns: the Doctype token + + """ return {"type": "Doctype", "name": name, "publicId": publicId, "systemId": systemId} def entity(self, name): + """Generates an Entity token + + :arg name: the entity name + + :returns: an Entity token + + """ return {"type": "Entity", "name": name} def unknown(self, nodeType): + """Handles unknown node types""" return self.error("Unknown node type: " + nodeType) diff -Nru html5lib-0.999999999/html5lib/treewalkers/etree.py html5lib-1.0.1/html5lib/treewalkers/etree.py --- html5lib-0.999999999/html5lib/treewalkers/etree.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/treewalkers/etree.py 2017-12-07 12:25:26.000000000 +0000 @@ -1,13 +1,6 @@ from __future__ import absolute_import, division, unicode_literals -try: - from collections import OrderedDict -except ImportError: - try: - from ordereddict import OrderedDict - except ImportError: - OrderedDict = dict - +from collections import OrderedDict import re from six import string_types diff -Nru html5lib-0.999999999/html5lib/treewalkers/__init__.py html5lib-1.0.1/html5lib/treewalkers/__init__.py --- html5lib-0.999999999/html5lib/treewalkers/__init__.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/treewalkers/__init__.py 2017-12-07 12:25:26.000000000 +0000 @@ -13,7 +13,7 @@ from .. import constants from .._utils import default_etree -__all__ = ["getTreeWalker", "pprint", "dom", "etree", "genshi", "etree_lxml"] +__all__ = ["getTreeWalker", "pprint"] treeWalkerCache = {} @@ -21,20 +21,25 @@ def getTreeWalker(treeType, implementation=None, **kwargs): """Get a TreeWalker class for various types of tree with built-in support - Args: - treeType (str): the name of the tree type required (case-insensitive). - Supported values are: - - - "dom": The xml.dom.minidom DOM implementation - - "etree": A generic walker for tree implementations exposing an - elementtree-like interface (known to work with - ElementTree, cElementTree and lxml.etree). - - "lxml": Optimized walker for lxml.etree - - "genshi": a Genshi stream - - Implementation: A module implementing the tree type e.g. - xml.etree.ElementTree or cElementTree (Currently applies to the - "etree" tree type only). + :arg str treeType: the name of the tree type required (case-insensitive). + Supported values are: + + * "dom": The xml.dom.minidom DOM implementation + * "etree": A generic walker for tree implementations exposing an + elementtree-like interface (known to work with ElementTree, + cElementTree and lxml.etree). + * "lxml": Optimized walker for lxml.etree + * "genshi": a Genshi stream + + :arg implementation: A module implementing the tree type e.g. + xml.etree.ElementTree or cElementTree (Currently applies to the "etree" + tree type only). + + :arg kwargs: keyword arguments passed to the etree walker--for other + walkers, this has no effect + + :returns: a TreeWalker class + """ treeType = treeType.lower() @@ -73,7 +78,13 @@ def pprint(walker): - """Pretty printer for tree walkers""" + """Pretty printer for tree walkers + + Takes a TreeWalker instance and pretty prints the output of walking the tree. + + :arg walker: a TreeWalker instance + + """ output = [] indent = 0 for token in concatenateCharacterTokens(walker): diff -Nru html5lib-0.999999999/html5lib/_trie/_base.py html5lib-1.0.1/html5lib/_trie/_base.py --- html5lib-0.999999999/html5lib/_trie/_base.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/_trie/_base.py 2017-12-07 12:25:26.000000000 +0000 @@ -13,8 +13,7 @@ if prefix is None: return set(keys) - # Python 2.6: no set comprehensions - return set([x for x in keys if x.startswith(prefix)]) + return {x for x in keys if x.startswith(prefix)} def has_keys_with_prefix(self, prefix): for key in self.keys(): diff -Nru html5lib-0.999999999/html5lib/_utils.py html5lib-1.0.1/html5lib/_utils.py --- html5lib-0.999999999/html5lib/_utils.py 2016-07-14 19:07:32.000000000 +0000 +++ html5lib-1.0.1/html5lib/_utils.py 2017-12-07 12:25:26.000000000 +0000 @@ -1,6 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -import sys from types import ModuleType from six import text_type @@ -13,11 +12,9 @@ __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair", "surrogatePairToCodepoint", "moduleFactoryFactory", - "supports_lone_surrogates", "PY27"] + "supports_lone_surrogates"] -PY27 = sys.version_info[0] == 2 and sys.version_info[1] >= 7 - # Platforms not supporting lone surrogates (\uD800-\uDFFF) should be # caught by the below test. In general this would be any platform # using UTF-16 as its encoding of unicode strings, such as diff -Nru html5lib-0.999999999/html5lib.egg-info/PKG-INFO html5lib-1.0.1/html5lib.egg-info/PKG-INFO --- html5lib-0.999999999/html5lib.egg-info/PKG-INFO 2016-07-15 01:37:50.000000000 +0000 +++ html5lib-1.0.1/html5lib.egg-info/PKG-INFO 2017-12-07 14:09:53.000000000 +0000 @@ -1,11 +1,12 @@ Metadata-Version: 1.1 Name: html5lib -Version: 0.999999999 +Version: 1.0.1 Summary: HTML parser based on the WHATWG HTML specification Home-page: https://github.com/html5lib/html5lib-python Author: James Graham Author-email: james@hoppipolla.co.uk License: MIT License +Description-Content-Type: UNKNOWN Description: html5lib ======== @@ -98,7 +99,7 @@ Installation ------------ - html5lib works on CPython 2.6+, CPython 3.3+ and PyPy. To install it, + html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it, use: .. code-block:: bash @@ -136,8 +137,7 @@ ----- Unit tests require the ``pytest`` and ``mock`` libraries and can be - run using the ``py.test`` command in the root directory; - ``ordereddict`` is required under Python 2.6. All should pass. + run using the ``py.test`` command in the root directory. Test data are contained in a separate `html5lib-tests `_ repository and included @@ -162,6 +162,50 @@ Change Log ---------- + 1.0.1 + ~~~~~ + + Released on December 7, 2017 + + Breaking changes: + + * Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) + * Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + + Features: + + * Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) + * Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!) + * Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) + * Support Python 3.6. (#333) (Thank you, Jon Dufresne!) + * Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) + * Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!) + * Semver-compliant version number. + + Bug fixes: + + * Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) + * Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) + * Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) + * Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) + * Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) + * Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) + * Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + + 1.0 + ~~~ + + Released and unreleased on December 7, 2017. Badly packaged release. + + 0.999999999/1.0b10 ~~~~~~~~~~~~~~~~~~ @@ -186,7 +230,7 @@ * Cease supporting DATrie under PyPy. - * **Remove ``PullDOM`` support, as this hasn't ever been properly + * **Remove PullDOM support, as this hasn't ever been properly tested, doesn't entirely work, and as far as I can tell is completely unused by anyone.** @@ -224,7 +268,7 @@ to clarify their status as public.** * **Get rid of the sanitizer package. Merge sanitizer.sanitize into the - sanitizer.htmlsanitizer module and move that to saniziter. This means + sanitizer.htmlsanitizer module and move that to sanitizer. This means anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no code changes.** @@ -458,11 +502,11 @@ Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 -Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing :: Markup :: HTML diff -Nru html5lib-0.999999999/html5lib.egg-info/requires.txt html5lib-1.0.1/html5lib.egg-info/requires.txt --- html5lib-0.999999999/html5lib.egg-info/requires.txt 2016-07-15 01:37:50.000000000 +0000 +++ html5lib-1.0.1/html5lib.egg-info/requires.txt 2017-12-07 14:09:53.000000000 +0000 @@ -1,9 +1,5 @@ -six +six>=1.9 webencodings -setuptools>=18.5 - -[:python_version == '2.6'] -ordereddict [all] genshi diff -Nru html5lib-0.999999999/html5lib.egg-info/SOURCES.txt html5lib-1.0.1/html5lib.egg-info/SOURCES.txt --- html5lib-0.999999999/html5lib.egg-info/SOURCES.txt 2016-07-15 01:37:52.000000000 +0000 +++ html5lib-1.0.1/html5lib.egg-info/SOURCES.txt 2017-12-07 14:09:53.000000000 +0000 @@ -40,6 +40,7 @@ html5lib/tests/conftest.py html5lib/tests/sanitizer.py html5lib/tests/support.py +html5lib/tests/test_alphabeticalattributes.py html5lib/tests/test_encoding.py html5lib/tests/test_meta.py html5lib/tests/test_optionaltags_filter.py diff -Nru html5lib-0.999999999/PKG-INFO html5lib-1.0.1/PKG-INFO --- html5lib-0.999999999/PKG-INFO 2016-07-15 01:37:52.000000000 +0000 +++ html5lib-1.0.1/PKG-INFO 2017-12-07 14:09:53.000000000 +0000 @@ -1,11 +1,12 @@ Metadata-Version: 1.1 Name: html5lib -Version: 0.999999999 +Version: 1.0.1 Summary: HTML parser based on the WHATWG HTML specification Home-page: https://github.com/html5lib/html5lib-python Author: James Graham Author-email: james@hoppipolla.co.uk License: MIT License +Description-Content-Type: UNKNOWN Description: html5lib ======== @@ -98,7 +99,7 @@ Installation ------------ - html5lib works on CPython 2.6+, CPython 3.3+ and PyPy. To install it, + html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it, use: .. code-block:: bash @@ -136,8 +137,7 @@ ----- Unit tests require the ``pytest`` and ``mock`` libraries and can be - run using the ``py.test`` command in the root directory; - ``ordereddict`` is required under Python 2.6. All should pass. + run using the ``py.test`` command in the root directory. Test data are contained in a separate `html5lib-tests `_ repository and included @@ -162,6 +162,50 @@ Change Log ---------- + 1.0.1 + ~~~~~ + + Released on December 7, 2017 + + Breaking changes: + + * Drop support for Python 2.6. (#330) (Thank you, Hugo, Will Kahn-Greene!) + * Remove ``utils/spider.py`` (#353) (Thank you, Jon Dufresne!) + + Features: + + * Improve documentation. (#300, #307) (Thank you, Jon Dufresne, Tom Most, + Will Kahn-Greene!) + * Add iframe seamless boolean attribute. (Thank you, Ritwik Gupta!) + * Add itemscope as a boolean attribute. (#194) (Thank you, Jonathan Vanasco!) + * Support Python 3.6. (#333) (Thank you, Jon Dufresne!) + * Add CI support for Windows using AppVeyor. (Thank you, John Vandenberg!) + * Improve testing and CI and add code coverage (#323, #334), (Thank you, Jon + Dufresne, John Vandenberg, Geoffrey Sneddon, Will Kahn-Greene!) + * Semver-compliant version number. + + Bug fixes: + + * Add support for setuptools < 18.5 to support environment markers. (Thank you, + John Vandenberg!) + * Add explicit dependency for six >= 1.9. (Thank you, Eric Amorde!) + * Fix regexes to work with Python 3.7 regex adjustments. (#318, #379) (Thank + you, Benedikt Morbach, Ville Skyttä, Mark Vasilkov!) + * Fix alphabeticalattributes filter namespace bug. (#324) (Thank you, Will + Kahn-Greene!) + * Include license file in generated wheel package. (#350) (Thank you, Jon + Dufresne!) + * Fix annotation-xml typo. (#339) (Thank you, Will Kahn-Greene!) + * Allow uppercase hex chararcters in CSS colour check. (#377) (Thank you, + Komal Dembla, Hugo!) + + + 1.0 + ~~~ + + Released and unreleased on December 7, 2017. Badly packaged release. + + 0.999999999/1.0b10 ~~~~~~~~~~~~~~~~~~ @@ -186,7 +230,7 @@ * Cease supporting DATrie under PyPy. - * **Remove ``PullDOM`` support, as this hasn't ever been properly + * **Remove PullDOM support, as this hasn't ever been properly tested, doesn't entirely work, and as far as I can tell is completely unused by anyone.** @@ -224,7 +268,7 @@ to clarify their status as public.** * **Get rid of the sanitizer package. Merge sanitizer.sanitize into the - sanitizer.htmlsanitizer module and move that to saniziter. This means + sanitizer.htmlsanitizer module and move that to sanitizer. This means anyone who used sanitizer.sanitize or sanitizer.HTMLSanitizer needs no code changes.** @@ -458,11 +502,11 @@ Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python Classifier: Programming Language :: Python :: 2 -Classifier: Programming Language :: Python :: 2.6 Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Programming Language :: Python :: 3.5 +Classifier: Programming Language :: Python :: 3.6 Classifier: Topic :: Software Development :: Libraries :: Python Modules Classifier: Topic :: Text Processing :: Markup :: HTML diff -Nru html5lib-0.999999999/README.rst html5lib-1.0.1/README.rst --- html5lib-0.999999999/README.rst 2016-07-12 15:04:04.000000000 +0000 +++ html5lib-1.0.1/README.rst 2017-12-07 12:25:26.000000000 +0000 @@ -90,7 +90,7 @@ Installation ------------ -html5lib works on CPython 2.6+, CPython 3.3+ and PyPy. To install it, +html5lib works on CPython 2.7+, CPython 3.3+ and PyPy. To install it, use: .. code-block:: bash @@ -128,8 +128,7 @@ ----- Unit tests require the ``pytest`` and ``mock`` libraries and can be -run using the ``py.test`` command in the root directory; -``ordereddict`` is required under Python 2.6. All should pass. +run using the ``py.test`` command in the root directory. Test data are contained in a separate `html5lib-tests `_ repository and included diff -Nru html5lib-0.999999999/requirements-test.txt html5lib-1.0.1/requirements-test.txt --- html5lib-0.999999999/requirements-test.txt 2016-05-20 15:46:08.000000000 +0000 +++ html5lib-1.0.1/requirements-test.txt 2017-12-07 12:25:26.000000000 +0000 @@ -1,7 +1,10 @@ -r requirements.txt -flake8 -pytest +tox + +flake8<3.0 + +pytest==3.2.5 +coverage pytest-expect>=1.1,<2.0 mock -ordereddict ; python_version < '2.7' diff -Nru html5lib-0.999999999/requirements.txt html5lib-1.0.1/requirements.txt --- html5lib-0.999999999/requirements.txt 2016-07-10 23:36:48.000000000 +0000 +++ html5lib-1.0.1/requirements.txt 2017-12-07 12:25:26.000000000 +0000 @@ -1,4 +1,2 @@ -six +six>=1.9 webencodings -ordereddict ; python_version < '2.7' -setuptools>=18.5 diff -Nru html5lib-0.999999999/setup.cfg html5lib-1.0.1/setup.cfg --- html5lib-0.999999999/setup.cfg 2016-07-15 01:37:52.000000000 +0000 +++ html5lib-1.0.1/setup.cfg 2017-12-07 14:09:53.000000000 +0000 @@ -10,8 +10,10 @@ ignore = N max-line-length = 139 +[metadata] +license_file = LICENSE + [egg_info] tag_build = tag_date = 0 -tag_svn_revision = 0 diff -Nru html5lib-0.999999999/setup.py html5lib-1.0.1/setup.py --- html5lib-0.999999999/setup.py 2016-07-10 23:36:48.000000000 +0000 +++ html5lib-1.0.1/setup.py 2017-12-07 12:25:26.000000000 +0000 @@ -8,10 +8,54 @@ from setuptools import setup, find_packages, __version__ as setuptools_version from pkg_resources import parse_version -if parse_version(setuptools_version) < parse_version("18.5"): - print("html5lib requires setuptools version 18.5 or above; " - "please upgrade before installing (you have %s)" % setuptools_version) - sys.exit(1) +import pkg_resources + +try: + import _markerlib.markers +except ImportError: + _markerlib = None + + +# _markerlib.default_environment() obtains its data from _VARS +# and wraps it in another dict, but _markerlib_evaluate writes +# to the dict while it is iterating the keys, causing an error +# on Python 3 only. +# Replace _markerlib.default_environment to return a custom dict +# that has all the necessary markers, and ignores any writes. + +class Python3MarkerDict(dict): + + def __setitem__(self, key, value): + pass + + def pop(self, i=-1): + return self[i] + + +if _markerlib and sys.version_info[0] == 3: + env = _markerlib.markers._VARS + for key in list(env.keys()): + new_key = key.replace('.', '_') + if new_key != key: + env[new_key] = env[key] + + _markerlib.markers._VARS = Python3MarkerDict(env) + + def default_environment(): + return _markerlib.markers._VARS + + _markerlib.default_environment = default_environment + +# Avoid the very buggy pkg_resources.parser, which doesnt consistently +# recognise the markers needed by this setup.py +# Change this to setuptools 20.10.0 to support all markers. +if pkg_resources: + if parse_version(setuptools_version) < parse_version('18.5'): + MarkerEvaluation = pkg_resources.MarkerEvaluation + + del pkg_resources.parser + pkg_resources.evaluate_marker = MarkerEvaluation._markerlib_evaluate + MarkerEvaluation.evaluate_marker = MarkerEvaluation._markerlib_evaluate classifiers = [ 'Development Status :: 5 - Production/Stable', @@ -20,12 +64,12 @@ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Software Development :: Libraries :: Python Modules', 'Topic :: Text Processing :: Markup :: HTML' ] @@ -58,15 +102,10 @@ maintainer_email='james@hoppipolla.co.uk', packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), install_requires=[ - 'six', + 'six>=1.9', 'webencodings', - 'setuptools>=18.5' ], extras_require={ - # A empty extra that only has a conditional marker will be - # unconditonally installed when the condition matches. - ":python_version == '2.6'": ["ordereddict"], - # A conditional extra will only install these items when the extra is # requested and the condition matches. "datrie:platform_python_implementation == 'CPython'": ["datrie"], diff -Nru html5lib-0.999999999/tox.ini html5lib-1.0.1/tox.ini --- html5lib-0.999999999/tox.ini 2016-05-20 20:08:10.000000000 +0000 +++ html5lib-1.0.1/tox.ini 2017-12-07 12:25:26.000000000 +0000 @@ -1,17 +1,23 @@ [tox] -envlist = {py26,py27,py33,py34,py35,pypy}-{base,optional} +envlist = {py27,py33,py34,py35,py36,pypy}-{base,six19,optional} [testenv] deps = - flake8 - pytest - pytest-expect>=1.1,<2.0 - mock - base: six - base: webencodings - py26-base: ordereddict optional: -r{toxinidir}/requirements-optional.txt + -r{toxinidir}/requirements-test.txt + doc: Sphinx +passenv = + PYTEST_COMMAND + COVERAGE_RUN_OPTIONS commands = - {envbindir}/py.test - {toxinidir}/flake8-run.sh + six19: pip install six==1.9 + {env:PYTEST_COMMAND:{envbindir}/py.test} {posargs} + flake8 {toxinidir} + +[testenv:doc] +changedir = doc +commands = sphinx-build -b html . _build + +[flake8] +exclude = ./.tox