diff -Nru gumbo-parser-0.10.1+dfsg/debian/changelog gumbo-parser-0.10.1+dfsg/debian/changelog --- gumbo-parser-0.10.1+dfsg/debian/changelog 2018-04-14 23:20:28.000000000 +0000 +++ gumbo-parser-0.10.1+dfsg/debian/changelog 2018-10-02 04:50:58.000000000 +0000 @@ -1,3 +1,11 @@ +gumbo-parser (0.10.1+dfsg-2.3) unstable; urgency=medium + + * Non-maintainer upload. + * Apply a patch by Stefano Rivera to port gumbo-parser to + BeautifulSoup 4 (Closes: #891098) + + -- Sergei Golovan Tue, 02 Oct 2018 07:50:58 +0300 + gumbo-parser (0.10.1+dfsg-2.2) unstable; urgency=medium * Non-maintainer upload. diff -Nru gumbo-parser-0.10.1+dfsg/debian/control gumbo-parser-0.10.1+dfsg/debian/control --- gumbo-parser-0.10.1+dfsg/debian/control 2015-12-30 18:44:19.000000000 +0000 +++ gumbo-parser-0.10.1+dfsg/debian/control 2018-10-02 04:49:16.000000000 +0000 @@ -45,7 +45,7 @@ Architecture: all Depends: ${misc:Depends}, ${shlibs:Depends}, ${python:Depends}, libgumbo1 (>= ${source:Version}) -Recommends: python-beautifulsoup, python-html5lib +Recommends: python-bs4, python-html5lib Description: pure-C HTML5 parser Python bindings Gumbo is an implementation of the [HTML5 parsing algorithm implemented as a pure C99 library with no outside dependencies. It's designed to serve @@ -59,7 +59,7 @@ Architecture: all Depends: ${misc:Depends}, ${shlibs:Depends}, ${python3:Depends}, libgumbo1 (>= ${source:Version}) -Recommends: python3-html5lib +Recommends: python3-bs4, python3-html5lib Description: pure-C HTML5 parser Python 3 bindings Gumbo is an implementation of the [HTML5 parsing algorithm implemented as a pure C99 library with no outside dependencies. It's designed to serve diff -Nru gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch --- gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch 1970-01-01 00:00:00.000000000 +0000 +++ gumbo-parser-0.10.1+dfsg/debian/patches/03-bs4.patch 2018-10-02 04:49:16.000000000 +0000 @@ -0,0 +1,178 @@ +From 29e1abb337af2a15ac4b38fb1c28d1b55ed08d54 Mon Sep 17 00:00:00 2001 +From: Roman Miroshnychenko +Date: Tue, 19 Jul 2016 18:25:52 +0300 +Subject: [PATCH] Updates soup_adapter to use BeautifulSoup 4 + +Also fixes the indentation according to PEP-8 +--- + python/gumbo/soup_adapter.py | 123 +++++++++++++++++------------------ + 1 file changed, 61 insertions(+), 62 deletions(-) + +diff --git a/python/gumbo/soup_adapter.py b/python/gumbo/soup_adapter.py +index b18748f..6a247dd 100644 +--- a/python/gumbo/soup_adapter.py ++++ b/python/gumbo/soup_adapter.py +@@ -13,66 +13,65 @@ + # limitations under the License. + # + +-"""Adapter between Gumbo and BeautifulSoup. ++"""Adapter between Gumbo and BeautifulSoup 4. + +-This parses an HTML document and gives back a BeautifulSoup object, which you +-can then manipulate like a normal BeautifulSoup parse tree. ++This parses an HTML document and gives back a BeautifulSoup 4 object, which you ++can then manipulate like a normal BeautifulSoup 4 parse tree. + """ + + __author__ = 'jdtang@google.com (Jonathan Tang)' + +-import BeautifulSoup +- ++import bs4 as BeautifulSoup + import gumboc + + + def _utf8(text): +- return text.decode('utf-8', 'replace') ++ return text.decode('utf-8', 'replace') + + + def _add_source_info(obj, original_text, start_pos, end_pos): +- obj.original = str(original_text) +- obj.line = start_pos.line +- obj.col = start_pos.column +- obj.offset = start_pos.offset +- if end_pos: +- obj.end_line = end_pos.line +- obj.end_col = end_pos.column +- obj.end_offset = end_pos.offset ++ obj.original = str(original_text) ++ obj.line = start_pos.line ++ obj.col = start_pos.column ++ obj.offset = start_pos.offset ++ if end_pos: ++ obj.end_line = end_pos.line ++ obj.end_col = end_pos.column ++ obj.end_offset = end_pos.offset + + + def _convert_attrs(attrs): +- # TODO(jdtang): Ideally attributes would pass along their positions as well, +- # but I can't extend the built in str objects with new attributes. Maybe work +- # around this with a subclass in some way... +- return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs] ++ # TODO(jdtang): Ideally attributes would pass along their positions as well, ++ # but I can't extend the built in str objects with new attributes. Maybe work ++ # around this with a subclass in some way... ++ return [(_utf8(attr.name), _utf8(attr.value)) for attr in attrs] + + + def _add_document(soup, element): +- # Currently ignored, since there's no real place for this in the BeautifulSoup +- # API. +- pass ++ # Currently ignored, since there's no real place for this in the BeautifulSoup ++ # API. ++ pass + + + def _add_element(soup, element): +- # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to +- # BeautifulSoup. +- tag = BeautifulSoup.Tag( +- soup, _utf8(element.tag_name), _convert_attrs(element.attributes)) +- for child in element.children: +- tag.append(_add_node(soup, child)) +- _add_source_info( +- tag, element.original_tag, element.start_pos, element.end_pos) +- tag.original_end_tag = str(element.original_end_tag) +- return tag ++ # TODO(jdtang): Expose next/previous in gumbo so they can be passed along to ++ # BeautifulSoup. ++ tag = BeautifulSoup.Tag( ++ name=_utf8(element.tag_name), attrs=_convert_attrs(element.attributes)) ++ for child in element.children: ++ tag.append(_add_node(soup, child)) ++ _add_source_info( ++ tag, element.original_tag, element.start_pos, element.end_pos) ++ tag.original_end_tag = str(element.original_end_tag) ++ return tag + + + def _add_text(cls): +- def add_text_internal(soup, element): +- text = cls(_utf8(element.text)) +- _add_source_info(text, element.original_text, element.start_pos, None) +- return text +- return add_text_internal ++ def add_text_internal(soup, element): ++ text = cls(_utf8(element.text)) ++ _add_source_info(text, element.original_text, element.start_pos, None) ++ return text ++ return add_text_internal + + + _HANDLERS = [ +@@ -87,35 +86,35 @@ def add_text_internal(soup, element): + + + def _add_node(soup, node): +- return _HANDLERS[node.type.value](soup, node.contents) ++ return _HANDLERS[node.type.value](soup, node.contents) + + + def _add_next_prev_pointers(soup): +- def _traverse(node): +- # .findAll requires the .next pointer, which is what we're trying to add +- # when we call this, and so we manually supply a generator to yield the +- # nodes in DOM order. +- yield node +- try: +- for child in node.contents: +- for descendant in _traverse(child): +- yield descendant +- except AttributeError: +- # Not an element. +- return +- +- nodes = sorted(_traverse(soup), key=lambda node: node.offset) +- if nodes: +- nodes[0].previous = None +- nodes[-1].next = None +- for i, node in enumerate(nodes[1:-1], 1): +- nodes[i-1].next = node +- node.previous = nodes[i-1] ++ def _traverse(node): ++ # .findAll requires the .next pointer, which is what we're trying to add ++ # when we call this, and so we manually supply a generator to yield the ++ # nodes in DOM order. ++ yield node ++ try: ++ for child in node.contents: ++ for descendant in _traverse(child): ++ yield descendant ++ except AttributeError: ++ # Not an element. ++ return ++ ++ nodes = sorted(_traverse(soup), key=lambda node: node.offset) ++ if nodes: ++ nodes[0].previous_element = None ++ nodes[-1].next_element = None ++ for i, node in enumerate(nodes[1:-1], 1): ++ nodes[i-1].next_element = node ++ node.previous_element = nodes[i-1] + + + def parse(text, **kwargs): +- with gumboc.parse(text, **kwargs) as output: +- soup = BeautifulSoup.BeautifulSoup() +- soup.append(_add_node(soup, output.contents.root.contents)) +- _add_next_prev_pointers(soup) +- return soup ++ with gumboc.parse(text, **kwargs) as output: ++ soup = BeautifulSoup.BeautifulSoup(features='html.parser') ++ soup.append(_add_node(soup, output.contents.root.contents)) ++ _add_next_prev_pointers(soup) ++ return soup diff -Nru gumbo-parser-0.10.1+dfsg/debian/patches/series gumbo-parser-0.10.1+dfsg/debian/patches/series --- gumbo-parser-0.10.1+dfsg/debian/patches/series 2015-12-30 18:22:04.000000000 +0000 +++ gumbo-parser-0.10.1+dfsg/debian/patches/series 2018-10-02 04:49:16.000000000 +0000 @@ -1,3 +1,4 @@ 00-tests.patch 01-name-of-lib.patch 02-circular-dependency.patch +03-bs4.patch