diff -Nru lxml-html-clean-0.1.0/CHANGES.rst lxml-html-clean-0.1.1/CHANGES.rst --- lxml-html-clean-0.1.0/CHANGES.rst 2024-02-26 20:06:29.000000000 +0000 +++ lxml-html-clean-0.1.1/CHANGES.rst 2024-04-05 06:48:39.000000000 +0000 @@ -7,6 +7,16 @@ ========== +0.1.1 (2024-04-05) +================== + +Bugs fixed +---------- + +* Regular expression for image data URLs now supports multiple data + URLs on a single line. + + 0.1.0 (2024-02-26) ================== diff -Nru lxml-html-clean-0.1.0/debian/changelog lxml-html-clean-0.1.1/debian/changelog --- lxml-html-clean-0.1.0/debian/changelog 2024-04-03 14:00:23.000000000 +0000 +++ lxml-html-clean-0.1.1/debian/changelog 2024-04-15 13:31:53.000000000 +0000 @@ -1,11 +1,11 @@ -lxml-html-clean (0.1.0-0ubuntu1) noble; urgency=medium +lxml-html-clean (0.1.1-1) unstable; urgency=medium - * Initial release, split out from lxml. + * New upstream version. - -- Matthias Klose Wed, 03 Apr 2024 16:00:23 +0200 + -- Matthias Klose Mon, 15 Apr 2024 15:31:53 +0200 lxml-html-clean (0.1.0-1) unstable; urgency=medium * Initial release, split out from lxml. - -- Matthias Klose Wed, 03 Apr 2024 15:42:50 +0200 + -- Matthias Klose Wed, 03 Apr 2024 22:49:28 +0200 diff -Nru lxml-html-clean-0.1.0/debian/copyright lxml-html-clean-0.1.1/debian/copyright --- lxml-html-clean-0.1.0/debian/copyright 2024-04-03 13:55:29.000000000 +0000 +++ lxml-html-clean-0.1.1/debian/copyright 2024-04-08 14:01:31.000000000 +0000 @@ -38,6 +38,7 @@ Files: debian/* Copyright: + 2024 Canonical Ltd. 
2024 Matthias Klose License: GPL-2+ This package is free software; you can redistribute it and/or modify diff -Nru lxml-html-clean-0.1.0/lxml_html_clean/clean.py lxml-html-clean-0.1.1/lxml_html_clean/clean.py --- lxml-html-clean-0.1.0/lxml_html_clean/clean.py 2024-02-26 20:06:29.000000000 +0000 +++ lxml-html-clean-0.1.1/lxml_html_clean/clean.py 2024-04-05 06:48:39.000000000 +0000 @@ -54,7 +54,7 @@ # All kinds of schemes besides just javascript: that can cause # execution: _find_image_dataurls = re.compile( - r'data:image/(.+);base64,', re.I).findall + r'data:image/(.+?);base64,', re.I).findall _possibly_malicious_schemes = re.compile( r'(javascript|jscript|livescript|vbscript|data|about|mocha):', re.I).findall diff -Nru lxml-html-clean-0.1.0/setup.cfg lxml-html-clean-0.1.1/setup.cfg --- lxml-html-clean-0.1.0/setup.cfg 2024-02-26 20:06:29.000000000 +0000 +++ lxml-html-clean-0.1.1/setup.cfg 2024-04-05 06:48:39.000000000 +0000 @@ -1,6 +1,6 @@ [metadata] name = lxml_html_clean -version = 0.1.0 +version = 0.1.1 description = HTML cleaner from lxml project long_description = file:README.md long_description_content_type = text/markdown diff -Nru lxml-html-clean-0.1.0/tests/test_autolink.txt lxml-html-clean-0.1.1/tests/test_autolink.txt --- lxml-html-clean-0.1.0/tests/test_autolink.txt 1970-01-01 00:00:00.000000000 +0000 +++ lxml-html-clean-0.1.1/tests/test_autolink.txt 2024-04-05 06:48:39.000000000 +0000 @@ -0,0 +1,79 @@ +This tests autolink:: + + >>> from lxml.html import usedoctest + >>> from lxml_html_clean import autolink_html + >>> print(autolink_html(''' + ...
Link here: http://test.com/foo.html.
+ ... ''')) + + >>> print(autolink_html(''' + ...
Mail me at mailto:ianb@test.com or http://myhome.com
+ ... ''')) + + >>> print(autolink_html(''' + ...
The great thing is the http://link.com links and + ... the http://foobar.com links.
''')) +
The great thing is the http://link.com links and + the http://foobar.com links.
+ >>> print(autolink_html(''' + ...
Link: <http://foobar.com>
''')) + + >>> print(autolink_html(''' + ...
Link: (http://foobar.com)
''')) + + +Parentheses are tricky, we'll do our best:: + + >>> print(autolink_html(''' + ...
(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))
+ ... ''')) + + >>> print(autolink_html(''' + ...
... a link: http://foo.com)
+ ... ''')) +
... a link: http://foo.com)
+ +Some cases that won't be caught (on purpose):: + + >>> print(autolink_html(''' + ...
A link to http://localhost/foo/bar won't, but a link to + ... http://test.com will
''')) +
A link to http://localhost/foo/bar won't, but a link to + http://test.com will
+ >>> print(autolink_html(''' + ...
A link in
''')) +
A link in
+ >>> print(autolink_html(''' + ...
A link in http://bar.com
''')) +
A link in http://bar.com
+ >>> print(autolink_html(''' + ...
A link in http://foo.com or + ... http://bar.com
''')) +
A link in http://foo.com or + http://bar.com
+ +There's also a word wrapping function, that should probably be run +after autolink:: + + >>> from lxml_html_clean import word_break_html + >>> def pascii(s): + ... print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii')) + >>> pascii(word_break_html( u''' + ...
Hey you + ... 12345678901234567890123456789012345678901234567890
''')) +
Hey you + 1234567890123456789012345678901234567890​1234567890
+ +Not everything is broken: + + >>> pascii(word_break_html(''' + ...
Hey you + ... 12345678901234567890123456789012345678901234567890
''')) +
Hey you + 12345678901234567890123456789012345678901234567890
+ >>> pascii(word_break_html(''' + ... text''')) + text + + diff -Nru lxml-html-clean-0.1.0/tests/test_clean.py lxml-html-clean-0.1.1/tests/test_clean.py --- lxml-html-clean-0.1.0/tests/test_clean.py 2024-02-26 20:06:29.000000000 +0000 +++ lxml-html-clean-0.1.1/tests/test_clean.py 2024-04-05 06:48:39.000000000 +0000 @@ -255,6 +255,31 @@ cleaned, "%s -> %s" % (url, cleaned)) + def test_image_data_links_in_inline_style(self): + safe_attrs = set(lxml.html.defs.safe_attrs) + safe_attrs.add('style') + + cleaner = Cleaner( + safe_attrs_only=True, + safe_attrs=safe_attrs) + + data = b'123' + data_b64 = base64.b64encode(data).decode('ASCII') + url = "url(data:image/jpeg;base64,%s)" % data_b64 + styles = [ + "background: %s" % url, + "background: %s; background-image: %s" % (url, url), + ] + for style in styles: + html = '
' % style + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(cleaner.clean_html(s)) + self.assertEqual( + html.encode("UTF-8"), + cleaned, + "%s -> %s" % (style, cleaned)) + def test_formaction_attribute_in_button_input(self): # The formaction attribute overrides the form's action and should be # treated as a malicious link attribute diff -Nru lxml-html-clean-0.1.0/tox.ini lxml-html-clean-0.1.1/tox.ini --- lxml-html-clean-0.1.0/tox.ini 2024-02-26 20:06:29.000000000 +0000 +++ lxml-html-clean-0.1.1/tox.ini 2024-04-05 06:48:39.000000000 +0000 @@ -5,5 +5,5 @@ [testenv] commands = python -m unittest tests.test_clean - python -m doctest tests/test_clean_embed.txt tests/test_clean.txt + python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt deps = lxml