License: GPL-2+
This package is free software; you can redistribute it and/or modify
diff -Nru lxml-html-clean-0.1.0/lxml_html_clean/clean.py lxml-html-clean-0.1.1/lxml_html_clean/clean.py
--- lxml-html-clean-0.1.0/lxml_html_clean/clean.py 2024-02-26 20:06:29.000000000 +0000
+++ lxml-html-clean-0.1.1/lxml_html_clean/clean.py 2024-04-05 06:48:39.000000000 +0000
@@ -54,7 +54,7 @@
# All kinds of schemes besides just javascript: that can cause
# execution:
_find_image_dataurls = re.compile(
- r'data:image/(.+);base64,', re.I).findall
+ r'data:image/(.+?);base64,', re.I).findall
_possibly_malicious_schemes = re.compile(
r'(javascript|jscript|livescript|vbscript|data|about|mocha):',
re.I).findall
diff -Nru lxml-html-clean-0.1.0/setup.cfg lxml-html-clean-0.1.1/setup.cfg
--- lxml-html-clean-0.1.0/setup.cfg 2024-02-26 20:06:29.000000000 +0000
+++ lxml-html-clean-0.1.1/setup.cfg 2024-04-05 06:48:39.000000000 +0000
@@ -1,6 +1,6 @@
[metadata]
name = lxml_html_clean
-version = 0.1.0
+version = 0.1.1
description = HTML cleaner from lxml project
long_description = file:README.md
long_description_content_type = text/markdown
diff -Nru lxml-html-clean-0.1.0/tests/test_autolink.txt lxml-html-clean-0.1.1/tests/test_autolink.txt
--- lxml-html-clean-0.1.0/tests/test_autolink.txt 1970-01-01 00:00:00.000000000 +0000
+++ lxml-html-clean-0.1.1/tests/test_autolink.txt 2024-04-05 06:48:39.000000000 +0000
@@ -0,0 +1,79 @@
+This tests autolink::
+
+ >>> from lxml.html import usedoctest
+ >>> from lxml_html_clean import autolink_html
+ >>> print(autolink_html('''
+ ... Link here: http://test.com/foo.html.
+ ... '''))
+
+ >>> print(autolink_html('''
+ ... Mail me at mailto:ianb@test.com or http://myhome.com
+ ... '''))
+
+ >>> print(autolink_html('''
+ ... The great thing is the http://link.com links and
+ ... the http://foobar.com links.
'''))
+
+ >>> print(autolink_html('''
+ ... Link: <http://foobar.com>
'''))
+
+ >>> print(autolink_html('''
+ ... Link: (http://foobar.com)
'''))
+
+
+Parentheses are tricky, we'll do our best::
+
+ >>> print(autolink_html('''
+ ... (Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))
+ ... '''))
+
+ >>> print(autolink_html('''
+ ... ... a link: http://foo.com)
+ ... '''))
+
+
+Some cases that won't be caught (on purpose)::
+
+ >>> print(autolink_html('''
+ ... A link to http://localhost/foo/bar won't, but a link to
+ ... http://test.com will
'''))
+ A link to http://localhost/foo/bar won't, but a link to
+
http://test.com will
+ >>> print(autolink_html('''
+ ... A link in
'''))
+ A link in
+ >>> print(autolink_html('''
+ ... '''))
+
+ >>> print(autolink_html('''
+ ... A link in http://foo.com
or
+ ... http://bar.com
'''))
+ A link in http://foo.com
or
+ http://bar.com
+
+There's also a word wrapping function, that should probably be run
+after autolink::
+
+ >>> from lxml_html_clean import word_break_html
+ >>> def pascii(s):
+ ... print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii'))
+ >>> pascii(word_break_html( u'''
+ ... Hey you
+ ... 12345678901234567890123456789012345678901234567890
'''))
+ Hey you
+ 12345678901234567890123456789012345678901234567890
+
+Not everything is broken:
+
+ >>> pascii(word_break_html('''
+ ... Hey you
+ ... 12345678901234567890123456789012345678901234567890
'''))
+ Hey you
+ 12345678901234567890123456789012345678901234567890
+ >>> pascii(word_break_html('''
+ ... text'''))
+ text
+
+
diff -Nru lxml-html-clean-0.1.0/tests/test_clean.py lxml-html-clean-0.1.1/tests/test_clean.py
--- lxml-html-clean-0.1.0/tests/test_clean.py 2024-02-26 20:06:29.000000000 +0000
+++ lxml-html-clean-0.1.1/tests/test_clean.py 2024-04-05 06:48:39.000000000 +0000
@@ -255,6 +255,31 @@
cleaned,
"%s -> %s" % (url, cleaned))
+ def test_image_data_links_in_inline_style(self):
+ safe_attrs = set(lxml.html.defs.safe_attrs)
+ safe_attrs.add('style')
+
+ cleaner = Cleaner(
+ safe_attrs_only=True,
+ safe_attrs=safe_attrs)
+
+ data = b'123'
+ data_b64 = base64.b64encode(data).decode('ASCII')
+ url = "url(data:image/jpeg;base64,%s)" % data_b64
+ styles = [
+ "background: %s" % url,
+ "background: %s; background-image: %s" % (url, url),
+ ]
+ for style in styles:
+ html = '<div style="%s"></div>' % style
+ s = lxml.html.fragment_fromstring(html)
+
+ cleaned = lxml.html.tostring(cleaner.clean_html(s))
+ self.assertEqual(
+ html.encode("UTF-8"),
+ cleaned,
+ "%s -> %s" % (style, cleaned))
+
def test_formaction_attribute_in_button_input(self):
# The formaction attribute overrides the form's action and should be
# treated as a malicious link attribute
diff -Nru lxml-html-clean-0.1.0/tox.ini lxml-html-clean-0.1.1/tox.ini
--- lxml-html-clean-0.1.0/tox.ini 2024-02-26 20:06:29.000000000 +0000
+++ lxml-html-clean-0.1.1/tox.ini 2024-04-05 06:48:39.000000000 +0000
@@ -5,5 +5,5 @@
[testenv]
commands =
python -m unittest tests.test_clean
- python -m doctest tests/test_clean_embed.txt tests/test_clean.txt
+ python -m doctest tests/test_clean_embed.txt tests/test_clean.txt tests/test_autolink.txt
deps = lxml