diff -Nru xmldiff-2.3/CHANGES.rst xmldiff-2.4/CHANGES.rst --- xmldiff-2.3/CHANGES.rst 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/CHANGES.rst 2019-10-09 09:42:47.000000000 +0000 @@ -1,6 +1,13 @@ Changes ======= +2.4 (2019-10-09) +---------------- + +- Added an option to pass pairs of (element, attr) as unique + attributes for tree matching. Exposed this option on the command + line, too. + 2.3 (2019-02-27) ---------------- diff -Nru xmldiff-2.3/debian/changelog xmldiff-2.4/debian/changelog --- xmldiff-2.3/debian/changelog 2019-10-30 19:37:06.000000000 +0000 +++ xmldiff-2.4/debian/changelog 2019-11-04 15:15:11.000000000 +0000 @@ -1,3 +1,15 @@ +xmldiff (2.4-1) unstable; urgency=medium + + * QA upload. + * New upstream version 2.4. + * debian/copyright: updated the upstream copyright years and added a + new upstream. + * debian/manpage/: + - Duplicated the code of create-man.sh to generate two manpages. + - Updated all files to generate updated 2.4 manpages. + + -- Thiago Andrade Marques Mon, 04 Nov 2019 12:15:11 -0300 + xmldiff (2.3-3) unstable; urgency=medium * QA upload. diff -Nru xmldiff-2.3/debian/copyright xmldiff-2.4/debian/copyright --- xmldiff-2.3/debian/copyright 2019-10-30 19:37:06.000000000 +0000 +++ xmldiff-2.4/debian/copyright 2019-11-04 15:15:11.000000000 +0000 @@ -3,9 +3,10 @@ Source: https://github.com/Shoobx/xmldiff Files: * -Copyright: 2018 Lennart Regebro - 2018 Stephan Richter - 2018 Xmldiff Contributors +Copyright: 2018 Xmldiff Contributors + 2018-2019 Lennart Regebro + 2018-2019 Stephan Richter + 2019 Albertas Agejevas License: MIT Files: xmldiff/_diff_match_patch_py2.py diff -Nru xmldiff-2.3/debian/manpage/create-man.sh xmldiff-2.4/debian/manpage/create-man.sh --- xmldiff-2.3/debian/manpage/create-man.sh 2019-10-30 19:37:06.000000000 +0000 +++ xmldiff-2.4/debian/manpage/create-man.sh 2019-11-04 15:15:11.000000000 +0000 @@ -5,9 +5,21 @@ # # This script can be used under BSD-3-Clause license. 
-T2M_DATE="30 Oct 2019" +T2M_DATE="04 Nov 2019" +T2M_NAME=xmldiff +T2M_VERSION=2.4 +T2M_LEVEL=1 +T2M_DESC="Create a diff for two XML files" + +# Don't change the following lines +TEST=$(txt2man -h 2> /dev/null) +[ "$TEST" ] || { echo -e "\nYou need to install txt2man, from https://github.com/mvertes/txt2man.\n"; exit 1; } + +txt2man -d "$T2M_DATE" -t $T2M_NAME -r $T2M_NAME-$T2M_VERSION -s $T2M_LEVEL -v "$T2M_DESC" $T2M_NAME.txt > $T2M_NAME.$T2M_LEVEL + +T2M_DATE="04 Nov 2019" T2M_NAME=xmlpatch -T2M_VERSION=2.3 +T2M_VERSION=2.4 T2M_LEVEL=1 T2M_DESC="Patch an XML file with an xmldiff" diff -Nru xmldiff-2.3/debian/manpage/xmldiff.1 xmldiff-2.4/debian/manpage/xmldiff.1 --- xmldiff-2.3/debian/manpage/xmldiff.1 2019-10-30 19:37:06.000000000 +0000 +++ xmldiff-2.4/debian/manpage/xmldiff.1 2019-11-04 15:15:11.000000000 +0000 @@ -1,5 +1,5 @@ .\" Text automatically generated by txt2man -.TH xmldiff 1 "30 Oct 2019" "xmldiff-2.3" "Patch an XML file with an xmldiff" +.TH xmldiff 1 "04 Nov 2019" "xmldiff-2.4" "Create a diff for two XML files" .SH NAME \fBxmldiff \fP- Create a diff for two XML files \fB @@ -58,6 +58,7 @@ .B \fB--unique-attributes\fP A comma separated list of attributes that uniquely identify a node. Can be empty. +Unique attributes for certain elements can be specified in the format {NS}element@attr. Default: "{http://www.w3.org/XML/1998/namespace}id" .TP .B diff -Nru xmldiff-2.3/debian/manpage/xmldiff.txt xmldiff-2.4/debian/manpage/xmldiff.txt --- xmldiff-2.3/debian/manpage/xmldiff.txt 2019-10-30 19:37:06.000000000 +0000 +++ xmldiff-2.4/debian/manpage/xmldiff.txt 2019-11-04 15:15:11.000000000 +0000 @@ -30,6 +30,7 @@ Default: False -F A value between 0 and 1 that determines how similar nodes must be to match. --unique-attributes A comma separated list of attributes that uniquely identify a node. Can be empty. + Unique attributes for certain elements can be specified in the format {NS}element@attr. 
Default: "{http://www.w3.org/XML/1998/namespace}id" --ratio-mode Possible choices: accurate, faster, fast Choose the node comparison optimization. diff -Nru xmldiff-2.3/debian/manpage/xmlpatch.1 xmldiff-2.4/debian/manpage/xmlpatch.1 --- xmldiff-2.3/debian/manpage/xmlpatch.1 2019-10-30 19:37:06.000000000 +0000 +++ xmldiff-2.4/debian/manpage/xmlpatch.1 2019-11-04 15:15:11.000000000 +0000 @@ -1,5 +1,5 @@ .\" Text automatically generated by txt2man -.TH xmlpatch 1 "30 Oct 2019" "xmlpatch-2.3" "Patch an XML file with an xmldiff" +.TH xmlpatch 1 "04 Nov 2019" "xmlpatch-2.4" "Patch an XML file with an xmldiff" .SH NAME \fBxmlpatch \fP- Patch an XML file with an xmldiff \fB diff -Nru xmldiff-2.3/docs/source/advanced.rst xmldiff-2.4/docs/source/advanced.rst --- xmldiff-2.3/docs/source/advanced.rst 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/docs/source/advanced.rst 2019-10-09 09:42:47.000000000 +0000 @@ -120,7 +120,7 @@ The XSLT template above of course only handles a few cases, like inserted formatting and insert and delete tags (used below). -A more complete XSLT file is included `here `_. +A more complete XSLT file is included `here `_. Now use that formatter in the diffing: diff -Nru xmldiff-2.3/docs/source/api.rst xmldiff-2.4/docs/source/api.rst --- xmldiff-2.3/docs/source/api.rst 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/docs/source/api.rst 2019-10-09 09:42:47.000000000 +0000 @@ -71,7 +71,7 @@ ``fast_match``: By default ``xmldiff`` will compare each node from one tree with all nodes from the other tree. It will then pick the one node that matches best as the match, - if that match passes the match treshold ``F`` (see above). + if that match passes the match threshold ``F`` (see above). If fast_match is true ``xmldiff`` will first make a faster run, trying to find chains of matching nodes, @@ -98,7 +98,7 @@ and no guarantees are done that the output of one version will be the same as the output of any previous version. 
The actions of the edit script can be in a different order or replaced by equivalent actions dependingon the version of ``xmldiff``, but if the Edit Script does not correctly transform one XML tree into another, -thas is regarded as a bug. +that is regarded as a bug. This means that the output of the ``xml`` format also may change from version to version. There is no "correct" solution to how that output should look, as the same change can be represented in several different ways. @@ -107,11 +107,17 @@ Unique Attributes ----------------- -The ``uniqueattrs`` argument is a list of strings specifying attributes that uniquely identify a node in the document. +The ``uniqueattrs`` argument is a list of strings or ``(tag, attribute)`` tuples +specifying attributes that uniquely identify a node in the document. This is used by the differ when trying to match nodes. If one node in the left tree has a this attribute, the node in the right three with the same value for that attribute will match, regardless of other attributes, child nodes or text content. +Respectively, if the values of the attribute on the nodes in question are different, +or if only one of the nodes has this attribute, +the nodes will not match regardless of their structural similarity. +In case the attribute is a tuple, the attribute match applies only if both nodes +have the given tag. The default is ``['{http://www.w3.org/XML/1998/namespace}id']``, which is the ``xml:id`` attribute. @@ -463,7 +469,7 @@ * ``xmldiff.main.patch_tree()`` takes as input one edit script, (ie a list of actions, see above) and one ``lxml`` tree, - and returnes a patched ``lxml`` tree. + and returns a patched ``lxml`` tree. They all return a string with the patched XML tree. There are currently no configuration parameters for these commands. 
diff -Nru xmldiff-2.3/docs/source/installation.rst xmldiff-2.4/docs/source/installation.rst --- xmldiff-2.3/docs/source/installation.rst 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/docs/source/installation.rst 2019-10-09 09:42:47.000000000 +0000 @@ -13,7 +13,7 @@ That's it, ``xmldiff`` should now be available for you to use. -Several Unix distributions also include ``xmldiff`` so you can install it with your distrubutions package manager. +Several Unix distributions also include ``xmldiff`` so you can install it with your distributions package manager. Be aware that currently most distribute an earlier version, typically 0.6.10, which is very different from 2.x, which this documentation is written for. diff -Nru xmldiff-2.3/README.rst xmldiff-2.4/README.rst --- xmldiff-2.3/README.rst 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/README.rst 2019-10-09 09:42:47.000000000 +0000 @@ -10,7 +10,7 @@ ``xmldiff`` is a library and a command-line utility for making diffs out of XML. This may seem like something that doesn't need a dedicated utility, but change detection in hierarchical data is very different from change detection in flat data. -XML type formats are also not only used for computer readable0 data, +XML type formats are also not only used for computer readable data, it is also often used as a format for hierarchical data that can be rendered into human readable formats. A traditional diff on such a format would tell you line by line the differences, but this would not be be readable by a human. @@ -32,7 +32,7 @@ There is also a command to patch a file with the output from the ``xmldiff`` command:: - $ xmldiff file.diff file1.xml + $ xmlpatch file.diff file1.xml There is a simple API for using ``xmldiff`` as a library:: @@ -44,9 +44,9 @@ There is also a method ``diff_trees()`` that take two lxml trees, and a method ``diff_texts()`` that will take strings containing XML. 
-Similarily, there is ``patch_file()`` ``patch_text()`` and ``patch_tree()``:: +Similarly, there is ``patch_file()`` ``patch_text()`` and ``patch_tree()``:: - result = main.diff_file('file.diff', 'file1.xml') + result = main.patch_file('file.diff', 'file1.xml') Changes from ``xmldiff`` 0.6/1.x @@ -69,7 +69,7 @@ * An output format compatible with 0.6/1.x is also available. - * 2.0 is urrently significantly slower than ``xmldiff`` 0.6/1.x, + * 2.0 is currently significantly slower than ``xmldiff`` 0.6/1.x, but this will change in the future. Currently we make no effort to make ``xmldiff`` 2.0 fast, we concentrate on making it correct and usable. @@ -78,9 +78,11 @@ Contributors ------------ - * Lennart Regebro, lregebro@shoobx.com (main author) + * Lennart Regebro, regebro@gmail.com (main author) * Stephan Richter, srichter@shoobx.com + * Albertas Agejevas, alga@shoobx.com + The diff algorithm is based on "`Change Detection in Hierarchically Structured Information `_", and the text diff is using Google's ``diff_match_patch`` algorithm. 
diff -Nru xmldiff-2.3/setup.py xmldiff-2.4/setup.py --- xmldiff-2.3/setup.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/setup.py 2019-10-09 09:42:47.000000000 +0000 @@ -1,7 +1,7 @@ from io import open from setuptools import setup, find_packages -version = '2.3' +version = '2.4' with open('README.rst', 'rt', encoding='utf8') as readme: description = readme.read() diff -Nru xmldiff-2.3/tests/test_diff.py xmldiff-2.4/tests/test_diff.py --- xmldiff-2.3/tests/test_diff.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/tests/test_diff.py 2019-10-09 09:42:47.000000000 +0000 @@ -13,6 +13,20 @@ from .testing import compare_elements +def dedent(string): + """Remove the maximum common indent of the lines making up the string.""" + lines = string.splitlines() + indent = min( + len(line) - len(line.lstrip()) + for line in lines + if line + ) + return "\n".join( + line[indent:] if line else line + for line in lines + ) + + class APITests(unittest.TestCase): left = u"

Text

More

" right = u"

Tokst

More

" @@ -302,6 +316,91 @@ self.assertEqual(differ.child_ratio(left, right), 1.0) self.assertEqual(differ.node_ratio(left, right), 0) + def test_compare_with_uniqueattrs(self): + # `uniqueattrs` can be pairs of (tag, attribute) as well as just string + # attributes. + left = dedent(u"""\ + + +
+ First paragraph + This is the second paragraph +
+
+ Det tredje stycket +
+
+ Last paragraph +
+
+
+ """) + + right = dedent(u"""\ + + +
+ First paragraph +
+
+ This is the second + Det tredje stycket +
+
+ Last paragraph +
+ + First paragraph + This is the second paragraph + +
+
+ """) + + differ = Differ(uniqueattrs=[ + ('section', 'name'), + '{http://www.w3.org/XML/1998/namespace}id' + ]) + differ.set_trees(etree.fromstring(left), etree.fromstring(right)) + differ.match() + + # Make some choice comparisons here. + + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/section[1]')[0] + + # These are very similar + self.assertEqual(differ.leaf_ratio(left, right), 0.90625) + # And one out of two children in common + self.assertEqual(differ.child_ratio(left, right), 0.5) + # But different names, hence 0 as match + self.assertEqual(differ.node_ratio(left, right), 0) + + # Here's the ones with the same tag and name attribute: + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/section[2]')[0] + + # Only one out of two children in common + self.assertEqual(differ.child_ratio(left, right), 0) + # But same id's, hence 1 as match + self.assertEqual(differ.node_ratio(left, right), 1.0) + + # The last ones are completely similar, but only one + # has an name, so they do not match. 
+ left = differ.left.xpath('/document/story/section[3]')[0] + right = differ.right.xpath('/document/story/section[3]')[0] + self.assertAlmostEqual(differ.leaf_ratio(left, right), 0.78260869565) + self.assertEqual(differ.child_ratio(left, right), 1.0) + self.assertEqual(differ.node_ratio(left, right), 0) + + # Now these are structurally similar, have the same name, but + # one of them is not a section, so the uniqueattr does not match + left = differ.left.xpath('/document/story/section[1]')[0] + right = differ.right.xpath('/document/story/subsection[1]')[0] + self.assertAlmostEqual(differ.leaf_ratio(left, right), 1.0) + self.assertEqual(differ.child_ratio(left, right), 0.5) + self.assertAlmostEqual(differ.node_ratio(left, right), 0.75) + def test_compare_node_rename(self): left = u""" First paragraph diff -Nru xmldiff-2.3/xmldiff/_diff_match_patch_py2.py xmldiff-2.4/xmldiff/_diff_match_patch_py2.py --- xmldiff-2.3/xmldiff/_diff_match_patch_py2.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/xmldiff/_diff_match_patch_py2.py 2019-10-09 09:42:47.000000000 +0000 @@ -959,7 +959,7 @@ pointer += 1 text_insert = text_insert[commonlength:] text_delete = text_delete[commonlength:] - # Factor out any common suffixies. + # Factor out any common suffixes. commonlength = self.diff_commonSuffix(text_insert, text_delete) if commonlength != 0: diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] + diff -Nru xmldiff-2.3/xmldiff/_diff_match_patch_py3.py xmldiff-2.4/xmldiff/_diff_match_patch_py3.py --- xmldiff-2.3/xmldiff/_diff_match_patch_py3.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/xmldiff/_diff_match_patch_py3.py 2019-10-09 09:42:47.000000000 +0000 @@ -957,7 +957,7 @@ pointer += 1 text_insert = text_insert[commonlength:] text_delete = text_delete[commonlength:] - # Factor out any common suffixies. + # Factor out any common suffixes. 
commonlength = self.diff_commonSuffix(text_insert, text_delete) if commonlength != 0: diffs[pointer] = (diffs[pointer][0], text_insert[-commonlength:] + diff -Nru xmldiff-2.3/xmldiff/diff.py xmldiff-2.4/xmldiff/diff.py --- xmldiff-2.3/xmldiff/diff.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/xmldiff/diff.py 2019-10-09 09:42:47.000000000 +0000 @@ -14,8 +14,9 @@ if F is None: F = 0.5 self.F = F - # uniquattrs is a list of attributes that uniquely identifies a node - # inside a document. Defaults to 'xml:id'. + # uniqueattrs is a list of attributes or (tag, attribute) pairs + # that uniquely identifies a node inside a document. Defaults + # to 'xml:id'. if uniqueattrs is None: uniqueattrs = ['{http://www.w3.org/XML/1998/namespace}id'] self.uniqueattrs = uniqueattrs @@ -162,6 +163,12 @@ return 0 for attr in self.uniqueattrs: + if not isinstance(attr, str): + # If it's actually a sequence of (tag, attr), the tags must + # match first. + tag, attr = attr + if tag != left.tag or tag != right.tag: + continue if attr in left.attrib or attr in right.attrib: # One of the nodes have a unique attribute, we check only that. # If only one node has it, it means they are not the same. 
@@ -258,6 +265,8 @@ yield actions.RenameAttrib(left_xpath, lk, rk) # Remove from list of new attributes new_keys.remove(rk) + # Delete used attribute from map of attributes + del newattrmap[value] # Update left node left.attrib[rk] = value del left.attrib[lk] @@ -345,7 +354,7 @@ # Go over those children that are not in order: for lchild in lchildren: if lchild in self._inorder: - # Alrady aligned + # Already aligned continue rchild = self._l2rmap[id(lchild)] diff -Nru xmldiff-2.3/xmldiff/formatting.py xmldiff-2.4/xmldiff/formatting.py --- xmldiff-2.3/xmldiff/formatting.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/xmldiff/formatting.py 2019-10-09 09:42:47.000000000 +0000 @@ -369,7 +369,7 @@ def _xpath(self, node, xpath): # This method finds an element with xpath and makes sure that # one and exactly one element is found. This is to protect against - # formatting a diff on the wrong tree, or against using ambigous + # formatting a diff on the wrong tree, or against using ambiguous # edit script xpaths. if xpath[0] == '/': root = True diff -Nru xmldiff-2.3/xmldiff/main.py xmldiff-2.4/xmldiff/main.py --- xmldiff-2.3/xmldiff/main.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/xmldiff/main.py 2019-10-09 09:42:47.000000000 +0000 @@ -71,12 +71,14 @@ parser.add_argument('-p', '--pretty-print', action='store_true', help='Try to make XML output more readable.') parser.add_argument('-F', type=float, - help='A value betwen 0 and 1 that determines how ' + help='A value between 0 and 1 that determines how ' 'similar nodes must be to match.') parser.add_argument('--unique-attributes', type=str, nargs='?', default='{http://www.w3.org/XML/1998/namespace}id', help='A comma separated list of attributes ' - 'that uniquely identify a node. Can be empty.') + 'that uniquely identify a node. Can be empty. 
' + 'Unique attributes for certain elements can ' + 'be specified in the format {NS}element@attr.') parser.add_argument('--ratio-mode', default='fast', choices={'accurate', 'fast', 'faster'}, help='Choose the node comparison optimization.') @@ -85,6 +87,15 @@ return parser +def _parse_uniqueattrs(uniqueattrs): + if uniqueattrs is None: + return [] + return [ + attr if '@' not in attr else attr.split('@', 1) + for attr in uniqueattrs.split(',') + ] + + def diff_command(args=None): parser = make_diff_parser() args = parser.parse_args(args=args) @@ -97,15 +108,10 @@ formatter = FORMATTERS[args.formatter](normalize=normalize, pretty_print=args.pretty_print) - if args.unique_attributes is None: - uniqueattrs = [] - else: - uniqueattrs = args.unique_attributes.split(',') - diff_options = {'ratio_mode': args.ratio_mode, 'F': args.F, 'fast_match': args.fast_match, - 'uniqueattrs': uniqueattrs, + 'uniqueattrs': _parse_uniqueattrs(args.unique_attributes), } result = diff_files(args.file1, args.file2, diff_options=diff_options, formatter=formatter) diff -Nru xmldiff-2.3/xmldiff/utils.py xmldiff-2.4/xmldiff/utils.py --- xmldiff-2.3/xmldiff/utils.py 2019-02-27 11:49:38.000000000 +0000 +++ xmldiff-2.4/xmldiff/utils.py 2019-10-09 09:42:47.000000000 +0000 @@ -58,7 +58,7 @@ lmax = len(left) rmax = len(right) - furtherst = {1: (0, [])} + furthest = {1: (0, [])} if not lmax + rmax: # The sequences are equal @@ -68,13 +68,13 @@ for d in range(0, lmax + rmax + 1): for k in range(-d, d + 1, 2): if (k == -d or - (k != d and furtherst[k - 1][0] < furtherst[k + 1][0])): + (k != d and furthest[k - 1][0] < furthest[k + 1][0])): # Go down - old_x, history = furtherst[k + 1] + old_x, history = furthest[k + 1] x = old_x else: # Go left - old_x, history = furtherst[k - 1] + old_x, history = furthest[k - 1] x = old_x + 1 # Copy the history @@ -92,7 +92,7 @@ return [(e, e) for e in range(start)] + history + \ list(zip(range(lend, lslen), range(rend, rslen))) else: - furtherst[k] = (x, history) + 
furthest[k] = (x, history) WHITESPACE = re.compile(u'\\s+', flags=re.MULTILINE)