0
|
1 |
#
|
|
2 |
# Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies).
|
|
3 |
# All rights reserved.
|
|
4 |
# This component and the accompanying materials are made available
|
|
5 |
# under the terms of "Eclipse Public License v1.0"
|
|
6 |
# which accompanies this distribution, and is available
|
|
7 |
# at the URL "http://www.eclipse.org/legal/epl-v10.html".
|
|
8 |
#
|
|
9 |
# Initial Contributors:
|
|
10 |
# Nokia Corporation - initial contribution.
|
|
11 |
#
|
|
12 |
# Contributors:
|
|
13 |
#
|
|
14 |
# Description:
|
|
15 |
#
|
|
16 |
|
|
17 |
import sys
|
|
18 |
import xml.etree.ElementTree as ElementTree
|
|
19 |
import unittest
|
|
20 |
import traceback
|
|
21 |
|
|
22 |
def compare_xml_documents(data1, data2, **kwargs):
    """
    Compare two XML documents for equality.

    The documents are equal if their raw data is identical, or if both
    parse successfully and their root elements match according to
    _xml_elements_equal().

    @param data1: The raw byte data of the first XML document as a string.
    @param data2: The raw byte data of the second XML document as a string.

    Keyword arguments:
    @param check_encoding: If True, the encoding of the documents is checked
        to be the same (compared case-insensitively).
    @param ignore_namespaces: If True, XML namespaces are ignored in the
        comparison.
    @param ignored_empty_tags: List of tags that will be ignored in the
        comparison if empty, i.e. if the tag is empty on one side and does
        not exist on the other. Each entry is a full element path such as
        '/root/child'.
    @param debug_stream: If not None, the stream where debug messages are
        written.

    @return: True if the documents are equal, False if not. A document that
        cannot be parsed makes the result False, unless the raw data of both
        documents is identical.
    """
    check_encoding = kwargs.get('check_encoding', False)
    ignore_namespaces = kwargs.get('ignore_namespaces', False)
    debug_stream = kwargs.get('debug_stream', None)
    ignored_empty_tags = kwargs.get('ignored_empty_tags', [])

    # Debug output goes through stream.write() rather than the Python 2-only
    # "print >>stream" statement, so the function works on Python 2 and 3.
    if data1 == data2:
        if debug_stream:
            debug_stream.write("Raw byte data equal\n")
        return True

    if check_encoding:
        enc1 = _get_xml_encoding(data1)
        enc2 = _get_xml_encoding(data2)
        if enc1.lower() != enc2.lower():
            if debug_stream:
                debug_stream.write(
                    "XML encoding is not the same (%s vs. %s)\n"
                    % (repr(enc1), repr(enc2)))
            return False

    roots = []
    for label, data in (('data1', data1), ('data2', data2)):
        try:
            roots.append(ElementTree.fromstring(data))
        except Exception:
            # Deliberately broad: any parse failure means "not equal".
            if debug_stream:
                debug_stream.write(
                    "Failure when parsing %s: %s\n"
                    % (label, traceback.format_exc()))
            return False

    return _xml_elements_equal(roots[0], roots[1], ignore_namespaces,
                               debug_stream, '', ignored_empty_tags)
|
|
67 |
|
|
68 |
def _xml_elements_equal(elem1, elem2, ignore_namespaces, debug_stream, parent_path, ignored_empty_tags):
    """
    Recursively compare two elements: tag, text, attributes and sub-elements.

    Sub-elements are matched without regard to document order. Element tail
    text is not compared.

    @param elem1: The first element.
    @param elem2: The second element.
    @param ignore_namespaces: If True, namespaces are stripped from tags and
        attributes with a namespace are left out of the comparison.
    @param debug_stream: If not None, the stream where debug messages are
        written.
    @param parent_path: Path of the parent element ('' for the root), used to
        build the '/a/b/c' style paths in messages and for matching
        ignored_empty_tags.
    @param ignored_empty_tags: List of element paths that are ignored if the
        element is empty on one side and missing on the other.
    @return: True if the elements are equal, False if not.
    """
    ds = debug_stream
    # Tag/text/attribute mismatch details are reported only for the root
    # comparison; presumably because nested calls are also used as trial
    # matches between candidate pairs and would flood the stream.
    report_details = bool(ds) and parent_path == ''

    elem1_tag = _get_tag(elem1, ignore_namespaces)
    elem2_tag = _get_tag(elem2, ignore_namespaces)

    full_path1 = parent_path + '/' + elem1_tag
    full_path2 = parent_path + '/' + elem2_tag
    if report_details:
        ds.write("Comparing '%s' vs. '%s'\n" % (full_path1, full_path2))

    if elem1_tag != elem2_tag:
        if report_details:
            ds.write("Tags don't match\n")
        return False

    def normalized_text(data):
        # A missing text node (None) and whitespace-only text are both
        # formatting artifacts, so they normalize to the same value.
        if data is None:
            return ''
        return data.strip(' \n\r\t')

    text1 = normalized_text(elem1.text)
    text2 = normalized_text(elem2.text)
    if text1 != text2:
        if report_details:
            ds.write("Element text %s does not match %s\n"
                     % (repr(text1), repr(text2)))
        return False

    def comparable_attrs(attrib):
        if not ignore_namespaces:
            return attrib
        # Strip all attributes with a namespace if namespaces are ignored
        return dict((key, value) for key, value in attrib.items()
                    if '{' not in key)

    attrs1 = comparable_attrs(elem1.attrib)
    attrs2 = comparable_attrs(elem2.attrib)
    if attrs1 != attrs2:
        if report_details:
            ds.write("Element attributes don't match (%s vs. %s)\n"
                     % (repr(attrs1), repr(attrs2)))
        return False

    # Remove ignored empty sub-elements before comparing the sub-elements.
    # Work on private copies: list(elem) replaces Element.getchildren()
    # (removed in Python 3.9), and the filtered lists themselves are used
    # below instead of relying on getchildren() exposing the element's
    # internal child list.
    subelems1 = list(elem1)
    subelems2 = list(elem2)
    _remove_ignored_empty_subelems(subelems1, subelems2, full_path1, ignore_namespaces, ignored_empty_tags, ds)
    _remove_ignored_empty_subelems(subelems2, subelems1, full_path1, ignore_namespaces, ignored_empty_tags, ds)

    # Compare sub-elements without caring about their document order
    # NOTE: This approach will not scale well for very large documents
    if len(subelems1) != len(subelems2):
        return False
    if not subelems1:
        return True

    unmatched2 = list(subelems2)
    for subelem1 in subelems1:
        # Pair subelem1 with the first still-unmatched equal element of elem2
        for index, subelem2 in enumerate(unmatched2):
            if _xml_elements_equal(subelem1, subelem2, ignore_namespaces, ds, full_path1, ignored_empty_tags):
                del unmatched2[index]
                break
        else:
            if ds:
                element_data = ElementTree.tostring(subelem1)
                if not isinstance(element_data, str):
                    # Python 3 returns bytes (ASCII with character
                    # references by default)
                    element_data = element_data.decode('ascii', 'replace')
                ds.write("No match found for element '%s' under '%s'.\n"
                         % (subelem1.tag, full_path1))
                ds.write("Element data:\n")
                ds.write(element_data + "\n")
            return False

    # Everything matched
    return True
|
|
143 |
|
|
144 |
def _remove_ignored_empty_subelems(subelems1, subelems2, parent_path, ignore_namespaces, ignored_empty_tags, debug_stream):
    """
    Remove ignored empty sub-elements from list subelems1 (in place).

    An element is removed when all of the following hold:
      - it has no child elements (text and attributes are not examined),
      - its path (parent_path + '/' + tag) is listed in ignored_empty_tags,
      - no element in subelems2 has the same tag.

    @param subelems1: List of elements to filter; modified in place.
    @param subelems2: The sub-elements of the other side; only read.
    @param parent_path: Path of the parent element, e.g. '/root/child'.
    @param ignore_namespaces: If True, namespaces are stripped from tags
        before comparing.
    @param ignored_empty_tags: List of element paths to ignore when empty.
    @param debug_stream: If not None, the stream where debug messages are
        written.
    """
    ds = debug_stream
    if ds:
        ds.write("parent_path: %s\n" % parent_path)

    if not ignored_empty_tags:
        # Nothing is configured to be ignored, so nothing can be removed
        return

    other_tags = None  # tags present in subelems2, built on first use
    removed = []
    for index, candidate in enumerate(subelems1):
        # len(element) is the number of child elements; it replaces
        # Element.getchildren(), which was removed in Python 3.9
        if len(candidate) > 0:
            continue

        tag = _get_tag(candidate, ignore_namespaces)
        candidate_path = parent_path + "/" + tag

        # See if the tag should be ignored if it doesn't exist on
        # the other side
        is_ignored = False
        for ignored_tag in ignored_empty_tags:
            if ds:
                ds.write("ignored_tag = %s, tag = %s\n"
                         % (ignored_tag, candidate_path))
            if ignored_tag == candidate_path:
                is_ignored = True
                break
        if not is_ignored:
            continue

        # See if the tag exists on the other side
        if other_tags is None:
            other_tags = set(_get_tag(other, ignore_namespaces)
                             for other in subelems2)
        if tag not in other_tags:
            removed.append(index)

    # Indices were collected in ascending order; delete from the end so
    # that the remaining indices stay valid throughout the operation
    for index in reversed(removed):
        del subelems1[index]
|
|
183 |
|
|
184 |
def _get_tag(elem, ignore_namespaces):
    """
    Return the tag of an element, optionally without its namespace.

    ElementTree stores a namespaced tag as '{namespace-uri}localname'.

    @param elem: The element whose tag is returned.
    @param ignore_namespaces: If True, everything up to and including the
        first '}' is dropped from the tag.
    @return: The (possibly stripped) tag. A tag that is not a string, as on
        comment and processing-instruction nodes, is returned unchanged.
    """
    tag = elem.tag
    # Only string tags can carry a '{uri}' prefix; guarding here avoids an
    # AttributeError on non-string tags.
    if ignore_namespaces and hasattr(tag, 'partition'):
        _, closing_brace, local_name = tag.partition('}')
        if closing_brace:
            tag = local_name
    return tag
|
|
191 |
|
|
192 |
def _get_xml_encoding(xml_data):
    """
    Return the encoding named in the XML declaration of a document.

    Only the first 200 bytes (or characters, for already decoded text) are
    inspected, which should be enough for the header.

    @param xml_data: The raw data of the XML document, as a byte string or
        as already decoded text.
    @return: The value of the 'encoding' pseudo-attribute exactly as written
        in the declaration (e.g. 'UTF-8'), or '' if the data does not start
        with an XML declaration or the declaration names no encoding.
    """
    if isinstance(xml_data, bytes):
        if xml_data.startswith((b'\xFE\xFF', b'\xFF\xFE')):
            codec = 'UTF-16'
        else:
            # 'utf-8-sig' also accepts data without a BOM, and drops a BOM
            # that would otherwise hide the '<?xml' prefix
            codec = 'utf-8-sig'
        text = xml_data[:200].decode(codec, 'ignore')
    else:
        text = xml_data[:200]
        if text.startswith(u'\ufeff'):
            text = text[1:]

    if not text.startswith('<?xml'):
        return ''
    end = text.find('?>')
    if end < 0:
        # Declaration is not terminated within the inspected prefix
        return ''
    header = text[:end]
    # E.g. header = '<?xml version="1.0" encoding = 'UTF-8''

    _, found, rest = header.partition('encoding')
    if not found:
        return ''
    # E.g. rest = ' = 'UTF-8''
    rest = rest.lstrip()
    if not rest.startswith('='):
        return ''
    rest = rest[1:].lstrip()
    # E.g. rest = ''UTF-8''

    # The value ends at the matching quote that immediately follows the
    # '='; quotes belonging to later pseudo-attributes (standalone='yes')
    # must not be picked up.
    quote = rest[:1]
    if quote not in ('"', "'"):
        return ''
    value, closed, _ = rest[1:].partition(quote)
    if not closed:
        return ''
    return value.strip()
|