|
1 # |
|
2 # Copyright (c) 2009 Nokia Corporation and/or its subsidiary(-ies). |
|
3 # All rights reserved. |
|
4 # This component and the accompanying materials are made available |
|
5 # under the terms of "Eclipse Public License v1.0" |
|
6 # which accompanies this distribution, and is available |
|
7 # at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 # |
|
9 # Initial Contributors: |
|
10 # Nokia Corporation - initial contribution. |
|
11 # |
|
12 # Contributors: |
|
13 # |
|
14 # Description: |
|
15 # |
|
16 |
|
17 import sys |
|
18 import xml.etree.ElementTree as ElementTree |
|
19 import unittest |
|
20 import traceback |
|
21 |
|
def compare_xml_documents(data1, data2, **kwargs):
    """
    Compare two XML documents for equality.

    The documents are equal if their raw data is identical (in which case
    nothing is parsed), or if both parse successfully and their root
    elements match according to _xml_elements_equal().

    @param data1: The raw byte data of the first XML document as a string.
    @param data2: The raw byte data of the second XML document as a string.

    Keyword arguments:
    @param check_encoding: If True, the encoding of the documents is checked to be the same
        (compared case-insensitively).
    @param ignore_namespaces: If True, XML namespaces are ignored in the comparison.
    @param ignored_empty_tags: List of tags that will be ignored in the comparison if empty,
        i.e. if the tag is empty on one side and does not exist on the other.
    @param debug_stream: If not None, the stream where debug messages are printed.

    @return: True if the documents are equal, False if not. False is also
        returned if either document cannot be parsed.
    """
    check_encoding = kwargs.get('check_encoding', False)
    ignore_namespaces = kwargs.get('ignore_namespaces', False)
    debug_stream = kwargs.get('debug_stream', None)
    ignored_empty_tags = kwargs.get('ignored_empty_tags', [])

    # NOTE: debug messages are emitted with stream.write() instead of the
    # "print >>stream" statement so that the code runs on Python 2 and 3.

    # Identical raw data is trivially equal, no need to parse anything
    if data1 == data2:
        if debug_stream:
            debug_stream.write("Raw byte data equal\n")
        return True

    if check_encoding:
        enc1 = _get_xml_encoding(data1)
        enc2 = _get_xml_encoding(data2)
        if enc1.lower() != enc2.lower():
            if debug_stream:
                debug_stream.write("XML encoding is not the same (%s vs. %s)\n" % (repr(enc1), repr(enc2)))
            return False

    roots = []
    for name, data in (('data1', data1), ('data2', data2)):
        try:
            roots.append(ElementTree.fromstring(data))
        except Exception:
            # Any failure to parse means the documents cannot be shown to
            # be equal; report it on the debug stream instead of raising
            if debug_stream:
                debug_stream.write("Failure when parsing %s: %s\n" % (name, traceback.format_exc()))
            return False

    return _xml_elements_equal(roots[0], roots[1], ignore_namespaces, debug_stream, '', ignored_empty_tags)
|
67 |
|
def _xml_elements_equal(elem1, elem2, ignore_namespaces, debug_stream, parent_path, ignored_empty_tags):
    """
    Recursively compare two XML elements for equality.

    Two elements are equal when their tags, whitespace-stripped text and
    attributes match, and every sub-element of elem1 can be paired with a
    distinct, equal sub-element of elem2. The document order of the
    sub-elements is not significant. Tail text is not compared.

    @param elem1: The first element.
    @param elem2: The second element.
    @param ignore_namespaces: If True, namespaces are stripped from tags and
        attributes that have a namespace are left out of the comparison.
    @param debug_stream: If not None, the stream where debug messages are printed.
    @param parent_path: Path of the parent element, '' for the root elements.
    @param ignored_empty_tags: List of element paths that are ignored if the
        element is empty on one side and does not exist on the other.
    @return: True if the elements are equal, False if not.
    """
    ds = debug_stream

    elem1_tag = _get_tag(elem1, ignore_namespaces)
    elem2_tag = _get_tag(elem2, ignore_namespaces)

    full_path1 = parent_path + '/' + elem1_tag
    full_path2 = parent_path + '/' + elem2_tag

    # Tag/text/attribute mismatches are reported only for the root elements:
    # deeper in the tree a mismatch usually just means that this particular
    # candidate pair did not match while searching for a matching sub-element.
    report = bool(ds) and parent_path == ''
    if report:
        ds.write("Comparing '%s' vs. '%s'\n" % (full_path1, full_path2))

    if elem1_tag != elem2_tag:
        if report:
            ds.write("Tags don't match\n")
        return False

    def strip_string(data):
        # Missing text (None) and whitespace-only text are both treated as
        # empty, so that e.g. '<a><b/></a>' matches a pretty-printed
        # '<a>\n  <b/>\n</a>'
        if data is None:
            return ''
        return data.strip(' \n\r\t')

    text1 = strip_string(elem1.text)
    text2 = strip_string(elem2.text)
    if text1 != text2:
        if report:
            ds.write("Element text %s does not match %s\n" % (repr(text1), repr(text2)))
        return False

    def strip_namespace_attrs(attrib):
        if not ignore_namespaces:
            return attrib
        # Strip all attributes with a namespace if namespaces are ignored
        return dict((key, value) for key, value in attrib.items() if '{' not in key)

    attrs1 = strip_namespace_attrs(elem1.attrib)
    attrs2 = strip_namespace_attrs(elem2.attrib)
    if attrs1 != attrs2:
        if report:
            ds.write("Element attributes don't match (%s vs. %s)\n" % (repr(attrs1), repr(attrs2)))
        return False

    # Remove ignored empty sub-elements before comparing the sub-elems.
    # The lists are explicit copies, so the elements themselves are never
    # modified and the result does not depend on whether the ElementTree
    # implementation hands out its internal child list or a copy of it.
    subelems1 = list(elem1)
    subelems2 = list(elem2)
    _remove_ignored_empty_subelems(subelems1, list(elem2), full_path1, ignore_namespaces, ignored_empty_tags, ds)
    _remove_ignored_empty_subelems(subelems2, list(elem1), full_path1, ignore_namespaces, ignored_empty_tags, ds)

    # Compare sub-elements without caring about their document order
    # NOTE: This approach will not scale well for very large documents
    if len(subelems1) != len(subelems2):
        return False
    if not subelems1:
        return True

    # Sub-elements of elem2 that have not been paired yet, in document order
    unmatched2 = list(subelems2)
    for subelem1 in subelems1:
        match_index = None
        for index, subelem2 in enumerate(unmatched2):
            if _xml_elements_equal(subelem1, subelem2, ignore_namespaces, ds, full_path1, ignored_empty_tags):
                match_index = index
                break

        if match_index is None:
            if ds:
                elem_data = ElementTree.tostring(subelem1)
                if not isinstance(elem_data, str):
                    # On Python 3 tostring() returns bytes by default
                    elem_data = elem_data.decode('utf-8')
                ds.write("No match found for element '%s' under '%s'.\n" % (subelem1.tag, full_path1))
                ds.write("Element data:\n")
                ds.write(elem_data + "\n")
            return False

        # Each sub-element of elem2 may be paired only once
        del unmatched2[match_index]

    # Everything matched
    return True
|
143 |
|
def _remove_ignored_empty_subelems(subelems1, subelems2, parent_path, ignore_namespaces, ignored_empty_tags, debug_stream):
    """
    Remove ignored empty sub-elements from list subelems1.

    A sub-element is removed when all of the following hold:
      - it has no child elements (its text is not considered),
      - its path (parent_path + '/' + tag) is listed in ignored_empty_tags,
      - no element in subelems2 has the same tag.

    @param subelems1: List of elements to filter. Modified in place.
    @param subelems2: List of the elements on the other side of the comparison.
    @param parent_path: Path of the parent element of the sub-elements.
    @param ignore_namespaces: If True, namespaces are stripped from the tags.
    @param ignored_empty_tags: List of element paths that may be ignored.
    @param debug_stream: If not None, the stream where debug messages are printed.
    """
    ds = debug_stream
    if ds:
        ds.write("parent_path: %s\n" % parent_path)

    # Tags present on the other side, collected once up front
    other_tags = set(_get_tag(subelem2, ignore_namespaces) for subelem2 in subelems2)

    kept = []
    for subelem1 in subelems1:
        removable = False

        # Only elements without child elements are candidates for removal
        # (len() of an element is the number of its child elements)
        if len(subelem1) == 0:
            tag = _get_tag(subelem1, ignore_namespaces)
            tag_path = parent_path + "/" + tag

            # See if the tag should be ignored if it doesn't exist on
            # the other side
            is_ignored = False
            for ignored_tag in ignored_empty_tags:
                if ds:
                    ds.write("ignored_tag = %s, tag = %s\n" % (ignored_tag, tag_path))
                if ignored_tag == tag_path:
                    is_ignored = True
                    break

            removable = is_ignored and tag not in other_tags

        if not removable:
            kept.append(subelem1)

    # Replace the contents in place: the caller relies on its own list
    # object being modified
    subelems1[:] = kept
|
183 |
|
184 def _get_tag(elem, ignore_namespaces): |
|
185 tag = elem.tag |
|
186 if ignore_namespaces: |
|
187 pos = tag.find('}') |
|
188 if pos >= 0: |
|
189 tag = tag[pos + 1:] |
|
190 return tag |
|
191 |
|
192 def _get_xml_encoding(xml_data): |
|
193 encoding = 'UTF-8' |
|
194 if xml_data.startswith('\xFE\xFF') or xml_data.startswith('\xFF\xFE'): |
|
195 encoding = 'UTF-16' |
|
196 |
|
197 # Decode only up to the first 200 bytes (should be enough for the header) |
|
198 decoded_data = xml_data[:200].decode(encoding, 'ignore') |
|
199 if decoded_data.startswith('<?xml'): |
|
200 header = decoded_data[:decoded_data.find('?>') + 2] |
|
201 # E.g header = '<?xml version="1.0" encoding = 'UTF-8'?>' |
|
202 |
|
203 def get_substr(string, sought_data): |
|
204 pos = string.find(sought_data) |
|
205 if pos >= 0: return string[pos + len(sought_data):] |
|
206 else: return None |
|
207 |
|
208 x = get_substr(header, "encoding") |
|
209 if not x: return '' |
|
210 # E.g x = ' = 'UTF-8'?>' |
|
211 x = x.replace(' ', '').replace('\t', '') |
|
212 # E.g x = '='UTF-8'?>' |
|
213 sgl_quoted = get_substr(x, "='") |
|
214 dbl_quoted = get_substr(x, '="') |
|
215 # E.g sgl_quoted = 'UTF-8'?>' |
|
216 if sgl_quoted: return sgl_quoted[:sgl_quoted.find("'")] |
|
217 elif dbl_quoted: return dbl_quoted[:dbl_quoted.find('"')] |
|
218 |
|
219 return '' |