mpdot/linkcheck.py
changeset 2 932c358ece3e
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mpdot/linkcheck.py	Fri Apr 23 20:45:58 2010 +0100
@@ -0,0 +1,2336 @@
+# Copyright (c) 2007-2010 Nokia Corporation and/or its subsidiary(-ies) All rights reserved.
+# This component and the accompanying materials are made available under the terms of the License 
+# "Eclipse Public License v1.0" which accompanies this distribution, 
+# and is available at the URL "http://www.eclipse.org/legal/epl-v10.html".
+#
+# Initial Contributors:
+# Nokia Corporation - initial contribution.
+#
+# Contributors:
+#
+# Description:
+# Checks links in DITA XML and reports issues.
+"""
+Created on 12 Feb 2010
+
+@author: p2ross
+
+Definitions
+===========
+Doctype
+-------
+The name of the root element of an XML document.
+See: http://www.w3.org/TR/2008/REC-xml-20081126/#dt-root
+Note: the root element name is sometimes called the Doctype because of http://www.w3.org/TR/2008/REC-xml-20081126/#vc-roottype
+
+ID
+--
+The value of the 'id' attribute of an element.
+
+Root ID
+-------
+The value of the 'id' attribute of the root element.
+Note: A future development could allow differently named attributes provided
+that they are ID types. See http://www.w3.org/TR/2008/REC-xml-20081126/#sec-attribute-types
+for the validity constraints on ID types.
+
+Reference
+---------
+The value of the href attribute of an element.
+
+Map
+---
+An XML file whose root element name is 'map' or ends with 'Map'.   
+
+Topic
+-----
+An XML file that is not a Map.
+
+Lonely topic
+------------
+A topic whose root ID is not referenced by any map. 
+
+Lonely map
+----------
+A map whose root ID is not referenced by any map. 
+
+Map Cycle
+---------
+A sequence of map references whose members are not unique.
+
+"""
+
+import os
+import unittest
+import sys
+import logging
+import pprint
+import fnmatch
+import re
+import urllib
+import time
+from optparse import OptionParser, check_choice
+try:
+    from xml.etree import cElementTree as etree
+except ImportError:
+    from xml.etree import ElementTree as etree
+import urlparse
+import multiprocessing
+# used for DitaFileObj persistence
+import shelve
+
+__version__ = '0.1.5'
+
+class ExceptionLinkCheck(Exception):
+    pass
+
+class CountDict(dict):
+    """Dictionary with a default value of 0 for unknown keys."""
+    def __getitem__(self, key):
+        if key not in self: 
+            self[key] = 0
+        return self.get(key)
+
+# Matches stuff like: GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E
+# i.e. 'GUID-' followed by hexadecimal groups of 8-4-4-4-12 digits; the match
+# is case insensitive.
+RE_GUID = re.compile(r'GUID-[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}', re.IGNORECASE)
+
+# Of the form {integer_error_code : (format_string, num_args), ...}
+# num_args is the number of '%s' slots in format_string; addError() asserts
+# that the argument tuple supplied with an error code has exactly that length.
+# The codes are grouped by hundreds, each group labelled by the comment above it.
+PROBLEM_CODE_FORMAT = {
+    # 'id_syntax'
+    100 : ('Character \'#\' not allowed in id="%s"', 1),
+    101 : ('NMTOKEN character \'%s\' not allowed in id="%s"', 2),
+    102 : ('GUID specification does not match id="%s"', 1),
+    # 'ref_syntax'
+    200 : ('Multiple \'#\' not allowed in reference "%s"', 1),
+    201 : ('Reference element "%s" is missing href=... attribute', 1),
+    202 : ('URL has missing type/format in reference "%s"', 1),
+    203 : ('GUID specification does not match file reference "%s"', 1),
+    204 : ('GUID specification does not match fragment reference "%s"', 1),
+    # 'ref'
+    300 : ('Can not resolve URI "%s"', 1),
+    # 'file'
+    400 : ('Failed to open: "%s"', 1),
+    401 : ('Multiple id="%s"', 1),
+    402 : ('No id attribute on root element', 0),
+    403 : ('Root ID in cycle: %s', 1),
+    404 : ('Can not parse: "%s"', 1),
+    410 : ('Can not resolve reference to file "%s"', 1),
+    411 : ('Can resolve reference to file "%s" but not to fragment "%s"', 2),
+    412 : ('Referencing element "%s" does not match target root element "%s"', 2),
+    413 : ('Referencing element "%s" does not match target element "%s" for id="%s"', 3),
+    414 : ('topicref element with format="ditamap" does not match target root element "%s"', 1),
+    415 : ('topicref to <map> does not have format="ditamap" but format="%s"', 1),
+    416 : ('topicref element type="%s" does not match target root element "%s"', 2),
+    417 : ('topicref element type="%s" does not match target element "%s" for id="%s"', 3),
+    418 : ('Unknown referencing element "%s" does not match target root element "%s"', 2),
+    419 : ('Unknown referencing element "%s" does not match target element "%s" for id="%s"', 3),
+    # 'file_set'
+    500 : ('Not a directory: %s', 1),
+    501 : ('Duplicate root id="%s" in files: %s', 2),
+    # Codes 502 and 503 are disabled:
+    #502 : ('Can not resolve reference to "%s"', 1),
+    #503 : ('Reference type "%s" does not match target type "%s" for id="%s"', 3),
+    504 : ('Duplicate file path: "%s"', 1),
+    505 : ('Duplicate id="%s" in files: %s', 2),
+    # 'topic_set'
+    600 : ('Topic id="%s" is not referenced by any map', 1),
+    # 'map_set'
+    700 : ('More than one top level map exists: %s', 1),
+    # NOTE(review): the message below reads 'a a cycle'; left unchanged here in
+    # case tests outside this section compare the exact text.
+    701 : ('Maps "%s" are in a a cycle.', 1),
+}
+
+# Placeholder substituted for every argument when a generic message is wanted
+GENERIC_STRING = '...'
+# Width in characters of the banner lines written by the report functions
+PRINT_WIDTH = 75
+
+def genericStringForErrorCode(ec):
+    assert(PROBLEM_CODE_FORMAT.has_key(ec))
+    f, c = PROBLEM_CODE_FORMAT[ec]
+    if c == 0:
+        return f
+    return f % ((GENERIC_STRING,) * c)
+
+def writeGenericStringsForErrorCodes(s=sys.stdout):
+    s.write(' All Error Codes '.center(PRINT_WIDTH, '='))
+    s.write('\n')
+    s.write('%4s  %s\n' % ('Code', 'Error'))
+    s.write('%4s  %s\n' % ('----', '-----'))
+    ecS = PROBLEM_CODE_FORMAT.keys()
+    ecS.sort()
+    for ec in ecS:
+        s.write('%4d  %s\n' % (ec, genericStringForErrorCode(ec)))
+    s.write('='*PRINT_WIDTH)
+    s.write('\n\n')
+
+def normalisePath(thePath):
+    # TODO: How come this does not work?
+    #return os.path.abspath(thePath)
+    return os.path.abspath(thePath).replace('\\', '/')
+
+# Default fnmatch patterns that select the files to read from a directory
+FNMATCH_PATTERNS = ['*.xml', '*.dita', '*.ditamap']
+# The same patterns as a single space separated string
+FNMATCH_STRING = ' '.join(FNMATCH_PATTERNS)
+
+# These elements descend from topic/xref so can be treated as referencing elements.
+# The members are element (tag) names.
+XREF_DESCENDENTS = set(
+    (
+        # From the api specialisation
+        'apiRelation',
+        'apiBaseClassifier',
+        'apiOtherClassifier',
+        'apiOperationClassifier',
+        'apiValueClassifier',
+        # From the C++ specialisation
+        'cxxfile',
+        'cxxclass',
+        'cxxstruct',
+        'cxxunion',
+        'cxxfunction',
+        'cxxdefine',
+        'cxxtypedef',
+        'cxxvariable',
+        'cxxenumeration',
+        'cxxClassBaseClass',
+        'cxxClassBaseStruct',
+        'cxxClassBaseUnion',
+        'cxxClassNestedClass',
+        'cxxClassNestedStruct',
+        'cxxClassNestedUnion',
+        'cxxClassEnumerationInherited',
+        'cxxClassEnumeratorInherited',
+        'cxxClassFunctionInherited',
+        'cxxClassVariableInherited',
+        'cxxDefineReimplemented',
+        'cxxEnumerationReimplemented',
+        'cxxFunctionReimplemented',
+        'cxxStructBaseClass',
+        'cxxStructBaseStruct',
+        'cxxStructBaseUnion',
+        'cxxStructNestedClass',
+        'cxxStructNestedStruct',
+        'cxxStructNestedUnion',
+        'cxxStructEnumerationInherited',
+        'cxxStructEnumeratorInherited',
+        'cxxStructFunctionInherited',
+        'cxxStructVariableInherited',
+        'cxxTypedefReimplemented',
+        'cxxUnionBaseClass',
+        'cxxUnionBaseStruct',
+        'cxxUnionBaseUnion',
+        'cxxUnionNestedClass',
+        'cxxUnionNestedStruct',
+        'cxxUnionNestedUnion',
+        'cxxUnionEnumerationInherited',
+        'cxxUnionFunctionInherited',
+        'cxxUnionVariableInherited',
+        'cxxVariableReimplemented',
+    )
+)
+
+class UrlAccessCache(object):
+    def __init__(self):
+        # {URL : True/False, ...}
+        self._cache = {}
+        
+    def clear(self):
+        self._cache = {}
+        
+    def canAccess(self, theUrl):
+        if not self._cache.has_key(theUrl):
+            try:
+                u = urllib.urlopen(theUrl)#, data, proxies)
+                u.read()
+                self._cache[theUrl] = True
+                logging.debug('URL: %s  for %s' % (True, theUrl))
+            except IOError:
+                self._cache[theUrl] = False
+                logging.debug('URL: %s for %s' % (False, theUrl))
+        return self._cache[theUrl]
+
+GlobalUrlCache = UrlAccessCache()
+ 
+class DitaLinkCheckBase(object):
+    """Base class that holds some common functionality."""
+    def __init__(self, theIdentity):#=None):
+        self.__identity = theIdentity
+        # Set of error strings, lazily evaluated
+        self._errS = None
+    
+    @property
+    def identity(self):
+        return self.__identity
+    
+    def __cmp__(self, other):
+        assert(self.identity is not None)
+        assert(other.identity is not None)
+        return cmp(self.identity, other.identity)
+
+    def __eq__(self, other):
+        assert(self.identity is not None)
+        assert(other.identity is not None)
+        return self.identity == other.identity
+
+    def __hash__(self):
+        assert(self.identity is not None)
+        return hash(self.identity)
+    
+    def __str__(self):
+        return str(self.__identity)
+
+    def debugDump(self, s=sys.stdout, prefix=''):
+        """Dump of IR for debug purposes."""
+        raise NotImplementedError
+    
+    def addError(self, errCode, argTuple):
+        assert(errCode in PROBLEM_CODE_FORMAT.keys()), 'No error code: %s' % errCode
+        assert(PROBLEM_CODE_FORMAT[errCode][1] == len(argTuple)), \
+            'Length missmatch for error code %d: %d != %d for %s' \
+            % (errCode, PROBLEM_CODE_FORMAT[errCode][1], len(argTuple), str(argTuple))
+        if self._errS is None:
+            self._errS = {}
+        try:
+            self._errS[errCode].add(argTuple)
+        except KeyError:
+            self._errS[errCode] = set((argTuple,))
+
+    def errStrings(self, generic, theFilter):
+        """Return a sorted list of error messages without duplicates."""
+        if self._errS is not None:
+            mySet = set()
+            for ec in self._errS.keys():
+                if theFilter is None or ec in theFilter:
+                    assert(ec in PROBLEM_CODE_FORMAT.keys())
+                    for tu in self._errS[ec]:
+                        if generic:
+                            mySet.add(genericStringForErrorCode(ec))
+                        else:
+                            f, c = PROBLEM_CODE_FORMAT[ec]
+                            assert(len(tu) == c)
+                            mySet.add(f % tu)
+            l = list(mySet)
+            l.sort()
+            return l
+        return []
+    
+    def updateErrorCount(self, theMap):
+        """Updates a map of {error_code, : count, ...}.
+        Overridden for file and file set."""
+        if self._errS is not None:
+            for e in self._errS.keys():
+                theMap[e] += len(self._errS[e])
+    
+    def writeErrors(self, isGeneric, theFilter, theStream=sys.stdout):
+        """Can be overridden in child classes to recurse into
+        their data structures."""
+        theStream.write('\n'.join(self.errStrings(isGeneric, theFilter)))
+    
+class DitaId(DitaLinkCheckBase):
+    """Represents a node with an id."""
+    def __init__(self, theN):
+        assert(theN.get('id', None) is not None)
+        super(DitaId, self).__init__(theN.get('id', None))
+        self._elem = theN.tag
+        if '#' in self.id:
+            self.addError(100, (self.id,))
+        # TODO: NMTOKENS
+    
+    @property
+    def elem(self):
+        return self._elem
+
+    @property
+    def id(self):
+        return self.identity
+
+    def checkGuid(self):
+        """optionally applies additional checks for GUID requirements."""
+        if RE_GUID.match(self.id) is None:
+            self.addError(102, (self.id,))
+
+    def debugDump(self, s=sys.stdout, prefix=''):
+        """Dump of IR for debug purposes."""
+        s.write('%sID:  <%s id="%s" />\n' % (prefix, self.elem, self.id))
+        
+class DitaRef(DitaLinkCheckBase):
+    """Represents a reference node."""
+    def __init__(self, theN):
+        self._elem = theN.tag
+        self._href = theN.get('href', None)
+        super(DitaRef, self).__init__('%s %s' % (self._elem, self._href))
+        # This is used when figuring out of the target is the correct element
+        # e.g. in Vanilla DITA
+        # <topicref href="batcaring.dita" type="task"></topicref>
+        self._refType = theN.get('type', None)
+        # Format attribute, this can be format="ditamap"
+        self._format = theN.get('format', None)
+        if self._href is None:
+            self.addError(201, (self._elem,))
+            self._url = None
+        else:
+            self._url = urlparse.urlparse(self._href)
+            if '#' in self._url.fragment:
+                self.addError(200, (self._href,))
+
+    @property
+    def elem(self):
+        return self._elem
+
+    @property
+    def href(self):
+        """The value of the href attribute."""
+        return self._href
+    
+    @property
+    def refType(self):
+        """The value of the type attribute."""
+        return self._refType
+    
+    @property
+    def format(self):
+        """The value of the format attribute."""
+        return self._format
+    
+    @property
+    def path(self):
+        """The value of the path part of the href attribute."""
+        return self._url.path
+        
+    @property
+    def fragment(self):
+        """The value of the fragment part of the href attribute."""
+        return self._url.fragment
+        
+    @property
+    def scheme(self):
+        """The URI scheme e.g. 'http' or '' if no scheme."""
+        return self._url.scheme
+    
+    def fileFragment(self, theRefFile):                               
+        """The absolute path of the file and the fragment identifier or (None, None)."""
+        if self.scheme not in ('', 'file'):
+            return (None, None)
+        if len(self.path) == 0:
+            myPath = theRefFile
+        else:
+            myPath = os.path.join(os.path.dirname(theRefFile), self.path)
+        return normalisePath(myPath), self.fragment
+    
+    def checkGuid(self):
+        """optionally applies additional checks for GUID requirements."""
+        if RE_GUID.match(self.path) is None:
+            self.addError(203, (self.path,))
+        if RE_GUID.match(self.fragment) is None:
+            self.addError(204, (self.fragment,))                
+
+    def checkUrl(self):
+        if self.scheme:
+            myU = urlparse.urlunparse(self._url)
+            if not GlobalUrlCache.canAccess(myU):
+                self.addError(300, (myU,))
+
+    def debugDump(self, s=sys.stdout, prefix=''):
+        """Dump of IR for debug purposes."""
+        s.write('%sREF: <%s href="%s" />\n' % (prefix, self.elem, self._href))
+
+class DitaFileObj(DitaLinkCheckBase):
+    """Base class for a DITA topic or map.
+
+    The initialiser parses the XML and collects the root element name, the
+    root ID, the set of IDs (class DitaId) and the set of references
+    (class DitaRef). The identity is the normalised file path when one is
+    given, otherwise the name of the file object."""
+    def __init__(self, theFileObj, theFileName=None):
+        """Initialiser with a file object and a file path.
+        theFileObj - File like object to parse; if None error 400 is recorded.
+        theFileName - Path of the file; if None the identity is taken from
+                        theFileObj.name (or is None when both are None)."""
+        #print '\nDitaFileObj(%s, %s)' % (theFileObj, theFileName)
+        if theFileName is not None:
+            super(DitaFileObj, self).__init__(normalisePath(theFileName))
+        elif theFileObj is not None:
+            super(DitaFileObj, self).__init__(theFileObj.name)
+        else:
+            super(DitaFileObj, self).__init__(None)
+        # The DitaId of the root element, None if it has no id attribute
+        self._rootId = None
+        # The name (tag) of the root element
+        self._doctype = None
+        # Sets of class DitaId
+        # _idS holds the IDs seen exactly once, _dupeIdS those seen more than once
+        self._idS = set()
+        self._dupeIdS = set()
+        # Set of class DitaRef
+        self._xrefS = set()
+        # Output control: True once the 'File: ...' heading has been written
+        self._hasWritten = False
+        # Size of input
+        try:
+            self._bytes = os.path.getsize(theFileName)
+        except Exception:
+            # Try as if a StringIO
+            try:
+                self._bytes = theFileObj.len
+            except AttributeError:
+                # Give up
+                self._bytes = 0
+        # Process the file object
+        if theFileObj is not None:
+            try:
+                # TODO: use iterparse?
+                theTree = etree.parse(theFileObj)
+            except SyntaxError, err:
+                # Parse failure is recorded against the file as error 404
+                self.addError(404, (str(err),))
+            else:
+                # Walk the tree
+                for i, e in enumerate(theTree.getiterator()):
+                    #print 'TRACE: e', e
+                    # Element [0] is the root element
+                    if i == 0:
+                        assert(self._rootId is None)
+                        assert(self._doctype is None)
+                        self._doctype = e.tag
+                        if e.get('id', None) is not None:
+                            self._rootId = DitaId(e)
+                            self._addId(self._rootId)
+                        else:
+                            self.addError(402, ())
+                    else:
+                        # NOTE: Elements with id attributes can also have href
+                        # attributes. For example a <topicref> in a <bookmap>
+                        # Thus these tests are not exclusive
+                        if e.get('id', None) is not None:
+                            self._addId(DitaId(e))
+                        if e.get('href', None) is not None:
+                            # TODO: Do we limit ourselves to only a certain set of elements?
+                            self._xrefS.add(DitaRef(e))
+        else:
+            self.addError(400, (self.identity,))
+    
+    def _addId(self, theId):
+        """Add a DitaId. The second occurrence of an id moves it from the
+        set of IDs to the set of duplicates and records error 401; any
+        further occurrence is ignored."""
+        #print 'TRACE: adding %s' % theId
+        #print 'TRACE: self._idS %s' % self._idS
+        if theId in self._idS:
+            # Remove from self._idS
+            #print 'TRACE: removing %s' % theId
+            self._idS.remove(theId)
+            self._dupeIdS.add(theId)
+            self.addError(401, (theId.identity,))
+        elif theId not in self._dupeIdS:
+            self._idS.add(theId)
+    
+    @property
+    def bytes(self):
+        """Size of the input in bytes, 0 if it could not be determined."""
+        return self._bytes
+    
+    @property
+    def doctype(self):
+        """The name (tag) of the root element, None if nothing was parsed."""
+        return self._doctype
+    
+    @property
+    def rootId(self):
+        """The id of the root element as a string, None if there is none."""
+        if self._rootId is not None:
+            return self._rootId.id
+    
+    @property
+    def isMap(self):
+        """True if the root element name is 'map', 'bookmap' or ends
+        with 'Map'."""
+        return self.doctype == "map" \
+        or self.doctype == 'bookmap' \
+        or (self.doctype is not None and self.doctype.endswith('Map'))
+    
+    @property
+    def idS(self):
+        """The set of IDs (class DitaId), duplicated IDs are not included."""
+        return self._idS
+    
+    @property
+    def refS(self):
+        """The set of DitaRef objects."""
+        return self._xrefS
+    
+    def idElemMap(self):
+        """Returns a map {id : elem name, ...}."""
+        retVal = {}
+        for anId in self._idS:
+            retVal[anId.id] = anId.elem
+        return retVal
+    
+    def hasId(self, theString):
+        """Returns True if theString is the id of a member of the set of IDs."""
+        for anId in self._idS:
+            if theString == anId.id:
+                return True
+        return False
+
+    def idElem(self, theString):
+        """Returns the element name that carries the id theString, or None."""
+        for anId in self._idS:
+            if theString == anId.id:
+                return anId.elem
+        return None
+
+    def idObj(self, theString):
+        """Returns the DitaId whose id is theString, or None."""
+        for anId in self._idS:
+            if theString == anId.id:
+                return anId
+        return None
+
+    def updateErrorCount(self, theMap):
+        """Updates a map of {error_code : count, ...} with my errors and
+        those of my IDs and my Refs."""
+        if self._errS is not None:
+            for e in self._errS.keys():
+                theMap[e] += len(self._errS[e])
+        for idObj in self.idS:
+            idObj.updateErrorCount(theMap)
+        for refObj in self.refS:
+            refObj.updateErrorCount(theMap)
+    
+    def writeErrorList(self, theList, theSubHead='', theS=sys.stdout):
+        """Writes a non-empty list of error strings to theS, one per line,
+        under an optional sub-heading that includes the number of errors.
+        The 'File: ...' heading is written first if it has not yet been
+        written in this pass. Note: theList is sorted in place."""
+        if len(theList) > 0:
+            theList.sort()
+            if not self._hasWritten:
+                theS.write('File: %s\n' % self.identity)
+            self._hasWritten = True
+            if len(theSubHead) > 0:
+                theS.write('%s [%d]:\n' % (theSubHead, len(theList)))
+            theS.write('\n'.join(theList))
+            theS.write('\n')
+    
+    def writeErrors(self, isGeneric, theFilter, theStream=sys.stdout):
+        """Writes out errors for me, my IDs and my Refs.
+        Nothing at all is written if there are no errors to report."""
+        self._hasWritten = False
+        self.writeErrorList(self.errStrings(isGeneric, theFilter), 'File errors:', theStream)
+#===============================================================================
+#        # Duplicate IDs
+#        myList = (list(self._dupeIdS))
+#        if len(myList):
+#            self.writeErrorList(
+#                    [i.identity for i in myList],
+#                    'Duplicate ID',
+#                    theStream)
+#===============================================================================
+        # Now IDs
+        myList = (list(self.idS))
+        myList.sort()
+        for anId in myList:
+            self.writeErrorList(anId.errStrings(isGeneric, theFilter), 'ID=%s' % anId.identity, theStream)
+        # Now Refs
+        myList = (list(self._xrefS))
+        myList.sort()
+        for anId in myList:
+            self.writeErrorList(anId.errStrings(isGeneric, theFilter), 'Ref=%s' % anId.identity, theStream)
+        # Blank line to separate this file from the next
+        if self._hasWritten:
+            theStream.write('\n')
+    
+    def debugDump(self, s=sys.stdout, prefix=''):
+        """Dump of IR for debug purposes."""
+        s.write('%sFile: %s\n' % (prefix, self.identity))
+        for anId in self._idS:
+            anId.debugDump(s, prefix=prefix+'  ')
+        for aRef in self._xrefS:
+            aRef.debugDump(s, prefix=prefix+'  ')
+    
+class DitaFilePath(DitaFileObj):
+    """Base class for a DITA topic or map from the file system."""
+    def __init__(self, theFilePath):
+        """Initialiser with a file path"""
+        try:
+            f = open(theFilePath)
+        except IOError:
+            f = None
+        #print 'DitaFilePath(%s)' % theFilePath
+        super(DitaFilePath, self).__init__(f, theFilePath)
+        if f is None:
+            self.addError(400, (theFilePath,))
+            
+            
+class DitaFileMapBase(object):
+    """Base class for holding a map of {file path : class DitaFile, ...}
+    Actual implementation can be in-memory or via a database e.g. the
+    shelve module."""
+    def keys(self):
+        """Returns an unsorted list of keys in the map."""
+        raise NotImplementedError()
+    
+    def has_key(self, thePath):
+        """Return True if the key exists."""
+        raise NotImplementedError()
+    
+    def remove(self, thePath):
+        """Remove the entry corresponding to thePath, may raise KeyError."""
+        raise NotImplementedError()
+    
+    def getDitaFileObj(self, thePath):
+        """Return a DitaFileObj that corresponds to thePath, may raise KeyError."""
+        raise NotImplementedError()
+        
+    def setDitaFileObj(self, thePath, theObj):
+        """Load a DitaFileObj or update a mutated DitaFileObj."""
+        raise NotImplementedError()
+        
+class DitaFileMapInMemory(DitaFileMapBase):
+    """Holds map of {file path : class DitaFile, ...} in memory."""
+    def __init__(self):
+        # Map of {file path : class DitaFile, ...}
+        self._fileMap = {}
+    
+    def keys(self):
+        """Returns an unsorted list of keys in the map."""
+        return self._fileMap.keys()
+    
+    def has_key(self, thePath):
+        """Return True if the key exists."""
+        return self._fileMap.has_key(thePath)
+        
+    def remove(self, thePath):
+        """Remove the entry corresponding to thePath, may raise KeyError."""
+        del self._fileMap[thePath]
+    
+    def getDitaFileObj(self, thePath):
+        """Return a DitaFileObj that corresponds to thePath, may raise KeyError."""
+        return self._fileMap[thePath]
+        
+    def setDitaFileObj(self, thePath, theObj):
+        """Load a DitaFileObj or update a mutated DitaFileObj."""
+        self._fileMap[thePath] = theObj
+        
+class DitaFileMapShelve(DitaFileMapBase):
+    """Holds map of {file path : class DitaFile, ...} in a shelve database."""
+    DBASE_FILENAME = 'linkchecker.dbase'
+    def __init__(self):
+        if os.path.exists(self.DBASE_FILENAME):
+            os.remove(self.DBASE_FILENAME)
+        self._db = shelve.open(self.DBASE_FILENAME)
+        # Use this as a 'cache' as shelf.keys() is slow
+        self._keys = set()
+    
+    def keys(self):
+        """Returns an unsorted list of keys in the map."""
+        return list(self._keys)
+    
+    def has_key(self, thePath):
+        """Return True if the key exists."""
+        return thePath in self._keys
+        
+    def remove(self, thePath):
+        """Remove the entry corresponding to thePath, may raise KeyError."""
+        del self._db[thePath]
+        self._keys.remove(thePath)
+    
+    def getDitaFileObj(self, thePath):
+        """Return a DitaFileObj that corresponds to thePath, may raise KeyError."""
+        return self._db[thePath]
+        
+    def setDitaFileObj(self, thePath, theObj):
+        """Load a DitaFileObj or update a mutated DitaFileObj."""
+        self._db[thePath] = theObj
+        self._keys.add(thePath)
+        
+class DitaFileSet(DitaLinkCheckBase):
+    """Holds information about a set of DITA files."""
+    STATS_KEYS = ('Maps', 'Non-maps', 'Files', 'Bytes', 'IDs', 'Refs')
+    def __init__(self,
+                 theDir,
+                 procDir=True,
+                 thePatterns=None,
+                 recursive=False,
+                 testExt=False,
+                 useDbase=False):
+        """Constructor. theDir is the root directory of DITA XML.
+        procDir - If True then process this directory immediately, otherwise
+                    the directory can be processed independently and
+                    _addFileObj() or _addDitaFileObj() invoked.
+        thePatterns - If supplied this should be a space separated string of
+                        fnmatch extensions.
+        recursive - If True and procDir True the directory is processed recursively.
+        testExt - If True then test external URLs.
+        useDbase - If True then store all DitaFile objects in an external dbase
+                    (slower but less memory issues).
+        """
+        if thePatterns is None:
+            thePatterns = FNMATCH_STRING.split(' ')
+        if theDir is not None:
+            theDir = normalisePath(theDir)
+        super(DitaFileSet, self).__init__(theDir)
+        logging.info('DitaFileSet starting to read...')
+        GlobalUrlCache.clear()
+        self._testExt = testExt
+        # Set up how we store the DitaFile objects
+        if useDbase:
+            self._fileMap = DitaFileMapShelve()
+        else:
+            self._fileMap = DitaFileMapInMemory()
+        # Map of (str(rootId) : filepath, ...) with no duplicates
+        # Keys will be in self._uniqueRootIds
+        self._rootIdToFilePathMap = {}
+        # Path to the unique DITA map
+        self._uniqueMapPath = None
+        # Count of {error_code : count, ...}
+        self._errCountMap = CountDict()
+        # Statistics
+        self._statsMap = CountDict()
+        ## and initialise
+        #for k in self.STATS_KEYS:
+        #    self._statsMap[k]
+        # Finalisation control (weak)
+        self._hasFinalised = False
+        # Timers
+        self._timeRead = time.clock()
+        self._timeAnalyse = 0.0
+        if procDir:
+            if theDir is not None and os.path.isdir(theDir):
+                self._readDir(theDir, thePatterns, recursive)
+            else:
+                self.addError(500, (theDir,))
+            # Finalise and run all the tests
+            self.finalise()
+    
+    @property
+    def errCountMap(self):
+        """The map of {error_code : count, ...}."""
+        return self._errCountMap
+    
+    @property
+    def statsMap(self):
+        """The map of read statistics, {statistic name : count, ...}."""
+        return self._statsMap
+    
+    def writeStatistics(self, s=sys.stdout):
+        """Writes out read statistics."""
+        s.write(' Statistics '.center(PRINT_WIDTH, '='))
+        s.write('\n')
+        if len(self._statsMap) > 0:
+            o = self.STATS_KEYS
+            #assert(set(o) == set(self._statsMap.keys())), \
+            #    '%s != %s' % (o, self._statsMap.keys())
+            for k in o:
+                try:
+                    m = self._statsMap[k] / (1024.0*1024.0)
+                    s.write('%20s: %10d [%10.3f M]\n' % (k, self._statsMap[k], m))
+                except KeyError:
+                    s.write('%20s: %10s \n' % (k, 'Not seen'))
+            s.write('%20s: %10.3f (s)\n' % ('Read time', self._timeRead))
+            s.write('%20s: %10.3f (s)\n' % ('Analysis time', self._timeAnalyse))
+            s.write('='*PRINT_WIDTH)
+        else:
+            s.write('Nothing processed.')
+        s.write('\n')
+        
+    def writeErrorSummary(self, s=sys.stdout):
+        s.write(' Error Summary '.center(PRINT_WIDTH, '='))
+        s.write('\n')
+        if len(self._errCountMap):
+            s.write('%4s %10s %s\n' % ('Code', 'Count', 'Error'))
+            s.write('%4s %10s %s\n' % ('----', '-----', '-----'))
+            errCodeS = self._errCountMap.keys()
+            errCodeS.sort()
+            for c in errCodeS:
+                s.write('%4d %10d %s\n' \
+                        % (c, self._errCountMap[c], genericStringForErrorCode(c)))
+        else:
+            s.write('No errors\n')            
+        s.write('='*PRINT_WIDTH)
+        s.write('\n')
+        
+    def writeErrors(self, isGeneric, theFilter, theStream=sys.stdout):
+        """Writes out errors for me and my files."""
+        theStream.write('\n'.join(self.errStrings(isGeneric, theFilter)))
+        fileS = self._fileMap.keys()
+        fileS.sort()
+        for aFile in fileS:
+            # Immutable call so just use get
+            self._fileMap.getDitaFileObj(aFile).writeErrors(isGeneric, theFilter, theStream)
+        
+    def allErrStrings(self, isGeneric, theFilter):
+        """Return a sorted list of error messages without duplicates including
+        files."""
+        retSet = set(self.errStrings(isGeneric, theFilter))
+        fileS = self._fileMap.keys()
+        fileS.sort()
+        for aFilePath in self._fileMap.keys():
+            # Immutable call so just use get
+            for anErr in self._fileMap.getDitaFileObj(aFilePath).errStrings(isGeneric, theFilter): 
+                retSet.add(anErr)
+        retList = list(retSet)
+        retList.sort()
+        return retList
+            
+    def _readDir(self, theDir, thePatS, recursive):    
+        assert(os.path.isdir(theDir))
+        for aName in os.listdir(theDir):
+            aPath = os.path.join(theDir, aName)
+            if os.path.isdir(aPath) and recursive:
+                self._readDir(aPath, thePatS, recursive)
+            elif os.path.isfile(aPath):
+                for aPat in thePatS:
+                    if fnmatch.fnmatch(aName, aPat):
+                        assert(not self._fileMap.has_key(aPath))
+                        logging.debug(' Reading %s' % aPath)
+                        try:
+                            f = open(aPath)
+                        except IOError:
+                            f = None
+                        self._addFileObj(f, aPath)
+                        break
+
+    def _addFileObj(self, theFileObj, theFilePath):
+        myObj = DitaFileObj(theFileObj, theFilePath)
+        self._addDitaFileObj(myObj)
+
+    def _addDitaFileObj(self, theDitaFileObj):
+        if self._fileMap.has_key(theDitaFileObj.identity):
+            self.addError(504, (theDitaFileObj.identity,))
+        else:
+            # Mutable call so use set
+            self._fileMap.setDitaFileObj(theDitaFileObj.identity, theDitaFileObj)
+        # Update statistics (files, bytes, ids, refs) etc.
+        self._statsMap['Files'] += 1
+        self._statsMap['Bytes'] += theDitaFileObj.bytes
+        self._statsMap['IDs'] += len(theDitaFileObj.idS)
+        self._statsMap['Refs'] += len(theDitaFileObj.refS)
+        if theDitaFileObj.isMap:
+            self._statsMap['Maps'] += 1
+        else:
+            self._statsMap['Non-maps'] += 1
+    
+    def finalise(self):
+        """Creates the environment for all checks and then runs them."""
+        logging.info('DitaFileSet.finalise() start...')
+        if not self._hasFinalised:
+            self._timeRead = time.clock() - self._timeRead
+            self._timeAnalyse = time.clock()
+            self._initRootIdToFilePathMap()
+            self._checkDupeIdS()
+            self._setMapCycles()
+            self._checkLonely()
+            self._checkRefArcs()
+            self._errCountMap = CountDict()
+            self.updateErrorCount(self._errCountMap)
+            self._hasFinalised = True
+            self._timeAnalyse = time.clock() - self._timeAnalyse
+        logging.info('DitaFileSet.finalise() done.')
+        
+    def _initRootIdToFilePathMap(self):
+        # Map of (str(rootId) : filepath, ...) with no duplicates
+        self._rootIdToFilePathMap = {}
+        # Temporary map of (str(rootId) : [filepath, ...], ...)
+        myDupeIdFiles = {}
+        for fPath in self._fileMap.keys():
+            # fObj is not written to so we don't need to use set
+            fObj = self._fileMap.getDitaFileObj(fPath)
+            #print 'TRACE: _initRootIdToFilePathMap() fPath:', fPath
+            rId = fObj.rootId
+            if rId is not None:
+                if myDupeIdFiles.has_key(rId):
+                    #print 'TRACE: _initRootIdToFilePathMap() another dupe:', fPath
+                    myDupeIdFiles[rId].append(fObj.identity)
+                elif self._rootIdToFilePathMap.has_key(rId):
+                    #print 'TRACE: _initRootIdToFilePathMap() first dupe:', fPath
+                    # Remove from map and add to myDupeIdFiles
+                    myFile = self._rootIdToFilePathMap.pop(rId)
+                    try:
+                        myDupeIdFiles[rId].append(myFile)
+                    except KeyError:
+                        myDupeIdFiles[rId] = [myFile,]
+                    myDupeIdFiles[rId].append(fPath)
+                else:
+                    #print 'TRACE: _initRootIdToFilePathMap() adding:', fPath
+                    self._rootIdToFilePathMap[rId] = fObj.identity
+        # Set duplicate errors
+        for k in myDupeIdFiles.keys():
+            myDupeIdFiles[k].sort()
+            self.addError(501, (k, tuple(myDupeIdFiles[k])))
+            #self.addError(501, (k, str([str(a) for a in myDupeIdFiles[k]])))
+    
+    def _checkDupeIdS(self):
+        """Checks if there are any duplicate IDs anywhere."""
+        # {ID : [fileS, ...], ...}
+        myDupeIdMap = {}
+        # Temporary data structure
+        # {ID : first file ID is seen in, ...}
+        seenIdMap = {}
+        for f in self._fileMap.keys():
+            # o is not written to so we don't need set...
+            o = self._fileMap.getDitaFileObj(f)
+            for anId in o.idS:
+                if seenIdMap.has_key(anId):
+                    try:
+                        myDupeIdMap[anId].append(f)
+                    except KeyError:
+                        myDupeIdMap[anId] = [seenIdMap[anId],]
+                        myDupeIdMap[anId].append(f)
+                else:
+                    seenIdMap[anId] = f
+        # Now add to errs as a 505 error message
+        # Sort the files in the map
+        for k in myDupeIdMap.keys():
+            myDupeIdMap[k].sort()
+            self.addError(505, (k, tuple(myDupeIdMap[k])))
+            #self.addError(505, (k, str([str(a) for a in myDupeIdMap[k]])))
+                    
+    def _retMapAdjList(self):
+        """Create an adjacency list {file_path : set(refs), ...} (all strings)"""
+        adjList = {}
+        for f in self._fileMap.keys():
+            fObj = self._fileMap.getDitaFileObj(f)
+            if fObj.isMap:# and fObj.rootId is not None:
+                assert(fObj.identity not in adjList.keys())
+                refSet = set()
+                for r in fObj.refS:
+                    refSet.add(r.fileFragment(fObj.identity)[0])
+                adjList[fObj.identity] = refSet
+        return adjList
+
+    def _setMapCycles(self):
+        """Sets any cyclic references seen in DITA maps."""
+        adjList = self._retMapAdjList()
+        # A branch
+        myBr = []
+        myCycles = set()
+        for aPath, aSet in adjList.items():
+            myBr.append(aPath)
+            self._recurseCycles(adjList, myBr, myCycles)
+            myBr.pop()
+        self._setCycleErrors(myCycles)      
+            
+    def _recurseCycles(self, a, b, c):
+        assert(len(b) > 0)
+        try:
+            myPath = b[-1]
+            for r in a[myPath]:
+                #print '_recurseCycles() testing r', r
+                #print '_recurseCycles() testing b', b
+                if r in b:
+                    #print 'Adding cycle', tuple(b[b.index(r):])
+                    c.add(tuple(b[b.index(r):]))
+                else:
+                    b.append(r)
+                    self._recurseCycles(a, b, c)
+                    b.pop()
+        except KeyError:
+            pass
+        
+    def _setCycleErrors(self, theC):
+        for aT in theC:
+            self.addError(701, (str(aT),))
+            myL = list(aT)
+            assert(len(myL) > 0)
+            i = 0
+            while i < len(myL):
+                myL.append(myL[0])
+                # Should this be in the file thus, or in the files set?
+                # As we are mutating the file object we need to use both
+                # getDitaFileObj() and setDitaFileObj()
+                fObj = self._fileMap.getDitaFileObj(myL[0])
+                fObj.addError(701, (str(myL),))
+                self._fileMap.setDitaFileObj(myL[0], fObj)
+                myL.pop()
+                myL.append(myL.pop(0))
+                i += 1    
+
+    def _checkLonely(self):
+        """Runs the lonely map check and then the lonely topic check."""
+        self._checkLonelyMaps()
+        self._checkLonelyTopics()
+        
+    def _checkLonelyMaps(self):
+        """Checks for lonely maps."""
+        mapPathSet = set()
+        pathSetRemain = set()
+        for f in self._fileMap.keys():
+            if self._fileMap.getDitaFileObj(f).isMap:
+                mapPathSet.add(f)
+                pathSetRemain.add(f)
+        for aPath in mapPathSet:
+            myMapObj = self._fileMap.getDitaFileObj(aPath)
+            for r in myMapObj.refS:
+                refFile, frag = r.fileFragment(f)
+                try:
+                    pathSetRemain.remove(refFile)
+                except KeyError:
+                    # refFile is a topic or an already seen map
+                    pass
+        if len(pathSetRemain) > 1:
+            for aPath in pathSetRemain:
+                self.addError(700, (aPath,))
+        elif len(pathSetRemain) == 1:
+            self._uniqueMapPath = pathSetRemain.pop()
+
+    def _checkLonelyTopics(self):
+        """Checks for topics that are not referenced by any map."""
+        mapPathSet = set()
+        pathSetRemain = set()
+        for f in self._fileMap.keys():
+            #print 'TRACE: f:', f
+            if self._fileMap.getDitaFileObj(f).isMap:
+                mapPathSet.add(f)
+            else:
+                pathSetRemain.add(f)
+        #print 'TRACE: mapPathSet', mapPathSet
+        #print 'TRACE: pathSetRemain', pathSetRemain
+        for aMapPath in mapPathSet:
+            myMapObj = self._fileMap.getDitaFileObj(aMapPath)
+            for r in myMapObj.refS:
+                refFile, frag = r.fileFragment(aMapPath)
+                #print 'TRACE: removing:', refFile
+                try:
+                    pathSetRemain.remove(refFile)
+                except KeyError:
+                    # topic has already been seen in another map
+                    pass
+        if len(pathSetRemain) > 0:
+            for aPath in pathSetRemain:
+                self.addError(600, (aPath,))
+            
+    def _checkRefArcs(self):
+        """Checks all references are reachable."""
+        for fPath in self._fileMap.keys():
+            fObjSrc = self._fileMap.getDitaFileObj(fPath)
+            hasMutated = False
+            for rObjSrc in fObjSrc.refS:
+                if rObjSrc.scheme:
+                    # Decide whether to test and external URL
+                    if self._testExt:
+                        rObjSrc.checkUrl()
+                else:
+                    fi, fr = rObjSrc.fileFragment(fPath)
+                    assert(fi is not None), 'fi is None for rObjSrc: %s in file: %s' % (rObjSrc, fPath)
+                    assert(fr is not None), 'fr is None for rObjSrc: %s in file: %s' % (rObjSrc, fPath)
+                    ## If a url then fileFragment() returns (None, None)
+                    #if fi is None:
+                    #    print 'fPath', fPath
+                    #    print 'rObjSrc', rObjSrc
+                    #    print 'fi', fi
+                    #    print 'fr', fr
+                    try:
+                        fObjTgt = self._fileMap.getDitaFileObj(fi)
+                    except KeyError:
+                        # Target file can not be found in the IR
+                        # check the file system to see if it is a non-DITA resource
+                        if not os.path.isfile(fi):
+                            #print 'TRACE: adding 410 to', fObj.identity
+                            fObjSrc.addError(410, (fi,))
+                            hasMutated = True
+                    else:
+                        if len(fr) > 0:
+                            # Target file is found, test fragment
+                            if not fObjTgt.hasId(fr):
+                                # Fragment not found
+                                fObjSrc.addError(411, (fi, fr))
+                                hasMutated = True
+                        if self._checkRefArcElemName(fObjSrc, rObjSrc, fObjTgt, fr):
+                            hasMutated = True
+            if hasMutated:
+                self._fileMap.setDitaFileObj(fPath, fObjSrc)
+
+    def _checkRefArcElemName(self, fObjSrc, rObjSrc, fObjTgt, frag):
+        """Test source and target element names
+        e.g. Source <cxxClassRef> should match target <cxxClass>
+        And in vanilla DITA:
+        <topicref href="batcaring.dita" type="task"></topicref>
+        or:
+        <topicref href="batcaring.dita" format="ditamap"></topicref>
+        Should match target element <task>."""
+        isRootTgt = False
+        hasMutated = False
+        if len(frag) == 0:
+            # iObjTgt is the root element of fObjTgt
+            if fObjTgt.rootId is None or fObjTgt.idElem(fObjTgt.rootId) is None:
+                # Covered by other error codes
+                return
+            iObjTgt = fObjTgt.idObj(fObjTgt.rootId)
+            isRootTgt = True
+        elif fObjTgt.hasId(frag):
+            iObjTgt = fObjTgt.idObj(frag)
+        else:
+            # frag not found that will be a 411 error (handled by caller).
+            return
+        # Have an rObjSrc + iObjTgt so check elements
+        # First case:
+        if rObjSrc.elem.endswith('Ref'):
+            if rObjSrc.elem[:-3] != iObjTgt.elem:
+                if isRootTgt:
+                    fObjSrc.addError(412, (rObjSrc.elem, iObjTgt.elem))
+                else:
+                    fObjSrc.addError(413, (fObjTgt.idElem(frag), rObjSrc.elem, frag))
+                hasMutated = True
+        # Second case(s) for vanilla DITA
+        elif rObjSrc.elem == 'topicref':
+            # Check DITA map links
+            if rObjSrc.format == 'ditamap' and iObjTgt.elem != 'map':
+                # Target must be a root element (actually we don't care)
+                fObjSrc.addError(414, (iObjTgt.elem,))
+                hasMutated = True
+            elif iObjTgt.elem == 'map' and rObjSrc.format != 'ditamap':
+                fObjSrc.addError(415, (rObjSrc.format,))
+                hasMutated = True
+            elif not (rObjSrc.format == 'ditamap' and iObjTgt.elem == 'map'):
+                # Treat refType None as type="topic", see DITA standard for <topicref>
+                # Well, also look at the type attribute in chapter 25
+                # "When the type attribute is unspecified, it should be
+                # determined by inspecting the target if possible. If the
+                # target cannot be inspected for some reason, the value
+                # should default to "topic".
+                # Note: DITA 1.2 takes a different view...
+                # Was:
+                #if (rObjSrc.refType is None and iObjTgt.elem != 'topic') \
+                #or (rObjSrc.refType is not None and rObjSrc.refType != iObjTgt.elem):
+                if rObjSrc.refType is not None and rObjSrc.refType != iObjTgt.elem:
+                    if isRootTgt:
+                        fObjSrc.addError(416, (rObjSrc.refType, iObjTgt.elem,))
+                        hasMutated = True
+                    else:
+                        fObjSrc.addError(417, (rObjSrc.refType, iObjTgt.elem, frag,))
+                        hasMutated = True
+                # Otherwise topicref looks OK
+        elif rObjSrc.elem != 'xref' and rObjSrc.elem not in XREF_DESCENDENTS:
+            # Unknown referencing element
+            if isRootTgt:
+                fObjSrc.addError(418, (rObjSrc.elem, fObjTgt.doctype))
+                hasMutated = True
+            else:
+                fObjSrc.addError(419, (rObjSrc.elem, fObjTgt.idElem(frag), frag))
+                hasMutated = True
+        return hasMutated
+                                        
+    def updateErrorCount(self, theMap):
+        """Updates a map of {error_code, : count, ...}."""
+        if self._errS is not None:
+            for e in self._errS.keys():
+                theMap[e] += len(self._errS[e])
+        for fPath in self._fileMap.keys():
+            fObj = self._fileMap.getDitaFileObj(fPath)
+            # Mutable call so need to update
+            fObj.updateErrorCount(theMap)
+            self._fileMap.setDitaFileObj(fPath, fObj)
+
+    def debugDump(self, s=sys.stdout, prefix=''):
+        """Dump of IR for debug purposes."""
+        s.write(' Debug Dump '.center(PRINT_WIDTH, '+'))
+        s.write('\n')
+        fileS = self._fileMap.keys()
+        fileS.sort()
+        for f in fileS:
+            self._fileMap.getDitaFileObj(f).debugDump(s, prefix)
+        s.write(' END Debug Dump '.center(PRINT_WIDTH, '+'))
+        s.write('\n\n')
+    
+#####################################
+# Multiprocessing code
+#####################################
+def retDitaFileObj(thePath):
+    """Returns a DitaFilePath made from thePath. This is the function that
+    retMpDitaFileSetObj() submits to the process pool, once for each file."""
+    return DitaFilePath(thePath)
+ 
+def genDitaPath(theDir, thePatS, recursive):
+    assert(os.path.isdir(theDir))
+    for aName in os.listdir(theDir):
+        aPath = os.path.join(theDir, aName)
+        if os.path.isdir(aPath) and recursive:
+            for p in genDitaPath(aPath, thePatS, recursive):
+                yield p
+        elif os.path.isfile(aPath):
+            for aPat in thePatS:
+                if fnmatch.fnmatch(aName, aPat):
+                    #logging.info('genDitaPath(): %s' % aPath)
+                    yield aPath
+                    break    
+    
+def retMpDitaFileSetObj(theDir,
+                        thePatterns,
+                        recursive,
+                        numJobs, 
+                        checkExt,
+                        useDb):
+    assert(os.path.isdir(theDir))
+    assert(numJobs >= 0)
+    retObj = DitaFileSet(theDir, procDir=False, testExt=checkExt, useDbase=useDb)
+    myNumJobs = numJobs
+    if numJobs == 0:
+        myNumJobs = multiprocessing.cpu_count()
+        logging.info('Set multiprocessing number of jobs to %d' % myNumJobs)
+    myPool = multiprocessing.Pool(processes=myNumJobs)
+    for result in [
+            myPool.apply_async(retDitaFileObj, (f,))
+                for f in genDitaPath(theDir, thePatterns, recursive)
+            ]:
+        myObj = result.get()
+        logging.debug('Got %s' % myObj.identity)
+        retObj._addDitaFileObj(myObj)
+    # Note: finalise() is a serial process
+    logging.info('retMpDitaFileSetObj(): finalising')
+    retObj.finalise()
+    return retObj
+
+######################################
+# Test code
+######################################
+try:
+    import cStringIO as StringIO
+except ImportError:
+    import StringIO
+
+class NullClass(unittest.TestCase):
+    """A test case that has no test methods."""
+    pass
+
+class TestCountDict(unittest.TestCase):
+    def setUp(self):
+        pass
+    
+    def tearDown(self):
+        pass
+    
+    def testSetUpTearDown(self):
+        """TestCountDict: test setUp() and tearDown()."""
+        pass
+    
+    def test_basic(self):
+        """TestCountDict: test basic functionality."""
+        myMap = CountDict()
+        self.assertEqual(myMap.has_key('wtf'), False)
+        self.assertEqual(myMap['wtf'], 0)
+        self.assertEqual(myMap.has_key('wtf'), True)
+        myMap['wtf'] += 1
+        self.assertEqual(myMap['wtf'], 1)
+
+class TestDitaId(unittest.TestCase):
+    def setUp(self):
+        pass
+    
+    def tearDown(self):
+        pass
+    
+    def testSetUpTearDown(self):
+        """DitaId: test setUp() and tearDown()."""
+        pass
+    
+    def test_basic(self):
+        """DitaId: basic read of an node with an id"""
+        myXml = """<cxxClass id="class_big_endian"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaId(myTree.getroot())
+        self.assertEqual(myObj.id, 'class_big_endian')
+        self.assertEqual(str(myObj), 'class_big_endian')
+        self.assertEqual(myObj.errStrings(True, None), [])
+        self.assertEqual(myObj.errStrings(False, None), [])
+        
+    def test_guid_00(self):
+        """DitaId: basic read of an node with an GUID id"""
+        myXml = """<cxxClass id="GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaId(myTree.getroot())
+        self.assertEqual(myObj.id, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        myObj.checkGuid()
+        self.assertEqual(myObj.errStrings(True, None), [])
+        self.assertEqual(myObj.errStrings(False, None), [])
+
+    def test_guid_01(self):
+        """DitaId: basic read of an node with an GUID id fails"""
+        myXml = """<cxxClass id="25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaId(myTree.getroot())
+        self.assertEqual(myObj.id, '25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        myObj.checkGuid()
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'GUID specification does not match id="25825EC4-341F-3EA4-94AA-7DCE380E6D2E"'
+            ])
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             'GUID specification does not match id="%s"' % GENERIC_STRING,
+            ])
+
+    def test_cmp_eq_00(self):
+        """DitaId: cmp(), == of two identical nodes"""
+        myXml = """<cxxClass id="class_big_endian"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj_00 = DitaId(myTree.getroot())
+        myObj_01 = DitaId(myTree.getroot())
+        self.assertEqual(cmp(myObj_00, myObj_01), 0)
+        self.assertEqual((myObj_00 == myObj_01), True)
+
+    def test_cmp_eq_01(self):
+        """DitaId: cmp(), == of two identical nodes from different elements."""
+        myXml_00 = """<cxxClass id="big_endian"/>"""
+        myTree_00 = etree.parse(StringIO.StringIO(myXml_00))
+        myObj_00 = DitaId(myTree_00.getroot())
+        myXml_01 = """<cxxStruct id="big_endian"/>"""
+        myTree_01 = etree.parse(StringIO.StringIO(myXml_01))
+        myObj_01 = DitaId(myTree_01.getroot())
+        self.assertEqual(cmp(myObj_00, myObj_01), 0)
+        self.assertEqual((myObj_00 == myObj_01), True)
+
+    def test_set(self):
+        """DitaId: read of an node with an id several times into a set and check unique,"""
+        myXml = """<cxxClass id="class_big_endian"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        s = set()
+        i = 0
+        while i < 8:
+            s.add(DitaId(myTree.getroot()))
+            i += 1
+        self.assertEqual(len(s), 1)
+        self.assertEqual(DitaId(myTree.getroot()) in s, True)
+
+    def test_map(self):
+        """DitaId: read of an node with an id several times into a map and check unique,"""
+        myXml = """<cxxClass id="class_big_endian"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        m = {}
+        i = 0
+        while i < 8:
+            m[DitaId(myTree.getroot())] = 1
+            i += 1
+        self.assertEqual(len(m), 1)
+        self.assertEqual(m.has_key(DitaId(myTree.getroot())), True)
+
+    def test_error_hash(self):
+        """DitaId: error with a '#' in an id"""
+        myXml = """<cxxClass id="class_#big_endian"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaId(myTree.getroot())
+        self.assertEqual(myObj.id, 'class_#big_endian')
+        self.assertEqual(str(myObj), 'class_#big_endian')
+        self.assertEqual(
+                myObj.errStrings(True, None),
+                [
+                    genericStringForErrorCode(100),
+                ]
+            )
+        self.assertEqual(
+                myObj.errStrings(False, None),
+                [
+                 'Character \'#\' not allowed in id="class_#big_endian"',
+                 ]
+            )
+        
+
+
+class TestDitaRef(unittest.TestCase):
+    def setUp(self):
+        pass
+    
+    def tearDown(self):
+        pass
+    
+    def testSetUpTearDown(self):
+        """DitaRef: test setUp() and tearDown()."""
+        pass
+    
+    def test_basic(self):
+        """DitaRef: basic read of an xref node, no fragment"""
+        myXml = """<xref href="class_big_endian"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'class_big_endian')
+        self.assertEqual(myObj.path, 'class_big_endian')
+        self.assertEqual(myObj.elem, 'xref')
+        self.assertEqual(str(myObj), 'xref class_big_endian')
+        self.assertEqual(myObj.fragment, '')
+        self.assertEqual(myObj.scheme, '')
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+    def test_basic_frag(self):
+        """DitaRef: basic read of an xref node, with fragment"""
+        myXml = """<xref href="class_big_endian.xml#function"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'class_big_endian.xml#function')
+        self.assertEqual(myObj.path, 'class_big_endian.xml')
+        self.assertEqual(myObj.fragment, 'function')
+        self.assertEqual(myObj.scheme, '')
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+    def test_file_frag_00(self):
+        """DitaRef: accessing an xref node, with a file and a fragment"""
+        myXml = """<xref href="class_big_endian.xml#function"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'class_big_endian.xml#function')
+        self.assertEqual(myObj.path, 'class_big_endian.xml')
+        self.assertEqual(myObj.fragment, 'function')
+        self.assertEqual(myObj.scheme, '')
+        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
+        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'class_big_endian.xml'))
+        self.assertEqual(
+            myObj.fileFragment(srcPath),
+            (expPath, 'function')
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+        
+    def test_file_frag_01(self):
+        """DitaRef: accessing an xref node, with a file and a fragment and relative path with '\\'."""
+        myXml = """<xref href="..\\chips\\class_big_endian.xml#function"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
+        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'chips', 'class_big_endian.xml'))
+        self.assertEqual(
+            myObj.fileFragment(srcPath),
+            (expPath, 'function')
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+        
+    def test_file_frag_02(self):
+        """DitaRef: accessing an xref node, with a file and a fragment and relative path with '/'."""
+        myXml = """<xref href="../chips/class_big_endian.xml#function"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
+        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'chips', 'class_big_endian.xml'))
+        self.assertEqual(
+            myObj.fileFragment(srcPath),
+            (expPath, 'function')
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+        
+    def test_file_frag_03(self):
+        """DitaRef: accessing an xref node, with a no file but with a fragment"""
+        myXml = """<xref href="#function"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, '#function')
+        self.assertEqual(myObj.path, '')
+        self.assertEqual(myObj.fragment, 'function')
+        self.assertEqual(myObj.scheme, '')
+        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
+        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
+        self.assertEqual(
+            myObj.fileFragment(srcPath),
+            (expPath, 'function')
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+        
+    def test_basic_scheme(self):
+        """DitaRef: an xref node with a URI scheme"""
+        myXml = """<xref href="http://www.cwi.nl:80/%7Eguido/Python.html#fragment"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'http://www.cwi.nl:80/%7Eguido/Python.html#fragment')
+        self.assertEqual(myObj.path, '/%7Eguido/Python.html')
+        self.assertEqual(myObj.fragment, 'fragment')
+        self.assertEqual(myObj.scheme, 'http')
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+    def test_basic_scheme_file_frag(self):
+        """DitaRef: an xref node with a URI scheme, invoking fileFragment()"""
+        myXml = """<xref href="http://www.cwi.nl:80/%7Eguido/Python.html#fragment"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'http://www.cwi.nl:80/%7Eguido/Python.html#fragment')
+        self.assertEqual(myObj.path, '/%7Eguido/Python.html')
+        self.assertEqual(myObj.fragment, 'fragment')
+        self.assertEqual(myObj.scheme, 'http')
+        srcPath = os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml')
+        self.assertEqual(
+            myObj.fileFragment(srcPath),
+            (None, None)
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+    def test_fail_no_href(self):
+        """DitaRef: Fails on an xref node with no href attribute"""
+        myXml = """<xref />"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'Reference element "xref" is missing href=... attribute',
+             ]
+        )
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             'Reference element "%s" is missing href=... attribute' % GENERIC_STRING,
+             ]
+        )
+
+    def test_fail_bad_frag(self):
+        """DitaRef: Fails on an xref node with href attribute that has multiple '#' characters"""
+        myXml = """<xref href="a#b#c" />"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'Multiple \'#\' not allowed in reference "a#b#c"',
+             ]
+        )
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             'Multiple \'#\' not allowed in reference "%s"' % GENERIC_STRING,
+             ]
+        )
+
+    def test_guid_00(self):
+        """DitaRef: basic read of an node with an GUID file/fragment reference"""
+        myXml = """<xref href="GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        self.assertEqual(myObj.path, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml')
+        self.assertEqual(myObj.elem, 'xref')
+        self.assertEqual(str(myObj), 'xref GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        self.assertEqual(myObj.fragment, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        self.assertEqual(myObj.scheme, '')
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+    def test_guid_01(self):
+        """DitaRef: basic read of an node with an GUID file part fails"""
+        myXml = """<xref href="GUID-.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'GUID-.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        self.assertEqual(myObj.path, 'GUID-.xml')
+        self.assertEqual(myObj.elem, 'xref')
+        self.assertEqual(str(myObj), 'xref GUID-.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        self.assertEqual(myObj.fragment, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+        myObj.checkGuid()
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'GUID specification does not match file reference "GUID-.xml"'
+            ])
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             genericStringForErrorCode(203),
+            ]
+        )
+
+    def test_guid_02(self):
+        """DitaRef: basic read of an node with an GUID fragment part fails"""
+        myXml = """<xref href="GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4"/>"""
+        myTree = etree.parse(StringIO.StringIO(myXml))
+        myObj = DitaRef(myTree.getroot())
+        self.assertEqual(myObj.href, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4')
+        self.assertEqual(myObj.path, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml')
+        self.assertEqual(myObj.elem, 'xref')
+        self.assertEqual(str(myObj), 'xref GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4')
+        self.assertEqual(myObj.fragment, 'GUID-25825EC4')
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+        myObj.checkGuid()
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'GUID specification does not match fragment reference "GUID-25825EC4"'
+            ])
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             genericStringForErrorCode(204),
+            ]
+        )
+
+class TestDitaFile(unittest.TestCase):
+    def setUp(self):
+        pass
+    
+    def tearDown(self):
+        pass
+    
+    def testSetUpTearDown(self):
+        """DitaFile: test setUp() and tearDown()."""
+        pass
+    
+    def test_Basic(self):
+        """DitaFile: basic read of an XML file"""
+        myXml = """<?xml version='1.0' encoding='UTF-8' standalone='no'?>
+<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
+<cxxClass id="class_big_endian">
+    <apiName>BigEndian</apiName>
+    <shortdesc/>
+    <cxxClassDetail>
+        <cxxClassDefinition>
+            <cxxClassAccessSpecifier value="public"/>
+            <cxxClassAPIItemLocation>
+                <cxxClassDeclarationFile name="filePath" value="K:/sf/os/commsfw/datacommsserver/esockserver/inc/es_sock.h"/>
+                <cxxClassDeclarationFileLine name="lineNumber" value="1520"/>
+                <cxxClassDefinitionFile name="filePath" value="K:/sf/os/commsfw/datacommsserver/esockserver/inc/es_sock.h"/>
+                <cxxClassDefinitionFileLineStart name="lineNumber" value="1516"/>
+                <cxxClassDefinitionFileLineEnd name="lineNumber" value="1526"/>
+            </cxxClassAPIItemLocation>
+        </cxxClassDefinition>
+        <apiDesc>
+            <p>Inserts and extracts integers in big-endian format.   </p>
+        </apiDesc>
+    </cxxClassDetail>
+    <cxxFunction id="class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f">
+    </cxxFunction>
+    <cxxFunction id="class_big_endian_1aedf702f5c0118e4294d1a6d9684f8441">
+    </cxxFunction>
+    <cxxFunction id="class_big_endian_1ae266722f7bb965c971155a3315bad484">
+    </cxxFunction>
+    <cxxFunction id="class_big_endian_1a497d5248ea259f8490fb40ac4f2aafb2">
+    </cxxFunction>
+</cxxClass>"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'foo')
+        self.assertEqual(myObj.identity, normalisePath('foo'))
+        self.assertEqual(myObj.doctype, 'cxxClass')
+        self.assertEqual(myObj.rootId, 'class_big_endian')
+        #print myObj.idMap()
+        self.assertEqual(
+            myObj.idElemMap(),
+            {
+                'class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f'   : 'cxxFunction',
+                'class_big_endian_1aedf702f5c0118e4294d1a6d9684f8441'   : 'cxxFunction',
+                'class_big_endian'                                      : 'cxxClass',
+                'class_big_endian_1a497d5248ea259f8490fb40ac4f2aafb2'   : 'cxxFunction',
+                'class_big_endian_1ae266722f7bb965c971155a3315bad484'   : 'cxxFunction',
+                }
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+    def test_missing_file(self):
+        """DitaFile: read an missing XML file"""
+        myObj = DitaFileObj(None, 'foo')
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'Failed to open: "%s"' % normalisePath('foo'),
+             ]
+        )
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             genericStringForErrorCode(400),
+             ]
+        )
+    
+    def test_IllFormedFile(self):
+        """DitaFile: read an ill-formed XML file"""
+        myXml = """<?xml version='1.0' encoding='UTF-8' standalone='no'?>
+<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
+<cxxClass id="class_big_endian">
+"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'foo')
+        self.assertEqual(myObj.identity, normalisePath('foo'))
+        self.assertEqual(myObj.doctype, None)
+        self.assertEqual(myObj.rootId, None)
+        #print myObj.idMap()
+        self.assertEqual(myObj.idElemMap(), {})
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+             'Can not parse: "no element found: line 4, column 0"',
+             ]
+        )
+        self.assertEqual(
+            myObj.errStrings(True, None),
+            [
+             genericStringForErrorCode(404),
+             ]
+        )
+
+    def test_missing_root_id(self):
+        """DitaFile: read of an XML file with no id on root element"""
+        myXml = """<?xml version='1.0' encoding='UTF-8' standalone='no'?>
+<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
+<cxxClass>
+    <xref href="OtherClass">OtherClass</xref>
+    <cxxFunction id="class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f"/>
+</cxxClass>"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'foo')
+        self.assertEqual(myObj.identity, normalisePath('foo'))
+        self.assertEqual(myObj.doctype, 'cxxClass')
+        self.assertEqual(myObj.rootId, None)
+        self.assertEqual(
+            myObj.idElemMap(),
+            {
+                'class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f'   : 'cxxFunction',
+                }
+        )
+        self.assertEqual(myObj.errStrings(False, None), [genericStringForErrorCode(402)])
+        self.assertEqual(myObj.errStrings(True, None), [genericStringForErrorCode(402)])
+
+    def test_duplicate_id(self):
+        """DitaFile: duplicate IDs"""
+        myXml = """<root id="AnID">
+<elem id="AnID"/>
+</root>"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'spam.xml')
+        self.assertEqual(myObj.identity, normalisePath('spam.xml'))
+        self.assertEqual(myObj.doctype, 'root')
+        self.assertEqual(myObj.rootId, 'AnID')
+        self.assertEqual(myObj.idElemMap(), {})
+        self.assertEqual(
+            myObj.errStrings(False, None),
+            [
+                'Multiple id="AnID"',
+            ]
+        )
+        self.assertEqual(myObj.errStrings(True, None), [genericStringForErrorCode(401)])
+
+    def test_ismap_00(self):
+        """DitaFile: Is a map for <map>."""
+        myXml = """<map id="myMap"/>"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'spam.xml')
+        self.assertEqual(myObj.isMap, True)
+    
+    def test_ismap_01(self):
+        """DitaFile: Is a map for <cxxAPIMap>."""
+        myXml = """<cxxAPIMap id="myMap"/>"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'spam.xml')
+        self.assertEqual(myObj.isMap, True)
+    
+    def test_Basic_01(self):
+        """DitaFile: read of an simple XML file with id and xref"""
+        myXml = """<?xml version='1.0' encoding='UTF-8' standalone='no'?>
+<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
+<cxxClass id="class_big_endian">
+    <xref href="OtherClass">OtherClass</xref>
+    <cxxFunction id="class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f"/>
+</cxxClass>"""
+        myFile = StringIO.StringIO(myXml)
+        myObj = DitaFileObj(myFile, 'foo')
+        self.assertEqual(myObj.identity, normalisePath('foo'))
+        self.assertEqual(myObj.doctype, 'cxxClass')
+        self.assertEqual(myObj.rootId, 'class_big_endian')
+        self.assertEqual(myObj.isMap, False)
+        self.assertEqual(len(myObj.idS), 2)
+        self.assertEqual(len(myObj.refS), 1)
+        self.assertEqual(myObj.hasId('class_big_endian'), True)
+        self.assertEqual(myObj.hasId('class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f'), True)
+        self.assertEqual(myObj.hasId('noID'), False)
+        self.assertEqual(myObj.idElem('class_big_endian'), 'cxxClass')
+        self.assertEqual(myObj.idElem('noID'), None)
+        self.assertEqual(
+            myObj.idElem('class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f'),
+            'cxxFunction'
+        )
+        #print myObj.idMap()
+        self.assertEqual(
+            myObj.idElemMap(),
+            {
+                'class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f'   : 'cxxFunction',
+                'class_big_endian'                                      : 'cxxClass',
+                }
+        )
+        self.assertEqual(myObj.errStrings(False, None), [])
+        self.assertEqual(myObj.errStrings(True, None), [])
+
+class TestDitaFileSet(unittest.TestCase):
+    def setUp(self):
+        pass
+    
+    def tearDown(self):
+        pass
+    
+    def testSetUpTearDown(self):
+        """DitaFileSet: test setUp() and tearDown()."""
+        pass
+    
+    def test_None(self):
+        """DitaFileSet: read of None."""
+        myO = DitaFileSet(None)
+        myO.finalise()
+        self.assertEqual(myO.errStrings(False, None), ['Not a directory: None'])
+        self.assertEqual(myO.errStrings(True, None), ['Not a directory: %s' % GENERIC_STRING, ])
+        self.assertEqual(myO.errCountMap, {500 : 1})
+
+    def test_basic(self):
+        """DitaFileSet: Test reading a map and a couple of files."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam.dita" />
+    <topicref href="eggs.dita" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO('<topic id="spam"/>'), 'spam.dita')
+        myO._addFileObj(StringIO.StringIO('<topic id="eggs"/>'), 'eggs.dita')
+        myO.finalise()
+        #print 'HI'
+        #myO.writeErrors(False)
+        self.assertEqual(myO.allErrStrings(False, None), [])
+        self.assertEqual(myO.allErrStrings(True, None), [])
+        self.assertEqual(myO.errCountMap, {})
+
+    def test_duplicate_paths(self):
+        """DitaFileSet: Test reading a couple of files in duplicate paths."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam.dita" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO('<topic id="spam"/>'), 'spam.dita')
+        myO._addFileObj(StringIO.StringIO('<topic id="eggs"/>'), 'spam.dita')
+        myO.finalise()
+        self.assertEqual(
+            myO.errStrings(False, None),
+            [
+                'Duplicate file path: "%s"' % normalisePath('spam.dita'),
+            ]
+        )
+        self.assertEqual(myO.errStrings(True, None), [genericStringForErrorCode(504),])
+        self.assertEqual(myO.errCountMap, {504 : 1})
+
+    def test_duplicate_ids(self):
+        """DitaFileSet: Test reading a map and a couple of files with duplicate IDs."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam.dita" />
+    <topicref href="eggs.dita" />
+    <topicref href="chips.dita" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO('<topic id="chips"/>'), 'spam.dita')
+        myO._addFileObj(StringIO.StringIO('<topic id="chips"/>'), 'eggs.dita')
+        myO._addFileObj(StringIO.StringIO('<topic id="chips"/>'), 'chips.dita')
+        myO.finalise()
+        #print 'HI'
+        #myO.writeErrors(False)
+        #pprint.pprint(myO.errStrings(False, None))
+        self.assertEqual(
+            myO.errStrings(True, None),
+            [
+             genericStringForErrorCode(505),
+             genericStringForErrorCode(501),
+             ]
+        )
+        expErrs = [
+                """Duplicate id="chips" in files: ('%s', '%s', '%s')""" \
+                    % (normalisePath('chips.dita'), normalisePath('eggs.dita'), normalisePath('spam.dita')),
+                """Duplicate root id="chips" in files: ('%s', '%s', '%s')""" \
+                    % (normalisePath('chips.dita'), normalisePath('eggs.dita'), normalisePath('spam.dita')),
+            ]
+        myErrs = myO.errStrings(False, None)
+#===============================================================================
+#        for i in range(2):
+#            if myErrs[i] != expErrs[i]:
+#                print myErrs[i]
+#                print expErrs[i]
+#                print
+#===============================================================================
+        self.assertEqual(myErrs, expErrs)
+        self.assertEqual(myO.errCountMap, {505: 1, 501: 1})
+    
+    def test_lonely_topics(self):
+        """DitaFileSet: Test a couple of lonely topics."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(StringIO.StringIO('<spam id="spam"/>'), 'spam')
+        myO._addFileObj(StringIO.StringIO('<eggs id="eggs"/>'), 'eggs')
+        myO.finalise()
+        self.assertEqual(
+            myO.errStrings(False, None),
+            [
+             'Topic id="%s" is not referenced by any map' % normalisePath('eggs'),
+             'Topic id="%s" is not referenced by any map' % normalisePath('spam'),
+             ]
+        )
+        self.assertEqual(
+            myO.errStrings(True, None),
+            [
+                genericStringForErrorCode(600),
+            ]
+        )
+
+    def test_map_cycles_00(self):
+        """DitaFileSet: Cyclic references between two maps."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="map_01.ditamap" format="ditamap" />
+</map>"""
+            ),
+            'map_00.ditamap'
+        )
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_01">
+    <topicref href="map_00.ditamap" format="ditamap" />
+</map>"""
+            ),
+            'map_01.ditamap'
+        )
+        myO.finalise()
+        #print 'HI test_map_cycles_00()'
+        #pprint.pprint(myO._retMapAdjList())
+        self.assertEqual(
+            myO.errStrings(False, None),
+            [
+                'Maps "%s" are in a a cycle.' % str(
+                    (
+                     normalisePath('map_00.ditamap'),
+                     normalisePath('map_01.ditamap'),
+                     )
+                ),
+                'Maps "%s" are in a a cycle.' % str(
+                    (
+                     normalisePath('map_01.ditamap'),
+                     normalisePath('map_00.ditamap'),
+                     )
+                ),
+            ]
+        )
+        #print
+        #pprint.pprint(myO.allErrStrings(False, None))
+        self.assertEqual(myO.allErrStrings(True, None), [genericStringForErrorCode(701)])
+        self.assertEqual(myO.errCountMap, {701 : 4})
+
+    def test_map_cycles_01(self):
+        """DitaFileSet: Cyclic references between three maps."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="map_01.ditamap" format="ditamap" />
+</map>"""
+            ),
+            'map_00.ditamap'
+        )
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_01">
+    <topicref href="map_02.ditamap" format="ditamap" />
+</map>"""
+            ),
+            'map_01.ditamap'
+        )
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_02">
+    <topicref href="map_00.ditamap" format="ditamap" />
+</map>"""
+            ),
+            'map_02.ditamap'
+        )
+        myO.finalise()
+        #print 'HI test_map_cycles_00()'
+        #pprint.pprint(myO._retMapAdjList())
+        self.assertEqual(
+            myO.errStrings(False, None),
+            [
+                'Maps "%s" are in a a cycle.' % str(
+                    (
+                     normalisePath('map_00.ditamap'),
+                     normalisePath('map_01.ditamap'),
+                     normalisePath('map_02.ditamap'),
+                     )
+                ),
+                'Maps "%s" are in a a cycle.' % str(
+                    (
+                     normalisePath('map_01.ditamap'),
+                     normalisePath('map_02.ditamap'),
+                     normalisePath('map_00.ditamap'),
+                     )
+                ),
+                'Maps "%s" are in a a cycle.' % str(
+                    (
+                     normalisePath('map_02.ditamap'),
+                     normalisePath('map_00.ditamap'),
+                     normalisePath('map_01.ditamap'),
+                     )
+                ),
+            ]
+        )
+        self.assertEqual(myO.errStrings(True, None), [genericStringForErrorCode(701)])
+        self.assertEqual(myO.errCountMap, {701 : 6})
+
+    def test_refarc_00(self):
+        """DitaFileSet: Test ref arcing - all resolve."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam.dita#spam" />
+    <topicref href="eggs.dita#eggs" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO('<topic id="spam"/>'), 'spam.dita')
+        myO._addFileObj(StringIO.StringIO('<topic id="eggs"/>'), 'eggs.dita')
+        myO.finalise()
+        self.assertEqual(myO.errCountMap, {})
+        self.assertEqual(myO.allErrStrings(False, None), [])
+        self.assertEqual(myO.allErrStrings(True, None), [])
+        self.assertEqual(myO.errStrings(False, None), [])
+        self.assertEqual(myO.errStrings(True, None), [])
+
+    def test_refarc_fail_00(self):
+        """DitaFileSet: Test ref arcing - can't find file."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam_.dita" />
+    <topicref href="eggs_for_tea.dita" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO.finalise()
+        self.assertEqual(myO.errCountMap, {410: 2})
+        #print 'HI'
+        #pprint.pprint(myO.allErrStrings(False, None))
+        self.assertEqual(
+            myO.allErrStrings(False, None),
+            [
+                'Can not resolve reference to file "%s"' % normalisePath('eggs_for_tea.dita'),
+                'Can not resolve reference to file "%s"' % normalisePath('spam_.dita'),
+            ]
+        )
+        self.assertEqual(
+            myO.allErrStrings(True, None),
+            [
+                'Can not resolve reference to file "..."',
+            ]
+        )
+        self.assertEqual(myO.errStrings(False, None), [])
+        self.assertEqual(myO.errStrings(True, None), [])
+
+    def test_refarc_fail_01(self):
+        """DitaFileSet: Test ref arcing - can't find fragment."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam.dita#spam_" />
+    <topicref href="eggs.dita#eggs_" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO('<spam id="spam"/>'), 'spam.dita')
+        myO._addFileObj(StringIO.StringIO('<eggs id="eggs"/>'), 'eggs.dita')
+        myO.finalise()
+        self.assertEqual(myO.errCountMap, {411: 2})
+        #print 'HI'
+        #pprint.pprint(myO.allErrStrings(False, None))
+        self.assertEqual(
+            myO.allErrStrings(False, None),
+            [
+                'Can resolve reference to file "%s" but not to fragment "eggs_"' % normalisePath('eggs.dita'),
+                'Can resolve reference to file "%s" but not to fragment "spam_"' % normalisePath('spam.dita'),
+            ]
+        )
+        self.assertEqual(
+            myO.allErrStrings(True, None),
+            [
+                'Can resolve reference to file "%s" but not to fragment "%s"' % (GENERIC_STRING, GENERIC_STRING),
+            ]
+        )
+        self.assertEqual(myO.errStrings(False, None), [])
+        self.assertEqual(myO.errStrings(True, None), [])
+
+    def test_refarc_url_00(self):
+        """DitaFileSet: Test ref arcing - URL."""
+        myO = DitaFileSet(None, procDir=False, testExt=True)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<map id="map_00">
+    <topicref href="spam.dita#spam" />
+    <topicref href="eggs.dita#eggs" />
+</map>"""
+            ),
+            'map.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO("""<topic id="spam">
+        <xref href="http://www.nokia.com">Nokia</xref>
+</topic>"""), 'spam.dita')
+        myO._addFileObj(StringIO.StringIO("""<topic id="eggs">
+        <xref href="http://www.google.com">Google</xref>
+</topic>"""), 'eggs.dita')
+        myO.finalise()
+        #print 'HI'
+        #pprint.pprint(myO.allErrStrings(False, None))
+        self.assertEqual(myO.errCountMap, {})
+        self.assertEqual(
+            myO.allErrStrings(False, None),
+            [
+            ]
+        )
+        self.assertEqual(
+            myO.allErrStrings(True, None),
+            [
+            ]
+        )
+        self.assertEqual(myO.errStrings(False, None), [])
+        self.assertEqual(myO.errStrings(True, None), [])
+
+class TestDitaBookmapFileSet(unittest.TestCase):
+    def setUp(self):
+        pass
+    
+    def tearDown(self):
+        pass
+    
+    def testSetUpTearDown(self):
+        """TestDitaBookmapFileSet: test setUp() and tearDown()."""
+        pass
+    
+    def test_basic(self):
+        """TestDitaBookmapFileSet: Test reading a bookmap and a topic."""
+        myO = DitaFileSet(None, procDir=False)
+        myO._addFileObj(
+            StringIO.StringIO(
+"""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE bookmap PUBLIC "-//OASIS//DTD DITA BookMap//EN"
+"bookmap.dtd">
+<bookmap id="GUID-5BDFDB6B-7801-4804-9F41-2BDC5BE53DDF">
+  <booktitle>
+    <mainbooktitle>My Bookmap</mainbooktitle>
+    <booktitlealt>Alternate title</booktitlealt>
+  </booktitle>
+  <frontmatter id="GUID-DA857913-F826-4CF7-A135-93F2AEB48353">
+    <topicref href="GUID-00025EAD-C4B6-5408-96A3-FFDBBBDC7CAB.dita" id="GUID-994B1764-393F-401F-8571-CE0955AB6CA6" />
+  </frontmatter>
+</bookmap>
+"""
+            ),
+            'bookmap.ditamap'
+        )
+        myO._addFileObj(StringIO.StringIO("""<?xml version="1.0" encoding="utf-8"?>
+<!DOCTYPE concept  PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
+<concept id="GUID-00025EAD-C4B6-5408-96A3-FFDBBBDC7CAB" xml:lang="en">
+    <title>How to read and write a file</title>
+</concept>
+"""), 'GUID-00025EAD-C4B6-5408-96A3-FFDBBBDC7CAB.dita')
+        myO.finalise()
+        #print
+        #myO.debugDump()
+        #print 'HI'
+        #myO.writeErrors(False)
+        self.assertEqual(myO.allErrStrings(False, None), [])
+        self.assertEqual(myO.allErrStrings(True, None), [])
+        self.assertEqual(myO.errCountMap, {})
+
+class Special(unittest.TestCase):
+    pass
+
+def unitTest(theVerbosity=2):
+    suite = unittest.TestLoader().loadTestsFromTestCase(NullClass)
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestCountDict))
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestDitaId))
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestDitaRef))
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestDitaFile))
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestDitaFileSet))
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestDitaBookmapFileSet))
+    suite.addTests(unittest.TestLoader().loadTestsFromTestCase(Special))
+    myResult = unittest.TextTestRunner(verbosity=theVerbosity).run(suite)
+    return (myResult.testsRun, len(myResult.errors), len(myResult.failures))
+
+######################################
+# main() stuff
+######################################
+def main():
+    print 'CMD: %s' % ' '.join(sys.argv)
+    usage = "usage: %prog [options] <Directory of XML content>"
+    parser = OptionParser(usage, version='%prog ' + __version__)
+    parser.add_option("-d", action="store_true", dest="dump", default=False, 
+                      help="Dump internal representation. [default: %default]")
+    parser.add_option(
+            "-e", "--errors",
+            type="str",
+            dest="error_codes",
+            default='All',
+            help="Only report on certain error codes (space seperated list). [default: \"%default\"]"
+        )      
+    parser.add_option("-f", "--file", dest="file", type="str", default='None', 
+                      help="Report of errors by file either 'None', 'generic', 'specific'. [default: %default]")
+    parser.add_option("-g", action="store_true", dest="guid", default=False, 
+                      help="Enforce GUID specification. [default: %default]")
+    parser.add_option(
+            "-j", "--jobs",
+            type="int",
+            dest="jobs",
+            default=-1,
+            help="Max processes when multiprocessing. 0 takes CPUs, -1 no MP. [default: %default]"
+        )      
+    parser.add_option(
+            "-l", "--loglevel",
+            type="int",
+            dest="loglevel",
+            default=20,
+            help="Log Level (debug=10, info=20, warning=30, [error=40], critical=50) [default: %default]"
+        )      
+    parser.add_option(
+            "-p", "--pattern",
+            type="str",
+            dest="pattern",
+            default=FNMATCH_STRING,
+            help="Pattern match. [default: \"%default\"]"
+        )      
+    parser.add_option("-r", action="store_true", dest="recursive", default=False, 
+                      help="Recursive. [default: %default]")
+    parser.add_option("-s", action="store_true", dest="shelve", default=False, 
+                      help="Use the shelve dBase rather than storing the internal representation in memory. This is slower but is useful for large data sets where a memory error might occur. [default: %default]")
+    parser.add_option("-u", action="store_true", dest="unit_test", default=False, 
+                      help="Execute unit tests and exit. [default: %default]")
+    parser.add_option("-x", action="store_true", dest="ext_url", default=False, 
+                      help="Test external |URLs. [default: %default]")
+    parser.add_option("-?", action="store_true", dest="query_errors", default=False, 
+                      help="Display the error types that are detected. [default: %default]")
+    (options, args) = parser.parse_args()
+    logging.basicConfig(
+        level=options.loglevel,
+        format='%(asctime)s %(levelname)-8s %(message)s',
+        stream=sys.stdout,
+    )
+    if options.file not in ('None', 'generic', 'specific'):
+        parser.error("--file option must be: 'None' | 'generic' | 'specific'")
+        return 1
+    if options.unit_test:
+        unitTest()
+    if options.query_errors:
+        writeGenericStringsForErrorCodes()
+    if len(args) < 1 and not options.unit_test:
+        parser.print_help()
+        parser.error("I can't do much without a path to the XML content.")
+        return 1
+    elif len(args) == 1:
+        if options.jobs > -1:
+            myObj = retMpDitaFileSetObj(
+                        args[0],
+                        options.pattern.split(' '),
+                        options.recursive,
+                        options.jobs,
+                        options.ext_url,
+                        options.shelve,
+                        )
+        else:
+            myObj = DitaFileSet(args[0],
+                                procDir=True,
+                                thePatterns=options.pattern.split(' '),
+                                recursive=options.recursive,
+                                testExt=options.ext_url,
+                                useDbase=options.shelve,
+                                )
+            #print 'MyObj:', myObj
+        if options.dump:
+            myObj.debugDump()
+        myObj.writeStatistics()
+        myObj.writeErrorSummary()
+        #pprint.pprint(myObj.statsMap)
+        # TODO: Write out the results in different ways
+        errFilter = set(PROBLEM_CODE_FORMAT.keys())
+        if options.error_codes != 'All':
+            errFilter = set([int(i) for i in options.error_codes.split()])
+        if options.file == 'generic':
+            print 'Generic problems:'
+            myObj.writeErrors(True, errFilter)
+        elif options.file == 'specific':
+            print 'Specific problems:'
+            myObj.writeErrors(False, errFilter)
+    elif len(args) > 1:
+        parser.error("Too many arguments, I need only one.")
+        return 1
+    return 0
+
+if __name__ == '__main__':
+    multiprocessing.freeze_support()
+    sys.exit(main())