mpdot/linkcheck.py
author Michel Szarindar <Michel.Szarindar@Nokia.com>
Fri, 23 Apr 2010 20:45:58 +0100
changeset 2 932c358ece3e
permissions -rw-r--r--
Orb version 0.1.9. Fixes Bug 1965, Bug 2401

# Copyright (c) 2007-2010 Nokia Corporation and/or its subsidiary(-ies) All rights reserved.
# This component and the accompanying materials are made available under the terms of the License 
# "Eclipse Public License v1.0" which accompanies this distribution, 
# and is available at the URL "http://www.eclipse.org/legal/epl-v10.html".
#
# Initial Contributors:
# Nokia Corporation - initial contribution.
#
# Contributors:
#
# Description:
# Checks links in DITA XML and reports issues.
"""
Created on 12 Feb 2010

@author: p2ross

Definitions
===========
Doctype
-------
See: http://www.w3.org/TR/2008/REC-xml-20081126/#dt-root
Note: this is sometimes called the Doctype because of http://www.w3.org/TR/2008/REC-xml-20081126/#vc-roottype

ID
--
The value of the 'id' attribute of an element.

Root ID
-------
The value of the 'id' attribute of the root element.
Note: A development would allow differently named attributes provided that they
were ID types. See http://www.w3.org/TR/2008/REC-xml-20081126/#sec-attribute-types
for validity constraints for ID types.

Reference
---------
The value of the href attribute of an element.

Map
---
An XML file whose root element name is 'map', 'bookmap' or ends with 'Map'.

Topic
-----
An XML file that is not a Map.

Lonely topic
------------
A topic whose root ID is not referenced by any map. 

Lonely map
----------
A map whose root ID is not referenced by any map. 

Map Cycle
---------
A sequence of map references whose members are not unique.

"""

import os
import unittest
import sys
import logging
import pprint
import fnmatch
import re
import urllib
import time
from optparse import OptionParser, check_choice
try:
    from xml.etree import cElementTree as etree
except ImportError:
    from xml.etree import ElementTree as etree
import urlparse
import multiprocessing
# used for DitaFileObj persistence
import shelve

# Version string of this module.
__version__ = '0.1.5'

class ExceptionLinkCheck(Exception):
    """Exception subclass declared by this module; adds no behaviour of its own."""

class CountDict(dict):
    """A dict in which looking up an absent key stores 0 for it and yields 0."""
    def __missing__(self, key):
        # dict.__getitem__ calls this hook only when the key is absent. The
        # zero is stored so that the key is present in the dict afterwards.
        self[key] = 0
        return 0

# Matches stuff like: GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E
# Case is ignored and the pattern is not anchored at the end, so with
# RE_GUID.match() only a leading GUID is required.
RE_GUID = re.compile(r'GUID-[0-9A-F]{8}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{4}-[0-9A-F]{12}', re.IGNORECASE)

# Of the form {integer_error_code : (format_string, num_args), ...}
# num_args must equal the number of '%s' fields in format_string.
PROBLEM_CODE_FORMAT = {
    # 'id_syntax'
    100 : ('Character \'#\' not allowed in id="%s"', 1),
    101 : ('NMTOKEN character \'%s\' not allowed in id="%s"', 2),
    102 : ('GUID specification does not match id="%s"', 1),
    # 'ref_syntax'
    200 : ('Multiple \'#\' not allowed in reference "%s"', 1),
    201 : ('Reference element "%s" is missing href=... attribute', 1),
    202 : ('URL has missing type/format in reference "%s"', 1),
    203 : ('GUID specification does not match file reference "%s"', 1),
    204 : ('GUID specification does not match fragment reference "%s"', 1),
    # 'ref'
    300 : ('Can not resolve URI "%s"', 1),
    # 'file'
    400 : ('Failed to open: "%s"', 1),
    401 : ('Multiple id="%s"', 1),
    402 : ('No id attribute on root element', 0),
    403 : ('Root ID in cycle: %s', 1),
    404 : ('Can not parse: "%s"', 1),
    410 : ('Can not resolve reference to file "%s"', 1),
    411 : ('Can resolve reference to file "%s" but not to fragment "%s"', 2),
    412 : ('Referencing element "%s" does not match target root element "%s"', 2),
    413 : ('Referencing element "%s" does not match target element "%s" for id="%s"', 3),
    414 : ('topicref element with format="ditamap" does not match target root element "%s"', 1),
    415 : ('topicref to <map> does not have format="ditamap" but format="%s"', 1),
    416 : ('topicref element type="%s" does not match target root element "%s"', 2),
    417 : ('topicref element type="%s" does not match target element "%s" for id="%s"', 3),
    418 : ('Unknown referencing element "%s" does not match target root element "%s"', 2),
    419 : ('Unknown referencing element "%s" does not match target element "%s" for id="%s"', 3),
    # 'file_set'
    500 : ('Not a directory: %s', 1),
    501 : ('Duplicate root id="%s" in files: %s', 2),
    #502 : ('Can not resolve reference to "%s"', 1),
    #503 : ('Reference type "%s" does not match target type "%s" for id="%s"', 3),
    504 : ('Duplicate file path: "%s"', 1),
    505 : ('Duplicate id="%s" in files: %s', 2),
    # 'topic_set'
    600 : ('Topic id="%s" is not referenced by any map', 1),
    # 'map_set'
    700 : ('More than one top level map exists: %s', 1),
    # Message corrected: it used to read "are in a a cycle."
    701 : ('Maps "%s" are in a cycle.', 1),
}

# Placeholder substituted for every argument when a generic message is wanted
GENERIC_STRING = '...'
# Width in characters of the banner lines written by the report functions
PRINT_WIDTH = 75

def genericStringForErrorCode(ec):
    """Return the message for error code ec with each argument replaced by
    GENERIC_STRING.

    ec - An integer key of PROBLEM_CODE_FORMAT (checked with an assert).
    """
    # 'in' replaces the deprecated dict.has_key()
    assert ec in PROBLEM_CODE_FORMAT
    fmt, numArgs = PROBLEM_CODE_FORMAT[ec]
    if numArgs == 0:
        # Nothing to substitute so the format string is the message
        return fmt
    return fmt % ((GENERIC_STRING,) * numArgs)

def writeGenericStringsForErrorCodes(s=sys.stdout):
    """Write a table of every error code and its generic message to s.

    s - A writable stream. The default is the sys.stdout that was current
        when this module was loaded.
    """
    s.write(' All Error Codes '.center(PRINT_WIDTH, '='))
    s.write('\n')
    s.write('%4s  %s\n' % ('Code', 'Error'))
    s.write('%4s  %s\n' % ('----', '-----'))
    # sorted() accepts any iterable of keys, unlike keys() followed by an
    # in-place sort(), which requires keys() to return a list.
    for ec in sorted(PROBLEM_CODE_FORMAT):
        s.write('%4d  %s\n' % (ec, genericStringForErrorCode(ec)))
    s.write('='*PRINT_WIDTH)
    s.write('\n\n')

def normalisePath(thePath):
    """Return thePath as an absolute path in which every backslash has been
    replaced by a forward slash."""
    # TODO: establish why a bare os.path.abspath(thePath) is not sufficient.
    absPath = os.path.abspath(thePath)
    return '/'.join(absPath.split('\\'))

# Default fnmatch patterns that select which files in a directory are read
FNMATCH_PATTERNS = ['*.xml', '*.dita', '*.ditamap']
# The same patterns as a single space separated string
FNMATCH_STRING = ' '.join(FNMATCH_PATTERNS)

# These elements descend from topic/xref so can be treated as referencing elements
# NOTE(review): 'cxxUnionEnumeratorInherited' is absent although the
# corresponding cxxClass... and cxxStruct... names are present - confirm
# whether that omission is intentional.
XREF_DESCENDENTS = set(
    (
        # From the api specialisation
        'apiRelation',
        'apiBaseClassifier',
        'apiOtherClassifier',
        'apiOperationClassifier',
        'apiValueClassifier',
        # From the C++ specialisation
        'cxxfile',
        'cxxclass',
        'cxxstruct',
        'cxxunion',
        'cxxfunction',
        'cxxdefine',
        'cxxtypedef',
        'cxxvariable',
        'cxxenumeration',
        'cxxClassBaseClass',
        'cxxClassBaseStruct',
        'cxxClassBaseUnion',
        'cxxClassNestedClass',
        'cxxClassNestedStruct',
        'cxxClassNestedUnion',
        'cxxClassEnumerationInherited',
        'cxxClassEnumeratorInherited',
        'cxxClassFunctionInherited',
        'cxxClassVariableInherited',
        'cxxDefineReimplemented',
        'cxxEnumerationReimplemented',
        'cxxFunctionReimplemented',
        'cxxStructBaseClass',
        'cxxStructBaseStruct',
        'cxxStructBaseUnion',
        'cxxStructNestedClass',
        'cxxStructNestedStruct',
        'cxxStructNestedUnion',
        'cxxStructEnumerationInherited',
        'cxxStructEnumeratorInherited',
        'cxxStructFunctionInherited',
        'cxxStructVariableInherited',
        'cxxTypedefReimplemented',
        'cxxUnionBaseClass',
        'cxxUnionBaseStruct',
        'cxxUnionBaseUnion',
        'cxxUnionNestedClass',
        'cxxUnionNestedStruct',
        'cxxUnionNestedUnion',
        'cxxUnionEnumerationInherited',
        'cxxUnionFunctionInherited',
        'cxxUnionVariableInherited',
        'cxxVariableReimplemented',
    )
)

class UrlAccessCache(object):
    """Remembers, per URL, whether the URL could be opened and read so that
    each URL is fetched at most once between calls to clear()."""
    def __init__(self):
        # {URL : True/False, ...}
        self._cache = {}
        
    def clear(self):
        """Forget every cached result."""
        self._cache = {}
        
    def canAccess(self, theUrl):
        """Return True if theUrl can be opened and read, False if doing so
        raises IOError. The result is cached; other exceptions propagate."""
        if theUrl not in self._cache:
            try:
                u = urllib.urlopen(theUrl)#, data, proxies)
                try:
                    u.read()
                finally:
                    # Release the handle whether or not the read succeeded
                    u.close()
                self._cache[theUrl] = True
                logging.debug('URL: %s  for %s' % (True, theUrl))
            except IOError:
                self._cache[theUrl] = False
                logging.debug('URL: %s for %s' % (False, theUrl))
        return self._cache[theUrl]

# Module level cache of URL accessibility results. DitaRef.checkUrl() consults
# it and DitaFileSet.__init__() clears it.
GlobalUrlCache = UrlAccessCache()
 
class DitaLinkCheckBase(object):
    """Base class that holds some common functionality.

    Each object has a read-only identity that drives equality, hashing and
    ordering, plus a store of errors keyed by PROBLEM_CODE_FORMAT codes.
    """
    def __init__(self, theIdentity):#=None):
        self.__identity = theIdentity
        # {error_code : set of argument tuples, ...}, created on first addError()
        self._errS = None
    
    @property
    def identity(self):
        """The value given to the constructor."""
        return self.__identity
    
    def __cmp__(self, other):
        # Only consulted by Python 2, and only where no rich comparison applies
        assert(self.identity is not None)
        assert(other.identity is not None)
        return cmp(self.identity, other.identity)

    def __eq__(self, other):
        assert(self.identity is not None)
        assert(other.identity is not None)
        return self.identity == other.identity

    def __ne__(self, other):
        # Kept explicitly consistent with __eq__
        return not self.__eq__(other)

    def __lt__(self, other):
        # list.sort() and sorted() only need '<'; defining it keeps ordering
        # by identity available where __cmp__ is not used.
        assert(self.identity is not None)
        assert(other.identity is not None)
        return self.identity < other.identity

    def __hash__(self):
        assert(self.identity is not None)
        return hash(self.identity)
    
    def __str__(self):
        return str(self.__identity)

    def debugDump(self, s=sys.stdout, prefix=''):
        """Dump of IR for debug purposes."""
        raise NotImplementedError
    
    def addError(self, errCode, argTuple):
        """Record an error.

        errCode - A key of PROBLEM_CODE_FORMAT.
        argTuple - A tuple of arguments for that code's format string; its
                    length must be the count declared for the code.
        Both conditions are checked with asserts. Recording the same
        (errCode, argTuple) twice has no further effect.
        """
        # Membership is tested on the dict itself rather than on keys()
        assert errCode in PROBLEM_CODE_FORMAT, 'No error code: %s' % errCode
        assert(PROBLEM_CODE_FORMAT[errCode][1] == len(argTuple)), \
            'Length missmatch for error code %d: %d != %d for %s' \
            % (errCode, PROBLEM_CODE_FORMAT[errCode][1], len(argTuple), str(argTuple))
        if self._errS is None:
            self._errS = {}
        self._errS.setdefault(errCode, set()).add(argTuple)

    def errStrings(self, generic, theFilter):
        """Return a sorted list of error messages without duplicates.

        generic - If True every argument is shown as the generic placeholder.
        theFilter - A container of error codes to include, or None for all.
        """
        if self._errS is None:
            return []
        mySet = set()
        for ec, argTupleS in self._errS.items():
            if theFilter is not None and ec not in theFilter:
                continue
            assert ec in PROBLEM_CODE_FORMAT
            for tu in argTupleS:
                if generic:
                    mySet.add(genericStringForErrorCode(ec))
                else:
                    f, c = PROBLEM_CODE_FORMAT[ec]
                    assert(len(tu) == c)
                    mySet.add(f % tu)
        return sorted(mySet)
    
    def updateErrorCount(self, theMap):
        """Updates a map of {error_code, : count, ...}.
        Overridden for file and file set.
        theMap must yield a number for codes not yet present (for example a
        CountDict) because the counts are added with +=."""
        if self._errS is not None:
            for e, argTupleS in self._errS.items():
                theMap[e] += len(argTupleS)
    
    def writeErrors(self, isGeneric, theFilter, theStream=sys.stdout):
        """Can be overridden in child classes to recurse into
        their data structures. No trailing newline is written."""
        theStream.write('\n'.join(self.errStrings(isGeneric, theFilter)))
    
class DitaId(DitaLinkCheckBase):
    """An element that carries an 'id' attribute; the id value is the identity."""
    def __init__(self, theN):
        """theN is an element object; it must have an 'id' attribute."""
        idValue = theN.get('id')
        assert(idValue is not None)
        super(DitaId, self).__init__(idValue)
        self._elem = theN.tag
        # A '#' would clash with the fragment separator used in references
        if self.id.find('#') != -1:
            self.addError(100, (self.id,))
        # TODO: NMTOKENS

    @property
    def id(self):
        """The id value, which is also the identity."""
        return self.identity

    @property
    def elem(self):
        """The tag of the element that carries the id."""
        return self._elem

    def checkGuid(self):
        """Optional extra check: records error 102 unless the id starts with
        text that RE_GUID matches."""
        if not RE_GUID.match(self.id):
            self.addError(102, (self.id,))

    def debugDump(self, s=sys.stdout, prefix=''):
        """Writes a one line description to s for debugging."""
        lineArgs = (prefix, self.elem, self.id)
        s.write('%sID:  <%s id="%s" />\n' % lineArgs)
        
class DitaRef(DitaLinkCheckBase):
    """Represents a reference node.

    The identity is '<element tag> <href value>'. An element with no href
    attribute is accepted: error 201 is recorded and the path, fragment and
    scheme properties are then None.
    """
    def __init__(self, theN):
        self._elem = theN.tag
        self._href = theN.get('href', None)
        super(DitaRef, self).__init__('%s %s' % (self._elem, self._href))
        # This is used when figuring out of the target is the correct element
        # e.g. in Vanilla DITA
        # <topicref href="batcaring.dita" type="task"></topicref>
        self._refType = theN.get('type', None)
        # Format attribute, this can be format="ditamap"
        self._format = theN.get('format', None)
        if self._href is None:
            self.addError(201, (self._elem,))
            self._url = None
        else:
            self._url = urlparse.urlparse(self._href)
            # urlparse splits at the first '#', so a '#' left in the
            # fragment means the href held more than one.
            if '#' in self._url.fragment:
                self.addError(200, (self._href,))

    @property
    def elem(self):
        """The tag of the referencing element."""
        return self._elem

    @property
    def href(self):
        """The value of the href attribute."""
        return self._href
    
    @property
    def refType(self):
        """The value of the type attribute."""
        return self._refType
    
    @property
    def format(self):
        """The value of the format attribute."""
        return self._format
    
    @property
    def path(self):
        """The value of the path part of the href attribute, or None if
        there is no href."""
        if self._url is None:
            return None
        return self._url.path
        
    @property
    def fragment(self):
        """The value of the fragment part of the href attribute, or None if
        there is no href."""
        if self._url is None:
            return None
        return self._url.fragment
        
    @property
    def scheme(self):
        """The URI scheme e.g. 'http' or '' if no scheme, or None if there
        is no href."""
        if self._url is None:
            return None
        return self._url.scheme
    
    def fileFragment(self, theRefFile):                               
        """The absolute path of the file and the fragment identifier or (None, None).

        theRefFile - Path of the file that holds this reference; relative
                    hrefs are resolved against its directory and an href
                    with an empty path refers to theRefFile itself.
        (None, None) is returned when there is no href or when the scheme is
        neither empty nor 'file'.
        """
        if self._url is None or self.scheme not in ('', 'file'):
            return (None, None)
        if len(self.path) == 0:
            myPath = theRefFile
        else:
            myPath = os.path.join(os.path.dirname(theRefFile), self.path)
        return normalisePath(myPath), self.fragment
    
    def checkGuid(self):
        """optionally applies additional checks for GUID requirements.
        Records error 203 if the path, and 204 if the fragment, does not
        start with text that RE_GUID matches. Does nothing when there is no
        href, since that is already reported as error 201."""
        if self._url is None:
            return
        if RE_GUID.match(self.path) is None:
            self.addError(203, (self.path,))
        if RE_GUID.match(self.fragment) is None:
            self.addError(204, (self.fragment,))                

    def checkUrl(self):
        """If the href has a scheme, records error 300 when GlobalUrlCache
        reports that the URL can not be accessed."""
        if self.scheme:
            myU = urlparse.urlunparse(self._url)
            if not GlobalUrlCache.canAccess(myU):
                self.addError(300, (myU,))

    def debugDump(self, s=sys.stdout, prefix=''):
        """Dump of IR for debug purposes."""
        s.write('%sREF: <%s href="%s" />\n' % (prefix, self.elem, self._href))

class DitaFileObj(DitaLinkCheckBase):
    """Base class for a DITA topic or map.

    On construction the XML is parsed and the root element name, the root
    ID, every element ID and every element with an href are recorded.
    Problems are recorded with addError() rather than raised.
    """
    def __init__(self, theFileObj, theFileName=None):
        """Initialiser with a file object and a file path.

        theFileObj - An open file-like object to parse, or None, which is
                    recorded as error 400.
        theFileName - Optional path. If given, its normalised form is the
                    identity; otherwise theFileObj.name is used.
        theFileObj is not closed here; that is left to the caller.
        """
        if theFileName is not None:
            super(DitaFileObj, self).__init__(normalisePath(theFileName))
        elif theFileObj is not None:
            super(DitaFileObj, self).__init__(theFileObj.name)
        else:
            super(DitaFileObj, self).__init__(None)
        self._rootId = None
        self._doctype = None
        # Sets of class DitaId
        self._idS = set()
        self._dupeIdS = set()
        # Set of class DitaRef
        self._xrefS = set()
        # Output control
        self._hasWritten = False
        # Size of input
        try:
            self._bytes = os.path.getsize(theFileName)
        except Exception:
            # Deliberately broad: best effort only.
            # Try as if a StringIO
            try:
                self._bytes = theFileObj.len
            except AttributeError:
                # Give up
                self._bytes = 0
        # Process the file object
        if theFileObj is not None:
            try:
                # TODO: use iterparse?
                theTree = etree.parse(theFileObj)
            except SyntaxError as err:
                self.addError(404, (str(err),))
            else:
                self._walkTree(theTree)
        else:
            self.addError(400, (self.identity,))

    def _walkTree(self, theTree):
        """Records the root element, the IDs and the references of a parsed tree."""
        # getiterator() is deprecated in favour of iter(), so iter() is used
        # whenever the tree provides it.
        treeWalk = getattr(theTree, 'iter', None) or theTree.getiterator
        for i, e in enumerate(treeWalk()):
            # Element [0] is the root element
            if i == 0:
                assert(self._rootId is None)
                assert(self._doctype is None)
                self._doctype = e.tag
                if e.get('id', None) is not None:
                    self._rootId = DitaId(e)
                    self._addId(self._rootId)
                else:
                    self.addError(402, ())
            else:
                # NOTE: Elements with id attributes can also have href
                # attributes. For example a <topicref> in a <bookmap>
                # Thus these tests are not exclusive
                if e.get('id', None) is not None:
                    self._addId(DitaId(e))
                if e.get('href', None) is not None:
                    # TODO: Do we limit ourselves to only a certain set of elements?
                    self._xrefS.add(DitaRef(e))
    
    def _addId(self, theId):
        """Adds a DitaId. On the second sighting of an ID it is moved from
        the set of IDs to the set of duplicates and error 401 is recorded;
        further sightings are ignored."""
        if theId in self._idS:
            self._idS.remove(theId)
            self._dupeIdS.add(theId)
            self.addError(401, (theId.identity,))
        elif theId not in self._dupeIdS:
            self._idS.add(theId)
    
    @property
    def bytes(self):
        """Size of the input, or 0 if it could not be determined."""
        return self._bytes
    
    @property
    def doctype(self):
        """The tag of the root element, or None if nothing was parsed."""
        return self._doctype
    
    @property
    def rootId(self):
        """The ID string of the root element, or None if it has none."""
        if self._rootId is not None:
            return self._rootId.id
        return None
    
    @property
    def isMap(self):
        """True if the root element is 'map', 'bookmap' or ends with 'Map'."""
        myDoctype = self.doctype
        if myDoctype is None:
            return False
        return myDoctype in ('map', 'bookmap') or myDoctype.endswith('Map')
    
    @property
    def idS(self):
        """The set of IDs."""
        return self._idS
    
    @property
    def refS(self):
        """The set of DitaRef objects."""
        return self._xrefS
    
    def idElemMap(self):
        """Returns a map {id : elem name, ...}."""
        retVal = {}
        for anId in self._idS:
            retVal[anId.id] = anId.elem
        return retVal
    
    def idObj(self, theString):
        """Returns the DitaId whose ID is theString, or None."""
        for anId in self._idS:
            if theString == anId.id:
                return anId
        return None

    def hasId(self, theString):
        """Returns True if there is a (non-duplicate) ID equal to theString."""
        return self.idObj(theString) is not None

    def idElem(self, theString):
        """Returns the element tag that carries the ID theString, or None."""
        anId = self.idObj(theString)
        if anId is not None:
            return anId.elem
        return None

    def updateErrorCount(self, theMap):
        """Updates a map of {error_code, : count, ...} with my errors and
        those of my IDs and Refs."""
        super(DitaFileObj, self).updateErrorCount(theMap)
        for idObj in self.idS:
            idObj.updateErrorCount(theMap)
        for refObj in self.refS:
            refObj.updateErrorCount(theMap)
    
    def writeErrorList(self, theList, theSubHead='', theS=sys.stdout):
        """Writes theList, sorted in place, one entry per line under an
        optional sub-heading. The 'File: ...' line is written once, before
        the first non-empty list. An empty list writes nothing."""
        if len(theList) > 0:
            theList.sort()
            if not self._hasWritten:
                theS.write('File: %s\n' % self.identity)
            self._hasWritten = True
            if len(theSubHead) > 0:
                theS.write('%s [%d]:\n' % (theSubHead, len(theList)))
            theS.write('\n'.join(theList))
            theS.write('\n')
    
    def writeErrors(self, isGeneric, theFilter, theStream=sys.stdout):
        """Writes out errors for me, my IDs and my Refs."""
        self._hasWritten = False
        # Duplicate IDs show up here as error 401, recorded by _addId()
        self.writeErrorList(self.errStrings(isGeneric, theFilter), 'File errors:', theStream)
        # Now IDs
        for anId in sorted(self.idS):
            self.writeErrorList(anId.errStrings(isGeneric, theFilter), 'ID=%s' % anId.identity, theStream)
        # Now Refs
        for aRef in sorted(self._xrefS):
            self.writeErrorList(aRef.errStrings(isGeneric, theFilter), 'Ref=%s' % aRef.identity, theStream)
        if self._hasWritten:
            theStream.write('\n')
    
    def debugDump(self, s=sys.stdout, prefix=''):
        """Dump of IR for debug purposes."""
        s.write('%sFile: %s\n' % (prefix, self.identity))
        for anId in self._idS:
            anId.debugDump(s, prefix=prefix+'  ')
        for aRef in self._xrefS:
            aRef.debugDump(s, prefix=prefix+'  ')
    
class DitaFilePath(DitaFileObj):
    """Base class for a DITA topic or map from the file system."""
    def __init__(self, theFilePath):
        """Initialiser with a file path. If the file can not be opened,
        error 400 is recorded instead of an exception being raised."""
        try:
            f = open(theFilePath)
        except IOError:
            f = None
        try:
            super(DitaFilePath, self).__init__(f, theFilePath)
        finally:
            # The parent parses everything in its constructor, so the file
            # can be released immediately.
            if f is not None:
                f.close()
        if f is None:
            # NOTE(review): the parent has already recorded error 400 with
            # the normalised path; this records it with the path as given.
            self.addError(400, (theFilePath,))
            
            
class DitaFileMapBase(object):
    """Interface for a store of {file path : class DitaFile, ...}.
    Concrete stores may keep the objects in memory or in a database such as
    the shelve module. Every method here raises NotImplementedError."""
    def keys(self):
        """Unsorted list of the paths held."""
        raise NotImplementedError
    
    def has_key(self, thePath):
        """True if thePath is held."""
        raise NotImplementedError
    
    def remove(self, thePath):
        """Drop the entry for thePath, may raise KeyError."""
        raise NotImplementedError
    
    def getDitaFileObj(self, thePath):
        """The DitaFileObj held for thePath, may raise KeyError."""
        raise NotImplementedError
        
    def setDitaFileObj(self, thePath, theObj):
        """Store a new DitaFileObj, or store again one that has been mutated."""
        raise NotImplementedError
        
class DitaFileMapInMemory(DitaFileMapBase):
    """Holds map of {file path : class DitaFile, ...} in memory."""
    def __init__(self):
        # Map of {file path : class DitaFile, ...}
        self._fileMap = {}
    
    def keys(self):
        """Returns an unsorted list of keys in the map."""
        # Always a new list, so callers are free to sort it in place
        return list(self._fileMap)
    
    def has_key(self, thePath):
        """Return True if the key exists."""
        # 'in' replaces the deprecated dict.has_key()
        return thePath in self._fileMap
        
    def remove(self, thePath):
        """Remove the entry corresponding to thePath, may raise KeyError."""
        del self._fileMap[thePath]
    
    def getDitaFileObj(self, thePath):
        """Return a DitaFileObj that corresponds to thePath, may raise KeyError."""
        return self._fileMap[thePath]
        
    def setDitaFileObj(self, thePath, theObj):
        """Load a DitaFileObj or update a mutated DitaFileObj."""
        self._fileMap[thePath] = theObj
        
class DitaFileMapShelve(DitaFileMapBase):
    """Holds map of {file path : class DitaFile, ...} in a shelve database.
    The database is created afresh, in the current working directory, each
    time an instance is constructed."""
    DBASE_FILENAME = 'linkchecker.dbase'
    def __init__(self):
        if os.path.exists(self.DBASE_FILENAME):
            os.remove(self.DBASE_FILENAME)
        # flag='n' asks for a new, empty database. The removal above is not
        # enough on its own because the underlying dbm library may store the
        # data under the given name plus an extension.
        self._db = shelve.open(self.DBASE_FILENAME, flag='n')
        # Use this as a 'cache' as shelf.keys() is slow
        self._keys = set()
    
    def keys(self):
        """Returns an unsorted list of keys in the map."""
        return list(self._keys)
    
    def has_key(self, thePath):
        """Return True if the key exists."""
        return thePath in self._keys
        
    def remove(self, thePath):
        """Remove the entry corresponding to thePath, may raise KeyError."""
        del self._db[thePath]
        self._keys.remove(thePath)
    
    def getDitaFileObj(self, thePath):
        """Return a DitaFileObj that corresponds to thePath, may raise KeyError.
        Changes made to the returned object are not stored until it is
        passed to setDitaFileObj()."""
        return self._db[thePath]
        
    def setDitaFileObj(self, thePath, theObj):
        """Load a DitaFileObj or update a mutated DitaFileObj."""
        self._db[thePath] = theObj
        self._keys.add(thePath)

    def close(self):
        """Close the shelve database. The map must not be used afterwards."""
        self._db.close()
        
class DitaFileSet(DitaLinkCheckBase):
    """Holds information about a set of DITA files."""
    STATS_KEYS = ('Maps', 'Non-maps', 'Files', 'Bytes', 'IDs', 'Refs')
    def __init__(self,
                 theDir,
                 procDir=True,
                 thePatterns=None,
                 recursive=False,
                 testExt=False,
                 useDbase=False):
        """Constructor. theDir is the root directory of DITA XML.
        procDir - If True then process this directory immediately, otherwise
                    the directory can be processed independently and
                    _addFileObj() or _addDitaFileObj() invoked.
        thePatterns - If supplied this should be a space separated string of
                        fnmatch extensions; a list of patterns is also
                        accepted. None means the default patterns.
        recursive - If True and procDir True the directory is processed recursively.
        testExt - If True then test external URLs.
        useDbase - If True then store all DitaFile objects in an external dbase
                    (slower but less memory issues).
        """
        if thePatterns is None:
            thePatterns = FNMATCH_STRING.split(' ')
        elif hasattr(thePatterns, 'split'):
            # A string, as documented. Left unsplit it would be iterated one
            # character at a time when matching file names.
            thePatterns = thePatterns.split()
        if theDir is not None:
            theDir = normalisePath(theDir)
        super(DitaFileSet, self).__init__(theDir)
        logging.info('DitaFileSet starting to read...')
        GlobalUrlCache.clear()
        self._testExt = testExt
        # Set up how we store the DitaFile objects
        if useDbase:
            self._fileMap = DitaFileMapShelve()
        else:
            self._fileMap = DitaFileMapInMemory()
        # Map of (str(rootId) : filepath, ...) with no duplicates
        # Keys will be in self._uniqueRootIds
        self._rootIdToFilePathMap = {}
        # Path to the unique DITA map
        self._uniqueMapPath = None
        # Count of {error_code : count, ...}
        self._errCountMap = CountDict()
        # Statistics
        self._statsMap = CountDict()
        ## and initialise
        #for k in self.STATS_KEYS:
        #    self._statsMap[k]
        # Finalisation control (weak)
        self._hasFinalised = False
        # Timers; _timeRead holds the start time until finalise() turns it
        # into an elapsed time.
        self._timeRead = time.clock()
        self._timeAnalyse = 0.0
        if procDir:
            if theDir is not None and os.path.isdir(theDir):
                self._readDir(theDir, thePatterns, recursive)
            else:
                self.addError(500, (theDir,))
            # Finalise and run all the tests
            self.finalise()
    
    @property
    def errCountMap(self):
        return self._errCountMap
    
    @property
    def statsMap(self):
        return self._statsMap
    
    def writeStatistics(self, s=sys.stdout):
        """Writes the read statistics and the two timings to the stream s,
        or 'Nothing processed.' if the statistics map is empty."""
        emit = s.write
        emit(' Statistics '.center(PRINT_WIDTH, '='))
        emit('\n')
        if len(self._statsMap) == 0:
            emit('Nothing processed.')
        else:
            for statName in self.STATS_KEYS:
                # NOTE(review): if _statsMap is still the CountDict created in
                # __init__ the lookup stores 0 for an unseen key and the
                # KeyError branch is never taken - confirm the intent.
                try:
                    inMega = self._statsMap[statName] / (1024.0*1024.0)
                    emit('%20s: %10d [%10.3f M]\n' % (statName, self._statsMap[statName], inMega))
                except KeyError:
                    emit('%20s: %10s \n' % (statName, 'Not seen'))
            for label, seconds in (('Read time', self._timeRead),
                                   ('Analysis time', self._timeAnalyse)):
                emit('%20s: %10.3f (s)\n' % (label, seconds))
            emit('='*PRINT_WIDTH)
        emit('\n')
        
    def writeErrorSummary(self, s=sys.stdout):
        """Writes a table of error code, count and generic message to the
        stream s in code order, or 'No errors' if nothing was counted."""
        emit = s.write
        emit(' Error Summary '.center(PRINT_WIDTH, '='))
        emit('\n')
        if len(self._errCountMap) == 0:
            emit('No errors\n')            
        else:
            headFormat = '%4s %10s %s\n'
            emit(headFormat % ('Code', 'Count', 'Error'))
            emit(headFormat % ('----', '-----', '-----'))
            for errCode in sorted(self._errCountMap.keys()):
                rowArgs = (errCode, self._errCountMap[errCode], genericStringForErrorCode(errCode))
                emit('%4d %10d %s\n' % rowArgs)
        emit('='*PRINT_WIDTH)
        emit('\n')
        
    def writeErrors(self, isGeneric, theFilter, theStream=sys.stdout):
        """Writes out errors for me and my files.

        isGeneric - If True arguments are shown as the generic placeholder.
        theFilter - A container of error codes to include, or None for all.
        My own errors come first, one per line, followed by each file in
        path order.
        """
        myErrS = self.errStrings(isGeneric, theFilter)
        if myErrS:
            theStream.write('\n'.join(myErrS))
            # End the last line so that the first 'File: ...' line written
            # below does not run on from it.
            theStream.write('\n')
        for aFile in sorted(self._fileMap.keys()):
            # Immutable call so just use get
            self._fileMap.getDitaFileObj(aFile).writeErrors(isGeneric, theFilter, theStream)
        
    def allErrStrings(self, isGeneric, theFilter):
        """Return a sorted list of error messages without duplicates including
        files. For each file this takes what its errStrings() returns.

        isGeneric - If True arguments are shown as the generic placeholder.
        theFilter - A container of error codes to include, or None for all.
        """
        retSet = set(self.errStrings(isGeneric, theFilter))
        # The visiting order is irrelevant as the result is sorted at the end
        for aFilePath in self._fileMap.keys():
            # Immutable call so just use get
            retSet.update(self._fileMap.getDitaFileObj(aFilePath).errStrings(isGeneric, theFilter))
        return sorted(retSet)
            
    def _readDir(self, theDir, thePatS, recursive):    
        """Adds every file in theDir whose name matches one of the patterns.

        theDir - An existing directory (checked with an assert).
        thePatS - An iterable of fnmatch patterns tested against file names.
        recursive - If True sub-directories are read as well.
        A file that can not be opened is still added, with a file object of
        None, so that the failure is recorded against it.
        """
        assert(os.path.isdir(theDir))
        for aName in os.listdir(theDir):
            aPath = os.path.join(theDir, aName)
            if os.path.isdir(aPath) and recursive:
                self._readDir(aPath, thePatS, recursive)
            elif os.path.isfile(aPath):
                # A file is added once however many patterns match it
                if not any(fnmatch.fnmatch(aName, aPat) for aPat in thePatS):
                    continue
                assert(not self._fileMap.has_key(aPath))
                logging.debug(' Reading %s' % aPath)
                try:
                    f = open(aPath)
                except IOError:
                    f = None
                try:
                    self._addFileObj(f, aPath)
                finally:
                    # DitaFileObj parses the file in its constructor so
                    # the handle is not needed after this point.
                    if f is not None:
                        f.close()

    def _addFileObj(self, theFileObj, theFilePath):
        """Builds a DitaFileObj from the file object and path and adds it."""
        self._addDitaFileObj(DitaFileObj(theFileObj, theFilePath))

    def _addDitaFileObj(self, theDitaFileObj):
        """Stores theDitaFileObj under its identity and updates the statistics.
        If the identity is already stored, error 504 is recorded and the
        stored object is left alone."""
        myPath = theDitaFileObj.identity
        if not self._fileMap.has_key(myPath):
            # Mutable call so use set
            self._fileMap.setDitaFileObj(myPath, theDitaFileObj)
        else:
            self.addError(504, (myPath,))
        # Update statistics (files, bytes, ids, refs) etc.
        # NOTE(review): this also happens for a rejected duplicate - confirm
        # that duplicates are meant to be counted.
        stats = self._statsMap
        stats['Files'] += 1
        stats['Bytes'] += theDitaFileObj.bytes
        stats['IDs'] += len(theDitaFileObj.idS)
        stats['Refs'] += len(theDitaFileObj.refS)
        if not theDitaFileObj.isMap:
            stats['Non-maps'] += 1
        else:
            stats['Maps'] += 1
    
    def finalise(self):
        """Creates the environment for all checks and then runs them.
        The work is done on the first call only; later calls just log.
        Also fixes the read time and measures the analysis time."""
        logging.info('DitaFileSet.finalise() start...')
        if not self._hasFinalised:
            # _timeRead was set to the start time in __init__; from here on
            # it holds the elapsed read time.
            self._timeRead = time.clock() - self._timeRead
            self._timeAnalyse = time.clock()
            self._initRootIdToFilePathMap()
            self._checkDupeIdS()
            self._setMapCycles()
            self._checkLonely()
            self._checkRefArcs()
            # Count the errors from scratch now that the checks have run
            self._errCountMap = CountDict()
            self.updateErrorCount(self._errCountMap)
            self._hasFinalised = True
            # Convert the analysis start time to an elapsed time
            self._timeAnalyse = time.clock() - self._timeAnalyse
        logging.info('DitaFileSet.finalise() done.')
        
    def _initRootIdToFilePathMap(self):
        """Builds self._rootIdToFilePathMap, {root ID : file path, ...}, for
        every root ID that is used by exactly one file.

        A root ID shared by several files is left out of that map and is
        reported once as error 501 together with the sorted tuple of the
        files that use it. Files whose rootId is None are ignored."""
        # Root IDs seen in exactly one file so far.
        uniqueRoots = {}
        self._rootIdToFilePathMap = uniqueRoots
        # Temporary {root ID : [file path, ...], ...} for the clashes.
        clashes = {}
        for fPath in self._fileMap.keys():
            # Read-only access, so no need to write the object back.
            fObj = self._fileMap.getDitaFileObj(fPath)
            rId = fObj.rootId
            if rId is None:
                continue
            if rId in clashes:
                # Third or later file with this root ID.
                clashes[rId].append(fObj.identity)
            elif rId in uniqueRoots:
                # Second file with this root ID: the ID is no longer unique
                # so move the first file over to the clash list.
                clashes[rId] = [uniqueRoots.pop(rId), fPath]
            else:
                uniqueRoots[rId] = fObj.identity
        # Report each duplicated root ID with its files in sorted order.
        for rId, paths in clashes.items():
            paths.sort()
            self.addError(501, (rId, tuple(paths)))
    
    def _checkDupeIdS(self):
        """Checks if there are any duplicate IDs anywhere.

        Each ID that is seen more than once is reported as error 505 with
        the sorted tuple of files it was seen in; the file of the first
        sighting comes first in the list before sorting."""
        # {ID : file the ID was first seen in, ...}
        firstSeenIn = {}
        # {ID : [file, ...], ...} for IDs seen more than once.
        dupes = {}
        for fPath in self._fileMap.keys():
            # Read-only access, so no need to write the object back.
            for anId in self._fileMap.getDitaFileObj(fPath).idS:
                if anId in firstSeenIn:
                    # The list is seeded with the first file the first time
                    # a repeat is detected.
                    dupes.setdefault(anId, [firstSeenIn[anId]]).append(fPath)
                else:
                    firstSeenIn[anId] = fPath
        # Emit one 505 error per duplicated ID.
        for anId, paths in dupes.items():
            paths.sort()
            self.addError(505, (anId, tuple(paths)))
                    
    def _retMapAdjList(self):
        """Returns an adjacency list {map identity : set(referenced file), ...}
        with one entry per map in the file set.

        The referenced file is the first item returned by fileFragment()
        for each reference in the map, resolved against the map itself."""
        graph = {}
        for key in self._fileMap.keys():
            candidate = self._fileMap.getDitaFileObj(key)
            if not candidate.isMap:
                continue
            src = candidate.identity
            # Every map identity must appear once only.
            assert(src not in graph)
            graph[src] = set(
                ref.fileFragment(src)[0] for ref in candidate.refS
            )
        return graph

    def _setMapCycles(self):
        """Finds cyclic references between DITA maps and records them as
        errors.

        A search is started from every map in the adjacency list; all the
        cycles found are gathered in one set and then handed to
        _setCycleErrors()."""
        graph = self._retMapAdjList()
        cycles = set()
        for startPath in graph:
            # Each search begins with a branch holding only the start map.
            self._recurseCycles(graph, [startPath], cycles)
        self._setCycleErrors(cycles)
            
    def _recurseCycles(self, a, b, c):
        assert(len(b) > 0)
        try:
            myPath = b[-1]
            for r in a[myPath]:
                #print '_recurseCycles() testing r', r
                #print '_recurseCycles() testing b', b
                if r in b:
                    #print 'Adding cycle', tuple(b[b.index(r):])
                    c.add(tuple(b[b.index(r):]))
                else:
                    b.append(r)
                    self._recurseCycles(a, b, c)
                    b.pop()
        except KeyError:
            pass
        
    def _setCycleErrors(self, theC):
        """Records error 701 for each cycle in theC.

        The cycle is recorded once on this set, as the string form of the
        tuple. It is then recorded on every file in the cycle, as the string
        form of a list that starts and ends with that file."""
        for cycle in theC:
            self.addError(701, (str(cycle),))
            members = list(cycle)
            assert(len(members) > 0)
            for shift in range(len(members)):
                # Rotate so that the file being annotated comes first and
                # repeat it at the end to close the loop.
                rotated = members[shift:] + members[:shift]
                head = rotated[0]
                closed = rotated + [head]
                # The file object is mutated so it is fetched with
                # getDitaFileObj() and written back with setDitaFileObj().
                fObj = self._fileMap.getDitaFileObj(head)
                fObj.addError(701, (str(closed),))
                self._fileMap.setDitaFileObj(head, fObj)

    def _checkLonely(self):
        """Runs the lonely map check followed by the lonely topic check."""
        self._checkLonelyMaps()
        self._checkLonelyTopics()
        
    def _checkLonelyMaps(self):
        """Checks for lonely maps, i.e. maps that no map refers to.

        Every map starts as a candidate and is struck off when a reference
        in any map resolves to it. Then:
        - more than one map left: error 700 is recorded for each of them.
        - exactly one map left: its path is stored in self._uniqueMapPath
          and no error is recorded.
        - no map left: nothing is recorded."""
        mapPathSet = set()
        pathSetRemain = set()
        for f in self._fileMap.keys():
            if self._fileMap.getDitaFileObj(f).isMap:
                mapPathSet.add(f)
                pathSetRemain.add(f)
        for aPath in mapPathSet:
            myMapObj = self._fileMap.getDitaFileObj(aPath)
            for r in myMapObj.refS:
                # Resolve the reference against the map that contains it.
                # This used to pass the stale loop variable from the loop
                # above, so every reference was resolved against whichever
                # key happened to be last.
                refFile, frag = r.fileFragment(aPath)
                # refFile may be a topic or an already seen map, in which
                # case there is nothing to strike off.
                pathSetRemain.discard(refFile)
        if len(pathSetRemain) > 1:
            for aPath in pathSetRemain:
                self.addError(700, (aPath,))
        elif len(pathSetRemain) == 1:
            self._uniqueMapPath = pathSetRemain.pop()

    def _checkLonelyTopics(self):
        """Checks for topics that are not referenced by any map.

        Every non-map file starts as a candidate and is struck off when a
        reference in any map resolves to it. Error 600 is recorded for each
        file that is left."""
        mapPaths = set()
        unreferenced = set()
        for aPath in self._fileMap.keys():
            if self._fileMap.getDitaFileObj(aPath).isMap:
                mapPaths.add(aPath)
            else:
                unreferenced.add(aPath)
        for aMapPath in mapPaths:
            for ref in self._fileMap.getDitaFileObj(aMapPath).refS:
                target = ref.fileFragment(aMapPath)[0]
                # No-op if the topic has already been seen in another map.
                unreferenced.discard(target)
        for aPath in unreferenced:
            self.addError(600, (aPath,))
            
    def _checkRefArcs(self):
        """Checks all references are reachable.

        For every reference in every file:
        - a reference with a URI scheme is only tested, with checkUrl(),
          when external testing was requested.
        - otherwise the target file is looked up in the file set. If it is
          not there and is not a file on disc either, error 410 is added to
          the source file.
        - if the target file is in the set and the reference has a fragment
          that the target does not contain, error 411 is added.
        - element name compatibility is then tested by
          _checkRefArcElemName().
        A source file that gained errors is written back to the file map."""
        for fPath in self._fileMap.keys():
            srcFile = self._fileMap.getDitaFileObj(fPath)
            dirty = False
            for ref in srcFile.refS:
                if ref.scheme:
                    # External URL, tested on request only.
                    # NOTE(review): the source file is not marked as changed
                    # after checkUrl(); confirm whether checkUrl() records
                    # anything that needs writing back.
                    if self._testExt:
                        ref.checkUrl()
                    continue
                tgtPath, tgtFrag = ref.fileFragment(fPath)
                assert(tgtPath is not None), 'fi is None for rObjSrc: %s in file: %s' % (ref, fPath)
                assert(tgtFrag is not None), 'fr is None for rObjSrc: %s in file: %s' % (ref, fPath)
                try:
                    tgtFile = self._fileMap.getDitaFileObj(tgtPath)
                except KeyError:
                    # Target file can not be found in the IR, check the file
                    # system to see if it is a non-DITA resource.
                    if not os.path.isfile(tgtPath):
                        srcFile.addError(410, (tgtPath,))
                        dirty = True
                    continue
                # Target file is found, test the fragment if there is one.
                if len(tgtFrag) > 0 and not tgtFile.hasId(tgtFrag):
                    srcFile.addError(411, (tgtPath, tgtFrag))
                    dirty = True
                if self._checkRefArcElemName(srcFile, ref, tgtFile, tgtFrag):
                    dirty = True
            if dirty:
                self._fileMap.setDitaFileObj(fPath, srcFile)

    def _checkRefArcElemName(self, fObjSrc, rObjSrc, fObjTgt, frag):
        """Tests that the referencing element is compatible with the element
        that it refers to, adding an error to fObjSrc when it is not.

        fObjSrc -- the file object that contains the reference.
        rObjSrc -- the reference object being tested.
        fObjTgt -- the file object that the reference resolves to.
        frag    -- the fragment part of the reference, '' for none, in which
                   case the target is the root element of fObjTgt.

        The rules, chosen by the name of the referencing element, are:
        - A name ending in 'Ref', e.g. <cxxClassRef>, must match the target
          element name with the 'Ref' removed, e.g. <cxxClass>.
          Errors 412 (root target) or 413 (fragment target).
        - <topicref> with format="ditamap" must target a <map> (error 414)
          and a <map> target must be referenced with format="ditamap"
          (error 415). Otherwise a type attribute, if present, must equal
          the target element name, e.g.
          <topicref href="batcaring.dita" type="task"></topicref>
          should match target element <task>.
          Errors 416 (root target) or 417 (fragment target).
        - <xref> and the members of XREF_DESCENDENTS are not tested.
        - Any other referencing element is reported as unknown.
          Errors 418 (root target) or 419 (fragment target).

        Returns True if an error was added to fObjSrc, False if the test
        was made and nothing was added. Returns None, which the caller
        treats as False, when the target element can not be resolved."""
        isRootTgt = False
        hasMutated = False
        if len(frag) == 0:
            # No fragment so iObjTgt is the root element of fObjTgt
            if fObjTgt.rootId is None or fObjTgt.idElem(fObjTgt.rootId) is None:
                # Covered by other error codes
                return
            iObjTgt = fObjTgt.idObj(fObjTgt.rootId)
            isRootTgt = True
        elif fObjTgt.hasId(frag):
            iObjTgt = fObjTgt.idObj(frag)
        else:
            # frag not found that will be a 411 error (handled by caller).
            return
        # Have an rObjSrc + iObjTgt so check elements
        # First case: a fooRef element must refer to a foo element
        if rObjSrc.elem.endswith('Ref'):
            if rObjSrc.elem[:-3] != iObjTgt.elem:
                if isRootTgt:
                    fObjSrc.addError(412, (rObjSrc.elem, iObjTgt.elem))
                else:
                    fObjSrc.addError(413, (fObjTgt.idElem(frag), rObjSrc.elem, frag))
                hasMutated = True
        # Second case(s) for vanilla DITA
        elif rObjSrc.elem == 'topicref':
            # Check DITA map links
            if rObjSrc.format == 'ditamap' and iObjTgt.elem != 'map':
                # Target must be a root element (actually we don't care)
                fObjSrc.addError(414, (iObjTgt.elem,))
                hasMutated = True
            elif iObjTgt.elem == 'map' and rObjSrc.format != 'ditamap':
                fObjSrc.addError(415, (rObjSrc.format,))
                hasMutated = True
            elif not (rObjSrc.format == 'ditamap' and iObjTgt.elem == 'map'):
                # Reached only when the format is not 'ditamap' and the
                # target is not a map, so this is a link to a topic.
                # Treat refType None as type="topic", see DITA standard for <topicref>
                # Well, also look at the type attribute in chapter 25
                # "When the type attribute is unspecified, it should be
                # determined by inspecting the target if possible. If the
                # target cannot be inspected for some reason, the value
                # should default to "topic".
                # Note: DITA 1.2 takes a different view...
                # Was:
                #if (rObjSrc.refType is None and iObjTgt.elem != 'topic') \
                #or (rObjSrc.refType is not None and rObjSrc.refType != iObjTgt.elem):
                # Now an absent type attribute is accepted for any target.
                if rObjSrc.refType is not None and rObjSrc.refType != iObjTgt.elem:
                    if isRootTgt:
                        fObjSrc.addError(416, (rObjSrc.refType, iObjTgt.elem,))
                        hasMutated = True
                    else:
                        fObjSrc.addError(417, (rObjSrc.refType, iObjTgt.elem, frag,))
                        hasMutated = True
                # Otherwise topicref looks OK
        elif rObjSrc.elem != 'xref' and rObjSrc.elem not in XREF_DESCENDENTS:
            # Unknown referencing element
            if isRootTgt:
                fObjSrc.addError(418, (rObjSrc.elem, fObjTgt.doctype))
                hasMutated = True
            else:
                fObjSrc.addError(419, (rObjSrc.elem, fObjTgt.idElem(frag), frag))
                hasMutated = True
        return hasMutated
                                        
    def updateErrorCount(self, theMap):
        """Updates a map of {error_code : count, ...} with the errors held
        by this set and by every file in it.

        theMap is incremented in place with '+=', so it has to supply a
        starting value for an error code that it has not seen before;
        finalise() passes in a CountDict."""
        ownErrs = self._errS
        if ownErrs is not None:
            for code in ownErrs.keys():
                theMap[code] += len(ownErrs[code])
        for aPath in self._fileMap.keys():
            fObj = self._fileMap.getDitaFileObj(aPath)
            fObj.updateErrorCount(theMap)
            # The call above is treated as a mutation of the file object so
            # the object is written back to the file map.
            self._fileMap.setDitaFileObj(aPath, fObj)

    def debugDump(self, s=sys.stdout, prefix=''):
        """Dump of IR for debug purposes.

        Writes a banner to the stream s, then the debugDump() of every file
        in sorted key order, passing on prefix, then a closing banner."""
        s.write(' Debug Dump '.center(PRINT_WIDTH, '+'))
        s.write('\n')
        for aPath in sorted(self._fileMap.keys()):
            fObj = self._fileMap.getDitaFileObj(aPath)
            fObj.debugDump(s, prefix)
        s.write(' END Debug Dump '.center(PRINT_WIDTH, '+'))
        s.write('\n\n')
    
#####################################
# Multiprocessing code
#####################################
def retDitaFileObj(thePath):
    """Returns DitaFilePath(thePath).

    retMpDitaFileSetObj() submits this function to its multiprocessing pool
    once for each path that it finds."""
    # NOTE(review): the function name suggests a DitaFileObj but the call is
    # to DitaFilePath, which is not defined in this part of the file;
    # confirm what is actually returned.
    return DitaFilePath(thePath)
 
def genDitaPath(theDir, thePatS, recursive):
    """Generates the path of every file in theDir whose name matches at
    least one of the fnmatch patterns in thePatS.

    Sub-directories are descended into only if recursive is true. A file is
    yielded once however many patterns it matches. theDir must be an
    existing directory."""
    assert(os.path.isdir(theDir))
    for entry in os.listdir(theDir):
        fullPath = os.path.join(theDir, entry)
        if recursive and os.path.isdir(fullPath):
            for subPath in genDitaPath(fullPath, thePatS, recursive):
                yield subPath
        elif os.path.isfile(fullPath):
            # any() stops at the first pattern that matches.
            if any(fnmatch.fnmatch(entry, pat) for pat in thePatS):
                yield fullPath
    
def retMpDitaFileSetObj(theDir,
                        thePatterns,
                        recursive,
                        numJobs, 
                        checkExt,
                        useDb):
    """Returns a finalised DitaFileSet for theDir, with the files read by a
    pool of worker processes.

    theDir      -- an existing directory.
    thePatterns -- fnmatch patterns that select the files to read.
    recursive   -- if true sub-directories are searched as well.
    numJobs     -- number of worker processes, 0 means one per CPU.
    checkExt    -- passed to DitaFileSet as testExt.
    useDb       -- passed to DitaFileSet as useDbase.

    Each matching path is handed to retDitaFileObj() in a worker and the
    results are added to the set in the order the paths were found. An
    exception raised in a worker is re-raised here by result.get(). The
    pool is always shut down before this function returns or raises."""
    assert(os.path.isdir(theDir))
    assert(numJobs >= 0)
    retObj = DitaFileSet(theDir, procDir=False, testExt=checkExt, useDbase=useDb)
    myNumJobs = numJobs
    if numJobs == 0:
        myNumJobs = multiprocessing.cpu_count()
        logging.info('Set multiprocessing number of jobs to %d' % myNumJobs)
    myPool = multiprocessing.Pool(processes=myNumJobs)
    try:
        myResults = [
            myPool.apply_async(retDitaFileObj, (f,))
                for f in genDitaPath(theDir, thePatterns, recursive)
        ]
        # Everything has been submitted so no further tasks are accepted.
        myPool.close()
        for result in myResults:
            myObj = result.get()
            logging.debug('Got %s' % myObj.identity)
            retObj._addDitaFileObj(myObj)
    finally:
        # Without this the worker processes are left behind. On the normal
        # path all the work is already done; after an error terminate()
        # abandons whatever is still outstanding.
        myPool.terminate()
        myPool.join()
    # Note: finalise() is a serial process
    logging.info('retMpDitaFileSetObj(): finalising')
    retObj.finalise()
    return retObj

######################################
# Test code
######################################
try:
    import cStringIO as StringIO
except ImportError:
    import StringIO

class NullClass(unittest.TestCase):
    """An empty unittest.TestCase subclass, it defines no fixtures or tests."""
    pass

class TestCountDict(unittest.TestCase):
    """Tests CountDict: reading a missing key gives zero and creates the
    entry, after which it can be incremented."""
    def setUp(self):
        """No fixture is needed."""

    def tearDown(self):
        """Nothing to clean up."""

    def testSetUpTearDown(self):
        """TestCountDict: test setUp() and tearDown()."""

    def test_basic(self):
        """TestCountDict: test basic functionality."""
        key = 'wtf'
        counts = CountDict()
        # The key is absent until it is first read...
        self.assertEqual(counts.has_key(key), False)
        # ...reading it gives zero...
        self.assertEqual(counts[key], 0)
        # ...and the read has created the entry.
        self.assertEqual(counts.has_key(key), True)
        counts[key] += 1
        self.assertEqual(counts[key], 1)

class TestDitaId(unittest.TestCase):
    """Tests DitaId, which is constructed here from a parsed XML element
    that has an 'id' attribute.

    Covered: the id and str() values, the GUID check, equality and hashing
    (both independent of the element name) and the error for a '#' in an
    id. Where errStrings() is called with True as its first argument the
    expected messages are the generic forms."""
    def setUp(self):
        pass
    
    def tearDown(self):
        pass
    
    def testSetUpTearDown(self):
        """DitaId: test setUp() and tearDown()."""
        pass
    
    def test_basic(self):
        """DitaId: basic read of an node with an id"""
        myXml = """<cxxClass id="class_big_endian"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaId(myTree.getroot())
        self.assertEqual(myObj.id, 'class_big_endian')
        self.assertEqual(str(myObj), 'class_big_endian')
        # A well formed id has no errors in either reporting mode.
        self.assertEqual(myObj.errStrings(True, None), [])
        self.assertEqual(myObj.errStrings(False, None), [])
        
    def test_guid_00(self):
        """DitaId: basic read of an node with an GUID id"""
        myXml = """<cxxClass id="GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaId(myTree.getroot())
        self.assertEqual(myObj.id, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        # The id has the GUID- prefix so checkGuid() adds no error.
        myObj.checkGuid()
        self.assertEqual(myObj.errStrings(True, None), [])
        self.assertEqual(myObj.errStrings(False, None), [])

    def test_guid_01(self):
        """DitaId: basic read of an node with an GUID id fails"""
        # As test_guid_00 but without the GUID- prefix.
        myXml = """<cxxClass id="25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaId(myTree.getroot())
        self.assertEqual(myObj.id, '25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        myObj.checkGuid()
        # Specific message, it contains the offending id.
        self.assertEqual(
            myObj.errStrings(False, None),
            [
             'GUID specification does not match id="25825EC4-341F-3EA4-94AA-7DCE380E6D2E"'
            ])
        # Generic message, the id is replaced by GENERIC_STRING.
        self.assertEqual(
            myObj.errStrings(True, None),
            [
             'GUID specification does not match id="%s"' % GENERIC_STRING,
            ])

    def test_cmp_eq_00(self):
        """DitaId: cmp(), == of two identical nodes"""
        myXml = """<cxxClass id="class_big_endian"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        # Two objects made from the same element.
        myObj_00 = DitaId(myTree.getroot())
        myObj_01 = DitaId(myTree.getroot())
        self.assertEqual(cmp(myObj_00, myObj_01), 0)
        self.assertEqual((myObj_00 == myObj_01), True)

    def test_cmp_eq_01(self):
        """DitaId: cmp(), == of two identical nodes from different elements."""
        # Same id on differently named elements still compares equal.
        myXml_00 = """<cxxClass id="big_endian"/>"""
        myTree_00 = etree.parse(StringIO.StringIO(myXml_00))
        myObj_00 = DitaId(myTree_00.getroot())
        myXml_01 = """<cxxStruct id="big_endian"/>"""
        myTree_01 = etree.parse(StringIO.StringIO(myXml_01))
        myObj_01 = DitaId(myTree_01.getroot())
        self.assertEqual(cmp(myObj_00, myObj_01), 0)
        self.assertEqual((myObj_00 == myObj_01), True)

    def test_set(self):
        """DitaId: read of an node with an id several times into a set and check unique,"""
        myXml = """<cxxClass id="class_big_endian"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        s = set()
        i = 0
        # Eight separate but equal objects must collapse to one set member.
        while i < 8:
            s.add(DitaId(myTree.getroot()))
            i += 1
        self.assertEqual(len(s), 1)
        self.assertEqual(DitaId(myTree.getroot()) in s, True)

    def test_map(self):
        """DitaId: read of an node with an id several times into a map and check unique,"""
        myXml = """<cxxClass id="class_big_endian"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        m = {}
        i = 0
        # Eight separate but equal objects must collapse to one dictionary key.
        while i < 8:
            m[DitaId(myTree.getroot())] = 1
            i += 1
        self.assertEqual(len(m), 1)
        self.assertEqual(m.has_key(DitaId(myTree.getroot())), True)

    def test_error_hash(self):
        """DitaId: error with a '#' in an id"""
        myXml = """<cxxClass id="class_#big_endian"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaId(myTree.getroot())
        # The id is kept as written, the problem is reported as an error.
        self.assertEqual(myObj.id, 'class_#big_endian')
        self.assertEqual(str(myObj), 'class_#big_endian')
        # No explicit check call is made, the error exists after construction.
        # The generic form is that of error code 100.
        self.assertEqual(
                myObj.errStrings(True, None),
                [
                    genericStringForErrorCode(100),
                ]
            )
        self.assertEqual(
                myObj.errStrings(False, None),
                [
                 'Character \'#\' not allowed in id="class_#big_endian"',
                 ]
            )
        


class TestDitaRef(unittest.TestCase):
    """Tests DitaRef, which is constructed here from a parsed XML element
    that normally has an 'href' attribute.

    Covered: how the href is split into scheme, path and fragment, how
    fileFragment() resolves the reference against the path of the file that
    contains it, the errors for a missing or malformed href and the GUID
    check. Where errStrings() is called with True as its first argument the
    expected messages are the generic forms."""
    def setUp(self):
        pass
    
    def tearDown(self):
        pass
    
    def testSetUpTearDown(self):
        """DitaRef: test setUp() and tearDown()."""
        pass
    
    def test_basic(self):
        """DitaRef: basic read of an xref node, no fragment"""
        myXml = """<xref href="class_big_endian"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'class_big_endian')
        self.assertEqual(myObj.path, 'class_big_endian')
        self.assertEqual(myObj.elem, 'xref')
        # str() is the element name followed by the href.
        self.assertEqual(str(myObj), 'xref class_big_endian')
        # Absent fragment and scheme are empty strings, not None.
        self.assertEqual(myObj.fragment, '')
        self.assertEqual(myObj.scheme, '')
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])

    def test_basic_frag(self):
        """DitaRef: basic read of an xref node, with fragment"""
        myXml = """<xref href="class_big_endian.xml#function"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'class_big_endian.xml#function')
        # The href is split at the '#'.
        self.assertEqual(myObj.path, 'class_big_endian.xml')
        self.assertEqual(myObj.fragment, 'function')
        self.assertEqual(myObj.scheme, '')
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])

    def test_file_frag_00(self):
        """DitaRef: accessing an xref node, with a file and a fragment"""
        myXml = """<xref href="class_big_endian.xml#function"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'class_big_endian.xml#function')
        self.assertEqual(myObj.path, 'class_big_endian.xml')
        self.assertEqual(myObj.fragment, 'function')
        self.assertEqual(myObj.scheme, '')
        # A bare file name resolves to the directory of the source file.
        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'class_big_endian.xml'))
        self.assertEqual(
            myObj.fileFragment(srcPath),
            (expPath, 'function')
        )
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])
        
    def test_file_frag_01(self):
        """DitaRef: accessing an xref node, with a file and a fragment and relative path with '\\'."""
        # Backslash separators in the href, '..' climbs out of 'spam'.
        myXml = """<xref href="..\\chips\\class_big_endian.xml#function"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'chips', 'class_big_endian.xml'))
        self.assertEqual(
            myObj.fileFragment(srcPath),
            (expPath, 'function')
        )
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])
        
    def test_file_frag_02(self):
        """DitaRef: accessing an xref node, with a file and a fragment and relative path with '/'."""
        # As test_file_frag_01 but with forward slash separators.
        myXml = """<xref href="../chips/class_big_endian.xml#function"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'chips', 'class_big_endian.xml'))
        self.assertEqual(
            myObj.fileFragment(srcPath),
            (expPath, 'function')
        )
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])
        
    def test_file_frag_03(self):
        """DitaRef: accessing an xref node, with a no file but with a fragment"""
        myXml = """<xref href="#function"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, '#function')
        self.assertEqual(myObj.path, '')
        self.assertEqual(myObj.fragment, 'function')
        self.assertEqual(myObj.scheme, '')
        # With no file part the reference resolves to the source file itself.
        srcPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
        expPath = normalisePath(os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml'))
        self.assertEqual(
            myObj.fileFragment(srcPath),
            (expPath, 'function')
        )
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])
        
    def test_basic_scheme(self):
        """DitaRef: an xref node with a URI scheme"""
        myXml = """<xref href="http://www.cwi.nl:80/%7Eguido/Python.html#fragment"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'http://www.cwi.nl:80/%7Eguido/Python.html#fragment')
        # The path excludes the host and port and is left percent-encoded.
        self.assertEqual(myObj.path, '/%7Eguido/Python.html')
        self.assertEqual(myObj.fragment, 'fragment')
        self.assertEqual(myObj.scheme, 'http')
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])

    def test_basic_scheme_file_frag(self):
        """DitaRef: an xref node with a URI scheme, invoking fileFragment()"""
        myXml = """<xref href="http://www.cwi.nl:80/%7Eguido/Python.html#fragment"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'http://www.cwi.nl:80/%7Eguido/Python.html#fragment')
        self.assertEqual(myObj.path, '/%7Eguido/Python.html')
        self.assertEqual(myObj.fragment, 'fragment')
        self.assertEqual(myObj.scheme, 'http')
        srcPath = os.path.join('C:%s' % os.sep, 'spam', 'eggs.xml')
        # A reference with a scheme has no local file so both parts are None.
        self.assertEqual(
            myObj.fileFragment(srcPath),
            (None, None)
        )
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])

    def test_fail_no_href(self):
        """DitaRef: Fails on an xref node with no href attribute"""
        myXml = """<xref />"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        # Construction succeeds, the problem is reported as an error.
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(
            myObj.errStrings(False, None),
            [
             'Reference element "xref" is missing href=... attribute',
             ]
        )
        self.assertEqual(
            myObj.errStrings(True, None),
            [
             'Reference element "%s" is missing href=... attribute' % GENERIC_STRING,
             ]
        )

    def test_fail_bad_frag(self):
        """DitaRef: Fails on an xref node with href attribute that has multiple '#' characters"""
        myXml = """<xref href="a#b#c" />"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        # Construction succeeds, the problem is reported as an error.
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(
            myObj.errStrings(False, None),
            [
             'Multiple \'#\' not allowed in reference "a#b#c"',
             ]
        )
        self.assertEqual(
            myObj.errStrings(True, None),
            [
             'Multiple \'#\' not allowed in reference "%s"' % GENERIC_STRING,
             ]
        )

    def test_guid_00(self):
        """DitaRef: basic read of an node with an GUID file/fragment reference"""
        myXml = """<xref href="GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        self.assertEqual(myObj.path, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml')
        self.assertEqual(myObj.elem, 'xref')
        self.assertEqual(str(myObj), 'xref GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        self.assertEqual(myObj.fragment, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        self.assertEqual(myObj.scheme, '')
        # checkGuid() is not called in this test.
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])

    def test_guid_01(self):
        """DitaRef: basic read of an node with an GUID file part fails"""
        # The file part is 'GUID-.xml', the fragment is a full GUID.
        myXml = """<xref href="GUID-.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'GUID-.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        self.assertEqual(myObj.path, 'GUID-.xml')
        self.assertEqual(myObj.elem, 'xref')
        self.assertEqual(str(myObj), 'xref GUID-.xml#GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        self.assertEqual(myObj.fragment, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E')
        # No errors until the GUID check is asked for.
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])
        myObj.checkGuid()
        # Only the file part is reported.
        self.assertEqual(
            myObj.errStrings(False, None),
            [
             'GUID specification does not match file reference "GUID-.xml"'
            ])
        # The generic form is that of error code 203.
        self.assertEqual(
            myObj.errStrings(True, None),
            [
             genericStringForErrorCode(203),
            ]
        )

    def test_guid_02(self):
        """DitaRef: basic read of an node with an GUID fragment part fails"""
        # The file part is a full GUID, the fragment is cut short.
        myXml = """<xref href="GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4"/>"""
        myTree = etree.parse(StringIO.StringIO(myXml))
        myObj = DitaRef(myTree.getroot())
        self.assertEqual(myObj.href, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4')
        self.assertEqual(myObj.path, 'GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml')
        self.assertEqual(myObj.elem, 'xref')
        self.assertEqual(str(myObj), 'xref GUID-25825EC4-341F-3EA4-94AA-7DCE380E6D2E.xml#GUID-25825EC4')
        self.assertEqual(myObj.fragment, 'GUID-25825EC4')
        # No errors until the GUID check is asked for.
        self.assertEqual(myObj.errStrings(False, None), [])
        self.assertEqual(myObj.errStrings(True, None), [])
        myObj.checkGuid()
        # Only the fragment part is reported.
        self.assertEqual(
            myObj.errStrings(False, None),
            [
             'GUID specification does not match fragment reference "GUID-25825EC4"'
            ])
        # The generic form is that of error code 204.
        self.assertEqual(
            myObj.errStrings(True, None),
            [
             genericStringForErrorCode(204),
            ]
        )

class TestDitaFile(unittest.TestCase):
    """Tests DitaFileObj against small in-memory XML documents."""

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def _read(self, theXml, thePath):
        """Returns a DitaFileObj created from the XML string theXml and
        identified by thePath."""
        return DitaFileObj(StringIO.StringIO(theXml), thePath)

    def testSetUpTearDown(self):
        """DitaFile: test setUp() and tearDown()."""
        pass

    def test_Basic(self):
        """DitaFile: basic read of an XML file"""
        ditaFile = self._read("""<?xml version='1.0' encoding='UTF-8' standalone='no'?>
<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
<cxxClass id="class_big_endian">
    <apiName>BigEndian</apiName>
    <shortdesc/>
    <cxxClassDetail>
        <cxxClassDefinition>
            <cxxClassAccessSpecifier value="public"/>
            <cxxClassAPIItemLocation>
                <cxxClassDeclarationFile name="filePath" value="K:/sf/os/commsfw/datacommsserver/esockserver/inc/es_sock.h"/>
                <cxxClassDeclarationFileLine name="lineNumber" value="1520"/>
                <cxxClassDefinitionFile name="filePath" value="K:/sf/os/commsfw/datacommsserver/esockserver/inc/es_sock.h"/>
                <cxxClassDefinitionFileLineStart name="lineNumber" value="1516"/>
                <cxxClassDefinitionFileLineEnd name="lineNumber" value="1526"/>
            </cxxClassAPIItemLocation>
        </cxxClassDefinition>
        <apiDesc>
            <p>Inserts and extracts integers in big-endian format.   </p>
        </apiDesc>
    </cxxClassDetail>
    <cxxFunction id="class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f">
    </cxxFunction>
    <cxxFunction id="class_big_endian_1aedf702f5c0118e4294d1a6d9684f8441">
    </cxxFunction>
    <cxxFunction id="class_big_endian_1ae266722f7bb965c971155a3315bad484">
    </cxxFunction>
    <cxxFunction id="class_big_endian_1a497d5248ea259f8490fb40ac4f2aafb2">
    </cxxFunction>
</cxxClass>""", 'foo')
        self.assertEqual(ditaFile.identity, normalisePath('foo'))
        self.assertEqual(ditaFile.doctype, 'cxxClass')
        self.assertEqual(ditaFile.rootId, 'class_big_endian')
        # The root element and every cxxFunction contribute one ID each.
        self.assertEqual(
            ditaFile.idElemMap(),
            {
                'class_big_endian': 'cxxClass',
                'class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f': 'cxxFunction',
                'class_big_endian_1aedf702f5c0118e4294d1a6d9684f8441': 'cxxFunction',
                'class_big_endian_1ae266722f7bb965c971155a3315bad484': 'cxxFunction',
                'class_big_endian_1a497d5248ea259f8490fb40ac4f2aafb2': 'cxxFunction',
            }
        )
        for isGeneric in (False, True):
            self.assertEqual(ditaFile.errStrings(isGeneric, None), [])

    def test_missing_file(self):
        """DitaFile: read of a missing XML file"""
        # A file object of None stands in for a file that could not be opened.
        ditaFile = DitaFileObj(None, 'foo')
        self.assertEqual(
            ditaFile.errStrings(False, None),
            ['Failed to open: "%s"' % normalisePath('foo')]
        )
        self.assertEqual(
            ditaFile.errStrings(True, None),
            [genericStringForErrorCode(400)]
        )

    def test_IllFormedFile(self):
        """DitaFile: read of an ill-formed XML file"""
        # The root element is never closed so the parse fails on line 4.
        ditaFile = self._read("""<?xml version='1.0' encoding='UTF-8' standalone='no'?>
<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
<cxxClass id="class_big_endian">
""", 'foo')
        self.assertEqual(ditaFile.identity, normalisePath('foo'))
        self.assertEqual(ditaFile.doctype, None)
        self.assertEqual(ditaFile.rootId, None)
        self.assertEqual(ditaFile.idElemMap(), {})
        self.assertEqual(
            ditaFile.errStrings(False, None),
            ['Can not parse: "no element found: line 4, column 0"']
        )
        self.assertEqual(
            ditaFile.errStrings(True, None),
            [genericStringForErrorCode(404)]
        )

    def test_missing_root_id(self):
        """DitaFile: read of an XML file with no id on root element"""
        ditaFile = self._read("""<?xml version='1.0' encoding='UTF-8' standalone='no'?>
<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
<cxxClass>
    <xref href="OtherClass">OtherClass</xref>
    <cxxFunction id="class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f"/>
</cxxClass>""", 'foo')
        self.assertEqual(ditaFile.identity, normalisePath('foo'))
        self.assertEqual(ditaFile.doctype, 'cxxClass')
        self.assertEqual(ditaFile.rootId, None)
        self.assertEqual(
            ditaFile.idElemMap(),
            {'class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f': 'cxxFunction'}
        )
        # The specific and the generic message for error 402 are the same string.
        for isGeneric in (False, True):
            self.assertEqual(
                ditaFile.errStrings(isGeneric, None),
                [genericStringForErrorCode(402)]
            )

    def test_duplicate_id(self):
        """DitaFile: duplicate IDs"""
        ditaFile = self._read("""<root id="AnID">
<elem id="AnID"/>
</root>""", 'spam.xml')
        self.assertEqual(ditaFile.identity, normalisePath('spam.xml'))
        self.assertEqual(ditaFile.doctype, 'root')
        self.assertEqual(ditaFile.rootId, 'AnID')
        self.assertEqual(ditaFile.idElemMap(), {})
        self.assertEqual(ditaFile.errStrings(False, None), ['Multiple id="AnID"'])
        self.assertEqual(ditaFile.errStrings(True, None), [genericStringForErrorCode(401)])

    def test_ismap_00(self):
        """DitaFile: Is a map for <map>."""
        ditaFile = self._read("""<map id="myMap"/>""", 'spam.xml')
        self.assertEqual(ditaFile.isMap, True)

    def test_ismap_01(self):
        """DitaFile: Is a map for <cxxAPIMap>."""
        ditaFile = self._read("""<cxxAPIMap id="myMap"/>""", 'spam.xml')
        self.assertEqual(ditaFile.isMap, True)

    def test_Basic_01(self):
        """DitaFile: read of a simple XML file with id and xref"""
        rootId = 'class_big_endian'
        funcId = 'class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f'
        ditaFile = self._read("""<?xml version='1.0' encoding='UTF-8' standalone='no'?>
<!DOCTYPE cxxClass PUBLIC "-//NOKIA//DTD DITA C++ API Class Reference Type v0.1.0//EN" "dtd/cxxClass.dtd" >
<cxxClass id="class_big_endian">
    <xref href="OtherClass">OtherClass</xref>
    <cxxFunction id="class_big_endian_1a9f78fb092e713acf6ffe3e8e11f1626f"/>
</cxxClass>""", 'foo')
        self.assertEqual(ditaFile.identity, normalisePath('foo'))
        self.assertEqual(ditaFile.doctype, 'cxxClass')
        self.assertEqual(ditaFile.rootId, rootId)
        self.assertEqual(ditaFile.isMap, False)
        # Two IDs (root and function) and a single reference (the xref).
        self.assertEqual(len(ditaFile.idS), 2)
        self.assertEqual(len(ditaFile.refS), 1)
        self.assertEqual(ditaFile.hasId(rootId), True)
        self.assertEqual(ditaFile.hasId(funcId), True)
        self.assertEqual(ditaFile.hasId('noID'), False)
        self.assertEqual(ditaFile.idElem(rootId), 'cxxClass')
        self.assertEqual(ditaFile.idElem('noID'), None)
        self.assertEqual(ditaFile.idElem(funcId), 'cxxFunction')
        self.assertEqual(
            ditaFile.idElemMap(),
            {
                funcId: 'cxxFunction',
                rootId: 'cxxClass',
            }
        )
        for isGeneric in (False, True):
            self.assertEqual(ditaFile.errStrings(isGeneric, None), [])

class TestDitaFileSet(unittest.TestCase):
    """Tests DitaFileSet with in-memory files: duplicates, lonely topics,
    map cycles and reference resolution."""

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def _fileSetOf(self, theFiles, **theOptions):
        """Returns a finalised DitaFileSet that has been given each
        (path, XML string) pair of theFiles in order. The set is created with
        procDir=False; theOptions are passed on as further keyword arguments."""
        fileSet = DitaFileSet(None, procDir=False, **theOptions)
        for aPath, aXml in theFiles:
            fileSet._addFileObj(StringIO.StringIO(aXml), aPath)
        fileSet.finalise()
        return fileSet

    def _cycleMessage(self, *thePaths):
        """Returns the specific error string expected for a cycle through
        the maps thePaths, in that order."""
        return 'Maps "%s" are in a a cycle.' % str(
            tuple(normalisePath(aPath) for aPath in thePaths)
        )

    def testSetUpTearDown(self):
        """DitaFileSet: test setUp() and tearDown()."""
        pass

    def test_None(self):
        """DitaFileSet: read of None."""
        fileSet = DitaFileSet(None)
        fileSet.finalise()
        self.assertEqual(fileSet.errStrings(False, None), ['Not a directory: None'])
        self.assertEqual(
            fileSet.errStrings(True, None),
            ['Not a directory: %s' % GENERIC_STRING]
        )
        self.assertEqual(fileSet.errCountMap, {500: 1})

    def test_basic(self):
        """DitaFileSet: Test reading a map and a couple of files."""
        fileSet = self._fileSetOf([
            ('map.ditamap', """<map id="map_00">
    <topicref href="spam.dita" />
    <topicref href="eggs.dita" />
</map>"""),
            ('spam.dita', '<topic id="spam"/>'),
            ('eggs.dita', '<topic id="eggs"/>'),
        ])
        for isGeneric in (False, True):
            self.assertEqual(fileSet.allErrStrings(isGeneric, None), [])
        self.assertEqual(fileSet.errCountMap, {})

    def test_duplicate_paths(self):
        """DitaFileSet: Test reading a couple of files in duplicate paths."""
        # Two different topics are registered under the same path.
        fileSet = self._fileSetOf([
            ('map.ditamap', """<map id="map_00">
    <topicref href="spam.dita" />
</map>"""),
            ('spam.dita', '<topic id="spam"/>'),
            ('spam.dita', '<topic id="eggs"/>'),
        ])
        self.assertEqual(
            fileSet.errStrings(False, None),
            ['Duplicate file path: "%s"' % normalisePath('spam.dita')]
        )
        self.assertEqual(fileSet.errStrings(True, None), [genericStringForErrorCode(504)])
        self.assertEqual(fileSet.errCountMap, {504: 1})

    def test_duplicate_ids(self):
        """DitaFileSet: Test reading a map and a couple of files with duplicate IDs."""
        # All three topics share the root id "chips".
        fileSet = self._fileSetOf([
            ('map.ditamap', """<map id="map_00">
    <topicref href="spam.dita" />
    <topicref href="eggs.dita" />
    <topicref href="chips.dita" />
</map>"""),
            ('spam.dita', '<topic id="chips"/>'),
            ('eggs.dita', '<topic id="chips"/>'),
            ('chips.dita', '<topic id="chips"/>'),
        ])
        self.assertEqual(
            fileSet.errStrings(True, None),
            [
                genericStringForErrorCode(505),
                genericStringForErrorCode(501),
            ]
        )
        # Both messages list the same three files.
        expErrs = [
            aFormat % tuple(
                [normalisePath(aPath) for aPath in ('chips.dita', 'eggs.dita', 'spam.dita')]
            )
            for aFormat in (
                """Duplicate id="chips" in files: ('%s', '%s', '%s')""",
                """Duplicate root id="chips" in files: ('%s', '%s', '%s')""",
            )
        ]
        actErrs = fileSet.errStrings(False, None)
        self.assertEqual(actErrs, expErrs)
        self.assertEqual(fileSet.errCountMap, {505: 1, 501: 1})

    def test_lonely_topics(self):
        """DitaFileSet: Test a couple of lonely topics."""
        # No map is present so neither topic can be referenced by one.
        fileSet = self._fileSetOf([
            ('spam', '<spam id="spam"/>'),
            ('eggs', '<eggs id="eggs"/>'),
        ])
        self.assertEqual(
            fileSet.errStrings(False, None),
            [
                'Topic id="%s" is not referenced by any map' % normalisePath('eggs'),
                'Topic id="%s" is not referenced by any map' % normalisePath('spam'),
            ]
        )
        self.assertEqual(
            fileSet.errStrings(True, None),
            [genericStringForErrorCode(600)]
        )

    def test_map_cycles_00(self):
        """DitaFileSet: Cyclic references between two maps."""
        fileSet = self._fileSetOf([
            ('map_00.ditamap', """<map id="map_00">
    <topicref href="map_01.ditamap" format="ditamap" />
</map>"""),
            ('map_01.ditamap', """<map id="map_01">
    <topicref href="map_00.ditamap" format="ditamap" />
</map>"""),
        ])
        # The cycle is reported once for each map that it can start from.
        self.assertEqual(
            fileSet.errStrings(False, None),
            [
                self._cycleMessage('map_00.ditamap', 'map_01.ditamap'),
                self._cycleMessage('map_01.ditamap', 'map_00.ditamap'),
            ]
        )
        self.assertEqual(
            fileSet.allErrStrings(True, None),
            [genericStringForErrorCode(701)]
        )
        self.assertEqual(fileSet.errCountMap, {701: 4})

    def test_map_cycles_01(self):
        """DitaFileSet: Cyclic references between three maps."""
        fileSet = self._fileSetOf([
            ('map_00.ditamap', """<map id="map_00">
    <topicref href="map_01.ditamap" format="ditamap" />
</map>"""),
            ('map_01.ditamap', """<map id="map_01">
    <topicref href="map_02.ditamap" format="ditamap" />
</map>"""),
            ('map_02.ditamap', """<map id="map_02">
    <topicref href="map_00.ditamap" format="ditamap" />
</map>"""),
        ])
        # The cycle is reported once for each map that it can start from.
        self.assertEqual(
            fileSet.errStrings(False, None),
            [
                self._cycleMessage('map_00.ditamap', 'map_01.ditamap', 'map_02.ditamap'),
                self._cycleMessage('map_01.ditamap', 'map_02.ditamap', 'map_00.ditamap'),
                self._cycleMessage('map_02.ditamap', 'map_00.ditamap', 'map_01.ditamap'),
            ]
        )
        self.assertEqual(
            fileSet.errStrings(True, None),
            [genericStringForErrorCode(701)]
        )
        self.assertEqual(fileSet.errCountMap, {701: 6})

    def test_refarc_00(self):
        """DitaFileSet: Test ref arcing - all resolve."""
        fileSet = self._fileSetOf([
            ('map.ditamap', """<map id="map_00">
    <topicref href="spam.dita#spam" />
    <topicref href="eggs.dita#eggs" />
</map>"""),
            ('spam.dita', '<topic id="spam"/>'),
            ('eggs.dita', '<topic id="eggs"/>'),
        ])
        self.assertEqual(fileSet.errCountMap, {})
        for isGeneric in (False, True):
            self.assertEqual(fileSet.allErrStrings(isGeneric, None), [])
        for isGeneric in (False, True):
            self.assertEqual(fileSet.errStrings(isGeneric, None), [])

    def test_refarc_fail_00(self):
        """DitaFileSet: Test ref arcing - can't find file."""
        # Only the map is added, neither of the referenced files exists.
        fileSet = self._fileSetOf([
            ('map.ditamap', """<map id="map_00">
    <topicref href="spam_.dita" />
    <topicref href="eggs_for_tea.dita" />
</map>"""),
        ])
        self.assertEqual(fileSet.errCountMap, {410: 2})
        self.assertEqual(
            fileSet.allErrStrings(False, None),
            [
                'Can not resolve reference to file "%s"' % normalisePath('eggs_for_tea.dita'),
                'Can not resolve reference to file "%s"' % normalisePath('spam_.dita'),
            ]
        )
        self.assertEqual(
            fileSet.allErrStrings(True, None),
            ['Can not resolve reference to file "..."']
        )
        # The failures are not reported by errStrings() on the file set itself.
        for isGeneric in (False, True):
            self.assertEqual(fileSet.errStrings(isGeneric, None), [])

    def test_refarc_fail_01(self):
        """DitaFileSet: Test ref arcing - can't find fragment."""
        # The files exist but the fragments have a trailing underscore.
        fileSet = self._fileSetOf([
            ('map.ditamap', """<map id="map_00">
    <topicref href="spam.dita#spam_" />
    <topicref href="eggs.dita#eggs_" />
</map>"""),
            ('spam.dita', '<spam id="spam"/>'),
            ('eggs.dita', '<eggs id="eggs"/>'),
        ])
        self.assertEqual(fileSet.errCountMap, {411: 2})
        self.assertEqual(
            fileSet.allErrStrings(False, None),
            [
                'Can resolve reference to file "%s" but not to fragment "eggs_"' % normalisePath('eggs.dita'),
                'Can resolve reference to file "%s" but not to fragment "spam_"' % normalisePath('spam.dita'),
            ]
        )
        self.assertEqual(
            fileSet.allErrStrings(True, None),
            [
                'Can resolve reference to file "%s" but not to fragment "%s"' % (GENERIC_STRING, GENERIC_STRING),
            ]
        )
        # The failures are not reported by errStrings() on the file set itself.
        for isGeneric in (False, True):
            self.assertEqual(fileSet.errStrings(isGeneric, None), [])

    def test_refarc_url_00(self):
        """DitaFileSet: Test ref arcing - URL."""
        # NOTE(review): with testExt=True the external URLs are presumably
        # fetched, in which case this test needs network access - confirm.
        fileSet = self._fileSetOf(
            [
                ('map.ditamap', """<map id="map_00">
    <topicref href="spam.dita#spam" />
    <topicref href="eggs.dita#eggs" />
</map>"""),
                ('spam.dita', """<topic id="spam">
        <xref href="http://www.nokia.com">Nokia</xref>
</topic>"""),
                ('eggs.dita', """<topic id="eggs">
        <xref href="http://www.google.com">Google</xref>
</topic>"""),
            ],
            testExt=True
        )
        self.assertEqual(fileSet.errCountMap, {})
        for isGeneric in (False, True):
            self.assertEqual(fileSet.allErrStrings(isGeneric, None), [])
        for isGeneric in (False, True):
            self.assertEqual(fileSet.errStrings(isGeneric, None), [])

class TestDitaBookmapFileSet(unittest.TestCase):
    """Tests DitaFileSet with a bookmap as the referencing map."""

    # A bookmap whose frontmatter references a single concept topic.
    _BOOKMAP_XML = """<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE bookmap PUBLIC "-//OASIS//DTD DITA BookMap//EN"
"bookmap.dtd">
<bookmap id="GUID-5BDFDB6B-7801-4804-9F41-2BDC5BE53DDF">
  <booktitle>
    <mainbooktitle>My Bookmap</mainbooktitle>
    <booktitlealt>Alternate title</booktitlealt>
  </booktitle>
  <frontmatter id="GUID-DA857913-F826-4CF7-A135-93F2AEB48353">
    <topicref href="GUID-00025EAD-C4B6-5408-96A3-FFDBBBDC7CAB.dita" id="GUID-994B1764-393F-401F-8571-CE0955AB6CA6" />
  </frontmatter>
</bookmap>
"""

    # The concept topic that the bookmap refers to.
    _CONCEPT_XML = """<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE concept  PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="GUID-00025EAD-C4B6-5408-96A3-FFDBBBDC7CAB" xml:lang="en">
    <title>How to read and write a file</title>
</concept>
"""

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def testSetUpTearDown(self):
        """TestDitaBookmapFileSet: test setUp() and tearDown()."""
        pass

    def test_basic(self):
        """TestDitaBookmapFileSet: Test reading a bookmap and a topic."""
        fileSet = DitaFileSet(None, procDir=False)
        fileSet._addFileObj(StringIO.StringIO(self._BOOKMAP_XML), 'bookmap.ditamap')
        fileSet._addFileObj(
            StringIO.StringIO(self._CONCEPT_XML),
            'GUID-00025EAD-C4B6-5408-96A3-FFDBBBDC7CAB.dita'
        )
        fileSet.finalise()
        # The topic is referenced and resolvable so nothing should be reported.
        for isGeneric in (False, True):
            self.assertEqual(fileSet.allErrStrings(isGeneric, None), [])
        self.assertEqual(fileSet.errCountMap, {})

class Special(unittest.TestCase):
    """Empty test case that currently defines no tests.
    Presumably a holding place for one-off tests - confirm with the author."""

def unitTest(theVerbosity=2):
    """Runs the tests of every listed test case class with a text test runner
    at the given verbosity.

    Returns a tuple of (tests run, number of errors, number of failures)."""
    testClasses = (
        NullClass,
        TestCountDict,
        TestDitaId,
        TestDitaRef,
        TestDitaFile,
        TestDitaFileSet,
        TestDitaBookmapFileSet,
        Special,
    )
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    # Tests are added class by class so they run in the order listed above.
    for aClass in testClasses:
        suite.addTests(loader.loadTestsFromTestCase(aClass))
    result = unittest.TextTestRunner(verbosity=theVerbosity).run(suite)
    return (result.testsRun, len(result.errors), len(result.failures))

######################################
# main() stuff
######################################
def main():
    """Command line entry point: checks the links in a directory of DITA XML.

    Parses the command line, optionally runs the unit tests (-u) and/or lists
    the detectable error types (-?). Then, given exactly one directory
    argument, builds the file set (using multiprocessing if --jobs > -1),
    writes the statistics and the error summary and, depending on --file,
    the generic or specific problems filtered by --errors.

    Returns 0 on success, or 1 if -u was given and any unit test failed or
    raised an error. Usage errors do not return: OptionParser.error() prints
    the message to stderr and exits the process with status 2.
    """
    print('CMD: %s' % ' '.join(sys.argv))
    usage = "usage: %prog [options] <Directory of XML content>"
    parser = OptionParser(usage, version='%prog ' + __version__)
    parser.add_option("-d", action="store_true", dest="dump", default=False, 
                      help="Dump internal representation. [default: %default]")
    parser.add_option(
            "-e", "--errors",
            type="str",
            dest="error_codes",
            default='All',
            help="Only report on certain error codes (space seperated list). [default: \"%default\"]"
        )      
    parser.add_option("-f", "--file", dest="file", type="str", default='None', 
                      help="Report of errors by file either 'None', 'generic', 'specific'. [default: %default]")
    # NOTE(review): options.guid is parsed but never read in this function,
    # so -g currently has no effect here - confirm whether it should be
    # passed on to the file set.
    parser.add_option("-g", action="store_true", dest="guid", default=False, 
                      help="Enforce GUID specification. [default: %default]")
    parser.add_option(
            "-j", "--jobs",
            type="int",
            dest="jobs",
            default=-1,
            help="Max processes when multiprocessing. 0 takes CPUs, -1 no MP. [default: %default]"
        )      
    parser.add_option(
            "-l", "--loglevel",
            type="int",
            dest="loglevel",
            default=20,
            help="Log Level (debug=10, info=20, warning=30, [error=40], critical=50) [default: %default]"
        )      
    parser.add_option(
            "-p", "--pattern",
            type="str",
            dest="pattern",
            default=FNMATCH_STRING,
            help="Pattern match. [default: \"%default\"]"
        )      
    parser.add_option("-r", action="store_true", dest="recursive", default=False, 
                      help="Recursive. [default: %default]")
    parser.add_option("-s", action="store_true", dest="shelve", default=False, 
                      help="Use the shelve dBase rather than storing the internal representation in memory. This is slower but is useful for large data sets where a memory error might occur. [default: %default]")
    parser.add_option("-u", action="store_true", dest="unit_test", default=False, 
                      help="Execute unit tests and exit. [default: %default]")
    parser.add_option("-x", action="store_true", dest="ext_url", default=False, 
                      help="Test external |URLs. [default: %default]")
    parser.add_option("-?", action="store_true", dest="query_errors", default=False, 
                      help="Display the error types that are detected. [default: %default]")
    (options, args) = parser.parse_args()
    logging.basicConfig(
        level=options.loglevel,
        format='%(asctime)s %(levelname)-8s %(message)s',
        stream=sys.stdout,
    )
    # parser.error() exits the process, so nothing after a call to it runs.
    if options.file not in ('None', 'generic', 'specific'):
        parser.error("--file option must be: 'None' | 'generic' | 'specific'")
    exitCode = 0
    if options.unit_test:
        # unitTest() returns (tests run, errors, failures); any problem is
        # reported through the exit status rather than being discarded.
        (_testsRun, numErrors, numFailures) = unitTest()
        if numErrors or numFailures:
            exitCode = 1
    if options.query_errors:
        writeGenericStringsForErrorCodes()
    if len(args) > 1:
        parser.error("Too many arguments, I need only one.")
    if len(args) < 1:
        # -u and -? are complete requests on their own, no path is needed.
        if options.unit_test or options.query_errors:
            return exitCode
        parser.print_help()
        parser.error("I can't do much without a path to the XML content.")
    # Exactly one argument from here on. As before, a path given together
    # with -u is still processed after the unit tests have run.
    # Validate --errors before the potentially long processing of the files.
    errFilter = set(PROBLEM_CODE_FORMAT.keys())
    if options.error_codes != 'All':
        try:
            errFilter = set([int(i) for i in options.error_codes.split()])
        except ValueError:
            parser.error(
                "--errors option must be 'All' or a space separated list"
                " of integer error codes, not \"%s\"" % options.error_codes
            )
    if options.jobs > -1:
        myObj = retMpDitaFileSetObj(
                    args[0],
                    options.pattern.split(' '),
                    options.recursive,
                    options.jobs,
                    options.ext_url,
                    options.shelve,
                    )
    else:
        myObj = DitaFileSet(args[0],
                            procDir=True,
                            thePatterns=options.pattern.split(' '),
                            recursive=options.recursive,
                            testExt=options.ext_url,
                            useDbase=options.shelve,
                            )
    if options.dump:
        myObj.debugDump()
    myObj.writeStatistics()
    myObj.writeErrorSummary()
    # TODO: Write out the results in different ways
    if options.file == 'generic':
        print('Generic problems:')
        myObj.writeErrors(True, errFilter)
    elif options.file == 'specific':
        print('Specific problems:')
        myObj.writeErrors(False, errFilter)
    return exitCode

if __name__ == '__main__':
    # freeze_support() has to be called before anything else so that a
    # frozen Windows executable can start multiprocessing worker processes.
    multiprocessing.freeze_support()
    exitStatus = main()
    sys.exit(exitStatus)