python-2.5.2/win32/Lib/csv.py
changeset 0 ae805ac0140d
equal deleted inserted replaced
-1:000000000000 0:ae805ac0140d
       
     1 
       
     2 """
       
     3 csv.py - read/write/investigate CSV files
       
     4 """
       
     5 
       
     6 import re
       
     7 from _csv import Error, __version__, writer, reader, register_dialect, \
       
     8                  unregister_dialect, get_dialect, list_dialects, \
       
     9                  field_size_limit, \
       
    10                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
       
    11                  __doc__
       
    12 from _csv import Dialect as _Dialect
       
    13 
       
    14 try:
       
    15     from cStringIO import StringIO
       
    16 except ImportError:
       
    17     from StringIO import StringIO
       
    18 
       
    19 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
       
    20             "Error", "Dialect", "excel", "excel_tab", "reader", "writer",
       
    21             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
       
    22             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
       
    23 
       
class Dialect:
    """Describe a CSV dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    Creating an instance validates the attribute values by handing the
    instance to _csv.Dialect; a TypeError raised by that check is
    re-raised as csv.Error.
    """
    _name = ""
    # Set to True by __init__ on instances of subclasses only; instances
    # of this base class itself keep the class-level False.
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses are marked valid; the bare base class carries
        # nothing but the None placeholders above.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Check the dialect attributes via _csv.Dialect.

        Raises Error (with the TypeError's message) when _csv.Dialect
        rejects the instance with a TypeError.
        """
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
       
    54 
       
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','             # fields are comma-separated
    quotechar = '"'             # fields are quoted with double quotes
    doublequote = True
    skipinitialspace = False
    lineterminator = '\r\n'     # rows end with CR LF
    quoting = QUOTE_MINIMAL
# Register the class under the name "excel"; DictReader and DictWriter
# use that name as their default dialect argument.
register_dialect("excel", excel)
       
    64 
       
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the excel dialect except for the field separator.
    delimiter = '\t'
# Note the registered name uses a hyphen ("excel-tab"), unlike the
# class name, which uses an underscore.
register_dialect("excel-tab", excel_tab)
       
    69 
       
    70 
       
    71 class DictReader:
       
    72     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
       
    73                  dialect="excel", *args, **kwds):
       
    74         self.fieldnames = fieldnames    # list of keys for the dict
       
    75         self.restkey = restkey          # key to catch long rows
       
    76         self.restval = restval          # default value for short rows
       
    77         self.reader = reader(f, dialect, *args, **kwds)
       
    78 
       
    79     def __iter__(self):
       
    80         return self
       
    81 
       
    82     def next(self):
       
    83         row = self.reader.next()
       
    84         if self.fieldnames is None:
       
    85             self.fieldnames = row
       
    86             row = self.reader.next()
       
    87 
       
    88         # unlike the basic reader, we prefer not to return blanks,
       
    89         # because we will typically wind up with a dict full of None
       
    90         # values
       
    91         while row == []:
       
    92             row = self.reader.next()
       
    93         d = dict(zip(self.fieldnames, row))
       
    94         lf = len(self.fieldnames)
       
    95         lr = len(row)
       
    96         if lf < lr:
       
    97             d[self.restkey] = row[lf:]
       
    98         elif lf > lr:
       
    99             for key in self.fieldnames[lr:]:
       
   100                 d[key] = self.restval
       
   101         return d
       
   102 
       
   103 
       
   104 class DictWriter:
       
   105     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
       
   106                  dialect="excel", *args, **kwds):
       
   107         self.fieldnames = fieldnames    # list of keys for the dict
       
   108         self.restval = restval          # for writing short dicts
       
   109         if extrasaction.lower() not in ("raise", "ignore"):
       
   110             raise ValueError, \
       
   111                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
       
   112                    extrasaction)
       
   113         self.extrasaction = extrasaction
       
   114         self.writer = writer(f, dialect, *args, **kwds)
       
   115 
       
   116     def _dict_to_list(self, rowdict):
       
   117         if self.extrasaction == "raise":
       
   118             for k in rowdict.keys():
       
   119                 if k not in self.fieldnames:
       
   120                     raise ValueError, "dict contains fields not in fieldnames"
       
   121         return [rowdict.get(key, self.restval) for key in self.fieldnames]
       
   122 
       
   123     def writerow(self, rowdict):
       
   124         return self.writer.writerow(self._dict_to_list(rowdict))
       
   125 
       
   126     def writerows(self, rowdicts):
       
   127         rows = []
       
   128         for rowdict in rowdicts:
       
   129             rows.append(self._dict_to_list(rowdict))
       
   130         return self.writer.writerows(rows)
       
   131 
       
# Guard Sniffer's type checking against builds that exclude complex().
# Sniffer.has_header() lists complex among the types it tries; if the
# name is undefined, alias it to float so that lookup still succeeds.
try:
    complex
except NameError:
    complex = float
       
   137 
       
   138 class Sniffer:
       
   139     '''
       
   140     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
       
   141     Returns a Dialect object.
       
   142     '''
       
   143     def __init__(self):
       
   144         # in case there is more than one possible delimiter
       
   145         self.preferred = [',', '\t', ';', ' ', ':']
       
   146 
       
   147 
       
   148     def sniff(self, sample, delimiters=None):
       
   149         """
       
   150         Returns a dialect (or None) corresponding to the sample
       
   151         """
       
   152 
       
   153         quotechar, delimiter, skipinitialspace = \
       
   154                    self._guess_quote_and_delimiter(sample, delimiters)
       
   155         if not delimiter:
       
   156             delimiter, skipinitialspace = self._guess_delimiter(sample,
       
   157                                                                 delimiters)
       
   158 
       
   159         if not delimiter:
       
   160             raise Error, "Could not determine delimiter"
       
   161 
       
   162         class dialect(Dialect):
       
   163             _name = "sniffed"
       
   164             lineterminator = '\r\n'
       
   165             quoting = QUOTE_MINIMAL
       
   166             # escapechar = ''
       
   167             doublequote = False
       
   168 
       
   169         dialect.delimiter = delimiter
       
   170         # _csv.reader won't accept a quotechar of ''
       
   171         dialect.quotechar = quotechar or '"'
       
   172         dialect.skipinitialspace = skipinitialspace
       
   173 
       
   174         return dialect
       
   175 
       
   176 
       
   177     def _guess_quote_and_delimiter(self, data, delimiters):
       
   178         """
       
   179         Looks for text enclosed between two identical quotes
       
   180         (the probable quotechar) which are preceded and followed
       
   181         by the same character (the probable delimiter).
       
   182         For example:
       
   183                          ,'some text',
       
   184         The quote with the most wins, same with the delimiter.
       
   185         If there is no quotechar the delimiter can't be determined
       
   186         this way.
       
   187         """
       
   188 
       
   189         matches = []
       
   190         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
       
   191                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
       
   192                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
       
   193                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
       
   194             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
       
   195             matches = regexp.findall(data)
       
   196             if matches:
       
   197                 break
       
   198 
       
   199         if not matches:
       
   200             return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
       
   201 
       
   202         quotes = {}
       
   203         delims = {}
       
   204         spaces = 0
       
   205         for m in matches:
       
   206             n = regexp.groupindex['quote'] - 1
       
   207             key = m[n]
       
   208             if key:
       
   209                 quotes[key] = quotes.get(key, 0) + 1
       
   210             try:
       
   211                 n = regexp.groupindex['delim'] - 1
       
   212                 key = m[n]
       
   213             except KeyError:
       
   214                 continue
       
   215             if key and (delimiters is None or key in delimiters):
       
   216                 delims[key] = delims.get(key, 0) + 1
       
   217             try:
       
   218                 n = regexp.groupindex['space'] - 1
       
   219             except KeyError:
       
   220                 continue
       
   221             if m[n]:
       
   222                 spaces += 1
       
   223 
       
   224         quotechar = reduce(lambda a, b, quotes = quotes:
       
   225                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
       
   226 
       
   227         if delims:
       
   228             delim = reduce(lambda a, b, delims = delims:
       
   229                            (delims[a] > delims[b]) and a or b, delims.keys())
       
   230             skipinitialspace = delims[delim] == spaces
       
   231             if delim == '\n': # most likely a file with a single column
       
   232                 delim = ''
       
   233         else:
       
   234             # there is *no* delimiter, it's a single column of quoted data
       
   235             delim = ''
       
   236             skipinitialspace = 0
       
   237 
       
   238         return (quotechar, delim, skipinitialspace)
       
   239 
       
   240 
       
   241     def _guess_delimiter(self, data, delimiters):
       
   242         """
       
   243         The delimiter /should/ occur the same number of times on
       
   244         each row. However, due to malformed data, it may not. We don't want
       
   245         an all or nothing approach, so we allow for small variations in this
       
   246         number.
       
   247           1) build a table of the frequency of each character on every line.
       
   248           2) build a table of freqencies of this frequency (meta-frequency?),
       
   249              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
       
   250              7 times in 2 rows'
       
   251           3) use the mode of the meta-frequency to determine the /expected/
       
   252              frequency for that character
       
   253           4) find out how often the character actually meets that goal
       
   254           5) the character that best meets its goal is the delimiter
       
   255         For performance reasons, the data is evaluated in chunks, so it can
       
   256         try and evaluate the smallest portion of the data possible, evaluating
       
   257         additional chunks as necessary.
       
   258         """
       
   259 
       
   260         data = filter(None, data.split('\n'))
       
   261 
       
   262         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
       
   263 
       
   264         # build frequency tables
       
   265         chunkLength = min(10, len(data))
       
   266         iteration = 0
       
   267         charFrequency = {}
       
   268         modes = {}
       
   269         delims = {}
       
   270         start, end = 0, min(chunkLength, len(data))
       
   271         while start < len(data):
       
   272             iteration += 1
       
   273             for line in data[start:end]:
       
   274                 for char in ascii:
       
   275                     metaFrequency = charFrequency.get(char, {})
       
   276                     # must count even if frequency is 0
       
   277                     freq = line.count(char)
       
   278                     # value is the mode
       
   279                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
       
   280                     charFrequency[char] = metaFrequency
       
   281 
       
   282             for char in charFrequency.keys():
       
   283                 items = charFrequency[char].items()
       
   284                 if len(items) == 1 and items[0][0] == 0:
       
   285                     continue
       
   286                 # get the mode of the frequencies
       
   287                 if len(items) > 1:
       
   288                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
       
   289                                          items)
       
   290                     # adjust the mode - subtract the sum of all
       
   291                     # other frequencies
       
   292                     items.remove(modes[char])
       
   293                     modes[char] = (modes[char][0], modes[char][1]
       
   294                                    - reduce(lambda a, b: (0, a[1] + b[1]),
       
   295                                             items)[1])
       
   296                 else:
       
   297                     modes[char] = items[0]
       
   298 
       
   299             # build a list of possible delimiters
       
   300             modeList = modes.items()
       
   301             total = float(chunkLength * iteration)
       
   302             # (rows of consistent data) / (number of rows) = 100%
       
   303             consistency = 1.0
       
   304             # minimum consistency threshold
       
   305             threshold = 0.9
       
   306             while len(delims) == 0 and consistency >= threshold:
       
   307                 for k, v in modeList:
       
   308                     if v[0] > 0 and v[1] > 0:
       
   309                         if ((v[1]/total) >= consistency and
       
   310                             (delimiters is None or k in delimiters)):
       
   311                             delims[k] = v
       
   312                 consistency -= 0.01
       
   313 
       
   314             if len(delims) == 1:
       
   315                 delim = delims.keys()[0]
       
   316                 skipinitialspace = (data[0].count(delim) ==
       
   317                                     data[0].count("%c " % delim))
       
   318                 return (delim, skipinitialspace)
       
   319 
       
   320             # analyze another chunkLength lines
       
   321             start = end
       
   322             end += chunkLength
       
   323 
       
   324         if not delims:
       
   325             return ('', 0)
       
   326 
       
   327         # if there's more than one, fall back to a 'preferred' list
       
   328         if len(delims) > 1:
       
   329             for d in self.preferred:
       
   330                 if d in delims.keys():
       
   331                     skipinitialspace = (data[0].count(d) ==
       
   332                                         data[0].count("%c " % d))
       
   333                     return (d, skipinitialspace)
       
   334 
       
   335         # nothing else indicates a preference, pick the character that
       
   336         # dominates(?)
       
   337         items = [(v,k) for (k,v) in delims.items()]
       
   338         items.sort()
       
   339         delim = items[-1][1]
       
   340 
       
   341         skipinitialspace = (data[0].count(delim) ==
       
   342                             data[0].count("%c " % delim))
       
   343         return (delim, skipinitialspace)
       
   344 
       
   345 
       
   346     def has_header(self, sample):
       
   347         # Creates a dictionary of types of data in each column. If any
       
   348         # column is of a single type (say, integers), *except* for the first
       
   349         # row, then the first row is presumed to be labels. If the type
       
   350         # can't be determined, it is assumed to be a string in which case
       
   351         # the length of the string is the determining factor: if all of the
       
   352         # rows except for the first are the same length, it's a header.
       
   353         # Finally, a 'vote' is taken at the end for each column, adding or
       
   354         # subtracting from the likelihood of the first row being a header.
       
   355 
       
   356         rdr = reader(StringIO(sample), self.sniff(sample))
       
   357 
       
   358         header = rdr.next() # assume first row is header
       
   359 
       
   360         columns = len(header)
       
   361         columnTypes = {}
       
   362         for i in range(columns): columnTypes[i] = None
       
   363 
       
   364         checked = 0
       
   365         for row in rdr:
       
   366             # arbitrary number of rows to check, to keep it sane
       
   367             if checked > 20:
       
   368                 break
       
   369             checked += 1
       
   370 
       
   371             if len(row) != columns:
       
   372                 continue # skip rows that have irregular number of columns
       
   373 
       
   374             for col in columnTypes.keys():
       
   375 
       
   376                 for thisType in [int, long, float, complex]:
       
   377                     try:
       
   378                         thisType(row[col])
       
   379                         break
       
   380                     except (ValueError, OverflowError):
       
   381                         pass
       
   382                 else:
       
   383                     # fallback to length of string
       
   384                     thisType = len(row[col])
       
   385 
       
   386                 # treat longs as ints
       
   387                 if thisType == long:
       
   388                     thisType = int
       
   389 
       
   390                 if thisType != columnTypes[col]:
       
   391                     if columnTypes[col] is None: # add new column type
       
   392                         columnTypes[col] = thisType
       
   393                     else:
       
   394                         # type is inconsistent, remove column from
       
   395                         # consideration
       
   396                         del columnTypes[col]
       
   397 
       
   398         # finally, compare results against first row and "vote"
       
   399         # on whether it's a header
       
   400         hasHeader = 0
       
   401         for col, colType in columnTypes.items():
       
   402             if type(colType) == type(0): # it's a length
       
   403                 if len(header[col]) != colType:
       
   404                     hasHeader += 1
       
   405                 else:
       
   406                     hasHeader -= 1
       
   407             else: # attempt typecast
       
   408                 try:
       
   409                     colType(header[col])
       
   410                 except (ValueError, TypeError):
       
   411                     hasHeader += 1
       
   412                 else:
       
   413                     hasHeader -= 1
       
   414 
       
   415         return hasHeader > 0