symbian-qemu-0.9.1-12/python-win32-2.6.1/lib/csv.py
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 
       
     2 """
       
     3 csv.py - read/write/investigate CSV files
       
     4 """
       
     5 
       
     6 import re
       
     7 from functools import reduce
       
     8 from _csv import Error, __version__, writer, reader, register_dialect, \
       
     9                  unregister_dialect, get_dialect, list_dialects, \
       
    10                  field_size_limit, \
       
    11                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
       
    12                  __doc__
       
    13 from _csv import Dialect as _Dialect
       
    14 
       
    15 try:
       
    16     from cStringIO import StringIO
       
    17 except ImportError:
       
    18     from StringIO import StringIO
       
    19 
       
    20 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
       
    21             "Error", "Dialect", "__doc__", "excel", "excel_tab",
       
    22             "field_size_limit", "reader", "writer",
       
    23             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
       
    24             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
       
    25 
       
    26 class Dialect:
       
    27     """Describe an Excel dialect.
       
    28 
       
    29     This must be subclassed (see csv.excel).  Valid attributes are:
       
    30     delimiter, quotechar, escapechar, doublequote, skipinitialspace,
       
    31     lineterminator, quoting.
       
    32 
       
    33     """
       
    34     _name = ""
       
    35     _valid = False
       
    36     # placeholders
       
    37     delimiter = None
       
    38     quotechar = None
       
    39     escapechar = None
       
    40     doublequote = None
       
    41     skipinitialspace = None
       
    42     lineterminator = None
       
    43     quoting = None
       
    44 
       
    45     def __init__(self):
       
    46         if self.__class__ != Dialect:
       
    47             self._valid = True
       
    48         self._validate()
       
    49 
       
    50     def _validate(self):
       
    51         try:
       
    52             _Dialect(self)
       
    53         except TypeError, e:
       
    54             # We do this for compatibility with py2.3
       
    55             raise Error(str(e))
       
    56 
       
    57 class excel(Dialect):
       
    58     """Describe the usual properties of Excel-generated CSV files."""
       
    59     delimiter = ','
       
    60     quotechar = '"'
       
    61     doublequote = True
       
    62     skipinitialspace = False
       
    63     lineterminator = '\r\n'
       
    64     quoting = QUOTE_MINIMAL
       
    65 register_dialect("excel", excel)
       
    66 
       
    67 class excel_tab(excel):
       
    68     """Describe the usual properties of Excel-generated TAB-delimited files."""
       
    69     delimiter = '\t'
       
    70 register_dialect("excel-tab", excel_tab)
       
    71 
       
    72 
       
    73 class DictReader:
       
    74     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
       
    75                  dialect="excel", *args, **kwds):
       
    76         self._fieldnames = fieldnames   # list of keys for the dict
       
    77         self.restkey = restkey          # key to catch long rows
       
    78         self.restval = restval          # default value for short rows
       
    79         self.reader = reader(f, dialect, *args, **kwds)
       
    80         self.dialect = dialect
       
    81         self.line_num = 0
       
    82 
       
    83     def __iter__(self):
       
    84         return self
       
    85 
       
    86     @property
       
    87     def fieldnames(self):
       
    88         if self._fieldnames is None:
       
    89             try:
       
    90                 self._fieldnames = self.reader.next()
       
    91             except StopIteration:
       
    92                 pass
       
    93         self.line_num = self.reader.line_num
       
    94         return self._fieldnames
       
    95 
       
    96     @fieldnames.setter
       
    97     def fieldnames(self, value):
       
    98         self._fieldnames = value
       
    99 
       
   100     def next(self):
       
   101         if self.line_num == 0:
       
   102             # Used only for its side effect.
       
   103             self.fieldnames
       
   104         row = self.reader.next()
       
   105         self.line_num = self.reader.line_num
       
   106 
       
   107         # unlike the basic reader, we prefer not to return blanks,
       
   108         # because we will typically wind up with a dict full of None
       
   109         # values
       
   110         while row == []:
       
   111             row = self.reader.next()
       
   112         d = dict(zip(self.fieldnames, row))
       
   113         lf = len(self.fieldnames)
       
   114         lr = len(row)
       
   115         if lf < lr:
       
   116             d[self.restkey] = row[lf:]
       
   117         elif lf > lr:
       
   118             for key in self.fieldnames[lr:]:
       
   119                 d[key] = self.restval
       
   120         return d
       
   121 
       
   122 
       
   123 class DictWriter:
       
   124     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
       
   125                  dialect="excel", *args, **kwds):
       
   126         self.fieldnames = fieldnames    # list of keys for the dict
       
   127         self.restval = restval          # for writing short dicts
       
   128         if extrasaction.lower() not in ("raise", "ignore"):
       
   129             raise ValueError, \
       
   130                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
       
   131                    extrasaction)
       
   132         self.extrasaction = extrasaction
       
   133         self.writer = writer(f, dialect, *args, **kwds)
       
   134 
       
   135     def _dict_to_list(self, rowdict):
       
   136         if self.extrasaction == "raise":
       
   137             wrong_fields = [k for k in rowdict if k not in self.fieldnames]
       
   138             if wrong_fields:
       
   139                 raise ValueError("dict contains fields not in fieldnames: " +
       
   140                                  ", ".join(wrong_fields))
       
   141         return [rowdict.get(key, self.restval) for key in self.fieldnames]
       
   142 
       
   143     def writerow(self, rowdict):
       
   144         return self.writer.writerow(self._dict_to_list(rowdict))
       
   145 
       
   146     def writerows(self, rowdicts):
       
   147         rows = []
       
   148         for rowdict in rowdicts:
       
   149             rows.append(self._dict_to_list(rowdict))
       
   150         return self.writer.writerows(rows)
       
   151 
       
   152 # Guard Sniffer's type checking against builds that exclude complex()
       
   153 try:
       
   154     complex
       
   155 except NameError:
       
   156     complex = float
       
   157 
       
   158 class Sniffer:
       
   159     '''
       
   160     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
       
   161     Returns a Dialect object.
       
   162     '''
       
   163     def __init__(self):
       
   164         # in case there is more than one possible delimiter
       
   165         self.preferred = [',', '\t', ';', ' ', ':']
       
   166 
       
   167 
       
   168     def sniff(self, sample, delimiters=None):
       
   169         """
       
   170         Returns a dialect (or None) corresponding to the sample
       
   171         """
       
   172 
       
   173         quotechar, delimiter, skipinitialspace = \
       
   174                    self._guess_quote_and_delimiter(sample, delimiters)
       
   175         if not delimiter:
       
   176             delimiter, skipinitialspace = self._guess_delimiter(sample,
       
   177                                                                 delimiters)
       
   178 
       
   179         if not delimiter:
       
   180             raise Error, "Could not determine delimiter"
       
   181 
       
   182         class dialect(Dialect):
       
   183             _name = "sniffed"
       
   184             lineterminator = '\r\n'
       
   185             quoting = QUOTE_MINIMAL
       
   186             # escapechar = ''
       
   187             doublequote = False
       
   188 
       
   189         dialect.delimiter = delimiter
       
   190         # _csv.reader won't accept a quotechar of ''
       
   191         dialect.quotechar = quotechar or '"'
       
   192         dialect.skipinitialspace = skipinitialspace
       
   193 
       
   194         return dialect
       
   195 
       
   196 
       
   197     def _guess_quote_and_delimiter(self, data, delimiters):
       
   198         """
       
   199         Looks for text enclosed between two identical quotes
       
   200         (the probable quotechar) which are preceded and followed
       
   201         by the same character (the probable delimiter).
       
   202         For example:
       
   203                          ,'some text',
       
   204         The quote with the most wins, same with the delimiter.
       
   205         If there is no quotechar the delimiter can't be determined
       
   206         this way.
       
   207         """
       
   208 
       
   209         matches = []
       
   210         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
       
   211                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
       
   212                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
       
   213                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
       
   214             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
       
   215             matches = regexp.findall(data)
       
   216             if matches:
       
   217                 break
       
   218 
       
   219         if not matches:
       
   220             return ('', None, 0) # (quotechar, delimiter, skipinitialspace)
       
   221 
       
   222         quotes = {}
       
   223         delims = {}
       
   224         spaces = 0
       
   225         for m in matches:
       
   226             n = regexp.groupindex['quote'] - 1
       
   227             key = m[n]
       
   228             if key:
       
   229                 quotes[key] = quotes.get(key, 0) + 1
       
   230             try:
       
   231                 n = regexp.groupindex['delim'] - 1
       
   232                 key = m[n]
       
   233             except KeyError:
       
   234                 continue
       
   235             if key and (delimiters is None or key in delimiters):
       
   236                 delims[key] = delims.get(key, 0) + 1
       
   237             try:
       
   238                 n = regexp.groupindex['space'] - 1
       
   239             except KeyError:
       
   240                 continue
       
   241             if m[n]:
       
   242                 spaces += 1
       
   243 
       
   244         quotechar = reduce(lambda a, b, quotes = quotes:
       
   245                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
       
   246 
       
   247         if delims:
       
   248             delim = reduce(lambda a, b, delims = delims:
       
   249                            (delims[a] > delims[b]) and a or b, delims.keys())
       
   250             skipinitialspace = delims[delim] == spaces
       
   251             if delim == '\n': # most likely a file with a single column
       
   252                 delim = ''
       
   253         else:
       
   254             # there is *no* delimiter, it's a single column of quoted data
       
   255             delim = ''
       
   256             skipinitialspace = 0
       
   257 
       
   258         return (quotechar, delim, skipinitialspace)
       
   259 
       
   260 
       
   261     def _guess_delimiter(self, data, delimiters):
       
   262         """
       
   263         The delimiter /should/ occur the same number of times on
       
   264         each row. However, due to malformed data, it may not. We don't want
       
   265         an all or nothing approach, so we allow for small variations in this
       
   266         number.
       
   267           1) build a table of the frequency of each character on every line.
       
   268           2) build a table of freqencies of this frequency (meta-frequency?),
       
   269              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
       
   270              7 times in 2 rows'
       
   271           3) use the mode of the meta-frequency to determine the /expected/
       
   272              frequency for that character
       
   273           4) find out how often the character actually meets that goal
       
   274           5) the character that best meets its goal is the delimiter
       
   275         For performance reasons, the data is evaluated in chunks, so it can
       
   276         try and evaluate the smallest portion of the data possible, evaluating
       
   277         additional chunks as necessary.
       
   278         """
       
   279 
       
   280         data = filter(None, data.split('\n'))
       
   281 
       
   282         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
       
   283 
       
   284         # build frequency tables
       
   285         chunkLength = min(10, len(data))
       
   286         iteration = 0
       
   287         charFrequency = {}
       
   288         modes = {}
       
   289         delims = {}
       
   290         start, end = 0, min(chunkLength, len(data))
       
   291         while start < len(data):
       
   292             iteration += 1
       
   293             for line in data[start:end]:
       
   294                 for char in ascii:
       
   295                     metaFrequency = charFrequency.get(char, {})
       
   296                     # must count even if frequency is 0
       
   297                     freq = line.count(char)
       
   298                     # value is the mode
       
   299                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
       
   300                     charFrequency[char] = metaFrequency
       
   301 
       
   302             for char in charFrequency.keys():
       
   303                 items = charFrequency[char].items()
       
   304                 if len(items) == 1 and items[0][0] == 0:
       
   305                     continue
       
   306                 # get the mode of the frequencies
       
   307                 if len(items) > 1:
       
   308                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
       
   309                                          items)
       
   310                     # adjust the mode - subtract the sum of all
       
   311                     # other frequencies
       
   312                     items.remove(modes[char])
       
   313                     modes[char] = (modes[char][0], modes[char][1]
       
   314                                    - reduce(lambda a, b: (0, a[1] + b[1]),
       
   315                                             items)[1])
       
   316                 else:
       
   317                     modes[char] = items[0]
       
   318 
       
   319             # build a list of possible delimiters
       
   320             modeList = modes.items()
       
   321             total = float(chunkLength * iteration)
       
   322             # (rows of consistent data) / (number of rows) = 100%
       
   323             consistency = 1.0
       
   324             # minimum consistency threshold
       
   325             threshold = 0.9
       
   326             while len(delims) == 0 and consistency >= threshold:
       
   327                 for k, v in modeList:
       
   328                     if v[0] > 0 and v[1] > 0:
       
   329                         if ((v[1]/total) >= consistency and
       
   330                             (delimiters is None or k in delimiters)):
       
   331                             delims[k] = v
       
   332                 consistency -= 0.01
       
   333 
       
   334             if len(delims) == 1:
       
   335                 delim = delims.keys()[0]
       
   336                 skipinitialspace = (data[0].count(delim) ==
       
   337                                     data[0].count("%c " % delim))
       
   338                 return (delim, skipinitialspace)
       
   339 
       
   340             # analyze another chunkLength lines
       
   341             start = end
       
   342             end += chunkLength
       
   343 
       
   344         if not delims:
       
   345             return ('', 0)
       
   346 
       
   347         # if there's more than one, fall back to a 'preferred' list
       
   348         if len(delims) > 1:
       
   349             for d in self.preferred:
       
   350                 if d in delims.keys():
       
   351                     skipinitialspace = (data[0].count(d) ==
       
   352                                         data[0].count("%c " % d))
       
   353                     return (d, skipinitialspace)
       
   354 
       
   355         # nothing else indicates a preference, pick the character that
       
   356         # dominates(?)
       
   357         items = [(v,k) for (k,v) in delims.items()]
       
   358         items.sort()
       
   359         delim = items[-1][1]
       
   360 
       
   361         skipinitialspace = (data[0].count(delim) ==
       
   362                             data[0].count("%c " % delim))
       
   363         return (delim, skipinitialspace)
       
   364 
       
   365 
       
   366     def has_header(self, sample):
       
   367         # Creates a dictionary of types of data in each column. If any
       
   368         # column is of a single type (say, integers), *except* for the first
       
   369         # row, then the first row is presumed to be labels. If the type
       
   370         # can't be determined, it is assumed to be a string in which case
       
   371         # the length of the string is the determining factor: if all of the
       
   372         # rows except for the first are the same length, it's a header.
       
   373         # Finally, a 'vote' is taken at the end for each column, adding or
       
   374         # subtracting from the likelihood of the first row being a header.
       
   375 
       
   376         rdr = reader(StringIO(sample), self.sniff(sample))
       
   377 
       
   378         header = rdr.next() # assume first row is header
       
   379 
       
   380         columns = len(header)
       
   381         columnTypes = {}
       
   382         for i in range(columns): columnTypes[i] = None
       
   383 
       
   384         checked = 0
       
   385         for row in rdr:
       
   386             # arbitrary number of rows to check, to keep it sane
       
   387             if checked > 20:
       
   388                 break
       
   389             checked += 1
       
   390 
       
   391             if len(row) != columns:
       
   392                 continue # skip rows that have irregular number of columns
       
   393 
       
   394             for col in columnTypes.keys():
       
   395 
       
   396                 for thisType in [int, long, float, complex]:
       
   397                     try:
       
   398                         thisType(row[col])
       
   399                         break
       
   400                     except (ValueError, OverflowError):
       
   401                         pass
       
   402                 else:
       
   403                     # fallback to length of string
       
   404                     thisType = len(row[col])
       
   405 
       
   406                 # treat longs as ints
       
   407                 if thisType == long:
       
   408                     thisType = int
       
   409 
       
   410                 if thisType != columnTypes[col]:
       
   411                     if columnTypes[col] is None: # add new column type
       
   412                         columnTypes[col] = thisType
       
   413                     else:
       
   414                         # type is inconsistent, remove column from
       
   415                         # consideration
       
   416                         del columnTypes[col]
       
   417 
       
   418         # finally, compare results against first row and "vote"
       
   419         # on whether it's a header
       
   420         hasHeader = 0
       
   421         for col, colType in columnTypes.items():
       
   422             if type(colType) == type(0): # it's a length
       
   423                 if len(header[col]) != colType:
       
   424                     hasHeader += 1
       
   425                 else:
       
   426                     hasHeader -= 1
       
   427             else: # attempt typecast
       
   428                 try:
       
   429                     colType(header[col])
       
   430                 except (ValueError, TypeError):
       
   431                     hasHeader += 1
       
   432                 else:
       
   433                     hasHeader -= 1
       
   434 
       
   435         return hasHeader > 0