|
1 |
|
2 """ |
|
3 csv.py - read/write/investigate CSV files |
|
4 """ |
|
5 |
|
6 import re |
|
7 from _csv import Error, __version__, writer, reader, register_dialect, \ |
|
8 unregister_dialect, get_dialect, list_dialects, \ |
|
9 field_size_limit, \ |
|
10 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ |
|
11 __doc__ |
|
12 from _csv import Dialect as _Dialect |
|
13 |
|
14 try: |
|
15 from cStringIO import StringIO |
|
16 except ImportError: |
|
17 from StringIO import StringIO |
|
18 |
|
19 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", |
|
20 "Error", "Dialect", "excel", "excel_tab", "reader", "writer", |
|
21 "register_dialect", "get_dialect", "list_dialects", "Sniffer", |
|
22 "unregister_dialect", "__version__", "DictReader", "DictWriter" ] |
|
23 |
|
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel). Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders -- real values must be supplied by a subclass
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as valid dialects; instantiating the
        # base class directly leaves _valid False.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate attribute checking to the C-level _csv.Dialect
        # constructor, which raises TypeError on bad/missing attributes.
        try:
            _Dialect(self)
        except TypeError, e:
            # We do this for compatibility with py2.3: re-raise as
            # csv.Error so callers need only catch one exception type.
            raise Error(str(e))
|
54 |
|
class excel(Dialect):
    """Dialect matching the CSV flavor that Microsoft Excel emits."""
    # Comma-separated, CRLF-terminated rows; fields are wrapped in
    # double quotes only when needed, with embedded quotes doubled.
    quoting = QUOTE_MINIMAL
    lineterminator = '\r\n'
    skipinitialspace = False
    doublequote = True
    quotechar = '"'
    delimiter = ','
register_dialect("excel", excel)
|
64 |
|
class excel_tab(excel):
    """The Excel dialect, but with tabs separating the fields."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
|
69 |
|
70 |
|
71 class DictReader: |
|
72 def __init__(self, f, fieldnames=None, restkey=None, restval=None, |
|
73 dialect="excel", *args, **kwds): |
|
74 self.fieldnames = fieldnames # list of keys for the dict |
|
75 self.restkey = restkey # key to catch long rows |
|
76 self.restval = restval # default value for short rows |
|
77 self.reader = reader(f, dialect, *args, **kwds) |
|
78 |
|
79 def __iter__(self): |
|
80 return self |
|
81 |
|
82 def next(self): |
|
83 row = self.reader.next() |
|
84 if self.fieldnames is None: |
|
85 self.fieldnames = row |
|
86 row = self.reader.next() |
|
87 |
|
88 # unlike the basic reader, we prefer not to return blanks, |
|
89 # because we will typically wind up with a dict full of None |
|
90 # values |
|
91 while row == []: |
|
92 row = self.reader.next() |
|
93 d = dict(zip(self.fieldnames, row)) |
|
94 lf = len(self.fieldnames) |
|
95 lr = len(row) |
|
96 if lf < lr: |
|
97 d[self.restkey] = row[lf:] |
|
98 elif lf > lr: |
|
99 for key in self.fieldnames[lr:]: |
|
100 d[key] = self.restval |
|
101 return d |
|
102 |
|
103 |
|
class DictWriter:
    """Write dicts to a CSV file, ordering values by a fixed field list."""

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # ordered list of keys to emit
        self.restval = restval          # filler for keys absent from a row
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        # Flatten rowdict into a list ordered like self.fieldnames,
        # optionally rejecting unexpected keys first.
        if self.extrasaction == "raise":
            for field in rowdict.keys():
                if field not in self.fieldnames:
                    raise ValueError("dict contains fields not in fieldnames")
        return [rowdict.get(field, self.restval) for field in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])
|
131 |
|
# Guard Sniffer's type checking against builds that exclude complex()
# (Sniffer.has_header tries int/long/float/complex casts on each cell;
# on such builds, fall back to float so that list still works).
try:
    complex
except NameError:
    complex = float
|
137 |
|
138 class Sniffer: |
|
139 ''' |
|
140 "Sniffs" the format of a CSV file (i.e. delimiter, quotechar) |
|
141 Returns a Dialect object. |
|
142 ''' |
|
143 def __init__(self): |
|
144 # in case there is more than one possible delimiter |
|
145 self.preferred = [',', '\t', ';', ' ', ':'] |
|
146 |
|
147 |
|
148 def sniff(self, sample, delimiters=None): |
|
149 """ |
|
150 Returns a dialect (or None) corresponding to the sample |
|
151 """ |
|
152 |
|
153 quotechar, delimiter, skipinitialspace = \ |
|
154 self._guess_quote_and_delimiter(sample, delimiters) |
|
155 if not delimiter: |
|
156 delimiter, skipinitialspace = self._guess_delimiter(sample, |
|
157 delimiters) |
|
158 |
|
159 if not delimiter: |
|
160 raise Error, "Could not determine delimiter" |
|
161 |
|
162 class dialect(Dialect): |
|
163 _name = "sniffed" |
|
164 lineterminator = '\r\n' |
|
165 quoting = QUOTE_MINIMAL |
|
166 # escapechar = '' |
|
167 doublequote = False |
|
168 |
|
169 dialect.delimiter = delimiter |
|
170 # _csv.reader won't accept a quotechar of '' |
|
171 dialect.quotechar = quotechar or '"' |
|
172 dialect.skipinitialspace = skipinitialspace |
|
173 |
|
174 return dialect |
|
175 |
|
176 |
|
177 def _guess_quote_and_delimiter(self, data, delimiters): |
|
178 """ |
|
179 Looks for text enclosed between two identical quotes |
|
180 (the probable quotechar) which are preceded and followed |
|
181 by the same character (the probable delimiter). |
|
182 For example: |
|
183 ,'some text', |
|
184 The quote with the most wins, same with the delimiter. |
|
185 If there is no quotechar the delimiter can't be determined |
|
186 this way. |
|
187 """ |
|
188 |
|
189 matches = [] |
|
190 for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?", |
|
191 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)', # ".*?", |
|
192 '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)', # ,".*?" |
|
193 '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'): # ".*?" (no delim, no space) |
|
194 regexp = re.compile(restr, re.DOTALL | re.MULTILINE) |
|
195 matches = regexp.findall(data) |
|
196 if matches: |
|
197 break |
|
198 |
|
199 if not matches: |
|
200 return ('', None, 0) # (quotechar, delimiter, skipinitialspace) |
|
201 |
|
202 quotes = {} |
|
203 delims = {} |
|
204 spaces = 0 |
|
205 for m in matches: |
|
206 n = regexp.groupindex['quote'] - 1 |
|
207 key = m[n] |
|
208 if key: |
|
209 quotes[key] = quotes.get(key, 0) + 1 |
|
210 try: |
|
211 n = regexp.groupindex['delim'] - 1 |
|
212 key = m[n] |
|
213 except KeyError: |
|
214 continue |
|
215 if key and (delimiters is None or key in delimiters): |
|
216 delims[key] = delims.get(key, 0) + 1 |
|
217 try: |
|
218 n = regexp.groupindex['space'] - 1 |
|
219 except KeyError: |
|
220 continue |
|
221 if m[n]: |
|
222 spaces += 1 |
|
223 |
|
224 quotechar = reduce(lambda a, b, quotes = quotes: |
|
225 (quotes[a] > quotes[b]) and a or b, quotes.keys()) |
|
226 |
|
227 if delims: |
|
228 delim = reduce(lambda a, b, delims = delims: |
|
229 (delims[a] > delims[b]) and a or b, delims.keys()) |
|
230 skipinitialspace = delims[delim] == spaces |
|
231 if delim == '\n': # most likely a file with a single column |
|
232 delim = '' |
|
233 else: |
|
234 # there is *no* delimiter, it's a single column of quoted data |
|
235 delim = '' |
|
236 skipinitialspace = 0 |
|
237 |
|
238 return (quotechar, delim, skipinitialspace) |
|
239 |
|
240 |
|
241 def _guess_delimiter(self, data, delimiters): |
|
242 """ |
|
243 The delimiter /should/ occur the same number of times on |
|
244 each row. However, due to malformed data, it may not. We don't want |
|
245 an all or nothing approach, so we allow for small variations in this |
|
246 number. |
|
247 1) build a table of the frequency of each character on every line. |
|
248 2) build a table of freqencies of this frequency (meta-frequency?), |
|
249 e.g. 'x occurred 5 times in 10 rows, 6 times in 1000 rows, |
|
250 7 times in 2 rows' |
|
251 3) use the mode of the meta-frequency to determine the /expected/ |
|
252 frequency for that character |
|
253 4) find out how often the character actually meets that goal |
|
254 5) the character that best meets its goal is the delimiter |
|
255 For performance reasons, the data is evaluated in chunks, so it can |
|
256 try and evaluate the smallest portion of the data possible, evaluating |
|
257 additional chunks as necessary. |
|
258 """ |
|
259 |
|
260 data = filter(None, data.split('\n')) |
|
261 |
|
262 ascii = [chr(c) for c in range(127)] # 7-bit ASCII |
|
263 |
|
264 # build frequency tables |
|
265 chunkLength = min(10, len(data)) |
|
266 iteration = 0 |
|
267 charFrequency = {} |
|
268 modes = {} |
|
269 delims = {} |
|
270 start, end = 0, min(chunkLength, len(data)) |
|
271 while start < len(data): |
|
272 iteration += 1 |
|
273 for line in data[start:end]: |
|
274 for char in ascii: |
|
275 metaFrequency = charFrequency.get(char, {}) |
|
276 # must count even if frequency is 0 |
|
277 freq = line.count(char) |
|
278 # value is the mode |
|
279 metaFrequency[freq] = metaFrequency.get(freq, 0) + 1 |
|
280 charFrequency[char] = metaFrequency |
|
281 |
|
282 for char in charFrequency.keys(): |
|
283 items = charFrequency[char].items() |
|
284 if len(items) == 1 and items[0][0] == 0: |
|
285 continue |
|
286 # get the mode of the frequencies |
|
287 if len(items) > 1: |
|
288 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, |
|
289 items) |
|
290 # adjust the mode - subtract the sum of all |
|
291 # other frequencies |
|
292 items.remove(modes[char]) |
|
293 modes[char] = (modes[char][0], modes[char][1] |
|
294 - reduce(lambda a, b: (0, a[1] + b[1]), |
|
295 items)[1]) |
|
296 else: |
|
297 modes[char] = items[0] |
|
298 |
|
299 # build a list of possible delimiters |
|
300 modeList = modes.items() |
|
301 total = float(chunkLength * iteration) |
|
302 # (rows of consistent data) / (number of rows) = 100% |
|
303 consistency = 1.0 |
|
304 # minimum consistency threshold |
|
305 threshold = 0.9 |
|
306 while len(delims) == 0 and consistency >= threshold: |
|
307 for k, v in modeList: |
|
308 if v[0] > 0 and v[1] > 0: |
|
309 if ((v[1]/total) >= consistency and |
|
310 (delimiters is None or k in delimiters)): |
|
311 delims[k] = v |
|
312 consistency -= 0.01 |
|
313 |
|
314 if len(delims) == 1: |
|
315 delim = delims.keys()[0] |
|
316 skipinitialspace = (data[0].count(delim) == |
|
317 data[0].count("%c " % delim)) |
|
318 return (delim, skipinitialspace) |
|
319 |
|
320 # analyze another chunkLength lines |
|
321 start = end |
|
322 end += chunkLength |
|
323 |
|
324 if not delims: |
|
325 return ('', 0) |
|
326 |
|
327 # if there's more than one, fall back to a 'preferred' list |
|
328 if len(delims) > 1: |
|
329 for d in self.preferred: |
|
330 if d in delims.keys(): |
|
331 skipinitialspace = (data[0].count(d) == |
|
332 data[0].count("%c " % d)) |
|
333 return (d, skipinitialspace) |
|
334 |
|
335 # nothing else indicates a preference, pick the character that |
|
336 # dominates(?) |
|
337 items = [(v,k) for (k,v) in delims.items()] |
|
338 items.sort() |
|
339 delim = items[-1][1] |
|
340 |
|
341 skipinitialspace = (data[0].count(delim) == |
|
342 data[0].count("%c " % delim)) |
|
343 return (delim, skipinitialspace) |
|
344 |
|
345 |
|
346 def has_header(self, sample): |
|
347 # Creates a dictionary of types of data in each column. If any |
|
348 # column is of a single type (say, integers), *except* for the first |
|
349 # row, then the first row is presumed to be labels. If the type |
|
350 # can't be determined, it is assumed to be a string in which case |
|
351 # the length of the string is the determining factor: if all of the |
|
352 # rows except for the first are the same length, it's a header. |
|
353 # Finally, a 'vote' is taken at the end for each column, adding or |
|
354 # subtracting from the likelihood of the first row being a header. |
|
355 |
|
356 rdr = reader(StringIO(sample), self.sniff(sample)) |
|
357 |
|
358 header = rdr.next() # assume first row is header |
|
359 |
|
360 columns = len(header) |
|
361 columnTypes = {} |
|
362 for i in range(columns): columnTypes[i] = None |
|
363 |
|
364 checked = 0 |
|
365 for row in rdr: |
|
366 # arbitrary number of rows to check, to keep it sane |
|
367 if checked > 20: |
|
368 break |
|
369 checked += 1 |
|
370 |
|
371 if len(row) != columns: |
|
372 continue # skip rows that have irregular number of columns |
|
373 |
|
374 for col in columnTypes.keys(): |
|
375 |
|
376 for thisType in [int, long, float, complex]: |
|
377 try: |
|
378 thisType(row[col]) |
|
379 break |
|
380 except (ValueError, OverflowError): |
|
381 pass |
|
382 else: |
|
383 # fallback to length of string |
|
384 thisType = len(row[col]) |
|
385 |
|
386 # treat longs as ints |
|
387 if thisType == long: |
|
388 thisType = int |
|
389 |
|
390 if thisType != columnTypes[col]: |
|
391 if columnTypes[col] is None: # add new column type |
|
392 columnTypes[col] = thisType |
|
393 else: |
|
394 # type is inconsistent, remove column from |
|
395 # consideration |
|
396 del columnTypes[col] |
|
397 |
|
398 # finally, compare results against first row and "vote" |
|
399 # on whether it's a header |
|
400 hasHeader = 0 |
|
401 for col, colType in columnTypes.items(): |
|
402 if type(colType) == type(0): # it's a length |
|
403 if len(header[col]) != colType: |
|
404 hasHeader += 1 |
|
405 else: |
|
406 hasHeader -= 1 |
|
407 else: # attempt typecast |
|
408 try: |
|
409 colType(header[col]) |
|
410 except (ValueError, TypeError): |
|
411 hasHeader += 1 |
|
412 else: |
|
413 hasHeader -= 1 |
|
414 |
|
415 return hasHeader > 0 |