|
1 |
|
2 """ |
|
3 csv.py - read/write/investigate CSV files |
|
4 """ |
|
5 |
|
6 import re |
|
7 from functools import reduce |
|
8 from _csv import Error, __version__, writer, reader, register_dialect, \ |
|
9 unregister_dialect, get_dialect, list_dialects, \ |
|
10 field_size_limit, \ |
|
11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \ |
|
12 __doc__ |
|
13 from _csv import Dialect as _Dialect |
|
14 |
|
15 try: |
|
16 from cStringIO import StringIO |
|
17 except ImportError: |
|
18 from StringIO import StringIO |
|
19 |
|
20 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE", |
|
21 "Error", "Dialect", "__doc__", "excel", "excel_tab", |
|
22 "field_size_limit", "reader", "writer", |
|
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer", |
|
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ] |
|
25 |
|
class Dialect:
    """Base class describing a CSV dialect.

    Not usable directly: subclass it (csv.excel is the reference
    example) and fill in the formatting attributes.  Recognized
    attributes: delimiter, quotechar, escapechar, doublequote,
    skipinitialspace, lineterminator, quoting.
    """
    _name = ""
    _valid = False
    # placeholder values -- a concrete subclass overrides these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses count as potentially valid dialects; the
        # bare base class always fails the validation below.
        if self.__class__ is not Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Hand the attributes to the C implementation for checking and
        # re-raise its TypeError as csv.Error (py2.3 compatibility).
        try:
            _Dialect(self)
        except TypeError as e:
            raise Error(str(e))
|
56 |
|
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True        # embedded quotechars are doubled, not escaped
    skipinitialspace = False  # whitespace right after a delimiter is kept
    lineterminator = '\r\n'   # Excel writes DOS-style line endings
    quoting = QUOTE_MINIMAL   # quote only fields that need it

# make the dialect available by name to reader()/writer()
register_dialect("excel", excel)
|
66 |
|
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    delimiter = '\t'  # identical to csv.excel apart from the delimiter

# make the dialect available by name to reader()/writer()
register_dialect("excel-tab", excel_tab)
|
71 |
|
72 |
|
class DictReader:
    """Iterate over CSV rows, producing one dict per row.

    Keys come from *fieldnames* or, when omitted, from the first row of
    the file (read lazily).  Rows longer than the field list collect
    their surplus values in a list stored under *restkey*; rows that
    are shorter get *restval* for each missing field.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # keys for the per-row dicts
        self.restkey = restkey          # where surplus values of long rows go
        self.restval = restval          # filler for missing values of short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily pull the header row off the underlying reader the
        # first time the field names are requested.
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Accessed purely for its side effect: triggers the lazy
            # header read above.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # Skip blank rows entirely -- the plain reader returns them as
        # [] and zipping one would yield a dict full of None values.
        while row == []:
            row = self.reader.next()

        fields = self.fieldnames
        d = dict(zip(fields, row))
        if len(fields) < len(row):
            # long row: stash the surplus values under restkey
            d[self.restkey] = row[len(fields):]
        elif len(fields) > len(row):
            # short row: pad every missing field with restval
            for missing in fields[len(row):]:
                d[missing] = self.restval
        return d
|
121 |
|
122 |
|
class DictWriter:
    """Write dicts out as CSV rows, ordered by *fieldnames*.

    Keys absent from a row dict are written as *restval*; unexpected
    keys either raise ValueError or are silently dropped, according to
    *extrasaction* ('raise' or 'ignore').
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # column order for every row
        self.restval = restval          # filler for keys absent from a rowdict
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        # In "raise" mode, complain about keys that are not known fields.
        if self.extrasaction == "raise":
            unknown = [field for field in rowdict
                       if field not in self.fieldnames]
            if unknown:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(unknown))
        # Emit values in fieldnames order, padding absent keys with restval.
        return [rowdict.get(field, self.restval) for field in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single dict as one CSV row."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of dicts as CSV rows."""
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])
|
151 |
|
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # No complex() in this build; float is the next-widest numeric
    # type Sniffer.has_header can try.
    complex = float
|
157 |
|
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample

        If delimiters is given it restricts the candidate delimiter
        characters.  Raises Error when no delimiter can be determined.
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # no quoted fields found -- fall back to raw
            # character-frequency analysis
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # call form kept consistent with Dialect._validate
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns a (quotechar, delimiter, skipinitialspace) tuple;
        quotechar is '' and delimiter is None when nothing matched.
        """

        matches = []
        # Each pattern captures a candidate quote character plus, when
        # present, the delimiter and an optional single trailing space.
        # BUGFIX: the third pattern used to read (?P<delim>>[...]) --
        # the stray '>' required a literal '>' before the delimiter and
        # made the ",...<quote>end-of-line" case unmatchable.
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            # ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        # tally how often each candidate quote/delimiter was seen
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                # the pattern that matched has no delim group
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # the most frequently seen quote character wins
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # skipinitialspace when every delimiter was followed by a space
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    # character never appears -- not a candidate
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            # relax the consistency requirement until a candidate appears
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                # try successively wider numeric types; on total failure
                # fall back to the string length as the column's "type"
                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0