9 """Some little helper for reading (and writing) common formats from and to
10 disk."""

__docformat__ = 'restructuredtext'

import numpy as N
import mvpa.support.copy as copy
from mvpa.base.dochelpers import enhancedDocString
from sets import Set
from re import sub as re_sub
from mvpa.base import warning

from mvpa.misc.support import Event

if __debug__:
    from mvpa.base import debug


class DataReader(object):
    """Base class for data readers.

    Every subclass has to put all information into two variables:

    `self._data`: ndarray
      The data array has to have the samples-separating dimension along the
      first axis.
    `self._props`: dict
      All other meaningful information has to be stored in a dictionary.

    This class provides two methods (and associated properties) to retrieve
    this information.
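
    A minimal sketch of a subclass (the reader class and the text file it
    loads are hypothetical; only the `_data`/`_props` contract is taken from
    this base class)::

      class PlainTextReader(DataReader):
          def __init__(self, filename):
              DataReader.__init__(self)
              # samples have to be separated along the first axis
              self._data = N.atleast_2d(N.loadtxt(filename))
              # everything else goes into the property dictionary
              self._props = {'source': filename}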
40 """
42 """Cheap init.
43 """
44 self._props = {}
45 self._data = None
46
47
49 """Return the dictionary with the data properties.
50 """
51 return self._props
52
53
55 """Return the data array.
56 """
57 return self._data
58
59
60 data = property(fget=getData, doc="Data array")
61 props = property(fget=getPropsAsDict, doc="Property dict")
62
63


class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists, an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
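
    A minimal usage sketch (the file name 'data.txt' and its two columns are
    made up for illustration; any text file with a header line and
    whitespace-separated values would do)::

      # data.txt:
      #   rt correct
      #   0.81 1
      #   0.95 0
      d = ColumnData('data.txt', header=True, sep=None, dtype=float)
      d['rt']              # -> [0.81, 0.95]
      d.ncolumns           # -> 2
      d.nrows              # -> 2
      d.selectSamples([0]) # new ColumnData with only the first row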
78 """
    def __init__(self, source, header=True, sep=None, headersep=None,
                 dtype=float, skiplines=0):
        """Read data from file into a dictionary.

        :Parameters:
          source : basestring or dict
            If a string is given, all data is read from the corresponding
            file and additional keyword arguments can be used to
            customize the read procedure. If a dictionary is passed
            a deepcopy is performed.
          header : bool or list of basestring
            Indicates whether the column names should be read from the
            first line (`header=True`). If `header=False` unique
            column names will be generated (see class docs). If
            `header` is a python list, its content is used as column
            header names and its length has to match the number of
            columns in the file.
          sep : basestring or None
            Separator string. The actual meaning depends on the output
            format (see class docs).
          headersep : basestring or None
            Separator string used in the header. The actual meaning
            depends on the output format (see class docs).
          dtype : type or list(types)
            Desired datatype(s). Datatypes per column can be specified by
            passing a list of types.
          skiplines : int
            Number of lines to skip at the beginning of the file.
        """
        # init base class
        dict.__init__(self)

        # initialize with default, no header order known yet
        self._header_order = None

        if isinstance(source, str):
            self._fromFile(source, header=header, sep=sep, headersep=headersep,
                           dtype=dtype, skiplines=skiplines)

        elif isinstance(source, dict):
            for k, v in source.iteritems():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError, 'Unknown source for ColumnData [%s]' \
                              % `type(source)`

        # generate missing properties for each item in the header
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                getter = "lambda self: self._getAttrib('%s')" % (k)
                # sanitize the key so it can serve as an attribute name:
                # replace spaces and brackets, collapse multiple underscores
                # and strip quotes
                k_ = re_sub('[[\] ]', '_', k)
                k_ = re_sub('__+', '_', k_)
                k_ = re_sub('["\']', '', k_)
                if __debug__:
                    debug("IOH", "Registering property %s for ColumnData key %s"
                          % (k_, k))
                # make sure to import the class into the local namespace,
                # otherwise the exec'ed property assignment would not work
                # for classes defined elsewhere
                exec 'from %s import %s' % (self.__module__,
                                            self.__class__.__name__)
                exec "%s.%s = property(fget=%s)" % \
                     (self.__class__.__name__, k_, getter)


    __doc__ = enhancedDocString('ColumnData', locals())
162 """Return corresponding value if given key is known to current instance
163
164 Is used for automatically added properties to the class.
165
166 :Raises:
167 ValueError:
168 If `key` is not known to given instance
169
170 :Returns:
171 Value if `key` is known
172 """
173 if self.has_key(key):
174 return self[key]
175 else:
176 raise ValueError, "Instance %s has no data about %s" \
177 % (`self`, `key`)
178


    def __str__(self):
        s = self.__class__.__name__
        if len(self.keys()) > 0:
            s += " %d rows, %d columns [" % \
                 (self.getNRows(), self.getNColumns())
            s += reduce(lambda x, y: x + " %s" % y, self.keys())
            s += "]"
        return s
190 """Performs some checks for data integrity.
191 """
192 length = None
193 for k in self.keys():
194 if length == None:
195 length = len(self[k])
196 else:
197 if not len(self[k]) == length:
198 raise ValueError, "Data integrity lost. Columns do not " \
199 "have equal length."
200


    def _fromFile(self, filename, header, sep, headersep,
                  dtype, skiplines):
        """Loads column data from file -- clears object first.
        """
        # make a clean table
        self.clear()

        file_ = open(filename, 'r')

        self._header_order = None

        # simply skip some lines
        [ file_.readline() for x in range(skiplines) ]

        # make column names: either take the header or generate them
        if header == True:
            # read first line and split by 'headersep'
            hdr = file_.readline().split(headersep)
            # remove bogus empty header titles
            hdr = filter(lambda x: len(x.strip()), hdr)
            self._header_order = hdr
        elif isinstance(header, list):
            hdr = header
        else:
            hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]

            # reset the file to not miss the first line, but re-skip the
            # leading lines again
            file_.seek(0)
            [ file_.readline() for x in range(skiplines) ]

        # strings in lists: one per column
        tbl = [ [] for i in xrange(len(hdr)) ]

        # accept a single dtype for all columns as well as per-column dtypes
        if not isinstance(dtype, list):
            dtype = [dtype] * len(hdr)

        # parse line by line and feed into the lists
        for line in file_:
            # get rid of leading and trailing whitespace
            line = line.strip()
            # ignore empty lines and comment lines
            if not line or line.startswith('#'):
                continue
            l = line.split(sep)

            if not len(l) == len(hdr):
                raise RuntimeError, \
                      "Number of entries in line [%i] does not match number " \
                      "of columns in header [%i]." % (len(l), len(hdr))

            for i, v in enumerate(l):
                if not dtype[i] is None:
                    try:
                        v = dtype[i](v)
                    except ValueError:
                        warning("Can't convert %s to desired datatype %s." %
                                (`v`, `dtype`) + " Leaving original type")
                tbl[i].append(v)

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError, "Number of columns read from file does not " \
                                "match the number of header entries."

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]


    def __iadd__(self, other):
        """Merge column data.
        """
        # for all columns in the other object
        for k, v in other.iteritems():
            if not self.has_key(k):
                raise ValueError, 'Unknown key [%s].' % `k`
            if not isinstance(v, list):
                raise ValueError, 'Can only merge list data, but got [%s].' \
                                  % `type(v)`
            # all checks passed, so append the values
            self[k] += v

        # check data integrity of the merged object
        self._check()

        return self


    def selectSamples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data


    def getNColumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())


    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        :Parameters:
          filename : basestring
            Target filename
          header : bool
            If `True` a column header is written, using the column
            keys. If `False` no header is written.
          header_order : None or list of basestring
            If it is a list of strings, they will be used instead
            of simply asking for the dictionary keys. However
            these strings must match the dictionary keys in number
            and identity. This argument type can be used to
            determine the order of the columns in the output file.
            The default value is `None`. In this case the columns
            will be in an arbitrary order.
          sep : basestring
            String that is written as a separator between two data columns.
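
        A brief sketch, for a `ColumnData` instance `d` (the output file name
        and column names are hypothetical)::

          d.tofile('data_out.txt', header_order=['rt', 'correct'], sep=' ')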
327 """
        file_ = open(filename, 'w')

        # determine the column order for the output
        if header_order is None:
            if self._header_order is None:
                col_hdr = self.keys()
            else:
                # use the stored order and append any keys added later
                col_hdr = self._header_order + \
                          list(Set(self.keys()).difference(
                                                Set(self._header_order)))
        else:
            if not len(header_order) == self.getNColumns():
                raise ValueError, 'Header list does not match number of ' \
                                  'columns.'
            for k in header_order:
                if not self.has_key(k):
                    raise ValueError, 'Unknown key [%s]' % `k`
            col_hdr = header_order

        # write the header if requested
        if header == True:
            file_.write(sep.join(col_hdr) + '\n')

        # for all rows
        for r in xrange(self.getNRows()):
            # get attributes for all keys
            l = [str(self[k][r]) for k in col_hdr]
            # write to file with proper separator
            file_.write(sep.join(l) + '\n')

        file_.close()


    def getNRows(self):
        """Returns the number of rows.
        """
        # no data, no rows
        if not len(self.keys()):
            return 0
        # otherwise the length of any column will do
        else:
            return len(self[self.keys()[0]])


    ncolumns = property(fget=getNColumns)
    nrows = property(fget=getNRows)


class SampleAttributes(ColumnData):
    """Read and write PyMVPA sample attribute definitions from and to text
    files.
    """
    def __init__(self, source, literallabels=False, header=None):
        """Read PyMVPA sample attributes from disk.

        :Parameters:
          source : basestring
            Filename of an attribute file
          literallabels : bool
            Whether labels are given as literal strings
          header : None or bool or list of str
            If None, ['labels', 'chunks'] is assumed. Otherwise the same
            behavior as of `ColumnData`
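
        A minimal sketch (the file name 'attributes.txt' is hypothetical; the
        file is expected to contain one label and one chunk value per line)::

          attr = SampleAttributes('attributes.txt', literallabels=True)
          attr.labels     # list of labels, one per sample
          attr.chunks     # list of chunk values, one per sample
          attr.nsamples   # number of samples (rows) in the file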
392 """
393 if literallabels:
394 dtypes = [str, float]
395 else:
396 dtypes = float
397
398 if header is None:
399 header = ['labels', 'chunks']
400 ColumnData.__init__(self, source,
401 header=header,
402 sep=None, dtype=dtypes)


    def tofile(self, filename):
        """Write sample attributes to a text file.
        """
        ColumnData.tofile(self, filename,
                          header=False,
                          header_order=['labels', 'chunks'],
                          sep=' ')


    def getNSamples(self):
        """Returns the number of samples in the file.
        """
        return self.getNRows()


    def toEvents(self, **kwargs):
        """Convert into a list of `Event` instances.

        Each change in the label or chunks value is taken as a new event onset.
        The length of an event is determined by the number of identical
        consecutive label-chunk combinations. Since the attributes list has no
        sense of absolute timing, both `onset` and `duration` are determined
        and stored in #samples units.

        :Parameters:
          kwargs
            Any keyword argument provided will be replicated through all
            the generated events.
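
        A rough sketch of the outcome, for a `SampleAttributes` instance
        `attr` (the label/chunk values are made up)::

          events = attr.toEvents()
          # for labels [1, 1, 2, 2] within a single chunk this yields two
          # events: (onset=0, duration=2, label=1) and
          # (onset=2, duration=2, label=2)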
433 """
434 events = []
435 prev_onset = 0
436 old_comb = None
437 duration = 1
438
439 for r in xrange(self.nrows):
440
441 comb = (self.labels[r], self.chunks[r])
442
443
444 if not comb == old_comb:
445
446 if not old_comb is None:
447 events.append(
448 Event(onset=prev_onset, duration=duration,
449 label=old_comb[0], chunk=old_comb[1], **kwargs))
450
451 duration = 1
452
453 prev_onset = r
454
455
456 old_comb = comb
457 else:
458
459 duration += 1
460
461
462 if not old_comb is None:
463 events.append(
464 Event(onset=prev_onset, duration=duration,
465 label=old_comb[0], chunk=old_comb[1], **kwargs))
466
467 return events
468
469
470 nsamples = property(fget=getNSamples)
471
472
474 """Base class for sensor location readers.
475
476 Each subclass should provide x, y, z coordinates via the `pos_x`, `pos_y`,
477 and `pos_z` attrbibutes.
478
479 Axes should follow the following convention:
480
481 x-axis: left -> right
482 y-axis: anterior -> posterior
483 z-axis: superior -> inferior
484 """
489
490
492 """Get the sensor locations as an array.
493
494 :Returns:
495 (nchannels x 3) array with coordinates in (x, y, z)
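
        A short sketch, assuming one of the concrete reader subclasses defined
        below and a hypothetical file name::

          sensors = XAVRSensorLocations('electrodes.txt')
          pos = sensors.locations()
          pos.shape    # -> (nchannels, 3)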
496 """
497 return N.array((self.pos_x, self.pos_y, self.pos_z)).T
498
499


class XAVRSensorLocations(SensorLocations):
    """Read sensor location definitions from a specific text file format.

    File layout is assumed to be 5 columns:

      1. sensor name
      2. some useless integer
      3. position on x-axis
      4. position on y-axis
      5. position on z-axis
    """
    def __init__(self, source):
        """Read sensor locations from file.

        :Parameter:
          source : filename of an attribute file
        """
        SensorLocations.__init__(
            self, source,
            header=['names', 'some_number', 'pos_x', 'pos_y', 'pos_z'],
            sep=None, dtype=[str, int, float, float, float])


class TuebingenMEGSensorLocations(SensorLocations):
    """Read sensor location definitions from a specific text file format.

    File layout is assumed to be 7 columns:

      1: sensor name
      2: position on y-axis
      3: position on x-axis
      4: position on z-axis
      5-7: same as 2-4, but for some outer surface thingie.

    Note that x and y seem to be swapped, i.e. y as defined by SensorLocations
    conventions seems to be the first axis and is followed by x.

    Only inner surface coordinates are reported by `locations()`.
    """
    def __init__(self, source):
        """Read sensor locations from file.

        :Parameter:
          source : filename of an attribute file
        """
        SensorLocations.__init__(
            self, source,
            header=['names', 'pos_y', 'pos_x', 'pos_z',
                    'pos_y2', 'pos_x2', 'pos_z2'],
            sep=None, dtype=[str, float, float, float, float, float, float])


def design2labels(columndata, baseline_label=0,
                  func=lambda x: x > 0.0):
    """Helper to convert a design matrix into a list of labels

    Given a design, assign a single label to any given sample.

    TODO: fix description/naming

    :Parameters:
      columndata : ColumnData
        Attributes where each known column will be considered as a separate
        explanatory variable (EV) in the design.
      baseline_label
        What label to assign to samples where none of the EVs was given a
        value
      func : functor
        Function which decides whether a value should be considered as
        belonging to an EV

    :Output:
      list of labels which are taken from column names in
      ColumnData and baseline_label
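
    A minimal sketch of the intended use (the column names are made up)::

      design = ColumnData({'face': [1.0, 0.0], 'house': [0.0, 1.0]})
      design2labels(design)
      # -> ['face', 'house']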
574 """
575
576
577 keys = columndata.keys()
578 labels = []
579 for row in xrange(columndata.nrows):
580 entries = [ columndata[key][row] for key in keys ]
581
582 selected = filter(lambda x: func(x[1]), zip(keys, entries))
583 nselected = len(selected)
584
585 if nselected > 1:
586
587 raise ValueError, "Row #%i with items %s has multiple entries " \
588 "meeting the criterion. Cannot decide on the label" % \
589 (row, entries)
590 elif nselected == 1:
591 label = selected[0][0]
592 else:
593 label = baseline_label
594 labels.append(label)
595 return labels


__known_chunking_methods = {
    'alllabels': 'Each chunk must contain instances of all labels'
    }


def labels2chunks(labels, method="alllabels", ignore_labels=None):
    """Automagically decide on chunks based on labels

    :Parameters:
      labels
        labels to base the chunking on
      method : basestring
        codename for the method to use. Known are %s
      ignore_labels : list of basestring
        depends on the method. For method ``alllabels`` such labels
        (e.g. some 'reject' samples) are not required to appear in every
        chunk

    :rtype: list
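
    A rough sketch of the expected outcome for the default method (the label
    sequence is made up)::

      labels2chunks([1, 1, 2, 2, 1, 1, 2, 2])
      # -> [0, 0, 0, 0, 1, 1, 1, 1]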
615 """ % __known_chunking_methods.keys()
616
617 chunks = []
618 if ignore_labels is None:
619 ignore_labels = []
620 alllabels = Set(labels).difference(Set(ignore_labels))
621 if method == 'alllabels':
622 seenlabels = Set()
623 lastlabel = None
624 chunk = 0
625 for label in labels:
626 if label != lastlabel:
627 if seenlabels == alllabels:
628 chunk += 1
629 seenlabels = Set()
630 lastlabel = label
631 if not label in ignore_labels:
632 seenlabels.union_update([label])
633 chunks.append(chunk)
634 chunks = N.array(chunks)
635
636 if seenlabels != alllabels:
637 chunks[chunks == chunk] = chunk-1
638 chunks = list(chunks)
639 else:
640 errmsg = "Unknown method to derive chunks is requested. Known are:\n"
641 for method, descr in __known_chunking_methods.iteritems():
642 errmsg += " %s : %s\n" % (method, descr)
643 raise ValueError, errmsg
644 return chunks
645