Package mvpa :: Package misc :: Package io :: Module base
[hide private]
[frames] | [no frames]

Source Code for Module mvpa.misc.io.base

  1  # emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*- 
  2  # vi: set ft=python sts=4 ts=4 sw=4 et: 
  3  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  4  # 
  5  #   See COPYING file distributed along with the PyMVPA package for the 
  6  #   copyright and license terms. 
  7  # 
  8  ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ## 
  9  """Some little helper for reading (and writing) common formats from and to 
 10  disk.""" 
 11   
 12  __docformat__ = 'restructuredtext' 
 13   
 14  import numpy as N 
 15  import mvpa.support.copy as copy 
 16  from mvpa.base.dochelpers import enhancedDocString 
 17  from sets import Set 
 18  from re import sub as re_sub 
 19  from mvpa.base import warning 
 20   
 21  from mvpa.misc.support import Event 
 22   
 23  if __debug__: 
 24      from mvpa.base import debug 
 25   
 26   
class DataReader(object):
    """Common base for readers of on-disk data formats.

    Subclasses are expected to populate two attributes:

    `self._data`: ndarray
      The data array; the first axis separates the samples.
    `self._props`: dict
      Any other meaningful information about the data.

    Accessor methods (and the associated `data` and `props` properties)
    expose both pieces of information.
    """
    def __init__(self):
        """Set up an empty property dict and no data."""
        self._data = None
        self._props = {}


    def getData(self):
        """Return the data array (None until a subclass loads something)."""
        return self._data


    def getPropsAsDict(self):
        """Return the dictionary holding the data properties."""
        return self._props


    props = property(fget=getPropsAsDict, doc="Property dict")
    data = property(fget=getData, doc="Data array")
class ColumnData(dict):
    """Read data that is stored in columns of text files.

    All read data is available via a dictionary-like interface. If
    column headers are available, the column names serve as dictionary keys.
    If no header exists an artificial key is generated: str(number_of_column).

    Splitting of text file lines is performed by the standard split() function
    (which gets passed the `sep` argument as separator string) and each
    element is converted into the desired datatype.

    Because data is read into a dictionary no two columns can have the same
    name in the header! Each column is stored as a list in the dictionary.
    """
    def __init__(self, source, header=True, sep=None, headersep=None,
                 dtype=float, skiplines=0):
        """Read data from file into a dictionary.

        :Parameters:
          source : basestring or dict
            If value is given as a string all data is read from the
            file and additional keyword arguments can be used to
            customize the read procedure. If a dictionary is passed
            a deepcopy is performed.
          header : bool or list of basestring
            Indicates whether the column names should be read from the
            first line (`header=True`). If `header=False` unique
            column names will be generated (see class docs). If
            `header` is a python list, its content is used as column
            header names and its length has to match the number of
            columns in the file.
          sep : basestring or None
            Separator string. The actual meaning depends on the output
            format (see class docs).
          headersep : basestring or None
            Separator string used in the header. The actual meaning
            depends on the output format (see class docs).
          dtype : type or list(types)
            Desired datatype(s). Datatype per column can be specified by
            passing a list of types.
          skiplines : int
            Number of lines to skip at the beginning of the file.
        """
        # init base class
        dict.__init__(self)

        # initialize with default: no particular column order known yet
        self._header_order = None

        if isinstance(source, str):
            self._fromFile(source, header=header, sep=sep, headersep=headersep,
                           dtype=dtype, skiplines=skiplines)

        elif isinstance(source, dict):
            for k, v in source.iteritems():
                self[k] = v
            # check data integrity
            self._check()

        else:
            raise ValueError, 'Unkown source for ColumnData [%s]' \
                  % `type(source)`

        # generate missing properties for each item in the header:
        # every column becomes readable as an attribute (sanitized name)
        classdict = self.__class__.__dict__
        for k in self.keys():
            if not classdict.has_key(k):
                # getter source code; the literal key is baked in via '%s'
                getter = "lambda self: self._getAttrib('%s')" % (k)
                # Sanitize the key, substitute ' []' with '_'
                k_ = re_sub('[[\] ]', '_', k)
                # replace multiple _s
                k_ = re_sub('__+', '_', k_)
                # remove quotes
                k_ = re_sub('["\']', '', k_)
                if __debug__:
                    debug("IOH", "Registering property %s for ColumnData key %s"
                          % (k_, k))
                # make sure to import class directly into local namespace
                # otherwise following does not work for classes defined
                # elsewhere
                exec 'from %s import %s' % (self.__module__,
                                            self.__class__.__name__)
                exec "%s.%s = property(fget=%s)" % \
                     (self.__class__.__name__, k_, getter)
                # TODO!!! Check if it is safe actually here to rely on value of
                #         k in lambda. May be it is treated as continuation and
                #         some local space would override it????
                #setattr(self.__class__,
                #        k,
                #        property(fget=lambda x: x._getAttrib("%s" % k)))
                # it seems to be error-prone due to continuation...


    __doc__ = enhancedDocString('ColumnData', locals())


    def _getAttrib(self, key):
        """Return corresponding value if given key is known to current instance

        Is used for automatically added properties to the class.

        :Raises:
          ValueError:
            If `key` is not known to given instance

        :Returns:
          Value if `key` is known
        """
        if self.has_key(key):
            return self[key]
        else:
            raise ValueError, "Instance %s has no data about %s" \
                % (`self`, `key`)


    def __str__(self):
        # brief summary: class name plus dimensions and the column names
        s = self.__class__.__name__
        if len(self.keys())>0:
            s += " %d rows, %d columns [" % \
                 (self.getNRows(), self.getNColumns())
            s += reduce(lambda x, y: x+" %s" % y, self.keys())
            s += "]"
        return s

    def _check(self):
        """Performs some checks for data integrity.

        :Raises:
          ValueError
            If not all columns have the same number of entries.
        """
        length = None
        for k in self.keys():
            if length == None:
                length = len(self[k])
            else:
                if not len(self[k]) == length:
                    raise ValueError, "Data integrity lost. Columns do not " \
                                      "have equal length."


    def _fromFile(self, filename, header, sep, headersep,
                  dtype, skiplines):
        """Loads column data from file -- clears object first.
        """
        # make a clean table
        self.clear()

        file_ = open(filename, 'r')

        self._header_order = None

        [ file_.readline() for x in range(skiplines) ]
        """Simply skip some lines"""
        # make column names, either take header or generate
        if header == True:
            # read first line and split by 'headersep'
            hdr = file_.readline().split(headersep)
            # remove bogus empty header titles
            hdr = filter(lambda x:len(x.strip()), hdr)
            self._header_order = hdr
        elif isinstance(header, list):
            hdr = header
        else:
            # generate artificial column names from the number of columns
            # found in the first data line
            hdr = [ str(i) for i in xrange(len(file_.readline().split(sep))) ]
            # reset file to not miss the first line
            file_.seek(0)
            [ file_.readline() for x in range(skiplines) ]


        # string in lists: one per column
        tbl = [ [] for i in xrange(len(hdr)) ]

        # do per column dtypes
        if not isinstance(dtype, list):
            dtype = [dtype] * len(hdr)

        # parse line by line and feed into the lists
        for line in file_:
            # get rid of leading and trailing whitespace
            line = line.strip()
            # ignore empty lines and comment lines
            if not line or line.startswith('#'):
                continue
            l = line.split(sep)

            if not len(l) == len(hdr):
                raise RuntimeError, \
                      "Number of entries in line [%i] does not match number " \
                      "of columns in header [%i]." % (len(l), len(hdr))

            for i, v in enumerate(l):
                # dtype None means: keep the raw string
                if not dtype[i] is None:
                    try:
                        v = dtype[i](v)
                    except ValueError:
                        # conversion failure is not fatal -- keep string value
                        warning("Can't convert %s to desired datatype %s." %
                                (`v`, `dtype`) + " Leaving original type")
                tbl[i].append(v)

        # check
        if not len(tbl) == len(hdr):
            raise RuntimeError, "Number of columns read from file does not " \
                                "match the number of header entries."

        # fill dict
        for i, v in enumerate(hdr):
            self[v] = tbl[i]


    def __iadd__(self, other):
        """Merge column data.
        """
        # for all columns in the other object
        for k, v in other.iteritems():
            if not self.has_key(k):
                raise ValueError, 'Unknown key [%s].' % `k`
            if not isinstance(v, list):
                raise ValueError, 'Can only merge list data, but got [%s].' \
                                  % `type(v)`
            # now it seems to be ok
            # XXX check for datatype?
            self[k] += v

        # look for problems, like columns present in self, but not in other
        self._check()

        return self


    def selectSamples(self, selection):
        """Return new ColumnData with selected samples"""

        data = copy.deepcopy(self)
        for k, v in data.iteritems():
            data[k] = [v[x] for x in selection]

        data._check()
        return data


    def getNColumns(self):
        """Returns the number of columns.
        """
        return len(self.keys())


    def tofile(self, filename, header=True, header_order=None, sep=' '):
        """Write column data to a text file.

        :Parameters:
          filename : basestring
            Target filename
          header : bool
            If `True` a column header is written, using the column
            keys. If `False` no header is written.
          header_order : None or list of basestring
            If it is a list of strings, they will be used instead
            of simply asking for the dictionary keys. However
            these strings must match the dictionary keys in number
            and identity. This argument type can be used to
            determine the order of the columns in the output file.
            The default value is `None`. In this case the columns
            will be in an arbitrary order.
          sep : basestring
            String that is written as a separator between two data columns.
        """
        # XXX do the try: except: dance
        file_ = open(filename, 'w')

        # write header
        if header_order == None:
            if self._header_order is None:
                col_hdr = self.keys()
            else:
                # use stored order + newly added keys at the last columns
                col_hdr = self._header_order + \
                          list(Set(self.keys()).difference(
                                Set(self._header_order)))
        else:
            if not len(header_order) == self.getNColumns():
                raise ValueError, 'Header list does not match number of ' \
                                  'columns.'
            for k in header_order:
                if not self.has_key(k):
                    raise ValueError, 'Unknown key [%s]' % `k`
            col_hdr = header_order

        if header == True:
            file_.write(sep.join(col_hdr) + '\n')

        # for all rows
        for r in xrange(self.getNRows()):
            # get attributes for all keys
            l = [str(self[k][r]) for k in col_hdr]
            # write to file with proper separator
            file_.write(sep.join(l) + '\n')

        file_.close()


    def getNRows(self):
        """Returns the number of rows.
        """
        # no data no rows (after Bob Marley)
        if not len(self.keys()):
            return 0
        # otherwise first key is as good as any other
        else:
            return len(self[self.keys()[0]])

    ncolumns = property(fget=getNColumns)
    nrows = property(fget=getNRows)
374 375 376
class SampleAttributes(ColumnData):
    """Read and write PyMVPA sample attribute definitions from and to text
    files.
    """
    def __init__(self, source, literallabels=False, header=None):
        """Load PyMVPA sample attributes from disk.

        :Parameters:
          source: basestring
            Filename of an attribute file
          literallabels: bool
            Either labels are given as literal strings
          header: None or bool or list of str
            If None, ['labels', 'chunks'] is assumed. Otherwise the same
            behavior as of `ColumnData`
        """
        if header is None:
            header = ['labels', 'chunks']
        if literallabels:
            dtypes = [str, float]
        else:
            dtypes = float
        ColumnData.__init__(self, source,
                            header=header,
                            sep=None, dtype=dtypes)


    def tofile(self, filename):
        """Store the sample attributes in a text file (no header,
        space-separated, labels column first).
        """
        ColumnData.tofile(self, filename,
                          header=False,
                          header_order=['labels', 'chunks'],
                          sep=' ')


    def getNSamples(self):
        """Returns the number of samples in the file.
        """
        return self.getNRows()


    def toEvents(self, **kwargs):
        """Convert the attribute table into a list of `Event` instances.

        A new event starts whenever the (label, chunk) combination changes;
        its duration is the number of identical consecutive combinations.
        Since the attributes list has no sense of absolute timing, both
        `onset` and `duration` are expressed in #samples units.

        :Parameters:
          kwargs
            Any keyword arguments provided are replicated through all
            the generated events.
        """
        events = []
        onset = 0
        length = 1
        previous = None
        # walk over all samples
        for row in xrange(self.nrows):
            current = (self.labels[row], self.chunks[row])
            if current == previous:
                # the running event continues
                length += 1
                continue
            # combination changed -- flush the completed event, if any
            if previous is not None:
                events.append(
                    Event(onset=onset, duration=length,
                          label=previous[0], chunk=previous[1], **kwargs))
                length = 1
            # the current sample starts the next event
            onset = row
            previous = current

        # flush the trailing event
        if previous is not None:
            events.append(
                Event(onset=onset, duration=length,
                      label=previous[0], chunk=previous[1], **kwargs))

        return events


    nsamples = property(fget=getNSamples)
471 472
class SensorLocations(ColumnData):
    """Common base class for sensor location readers.

    Subclasses must expose the coordinates of each sensor through the
    `pos_x`, `pos_y` and `pos_z` attributes.

    Axes follow this convention:

      x-axis: left -> right
      y-axis: anterior -> posterior
      z-axis: superior -> inferior
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the ColumnData constructor.
        """
        ColumnData.__init__(self, *args, **kwargs)


    def locations(self):
        """Assemble the sensor coordinates into a single array.

        :Returns:
          (nchannels x 3) array with coordinates in (x, y, z)
        """
        coords = (self.pos_x, self.pos_y, self.pos_z)
        return N.array(coords).T
498 499 500
class XAVRSensorLocations(SensorLocations):
    """Reader for a specific sensor location text file format.

    Expects five whitespace-separated columns per line:

    1. sensor name
    2. some useless integer
    3. position on x-axis
    4. position on y-axis
    5. position on z-axis
    """
    def __init__(self, source):
        """Parse sensor locations from a file.

        :Parameter:
          source : filename of an attribute file
        """
        hdr = ['names', 'some_number', 'pos_x', 'pos_y', 'pos_z']
        types = [str, int, float, float, float]
        SensorLocations.__init__(self, source, header=hdr,
                                 sep=None, dtype=types)
522 523
class TuebingenMEGSensorLocations(SensorLocations):
    """Reader for the Tuebingen MEG sensor location text file format.

    Expects seven whitespace-separated columns per line:

    1: sensor name
    2: position on y-axis
    3: position on x-axis
    4: position on z-axis
    5-7: same as 2-4, but for some outer surface thingie.

    Note that x and y seem to be swapped in the file, i.e. y as defined by
    the SensorLocations convention comes first, followed by x.

    Only inner surface coordinates are reported by `locations()`.
    """
    def __init__(self, source):
        """Parse sensor locations from a file.

        :Parameter:
          source : filename of an attribute file
        """
        hdr = ['names', 'pos_y', 'pos_x', 'pos_z',
               'pos_y2', 'pos_x2', 'pos_z2']
        types = [str] + [float] * 6
        SensorLocations.__init__(self, source, header=hdr,
                                 sep=None, dtype=types)
551 552
def design2labels(columndata, baseline_label=0,
                  func=lambda x: x > 0.0):
    """Helper to convert design matrix into a list of labels

    For each sample (row) of the design at most one explanatory variable
    (EV, i.e. column) may satisfy the criterion `func`; that EV's name
    becomes the sample's label.

    TODO: fix description/naming

    :Parameters:
      columndata : ColumnData
        Attributes where each known will be considered as a separate
        explanatory variable (EV) in the design.
      baseline_label
        What label to assign for samples where none of EVs was given a value
      func : functor
        Function which decides either a value should be considered

    :Output:
      list of labels which are taken from column names in
      ColumnData and baseline_label

    """
    # simple, naive row-wise scan -- keeps better control in case columndata
    # gets processed with non-numeric entries etc
    evs = columndata.keys()
    labels = []
    for row in xrange(columndata.nrows):
        row_values = [columndata[ev][row] for ev in evs]
        # names of all EVs whose value in this row meets the criterion
        hits = [ev for ev, value in zip(evs, row_values) if func(value)]

        if len(hits) > 1:
            # more than a single active EV -- no unique label possible
            raise ValueError("Row #%i with items %s has multiple entries "
                             "meeting the criterion. Cannot decide on the label"
                             % (row, row_values))
        elif len(hits) == 1:
            labels.append(hits[0])
        else:
            labels.append(baseline_label)
    return labels
# Registry of chunking strategies understood by labels2chunks():
# maps method codename -> human-readable description (also used to
# build error messages).
__known_chunking_methods = {
    'alllabels': 'Each chunk must contain instances of all labels'
    }
def labels2chunks(labels, method="alllabels", ignore_labels=None):
    # NOTE(review): because of the '%' formatting below, the leading string
    # is an expression statement, not a real docstring (__doc__ stays None).
    """Automagically decide on chunks based on labels

    :Parameters:
      labels
        labels to base chunking on
      method : basestring
        codename for method to use. Known are %s
      ignore_labels : list of basestring
        depends on the method. If method ``alllabels``, then don't
        seek for such labels in chunks. E.g. some 'reject' samples

    :rtype: list
    """ % __known_chunking_methods.keys()

    chunks = []
    if ignore_labels is None:
        ignore_labels = []
    # set of labels which must all have appeared before a chunk is
    # considered complete
    alllabels = Set(labels).difference(Set(ignore_labels))
    if method == 'alllabels':
        seenlabels = Set()
        lastlabel = None
        chunk = 0
        for label in labels:
            if label != lastlabel:
                # a new run of identical labels starts; if every required
                # label was already seen, begin the next chunk
                if seenlabels == alllabels:
                    chunk += 1
                    seenlabels = Set()
                lastlabel = label
                if not label in ignore_labels:
                    seenlabels.union_update([label])
            chunks.append(chunk)
        chunks = N.array(chunks)
        # fix up a bit the trailer: a trailing incomplete chunk gets merged
        # into the preceding one
        if seenlabels != alllabels:
            chunks[chunks == chunk] = chunk-1
        chunks = list(chunks)
    else:
        errmsg = "Unknown method to derive chunks is requested. Known are:\n"
        # NOTE(review): the loop variable shadows the `method` argument --
        # harmless here, since we raise immediately afterwards
        for method, descr in __known_chunking_methods.iteritems():
            errmsg += "  %s : %s\n" % (method, descr)
        raise ValueError, errmsg
    return chunks
645