
Source Code for Module mvpa.datasets.miscfx

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Misc functions performing operations on datasets.

All the functions defined in this module must accept a dataset as the
first argument since they are bound to the Dataset class in the trailer.
"""

__docformat__ = 'restructuredtext'

from sets import Set
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend
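As the module docstring notes, every function defined below takes the dataset as its
first argument and is additionally bound to the Dataset class via the `@datasetmethod`
decorator. A minimal sketch of what this is assumed to mean for callers (`ds` stands
for any existing `Dataset` instance; `zscore` is used just as an example):

    from mvpa.datasets.miscfx import zscore

    zscore(ds, perchunk=True)     # plain function call
    ds.zscore(perchunk=True)      # the same operation invoked as a bound Dataset method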

@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
    """Z-Score the samples of a `Dataset` (in-place).

    `mean` and `std` can be used to pass custom values to the z-scoring.
    Both may be scalars or arrays.

    All computations are done *in place*. Data upcasting is done
    automatically if necessary into `targetdtype`.

    If `baselinelabels` is provided and `mean` or `std` is not, the
    corresponding measure is computed only from the samples whose labels
    are in `baselinelabels`.

    If `perchunk` is True, samples within the same chunk are z-scored
    independently of samples from other chunks, i.e. mean and standard
    deviation are calculated per chunk.
    """

    if __debug__ and perchunk \
       and N.array(dataset.samplesperchunk.values()).min() < 2:
        warning("Z-scoring chunk-wise and one chunk with less than two "
                "samples will set features in these samples to zero.")

    # cast to floating point datatype if necessary
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal method."""

        if statsamples is None:
            # if nothing provided -- mean/std on all samples
            statsamples = samples

        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # calculate mean if necessary
        if mean is None:
            mean = statsamples.mean(**axisarg)

        # de-mean
        samples -= mean

        # calculate std-deviation if necessary
        # XXX YOH: would that be actually what we want?
        # maybe we want actually estimate of deviation from the mean,
        # which per se might be not statsamples.mean (see above)?
        # if logic to be changed -- adjust ZScoreMapper as well
        if std is None:
            std = statsamples.std(**axisarg)

        # do the z-scoring
        if pervoxel:
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    # for the sake of speed yoh didn't simply create a list
    # [True]*dataset.nsamples to provide easy selection of everything
    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if not statids is None:
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
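A minimal usage sketch for `zscore`. It is not part of the module source; the small
synthetic dataset (8 samples, 5 features, two chunks of four samples) is an
illustrative assumption:

    import numpy as N
    from mvpa.datasets.base import Dataset
    from mvpa.datasets.miscfx import zscore

    ds = Dataset(samples=N.random.normal(size=(8, 5)),
                 labels=[0, 1] * 4,
                 chunks=[0] * 4 + [1] * 4)
    zscore(ds, perchunk=True)      # modifies ds.samples in place
    # within each chunk every feature should now be (approximately)
    # zero-mean with unit standard deviation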

@datasetmethod
def aggregateFeatures(dataset, fx=N.mean):
    """Apply a function to each row of the samples matrix of a dataset.

    The functor given as `fx` has to honour an `axis` keyword argument in the
    way that NumPy uses it (e.g. NumPy.mean, var).

    :Returns:
      a new `Dataset` object with the aggregated feature(s).
    """
    agg = fx(dataset.samples, axis=1)

    return Dataset(samples=N.array(agg, ndmin=2).T,
                   labels=dataset.labels,
                   chunks=dataset.chunks)
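A brief sketch of the intended use, reusing the `ds` from the z-scoring sketch above
(any `Dataset` would do):

    from mvpa.datasets.miscfx import aggregateFeatures

    mean_ds = aggregateFeatures(ds, fx=N.mean)   # or ds.aggregateFeatures(N.mean)
    # mean_ds holds a single aggregated feature per sample (shape nsamples x 1),
    # with labels and chunks copied from ds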

@datasetmethod
def removeInvariantFeatures(dataset):
    """Returns a new dataset with all invariant features removed.
    """
    return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
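A small sketch showing the effect on a dataset with one constant feature; the
synthetic data below is an illustrative assumption:

    import numpy as N
    from mvpa.datasets.base import Dataset
    from mvpa.datasets.miscfx import removeInvariantFeatures

    samples = N.random.normal(size=(6, 4))
    samples[:, 2] = 42.0                         # make feature 2 invariant
    ds = Dataset(samples=samples, labels=[0, 1] * 3, chunks=range(6))
    reduced = removeInvariantFeatures(ds)        # or ds.removeInvariantFeatures()
    # reduced keeps only the three features with non-zero variance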

@datasetmethod
def coarsenChunks(source, nchunks=4):
    """Change chunking of the dataset.

    Group chunks into groups to match the desired number of chunks. This
    makes sense if originally there was no strong grouping into chunks, or
    each sample was independent and thus belonged to its own chunk.

    :Parameters:
      source : Dataset or list of chunk ids
        dataset or list of chunk ids to operate on. If a Dataset, then its
        chunks get modified in-place.
      nchunks : int
        desired number of chunks
    """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # figure out number of samples per each chunk
    counts = dict(zip(chunks_unique, [0] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # now we need to group chunks to get more or less equalized number
    # of samples per chunk. No sophistication is done -- just
    # consecutively group to get close to desired number of samples
    # per chunk
    avg_chunk_size = N.sum(counts.values()) * 1.0 / nchunks
    chunks_groups = []
    cur_chunk = []
    nchunks = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # time to get a new chunk?
        if (samples_counted + cur_chunk_nsamples
                >= (nchunks + 1) * avg_chunk_size) or i == nchunks_orig - 1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            nchunks += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # remap using groups
    # create dictionary
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, len(chunks_new), source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new
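A sketch of the list-of-chunk-ids variant; the numbers are illustrative, and the
grouping shown in the comment follows the consecutive-grouping logic above:

    from mvpa.datasets.miscfx import coarsenChunks

    chunks = range(10)                           # ten samples, each in its own chunk
    new_chunks = coarsenChunks(chunks, nchunks=4)
    # new_chunks maps the ten original chunk ids onto 4 groups of roughly
    # equal size, e.g. something like [0, 0, 0, 1, 1, 2, 2, 2, 3, 3]
    # passing a Dataset instead would rewrite its .chunks attribute in-place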

@datasetmethod
def getSamplesPerChunkLabel(dataset):
    """Returns an array with the number of samples per label in each chunk.

    Array shape is (chunks x labels).

    :Parameters:
      dataset : Dataset
        Source dataset.
    """
    ul = dataset.uniquelabels
    uc = dataset.uniquechunks

    count = N.zeros((len(uc), len(ul)), dtype='uint')

    for cc, c in enumerate(uc):
        for lc, l in enumerate(ul):
            count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
                                                dataset.chunks == c))

    return count
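A short sketch, reusing a dataset `ds` as in the earlier examples; the indexing
comment restates the docstring:

    from mvpa.datasets.miscfx import getSamplesPerChunkLabel

    counts = getSamplesPerChunkLabel(ds)         # or ds.getSamplesPerChunkLabel()
    # counts[i, j] is the number of samples in chunk ds.uniquechunks[i]
    # that carry label ds.uniquelabels[j]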