9 """Misc function performing operations on datasets.
10
11 All the functions defined in this module must accept dataset as the
12 first argument since they are bound to Dataset class in the trailer.
13 """

__docformat__ = 'restructuredtext'

from sets import Set
from operator import isSequenceType

import numpy as N

from mvpa.datasets.base import Dataset, datasetmethod
from mvpa.misc.support import getBreakPoints

from mvpa.base import externals, warning

if __debug__:
    from mvpa.base import debug

if externals.exists('scipy'):
    from mvpa.datasets.miscfx_sp import detrend


@datasetmethod
def zscore(dataset, mean=None, std=None,
           perchunk=True, baselinelabels=None,
           pervoxel=True, targetdtype='float64'):
    """Z-score the samples of a `Dataset` (in-place).

    `mean` and `std` can be used to pass custom values to the z-scoring.
    Both may be scalars or arrays.

    All computations are done *in place*. Data upcasting is performed
    automatically into `targetdtype` if necessary.

    If `baselinelabels` is provided and `mean` or `std` are not, the
    corresponding measure is computed only on samples with labels in
    `baselinelabels`.

    If `perchunk` is True, samples within the same chunk are z-scored
    independently of samples from other chunks, i.e. mean and standard
    deviation are calculated per chunk.
    """

    if __debug__ and perchunk \
       and N.array(dataset.samplesperchunk.values()).min() < 2:
        warning("Z-scoring chunk-wise, and at least one chunk has fewer "
                "than two samples: features in such samples will be set "
                "to zero.")

    # integer data cannot be scaled in-place -- upcast to targetdtype
    if str(dataset.samples.dtype).startswith('uint') \
       or str(dataset.samples.dtype).startswith('int'):
        dataset.setSamplesDType(targetdtype)

    def doit(samples, mean, std, statsamples=None):
        """Internal helper: z-score `samples` using statistics computed
        from `statsamples` (defaults to `samples` itself)."""
        if statsamples is None:
            statsamples = samples

        # compute statistics per feature or across the whole array
        if pervoxel:
            axisarg = {'axis': 0}
        else:
            axisarg = {}

        # demean
        if mean is None:
            mean = statsamples.mean(**axisarg)
        samples -= mean

        # scale to unit variance, guarding against zero std
        if std is None:
            std = statsamples.std(**axisarg)
        if pervoxel:
            samples[:, std != 0] /= std[std != 0]
        else:
            samples /= std

        return samples

    # select the ids of the samples used to compute the statistics
    if baselinelabels is None:
        statids = None
    else:
        statids = Set(dataset.idsbylabels(baselinelabels))

    if perchunk:
        for c in dataset.uniquechunks:
            slicer = N.where(dataset.chunks == c)[0]
            if statids is not None:
                # restrict the statistics to baseline samples of this chunk
                statslicer = list(statids.intersection(Set(slicer)))
                dataset.samples[slicer] = doit(dataset.samples[slicer],
                                               mean, std,
                                               dataset.samples[statslicer])
            else:
                slicedsamples = dataset.samples[slicer]
                dataset.samples[slicer] = doit(slicedsamples,
                                               mean, std,
                                               slicedsamples)
    elif statids is None:
        doit(dataset.samples, mean, std, dataset.samples)
    else:
        doit(dataset.samples, mean, std, dataset.samples[list(statids)])
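

# A minimal usage sketch (not part of the original module; `_demo_zscore`
# is a hypothetical helper added purely for illustration):
def _demo_zscore():
    """Z-score a small random dataset chunk-wise."""
    ds = Dataset(samples=N.random.randn(6, 10),
                 labels=[0, 1] * 3, chunks=[0, 0, 0, 1, 1, 1])
    zscore(ds, perchunk=True)
    # every feature within each chunk now has ~zero mean and unit variance
    return ds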
127 """Apply a function to each row of the samples matrix of a dataset.
128
129 The functor given as `fx` has to honour an `axis` keyword argument in the
130 way that NumPy used it (e.g. NumPy.mean, var).
131
132 :Returns:
133 a new `Dataset` object with the aggregated feature(s).
134 """
135 agg = fx(dataset.samples, axis=1)
136
137 return Dataset(samples=N.array(agg, ndmin=2).T,
138 labels=dataset.labels,
139 chunks=dataset.chunks)
140
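

# A minimal usage sketch (hypothetical helper, added for illustration):
def _demo_aggregateFeatures():
    """Collapse all features into one aggregate feature per sample."""
    ds = Dataset(samples=N.random.randn(4, 8),
                 labels=[0, 1, 0, 1], chunks=[0, 0, 1, 1])
    ds_mean = aggregateFeatures(ds, N.mean)  # a single mean feature remains
    ds_var = aggregateFeatures(ds, N.var)    # any axis-aware functor works
    return ds_mean, ds_var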
144 """Returns a new dataset with all invariant features removed.
145 """
146 return dataset.selectFeatures(dataset.samples.std(axis=0).nonzero()[0])
147
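

# A minimal usage sketch (hypothetical helper, added for illustration):
def _demo_removeInvariantFeatures():
    """Drop a constant (zero-variance) feature column."""
    samples = N.random.randn(4, 3)
    samples[:, 1] = 1.0  # make the second feature invariant
    ds = Dataset(samples=samples, labels=[0, 1, 0, 1], chunks=[0, 0, 1, 1])
    return removeInvariantFeatures(ds)  # resulting dataset has 2 features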
151 """Change chunking of the dataset
152
153 Group chunks into groups to match desired number of chunks. Makes
154 sense if originally there were no strong groupping into chunks or
155 each sample was independent, thus belonged to its own chunk
156
157 :Parameters:
158 source : Dataset or list of chunk ids
159 dataset or list of chunk ids to operate on. If Dataset, then its chunks
160 get modified
161 nchunks : int
162 desired number of chunks
163 """

    if isinstance(source, Dataset):
        chunks = source.chunks
    else:
        chunks = source
    chunks_unique = N.unique(chunks)
    nchunks_orig = len(chunks_unique)

    if nchunks_orig < nchunks:
        raise ValueError, \
              "Original number of chunks is %d. Cannot coarsen them " \
              "to get %d chunks" % (nchunks_orig, nchunks)

    # figure out how many samples each original chunk contains
    counts = dict(zip(chunks_unique, [0] * len(chunks_unique)))
    for c in chunks:
        counts[c] += 1

    # greedily group consecutive chunks, aiming at roughly equal numbers
    # of samples per resulting chunk
    avg_chunk_size = N.sum(counts.values()) * 1.0 / nchunks
    chunks_groups = []
    cur_chunk = []
    # counter kept separate from the requested `nchunks` so the sanity
    # check below remains meaningful
    ngroups = 0
    cur_chunk_nsamples = 0
    samples_counted = 0
    for i, c in enumerate(chunks_unique):
        cc = counts[c]

        cur_chunk += [c]
        cur_chunk_nsamples += cc

        # time to close the current group?
        if (samples_counted + cur_chunk_nsamples
            >= (ngroups + 1) * avg_chunk_size) or i == nchunks_orig - 1:
            chunks_groups.append(cur_chunk)
            samples_counted += cur_chunk_nsamples
            cur_chunk_nsamples = 0
            cur_chunk = []
            ngroups += 1

    if len(chunks_groups) != nchunks:
        warning("Apparently logic in coarsenChunks is wrong. "
                "It was desired to get %d chunks, got %d"
                % (nchunks, len(chunks_groups)))

    # remap the old chunk ids onto their new group ids
    chunks_map = {}
    for i, group in enumerate(chunks_groups):
        for c in group:
            chunks_map[c] = i

    chunks_new = [chunks_map[x] for x in chunks]

    if __debug__:
        debug("DS_", "Using dictionary %s to remap old chunks %s into new %s"
              % (chunks_map, chunks, chunks_new))

    if isinstance(source, Dataset):
        if __debug__:
            debug("DS", "Coarsening %d chunks into %d chunks for %s"
                  % (nchunks_orig, len(chunks_groups), source))
        source.chunks = chunks_new
        return
    else:
        return chunks_new
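

# A minimal usage sketch (hypothetical helper, added for illustration):
def _demo_coarsenChunks():
    """Regroup ten singleton chunks into two chunks of five samples each."""
    # operating on a plain list of chunk ids returns the remapped ids
    return coarsenChunks(range(10), nchunks=2)  # [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]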
237 """Returns an array with the number of samples per label in each chunk.
238
239 Array shape is (chunks x labels).
240
241 :Parameters:
242 dataset: Dataset
243 Source dataset.
244 """
245 ul = dataset.uniquelabels
246 uc = dataset.uniquechunks
247
248 count = N.zeros((len(uc), len(ul)), dtype='uint')
249
250 for cc, c in enumerate(uc):
251 for lc, l in enumerate(ul):
252 count[cc, lc] = N.sum(N.logical_and(dataset.labels == l,
253 dataset.chunks == c))
254
255 return count
256
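

# A minimal usage sketch (hypothetical helper, added for illustration):
def _demo_getSamplesPerChunkLabel():
    """Count samples per (chunk, label) combination."""
    ds = Dataset(samples=N.zeros((4, 2)),
                 labels=[0, 1, 0, 1], chunks=[0, 0, 1, 1])
    # each (chunk, label) pair occurs exactly once here
    return getSamplesPerChunkLabel(ds)  # -> array([[1, 1], [1, 1]])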