"""Unit tests for PyMVPA dataset handling"""
10
import unittest
import random

import numpy as N
from sets import Set

from mvpa.datasets import Dataset
from mvpa.datasets.miscfx import zscore, aggregateFeatures
from mvpa.mappers.mask import MaskMapper
from mvpa.misc.exceptions import DatasetError
from mvpa.support import copy

from tests_warehouse import datasets
22
24
26 """Test composition of new datasets by addition of existing ones
27 """
28 data = Dataset(samples=range(5), labels=1, chunks=1)
29
30 self.failUnlessEqual(
31 data.uniquelabels, [1],
32 msg="uniquelabels must be correctly recomputed")
33
34
35 self.failUnlessEqual( data.nsamples, 1)
36
37 self.failUnless(
38 (data.samples == N.array([[0, 1, 2, 3, 4]])).all() )
39
40
41 self.failUnless( (data.labels == N.array([1])).all() )
42 self.failUnless( (data.chunks == N.array([1])).all() )
43
44
45 self.failUnlessRaises( DatasetError,
46 data.__iadd__, Dataset(samples=N.ones((2,3)),
47 labels=1,
48 chunks=1))
49
50
51 dss = datasets['uni2large'].samples
52 data += Dataset(samples=dss[:2, :5], labels=2, chunks=2 )
53 self.failUnlessEqual( data.nfeatures, 5 )
54 self.failUnless((data.labels == N.array([1, 2, 2])).all() )
55 self.failUnless((data.chunks == N.array([1, 2, 2])).all() )
56
57
58 data += Dataset(samples=dss[3:5, :5], labels=3)
59 self.failUnless((data.chunks == N.array([1, 2, 2, 0, 1]) ).all())
60
61
62 self.failUnless((data.uniquelabels == N.array([1, 2, 3]) ).all())
63
64
65 self.failUnlessRaises(DatasetError,
66 Dataset,
67 samples=dss[:4, :5],
68 labels=[ 1, 2, 3 ],
69 chunks=2)
70
71
72 self.failUnlessRaises(DatasetError,
73 Dataset,
74 samples=dss[:4, :5],
75 labels=[ 1, 2, 3, 4 ],
76 chunks=[ 2, 2, 2 ])
77
78
80 """Testing feature selection: sorted/not sorted, feature groups
81 """
82 origdata = datasets['uni2large'].samples[:10, :20]
83 data = Dataset(samples=origdata, labels=2, chunks=2 )
84
85
86 data.defineFeatureGroups(N.repeat(range(4), 5))
87
88 unmasked = data.samples.copy()
89
90
91 self.failUnless( data.nfeatures == 20 )
92
93 features_to_select = [3, 0, 17]
94 features_to_select_copy = copy.deepcopy(features_to_select)
95 features_to_select_sorted = copy.deepcopy(features_to_select)
96 features_to_select_sorted.sort()
97
98 bsel = N.array([False]*20)
99 bsel[ features_to_select ] = True
100
101 for sel, issorted in \
102 [(data.selectFeatures( features_to_select, sort=False), False),
103 (data.selectFeatures( features_to_select, sort=True), True),
104 (data.select(slice(None), features_to_select), True),
105 (data.select(slice(None), N.array(features_to_select)), True),
106 (data.select(slice(None), bsel), True)
107 ]:
108 self.failUnless(sel.nfeatures == 3)
109
110
111 self.failUnless(sel.samples.shape == (10, 3))
112
113
114 fts = (features_to_select, features_to_select_sorted)[int(issorted)]
115 self.failUnless((unmasked[:, fts] == sel.samples).all())
116
117
118 self.failUnless((sel._dsattr['featuregroups'] == [0, 0, 3]).all())
119
120
121 self.failUnless(features_to_select==features_to_select_copy)
122
123
124 gsel = data.selectFeatures(groups=[2, 3])
125 self.failUnless(gsel.nfeatures == 10)
126 self.failUnless(set(gsel._dsattr['featuregroups']) == set([2, 3]))
127
128
130 origdata = datasets['uni2large'].samples[:100, :10].T
131 data = Dataset(samples=origdata, labels=2, chunks=2 )
132
133 self.failUnless( data.nsamples == 10 )
134
135
136 for sel in [ data.selectSamples(5),
137 data.select(5),
138 data.select(slice(5, 6)),
139 ]:
140 self.failUnless( sel.nsamples == 1 )
141 self.failUnless( data.nfeatures == 100 )
142 self.failUnless( sel.origids == [5] )
143
144
145 for sel in [ data.selectSamples([5, 5]),
146
147
148
149
150
151 ]:
152 self.failUnless( sel.nsamples == 2 )
153 self.failUnless( (sel.samples[0] == data.samples[5]).all() )
154 self.failUnless( (sel.samples[0] == sel.samples[1]).all() )
155 self.failUnless( len(sel.labels) == 2 )
156 self.failUnless( len(sel.chunks) == 2 )
157 self.failUnless((sel.origids == [5, 5]).all())
158
159 self.failUnless( sel.samples.shape == (2, 100) )
160
161
162 for sel in [ data.selectSamples(data.idsbylabels(2)),
163 data.select(labels=2),
164 data.select('labels', 2),
165 data.select('labels', [2]),
166 data['labels', [2]],
167 data['labels': [2], 'labels':2],
168 data['labels': [2]],
169 ]:
170 self.failUnless( sel.nsamples == data.nsamples )
171 self.failUnless( N.all(sel.samples == data.samples) )
172
173 for sel in [ data.selectSamples(data.idsbylabels(3)),
174 data.select(labels=3),
175 data.select('labels', 3),
176 data.select('labels', [3]),
177 ]:
178 self.failUnless( sel.nsamples == 0 )
179
180 data = Dataset(samples=origdata,
181 labels=[8, 9, 4, 3, 3, 3, 4, 2, 8, 9],
182 chunks=2)
183 for sel in [ data.selectSamples(data.idsbylabels([2, 3])),
184 data.select('labels', [2, 3]),
185 data.select('labels', [2, 3], labels=[1, 2, 3, 4]),
186 data.select('labels', [2, 3], chunks=[1, 2, 3, 4]),
187 data['labels':[2, 3], 'chunks':[1, 2, 3, 4]],
188 data['chunks':[1, 2, 3, 4], 'labels':[2, 3]],
189 ]:
190 self.failUnless(N.all(sel.origids == [ 3., 4., 5., 7.]))
191
192
193 self.failUnless( (data.uniquelabels == [2, 3, 4, 8, 9]).all() );
194
195
196
197 sel = data.selectSamples(data.idsbylabels([3, 4, 8, 9]))
198 self.failUnlessEqual(Set(sel.uniquelabels), Set([3, 4, 8, 9]))
199 self.failUnless((sel.origids == [0, 1, 2, 3, 4, 5, 6, 8, 9]).all())
200
201
203 """Test some obscure selections of samples via select() or __getitem__
204 """
205 origdata = datasets['uni2large'].samples[:100, :10].T
206 data = Dataset(samples=origdata,
207
208 labels=[8, 9, 4, 3, 3, 3, 3, 2, 8, 9],
209 chunks=[1, 2, 3, 2, 3, 1, 5, 6, 3, 6])
210
211
212 if __debug__:
213
214 self.failUnlessRaises(ValueError, data.__getitem__,
215 'labels', 'featu')
216
217
218 self.failUnlessRaises(ValueError, data.__getitem__, 1, 1, 1)
219
220
221 for sel in [ data.select('chunks', [2, 6], labels=[3, 2],
222 features=slice(None)),
223 data.select('all', 'all', labels=[2,3], chunks=[2, 6]),
224 data['chunks', [2, 6], 'labels', [3, 2]],
225 data[:, :, 'chunks', [2, 6], 'labels', [3, 2]],
226
227 data[3:8, 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
228 ]:
229 self.failUnless(N.all(sel.origids == [3, 7]))
230 self.failUnless(sel.nfeatures == 100)
231 self.failUnless(N.all(sel.samples == origdata[ [3, 7] ]))
232
233 target = origdata[ [3, 7] ]
234 target = target[:, [1, 3] ]
235
236 for sel in [ data.select('all', [1, 3],
237 'chunks', [2, 6], labels=[3, 2]),
238 data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
239 data[:, [1,3], 'chunks', [2, 6], 'labels', [3, 2]],
240
241 data[3:8, [1, 1, 3, 1],
242 'chunks', [2, 6, 2, 6], 'labels', [3, 2]],
243 ]:
244 self.failUnless(N.all(sel.origids == [3, 7]))
245 self.failUnless(sel.nfeatures == 2)
246 self.failUnless(N.all(sel.samples == target))
247
248
249 self.failUnless(data.select(chunks=[23]).nsamples == 0)
250
251
252 self.failUnless(N.all(data.where(chunks=[2, 6])==[1, 3, 7, 9]))
253 self.failUnless(N.all(data.where(chunks=[2, 6], labels=[22, 3])==[3]))
254
255 idx = data.where('all', [1, 3, 10], labels=[2, 3, 4])
256 self.failUnless(N.all(idx[1] == [1, 3, 10]))
257 self.failUnless(N.all(idx[0] == range(2, 8)))
258
259 self.failUnless(data.where() is None)
260
261 self.failUnless(data.where(labels=[123]) == [])
262
263
277
278
280 data1 = Dataset(samples=N.ones((5, 5)), labels=1, chunks=1 )
281 data2 = Dataset(samples=N.ones((3, 5)), labels=2, chunks=1 )
282
283 merged = data1 + data2
284
285 self.failUnless( merged.nfeatures == 5 )
286 l12 = [1]*5 + [2]*3
287 l1 = [1]*8
288 self.failUnless( (merged.labels == l12).all() )
289 self.failUnless( (merged.chunks == l1).all() )
290
291 data1 += data2
292
293 self.failUnless( data1.nfeatures == 5 )
294 self.failUnless( (data1.labels == l12).all() )
295 self.failUnless( (data1.chunks == l1).all() )
296
297
299 """
300 """
301 data = Dataset(samples=N.ones((5, 1)), labels=range(5), chunks=1 )
302 data += Dataset(samples=N.ones((5, 1))+1, labels=range(5), chunks=2 )
303 data += Dataset(samples=N.ones((5, 1))+2, labels=range(5), chunks=3 )
304 data += Dataset(samples=N.ones((5, 1))+3, labels=range(5), chunks=4 )
305 data += Dataset(samples=N.ones((5, 1))+4, labels=range(5), chunks=5 )
306 self.failUnless( data.samplesperlabel == {0:5, 1:5, 2:5, 3:5, 4:5} )
307
308
309 sample = data.getRandomSamples( 2 )
310 self.failUnless( sample.samplesperlabel.values() == [ 2, 2, 2, 2, 2 ] )
311
312 self.failUnless( (data.uniquechunks == range(1, 6)).all() )
313
314
315 origlabels = data.labels.copy()
316
317 data.permuteLabels(True)
318
319 self.failIf( (data.labels == origlabels).all() )
320
321 data.permuteLabels(False)
322
323 self.failUnless( (data.labels == origlabels).all() )
324
325
326 data2 = Dataset(samples=data.samples,
327 labels=data.labels,
328 chunks=data.chunks )
329
330
331 self.failUnless( (data2.labels == origlabels).all() )
332
333
334 data2.permuteLabels( True )
335
336
337 self.failUnless( (data.labels == origlabels).all() )
338
339 self.failIf( (data2.labels == origlabels).all() )
340
341
343 """Test adding custom attributes to a dataset
344 """
345
346
347
348 ds = Dataset(samples=range(5), labels=1, chunks=1)
349 self.failUnlessRaises(AttributeError, lambda x:x.blobs, ds)
350 """Dataset.blobs should fail since .blobs wasn't yet registered"""
351
352
353 Dataset._registerAttribute("blobs", "_data", hasunique=True)
354 ds = Dataset(samples=range(5), labels=1, chunks=1)
355 self.failUnless(not ds.blobs != [ 0 ],
356 msg="By default new attributes supposed to get 0 as the value")
357
358 try:
359 ds.blobs = [1, 2]
360 self.fail(msg="Dataset.blobs=[1,2] should fail since "
361 "there is 5 samples")
362 except ValueError, e:
363 pass
364
365 try:
366 ds.blobs = [1]
367 except e:
368 self.fail(msg="We must be able to assign the attribute")
369
370
371
372
373
374
386
387
389 """Test z-scoring transformation
390 """
391
392 samples = N.array( (0,1,3,4,2,2,3,1,1,3,3,1,2,2,2,2) ).\
393 reshape((16, 1))
394 data = Dataset(samples=samples,
395 labels=range(16), chunks=[0]*16)
396 self.failUnlessEqual( data.samples.mean(), 2.0 )
397 self.failUnlessEqual( data.samples.std(), 1.0 )
398 zscore(data, perchunk=True)
399
400
401 check = N.array([-2,-1,1,2,0,0,1,-1,-1,1,1,-1,0,0,0,0],
402 dtype='float64').reshape(16,1)
403 self.failUnless( (data.samples == check).all() )
404
405 data = Dataset(samples=samples,
406 labels=range(16), chunks=[0]*16)
407 zscore(data, perchunk=False)
408 self.failUnless( (data.samples == check).all() )
409
410
411 data = Dataset(samples=samples,
412 labels=[0, 2, 2, 2, 1] + [2]*11,
413 chunks=[0]*16)
414 zscore(data, baselinelabels=[0, 1])
415 self.failUnless((samples == data.samples+1.0).all())
416
417
428
429
431 """Test creation of new dataset by applying a mapper"""
432 mapper = MaskMapper(N.array([1, 0, 1]))
433 dataset = Dataset(samples=N.arange(12).reshape( (4, 3) ),
434 labels=1,
435 chunks=1)
436 seldataset = dataset.applyMapper(featuresmapper=mapper)
437 self.failUnless( (dataset.selectFeatures([0, 2]).samples
438 == seldataset.samples).all() )
439
440
441
442 if __debug__:
443
444 self.failUnlessRaises(ValueError, mapper.reverse, [10, 20, 30])
445 self.failUnlessRaises(ValueError, mapper.forward, [10, 20])
446
447
448
449
450
451
452
453
455 """Test Dataset.idhash() if it gets changed if any of the
456 labels/chunks changes
457 """
458
459 dataset = Dataset(samples=N.arange(12).reshape( (4, 3) ),
460 labels=1,
461 chunks=1)
462 origid = dataset.idhash
463 dataset.labels = [3, 1, 2, 3]
464 self.failUnless(origid != dataset.idhash,
465 msg="Changing all labels should alter dataset's idhash")
466
467 origid = dataset.idhash
468
469 z = dataset.labels[1]
470 self.failUnlessEqual(origid, dataset.idhash,
471 msg="Accessing shouldn't change idhash")
472 z = dataset.chunks
473 self.failUnlessEqual(origid, dataset.idhash,
474 msg="Accessing shouldn't change idhash")
475 z[2] = 333
476 self.failUnless(origid != dataset.idhash,
477 msg="Changing value in attribute should change idhash")
478
479 origid = dataset.idhash
480 dataset.samples[1, 1] = 1000
481 self.failUnless(origid != dataset.idhash,
482 msg="Changing value in data should change idhash")
483
484
485 origid = dataset.idhash
486 dataset.permuteLabels(True)
487 self.failUnless(origid != dataset.idhash,
488 msg="Permutation also changes idhash")
489
490 dataset.permuteLabels(False)
491 self.failUnless(origid == dataset.idhash,
492 msg="idhash should be restored after "
493 "permuteLabels(False)")
494
495
512
513
520
521
523 """Test mapping of the labels from strings to numericals
524 """
525 od = {'apple':0, 'orange':1}
526 samples = [[3], [2], [3]]
527 labels_l = ['apple', 'orange', 'apple']
528
529
530 ds = Dataset(samples=samples, labels='orange')
531 self.failUnless(N.all(ds.labels == ['orange']*3))
532
533
534 for ds in [Dataset(samples=samples, labels=labels_l, labels_map=od),
535
536 Dataset(samples=samples, labels=labels_l, labels_map=True)]:
537 self.failUnless(N.all(ds.labels == [0, 1, 0]))
538 self.failUnless(ds.labels_map == od)
539 ds_ = ds[1]
540 self.failUnless(ds_.labels_map == od,
541 msg='selectSamples should provide full mapping preserved')
542
543
544 self.failUnlessRaises(ValueError, Dataset, samples=samples,
545 labels=labels_l, labels_map = {'apple':0})
546
547
548
549 ds2 = Dataset(samples=samples, labels=labels_l)
550 self.failUnlessEqual(ds2.labels_map, None)
551
552
553 od3 = {1:100, 2:101, 3:100}
554 ds3 = Dataset(samples=samples, labels=[1, 2, 3],
555 labels_map=od3)
556 self.failUnlessEqual(ds3.labels_map, od3)
557 self.failUnless(N.all(ds3.labels == [100, 101, 100]))
558
559 ds3_ = ds3[1]
560 self.failUnlessEqual(ds3.labels_map, od3)
561
562 ds4 = Dataset(samples=samples, labels=labels_l)
563
564
565 ds = Dataset(samples=samples, labels=labels_l, labels_map=od)
566
567 self.failUnlessRaises(ValueError, ds.setLabelsMap,
568 {'orange': 1, 'nonorange': 3})
569 new_map = {'tasty':0, 'crappy':1}
570 ds.labels_map = new_map.copy()
571 self.failUnlessEqual(ds.labels_map, new_map)
572
573
575 """Adding datasets needs special care whenever labels mapping
576 is used."""
577 samples = [[3], [2], [3]]
578 l1 = ['a', 'b', 'a']
579 l2 = ['b', 'a', 'c']
580 ds1 = Dataset(samples=samples, labels=l1,
581 labels_map={'a':1, 'b':2})
582 ds2 = Dataset(samples=samples, labels=l2,
583 labels_map={'c':1, 'a':4, 'b':2})
584
585
586 ds0 = Dataset(samples=samples, labels=l2)
587
588
589 lm1 = ds1.labels_map.copy()
590 lm2 = ds2.labels_map.copy()
591
592 ds3 = ds1 + ds2
593 self.failUnless(N.all(ds3.labels ==
594 N.hstack((ds1.labels, [2, 1, 5]))))
595 self.failUnless(ds1.labels_map == lm1)
596 self.failUnless(ds2.labels_map == lm2)
597
598
599 ds1 += ds2
600 self.failUnless(N.all(ds1.labels == ds3.labels))
601
602
603 self.failUnless(N.all(ds1.labels_map == ds3.labels_map))
604
605
606
607 self.failUnlessRaises(ValueError, ds1.__add__, ds0)
608 self.failUnlessRaises(ValueError, ds1.__iadd__, ds0)
609
610
612
613 ds = datasets['uni2small']
614
615 ds_ = ds.copy()
616
617 self.failUnless(N.all(ds.samples == ds_.samples))
618 self.failUnless(N.all(ds.labels == ds_.labels))
619 self.failUnless(N.all(ds.chunks == ds_.chunks))
620
621
622 ds_.samples[0, 0] = 1234
623 self.failUnless(N.any(ds.samples != ds_.samples))
624 self.failUnless(N.all(ds.labels == ds_.labels))
625 self.failUnless(N.all(ds.chunks == ds_.chunks))
626
627 ds_.labels = N.hstack(([123], ds_.labels[1:]))
628 self.failUnless(N.any(ds.samples != ds_.samples))
629 self.failUnless(N.any(ds.labels != ds_.labels))
630 self.failUnless(N.all(ds.chunks == ds_.chunks))
631
632 ds_.chunks = N.hstack(([1234], ds_.chunks[1:]))
633 self.failUnless(N.any(ds.samples != ds_.samples))
634 self.failUnless(N.any(ds.labels != ds_.labels))
635 self.failUnless(N.any(ds.chunks != ds_.chunks))
636
637 self.failUnless(N.any(ds.uniquelabels != ds_.uniquelabels))
638 self.failUnless(N.any(ds.uniquechunks != ds_.uniquechunks))
639
640
642 """Test detection of transition points
643
644 Shame on Yarik -- he didn't create unittests right away... damn me
645 """
646 ds = Dataset(samples=N.array(range(10), ndmin=2).T,
647 labels=[0,0,1,1,0,0,1,1,0,0],
648 chunks=[0,0,0,0,0,1,1,1,1,1])
649 self.failUnless(ds.idsonboundaries() == [0,2,4,5,6,8],
650 "We should have got ids whenever either chunk or "
651 "label changes")
652 self.failUnless(ds.idsonboundaries(attributes_to_track=['chunks'])
653 == [0, 5])
654
655 self.failUnless(ds.idsonboundaries(prior=1, post=-1,
656 attributes_to_track=['chunks'])
657 == [4, 9])
658 self.failUnless(ds.idsonboundaries(prior=2, post=-1,
659 attributes_to_track=['chunks'])
660 == [3, 4, 8, 9])
661 self.failUnless(ds.idsonboundaries(prior=2, post=-1,
662 attributes_to_track=['chunks'],
663 revert=True)
664 == [0, 1, 2, 5, 6, 7])
665 self.failUnless(ds.idsonboundaries(prior=1, post=1,
666 attributes_to_track=['chunks'])
667 == [0, 1, 4, 5, 6, 9])
668
669 self.failUnless(ds.idsonboundaries(prior=2) == range(10))
670
671
674
675
676 if __name__ == '__main__':
677 import runner
678