
Source Code for Module mvpa.clfs.stats

# emacs: -*- mode: python; py-indent-offset: 4; indent-tabs-mode: nil -*-
# vi: set ft=python sts=4 ts=4 sw=4 et:
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
#
#   See COPYING file distributed along with the PyMVPA package for the
#   copyright and license terms.
#
### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ### ##
"""Estimator for classifier error distributions."""

__docformat__ = 'restructuredtext'

import numpy as N

from mvpa.base import externals, warning
from mvpa.misc.state import ClassWithCollections, StateVariable

if __debug__:
    from mvpa.base import debug


class Nonparametric(object):
    """Non-parametric 1d distribution -- derives cdf based on stored values.

    Introduced to complement parametric distributions present in scipy.stats.
    """

    def __init__(self, dist_samples):
        self._dist_samples = N.ravel(dist_samples)


    @staticmethod
    def fit(dist_samples):
        return [dist_samples]


    def cdf(self, x):
        """Returns the cdf value at `x`.
        """
        return N.vectorize(lambda v: (self._dist_samples <= v).mean())(x)

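# A minimal illustrative sketch of Nonparametric usage (hypothetical values):
# it simply stores the samples and reports, for any x, the fraction of stored
# samples <= x.
#
#   samples = N.random.normal(size=1000)
#   npdist = Nonparametric(samples)
#   npdist.cdf(0.0)               # ~0.5 for a roughly symmetric sample
#   npdist.cdf([-2.0, 0.0, 2.0])  # vectorized over array-like input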

def _pvalue(x, cdf_func, tail, return_tails=False, name=None):
    """Helper function to return p-value(x) given cdf and tail

    :Parameters:
      cdf_func : callable
        Function to be used to derive cdf values for x
      tail : str ('left', 'right', 'any', 'both')
        Which tail of the distribution to report. For 'any' and 'both'
        it chooses the tail it belongs to based on the comparison to
        p=0.5. In the case of 'any' significance is taken like in a
        one-tailed test.
      return_tails : bool
        If True, a tuple (pvalues, tails) is returned, where tails contains
        1s if the value was from the right tail, and 0s if the value was
        from the left tail.
    """
    is_scalar = N.isscalar(x)
    if is_scalar:
        x = [x]

    cdf = cdf_func(x)

    if __debug__ and 'CHECK_STABILITY' in debug.active:
        cdf_min, cdf_max = N.min(cdf), N.max(cdf)
        if cdf_min < 0 or cdf_max > 1.0:
            s = ('', ' for %s' % name)[int(name is not None)]
            warning('Stability check of cdf %s failed%s. Min=%s, max=%s' % \
                    (cdf_func, s, cdf_min, cdf_max))

    # no escape but to assure that CDF is in the right range. Some
    # distributions from scipy tend to jump away from [0,1]
    cdf = N.clip(cdf, 0, 1.0)

    if tail == 'left':
        if return_tails:
            right_tail = N.zeros(cdf.shape, dtype=bool)
    elif tail == 'right':
        cdf = 1 - cdf
        if return_tails:
            right_tail = N.ones(cdf.shape, dtype=bool)
    elif tail in ('any', 'both'):
        right_tail = (cdf >= 0.5)
        cdf[right_tail] = 1.0 - cdf[right_tail]
        if tail == 'both':
            # we need to halve the significance
            cdf *= 2

    # Assure that NaNs didn't get a significant value
    cdf[N.isnan(x)] = 1.0
    if is_scalar: res = cdf[0]
    else:         res = cdf

    if return_tails:
        return (res, right_tail)
    else:
        return res

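# A minimal illustrative sketch of _pvalue (hypothetical values), assuming the
# Nonparametric helper above provides the cdf:
#
#   samples = N.random.normal(size=1000)
#   cdf = Nonparametric(samples).cdf
#   _pvalue(2.5, cdf, tail='right')           # small p for an unusually large value
#   _pvalue([-2.5, 0.0, 2.5], cdf, tail='both',
#           return_tails=True)                # -> (pvalues, right-tail flags)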

class NullDist(ClassWithCollections):
    """Base class for null-hypothesis testing.

    """

    # Although the base class is not benefiting from states, derived
    # classes do (MCNullDist). For the sake of avoiding multiple
    # inheritance and associated headache -- let them all be ClassWithCollections,
    # performance hit should be negligible in most of the scenarios
    _ATTRIBUTE_COLLECTIONS = ['states']

    def __init__(self, tail='both', **kwargs):
        """Cheap initialization.

        :Parameter:
          tail: str ('left', 'right', 'any', 'both')
            Which tail of the distribution to report. For 'any' and 'both'
            it chooses the tail it belongs to based on the comparison to
            p=0.5. In the case of 'any' significance is taken like in a
            one-tailed test.
        """
        ClassWithCollections.__init__(self, **kwargs)

        self._setTail(tail)

    def __repr__(self, prefixes=[]):
        return super(NullDist, self).__repr__(
            prefixes=["tail=%s" % `self.__tail`] + prefixes)


    def _setTail(self, tail):
        # sanity check
        if tail not in ('left', 'right', 'any', 'both'):
            raise ValueError, 'Unknown value "%s" to `tail` argument.' \
                  % tail
        self.__tail = tail


    def fit(self, measure, wdata, vdata=None):
        """Implement to fit the distribution to the data."""
        raise NotImplementedError


    def cdf(self, x):
        """Implementations return the value of the cumulative distribution
        function (left or right tail depending on the setting).
        """
        raise NotImplementedError


    def p(self, x, **kwargs):
        """Returns the p-value for values of `x`.
        Returned values are determined from the left, right, or any tail
        depending on the constructor setting.

        In case a `FeaturewiseDatasetMeasure` was used to estimate the
        distribution the method returns an array. In that case `x` can be
        a scalar value or an array of a matching shape.
        """
        return _pvalue(x, self.cdf, self.__tail, **kwargs)


    tail = property(fget=lambda x:x.__tail, fset=_setTail)


class MCNullDist(NullDist):
    """Null-hypothesis distribution is estimated from randomly permuted data labels.

    The distribution is estimated by calling fit() with an appropriate
    `DatasetMeasure` or `TransferError` instance and a training and a
    validation dataset (in case of a `TransferError`). For a customizable
    number of cycles the training data labels are permuted and the
    corresponding measure computed. In case of a `TransferError` this is the
    error when predicting the *correct* labels of the validation dataset.

    The distribution can be queried using the `cdf()` method, which can be
    configured to report probabilities/frequencies from the `left` or `right`
    tail, i.e. the fraction of the distribution that is lower or larger than
    some critical value.

    This class also supports `FeaturewiseDatasetMeasure`. In that case `cdf()`
    returns an array of featurewise probabilities/frequencies.
    """

    _DEV_DOC = """
    TODO automagically decide on the number of samples/permutations needed
    Caution should be paid though since resultant distributions might be
    quite far from some conventional ones (e.g. Normal) -- they are expected
    to be bimodal (or actually multimodal) in many scenarios.
    """

    dist_samples = StateVariable(enabled=False,
                                 doc='Samples obtained for each permutation')

    def __init__(self, dist_class=Nonparametric, permutations=100, **kwargs):
        """Initialize Monte-Carlo Permutation Null-hypothesis testing

        :Parameters:
          dist_class: class
            This can be any class which provides parameter estimates
            via a `fit()` method to initialize the instance, and
            provides a `cdf(x)` method for estimating the value of x in the CDF.
            All distributions from SciPy's 'stats' module can be used.
          permutations: int
            This many permutations of labels will be performed to
            determine the distribution under the null hypothesis.
        """
        NullDist.__init__(self, **kwargs)

        self._dist_class = dist_class
        self._dist = []                 # actual distributions

        self.__permutations = permutations
        """Number of permutations to compute the estimate of the null
        distribution."""

    def __repr__(self, prefixes=[]):
        prefixes_ = ["permutations=%s" % self.__permutations]
        if self._dist_class != Nonparametric:
            prefixes_.insert(0, 'dist_class=%s' % `self._dist_class`)
        return super(MCNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)


    def fit(self, measure, wdata, vdata=None):
        """Fit the distribution by performing multiple cycles which repeatedly
        permute labels in the training dataset.

        :Parameters:
          measure: (`Featurewise`)`DatasetMeasure` | `TransferError`
            TransferError instance used to compute all errors.
          wdata: `Dataset` which gets permuted and used to compute the
            measure/transfer error multiple times.
          vdata: `Dataset` used for validation.
            If provided, measure is assumed to be a `TransferError` and
            the working and validation datasets are passed onto it.
        """
        dist_samples = []
        """Holds the values for randomized labels."""

        # decide on the arguments to measure
        if not vdata is None:
            measure_args = [vdata, wdata]
        else:
            measure_args = [wdata]

        # estimate null-distribution
        for p in xrange(self.__permutations):
            # new permutation all the time
            # but only permute the training data and keep the testdata constant
            #
            # TODO this really needs to be more clever! If data samples are
            # shuffled within a class it really makes no difference for the
            # classifier, hence the number of permutations to estimate the
            # null-distribution of transfer errors can be reduced dramatically
            # when the *right* permutations (the ones that matter) are done.
            wdata.permuteLabels(True, perchunk=False)

            # compute and store the measure of this permutation
            # assume it has `TransferError` interface
            dist_samples.append(measure(*measure_args))

        # restore original labels
        wdata.permuteLabels(False, perchunk=False)

        # store samples
        self.dist_samples = dist_samples = N.asarray(dist_samples)

        # fit distribution per each element

        # to decide whether it was done on scalars or vectors
        shape = dist_samples.shape
        nshape = len(shape)
        # if just 1 dim, original data was scalar, just create an
        # artificial dimension for it
        if nshape == 1:
            dist_samples = dist_samples[:, N.newaxis]

        # fit per each element.
        # XXX could be more elegant? may be use N.vectorize?
        dist_samples_rs = dist_samples.reshape((shape[0], -1))
        dist = []
        for samples in dist_samples_rs.T:
            params = self._dist_class.fit(samples)
            if __debug__ and 'STAT' in debug.active:
                debug('STAT', 'Estimated parameters for the %s are %s'
                      % (self._dist_class, str(params)))
            dist.append(self._dist_class(*params))
        self._dist = dist


    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        if self._dist is None:
            # XXX We might not want to discriminate that way since
            # usually generators also have .cdf where they rely on the
            # default parameters. But then what about Nonparametric
            raise RuntimeError, "Distribution has to be fit first"

        is_scalar = N.isscalar(x)
        if is_scalar:
            x = [x]

        x = N.asanyarray(x)
        xshape = x.shape
        # assure x is a 1D array now
        x = x.reshape((-1,))

        if len(self._dist) != len(x):
            raise ValueError, 'Distribution was fit for a structure with %d' \
                  ' elements, but is now queried with %d elements' \
                  % (len(self._dist), len(x))

        # extract cdf values per each element
        cdfs = [ dist.cdf(v) for v, dist in zip(x, self._dist) ]
        return N.array(cdfs).reshape(xshape)


    def clean(self):
        """Clean stored distributions

        Storing all of the distributions might be too expensive
        (e.g. in the case of Nonparametric), and the scope of the object
        might be too broad to wait for it to be destroyed. Clean binds
        the list of fitted distributions to an empty list to let gc
        reclaim the memory.
        """
        self._dist = []

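# A minimal illustrative sketch of MCNullDist. The Dataset and OneWayAnova names
# below are the usual PyMVPA candidates and are used only as an assumption --
# any measure callable and any dataset exposing permuteLabels() would do:
#
#   from mvpa.datasets import Dataset
#   from mvpa.measures.anova import OneWayAnova
#   ds = Dataset(samples=N.random.normal(size=(20, 5)), labels=[0, 1] * 10)
#   null = MCNullDist(permutations=100, tail='right')
#   null.fit(OneWayAnova(), ds)        # permutes ds labels internally
#   null.p(OneWayAnova()(ds))          # featurewise p-values under H0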


class FixedNullDist(NullDist):
    """Proxy/Adaptor class for SciPy distributions.

    All distributions from SciPy's 'stats' module can be used with this class.

    >>> import numpy as N
    >>> from scipy import stats
    >>> from mvpa.clfs.stats import FixedNullDist
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4))
    >>> dist.p(2)
    0.5
    >>>
    >>> dist.cdf(N.arange(5))
    array([ 0.30853754,  0.40129367,  0.5       ,  0.59870633,  0.69146246])
    >>>
    >>> dist = FixedNullDist(stats.norm(loc=2, scale=4), tail='right')
    >>> dist.p(N.arange(5))
    array([ 0.69146246,  0.59870633,  0.5       ,  0.40129367,  0.30853754])
    """
    def __init__(self, dist, **kwargs):
        """
        :Parameter:
          dist: distribution object
            This can be any object that has a `cdf()` method to report the
            cumulative distribution function values.
        """
        NullDist.__init__(self, **kwargs)

        self._dist = dist


    def fit(self, measure, wdata, vdata=None):
        """Does nothing since the distribution is already fixed."""
        pass


    def cdf(self, x):
        """Return value of the cumulative distribution function at `x`.
        """
        return self._dist.cdf(x)


    def __repr__(self, prefixes=[]):
        prefixes_ = ["dist=%s" % `self._dist`]
        return super(FixedNullDist, self).__repr__(
            prefixes=prefixes_ + prefixes)


class AdaptiveNullDist(FixedNullDist):
    """Adaptive distribution which adjusts parameters according to the data

    WiP: internal implementation might change
    """
    def fit(self, measure, wdata, vdata=None):
        """Cares about the dimensionality of the feature space in measure
        """

        try:
            nfeatures = len(measure.feature_ids)
        except ValueError:               # XXX
            nfeatures = N.prod(wdata.shape[1:])

        dist_gen = self._dist
        if not hasattr(dist_gen, 'fit'): # frozen already
            dist_gen = dist_gen.dist     # rv_frozen at least has it ;)

        args, kwargs = self._adapt(nfeatures, measure, wdata, vdata)
        if __debug__:
            debug('STAT', 'Adapted parameters for %s to be %s, %s'
                  % (dist_gen, args, kwargs))
        self._dist = dist_gen(*args, **kwargs)


    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        raise NotImplementedError


class AdaptiveRDist(AdaptiveNullDist):
    """Adaptive rdist: params are (nfeatures-1, 0, 1)
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (nfeatures-1, 0, 1), {}

    # XXX: RDist has a stability issue, just run
    #  python -c "import scipy.stats; print scipy.stats.rdist(541,0,1).cdf(0.72)"
    # to get some improbable value, so we need to take care about that manually
    # here
    def cdf(self, x):
        cdf_ = self._dist.cdf(x)
        bad_values = N.where(N.abs(cdf_) > 1)
        # XXX there might be a better implementation (faster/more elegant) using
        #     N.clip, the only problem is that instability results might flip the
        #     sign arbitrarily
        if len(bad_values[0]):
            # in this distribution we have mean at 0, so we can take that easily
            # into account
            cdf_bad = cdf_[bad_values]
            x_bad = x[bad_values]
            cdf_bad[x_bad < 0] = 0.0
            cdf_bad[x_bad >= 0] = 1.0
            cdf_[bad_values] = cdf_bad
        return cdf_


class AdaptiveNormal(AdaptiveNullDist):
    """Adaptive Normal Distribution: params are (0, sqrt(1/nfeatures))
    """

    def _adapt(self, nfeatures, measure, wdata, vdata=None):
        return (0, 1.0/N.sqrt(nfeatures)), {}


if externals.exists('scipy'):
    from mvpa.support.stats import scipy
    from scipy.stats import kstest
    """
    Thoughts:

    So we can use `scipy.stats.kstest` (Kolmogorov-Smirnov test) to
    check/reject H0 that samples come from a given distribution. But
    since it is based on the full range of data, we might be better off
    with some ad-hoc checking by the detection power of the values in the
    tail of a tentative distribution.

    """

    # We need a way to fixate estimation of some parameters
    # (e.g. mean), so lets create a simple proxy, or maybe a class
    # generator later on, which would take care of penalizing change
    # from the 'right' arguments

    import scipy

    class rv_semifrozen(object):
        """Helper proxy-class to fit a distribution when some parameters are known

        It is an ugly hack with snippets of code taken from scipy, which is
        Copyright (c) 2001, 2002 Enthought, Inc.
        and is distributed under BSD license
        http://www.scipy.org/License_Compatibility
        """

        def __init__(self, dist, loc=None, scale=None, args=None):

            self._dist = dist
            # loc and scale
            theta = (loc, scale)
            # args
            Narg_ = dist.numargs
            if args is not None:
                Narg = len(args)
                if Narg > Narg_:
                    raise ValueError, \
                          'Distribution %s has only %d arguments. Got %d' \
                          % (dist, Narg_, Narg)
                args += (None,) * (Narg_ - Narg)
            else:
                args = (None,) * Narg_

            args_i = [i for i, v in enumerate(args) if v is None]
            self._fargs = (list(args+theta), args_i)
            """Arguments which should get some fixed value"""


        def nnlf(self, theta, x):
            # - sum (log pdf(x, theta), axis=0)
            #   where theta are the parameters (including loc and scale)
            #
            fargs, fargs_i = self._fargs
            try:
                i = -1
                if fargs[-1] is not None:
                    scale = fargs[-1]
                else:
                    scale = theta[i]
                    i -= 1

                if fargs[-2] is not None:
                    loc = fargs[-2]
                else:
                    loc = theta[i]
                    i -= 1

                args = theta[:i+1]
                # adjust args if they were fixed
                for i, a in zip(fargs_i, args):
                    fargs[i] = a
                args = fargs[:-2]

            except IndexError:
                raise ValueError, "Not enough input arguments."
            if not self._argcheck(*args) or scale <= 0:
                return N.inf
            x = N.asarray((x-loc) / scale)
            cond0 = (x <= self.a) | (x >= self.b)
            if (N.any(cond0)):
                return N.inf
            else:
                return self._nnlf(x, *args) + len(x)*N.log(scale)

        def fit(self, data, *args, **kwds):
            loc0, scale0 = map(kwds.get, ['loc', 'scale'], [0.0, 1.0])
            fargs, fargs_i = self._fargs
            Narg = len(args)
            Narg_ = self.numargs
            if Narg != Narg_:
                if Narg > Narg_:
                    raise ValueError, "Too many input arguments."
                else:
                    args += (1.0,)*(self.numargs-Narg)

            # Provide only those args which are not fixed, and
            # append location and scale (if not fixed) at the end
            if len(fargs_i) != Narg_:
                x0 = tuple([args[i] for i in fargs_i])
            else:
                x0 = args
            if fargs[-2] is None: x0 = x0 + (loc0,)
            if fargs[-1] is None: x0 = x0 + (scale0,)

            opt_x = scipy.optimize.fmin(self.nnlf, x0, args=(N.ravel(data),), disp=0)

            # reconstruct back
            i = 0
            loc, scale = fargs[-2:]
            if fargs[-1] is None:
                i -= 1
                scale = opt_x[i]
            if fargs[-2] is None:
                i -= 1
                loc = opt_x[i]

            # assign those which weren't fixed
            for i in fargs_i:
                fargs[i] = opt_x[i]

            #raise ValueError
            opt_x = N.hstack((fargs[:-2], (loc, scale)))
            return opt_x


        def __setattr__(self, a, v):
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                self._dist.__setattr__(a, v)
            else:
                object.__setattr__(self, a, v)


        def __getattribute__(self, a):
            """We need to redirect all queries correspondingly
            """
            if not a in ['_dist', '_fargs', 'fit', 'nnlf']:
                return getattr(self._dist, a)
            else:
                return object.__getattribute__(self, a)

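    # A minimal illustrative sketch of rv_semifrozen (hypothetical values): keep
    # the location fixed while the remaining parameters are optimized:
    #
    #   data = N.random.normal(loc=0.0, scale=2.0, size=500)
    #   semifrozen = rv_semifrozen(scipy.stats.norm, loc=0.0)
    #   params = semifrozen.fit(data)   # loc stays 0.0, scale gets estimated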


    def matchDistribution(data, nsamples=None, loc=None, scale=None,
                          args=None, test='kstest', distributions=None,
                          **kwargs):
        """Determine the best matching distribution.

        Can be used for 'smelling' the data, as well as to choose a
        parametric distribution for data obtained from non-parametric
        testing (e.g. `MCNullDist`).

        WiP: use with caution, API might change

        :Parameters:
          data : N.ndarray
            Array of the data for which to deduce the distribution. It has
            to be sufficiently large to make a reliable conclusion
          nsamples : int or None
            If None -- use all samples in data to estimate the parametric
            distribution. Otherwise use only the specified number, randomly
            selected from data.
          loc : float or None
            Loc for the distribution (if known)
          scale : float or None
            Scale for the distribution (if known)
          test : basestring
            What kind of testing to do. Choices:
             'p-roc' : detection power for a given ROC. Needs two
               parameters: `p=0.05` and `tail='both'`
             'kstest' : 'full-body' distribution comparison. The best
               choice is made by minimal reported distance after estimating
               parameters of the distribution. Parameter `p=0.05` sets the
               threshold to reject the null-hypothesis that the distribution
               is the same.
               WARNING: older versions (e.g. 0.5.2 in etch) of scipy have
                 an incorrect kstest implementation and do not function
                 properly
          distributions : None or list of basestring or tuple(basestring, dict)
            Distributions to check. If None, all known in scipy.stats
            are tested. If a distribution is specified as a tuple, then
            it must contain the name and additional parameters (name, loc,
            scale, args) in the dictionary. Entry 'scipy' adds all known
            in scipy.stats.
          **kwargs
            Additional arguments which are needed for each particular test
            (see above)

        :Example:
          data = N.random.normal(size=(1000,1));
          matches = matchDistribution(
            data,
            distributions=['rdist',
                           ('rdist', {'name':'rdist_fixed',
                                      'loc': 0.0,
                                      'args': (10,)})],
            nsamples=30, test='p-roc', p=0.05)
        """

        # Handle parameters
        _KNOWN_TESTS = ['p-roc', 'kstest']
        if not test in _KNOWN_TESTS:
            raise ValueError, 'Unknown kind of test %s. Known are %s' \
                  % (test, _KNOWN_TESTS)

        data = N.ravel(data)
        # data sampled
        if nsamples is not None:
            if __debug__:
                debug('STAT', 'Sampling %d samples from data for the ' \
                      'estimation of the distributions parameters' % nsamples)
            indexes_selected = (N.random.sample(nsamples)*len(data)).astype(int)
            data_selected = data[indexes_selected]
        else:
            indexes_selected = N.arange(len(data))
            data_selected = data

        p_thr = kwargs.get('p', 0.05)
        if test == 'p-roc':
            tail = kwargs.get('tail', 'both')
            data_p = _pvalue(data, Nonparametric(data).cdf, tail)
            data_p_thr = N.abs(data_p) <= p_thr
            true_positives = N.sum(data_p_thr)
            if true_positives == 0:
                raise ValueError, "Provided data has no elements in non-" \
                      "parametric distribution under p<=%.3f. Please " \
                      "increase the size of data or the value of p" % p_thr
            if __debug__:
                debug('STAT_', 'Number of positives in non-parametric '
                      'distribution is %d' % true_positives)

        if distributions is None:
            distributions = ['scipy']

        # lets see if 'scipy' entry was in there
        try:
            scipy_ind = distributions.index('scipy')
            distributions.pop(scipy_ind)
            distributions += scipy.stats.distributions.__all__
        except ValueError:
            pass

        results = []
        for d in distributions:
            dist_gen, loc_, scale_, args_ = None, loc, scale, args
            if isinstance(d, basestring):
                dist_gen = d
                dist_name = d
            elif isinstance(d, tuple):
                if not (len(d) == 2 and isinstance(d[1], dict)):
                    raise ValueError,\
                          "Tuple specification of distribution must be " \
                          "(d, {params}). Got %s" % (d,)
                dist_gen = d[0]
                loc_ = d[1].get('loc', loc)
                scale_ = d[1].get('scale', scale)
                args_ = d[1].get('args', args)
                dist_name = d[1].get('name', str(dist_gen))
            else:
                dist_gen = d
                dist_name = str(d)

            # perform actions which might puke for some distributions
            try:
                dist_gen_ = getattr(scipy.stats, dist_gen)
                # specify distribution 'optimizer'. If loc or scale was provided,
                # use home-brewed rv_semifrozen
                if args_ is not None or loc_ is not None or scale_ is not None:
                    dist_opt = rv_semifrozen(dist_gen_, loc=loc_, scale=scale_, args=args_)
                else:
                    dist_opt = dist_gen_
                dist_params = dist_opt.fit(data_selected)
                if __debug__:
                    debug('STAT__',
                          'Got distribution parameters %s for %s'
                          % (dist_params, dist_name))
                if test == 'p-roc':
                    cdf_func = lambda x: dist_gen_.cdf(x, *dist_params)
                    # We need to compare detection under given p
                    cdf_p = N.abs(_pvalue(data, cdf_func, tail, name=dist_gen))
                    cdf_p_thr = cdf_p <= p_thr
                    D, p = N.sum(N.abs(data_p_thr - cdf_p_thr))*1.0/true_positives, 1
                    if __debug__: res_sum = 'D=%.2f' % D
                elif test == 'kstest':
                    D, p = kstest(data, d, args=dist_params)
                    if __debug__: res_sum = 'D=%.3f p=%.3f' % (D, p)
            except (TypeError, ValueError, AttributeError,
                    NotImplementedError), e: #Exception, e:
                if __debug__:
                    debug('STAT__',
                          'Testing for %s distribution failed due to %s'
                          % (d, str(e)))
                continue

            if p > p_thr and not N.isnan(D):
                results += [ (D, dist_gen, dist_name, dist_params) ]
                if __debug__:
                    debug('STAT_', 'Testing for %s dist.: %s' % (dist_name, res_sum))
            else:
                if __debug__:
                    debug('STAT__', 'Cannot consider %s dist. with %s'
                          % (d, res_sum))
                continue

        # sort in ascending order, so smaller is better
        results.sort()

        if __debug__ and 'STAT' in debug.active:
            # find the best and report it
            nresults = len(results)
            sresult = lambda r: '%s(%s)=%.2f' % (r[1], ', '.join(map(str, r[3])), r[0])
            if nresults > 0:
                nnextbest = min(2, nresults-1)
                snextbest = ', '.join(map(sresult, results[1:1+nnextbest]))
                debug('STAT', 'Best distribution %s. Next best: %s'
                      % (sresult(results[0]), snextbest))
            else:
                debug('STAT', 'Could not find a suitable distribution')

        # return all the results
        return results
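
    # Note on usage (a hypothetical sketch): matchDistribution returns a list of
    # (D, dist_gen, dist_name, dist_params) tuples sorted by ascending distance D,
    # so the best candidate, if any, is the first element:
    #
    #   matches = matchDistribution(N.random.normal(size=1000), test='kstest')
    #   if matches:
    #       D, dist_gen, dist_name, dist_params = matches[0]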


if externals.exists('pylab'):
    import pylab as P

    def plotDistributionMatches(data, matches, nbins=31, nbest=5,
                                expand_tails=8, legend=2, plot_cdf=True,
                                p=None, tail='both'):
        """Plot best matching distributions

        :Parameters:
          data : N.ndarray
            Data which was used to obtain the matches
          matches : list of tuples
            Sorted matches as provided by matchDistribution
          nbins : int
            Number of bins in the histogram
          nbest : int
            Number of top matches to plot
          expand_tails : int
            How many bins away to add to the plots of the parametrized
            distributions
          legend : int
            Whether to provide a legend and statistics in the legend.
            1 -- just lists distributions.
            2 -- adds distance measure
            3 -- tp/fp/fn in the case that p is provided
          plot_cdf : bool
            Whether to plot the cdf for data using the non-parametric
            distribution
          p : float or None
            If not None, visualize null-hypothesis testing (given p).
            Bars in the histogram which fall under the given p are colored
            in red. False positives and false negatives are marked as
            triangle up and down symbols respectively
          tail : ('left', 'right', 'any', 'both')
            If p is not None, the choice of tail for null-hypothesis
            testing

        :Returns: tuple(histogram, list of lines)
        """

        hist = P.hist(data, nbins, normed=1, align='center')
        data_range = [N.min(data), N.max(data)]

        # x's
        x = hist[1]
        dx = x[expand_tails] - x[0] # how much to expand tails by
        x = N.hstack((x[:expand_tails] - dx, x, x[-expand_tails:] + dx))

        nonparam = Nonparametric(data)
        # plot cdf
        if plot_cdf:
            P.plot(x, nonparam.cdf(x), 'k--', linewidth=1)

        p_thr = p

        data_p = _pvalue(data, nonparam.cdf, tail)
        data_p_thr = (data_p <= p_thr).ravel()

        x_p = _pvalue(x, Nonparametric(data).cdf, tail)
        x_p_thr = N.abs(x_p) <= p_thr
        # color bars which pass thresholding in red
        for thr, bar in zip(x_p_thr[expand_tails:], hist[2]):
            bar.set_facecolor(('w', 'r')[int(thr)])

        if not len(matches):
            # no matches were provided
            warning("No matching distributions were provided -- nothing to plot")
            return (hist, )

        lines = []
        labels = []
        for i in xrange(min(nbest, len(matches))):
            D, dist_gen, dist_name, params = matches[i]
            dist = getattr(scipy.stats, dist_gen)(*params)

            label = '%s' % (dist_name)
            if legend > 1: label += '(D=%.2f)' % (D)

            xcdf_p = N.abs(_pvalue(x, dist.cdf, tail))
            xcdf_p_thr = (xcdf_p <= p_thr).ravel()

            if p is not None and legend > 2:
                # We need to compare detection under given p
                data_cdf_p = N.abs(_pvalue(data, dist.cdf, tail))
                data_cdf_p_thr = (data_cdf_p <= p_thr).ravel()

                # true positives
                tp = N.logical_and(data_cdf_p_thr, data_p_thr)
                # false positives
                fp = N.logical_and(data_cdf_p_thr, ~data_p_thr)
                # false negatives
                fn = N.logical_and(~data_cdf_p_thr, data_p_thr)

                label += ' tp/fp/fn=%d/%d/%d' % \
                         tuple(map(N.sum, [tp, fp, fn]))

            pdf = dist.pdf(x)
            line = P.plot(x, pdf, '-', linewidth=2, label=label)
            color = line[0].get_color()

            if plot_cdf:
                cdf = dist.cdf(x)
                P.plot(x, cdf, ':', linewidth=1, color=color, label=label)

            # TODO: decide on tp/fp/fn not by the centers of the bins but
            #       by the values in data in the ranges covered by
            #       those bins. Then it would correspond to the values
            #       mentioned in the legend
            if p is not None:
                # true positives
                xtp = N.logical_and(xcdf_p_thr, x_p_thr)
                # false positives
                xfp = N.logical_and(xcdf_p_thr, ~x_p_thr)
                # false negatives
                xfn = N.logical_and(~xcdf_p_thr, x_p_thr)

                # no need to plot tp explicitly -- marked by the color of the bar
                # P.plot(x[xtp], pdf[xtp], 'o', color=color)
                P.plot(x[xfp], pdf[xfp], '^', color=color)
                P.plot(x[xfn], pdf[xfn], 'v', color=color)

            lines.append(line)
            labels.append(label)

        if legend:
            P.legend(lines, labels)

        return (hist, lines)

#if True:
#    data = N.random.normal(size=(1000,1));
#    matches = matchDistribution(
#        data,
#        distributions=['scipy',
#                       ('norm', {'name':'norm_known',
#                                 'scale': 1.0,
#                                 'loc': 0.0})],
#        nsamples=30, test='p-roc', p=0.05)
#    P.figure(); plotDistributionMatches(data, matches, nbins=101,
#                                        p=0.05, legend=4, nbest=5)


def autoNullDist(dist):
    """Cheater for human beings -- wraps `dist` if needed with some
    NullDist

    tail and other arguments are assumed to be default as in
    NullDist/MCNullDist
    """
    if dist is None or isinstance(dist, NullDist):
        return dist
    elif hasattr(dist, 'fit'):
        if __debug__:
            debug('STAT', 'Wrapping %s into MCNullDist' % dist)
        return MCNullDist(dist)
    else:
        if __debug__:
            debug('STAT', 'Wrapping %s into FixedNullDist' % dist)
        return FixedNullDist(dist)

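# A minimal illustrative sketch of autoNullDist (hypothetical values): the
# wrapper is chosen by whether `dist` can be (re)fitted:
#
#   from scipy import stats
#   autoNullDist(stats.norm(0, 1))   # frozen dist without fit() -> FixedNullDist
#   autoNullDist(Nonparametric)      # provides fit()            -> MCNullDist
#   autoNullDist(None)               # passed through unchanged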

# if no scipy, we need nanmean
def _chk_asarray(a, axis):
    if axis is None:
        a = N.ravel(a)
        outaxis = 0
    else:
        a = N.asarray(a)
        outaxis = axis
    return a, outaxis

def nanmean(x, axis=0):
    """Compute the mean over the given axis ignoring nans.

    :Parameters:
      x : ndarray
        input array
      axis : int
        axis along which the mean is computed.

    :Results:
      m : float
        the mean."""
    x, axis = _chk_asarray(x, axis)
    x = x.copy()
    Norig = x.shape[axis]
    factor = 1.0 - N.sum(N.isnan(x), axis)*1.0/Norig

    x[N.isnan(x)] = 0
    return N.mean(x, axis)/factor
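# A minimal illustrative sketch of nanmean (hypothetical values): NaNs are zeroed
# out and the plain mean is rescaled by the fraction of non-NaN entries:
#
#   a = N.array([1.0, 2.0, N.nan, 3.0])
#   nanmean(a, axis=0)   # -> 2.0, whereas N.mean(a) would be nan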