[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

sampling.hxx
1 /************************************************************************/
2 /* */
3 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */
4 /* */
5 /* This file is part of the VIGRA computer vision library. */
6 /* The VIGRA Website is */
7 /* http://hci.iwr.uni-heidelberg.de/vigra/ */
8 /* Please direct questions, bug reports, and contributions to */
9 /* ullrich.koethe@iwr.uni-heidelberg.de or */
10 /* vigra@informatik.uni-hamburg.de */
11 /* */
12 /* Permission is hereby granted, free of charge, to any person */
13 /* obtaining a copy of this software and associated documentation */
14 /* files (the "Software"), to deal in the Software without */
15 /* restriction, including without limitation the rights to use, */
16 /* copy, modify, merge, publish, distribute, sublicense, and/or */
17 /* sell copies of the Software, and to permit persons to whom the */
18 /* Software is furnished to do so, subject to the following */
19 /* conditions: */
20 /* */
21 /* The above copyright notice and this permission notice shall be */
22 /* included in all copies or substantial portions of the */
23 /* Software. */
24 /* */
25 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32 /* OTHER DEALINGS IN THE SOFTWARE. */
33 /* */
34 /************************************************************************/
35 
36 #ifndef VIGRA_SAMPLING_HXX
37 #define VIGRA_SAMPLING_HXX
38 
39 #include "array_vector.hxx"
40 #include "random.hxx"
41 #include <map>
42 #include <memory>
43 #include <cmath>
44 
45 namespace vigra
46 {
47 
48 /** \addtogroup MachineLearning Machine Learning
49 **/
50 //@{
51 
52 
53 /**\brief Options object for the Sampler class.
54 
55  <b>usage:</b>
56 
57  \code
58  SamplerOptions opt = SamplerOptions()
59  .withReplacement()
60  .sampleProportion(0.5);
61  \endcode
62 
63  Note that the return value of all methods is <tt>*this</tt> which makes
64  concatenating of options as above possible.
65 */
67 {
68  public:
69 
70  double sample_proportion;
71  unsigned int sample_size;
72  bool sample_with_replacement;
73  bool stratified_sampling;
74 
76  : sample_proportion(1.0),
77  sample_size(0),
78  sample_with_replacement(true),
79  stratified_sampling(false)
80  {}
81 
82  /**\brief Sample from training population with replacement.
83  *
84  * <br> Default: true
85  */
87  {
88  sample_with_replacement = in;
89  return *this;
90  }
91 
92  /**\brief Sample from training population without replacement.
93  *
94  * <br> Default (if you don't call this function): false
95  */
97  {
98  sample_with_replacement = !in;
99  return *this;
100  }
101 
102  /**\brief Draw the given number of samples.
103  * If stratifiedSampling is true, the \a size is equally distributed
104  * across all strata (e.g. <tt>size / strataCount</tt> samples are taken
105  * from each stratum, subject to rounding).
106  *
107  * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
108  */
109  SamplerOptions& sampleSize(unsigned int size)
110  {
111  sample_size = size;
112  return *this;
113  }
114 
115 
116  /**\brief Determine the number of samples to draw as a proportion of the total
117  * number. That is, we draw <tt>count = totalCount * proportion</tt> samples.
118  * This option is overridden when an absolute count is specified by sampleSize().
119  *
120  * If stratifiedSampling is true, the count is equally distributed
121  * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken
122  * from each stratum).
123  *
124  * <br> Default: 1.0
125  */
126  SamplerOptions& sampleProportion(double proportion)
127  {
128  vigra_precondition(proportion >= 0.0,
129  "SamplerOptions::sampleProportion(): argument must not be negative.");
130  sample_proportion = proportion;
131  return *this;
132  }
133 
134  /**\brief Draw equally many samples from each "stratum".
135  * A stratum is a group of like entities, e.g. pixels belonging
136  * to the same object class. This is useful to create balanced samples
137  * when the class probabilities are very unbalanced (e.g.
138  * when there are many background and few foreground pixels).
139  * Stratified sampling thus avoids that a trained classifier is biased
140  * towards the majority class.
141  *
142  * <br> Default (if you don't call this function): false
143  */
144  SamplerOptions& stratified(bool in = true)
145  {
146  stratified_sampling = in;
147  return *this;
148  }
149 };
150 
151 /************************************************************/
152 /* */
153 /* Sampler */
154 /* */
155 /************************************************************/
156 
157 /** \brief Create random samples from a sequence of indices.
158 
159  Selecting data items at random is a basic task of machine learning,
160  for example in boostrapping, RandomForest training, and cross validation.
161  This class implements various ways to select random samples via their indices.
162  Indices are assumed to be consecutive in
163  the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
164 
165  The class always contains a current sample which can be accessed by
166  the index operator or by the function sampledIndices(). The indices
167  that are not in the current sample (out-of-bag indices) can be accessed
168  via the function oobIndices().
169 
170  The sampling method (with/without replacement, stratified or not) and the
171  number of samples to draw are determined by the option object
172  SamplerOptions.
173 
174  <b>Usage:</b>
175 
176  <b>\#include</b> <vigra/sampling.hxx><br>
177  Namespace: vigra
178 
179  Create a Sampler with default options, i.e. sample as many indices as there
180  are data elements, with replacement. On average, the sample will contain
181  <tt>0.63*totalCount</tt> distinct indices.
182 
183  \code
184  int totalCount = 10000; // total number of data elements
185  int numberOfSamples = 20; // repeat experiment 20 times
186  Sampler<> sampler(totalCount);
187  for(int k=0; k<numberOfSamples; ++k)
188  {
189  // process current sample
190  for(int i=0; i<sampler.sampleSize(); ++i)
191  {
192  int currentIndex = sampler[i];
193  processData(data[currentIndex]);
194  }
195  // create next sample
196  sampler.sample();
197  }
198  \endcode
199 
200  Create a Sampler for stratified sampling, without replacement.
201 
202  \code
203  // prepare the strata (i.e. specify which stratum each element belongs to)
204  int stratumSize1 = 2000, stratumSize2 = 8000,
205  totalCount = stratumSize1 + stratumSize2;
206  ArrayVerctor<int> strata(totalCount);
207  for(int i=0; i<stratumSize1; ++i)
208  strata[i] = 1;
209  for(int i=stratumSize1; i<stratumSize2; ++i)
210  strata[i] = 2;
211 
212  int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
213  int numberOfSamples = 20; // repeat experiment 20 times
214  Sampler<> stratifiedSampler(strata.begin(), strata.end(),
215  SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
216  // create first sample
217  sampler.sample();
218 
219  for(int k=0; k<numberOfSamples; ++k)
220  {
221  // process current sample
222  for(int i=0; i<sampler.sampleSize(); ++i)
223  {
224  int currentIndex = sampler[i];
225  processData(data[currentIndex]);
226  }
227  // create next sample
228  sampler.sample();
229  }
230  \endcode
231 */
232 template<class Random = MersenneTwister >
233 class Sampler
234 {
235  public:
236  /** Internal type of the indices.
237  Currently, 64-bit indices are not supported because this
238  requires extension of the random number generator classes.
239  */
240  typedef Int32 IndexType;
241 
243 
244  /** Type of the array view object that is returned by
245  sampledIndices() and oobIndices().
246  */
248 
249  private:
250  typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
251  typedef std::map<IndexType, int> StrataSizesType;
254 
255  static const int oobInvalid = -1;
256 
257  int total_count_, sample_size_;
258  mutable int current_oob_count_;
259  StrataIndicesType strata_indices_;
260  StrataSizesType strata_sample_size_;
261  IndexArrayType current_sample_;
262  mutable IndexArrayType current_oob_sample_;
263  IsUsedArrayType is_used_;
264  Random default_random_;
265  Random const & random_;
266  SamplerOptions options_;
267 
268  void initStrataCount()
269  {
270  // compute how many samples to take from each stratum
271  // (may be unequal if sample_size_ is not a multiple of strataCount())
272  int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount());
273  int strata_total_count = strata_sample_size * strataCount();
274 
275  for(StrataIndicesType::iterator i = strata_indices_.begin();
276  i != strata_indices_.end(); ++i)
277  {
278  if(strata_total_count > sample_size_)
279  {
280  strata_sample_size_[i->first] = strata_sample_size - 1;
281  --strata_total_count;
282  }
283  else
284  {
285  strata_sample_size_[i->first] = strata_sample_size;
286  }
287  }
288  }
289 
290  public:
291 
292  /** Create a sampler for \a totalCount data objects.
293 
294  In each invocation of <tt>sample()</tt> below, it will sample
295  indices according to the options passed. If no options are given,
296  <tt>totalCount</tt> indices will be drawn with replacement.
297  */
299  Random const * rnd = 0)
300  : total_count_(totalCount),
301  sample_size_(opt.sample_size == 0
302  ? (int)(std::ceil(total_count_ * opt.sample_proportion))
303  : opt.sample_size),
304  current_oob_count_(oobInvalid),
305  current_sample_(sample_size_),
306  current_oob_sample_(total_count_),
307  is_used_(total_count_),
308  default_random_(RandomSeed),
309  random_(rnd ? *rnd : default_random_),
310  options_(opt)
311  {
312  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
313  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
314 
315  vigra_precondition(!opt.stratified_sampling,
316  "Sampler(): Stratified sampling requested, but no strata given.");
317 
318  // initialize a single stratum containing all data
319  strata_indices_[0].resize(total_count_);
320  for(int i=0; i<total_count_; ++i)
321  strata_indices_[0][i] = i;
322 
323  initStrataCount();
324  //this is screwing up the random forest tests.
325  //sample();
326  }
327 
328  /** Create a sampler for stratified sampling.
329 
330  <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence
331  which specifies for each sample the stratum it belongs to. The
332  total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
333  Equally many samples (subject to rounding) will be drawn from each stratum,
334  unless the option object explicitly requests unstratified sampling,
335  in which case the strata are ignored.
336  */
337  template <class Iterator>
338  Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(),
339  Random const * rnd = 0)
340  : total_count_(strataEnd - strataBegin),
341  sample_size_(opt.sample_size == 0
342  ? (int)(std::ceil(total_count_ * opt.sample_proportion))
343  : opt.sample_size),
344  current_oob_count_(oobInvalid),
345  current_sample_(sample_size_),
346  current_oob_sample_(total_count_),
347  is_used_(total_count_),
348  default_random_(RandomSeed),
349  random_(rnd ? *rnd : default_random_),
350  options_(opt)
351  {
352  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
353  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
354 
355  // copy the strata indices
356  if(opt.stratified_sampling)
357  {
358  for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
359  {
360  strata_indices_[*strataBegin].push_back(i);
361  }
362  }
363  else
364  {
365  strata_indices_[0].resize(total_count_);
366  for(int i=0; i<total_count_; ++i)
367  strata_indices_[0][i] = i;
368  }
369 
370  vigra_precondition(sample_size_ >= (int)strata_indices_.size(),
371  "Sampler(): Requested sample count must be at least as large as the number of strata.");
372 
373  initStrataCount();
374  //this is screwing up the random forest tests.
375  //sample();
376  }
377 
378  /** Return the k-th index in the current sample.
379  */
380  IndexType operator[](int k) const
381  {
382  return current_sample_[k];
383  }
384 
385  /** Create a new sample.
386  */
387  void sample();
388 
389  /** The total number of data elements.
390  */
391  int totalCount() const
392  {
393  return total_count_;
394  }
395 
396  /** The number of data elements that have been sampled.
397  */
398  int sampleSize() const
399  {
400  return sample_size_;
401  }
402 
403  /** Same as sampleSize().
404  */
405  int size() const
406  {
407  return sample_size_;
408  }
409 
410  /** The number of strata to be used.
411  Will be 1 if no strata are given. Will be ignored when
412  stratifiedSampling() is false.
413  */
414  int strataCount() const
415  {
416  return strata_indices_.size();
417  }
418 
419  /** Whether to use stratified sampling.
420  (If this is false, strata will be ignored even if present.)
421  */
422  bool stratifiedSampling() const
423  {
424  return options_.stratified_sampling;
425  }
426 
427  /** Whether sampling should be performed with replacement.
428  */
429  bool withReplacement() const
430  {
431  return options_.sample_with_replacement;
432  }
433 
434  /** Return an array view containing the indices in the current sample.
435  */
437  {
438  return current_sample_;
439  }
440 
441  /** Return an array view containing the out-of-bag indices.
442  (i.e. the indices that are not in the current sample)
443  */
445  {
446  if(current_oob_count_ == oobInvalid)
447  {
448  current_oob_count_ = 0;
449  for(int i = 0; i<total_count_; ++i)
450  {
451  if(!is_used_[i])
452  {
453  current_oob_sample_[current_oob_count_] = i;
454  ++current_oob_count_;
455  }
456  }
457  }
458  return current_oob_sample_.subarray(0, current_oob_count_);
459  }
460  IsUsedArrayType const & is_used() const
461  {
462  return is_used_;
463  }
464 };
465 
466 
467 template<class Random>
469 {
470  current_oob_count_ = oobInvalid;
471  is_used_.init(false);
472 
473  if(options_.sample_with_replacement)
474  {
475  //Go thru all strata
476  int j = 0;
477  StrataIndicesType::iterator iter;
478  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
479  {
480  // do sampling with replacement in each strata and copy data.
481  int stratum_size = iter->second.size();
482  for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
483  {
484  current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
485  is_used_[current_sample_[j]] = true;
486  }
487  }
488  }
489  else
490  {
491  //Go thru all strata
492  int j = 0;
493  StrataIndicesType::iterator iter;
494  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
495  {
496  // do sampling without replacement in each strata and copy data.
497  int stratum_size = iter->second.size();
498  for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
499  {
500  std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
501  current_sample_[j] = iter->second[i];
502  is_used_[current_sample_[j]] = true;
503  }
504  }
505  }
506 }
507 
508 template<class Random =RandomTT800 >
509 class PoissonSampler
510 {
511 public:
512  Random randfloat;
513  typedef Int32 IndexType;
514  typedef vigra::ArrayVector <IndexType> IndexArrayType;
515  IndexArrayType used_indices_;
516  double lambda;
517  int minIndex;
518  int maxIndex;
519 
520  PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
521  : lambda(lambda),
522  minIndex(minIndex),
523  maxIndex(maxIndex)
524  {}
525 
526  void sample( )
527  {
528  used_indices_.clear();
529  IndexType i;
530  for(i=minIndex;i<maxIndex;++i)
531  {
532  //from http://en.wikipedia.org/wiki/Poisson_distribution
533  int k=0;
534  double p=1;
535  double L=exp(-lambda);
536  do
537  {
538  ++k;
539  p*=randfloat.uniform53();
540 
541  }while(p>L);
542  --k;
543  //Insert i this many time
544  while(k>0)
545  {
546  used_indices_.push_back(i);
547  --k;
548  }
549  }
550  }
551 
552  IndexType const & operator[](int in) const
553  {
554  return used_indices_[in];
555  }
556 
557  int numOfSamples() const
558  {
559  return used_indices_.size();
560  }
561 };
562 
563 //@}
564 
565 } // namespace vigra
566 
567 #endif /*VIGRA_SAMPLING_HXX*/

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.9.0 (Tue Sep 24 2013)