[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]

sampling.hxx
1 /************************************************************************/
2 /* */
3 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */
4 /* */
5 /* This file is part of the VIGRA computer vision library. */
6 /* The VIGRA Website is */
7 /* http://hci.iwr.uni-heidelberg.de/vigra/ */
8 /* Please direct questions, bug reports, and contributions to */
9 /* ullrich.koethe@iwr.uni-heidelberg.de or */
10 /* vigra@informatik.uni-hamburg.de */
11 /* */
12 /* Permission is hereby granted, free of charge, to any person */
13 /* obtaining a copy of this software and associated documentation */
14 /* files (the "Software"), to deal in the Software without */
15 /* restriction, including without limitation the rights to use, */
16 /* copy, modify, merge, publish, distribute, sublicense, and/or */
17 /* sell copies of the Software, and to permit persons to whom the */
18 /* Software is furnished to do so, subject to the following */
19 /* conditions: */
20 /* */
21 /* The above copyright notice and this permission notice shall be */
22 /* included in all copies or substantial portions of the */
23 /* Software. */
24 /* */
25 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32 /* OTHER DEALINGS IN THE SOFTWARE. */
33 /* */
34 /************************************************************************/
35 
36 #ifndef VIGRA_SAMPLING_HXX
37 #define VIGRA_SAMPLING_HXX
38 
39 #include "array_vector.hxx"
40 #include "random.hxx"
41 #include <map>
42 #include <memory>
43 #include <cmath>
44 
45 namespace vigra
46 {
47 
48 /** \addtogroup MachineLearning Machine Learning
49 **/
50 //@{
51 
52 
53 /**\brief Options object for the Sampler class.
54 
55  <b>usage:</b>
56 
57  \code
58  SamplerOptions opt = SamplerOptions()
59  .withReplacement()
60  .sampleProportion(0.5);
61  \endcode
62 
63  Note that the return value of all methods is <tt>*this</tt> which makes
64  concatenating of options as above possible.
65 */
67 {
68  public:
69 
70  double sample_proportion;
71  unsigned int sample_size;
72  bool sample_with_replacement;
73  bool stratified_sampling;
74 
76  : sample_proportion(1.0),
77  sample_size(0),
78  sample_with_replacement(true),
79  stratified_sampling(false)
80  {}
81 
82  /**\brief Sample from training population with replacement.
83  *
84  * <br> Default: true
85  */
87  {
88  sample_with_replacement = in;
89  return *this;
90  }
91 
92  /**\brief Sample from training population without replacement.
93  *
94  * <br> Default (if you don't call this function): false
95  */
97  {
98  sample_with_replacement = !in;
99  return *this;
100  }
101 
102  /**\brief Draw the given number of samples.
103  * If stratifiedSampling is true, the \a size is equally distributed
104  * across all strata (e.g. <tt>size / strataCount</tt> samples are taken
105  * from each stratum, subject to rounding).
106  *
107  * <br> Default: 0 (i.e. determine the count by means of sampleProportion())
108  */
109  SamplerOptions& sampleSize(unsigned int size)
110  {
111  sample_size = size;
112  return *this;
113  }
114 
115 
116  /**\brief Determine the number of samples to draw as a proportion of the total
117  * number. That is, we draw <tt>count = totalCount * proportion</tt> samples.
118  * This option is overridden when an absolute count is specified by sampleSize().
119  *
120  * If stratifiedSampling is true, the count is equally distributed
121  * across all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken
122  * from each stratum).
123  *
124  * <br> Default: 1.0
125  */
126  SamplerOptions& sampleProportion(double proportion)
127  {
128  vigra_precondition(proportion >= 0.0,
129  "SamplerOptions::sampleProportion(): argument must not be negative.");
130  sample_proportion = proportion;
131  return *this;
132  }
133 
134  /**\brief Draw equally many samples from each "stratum".
135  * A stratum is a group of like entities, e.g. pixels belonging
136  * to the same object class. This is useful to create balanced samples
137  * when the class probabilities are very unbalanced (e.g.
138  * when there are many background and few foreground pixels).
139  * Stratified sampling thus avoids that a trained classifier is biased
140  * towards the majority class.
141  *
142  * <br> Default (if you don't call this function): false
143  */
144  SamplerOptions& stratified(bool in = true)
145  {
146  stratified_sampling = in;
147  return *this;
148  }
149 };
150 
151 /************************************************************/
152 /* */
153 /* Sampler */
154 /* */
155 /************************************************************/
156 
157 /** \brief Create random samples from a sequence of indices.
158 
159  Selecting data items at random is a basic task of machine learning,
160  for example in boostrapping, RandomForest training, and cross validation.
161  This class implements various ways to select random samples via their indices.
162  Indices are assumed to be consecutive in
163  the range <tt>0 &lt;= index &lt; total_sample_count</tt>.
164 
165  The class always contains a current sample which can be accessed by
166  the index operator or by the function sampledIndices(). The indices
167  that are not in the current sample (out-of-bag indices) can be accessed
168  via the function oobIndices().
169 
170  The sampling method (with/without replacement, stratified or not) and the
171  number of samples to draw are determined by the option object
172  SamplerOptions.
173 
174  <b>Usage:</b>
175 
176  <b>\#include</b> <vigra/sampling.hxx><br>
177  Namespace: vigra
178 
179  Create a Sampler with default options, i.e. sample as many indices as there
180  are data elements, with replacement. On average, the sample will contain
181  <tt>0.63*totalCount</tt> distinct indices.
182 
183  \code
184  int totalCount = 10000; // total number of data elements
185  int numberOfSamples = 20; // repeat experiment 20 times
186  Sampler<> sampler(totalCount);
187  for(int k=0; k<numberOfSamples; ++k)
188  {
189  // process current sample
190  for(int i=0; i<sampler.sampleSize(); ++i)
191  {
192  int currentIndex = sampler[i];
193  processData(data[currentIndex]);
194  }
195  // create next sample
196  sampler.sample();
197  }
198  \endcode
199 
200  Create a Sampler for stratified sampling, without replacement.
201 
202  \code
203  // prepare the strata (i.e. specify which stratum each element belongs to)
204  int stratumSize1 = 2000, stratumSize2 = 8000,
205  totalCount = stratumSize1 + stratumSize2;
206  ArrayVerctor<int> strata(totalCount);
207  for(int i=0; i<stratumSize1; ++i)
208  strata[i] = 1;
209  for(int i=stratumSize1; i<stratumSize2; ++i)
210  strata[i] = 2;
211 
212  int sampleSize = 200; // i.e. sample 100 elements from each of the two strata
213  int numberOfSamples = 20; // repeat experiment 20 times
214  Sampler<> stratifiedSampler(strata.begin(), strata.end(),
215  SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize));
216  // create first sample
217  sampler.sample();
218 
219  for(int k=0; k<numberOfSamples; ++k)
220  {
221  // process current sample
222  for(int i=0; i<sampler.sampleSize(); ++i)
223  {
224  int currentIndex = sampler[i];
225  processData(data[currentIndex]);
226  }
227  // create next sample
228  sampler.sample();
229  }
230  \endcode
231 */
232 template<class Random = MersenneTwister >
233 class Sampler
234 {
235  public:
236  /** Internal type of the indices.
237  Currently, 64-bit indices are not supported because this
238  requires extension of the random number generator classes.
239  */
240  typedef Int32 IndexType;
241 
243 
244  /** Type of the array view object that is returned by
245  sampledIndices() and oobIndices().
246  */
248 
249  private:
250  typedef std::map<IndexType, IndexArrayType> StrataIndicesType;
251  typedef std::map<IndexType, int> StrataSizesType;
254 
255  static const int oobInvalid = -1;
256 
257  int total_count_, sample_size_;
258  mutable int current_oob_count_;
259  StrataIndicesType strata_indices_;
260  StrataSizesType strata_sample_size_;
261  IndexArrayType current_sample_;
262  mutable IndexArrayType current_oob_sample_;
263  IsUsedArrayType is_used_;
264  Random const & random_;
265  SamplerOptions options_;
266 
267  void initStrataCount()
268  {
269  // compute how many samples to take from each stratum
270  // (may be unequal if sample_size_ is not a multiple of strataCount())
271  int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount());
272  int strata_total_count = strata_sample_size * strataCount();
273 
274  for(StrataIndicesType::iterator i = strata_indices_.begin();
275  i != strata_indices_.end(); ++i)
276  {
277  if(strata_total_count > sample_size_)
278  {
279  strata_sample_size_[i->first] = strata_sample_size - 1;
280  --strata_total_count;
281  }
282  else
283  {
284  strata_sample_size_[i->first] = strata_sample_size;
285  }
286  }
287  }
288 
289  public:
290 
291  /** Create a sampler for \a totalCount data objects.
292 
293  In each invocation of <tt>sample()</tt> below, it will sample
294  indices according to the options passed. If no options are given,
295  <tt>totalCount</tt> indices will be drawn with replacement.
296  */
298  Random const & rnd = Random(RandomSeed))
299  : total_count_(totalCount),
300  sample_size_(opt.sample_size == 0
301  ? (int)(std::ceil(total_count_ * opt.sample_proportion))
302  : opt.sample_size),
303  current_oob_count_(oobInvalid),
304  current_sample_(sample_size_),
305  current_oob_sample_(total_count_),
306  is_used_(total_count_),
307  random_(rnd),
308  options_(opt)
309  {
310  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
311  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
312 
313  vigra_precondition(!opt.stratified_sampling,
314  "Sampler(): Stratified sampling requested, but no strata given.");
315 
316  // initialize a single stratum containing all data
317  strata_indices_[0].resize(total_count_);
318  for(int i=0; i<total_count_; ++i)
319  strata_indices_[0][i] = i;
320 
321  initStrataCount();
322  //this is screwing up the random forest tests.
323  //sample();
324  }
325 
326  /** Create a sampler for stratified sampling.
327 
328  <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence
329  which specifies for each sample the stratum it belongs to. The
330  total number of data objects will be set to <tt>strataEnd - strataBegin</tt>.
331  Equally many samples (subject to rounding) will be drawn from each stratum,
332  unless the option object explicitly requests unstratified sampling,
333  in which case the strata are ignored.
334  */
335  template <class Iterator>
336  Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(),
337  Random const & rnd = Random(RandomSeed))
338  : total_count_(strataEnd - strataBegin),
339  sample_size_(opt.sample_size == 0
340  ? (int)(std::ceil(total_count_ * opt.sample_proportion))
341  : opt.sample_size),
342  current_oob_count_(oobInvalid),
343  current_sample_(sample_size_),
344  current_oob_sample_(total_count_),
345  is_used_(total_count_),
346  random_(rnd),
347  options_(opt)
348  {
349  vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_,
350  "Sampler(): Cannot draw without replacement when data size is smaller than sample count.");
351 
352  // copy the strata indices
353  if(opt.stratified_sampling)
354  {
355  for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin)
356  {
357  strata_indices_[*strataBegin].push_back(i);
358  }
359  }
360  else
361  {
362  strata_indices_[0].resize(total_count_);
363  for(int i=0; i<total_count_; ++i)
364  strata_indices_[0][i] = i;
365  }
366 
367  vigra_precondition(sample_size_ >= (int)strata_indices_.size(),
368  "Sampler(): Requested sample count must be at least as large as the number of strata.");
369 
370  initStrataCount();
371  //this is screwing up the random forest tests.
372  //sample();
373  }
374 
375  /** Return the k-th index in the current sample.
376  */
377  IndexType operator[](int k) const
378  {
379  return current_sample_[k];
380  }
381 
382  /** Create a new sample.
383  */
384  void sample();
385 
386  /** The total number of data elements.
387  */
388  int totalCount() const
389  {
390  return total_count_;
391  }
392 
393  /** The number of data elements that have been sampled.
394  */
395  int sampleSize() const
396  {
397  return sample_size_;
398  }
399 
400  /** Same as sampleSize().
401  */
402  int size() const
403  {
404  return sample_size_;
405  }
406 
407  /** The number of strata to be used.
408  Will be 1 if no strata are given. Will be ignored when
409  stratifiedSampling() is false.
410  */
411  int strataCount() const
412  {
413  return strata_indices_.size();
414  }
415 
416  /** Whether to use stratified sampling.
417  (If this is false, strata will be ignored even if present.)
418  */
419  bool stratifiedSampling() const
420  {
421  return options_.stratified_sampling;
422  }
423 
424  /** Whether sampling should be performed with replacement.
425  */
426  bool withReplacement() const
427  {
428  return options_.sample_with_replacement;
429  }
430 
431  /** Return an array view containing the indices in the current sample.
432  */
434  {
435  return current_sample_;
436  }
437 
438  /** Return an array view containing the out-of-bag indices.
439  (i.e. the indices that are not in the current sample)
440  */
442  {
443  if(current_oob_count_ == oobInvalid)
444  {
445  current_oob_count_ = 0;
446  for(int i = 0; i<total_count_; ++i)
447  {
448  if(!is_used_[i])
449  {
450  current_oob_sample_[current_oob_count_] = i;
451  ++current_oob_count_;
452  }
453  }
454  }
455  return current_oob_sample_.subarray(0, current_oob_count_);
456  }
457  IsUsedArrayType const & is_used() const
458  {
459  return is_used_;
460  }
461 };
462 
463 
464 template<class Random>
466 {
467  current_oob_count_ = oobInvalid;
468  is_used_.init(false);
469 
470  if(options_.sample_with_replacement)
471  {
472  //Go thru all strata
473  int j = 0;
474  StrataIndicesType::iterator iter;
475  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
476  {
477  // do sampling with replacement in each strata and copy data.
478  int stratum_size = iter->second.size();
479  for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
480  {
481  current_sample_[j] = iter->second[random_.uniformInt(stratum_size)];
482  is_used_[current_sample_[j]] = true;
483  }
484  }
485  }
486  else
487  {
488  //Go thru all strata
489  int j = 0;
490  StrataIndicesType::iterator iter;
491  for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter)
492  {
493  // do sampling without replacement in each strata and copy data.
494  int stratum_size = iter->second.size();
495  for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j)
496  {
497  std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]);
498  current_sample_[j] = iter->second[i];
499  is_used_[current_sample_[j]] = true;
500  }
501  }
502  }
503 }
504 
505 template<class Random =RandomTT800 >
506 class PoissonSampler
507 {
508 public:
509  Random randfloat;
510  typedef Int32 IndexType;
511  typedef vigra::ArrayVector <IndexType> IndexArrayType;
512  IndexArrayType used_indices_;
513  double lambda;
514  int minIndex;
515  int maxIndex;
516 
517  PoissonSampler(double lambda,IndexType minIndex,IndexType maxIndex)
518  : lambda(lambda),
519  minIndex(minIndex),
520  maxIndex(maxIndex)
521  {}
522 
523  void sample( )
524  {
525  used_indices_.clear();
526  IndexType i;
527  for(i=minIndex;i<maxIndex;++i)
528  {
529  //from http://en.wikipedia.org/wiki/Poisson_distribution
530  int k=0;
531  double p=1;
532  double L=exp(-lambda);
533  do
534  {
535  ++k;
536  p*=randfloat.uniform53();
537 
538  }while(p>L);
539  --k;
540  //Insert i this many time
541  while(k>0)
542  {
543  used_indices_.push_back(i);
544  --k;
545  }
546  }
547  }
548 
549  IndexType const & operator[](int in) const
550  {
551  return used_indices_[in];
552  }
553 
554  int numOfSamples() const
555  {
556  return used_indices_.size();
557  }
558 };
559 
560 //@}
561 
562 } // namespace vigra
563 
564 #endif /*VIGRA_SAMPLING_HXX*/

© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de)
Heidelberg Collaboratory for Image Processing, University of Heidelberg, Germany

html generated using doxygen and Python
vigra 1.9.0 (Wed Feb 27 2013)