
rf_algorithm.hxx
1 /************************************************************************/
2 /* */
3 /* Copyright 2008-2009 by Rahul Nair */
4 /* */
5 /* This file is part of the VIGRA computer vision library. */
6 /* The VIGRA Website is */
7 /* http://hci.iwr.uni-heidelberg.de/vigra/ */
8 /* Please direct questions, bug reports, and contributions to */
9 /* ullrich.koethe@iwr.uni-heidelberg.de or */
10 /* vigra@informatik.uni-hamburg.de */
11 /* */
12 /* Permission is hereby granted, free of charge, to any person */
13 /* obtaining a copy of this software and associated documentation */
14 /* files (the "Software"), to deal in the Software without */
15 /* restriction, including without limitation the rights to use, */
16 /* copy, modify, merge, publish, distribute, sublicense, and/or */
17 /* sell copies of the Software, and to permit persons to whom the */
18 /* Software is furnished to do so, subject to the following */
19 /* conditions: */
20 /* */
21 /* The above copyright notice and this permission notice shall be */
22 /* included in all copies or substantial portions of the */
23 /* Software. */
24 /* */
25 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */
26 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */
27 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */
28 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */
29 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */
30 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */
31 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */
32 /* OTHER DEALINGS IN THE SOFTWARE. */
33 /* */
34 /************************************************************************/
35 #ifndef VIGRA_RF_ALGORITHM_HXX
#define VIGRA_RF_ALGORITHM_HXX
36 
37 #include <vector>
38 #include "splices.hxx"
39 #include <queue>
40 #include <fstream>
#include <map>        // std::map (used by VariableSelectionResult::init() and get_ranking())
#include <algorithm>  // std::min_element, std::copy, std::random_shuffle
41 namespace vigra
42 {
43 
44 namespace rf
45 {
46 /** This namespace contains all algorithms developed for feature
47  * selection
48  *
49  */
50 namespace algorithms
51 {
52 
53 namespace detail
54 {
55  /** create a MultiArray containing only columns supplied between iterators
56  b and e
57  */
58  template<class OrigMultiArray,
59  class Iter,
60  class DestMultiArray>
61  void choose(OrigMultiArray const & in,
62  Iter const & b,
63  Iter const & e,
64  DestMultiArray & out)
65  {
66  int columnCount = std::distance(b, e);
67  int rowCount = in.shape(0);
68  out.reshape(MultiArrayShape<2>::type(rowCount, columnCount));
69  int ii = 0;
70  for(Iter iter = b; iter != e; ++iter, ++ii)
71  {
72  columnVector(out, ii) = columnVector(in, *iter);
73  }
74  }
75 }
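
/* Usage sketch for detail::choose (illustrative only; "feats" and "subset"
 * are placeholder names, not part of this file):
 *
 *     MultiArray<2, double> feats(MultiArrayShape<2>::type(100, 5));
 *     int cols[] = {0, 2};                   // columns to keep
 *     MultiArray<2, double> subset;
 *     detail::choose(feats, cols, cols + 2, subset);
 *     // subset is now a 100 x 2 matrix holding columns 0 and 2 of feats
 */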
76 
77 
78 
79 /** Standard random forest error-rate callback functor
80  *
81  * returns the random forest error estimate when invoked.
82  */
83 class RFErrorCallback
84 {
85  RandomForestOptions options;
86 
87  public:
88  /** Default constructor
89  *
90  * optionally supply options to the random forest classifier
91  * \sa RandomForestOptions
92  */
93  RFErrorCallback(RandomForestOptions opt = RandomForestOptions())
94  : options(opt)
95  {}
96 
97  /** returns the RF OOB error estimate given features and
98  * labels
99  */
100  template<class Feature_t, class Response_t>
101  double operator() (Feature_t const & features,
102  Response_t const & response)
103  {
104  RandomForest<> rf(options);
105  visitors::OOB_Error oob;
106  rf.learn(features,
107  response,
108  visitors::create_visitor(oob));
109  return oob.oob_breiman;
110  }
111 };
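
/* Usage sketch (illustrative): the callback can also be used on its own to
 * obtain an OOB error estimate for a feature/label matrix pair.
 *
 *     RFErrorCallback err(RandomForestOptions().tree_count(100));
 *     double oob_error = err(features, labels);  // features: n x p, labels: n x 1
 */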
112 
113 
114 /** Structure to hold Variable Selection results
115  */
116 class VariableSelectionResult
117 {
118  bool initialized;
119 
120  public:
121  VariableSelectionResult()
122  : initialized(false)
123  {}
124 
125  typedef std::vector<int> FeatureList_t;
126  typedef std::vector<double> ErrorList_t;
127  typedef FeatureList_t::iterator Pivot_t;
128 
129  Pivot_t pivot;
130 
131  /** list of features.
132  */
133  FeatureList_t selected;
134 
135  /** vector of size (number of features)
136  *
137  * the i-th entry encodes the error rate obtained
138  * while using features [0 .. i] (including i)
139  *
140  * if the i-th entry is -1 then no error rate was obtained
141  * this may happen if more than one feature is added to the
142  * selected list in one step of the algorithm.
143  *
144  * during initialisation the last entry (the error rate with all features) is always filled
145  */
146  ErrorList_t errors;
147 
148 
149  /** error rate when using no features
150  */
151  double no_features;
152 
153  template<class FeatureT,
154  class ResponseT,
155  class Iter,
156  class ErrorRateCallBack>
157  bool init(FeatureT const & all_features,
158  ResponseT const & response,
159  Iter b,
160  Iter e,
161  ErrorRateCallBack errorcallback)
162  {
163  bool ret_ = init(all_features, response, errorcallback);
164  if(!ret_)
165  return false;
166  vigra_precondition(std::distance(b, e) == (std::ptrdiff_t)selected.size(),
167  "Number of features in ranking != number of features in the feature matrix");
168  std::copy(b, e, selected.begin());
169  return true;
170  }
171 
172  template<class FeatureT,
173  class ResponseT,
174  class Iter>
175  bool init(FeatureT const & all_features,
176  ResponseT const & response,
177  Iter b,
178  Iter e)
179  {
180  RFErrorCallback ecallback;
181  return init(all_features, response, b, e, ecallback);
182  }
183 
184 
185  template<class FeatureT,
186  class ResponseT>
187  bool init(FeatureT const & all_features,
188  ResponseT const & response)
189  {
190  return init(all_features, response, RFErrorCallback());
191  }
192  /**initialization routine. Will be called only once in the lifetime
193  * of a VariableSelectionResult. Subsequent calls will not reinitialize
194  * member variables.
195  *
196  * This is intended to allow continuing variable selection from the
197  * point where an earlier iteration stopped.
198  *
199  * returns true if initialization was successful and false if
200  * the object was already initialized before.
201  */
202  template<class FeatureT,
203  class ResponseT,
204  class ErrorRateCallBack>
205  bool init(FeatureT const & all_features,
206  ResponseT const & response,
207  ErrorRateCallBack errorcallback)
208  {
209  if(initialized)
210  {
211  return false;
212  }
213  initialized = true;
214  // calculate error with all features
215  selected.resize(all_features.shape(1), 0);
216  for(unsigned int ii = 0; ii < selected.size(); ++ii)
217  selected[ii] = ii;
218  errors.resize(all_features.shape(1), -1);
219  errors.back() = errorcallback(all_features, response);
220 
221  // calculate error rate if no features are chosen
222  // corresponds to max(prior probability) of the classes
223  std::map<typename ResponseT::value_type, int> res_map;
224  std::vector<int> cts;
225  int counter = 0;
226  for(int ii = 0; ii < response.shape(0); ++ii)
227  {
228  if(res_map.find(response(ii, 0)) == res_map.end())
229  {
230  res_map[response(ii, 0)] = counter;
231  ++counter;
232  cts.push_back(0);
233  }
234  cts[res_map[response(ii,0)]] +=1;
235  }
236  no_features = double(*(std::max_element(cts.begin(),
237  cts.end())))
238  / double(response.shape(0));
239 
240  /*init not_selected vector;
241  not_selected.resize(all_features.shape(1), 0);
242  for(int ii = 0; ii < not_selected.size(); ++ii)
243  {
244  not_selected[ii] = ii;
245  }
246  initialized = true;
247  */
248  pivot = selected.begin();
249  return true;
250  }
251 };
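
/* Usage sketch (illustrative): after init(), errors.back() holds the error
 * rate obtained with all features, and no_features stores the baseline
 * computed from the majority class, i.e. max(prior probability).
 *
 *     VariableSelectionResult res;
 *     res.init(features, labels);        // uses RFErrorCallback by default
 *     double all_features_error = res.errors.back();
 *     double baseline           = res.no_features;
 */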
252 
253 
254 
255 /** Perform forward selection
256  *
257  * \param features IN: n x p matrix containing n instances with p attributes/features
258  * used in the variable selection algorithm
259  * \param response IN: n x 1 matrix containing the corresponding response
260  * \param result IN/OUT: VariableSelectionResult struct which will contain the results
261  * of the algorithm.
262  * Features between result.selected.begin() and result.pivot will
263  * be left untouched.
264  * \sa VariableSelectionResult
265  * \param errorcallback
266  * IN, OPTIONAL:
267  * Functor that returns the error rate given a set of
268  * features and labels. Default is the RandomForest OOB Error.
269  *
270  * Forward selection successively adds the feature that decreases the error rate the most.
271  *
272  * usage:
273  * \code
274  * MultiArray<2, double> features = createSomeFeatures();
275  * MultiArray<2, int> labels = createCorrespondingLabels();
276  * VariableSelectionResult result;
277  * forward_selection(features, labels, result);
278  * \endcode
279  * To use forward selection while ensuring that a specific feature, e.g. feature 5, is
280  * always included, one would do the following:
281  *
282  * \code
283  * VariableSelectionResult result;
284  * result.init(features, labels);
285  * std::swap(result.selected[0], result.selected[5]);
286  * result.pivot = result.selected.begin() + 1;
287  * forward_selection(features, labels, result);
288  * \endcode
289  *
290  * \sa VariableSelectionResult
291  *
292  */
293 template<class FeatureT, class ResponseT, class ErrorRateCallBack>
294 void forward_selection(FeatureT const & features,
295  ResponseT const & response,
296  VariableSelectionResult & result,
297  ErrorRateCallBack errorcallback)
298 {
299  VariableSelectionResult::FeatureList_t & selected = result.selected;
300  VariableSelectionResult::ErrorList_t & errors = result.errors;
301  VariableSelectionResult::Pivot_t & pivot = result.pivot;
302  int featureCount = features.shape(1);
303  // initialize result struct if in use for the first time
304  if(!result.init(features, response, errorcallback))
305  {
306  //result is being reused; just ensure that the number of features is
307  //the same.
308  vigra_precondition((int)selected.size() == featureCount,
309  "forward_selection(): Number of features in Feature "
310  "matrix and number of features in previously used "
311  "result struct mismatch!");
312  }
313 
314 
315  int not_selected_size = std::distance(pivot, selected.end());
316  while(not_selected_size > 1)
317  {
318  std::vector<double> current_errors;
319  VariableSelectionResult::Pivot_t next = pivot;
320  for(int ii = 0; ii < not_selected_size; ++ii, ++next)
321  {
322  std::swap(*pivot, *next);
323  MultiArray<2, double> cur_feats;
324  detail::choose( features,
325  selected.begin(),
326  pivot+1,
327  cur_feats);
328  double error = errorcallback(cur_feats, response);
329  current_errors.push_back(error);
330  std::swap(*pivot, *next);
331  }
332  int pos = std::distance(current_errors.begin(),
333  std::min_element(current_errors.begin(),
334  current_errors.end()));
335  next = pivot;
336  std::advance(next, pos);
337  std::swap(*pivot, *next);
338  errors[std::distance(selected.begin(), pivot)] = current_errors[pos];
339 #ifdef RN_VERBOSE
340  std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", "));
341  std::cerr << "Choosing " << *pivot << " at error of " << current_errors[pos] << std::endl;
342 #endif
343  ++pivot;
344  not_selected_size = std::distance(pivot, selected.end());
345  }
346 }
347 template<class FeatureT, class ResponseT>
348 void forward_selection(FeatureT const & features,
349  ResponseT const & response,
350  VariableSelectionResult & result)
351 {
352  forward_selection(features, response, result, RFErrorCallback());
353 }
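
/* Sketch of reading the results (illustrative): after forward_selection(),
 * result.selected is ordered by inclusion and result.errors[ii] holds the
 * error rate obtained with the first ii+1 features (-1 if not evaluated).
 *
 *     forward_selection(features, labels, result);
 *     for(unsigned int ii = 0; ii < result.errors.size(); ++ii)
 *         if(result.errors[ii] >= 0)
 *             std::cout << "features 0.." << ii << ": error "
 *                       << result.errors[ii] << std::endl;
 */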
354 
355 
356 /** Perform backward elimination
357  *
358  * \param features IN: n x p matrix containing n instances with p attributes/features
359  * used in the variable selection algorithm
360  * \param response IN: n x 1 matrix containing the corresponding response
361  * \param result IN/OUT: VariableSelectionResult struct which will contain the results
362  * of the algorithm.
363  * Features between result.pivot and result.selected.end() will
364  * be left untouched.
365  * \sa VariableSelectionResult
366  * \param errorcallback
367  * IN, OPTIONAL:
368  * Functor that returns the error rate given a set of
369  * features and labels. Default is the RandomForest OOB Error.
370  *
371  * Backward elimination successively removes the feature that has the least influence
372  * on the error rate.
373  *
374  * usage:
375  * \code
376  * MultiArray<2, double> features = createSomeFeatures();
377  * MultiArray<2, int> labels = createCorrespondingLabels();
378  * VariableSelectionResult result;
379  * backward_elimination(features, labels, result);
380  * \endcode
381  * To use backward elimination while ensuring that a specific feature, e.g. feature 5, is
382  * always excluded, one would do the following:
383  *
384  * \code
385  * VariableSelectionResult result;
386  * result.init(features, labels);
387  * std::swap(result.selected[result.selected.size()-1], result.selected[5]);
388  * result.pivot = result.selected.begin() + (result.selected.size()-1);
389  * backward_elimination(features, labels, result);
390  * \endcode
391  *
392  * \sa VariableSelectionResult
393  *
394  */
395 template<class FeatureT, class ResponseT, class ErrorRateCallBack>
396 void backward_elimination(FeatureT const & features,
397  ResponseT const & response,
398  VariableSelectionResult & result,
399  ErrorRateCallBack errorcallback)
400 {
401  int featureCount = features.shape(1);
402  VariableSelectionResult::FeatureList_t & selected = result.selected;
403  VariableSelectionResult::ErrorList_t & errors = result.errors;
404  VariableSelectionResult::Pivot_t & pivot = result.pivot;
405 
406  // initialize result struct if in use for the first time
407  if(!result.init(features, response, errorcallback))
408  {
409  //result is being reused; just ensure that the number of features is
410  //the same.
411  vigra_precondition((int)selected.size() == featureCount,
412  "backward_elimination(): Number of features in Feature "
413  "matrix and number of features in previously used "
414  "result struct mismatch!");
415  }
416  pivot = selected.end() - 1;
417 
418  int selected_size = std::distance(selected.begin(), pivot);
419  while(selected_size > 1)
420  {
421  VariableSelectionResult::Pivot_t next = selected.begin();
422  std::vector<double> current_errors;
423  for(int ii = 0; ii < selected_size; ++ii, ++next)
424  {
425  std::swap(*pivot, *next);
426  MultiArray<2, double> cur_feats;
427  detail::choose( features,
428  selected.begin(),
429  pivot+1,
430  cur_feats);
431  double error = errorcallback(cur_feats, response);
432  current_errors.push_back(error);
433  std::swap(*pivot, *next);
434  }
435  int pos = std::distance(current_errors.begin(),
436  std::min_element(current_errors.begin(),
437  current_errors.end()));
438  next = selected.begin();
439  std::advance(next, pos);
440  std::swap(*pivot, *next);
441 // std::cerr << std::distance(selected.begin(), pivot) << " " << pos << " " << current_errors.size() << " " << errors.size() << std::endl;
442  errors[std::distance(selected.begin(), pivot)-1] = current_errors[pos];
443  selected_size = std::distance(selected.begin(), pivot);
444 #ifdef RN_VERBOSE
445  std::copy(current_errors.begin(), current_errors.end(), std::ostream_iterator<double>(std::cerr, ", "));
446  std::cerr << "Eliminating " << *pivot << " at error of " << current_errors[pos] << std::endl;
447 #endif
448  --pivot;
449  }
450 }
451 
452 template<class FeatureT, class ResponseT>
453 void backward_elimination(FeatureT const & features,
454  ResponseT const & response,
455  VariableSelectionResult & result)
456 {
457  backward_elimination(features, response, result, RFErrorCallback());
458 }
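
/* Sketch (illustrative): backward_elimination() fills result.errors in the
 * same format, so the error curve can be inspected exactly as after
 * forward_selection(); entries that remained -1 were not evaluated.
 *
 *     backward_elimination(features, labels, result);
 *     for(unsigned int ii = 0; ii < result.errors.size(); ++ii)
 *         if(result.errors[ii] >= 0)
 *             std::cout << ii+1 << " features: error "
 *                       << result.errors[ii] << std::endl;
 */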
459 
460 /** Perform rank selection using a predefined ranking
461  *
462  * \param features IN: n x p matrix containing n instances with p attributes/features
463  * used in the variable selection algorithm
464  * \param response IN: n x 1 matrix containing the corresponding response
465  * \param result IN/OUT: VariableSelectionResult struct which will contain the results
466  * of the algorithm. The struct should be initialized with the
467  * predefined ranking.
468  *
469  * \sa VariableSelectionResult
470  * \param errorcallback
471  * IN, OPTIONAL:
472  * Functor that returns the error rate given a set of
473  * features and labels. Default is the RandomForest OOB Error.
474  *
475  * Often a variable importance or score measure is used to create the ordering in which
476  * variables are selected. This method takes such a ranking and calculates the
477  * corresponding error rates.
478  *
479  * usage:
480  * \code
481  * MultiArray<2, double> features = createSomeFeatures();
482  * MultiArray<2, int> labels = createCorrespondingLabels();
483  * std::vector<int> ranking = createRanking(features);
484  * VariableSelectionResult result;
485  * result.init(features, labels, ranking.begin(), ranking.end());
486  * rank_selection(features, labels, result);
487  * \endcode
488  *
489  * \sa VariableSelectionResult
490  *
491  */
492 template<class FeatureT, class ResponseT, class ErrorRateCallBack>
493 void rank_selection (FeatureT const & features,
494  ResponseT const & response,
495  VariableSelectionResult & result,
496  ErrorRateCallBack errorcallback)
497 {
498  VariableSelectionResult::FeatureList_t & selected = result.selected;
499  VariableSelectionResult::ErrorList_t & errors = result.errors;
500  VariableSelectionResult::Pivot_t & iter = result.pivot;
501  int featureCount = features.shape(1);
502  // initialize result struct if in use for the first time
503  if(!result.init(features, response, errorcallback))
504  {
505  //result is being reused; just ensure that the number of features is
506  //the same.
507  vigra_precondition((int)selected.size() == featureCount,
508  "rank_selection(): Number of features in Feature "
509  "matrix and number of features in previously used "
510  "result struct mismatch!");
511  }
512 
513  int ii = 0;
514  for(; iter != selected.end(); ++iter)
515  {
516  ++ii;
517  MultiArray<2, double> cur_feats;
518  detail::choose( features,
519  selected.begin(),
520  iter+1,
521  cur_feats);
522  double error = errorcallback(cur_feats, response);
523  errors[std::distance(selected.begin(), iter)] = error;
524 #ifdef RN_VERBOSE
525  std::copy(selected.begin(), iter+1, std::ostream_iterator<int>(std::cerr, ", "));
526  std::cerr << "Choosing " << *(iter+1) << " at error of " << error << std::endl;
527 #endif
528 
529  }
530 }
531 
532 template<class FeatureT, class ResponseT>
533 void rank_selection (FeatureT const & features,
534  ResponseT const & response,
535  VariableSelectionResult & result)
536 {
537  rank_selection(features, response, result, RFErrorCallback());
538 }
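
/* Sketch (illustrative): a ranking can, for instance, be derived from any
 * per-variable score array with get_ranking() (defined at the end of this
 * file) and then evaluated with rank_selection(); "importance_scores" is a
 * placeholder name.
 *
 *     std::vector<int> ranking;
 *     get_ranking(importance_scores, ranking);   // indices, best score first
 *     VariableSelectionResult res;
 *     res.init(features, labels, ranking.begin(), ranking.end());
 *     rank_selection(features, labels, res);
 */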
539 
540 
541 
542 enum ClusterLeafTypes{c_Leaf = 95, c_Node = 99};
543 
544 /* View of a node in the hierarchical clustering
545  * tree. For internal use only.
546  *
547  * \sa NodeBase
548  */
549 class ClusterNode
550 : public NodeBase
551 {
552  public:
553 
554  typedef NodeBase BT;
555 
556  /**constructors **/
557  ClusterNode():NodeBase(){}
558  ClusterNode( int nCol,
559  BT::T_Container_type & topology,
560  BT::P_Container_type & split_param)
561  : BT(nCol + 5, 5,topology, split_param)
562  {
563  status() = 0;
564  BT::column_data()[0] = nCol;
565  if(nCol == 1)
566  BT::typeID() = c_Leaf;
567  else
568  BT::typeID() = c_Node;
569  }
570 
571  ClusterNode( BT::T_Container_type const & topology,
572  BT::P_Container_type const & split_param,
573  int n )
574  : NodeBase(5 , 5,topology, split_param, n)
575  {
576  //TODO : is there a more elegant way to do this?
577  BT::topology_size_ += BT::column_data()[0];
578  }
579 
580  ClusterNode( BT & node_)
581  : BT(5, 5, node_)
582  {
583  //TODO : is there a more elegant way to do this?
584  BT::topology_size_ += BT::column_data()[0];
585  BT::parameter_size_ += 0;
586  }
587  int index()
588  {
589  return static_cast<int>(BT::parameters_begin()[1]);
590  }
591  void set_index(int in)
592  {
593  BT::parameters_begin()[1] = in;
594  }
595  double& mean()
596  {
597  return BT::parameters_begin()[2];
598  }
599  double& stdev()
600  {
601  return BT::parameters_begin()[3];
602  }
603  double& status()
604  {
605  return BT::parameters_begin()[4];
606  }
607 };
608 
609 /** Stack entry class for the HClustering class
610  */
611 struct HC_Entry
612 {
613  int parent;
614  int level;
615  int addr;
616  bool infm;
617  HC_Entry(int p, int l, int a, bool in)
618  : parent(p), level(l), addr(a), infm(in)
619  {}
620 };
621 
622 
623 /** Hierarchical Clustering class.
624  * Performs single linkage clustering
625  * \code
626  * Matrix<double> distance = get_distance_matrix();
627  * linkage.cluster(distance);
628  * // Draw clustering tree.
629  * Draw<double, int> draw(features, labels, "linkagetree.graph");
630  * linkage.breadth_first_traversal(draw);
631  * \endcode
632  * \sa ClusterImportanceVisitor
633  *
634  * Once the clustering has taken place, information queries can be made
635  * using the breadth_first_traversal() and iterate() methods.
636  *
637  */
638 class HClustering
639 {
640 public:
641  typedef MultiArrayShape<2>::type Shp;
642  ArrayVector<int> topology_;
643  ArrayVector<double> parameters_;
644  int begin_addr;
645 
646  // Combines the distances of two merged clusters (single linkage: the minimum)
647  double dist_func(double a, double b)
648  {
649  return std::min(a, b);
650  }
651 
652  /** Visit each node with a Functor
653  * in creation order (should be depth first)
654  */
655  template<class Functor>
656  void iterate(Functor & tester)
657  {
658 
659  std::vector<int> stack;
660  stack.push_back(begin_addr);
661  while(!stack.empty())
662  {
663  ClusterNode node(topology_, parameters_, stack.back());
664  stack.pop_back();
665  if(!tester(node))
666  {
667  if(node.columns_size() != 1)
668  {
669  stack.push_back(node.child(0));
670  stack.push_back(node.child(1));
671  }
672  }
673  }
674  }
675 
676  /** Perform breadth first traversal of hierarchical cluster tree
677  */
678  template<class Functor>
679  void breadth_first_traversal(Functor & tester)
680  {
681 
682  std::queue<HC_Entry> queue;
683  int level = 0;
684  int parent = -1;
685  int addr = -1;
686  bool infm = false;
687  queue.push(HC_Entry(parent,level,begin_addr, infm));
688  while(!queue.empty())
689  {
690  level = queue.front().level;
691  parent = queue.front().parent;
692  addr = queue.front().addr;
693  infm = queue.front().infm;
694  ClusterNode node(topology_, parameters_, queue.front().addr);
695  ClusterNode parnt;
696  if(parent != -1)
697  {
698  parnt = ClusterNode(topology_, parameters_, parent);
699  }
700  queue.pop();
701  bool istrue = tester(node, level, parnt, infm);
702  if(node.columns_size() != 1)
703  {
704  queue.push(HC_Entry(addr, level +1,node.child(0),istrue));
705  queue.push(HC_Entry(addr, level +1,node.child(1),istrue));
706  }
707  }
708  }
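
 /* Sketch of a breadth-first visitor (illustrative): operator() receives the
  * node, its level, its parent (parent.hasData_ is false at the root) and the
  * parent's return value; the value returned here is handed to the children
  * as their 'infm' argument.
  *
  *     struct MaxLevel
  *     {
  *         int max_level;
  *         MaxLevel() : max_level(0) {}
  *         template<class Node>
  *         bool operator()(Node & cur, int level, Node parent, bool infm)
  *         {
  *             if(level > max_level)
  *                 max_level = level;
  *             return true;
  *         }
  *     };
  *     MaxLevel ml;
  *     linkage.breadth_first_traversal(ml);
  */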
709  /**save to HDF5 - defunct - has to be updated to new HDF5 interface
710  */
711 #ifdef HasHDF5
712  void save(std::string file, std::string prefix)
713  {
714 
715  vigra::writeHDF5(file.c_str(), (prefix + "topology").c_str(),
716  MultiArrayView<2, int>(
717  Shp(topology_.size(),1),
718  topology_.data()));
719  vigra::writeHDF5(file.c_str(), (prefix + "parameters").c_str(),
720  MultiArrayView<2, double>(
721  Shp(parameters_.size(), 1),
722  parameters_.data()));
723  vigra::writeHDF5(file.c_str(), (prefix + "begin_addr").c_str(),
724  MultiArrayView<2, int>(Shp(1,1), &begin_addr));
725 
726  }
727 #endif
728 
729  /**Perform single linkage clustering
730  * \param distance distance matrix used. \sa CorrelationVisitor
731  */
732  template<class T, class C>
733  void cluster(MultiArrayView<2, T, C> distance)
734  {
735  MultiArray<2, T> dist(distance);
736  std::vector<std::pair<int, int> > addr;
737  typedef std::pair<int, int> Entry;
738  int index = 0;
739  for(int ii = 0; ii < distance.shape(0); ++ii)
740  {
741  addr.push_back(std::make_pair(topology_.size(), ii));
742  ClusterNode leaf(1, topology_, parameters_);
743  leaf.set_index(index);
744  ++index;
745  leaf.columns_begin()[0] = ii;
746  }
747 
748  while(addr.size() != 1)
749  {
750  //find the two nodes with the smallest distance
751  int ii_min = 0;
752  int jj_min = 1;
753  double min_dist = dist((addr.begin()+ii_min)->second,
754  (addr.begin()+jj_min)->second);
755  for(unsigned int ii = 0; ii < addr.size(); ++ii)
756  {
757  for(unsigned int jj = ii+1; jj < addr.size(); ++jj)
758  {
759  if( dist((addr.begin()+ii_min)->second,
760  (addr.begin()+jj_min)->second)
761  > dist((addr.begin()+ii)->second,
762  (addr.begin()+jj)->second))
763  {
764  min_dist = dist((addr.begin()+ii)->second,
765  (addr.begin()+jj)->second);
766  ii_min = ii;
767  jj_min = jj;
768  }
769  }
770  }
771 
772  //merge two nodes
773  int col_size = 0;
774  // The problem is that creating a new node invalidates the iterators stored
775  // in firstChild and secondChild.
776  {
777  ClusterNode firstChild(topology_,
778  parameters_,
779  (addr.begin() +ii_min)->first);
780  ClusterNode secondChild(topology_,
781  parameters_,
782  (addr.begin() +jj_min)->first);
783  col_size = firstChild.columns_size() + secondChild.columns_size();
784  }
785  int cur_addr = topology_.size();
786  begin_addr = cur_addr;
787 // std::cerr << col_size << std::endl;
788  ClusterNode parent(col_size,
789  topology_,
790  parameters_);
791  ClusterNode firstChild(topology_,
792  parameters_,
793  (addr.begin() +ii_min)->first);
794  ClusterNode secondChild(topology_,
795  parameters_,
796  (addr.begin() +jj_min)->first);
797  parent.parameters_begin()[0] = min_dist;
798  parent.set_index(index);
799  ++index;
800  std::merge(firstChild.columns_begin(), firstChild.columns_end(),
801  secondChild.columns_begin(),secondChild.columns_end(),
802  parent.columns_begin());
803  //merge nodes in addr
804  int to_keep;
805  int to_desc;
806  int ii_keep;
807  if(*parent.columns_begin() == *firstChild.columns_begin())
808  {
809  parent.child(0) = (addr.begin()+ii_min)->first;
810  parent.child(1) = (addr.begin()+jj_min)->first;
811  (addr.begin()+ii_min)->first = cur_addr;
812  ii_keep = ii_min;
813  to_keep = (addr.begin()+ii_min)->second;
814  to_desc = (addr.begin()+jj_min)->second;
815  addr.erase(addr.begin()+jj_min);
816  }
817  else
818  {
819  parent.child(1) = (addr.begin()+ii_min)->first;
820  parent.child(0) = (addr.begin()+jj_min)->first;
821  (addr.begin()+jj_min)->first = cur_addr;
822  ii_keep = jj_min;
823  to_keep = (addr.begin()+jj_min)->second;
824  to_desc = (addr.begin()+ii_min)->second;
825  addr.erase(addr.begin()+ii_min);
826  }
827  //update distances;
828 
829  for(int jj = 0 ; jj < (int)addr.size(); ++jj)
830  {
831  if(jj == ii_keep)
832  continue;
833  double bla = dist_func(
834  dist(to_desc, (addr.begin()+jj)->second),
835  dist((addr.begin()+ii_keep)->second,
836  (addr.begin()+jj)->second));
837 
838  dist((addr.begin()+ii_keep)->second,
839  (addr.begin()+jj)->second) = bla;
840  dist((addr.begin()+jj)->second,
841  (addr.begin()+ii_keep)->second) = bla;
842  }
843  }
844  }
845 
846 };
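
/* Usage sketch (illustrative): cluster a symmetric p x p distance matrix and
 * visit every node with iterate(); returning false from the functor descends
 * into the children.
 *
 *     struct PrintStatus
 *     {
 *         template<class Node>
 *         bool operator()(Node & node)
 *         {
 *             std::cerr << "node " << node.index()
 *                       << ": status " << node.status() << "\n";
 *             return false;          // continue traversal
 *         }
 *     };
 *
 *     HClustering linkage;
 *     linkage.cluster(distance);
 *     PrintStatus ps;
 *     linkage.iterate(ps);
 */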
847 
848 
849 /** Normalize the status value in the HClustering tree (HClustering Visitor)
850  */
851 class NormalizeStatus
852 {
853 public:
854  double n;
855  /** Constructor
856  * \param m normalize status() by m
857  */
858  NormalizeStatus(double m)
859  :n(m)
860  {}
861  template<class Node>
862  bool operator()(Node& node)
863  {
864  node.status()/=n;
865  return false;
866  }
867 };
868 
869 
870 /** Perform permutation importance on HClustering clusters.
871  * (See the visit_after_tree() method of visitors::VariableImportance for
872  * the basic idea; here the permutation is applied not only to single
873  * variables but to whole clusters.)
874  */
875 template<class Iter, class DT>
876 class PermuteCluster
877 {
878 public:
879  typedef MultiArrayShape<2>::type Shp;
880  Matrix<double> tmp_mem_;
881  MultiArrayView<2, double> perm_imp;
882  MultiArrayView<2, double> orig_imp;
883  Matrix<double> feats_;
884  Matrix<int> labels_;
885  const int nPerm;
886  DT const & dt;
887  int index;
888  int oob_size;
889 
890  template<class Feat_T, class Label_T>
891  PermuteCluster(Iter a,
892  Iter b,
893  Feat_T const & feats,
894  Label_T const & labls,
895  MultiArrayView<2, double> p_imp,
896  MultiArrayView<2, double> o_imp,
897  int np,
898  DT const & dt_)
899  :tmp_mem_(_spl(a, b).size(), feats.shape(1)),
900  perm_imp(p_imp),
901  orig_imp(o_imp),
902  feats_(_spl(a,b).size(), feats.shape(1)),
903  labels_(_spl(a,b).size(),1),
904  nPerm(np),
905  dt(dt_),
906  index(0),
907  oob_size(b-a)
908  {
909  copy_splice(_spl(a,b),
910  _spl(feats.shape(1)),
911  feats,
912  feats_);
913  copy_splice(_spl(a,b),
914  _spl(labls.shape(1)),
915  labls,
916  labels_);
917  }
918 
919  template<class Node>
920  bool operator()(Node& node)
921  {
922  tmp_mem_ = feats_;
923  RandomMT19937 random;
924  int class_count = perm_imp.shape(1) - 1;
925  //permute columns together
926  for(int kk = 0; kk < nPerm; ++kk)
927  {
928  tmp_mem_ = feats_;
929  for(int ii = 0; ii < rowCount(feats_); ++ii)
930  {
931  int index = random.uniformInt(rowCount(feats_) - ii) +ii;
932  for(int jj = 0; jj < node.columns_size(); ++jj)
933  {
934  if(node.columns_begin()[jj] != feats_.shape(1))
935  tmp_mem_(ii, node.columns_begin()[jj])
936  = tmp_mem_(index, node.columns_begin()[jj]);
937  }
938  }
939 
940  for(int ii = 0; ii < rowCount(tmp_mem_); ++ii)
941  {
942  if(dt
943  .predictLabel(rowVector(tmp_mem_, ii))
944  == labels_(ii, 0))
945  {
946  //per class
947  ++perm_imp(index,labels_(ii, 0));
948  //total
949  ++perm_imp(index, class_count);
950  }
951  }
952  }
953  double node_status = perm_imp(index, class_count);
954  node_status /= nPerm;
955  node_status -= orig_imp(0, class_count);
956  node_status *= -1;
957  node_status /= oob_size;
958  node.status() += node_status;
959  ++index;
960 
961  return false;
962  }
963 };
964 
965 /** Convert ClusteringTree into a list (HClustering visitor)
966  */
967 class GetClusterVariables
968 {
969 public:
970  /** NumberOfClusters x NumberOfVariables MultiArrayView containing
971  * in each row the variable belonging to a cluster
972  */
973  MultiArrayView<2, int> variables;
974  int index;
975  GetClusterVariables(MultiArrayView<2, int> vars)
976  :variables(vars), index(0)
977  {}
978 #ifdef HasHDF5
979  void save(std::string file, std::string prefix)
980  {
981  vigra::writeHDF5(file.c_str(), (prefix + "_variables").c_str(),
982  variables);
983  }
984 #endif
985 
986  template<class Node>
987  bool operator()(Node& node)
988  {
989  for(int ii = 0; ii < node.columns_size(); ++ii)
990  variables(index, ii) = node.columns_begin()[ii];
991  ++index;
992  return false;
993  }
994 };
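
/* Usage sketch (illustrative): a binary merge tree over p leaves has 2*p - 1
 * nodes, so a (2*p - 1) x p matrix initialized with -1 can hold all cluster
 * memberships.
 *
 *     int p = distance.shape(0);
 *     MultiArray<2, int> vars(MultiArrayShape<2>::type(2*p - 1, p), -1);
 *     GetClusterVariables gcv(vars);
 *     linkage.iterate(gcv);
 *     // row ii of vars now lists the columns of cluster ii (padded with -1)
 */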
995 /** corrects the status fields of a linkage Clustering (HClustering Visitor)
996  *
997  * such that status(currentNode) = min(status(parent), status(currentNode))
998  * \sa cluster_permutation_importance()
999  */
1000 class CorrectStatus
1001 {
1002 public:
1003  template<class Nde>
1004  bool operator()(Nde & cur, int level, Nde parent, bool infm)
1005  {
1006  if(parent.hasData_)
1007  cur.status() = std::min(parent.status(), cur.status());
1008  return true;
1009  }
1010 };
1011 
1012 
1013 /** draw current linkage Clustering (HClustering Visitor)
1014  *
1015  * create a graphviz .dot file
1016  * usage:
1017  * \code
1018  * Matrix<double> distance = get_distance_matrix();
1019  * linkage.cluster(distance);
1020  * Draw<double, int> draw(features, labels, "linkagetree.graph");
1021  * linkage.breadth_first_traversal(draw);
1022  * \endcode
1023  */
1024 template<class T1,
1025  class T2,
1026  class C1 = UnstridedArrayTag,
1027  class C2 = UnstridedArrayTag>
1028 class Draw
1029 {
1030 public:
1031  typedef MultiArrayShape<2>::type Shp;
1032  MultiArrayView<2, T1, C1> const & features_;
1033  MultiArrayView<2, T2, C2> const & labels_;
1034  std::ofstream graphviz;
1035 
1036 
1037  Draw(MultiArrayView<2, T1, C1> const & features,
1038  MultiArrayView<2, T2, C2> const& labels,
1039  std::string const gz)
1040  :features_(features), labels_(labels),
1041  graphviz(gz.c_str(), std::ios::out)
1042  {
1043  graphviz << "digraph G\n{\n node [shape=\"record\"]";
1044  }
1045  ~Draw()
1046  {
1047  graphviz << "\n}\n";
1048  graphviz.close();
1049  }
1050 
1051  template<class Nde>
1052  bool operator()(Nde & cur, int level, Nde parent, bool infm)
1053  {
1054  graphviz << "node" << cur.index() << " [style=\"filled\"][label = \" #Feats: "<< cur.columns_size() << "\\n";
1055  graphviz << " status: " << cur.status() << "\\n";
1056  for(int kk = 0; kk < cur.columns_size(); ++kk)
1057  {
1058  graphviz << cur.columns_begin()[kk] << " ";
1059  if(kk % 15 == 14)
1060  graphviz << "\\n";
1061  }
1062  graphviz << "\"] [color = \"" <<cur.status() << " 1.000 1.000\"];\n";
1063  if(parent.hasData_)
1064  graphviz << "\"node" << parent.index() << "\" -> \"node" << cur.index() <<"\";\n";
1065  return true;
1066  }
1067 };
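
/* The file written by Draw is in graphviz "dot" syntax; assuming graphviz is
 * installed, it can be rendered from the command line, e.g.:
 *
 *     dot -Tpng linkagetree.graph -o linkagetree.png
 */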
1068 
1069 /** calculate Cluster based permutation importance while learning. (RandomForestVisitor)
1070  */
1071 class ClusterImportanceVisitor: public visitors::VisitorBase
1072 {
1073  public:
1074 
1075  /** List of variables as produced by GetClusterVariables
1076  */
1077  MultiArray<2, int> variables;
1078  /** Corresponding importance measures
1079  */
1080  MultiArray<2, double> cluster_importance_;
1081  /** Corresponding error
1082  */
1083  MultiArray<2, double> cluster_stdev_;
1084  int repetition_count_;
1085  bool in_place_;
1086  HClustering & clustering;
1087 
1088 
1089 #ifdef HasHDF5
1090  void save(std::string filename, std::string prefix)
1091  {
1092  std::string prefix1 = "cluster_importance_" + prefix;
1093  writeHDF5(filename.c_str(),
1094  prefix1.c_str(),
1095  cluster_importance_);
1096  prefix1 = "vars_" + prefix;
1097  writeHDF5(filename.c_str(),
1098  prefix1.c_str(),
1099  variables);
1100  }
1101 #endif
1102 
1103  ClusterImportanceVisitor(HClustering & clst, int rep_cnt = 10)
1104  : repetition_count_(rep_cnt), in_place_(false), clustering(clst)
1105 
1106  {}
1107 
1108  /** Allocate enough memory
1109  */
1110  template<class RF, class PR>
1111  void visit_at_beginning(RF const & rf, PR const & pr)
1112  {
1113  Int32 const class_count = rf.ext_param_.class_count_;
1114  Int32 const column_count = rf.ext_param_.column_count_+1;
1115  cluster_importance_
1116  .reshape(MultiArrayShape<2>::type(2*column_count-1,
1117  class_count+1));
1118  cluster_stdev_
1119  .reshape(MultiArrayShape<2>::type(2*column_count-1,
1120  class_count+1));
1121  variables
1122  .reshape(MultiArrayShape<2>::type(2*column_count-1,
1123  column_count), -1);
1124  GetClusterVariables gcv(variables);
1125  clustering.iterate(gcv);
1126 
1127  }
1128 
1129  /** compute permutation-based variable importance
1130  * (only an array of size oob_sample_count x 1 is created,
1131  * as opposed to oob_sample_count x feature_count in the other method).
1132  *
1133  * \sa FieldProxy
1134  */
1135  template<class RF, class PR, class SM, class ST>
1136  void after_tree_ip_impl(RF& rf, PR & pr, SM & sm, ST & st, int index)
1137  {
1138  typedef MultiArrayShape<2>::type Shp_t;
1139  Int32 column_count = rf.ext_param_.column_count_ +1;
1140  Int32 class_count = rf.ext_param_.class_count_;
1141 
1142  // cast away the const on the features (yep, I know what I am
1143  // doing here); the data is not modified.
1144  typename PR::Feature_t & features
1145  = const_cast<typename PR::Feature_t &>(pr.features());
1146 
1147  //find the oob indices of current tree.
1148  ArrayVector<Int32> oob_indices;
1149  ArrayVector<Int32>::iterator
1150  iter;
1151 
1152  if(rf.ext_param_.actual_msample_ < pr.features().shape(0)- 10000)
1153  {
1154  ArrayVector<int> cts(2, 0);
1155  ArrayVector<Int32> indices(pr.features().shape(0));
1156  for(int ii = 0; ii < pr.features().shape(0); ++ii)
1157  indices[ii] = ii;
1158  std::random_shuffle(indices.begin(), indices.end());
1159  for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
1160  {
1161  if(!sm.is_used()[indices[ii]] && cts[pr.response()(indices[ii], 0)] < 3000)
1162  {
1163  oob_indices.push_back(indices[ii]);
1164  ++cts[pr.response()(indices[ii], 0)];
1165  }
1166  }
1167  }
1168  else
1169  {
1170  for(int ii = 0; ii < rf.ext_param_.row_count_; ++ii)
1171  if(!sm.is_used()[ii])
1172  oob_indices.push_back(ii);
1173  }
1174 
1175  // Random foo
1176  RandomMT19937 random(RandomSeed);
1177  UniformIntRandomFunctor<RandomMT19937>
1178  randint(random);
1179 
1180  //make some space for the results
1181  MultiArray<2, double>
1182  oob_right(Shp_t(1, class_count + 1));
1183 
1184  // get the oob success rate with the original samples
1185  for(iter = oob_indices.begin();
1186  iter != oob_indices.end();
1187  ++iter)
1188  {
1189  if(rf.tree(index)
1190  .predictLabel(rowVector(features, *iter))
1191  == pr.response()(*iter, 0))
1192  {
1193  //per class
1194  ++oob_right[pr.response()(*iter,0)];
1195  //total
1196  ++oob_right[class_count];
1197  }
1198  }
1199 
1201  perm_oob_right (Shp_t(2* column_count-1, class_count + 1));
1202 
1203  PermuteCluster<ArrayVector<Int32>::iterator,typename RF::DecisionTree_t>
1204  pc(oob_indices.begin(), oob_indices.end(),
1205  pr.features(),
1206  pr.response(),
1207  perm_oob_right,
1208  oob_right,
1209  repetition_count_,
1210  rf.tree(index));
1211  clustering.iterate(pc);
1212 
1213  perm_oob_right /= repetition_count_;
1214  for(int ii = 0; ii < rowCount(perm_oob_right); ++ii)
1215  rowVector(perm_oob_right, ii) -= oob_right;
1216 
1217  perm_oob_right *= -1;
1218  perm_oob_right /= oob_indices.size();
1219  cluster_importance_ += perm_oob_right;
1220  }
1221 
1222  /** calculate permutation-based impurity after every tree has been
1223  * learned. The default behaviour is that this happens out of place.
1224  * If you have very big data sets and want to avoid copying, set
1225  * the in_place_ flag to true.
1226  */
1227  template<class RF, class PR, class SM, class ST>
1228  void visit_after_tree(RF& rf, PR & pr, SM & sm, ST & st, int index)
1229  {
1230  after_tree_ip_impl(rf, pr, sm, st, index);
1231  }
1232 
1233  /** Normalise variable importance after the number of trees is known.
1234  */
1235  template<class RF, class PR>
1236  void visit_at_end(RF & rf, PR & pr)
1237  {
1238  NormalizeStatus nrm(rf.tree_count());
1239  clustering.iterate(nrm);
1240  cluster_importance_ /= rf.trees_.size();
1241  }
1242 };
1243 
1244 /** Perform hierarchical clustering of variables and assess importance of clusters
1245  *
1246  * \param features IN: n x p matrix containing n instances with p attributes/features
1247  * used in the variable selection algorithm
1248  * \param response IN: n x 1 matrix containing the corresponding response
1249  * \param linkage OUT: Hierarchical grouping of variables.
1250  * \param distance OUT: distance matrix used for creating the linkage
1251  *
1252  * Performs hierarchical clustering of variables and calculates the permutation
1253  * importance measure of each of the clusters. Use the Draw functor to create human-readable output.
1254  * The cluster permutation importance measure corresponds to the normal permutation
1255  * importance measure with all columns belonging to a cluster permuted together.
1256  * The importance measure for each cluster is stored in the status() field of each cluster node.
1257  * \sa HClustering
1258  *
1259  * usage:
1260  * \code
1261  * MultiArray<2, double> features = createSomeFeatures();
1262  * MultiArray<2, int> labels = createCorrespondingLabels();
1263  * HClustering linkage;
1264  * MultiArray<2, double> distance;
1265  * cluster_permutation_importance(features, labels, linkage, distance)
1266  * // create graphviz output
1267  *
1268  * Draw<double, int> draw(features, labels, "linkagetree.graph");
1269  * linkage.breadth_first_traversal(draw);
1270  *
1271  * \endcode
1272  *
1273  *
1274  */
1275 template<class FeatureT, class ResponseT>
1276 void cluster_permutation_importance(FeatureT const & features,
1277  ResponseT const & response,
1278  HClustering & linkage,
1279  MultiArray<2, double> & distance)
1280 {
1281 
1282  RandomForestOptions opt;
1283  opt.tree_count(100);
1284  if(features.shape(0) > 40000)
1285  opt.samples_per_tree(20000).use_stratification(RF_EQUAL);
1286 
1287 
1288  vigra::RandomForest<int> RF(opt);
1289  visitors::RandomForestProgressVisitor progress;
1290  visitors::CorrelationVisitor missc;
1291  RF.learn(features, response,
1292  create_visitor(missc, progress));
1293  distance = missc.distance;
1294  /*
1295  missc.save(exp_dir + dset.name() + "_result.h5", dset.name()+"MACH");
1296  */
1297 
1298 
1299  // Produce linkage
1300  linkage.cluster(distance);
1301 
1302  //linkage.save(exp_dir + dset.name() + "_result.h5", "_linkage_CC/");
1303  vigra::RandomForest<int> RF2(opt);
1304  ClusterImportanceVisitor ci(linkage);
1305  RF2.learn(features,
1306  response,
1307  create_visitor(progress, ci));
1308 
1309 
1310  CorrectStatus cs;
1311  linkage.breadth_first_traversal(cs);
1312 
1313  //ci.save(exp_dir + dset.name() + "_result.h5", dset.name());
1314  //Draw<double, int> draw(dset.features(), dset.response(), exp_dir+ dset.name() + ".graph");
1315  //linkage.breadth_first_traversal(draw);
1316 
1317 }
1318 
1319 
1320 template<class FeatureT, class ResponseT>
1321 void cluster_permutation_importance(FeatureT const & features,
1322  ResponseT const & response,
1323  HClustering & linkage)
1324 {
1325  MultiArray<2, double> distance;
1326  cluster_permutation_importance(features, response, linkage, distance);
1327 }
1328 
1329 
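/** Create a ranking from an array of per-variable scores: out receives the
 * indices of in, ordered by descending score.
 *
 * Note that the scores are used as std::map keys, so variables with exactly
 * equal scores collapse to a single entry; out may then contain fewer
 * indices than in has elements.
 */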
1330 template<class Array1, class Vector1>
1331 void get_ranking(Array1 const & in, Vector1 & out)
1332 {
1333  std::map<double, int> mymap;
1334  for(int ii = 0; ii < in.size(); ++ii)
1335  mymap[in[ii]] = ii;
1336  for(std::map<double, int>::reverse_iterator iter = mymap.rbegin(); iter!= mymap.rend(); ++iter)
1337  {
1338  out.push_back(iter->second);
1339  }
1340 }
1341 }//namespace algorithms
1342 }//namespace rf
1343 }//namespace vigra
#endif // VIGRA_RF_ALGORITHM_HXX
