SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * 00008 * Written (W) 2006 Christian Gehl 00009 * Written (W) 2006-2009 Soeren Sonnenburg 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #include "classifier/KNN.h" 00014 #include "features/Labels.h" 00015 #include "lib/Mathematics.h" 00016 #include "lib/Signal.h" 00017 00018 using namespace shogun; 00019 00020 CKNN::CKNN() 00021 : CDistanceMachine(), k(3), num_classes(0), num_train_labels(0), train_labels(NULL) 00022 { 00023 } 00024 00025 CKNN::CKNN(int32_t k_, CDistance* d, CLabels* trainlab) 00026 : CDistanceMachine(), k(k_), num_classes(0), train_labels(NULL) 00027 { 00028 ASSERT(d); 00029 ASSERT(trainlab); 00030 00031 set_distance(d); 00032 set_labels(trainlab); 00033 num_train_labels=trainlab->get_num_labels(); 00034 } 00035 00036 00037 CKNN::~CKNN() 00038 { 00039 delete[] train_labels; 00040 } 00041 00042 bool CKNN::train(CFeatures* data) 00043 { 00044 ASSERT(labels); 00045 ASSERT(distance); 00046 00047 if (data) 00048 { 00049 if (labels->get_num_labels() != data->get_num_vectors()) 00050 SG_ERROR("Number of training vectors does not match number of labels\n"); 00051 distance->init(data, data); 00052 } 00053 00054 train_labels=labels->get_int_labels(num_train_labels); 00055 ASSERT(train_labels); 00056 ASSERT(num_train_labels>0); 00057 00058 int32_t max_class=train_labels[0]; 00059 int32_t min_class=train_labels[0]; 00060 00061 int32_t i; 00062 for (i=1; i<num_train_labels; i++) 00063 { 00064 max_class=CMath::max(max_class, train_labels[i]); 00065 min_class=CMath::min(min_class, train_labels[i]); 00066 } 00067 00068 for (i=0; i<num_train_labels; i++) 00069 train_labels[i]-=min_class; 00070 00071 min_label=min_class; 00072 num_classes=max_class-min_class+1; 00073 00074 SG_INFO( "num_classes: %d (%+d to %+d) num_train: %d\n", num_classes, min_class, max_class, num_train_labels); 00075 return true; 00076 } 00077 00078 CLabels* CKNN::classify() 00079 { 00080 ASSERT(num_classes>0); 00081 ASSERT(distance); 00082 ASSERT(distance->get_num_vec_rhs()); 00083 00084 int32_t num_lab=distance->get_num_vec_rhs(); 00085 ASSERT(k<=num_lab); 00086 00087 CLabels* output=new CLabels(num_lab); 00088 00089 //distances to train data and working buffer of train_labels 00090 float64_t* dists=new float64_t[num_train_labels]; 00091 int32_t* train_lab=new int32_t[num_train_labels]; 00092 00094 int32_t* classes=new int32_t[num_classes]; 00095 00096 ASSERT(dists); 00097 ASSERT(train_lab); 00098 ASSERT(classes); 00099 00100 SG_INFO( "%d test examples\n", num_lab); 00101 CSignal::clear_cancel(); 00102 00103 for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++) 00104 { 00105 SG_PROGRESS(i, 0, num_lab); 00106 00107 // lhs idx 1..n and rhs idx i 00108 distances_lhs(dists,0,num_train_labels-1,i); 00109 int32_t j; 00110 for (j=0; j<num_train_labels; j++) 00111 train_lab[j]=train_labels[j]; 00112 00113 //sort the distance vector for test example j to all train examples 00114 //classes[1..k] then holds the classes for minimum distance 00115 CMath::qsort_index(dists, train_lab, num_train_labels); 00116 00117 //compute histogram of class outputs of the first k nearest neighbours 00118 for (j=0; j<num_classes; j++) 00119 classes[j]=0; 00120 00121 for (j=0; j<k; j++) 00122 classes[train_lab[j]]++; 00123 00124 //choose the class that got 'outputted' most often 00125 int32_t out_idx=0; 00126 int32_t out_max=0; 00127 00128 for (j=0; j<num_classes; j++) 00129 { 00130 if (out_max< classes[j]) 00131 { 00132 out_idx= j; 00133 out_max= classes[j]; 00134 } 00135 } 00136 00137 output->set_label(i, out_idx+min_label); 00138 } 00139 00140 delete[] dists; 00141 delete[] train_lab; 00142 delete[] classes; 00143 00144 return output; 00145 } 00146 00147 CLabels* CKNN::classify(CFeatures* data) 00148 { 00149 if (!distance) 00150 SG_ERROR("No distance assigned!\n"); 00151 00152 CFeatures* lhs=distance->get_lhs(); 00153 if (!lhs || !lhs->get_num_vectors()) 00154 { 00155 SG_UNREF(lhs); 00156 SG_ERROR("No vectors on left hand side\n"); 00157 } 00158 distance->init(lhs, data); 00159 SG_UNREF(lhs); 00160 00161 return classify(); 00162 } 00163 00164 void CKNN::classify_for_multiple_k(int32_t** dst, int32_t* num_vec, int32_t* k_out) 00165 { 00166 ASSERT(dst); 00167 ASSERT(k_out); 00168 ASSERT(num_vec); 00169 00170 ASSERT(num_classes>0); 00171 ASSERT(distance); 00172 ASSERT(distance->get_num_vec_rhs()); 00173 00174 int32_t num_lab=distance->get_num_vec_rhs(); 00175 ASSERT(k<=num_lab); 00176 00177 int32_t* output=(int32_t*) malloc(sizeof(int32_t)*k*num_lab); 00178 00179 //distances to train data and working buffer of train_labels 00180 float64_t* dists=new float64_t[num_train_labels]; 00181 int32_t* train_lab=new int32_t[num_train_labels]; 00182 00184 int32_t* classes=new int32_t[num_classes]; 00185 00186 SG_INFO( "%d test examples\n", num_lab); 00187 CSignal::clear_cancel(); 00188 00189 for (int32_t i=0; i<num_lab && (!CSignal::cancel_computations()); i++) 00190 { 00191 SG_PROGRESS(i, 0, num_lab); 00192 00193 // lhs idx 1..n and rhs idx i 00194 distances_lhs(dists,0,num_train_labels-1,i); 00195 for (int32_t j=0; j<num_train_labels; j++) 00196 train_lab[j]=train_labels[j]; 00197 00198 //sort the distance vector for test example j to all train examples 00199 //classes[1..k] then holds the classes for minimum distance 00200 CMath::qsort_index(dists, train_lab, num_train_labels); 00201 00202 //compute histogram of class outputs of the first k nearest neighbours 00203 for (int32_t j=0; j<num_classes; j++) 00204 classes[j]=0; 00205 00206 for (int32_t j=0; j<k; j++) 00207 { 00208 classes[train_lab[j]]++; 00209 00210 //choose the class that got 'outputted' most often 00211 int32_t out_idx=0; 00212 int32_t out_max=0; 00213 00214 for (int32_t c=0; c<num_classes; c++) 00215 { 00216 if (out_max< classes[c]) 00217 { 00218 out_idx= c; 00219 out_max= classes[c]; 00220 } 00221 } 00222 output[j*num_lab+i]=out_idx+min_label; 00223 } 00224 } 00225 00226 delete[] dists; 00227 delete[] train_lab; 00228 delete[] classes; 00229 00230 *dst=output; 00231 *k_out=k; 00232 *num_vec=num_lab; 00233 } 00234 00235 bool CKNN::load(FILE* srcfile) 00236 { 00237 SG_SET_LOCALE_C; 00238 SG_RESET_LOCALE; 00239 return false; 00240 } 00241 00242 bool CKNN::save(FILE* dstfile) 00243 { 00244 SG_SET_LOCALE_C; 00245 SG_RESET_LOCALE; 00246 return false; 00247 }