SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 2007-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "clustering/KMeans.h" 00013 #include "distance/Distance.h" 00014 #include "features/Labels.h" 00015 #include "features/SimpleFeatures.h" 00016 #include "lib/Mathematics.h" 00017 #include "base/Parallel.h" 00018 00019 #ifndef WIN32 00020 #include <pthread.h> 00021 #endif 00022 00023 #define MUSRECALC 00024 00025 #define PAR_THRESH 10 00026 00027 using namespace shogun; 00028 00029 CKMeans::CKMeans() 00030 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL), 00031 mus(NULL), Weights(NULL) 00032 { 00033 } 00034 00035 CKMeans::CKMeans(int32_t k_, CDistance* d) 00036 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL), 00037 mus(NULL), Weights(NULL) 00038 { 00039 set_distance(d); 00040 } 00041 00042 CKMeans::~CKMeans() 00043 { 00044 delete[] R; 00045 delete[] mus; 00046 } 00047 00048 bool CKMeans::train(CFeatures* data) 00049 { 00050 ASSERT(distance); 00051 00052 if (data) 00053 distance->init(data, data); 00054 00055 ASSERT(distance->get_feature_type()==F_DREAL); 00056 00057 CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs(); 00058 ASSERT(lhs); 00059 int32_t num=lhs->get_num_vectors(); 00060 SG_UNREF(lhs); 00061 00062 Weights=new float64_t[num]; 00063 for (int32_t i=0; i<num; i++) 00064 Weights[i]=1.0; 00065 00066 clustknb(false, NULL); 00067 delete[] Weights; 00068 00069 return true; 00070 } 00071 00072 bool CKMeans::load(FILE* srcfile) 00073 { 00074 SG_SET_LOCALE_C; 00075 SG_RESET_LOCALE; 00076 return false; 00077 } 00078 00079 bool CKMeans::save(FILE* dstfile) 00080 { 00081 SG_SET_LOCALE_C; 00082 SG_RESET_LOCALE; 00083 return false; 00084 } 00085 00086 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00087 struct thread_data 00088 { 00089 float64_t* x; 00090 CSimpleFeatures<float64_t>* y; 00091 float64_t* z; 00092 int32_t n1, n2, m; 00093 int32_t js, je; /* defines the matrix stripe */ 00094 int32_t offs; 00095 }; 00096 #endif // DOXYGEN_SHOULD_SKIP_THIS 00097 00098 namespace shogun 00099 { 00100 void *sqdist_thread_func(void * P) 00101 { 00102 struct thread_data *TD=(struct thread_data*) P; 00103 float64_t* x=TD->x; 00104 CSimpleFeatures<float64_t>* y=TD->y; 00105 float64_t* z=TD->z; 00106 int32_t n1=TD->n1, 00107 m=TD->m, 00108 js=TD->js, 00109 je=TD->je, 00110 offs=TD->offs, 00111 j,i,k; 00112 00113 for (j=js; j<je; j++) 00114 { 00115 int32_t vlen=0; 00116 bool vfree=false; 00117 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree); 00118 00119 for (i=0; i<n1; i++) 00120 { 00121 float64_t sum=0; 00122 for (k=0; k<m; k++) 00123 sum = sum + CMath::sq(x[i*m + k] - vec[k]); 00124 z[j*n1 + i] = sum; 00125 } 00126 00127 y->free_feature_vector(vec, j, vfree); 00128 } 00129 return NULL; 00130 } 00131 } 00132 00133 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start) 00134 { 00135 ASSERT(distance && distance->get_feature_type()==F_DREAL); 00136 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs(); 00137 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0); 00138 00139 int32_t XSize=lhs->get_num_vectors(); 00140 dimensions=lhs->get_num_features(); 00141 int32_t i, changed=1; 00142 const int32_t XDimk=dimensions*k; 00143 int32_t iter=0; 00144 00145 delete[] R; 00146 R=new float64_t[k]; 00147 00148 delete[] mus; 00149 mus=new float64_t[XDimk]; 00150 00151 int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t)); 00152 float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t)); 00153 float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t)); 00154 00156 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0); 00157 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus); 00158 00159 int32_t vlen=0; 00160 bool vfree=false; 00161 float64_t* vec=NULL; 00162 00163 /* ClList=zeros(XSize,1) ; */ 00164 for (i=0; i<XSize; i++) ClList[i]=0; 00165 /* weights_set=zeros(k,1) ; */ 00166 for (i=0; i<k; i++) weights_set[i]=0; 00167 00168 /* mus=zeros(dimensions, k) ; */ 00169 for (i=0; i<XDimk; i++) mus[i]=0; 00170 00171 if (!use_old_mus) 00172 { 00173 /* random clustering (select random subsets) */ 00174 /* ks=ceil(rand(1,XSize)*k); 00175 * for i=1:k, 00176 * actks= (ks==i); 00177 * c=sum(actks); 00178 * weights_set(i)=c; 00179 * 00180 * ClList(actks)=i*ones(1, c); 00181 * 00182 * if ~mus_recalc, 00183 * if c>1 00184 * mus(:,i) = sum(XData(:,actks)')'/c; 00185 * elseif c>0 00186 * mus(:,i) = XData(:,actks); 00187 * end; 00188 * end; 00189 * end ; */ 00190 00191 for (i=0; i<XSize; i++) 00192 { 00193 const int32_t Cl=CMath::random(0, k-1); 00194 int32_t j; 00195 float64_t weight=Weights[i]; 00196 00197 weights_set[Cl]+=weight; 00198 ClList[i]=Cl; 00199 00200 vec=lhs->get_feature_vector(i, vlen, vfree); 00201 00202 for (j=0; j<dimensions; j++) 00203 mus[Cl*dimensions+j] += weight*vec[j]; 00204 00205 lhs->free_feature_vector(vec, i, vfree); 00206 } 00207 for (i=0; i<k; i++) 00208 { 00209 int32_t j; 00210 00211 if (weights_set[i]!=0.0) 00212 for (j=0; j<dimensions; j++) 00213 mus[i*dimensions+j] /= weights_set[i]; 00214 } 00215 } 00216 else 00217 { 00218 ASSERT(mus_start); 00219 00221 rhs_mus->copy_feature_matrix(mus_start,dimensions,k); 00222 float64_t* p_dists=dists; 00223 00224 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k) 00225 distances_rhs(p_dists,0,k,idx); 00226 p_dists=NULL; 00227 00228 for (i=0; i<XSize; i++) 00229 { 00230 float64_t mini=dists[i*k]; 00231 int32_t Cl = 0, j; 00232 00233 for (j=1; j<k; j++) 00234 { 00235 if (dists[i*k+j]<mini) 00236 { 00237 Cl=j; 00238 mini=dists[i*k+j]; 00239 } 00240 } 00241 ClList[i]=Cl; 00242 } 00243 00244 /* Compute the sum of all points belonging to a cluster 00245 * and count the points */ 00246 for (i=0; i<XSize; i++) 00247 { 00248 const int32_t Cl = ClList[i]; 00249 float64_t weight=Weights[i]; 00250 weights_set[Cl]+=weight; 00251 #ifndef MUSRECALC 00252 vec=lhs->get_feature_vector(i, vlen, vfree); 00253 00254 for (j=0; j<dimensions; j++) 00255 mus[Cl*dimensions+j] += weight*vec[j]; 00256 00257 lhs->free_feature_vector(vec, i, vfree); 00258 #endif 00259 } 00260 #ifndef MUSRECALC 00261 /* normalization to get the mean */ 00262 for (i=0; i<k; i++) 00263 { 00264 if (weights_set[i]!=0.0) 00265 { 00266 int32_t j; 00267 for (j=0; j<dimensions; j++) 00268 mus[i*dimensions+j] /= weights_set[i]; 00269 } 00270 } 00271 #endif 00272 } 00273 00274 00275 00276 while (changed && (iter<max_iter)) 00277 { 00278 iter++; 00279 if (iter==max_iter-1) 00280 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1); 00281 00282 if (iter%1000 == 0) 00283 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed); 00284 changed=0; 00285 00286 #ifdef MUSRECALC 00287 /* mus=zeros(dimensions, k) ; */ 00288 for (i=0; i<XDimk; i++) mus[i]=0; 00289 00290 for (i=0; i<XSize; i++) 00291 { 00292 int32_t j; 00293 int32_t Cl=ClList[i]; 00294 float64_t weight=Weights[i]; 00295 00296 vec=lhs->get_feature_vector(i, vlen, vfree); 00297 00298 for (j=0; j<dimensions; j++) 00299 mus[Cl*dimensions+j] += weight*vec[j]; 00300 00301 lhs->free_feature_vector(vec, i, vfree); 00302 } 00303 for (i=0; i<k; i++) 00304 { 00305 int32_t j; 00306 00307 if (weights_set[i]!=0.0) 00308 for (j=0; j<dimensions; j++) 00309 mus[i*dimensions+j] /= weights_set[i]; 00310 } 00311 #endif 00312 00313 rhs_mus->copy_feature_matrix(mus,dimensions,k); 00314 00315 for (i=0; i<XSize; i++) 00316 { 00317 /* ks=ceil(rand(1,XSize)*XSize) ; */ 00318 const int32_t Pat= CMath::random(0, XSize-1); 00319 const int32_t ClList_Pat=ClList[Pat]; 00320 int32_t imini, j; 00321 float64_t mini, weight; 00322 00323 weight=Weights[Pat]; 00324 00325 /* compute the distance of this point to all centers */ 00326 for(int32_t idx_k=0;idx_k<k;idx_k++) 00327 dists[idx_k]=distance->distance(Pat,idx_k); 00328 00329 /* [mini,imini]=min(dists(:,i)) ; */ 00330 imini=0 ; mini=dists[0]; 00331 for (j=1; j<k; j++) 00332 if (dists[j]<mini) 00333 { 00334 mini=dists[j]; 00335 imini=j; 00336 } 00337 00338 if (imini!=ClList_Pat) 00339 { 00340 changed= changed + 1; 00341 00342 /* weights_set(imini) = weights_set(imini) + weight ; */ 00343 weights_set[imini]+= weight; 00344 /* weights_set(j) = weights_set(j) - weight ; */ 00345 weights_set[ClList_Pat]-= weight; 00346 00347 /* mu_new=mu_old + (x - mu_old)/(n+1) */ 00348 /* mus(:,imini)=mus(:,imini) + (XData(:,i) - mus(:,imini)) * (weight / weights_set(imini)) ; */ 00349 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00350 00351 for (j=0; j<dimensions; j++) 00352 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]); 00353 00354 lhs->free_feature_vector(vec, Pat, vfree); 00355 00356 /* mu_new = mu_old - (x - mu_old)/(n-1) */ 00357 /* if weights_set(j)~=0 */ 00358 if (weights_set[ClList_Pat]!=0.0) 00359 { 00360 /* mus(:,j)=mus(:,j) - (XData(:,i) - mus(:,j)) * (weight/weights_set(j)) ; */ 00361 vec=lhs->get_feature_vector(Pat, vlen, vfree); 00362 00363 for (j=0; j<dimensions; j++) 00364 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]); 00365 lhs->free_feature_vector(vec, Pat, vfree); 00366 } 00367 else 00368 /* mus(:,j)=zeros(dimensions,1) ; */ 00369 for (j=0; j<dimensions; j++) 00370 mus[ClList_Pat*dimensions+j]=0; 00371 00372 /* ClList(i)= imini ; */ 00373 ClList[Pat] = imini; 00374 } 00375 } 00376 } 00377 00378 /* compute the ,,variances'' of the clusters */ 00379 for (i=0; i<k; i++) 00380 { 00381 float64_t rmin1=0; 00382 float64_t rmin2=0; 00383 00384 bool first_round=true; 00385 00386 for (int32_t j=0; j<k; j++) 00387 { 00388 if (j!=i) 00389 { 00390 int32_t l; 00391 float64_t dist = 0; 00392 00393 for (l=0; l<dimensions; l++) 00394 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]); 00395 00396 if (first_round) 00397 { 00398 rmin1=dist; 00399 rmin2=dist; 00400 first_round=false; 00401 } 00402 else 00403 { 00404 if ((dist<rmin2) && (dist>=rmin1)) 00405 rmin2=dist; 00406 00407 if (dist<rmin1) 00408 { 00409 rmin2=rmin1; 00410 rmin1=dist; 00411 } 00412 } 00413 } 00414 } 00415 00416 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2)); 00417 } 00418 distance->replace_rhs(rhs_cache); 00419 delete rhs_mus; 00420 00421 free(ClList); 00422 free(weights_set); 00423 free(dists); 00424 SG_UNREF(lhs); 00425 }