00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "clustering/KMeans.h"
00013 #include "distance/Distance.h"
00014 #include "features/Labels.h"
00015 #include "features/SimpleFeatures.h"
00016 #include "lib/Mathematics.h"
00017 #include "base/Parallel.h"
00018
00019 #ifndef WIN32
00020 #include <pthread.h>
00021 #endif
00022
00023 #define MUSRECALC
00024
00025 #define PAR_THRESH 10
00026
00027 using namespace shogun;
00028
00029 CKMeans::CKMeans()
00030 : CDistanceMachine(), max_iter(10000), k(3), dimensions(0), R(NULL),
00031 mus(NULL), Weights(NULL)
00032 {
00033 }
00034
00035 CKMeans::CKMeans(int32_t k_, CDistance* d)
00036 : CDistanceMachine(), max_iter(10000), k(k_), dimensions(0), R(NULL),
00037 mus(NULL), Weights(NULL)
00038 {
00039 set_distance(d);
00040 }
00041
00042 CKMeans::~CKMeans()
00043 {
00044 delete[] R;
00045 delete[] mus;
00046 }
00047
00048 bool CKMeans::train(CFeatures* data)
00049 {
00050 ASSERT(distance);
00051
00052 if (data)
00053 distance->init(data, data);
00054
00055 ASSERT(distance->get_feature_type()==F_DREAL);
00056
00057 CSimpleFeatures<float64_t>* lhs=(CSimpleFeatures<float64_t>*) distance->get_lhs();
00058 ASSERT(lhs);
00059 int32_t num=lhs->get_num_vectors();
00060 SG_UNREF(lhs);
00061
00062 Weights=new float64_t[num];
00063 for (int32_t i=0; i<num; i++)
00064 Weights[i]=1.0;
00065
00066 clustknb(false, NULL);
00067 delete[] Weights;
00068
00069 return true;
00070 }
00071
00072 bool CKMeans::load(FILE* srcfile)
00073 {
00074 return false;
00075 }
00076
00077 bool CKMeans::save(FILE* dstfile)
00078 {
00079 return false;
00080 }
00081
00082 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00083 struct thread_data
00084 {
00085 float64_t* x;
00086 CSimpleFeatures<float64_t>* y;
00087 float64_t* z;
00088 int32_t n1, n2, m;
00089 int32_t js, je;
00090 int32_t offs;
00091 };
00092 #endif // DOXYGEN_SHOULD_SKIP_THIS
00093
00094 namespace shogun
00095 {
00096 void *sqdist_thread_func(void * P)
00097 {
00098 struct thread_data *TD=(struct thread_data*) P;
00099 float64_t* x=TD->x;
00100 CSimpleFeatures<float64_t>* y=TD->y;
00101 float64_t* z=TD->z;
00102 int32_t n1=TD->n1,
00103 m=TD->m,
00104 js=TD->js,
00105 je=TD->je,
00106 offs=TD->offs,
00107 j,i,k;
00108
00109 for (j=js; j<je; j++)
00110 {
00111 int32_t vlen=0;
00112 bool vfree=false;
00113 float64_t* vec=y->get_feature_vector(j+offs, vlen, vfree);
00114
00115 for (i=0; i<n1; i++)
00116 {
00117 float64_t sum=0;
00118 for (k=0; k<m; k++)
00119 sum = sum + CMath::sq(x[i*m + k] - vec[k]);
00120 z[j*n1 + i] = sum;
00121 }
00122
00123 y->free_feature_vector(vec, j, vfree);
00124 }
00125 return NULL;
00126 }
00127 }
00128
00129 void CKMeans::sqdist(
00130 float64_t* x, CSimpleFeatures<float64_t>* y, float64_t* z, int32_t n1, int32_t offs,
00131 int32_t n2, int32_t m)
00132 {
00133 const int32_t num_threads=parallel->get_num_threads();
00134 int32_t nc, n2_nc = n2/num_threads;
00135 thread_data* TD = new thread_data[num_threads];
00136 pthread_t* tid = new pthread_t[num_threads];
00137 void *status;
00138
00139
00140 TD[0].x=x ; TD[0].y=y ; TD[0].z=z ;
00141 TD[0].n1=n1 ; TD[0].n2=n2 ; TD[0].m=m;
00142 TD[0].offs=offs;
00143
00144 if (n2>PAR_THRESH)
00145 {
00146 ASSERT(PAR_THRESH>1);
00147
00148
00149 for (nc=0; nc<num_threads; nc++)
00150 {
00151 TD[nc]=TD[0];
00152 TD[nc].js=nc*n2_nc;
00153 if (nc+1==num_threads)
00154 TD[nc].je=n2;
00155 else
00156 TD[nc].je=(nc+1)*n2_nc;
00157
00158 pthread_create(&tid[nc], NULL, sqdist_thread_func, (void*)&TD[nc]);
00159 }
00160
00161
00162 for (nc=0; nc<num_threads; nc++)
00163 pthread_join(tid[nc], &status);
00164
00165 }
00166 else
00167 {
00168
00169 TD[0].js=0 ; TD[0].je=n2;
00170 sqdist_thread_func((void *)&TD[0]);
00171 }
00172
00173 delete[] tid;
00174 delete[] TD;
00175 }
00176
00177 void CKMeans::clustknb(bool use_old_mus, float64_t *mus_start)
00178 {
00179 ASSERT(distance && distance->get_feature_type()==F_DREAL);
00180 CSimpleFeatures<float64_t>* lhs = (CSimpleFeatures<float64_t>*) distance->get_lhs();
00181 ASSERT(lhs && lhs->get_num_features()>0 && lhs->get_num_vectors()>0);
00182
00183 int32_t XSize=lhs->get_num_vectors();
00184 dimensions=lhs->get_num_features();
00185 int32_t i, changed=1;
00186 const int32_t XDimk=dimensions*k;
00187 int32_t iter=0;
00188
00189 delete[] R;
00190 R=new float64_t[k];
00191
00192 delete[] mus;
00193 mus=new float64_t[XDimk];
00194
00195 int32_t *ClList = (int32_t*) calloc(XSize, sizeof(int32_t));
00196 float64_t *weights_set = (float64_t*) calloc(k, sizeof(float64_t));
00197 float64_t *dists = (float64_t*) calloc(k*XSize, sizeof(float64_t));
00199 CSimpleFeatures<float64_t>* rhs_mus = new CSimpleFeatures<float64_t>(0);
00200 CFeatures* rhs_cache = distance->replace_rhs(rhs_mus);
00201
00202 int32_t vlen=0;
00203 bool vfree=false;
00204 float64_t* vec=NULL;
00205
00206
00207 for (i=0; i<XSize; i++) ClList[i]=0;
00208
00209 for (i=0; i<k; i++) weights_set[i]=0;
00210
00211
00212 for (i=0; i<XDimk; i++) mus[i]=0;
00213
00214 if (!use_old_mus)
00215 {
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234 for (i=0; i<XSize; i++)
00235 {
00236 const int32_t Cl=CMath::random(0, k-1);
00237 int32_t j;
00238 float64_t weight=Weights[i];
00239
00240 weights_set[Cl]+=weight;
00241 ClList[i]=Cl;
00242
00243 vec=lhs->get_feature_vector(i, vlen, vfree);
00244
00245 for (j=0; j<dimensions; j++)
00246 mus[Cl*dimensions+j] += weight*vec[j];
00247
00248 lhs->free_feature_vector(vec, i, vfree);
00249 }
00250 for (i=0; i<k; i++)
00251 {
00252 int32_t j;
00253
00254 if (weights_set[i]!=0.0)
00255 for (j=0; j<dimensions; j++)
00256 mus[i*dimensions+j] /= weights_set[i];
00257 }
00258 }
00259 else
00260 {
00261 ASSERT(mus_start);
00262
00263
00265 rhs_mus->copy_feature_matrix(mus_start,dimensions,k);
00266 float64_t* p_dists=dists;
00267
00268 for(int32_t idx=0;idx<XSize;idx++,p_dists+=k)
00269 distances_rhs(p_dists,0,k,idx);
00270 p_dists=NULL;
00271
00272 for (i=0; i<XSize; i++)
00273 {
00274 float64_t mini=dists[i*k];
00275 int32_t Cl = 0, j;
00276
00277 for (j=1; j<k; j++)
00278 {
00279 if (dists[i*k+j]<mini)
00280 {
00281 Cl=j;
00282 mini=dists[i*k+j];
00283 }
00284 }
00285 ClList[i]=Cl;
00286 }
00287
00288
00289
00290 for (i=0; i<XSize; i++)
00291 {
00292 const int32_t Cl = ClList[i];
00293 float64_t weight=Weights[i];
00294 weights_set[Cl]+=weight;
00295 #ifndef MUSRECALC
00296 vec=lhs->get_feature_vector(i, vlen, vfree);
00297
00298 for (j=0; j<dimensions; j++)
00299 mus[Cl*dimensions+j] += weight*vec[j];
00300
00301 lhs->free_feature_vector(vec, i, vfree);
00302 #endif
00303 }
00304 #ifndef MUSRECALC
00305
00306 for (i=0; i<k; i++)
00307 {
00308 if (weights_set[i]!=0.0)
00309 {
00310 int32_t j;
00311 for (j=0; j<dimensions; j++)
00312 mus[i*dimensions+j] /= weights_set[i];
00313 }
00314 }
00315 #endif
00316 }
00317
00318
00319
00320 while (changed && (iter<max_iter))
00321 {
00322 iter++;
00323 if (iter==max_iter-1)
00324 SG_WARNING("kmeans clustering changed throughout %d iterations stopping...\n", max_iter-1);
00325
00326 if (iter%1000 == 0)
00327 SG_INFO("Iteration[%d/%d]: Assignment of %i patterns changed.\n", iter, max_iter, changed);
00328 changed=0;
00329
00330 #ifdef MUSRECALC
00331
00332 for (i=0; i<XDimk; i++) mus[i]=0;
00333
00334 for (i=0; i<XSize; i++)
00335 {
00336 int32_t j;
00337 int32_t Cl=ClList[i];
00338 float64_t weight=Weights[i];
00339
00340 vec=lhs->get_feature_vector(i, vlen, vfree);
00341
00342 for (j=0; j<dimensions; j++)
00343 mus[Cl*dimensions+j] += weight*vec[j];
00344
00345 lhs->free_feature_vector(vec, i, vfree);
00346 }
00347 for (i=0; i<k; i++)
00348 {
00349 int32_t j;
00350
00351 if (weights_set[i]!=0.0)
00352 for (j=0; j<dimensions; j++)
00353 mus[i*dimensions+j] /= weights_set[i];
00354 }
00355 #endif
00357 rhs_mus->copy_feature_matrix(mus,dimensions,k);
00358
00359 for (i=0; i<XSize; i++)
00360 {
00361
00362 const int32_t Pat= CMath::random(0, XSize-1);
00363 const int32_t ClList_Pat=ClList[Pat];
00364 int32_t imini, j;
00365 float64_t mini, weight;
00366
00367 weight=Weights[Pat];
00368
00369
00370
00372 for(int32_t idx_k=0;idx_k<k;idx_k++)
00373 dists[idx_k]=distance->distance(Pat,idx_k);
00374
00375
00376 imini=0 ; mini=dists[0];
00377 for (j=1; j<k; j++)
00378 if (dists[j]<mini)
00379 {
00380 mini=dists[j];
00381 imini=j;
00382 }
00383
00384 if (imini!=ClList_Pat)
00385 {
00386 changed= changed + 1;
00387
00388
00389 weights_set[imini]+= weight;
00390
00391 weights_set[ClList_Pat]-= weight;
00392
00393
00394
00395 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00396
00397 for (j=0; j<dimensions; j++)
00398 mus[imini*dimensions+j]-=(vec[j]-mus[imini*dimensions+j])*(weight/weights_set[imini]);
00399
00400 lhs->free_feature_vector(vec, Pat, vfree);
00401
00402
00403
00404 if (weights_set[ClList_Pat]!=0.0)
00405 {
00406
00407 vec=lhs->get_feature_vector(Pat, vlen, vfree);
00408
00409 for (j=0; j<dimensions; j++)
00410 mus[ClList_Pat*dimensions+j]-=(vec[j]-mus[ClList_Pat*dimensions+j])*(weight/weights_set[ClList_Pat]);
00411 lhs->free_feature_vector(vec, Pat, vfree);
00412 }
00413 else
00414
00415 for (j=0; j<dimensions; j++)
00416 mus[ClList_Pat*dimensions+j]=0;
00417
00418
00419 ClList[Pat] = imini;
00420 }
00421 }
00422 }
00423
00424
00425 for (i=0; i<k; i++)
00426 {
00427 float64_t rmin1=0;
00428 float64_t rmin2=0;
00429
00430 bool first_round=true;
00431
00432 for (int32_t j=0; j<k; j++)
00433 {
00434 if (j!=i)
00435 {
00436 int32_t l;
00437 float64_t dist = 0;
00438
00439 for (l=0; l<dimensions; l++)
00440 dist+=CMath::sq(mus[i*dimensions+l]-mus[j*dimensions+l]);
00441
00442 if (first_round)
00443 {
00444 rmin1=dist;
00445 rmin2=dist;
00446 first_round=false;
00447 }
00448 else
00449 {
00450 if ((dist<rmin2) && (dist>=rmin1))
00451 rmin2=dist;
00452
00453 if (dist<rmin1)
00454 {
00455 rmin2=rmin1;
00456 rmin1=dist;
00457 }
00458 }
00459 }
00460 }
00461
00462 R[i]=(0.7*sqrt(rmin1)+0.3*sqrt(rmin2));
00463 }
00464 distance->replace_rhs(rhs_cache);
00465 delete rhs_mus;
00466
00467 free(ClList);
00468 free(weights_set);
00469 free(dists);
00470 SG_UNREF(lhs);
00471 }