/* Copyright 2006 Vikas Sindhwani (vikass@cs.uchicago.edu)
   SVM-lin: Fast SVM Solvers for Supervised and Semi-supervised Learning

   This file is part of SVM-lin.

   SVM-lin is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   SVM-lin is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with SVM-lin (see gpl.txt); if not, write to the Free Software
   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <algorithm>

#include "lib/io.h"
#include "lib/Mathematics.h"
#include "features/SparseFeatures.h"
#include "classifier/svm/ssl.h"

namespace shogun
{
void ssl_train(struct data *Data,
		struct options *Options,
		struct vector_double *Weights,
		struct vector_double *Outputs)
{
	// initialize
	initialize(Weights, Data->n, 0.0);
	initialize(Outputs, Data->m, 0.0);
	vector_int *Subset = new vector_int[1];
	initialize(Subset, Data->m);
	// dispatch to the requested algorithm
	int32_t optimality = 0;
	switch (Options->algo)
	{
		case -1:
			SG_SINFO("Regularized Least Squares Regression (CGLS)\n");
			optimality = CGLS(Data, Options, Subset, Weights, Outputs);
			break;
		case RLS:
			SG_SINFO("Regularized Least Squares Classification (CGLS)\n");
			optimality = CGLS(Data, Options, Subset, Weights, Outputs);
			break;
		case SVM:
			SG_SINFO("Modified Finite Newton L2-SVM (L2-SVM-MFN)\n");
			optimality = L2_SVM_MFN(Data, Options, Weights, Outputs, 0);
			break;
		case TSVM:
			SG_SINFO("Transductive L2-SVM (TSVM)\n");
			optimality = TSVM_MFN(Data, Options, Weights, Outputs);
			break;
		case DA_SVM:
			SG_SINFO("Deterministic Annealing Semi-supervised L2-SVM (DAS3VM)\n");
			optimality = DA_S3VM(Data, Options, Weights, Outputs);
			break;
		default:
			SG_SERROR("Algorithm unspecified\n");
	}

	delete[] Subset->vec;
	delete[] Subset;
}
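/* Usage sketch (illustrative only -- `feats', `labels', `costs' and the
 * numeric settings below are hypothetical placeholders, not part of this
 * file; see ssl.h for the authoritative field definitions):
 *
 *   struct data D;
 *   D.features = feats;           // CDotFeatures* (last dim holds the bias)
 *   D.m = num_examples;           // total number of examples
 *   D.n = num_features + 1;       // dimensionality + 1 bias dimension
 *   D.Y = labels;                 // +1/-1; 0 marks an unlabeled example
 *   D.C = costs;                  // per-example cost, length m
 *
 *   struct options Opts;
 *   Opts.algo = SVM;              // or RLS, TSVM, DA_SVM
 *   Opts.lambda = 1.0;
 *   Opts.bias = 1.0;
 *
 *   struct vector_double W, O;    // allocated and filled in by ssl_train
 *   ssl_train(&D, &Opts, &W, &O); // W.vec: weights, O.vec: outputs
 */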
/* CGLS: conjugate gradient solver for the regularized least squares
   problem restricted to the examples indexed by Subset. Iterates until the
   residual test omega1 < epsilon^2 * omega_z passes or cgitermax is
   reached; returns 1 on optimality, 0 otherwise. */
int32_t CGLS(
	const struct data *Data, const struct options *Options,
	const struct vector_int *Subset, struct vector_double *Weights,
	struct vector_double *Outputs)
{
	SG_SDEBUG("CGLS starting...");

	/* Disassemble the structures */
	int32_t active = Subset->d;
	int32_t *J = Subset->vec;
	CDotFeatures* features = Data->features;
	float64_t *Y = Data->Y;
	float64_t *C = Data->C;
	int32_t n = Data->n;
	float64_t lambda = Options->lambda;
	int32_t cgitermax = Options->cgitermax;
	float64_t epsilon = Options->epsilon;
	float64_t *beta = Weights->vec;
	float64_t *o = Outputs->vec;
	// initialize z
	float64_t *z = new float64_t[active];
	float64_t *q = new float64_t[active];
	int32_t ii = 0;
	for (int32_t i = active; i--;)
	{
		ii = J[i];
		z[i] = C[ii]*(Y[ii] - o[ii]);
	}
	float64_t *r = new float64_t[n];
	for (int32_t i = n; i--;)
		r[i] = 0.0;
	for (int32_t j = 0; j < active; j++)
	{
		features->add_to_dense_vec(z[j], J[j], r, n-1);
		r[n-1] += Options->bias*z[j]; // bias (modelled as last dim)
	}
	float64_t *p = new float64_t[n];
	float64_t omega1 = 0.0;
	for (int32_t i = n; i--;)
	{
		r[i] -= lambda*beta[i];
		p[i] = r[i];
		omega1 += r[i]*r[i];
	}
	float64_t omega_p = omega1;
	float64_t omega_q = 0.0;
	float64_t inv_omega2 = 1/omega1;
	float64_t scale = 0.0;
	float64_t omega_z = 0.0;
	float64_t gamma = 0.0;
	int32_t cgiter = 0;
	int32_t optimality = 0;
	float64_t epsilon2 = epsilon*epsilon;
	// iterate
	while (cgiter < cgitermax)
	{
		cgiter++;
		omega_q = 0.0;
		float64_t t = 0.0;
		int32_t i, j;
		// #pragma omp parallel for private(i,j)
		for (i = 0; i < active; i++)
		{
			ii = J[i];
			t = features->dense_dot(ii, p, n-1);
			t += Options->bias*p[n-1]; // bias (modelled as last dim)
			q[i] = t;
			omega_q += C[ii]*t*t;
		}
		gamma = omega1/(lambda*omega_p + omega_q);
		inv_omega2 = 1/omega1;
		for (i = n; i--;)
		{
			r[i] = 0.0;
			beta[i] += gamma*p[i];
		}
		omega_z = 0.0;
		for (i = active; i--;)
		{
			ii = J[i];
			o[ii] += gamma*q[i];
			z[i] -= gamma*C[ii]*q[i];
			omega_z += z[i]*z[i];
		}
		for (j = 0; j < active; j++)
		{
			t = z[j];
			features->add_to_dense_vec(t, J[j], r, n-1);
			r[n-1] += Options->bias*t; // bias (modelled as last dim)
		}
		omega1 = 0.0;
		for (i = n; i--;)
		{
			r[i] -= lambda*beta[i];
			omega1 += r[i]*r[i];
		}
		if (omega1 < epsilon2*omega_z)
		{
			optimality = 1;
			break;
		}
		omega_p = 0.0;
		scale = omega1*inv_omega2;
		for (i = n; i--;)
		{
			p[i] = r[i] + p[i]*scale;
			omega_p += p[i]*p[i];
		}
	}
	SG_SDEBUG("...Done.");
	SG_SINFO("CGLS converged in %d iteration(s)", cgiter);

	delete[] z;
	delete[] q;
	delete[] r;
	delete[] p;
	return optimality;
}
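/* L2_SVM_MFN: modified finite Newton method for the L2-SVM. Each outer
 * iteration approximates the objective by a regularized least squares
 * problem over the current active set (examples with nonzero loss), solves
 * it with CGLS, performs an exact line search, and updates the active set.
 * `ini == 0` requests the warm-start heuristic (SMALL_CGITERMAX inner
 * iterations at tolerance BIG_EPSILON). Returns 1 on optimality, 2 on the
 * relative objective-decrease test, 0 if MFNITERMAX is exceeded. */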
int32_t L2_SVM_MFN(
	const struct data *Data, struct options *Options,
	struct vector_double *Weights, struct vector_double *Outputs,
	int32_t ini)
{
	/* Disassemble the structures */
	CDotFeatures* features = Data->features;
	float64_t *Y = Data->Y;
	float64_t *C = Data->C;
	int32_t n = Data->n;
	int32_t m = Data->m;
	float64_t lambda = Options->lambda;
	float64_t epsilon;
	float64_t *w = Weights->vec;
	float64_t *o = Outputs->vec;
	float64_t F_old = 0.0;
	float64_t F = 0.0;
	float64_t diff = 0.0;
	vector_int *ActiveSubset = new vector_int[1];
	ActiveSubset->vec = new int32_t[m];
	ActiveSubset->d = m;
	// initialize
	if (ini == 0)
	{
		epsilon = BIG_EPSILON;
		Options->cgitermax = SMALL_CGITERMAX;
		Options->epsilon = BIG_EPSILON;
	}
	else
		epsilon = Options->epsilon;
	for (int32_t i = 0; i < n; i++)
		F += w[i]*w[i];
	F = 0.5*lambda*F;
	int32_t active = 0;
	int32_t inactive = m-1; // l-1
	for (int32_t i = 0; i < m; i++)
	{
		diff = 1 - Y[i]*o[i];
		if (diff > 0)
		{
			ActiveSubset->vec[active] = i;
			active++;
			F += 0.5*C[i]*diff*diff;
		}
		else
		{
			ActiveSubset->vec[inactive] = i;
			inactive--;
		}
	}
	ActiveSubset->d = active;
	int32_t iter = 0;
	int32_t opt = 0;
	int32_t opt2 = 0;
	vector_double *Weights_bar = new vector_double[1];
	vector_double *Outputs_bar = new vector_double[1];
	float64_t *w_bar = new float64_t[n];
	float64_t *o_bar = new float64_t[m];
	Weights_bar->vec = w_bar;
	Outputs_bar->vec = o_bar;
	Weights_bar->d = n;
	Outputs_bar->d = m;
	float64_t delta = 0.0;
	float64_t t = 0.0;
	int32_t ii = 0;
	while (iter < MFNITERMAX)
	{
		iter++;
		SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)\n", iter, active, F);
		for (int32_t i = n; i--;)
			w_bar[i] = w[i];
		for (int32_t i = m; i--;)
			o_bar[i] = o[i];

		opt = CGLS(Data, Options, ActiveSubset, Weights_bar, Outputs_bar);
		for (int32_t i = active; i < m; i++)
		{
			ii = ActiveSubset->vec[i];
			t = features->dense_dot(ii, w_bar, n-1);
			t += Options->bias*w_bar[n-1]; // bias (modelled as last dim)
			o_bar[ii] = t;
		}
		if (ini == 0)
		{
			Options->cgitermax = CGITERMAX;
			ini = 1;
		}
		opt2 = 1;
		for (int32_t i = 0; i < m; i++)
		{
			ii = ActiveSubset->vec[i];
			if (i < active)
				opt2 = (opt2 && (Y[ii]*o_bar[ii] <= 1 + epsilon));
			else
				opt2 = (opt2 && (Y[ii]*o_bar[ii] >= 1 - epsilon));
			if (opt2 == 0)
				break;
		}
		if (opt && opt2) // l
		{
			if (epsilon == BIG_EPSILON)
			{
				epsilon = EPSILON;
				Options->epsilon = EPSILON;
				SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f", BIG_EPSILON, EPSILON);
				continue;
			}
			else
			{
				for (int32_t i = n; i--;)
					w[i] = w_bar[i];
				for (int32_t i = m; i--;)
					o[i] = o_bar[i];
				delete[] ActiveSubset->vec;
				delete[] ActiveSubset;
				delete[] o_bar;
				delete[] w_bar;
				delete[] Weights_bar;
				delete[] Outputs_bar;
				SG_SINFO("L2_SVM_MFN converged (optimality) in %d iteration(s)", iter);
				return 1;
			}
		}
		delta = line_search(w, w_bar, lambda, o, o_bar, Y, C, n, m);
		SG_SDEBUG("LINE_SEARCH delta = %f\n", delta);
		F_old = F;
		F = 0.0;
		for (int32_t i = n; i--;)
		{
			w[i] += delta*(w_bar[i] - w[i]);
			F += w[i]*w[i];
		}
		F = 0.5*lambda*F;
		active = 0;
		inactive = m-1;
		for (int32_t i = 0; i < m; i++)
		{
			o[i] += delta*(o_bar[i] - o[i]);
			diff = 1 - Y[i]*o[i];
			if (diff > 0)
			{
				ActiveSubset->vec[active] = i;
				active++;
				F += 0.5*C[i]*diff*diff;
			}
			else
			{
				ActiveSubset->vec[inactive] = i;
				inactive--;
			}
		}
		ActiveSubset->d = active;
		if (CMath::abs(F - F_old) < RELATIVE_STOP_EPS*CMath::abs(F_old))
		{
			// release the scratch buffers before returning
			delete[] ActiveSubset->vec;
			delete[] ActiveSubset;
			delete[] o_bar;
			delete[] w_bar;
			delete[] Weights_bar;
			delete[] Outputs_bar;
			SG_SINFO("L2_SVM_MFN converged (rel. criterion) in %d iterations", iter);
			return 2;
		}
	}
	delete[] ActiveSubset->vec;
	delete[] ActiveSubset;
	delete[] o_bar;
	delete[] w_bar;
	delete[] Weights_bar;
	delete[] Outputs_bar;
	SG_SINFO("L2_SVM_MFN converged (max iter exceeded) in %d iterations", iter);
	return 0;
}
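/* line_search: exact minimization of the piecewise-quadratic L2-SVM
 * objective along the segment from (w, o) to (w_bar, o_bar). The directional
 * derivative is piecewise linear in the step delta; its breakpoints (where
 * examples enter or leave the loss) are sorted and scanned until the
 * derivative L + delta*(R-L) changes sign. Returns the optimal step. */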
float64_t line_search(float64_t *w,
		float64_t *w_bar,
		float64_t lambda,
		float64_t *o,
		float64_t *o_bar,
		float64_t *Y,
		float64_t *C,
		int32_t d, /* data dimensionality -- 'n' */
		int32_t l) /* number of examples */
{
	float64_t omegaL = 0.0;
	float64_t omegaR = 0.0;
	float64_t diff = 0.0;
	for (int32_t i = d; i--;)
	{
		diff = w_bar[i] - w[i];
		omegaL += w[i]*diff;
		omegaR += w_bar[i]*diff;
	}
	omegaL = lambda*omegaL;
	omegaR = lambda*omegaR;
	float64_t L = 0.0;
	float64_t R = 0.0;
	int32_t ii = 0;
	for (int32_t i = 0; i < l; i++)
	{
		if (Y[i]*o[i] < 1)
		{
			diff = C[i]*(o_bar[i] - o[i]);
			L += (o[i] - Y[i])*diff;
			R += (o_bar[i] - Y[i])*diff;
		}
	}
	L += omegaL;
	R += omegaR;
	Delta* deltas = new Delta[l];
	int32_t p = 0;
	for (int32_t i = 0; i < l; i++)
	{
		diff = Y[i]*(o_bar[i] - o[i]);
		if (Y[i]*o[i] < 1)
		{
			if (diff > 0)
			{
				deltas[p].delta = (1 - Y[i]*o[i])/diff;
				deltas[p].index = i;
				deltas[p].s = -1;
				p++;
			}
		}
		else
		{
			if (diff < 0)
			{
				deltas[p].delta = (1 - Y[i]*o[i])/diff;
				deltas[p].index = i;
				deltas[p].s = 1;
				p++;
			}
		}
	}
	std::sort(deltas, deltas + p);
	float64_t delta_prime = 0.0;
	for (int32_t i = 0; i < p; i++)
	{
		delta_prime = L + deltas[i].delta*(R - L);
		if (delta_prime >= 0)
			break;
		ii = deltas[i].index;
		diff = (deltas[i].s)*C[ii]*(o_bar[ii] - o[ii]);
		L += diff*(o[ii] - Y[ii]);
		R += diff*(o_bar[ii] - Y[ii]);
	}
	delete[] deltas;
	return (-L/(R - L));
}
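/* TSVM_MFN: transductive L2-SVM. A supervised model is first trained on the
 * labeled examples only; the fraction R of unlabeled examples with the
 * largest outputs is then labeled positive and the rest negative. The
 * unlabeled-cost weight lambda_0 is annealed up to lambda_u (by a factor of
 * TSVM_ANNEALING_RATE per round), and at each level pairs of opposite labels
 * are swapped (switch_labels) until no candidate pair remains. Returns the
 * total number of label switches. */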
int32_t TSVM_MFN(
	const struct data *Data, struct options *Options,
	struct vector_double *Weights, struct vector_double *Outputs)
{
	/* Setup labeled-only examples and train L2_SVM_MFN */
	struct data *Data_Labeled = new data[1];
	struct vector_double *Outputs_Labeled = new vector_double[1];
	initialize(Outputs_Labeled, Data->l, 0.0);
	SG_SDEBUG("Initializing weights, unknown labels");
	GetLabeledData(Data_Labeled, Data); /* gets labeled data and sets C=1/l */
	L2_SVM_MFN(Data_Labeled, Options, Weights, Outputs_Labeled, 0);

	/* Use this weight vector to classify R*u unlabeled examples as positive */
	int32_t p = 0, q = 0;
	float64_t t = 0.0;
	int32_t *JU = new int32_t[Data->u];
	float64_t *ou = new float64_t[Data->u];
	float64_t lambda_0 = TSVM_LAMBDA_SMALL;
	for (int32_t i = 0; i < Data->m; i++)
	{
		if (Data->Y[i] == 0.0)
		{
			t = Data->features->dense_dot(i, Weights->vec, Data->n-1);
			t += Options->bias*Weights->vec[Data->n-1]; // bias (modelled as last dim)
			Outputs->vec[i] = t;
			Data->C[i] = lambda_0*1.0/Data->u;
			JU[q] = i;
			ou[q] = t;
			q++;
		}
		else
		{
			Outputs->vec[i] = Outputs_Labeled->vec[p];
			Data->C[i] = 1.0/Data->l;
			p++;
		}
	}
	// threshold at the (1-R)-quantile of the unlabeled outputs
	int32_t k = int32_t((1 - Options->R)*Data->u - 1);
	std::nth_element(ou, ou + k, ou + Data->u);
	float64_t thresh = ou[k];
	delete[] ou;
	for (int32_t i = 0; i < Data->u; i++)
	{
		if (Outputs->vec[JU[i]] > thresh)
			Data->Y[JU[i]] = 1.0;
		else
			Data->Y[JU[i]] = -1.0;
	}
	for (int32_t i = 0; i < Data->n; i++)
		Weights->vec[i] = 0.0;
	for (int32_t i = 0; i < Data->m; i++)
		Outputs->vec[i] = 0.0;
	L2_SVM_MFN(Data, Options, Weights, Outputs, 0);
	int32_t num_switches = 0;
	int32_t s = 0;
	int32_t last_round = 0;
	while (lambda_0 <= Options->lambda_u)
	{
		int32_t iter2 = 0;
		while (1)
		{
			s = switch_labels(Data->Y, Outputs->vec, JU, Data->u, Options->S);
			if (s == 0)
				break;
			iter2++;
			SG_SDEBUG("****** lambda_0 = %f iteration = %d ************************************\n", lambda_0, iter2);
			SG_SDEBUG("Optimizing unknown labels. switched %d labels.\n", s);
			num_switches += s;
			SG_SDEBUG("Optimizing weights\n");
			L2_SVM_MFN(Data, Options, Weights, Outputs, 1);
		}
		if (last_round == 1)
			break;
		lambda_0 = TSVM_ANNEALING_RATE*lambda_0;
		if (lambda_0 >= Options->lambda_u)
		{
			lambda_0 = Options->lambda_u;
			last_round = 1;
		}
		for (int32_t i = 0; i < Data->u; i++)
			Data->C[JU[i]] = lambda_0*1.0/Data->u;
		SG_SDEBUG("****** lambda0 increased to %f%% of lambda_u = %f ************************\n", lambda_0*100/Options->lambda_u, Options->lambda_u);
		SG_SDEBUG("Optimizing weights\n");
		L2_SVM_MFN(Data, Options, Weights, Outputs, 1);
	}
	SG_SDEBUG("Total Number of Switches = %d\n", num_switches);
	/* reset labels */
	for (int32_t i = 0; i < Data->u; i++)
		Data->Y[JU[i]] = 0.0;
	float64_t F = transductive_cost(norm_square(Weights), Data->Y, Outputs->vec, Outputs->d, Options->lambda, Options->lambda_u);
	SG_SDEBUG("Objective Value = %f\n", F);
	delete[] JU;
	delete[] Outputs_Labeled->vec;
	delete[] Outputs_Labeled;
	delete[] Data_Labeled;
	return num_switches;
}

int32_t switch_labels(float64_t* Y, float64_t* o, int32_t* JU, int32_t u, int32_t S)
{
	int32_t npos = 0;
	int32_t nneg = 0;
	for (int32_t i = 0; i < u; i++)
	{
		if ((Y[JU[i]] > 0) && (o[JU[i]] < 1.0)) npos++;
		if ((Y[JU[i]] < 0) && (-o[JU[i]] < 1.0)) nneg++;
	}
	Delta* positive = new Delta[npos];
	Delta* negative = new Delta[nneg];
	int32_t p = 0;
	int32_t n = 0;
	int32_t ii = 0;
	for (int32_t i = 0; i < u; i++)
	{
		ii = JU[i];
		if ((Y[ii] > 0.0) && (o[ii] < 1.0))
		{
			positive[p].delta = o[ii];
			positive[p].index = ii;
			positive[p].s = 0;
			p++;
		}
		if ((Y[ii] < 0.0) && (-o[ii] < 1.0))
		{
			negative[n].delta = -o[ii];
			negative[n].index = ii;
			negative[n].s = 0;
			n++;
		}
	}
	std::sort(positive, positive + npos);
	std::sort(negative, negative + nneg);
	int32_t s = -1;
	while (1)
	{
		s++;
		// check the bounds before indexing into positive/negative
		if ((s >= S) || (s >= npos) || (s >= nneg) ||
			(positive[s].delta >= -negative[s].delta))
			break;
		Y[positive[s].index] = -1.0;
		Y[negative[s].index] = 1.0;
	}
	delete[] positive;
	delete[] negative;
	return s;
}
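/* DA_S3VM: deterministic annealing semi-supervised SVM. Maintains a
 * probability p[i] that each unlabeled example is positive, alternating
 * between optimize_w (weighted L2-SVM-MFN on the soft-labeled data) and
 * optimize_p (closed-form update of p at temperature T) until the KL
 * divergence between successive p vectors falls below epsilon. T is divided
 * by DA_ANNEALING_RATE after each outer iteration, until the entropy of p
 * drops below epsilon. The best (weights, outputs) pair seen, measured by
 * transductive cost, is returned. */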
weights, p"); 00584 for (int32_t i=0;i<Data->u; i++) 00585 p[i] = Options->R; 00586 /* record which examples are unlabeled */ 00587 int32_t *JU = new int32_t[Data->u]; 00588 int32_t j=0; 00589 for(int32_t i=0;i<Data->m;i++) 00590 { 00591 if(Data->Y[i]==0.0) 00592 {JU[j]=i;j++;} 00593 } 00594 float64_t H = entropy(p,Data->u); 00595 optimize_w(Data,p,Options,Weights,Outputs,0); 00596 F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u); 00597 F_min = F; 00598 for (int32_t i=0;i<Weights->d;i++) 00599 w_min[i]=w[i]; 00600 for (int32_t i=0;i<Outputs->d;i++) 00601 o_min[i]=o[i]; 00602 while((iter1 < DA_OUTER_ITERMAX) && (H > Options->epsilon)) 00603 { 00604 iter1++; 00605 iter2=0; 00606 kl_divergence=1.0; 00607 while((iter2 < DA_INNER_ITERMAX) && (kl_divergence > Options->epsilon)) 00608 { 00609 iter2++; 00610 for (int32_t i=0;i<Data->u;i++) 00611 { 00612 q[i]=p[i]; 00613 g[i] = Options->lambda_u*((o[JU[i]] > 1 ? 0 : (1 - o[JU[i]])*(1 - o[JU[i]])) - (o[JU[i]]< -1 ? 0 : (1 + o[JU[i]])*(1 + o[JU[i]]))); 00614 } 00615 SG_SDEBUG("Optimizing p.\n"); 00616 optimize_p(g,Data->u,T,Options->R,p); 00617 kl_divergence=KL(p,q,Data->u); 00618 SG_SDEBUG("Optimizing weights\n"); 00619 optimize_w(Data,p,Options,Weights,Outputs,1); 00620 F = transductive_cost(norm_square(Weights),Data->Y,Outputs->vec,Outputs->d,Options->lambda,Options->lambda_u); 00621 if(F < F_min) 00622 { 00623 F_min = F; 00624 for (int32_t i=0;i<Weights->d;i++) 00625 w_min[i]=w[i]; 00626 for (int32_t i=0;i<Outputs->d;i++) 00627 o_min[i]=o[i]; 00628 } 00629 SG_SDEBUG("***** outer_iter = %d T = %g inner_iter = %d kl = %g cost = %g *****\n",iter1,T,iter2,kl_divergence,F); 00630 } 00631 H = entropy(p,Data->u); 00632 SG_SDEBUG("***** Finished outer_iter = %d T = %g Entropy = %g ***\n", iter1,T,H); 00633 T = T/DA_ANNEALING_RATE; 00634 } 00635 for (int32_t i=0;i<Weights->d;i++) 00636 w[i]=w_min[i]; 00637 for (int32_t i=0;i<Outputs->d;i++) 00638 o[i]=o_min[i]; 00639 /* may want to reset the original Y */ 00640 delete [] p; 00641 delete [] q; 00642 delete [] g; 00643 delete [] JU; 00644 delete [] w_min; 00645 delete [] o_min; 00646 SG_SINFO("(min) Objective Value = %f", F_min); 00647 return 1; 00648 } 00649 00650 int32_t optimize_w( 00651 const struct data *Data, const float64_t *p, struct options *Options, 00652 struct vector_double *Weights, struct vector_double *Outputs, int32_t ini) 00653 { 00654 int32_t i,j; 00655 CDotFeatures* features=Data->features; 00656 int32_t n = Data->n; 00657 int32_t m = Data->m; 00658 int32_t u = Data->u; 00659 float64_t lambda = Options->lambda; 00660 float64_t epsilon; 00661 float64_t *w = Weights->vec; 00662 float64_t *o = new float64_t[m+u]; 00663 float64_t *Y = new float64_t[m+u]; 00664 float64_t *C = new float64_t[m+u]; 00665 int32_t *labeled_indices = new int32_t[m]; 00666 float64_t F_old = 0.0; 00667 float64_t F = 0.0; 00668 float64_t diff=0.0; 00669 float64_t lambda_u_by_u = Options->lambda_u/u; 00670 vector_int *ActiveSubset = new vector_int[1]; 00671 ActiveSubset->vec = new int32_t[m]; 00672 ActiveSubset->d = m; 00673 // initialize 00674 if(ini==0) 00675 { 00676 epsilon=BIG_EPSILON; 00677 Options->cgitermax=SMALL_CGITERMAX; 00678 Options->epsilon=BIG_EPSILON;} 00679 else {epsilon = Options->epsilon;} 00680 00681 for(i=0;i<n;i++) F+=w[i]*w[i]; 00682 F=lambda*F; 00683 int32_t active=0; 00684 int32_t inactive=m-1; // l-1 00685 float64_t temp1; 00686 float64_t temp2; 00687 00688 j = 0; 00689 for(i=0; i<m ; i++) 00690 { 00691 o[i]=Outputs->vec[i]; 00692 
int32_t optimize_w(
	const struct data *Data, const float64_t *p, struct options *Options,
	struct vector_double *Weights, struct vector_double *Outputs, int32_t ini)
{
	int32_t i, j;
	CDotFeatures* features = Data->features;
	int32_t n = Data->n;
	int32_t m = Data->m;
	int32_t u = Data->u;
	float64_t lambda = Options->lambda;
	float64_t epsilon;
	float64_t *w = Weights->vec;
	float64_t *o = new float64_t[m+u];
	float64_t *Y = new float64_t[m+u];
	float64_t *C = new float64_t[m+u];
	int32_t *labeled_indices = new int32_t[m];
	float64_t F_old = 0.0;
	float64_t F = 0.0;
	float64_t diff = 0.0;
	float64_t lambda_u_by_u = Options->lambda_u/u;
	vector_int *ActiveSubset = new vector_int[1];
	ActiveSubset->vec = new int32_t[m];
	ActiveSubset->d = m;
	// initialize
	if (ini == 0)
	{
		epsilon = BIG_EPSILON;
		Options->cgitermax = SMALL_CGITERMAX;
		Options->epsilon = BIG_EPSILON;
	}
	else
		epsilon = Options->epsilon;

	for (i = 0; i < n; i++)
		F += w[i]*w[i];
	F = lambda*F;
	int32_t active = 0;
	int32_t inactive = m-1; // l-1
	float64_t temp1;
	float64_t temp2;

	j = 0;
	for (i = 0; i < m; i++)
	{
		o[i] = Outputs->vec[i];
		if (Data->Y[i] == 0.0)
		{
			labeled_indices[i] = 0;
			o[m+j] = o[i];
			Y[i] = 1;
			Y[m+j] = -1;
			C[i] = lambda_u_by_u*p[j];
			C[m+j] = lambda_u_by_u*(1 - p[j]);
			ActiveSubset->vec[active] = i;
			active++;
			diff = 1 - CMath::abs(o[i]);
			if (diff > 0)
			{
				Data->Y[i] = 2*p[j] - 1;
				Data->C[i] = lambda_u_by_u;
				temp1 = (1 - o[i]);
				temp2 = (1 + o[i]);
				F += lambda_u_by_u*(p[j]*temp1*temp1 + (1 - p[j])*temp2*temp2);
			}
			else
			{
				if (o[i] > 0)
				{
					Data->Y[i] = -1.0;
					Data->C[i] = C[m+j];
				}
				else
				{
					Data->Y[i] = 1.0;
					Data->C[i] = C[i];
				}
				temp1 = (1 - Data->Y[i]*o[i]);
				F += Data->C[i]*temp1*temp1;
			}
			j++;
		}
		else
		{
			labeled_indices[i] = 1;
			Y[i] = Data->Y[i];
			C[i] = 1.0/Data->l;
			Data->C[i] = 1.0/Data->l;
			diff = 1 - Data->Y[i]*o[i];
			if (diff > 0)
			{
				ActiveSubset->vec[active] = i;
				active++;
				F += Data->C[i]*diff*diff;
			}
			else
			{
				ActiveSubset->vec[inactive] = i;
				inactive--;
			}
		}
	}
	F = 0.5*F;
	ActiveSubset->d = active;
	int32_t iter = 0;
	int32_t opt = 0;
	int32_t opt2 = 0;
	vector_double *Weights_bar = new vector_double[1];
	vector_double *Outputs_bar = new vector_double[1];
	float64_t *w_bar = new float64_t[n];
	float64_t *o_bar = new float64_t[m+u];
	Weights_bar->vec = w_bar;
	Outputs_bar->vec = o_bar;
	Weights_bar->d = n;
	Outputs_bar->d = m; /* read only the top m; bottom u will be copies */
	float64_t delta = 0.0;
	float64_t t = 0.0;
	int32_t ii = 0;
	while (iter < MFNITERMAX)
	{
		iter++;
		SG_SDEBUG("L2_SVM_MFN Iteration# %d (%d active examples, objective_value = %f)", iter, active, F);
		for (i = n; i--;)
			w_bar[i] = w[i];
		for (i = m+u; i--;)
			o_bar[i] = o[i];
		opt = CGLS(Data, Options, ActiveSubset, Weights_bar, Outputs_bar);
		for (i = active; i < m; i++)
		{
			ii = ActiveSubset->vec[i];
			t = features->dense_dot(ii, w_bar, n-1);
			t += Options->bias*w_bar[n-1]; // bias (modelled as last dim)
			o_bar[ii] = t;
		}
		// make o_bar consistent in the bottom half
		j = 0;
		for (i = 0; i < m; i++)
		{
			if (labeled_indices[i] == 0)
			{
				o_bar[m+j] = o_bar[i];
				j++;
			}
		}
		if (ini == 0)
		{
			Options->cgitermax = CGITERMAX;
			ini = 1;
		}
		opt2 = 1;
		for (i = 0; i < m; i++)
		{
			ii = ActiveSubset->vec[i];
			if (i < active)
			{
				if (labeled_indices[ii] == 1)
					opt2 = (opt2 && (Data->Y[ii]*o_bar[ii] <= 1 + epsilon));
				else
				{
					if (CMath::abs(o[ii]) < 1)
						opt2 = (opt2 && (CMath::abs(o_bar[ii]) <= 1 + epsilon));
					else
						opt2 = (opt2 && (CMath::abs(o_bar[ii]) >= 1 - epsilon));
				}
			}
			else
				opt2 = (opt2 && (Data->Y[ii]*o_bar[ii] >= 1 - epsilon));
			if (opt2 == 0)
				break;
		}
		if (opt && opt2) // l
		{
			if (epsilon == BIG_EPSILON)
			{
				epsilon = EPSILON;
				Options->epsilon = EPSILON;
				SG_SDEBUG("epsilon = %f case converged (speedup heuristic 2). Continuing with epsilon=%f\n", BIG_EPSILON, EPSILON);
				continue;
			}
			else
			{
				for (i = n; i--;)
					w[i] = w_bar[i];
				for (i = m; i--;)
					Outputs->vec[i] = o_bar[i];
				for (i = m; i--;)
				{
					if (labeled_indices[i] == 0)
						Data->Y[i] = 0.0;
				}
				delete[] ActiveSubset->vec;
				delete[] ActiveSubset;
				delete[] o_bar;
				delete[] w_bar;
				delete[] o;
				delete[] Weights_bar;
				delete[] Outputs_bar;
				delete[] Y;
				delete[] C;
				delete[] labeled_indices;
				SG_SINFO("L2_SVM_MFN converged in %d iteration(s)", iter);
				return 1;
			}
		}

		delta = line_search(w, w_bar, lambda, o, o_bar, Y, C, n, m+u);
		SG_SDEBUG("LINE_SEARCH delta = %f", delta);
		F_old = F;
		F = 0.0;
		for (i = 0; i < n; i++)
		{
			w[i] += delta*(w_bar[i] - w[i]);
			F += w[i]*w[i];
		}
		F = lambda*F;
		j = 0;
		active = 0;
		inactive = m-1;
		for (i = 0; i < m; i++)
		{
			o[i] += delta*(o_bar[i] - o[i]);
			if (labeled_indices[i] == 0)
			{
				o[m+j] = o[i];
				ActiveSubset->vec[active] = i;
				active++;
				diff = 1 - CMath::abs(o[i]);
				if (diff > 0)
				{
					Data->Y[i] = 2*p[j] - 1;
					Data->C[i] = lambda_u_by_u;
					temp1 = (1 - o[i]);
					temp2 = (1 + o[i]);
					F += lambda_u_by_u*(p[j]*temp1*temp1 + (1 - p[j])*temp2*temp2);
				}
				else
				{
					if (o[i] > 0)
					{
						Data->Y[i] = -1;
						Data->C[i] = C[m+j];
					}
					else
					{
						Data->Y[i] = +1;
						Data->C[i] = C[i];
					}
					temp1 = (1 - Data->Y[i]*o[i]);
					F += Data->C[i]*temp1*temp1;
				}
				j++;
			}
			else
			{
				diff = 1 - Data->Y[i]*o[i];
				if (diff > 0)
				{
					ActiveSubset->vec[active] = i;
					active++;
					F += Data->C[i]*diff*diff;
				}
				else
				{
					ActiveSubset->vec[inactive] = i;
					inactive--;
				}
			}
		}
		F = 0.5*F;
		ActiveSubset->d = active;
		if (CMath::abs(F - F_old) < EPSILON)
			break;
	}
	for (i = m; i--;)
	{
		Outputs->vec[i] = o[i];
		if (labeled_indices[i] == 0)
			Data->Y[i] = 0.0;
	}
	delete[] ActiveSubset->vec;
	delete[] labeled_indices;
	delete[] ActiveSubset;
	delete[] o_bar;
	delete[] w_bar;
	delete[] o;
	delete[] Weights_bar;
	delete[] Outputs_bar;
	delete[] Y;
	delete[] C;
	SG_SINFO("L2_SVM_MFN converged in %d iterations", iter);
	return 0;
}
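/* optimize_p: solves the entropy-regularized subproblem for the label
 * probabilities at temperature T. The optimum has the sigmoid form
 * p[i] = 1/(1 + exp((g[i] - nu)/T)), where the multiplier nu enforces the
 * balance constraint mean(p) = r. nu is located by Newton's method with a
 * bisection fallback on the bracket [nu_minus, nu_plus]. */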
void optimize_p(
	const float64_t* g, int32_t u, float64_t T, float64_t r, float64_t* p)
{
	int32_t iter = 0;
	float64_t epsilon = 1e-10;
	int32_t maxiter = 500;
	float64_t nu_minus = g[0];
	float64_t nu_plus = g[0];
	for (int32_t i = 0; i < u; i++)
	{
		if (g[i] < nu_minus) nu_minus = g[i];
		if (g[i] > nu_plus) nu_plus = g[i];
	}

	float64_t b = T*log((1 - r)/r);
	nu_minus -= b;
	nu_plus -= b;
	float64_t nu = (nu_plus + nu_minus)/2;
	float64_t Bnu = 0.0;
	float64_t BnuPrime = 0.0;
	float64_t s = 0.0;
	float64_t tmp = 0.0;
	for (int32_t i = 0; i < u; i++)
	{
		s = exp((g[i] - nu)/T);
		if (!(CMath::is_infinity(s)))
		{
			tmp = 1.0/(1.0 + s);
			Bnu += tmp;
			BnuPrime += s*tmp*tmp;
		}
	}
	Bnu = Bnu/u;
	Bnu -= r;
	BnuPrime = BnuPrime/(T*u);
	float64_t nuHat = 0.0;
	while ((CMath::abs(Bnu) > epsilon) && (iter < maxiter))
	{
		iter++;
		if (CMath::abs(BnuPrime) > 0.0)
			nuHat = nu - Bnu/BnuPrime;
		// fall back to bisection when the derivative vanishes or the
		// Newton step leaves the bracket
		if ((CMath::abs(BnuPrime) == 0.0) || (nuHat > nu_plus) || (nuHat < nu_minus))
			nu = (nu_minus + nu_plus)/2.0;
		else
			nu = nuHat;
		Bnu = 0.0;
		BnuPrime = 0.0;
		for (int32_t i = 0; i < u; i++)
		{
			s = exp((g[i] - nu)/T);
			if (!(CMath::is_infinity(s)))
			{
				tmp = 1.0/(1.0 + s);
				Bnu += tmp;
				BnuPrime += s*tmp*tmp;
			}
		}
		Bnu = Bnu/u;
		Bnu -= r;
		BnuPrime = BnuPrime/(T*u);
		if (Bnu < 0)
			nu_minus = nu;
		else
			nu_plus = nu;
		if (CMath::abs(nu_minus - nu_plus) < epsilon)
			break;
	}
	if (CMath::abs(Bnu) > epsilon)
		SG_SWARNING("Warning (Root): root not found to required precision\n");

	for (int32_t i = 0; i < u; i++)
	{
		s = exp((g[i] - nu)/T);
		if (CMath::is_infinity(s)) p[i] = 0.0;
		else p[i] = 1.0/(1.0 + s);
	}
	SG_SINFO(" root (nu) = %f B(nu) = %f", nu, Bnu);
}
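/* transductive_cost: evaluates the transductive objective
 *   0.5 * ( lambda*||w||^2 + (lambda_u/u)*F1 + (1/l)*F2 )
 * where F1 sums the symmetric clipped quadratic loss max(0, 1-|o|)^2 over
 * the unlabeled points (Y == 0) and F2 sums max(0, 1-y*o)^2 over the
 * labeled points; u and l are counted inside. */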
float64_t transductive_cost(
	float64_t normWeights, float64_t *Y, float64_t *Outputs, int32_t m,
	float64_t lambda, float64_t lambda_u)
{
	float64_t F1 = 0.0, F2 = 0.0, o = 0.0, y = 0.0;
	int32_t u = 0, l = 0;
	for (int32_t i = 0; i < m; i++)
	{
		o = Outputs[i];
		y = Y[i];
		if (y == 0.0)
		{
			F1 += CMath::abs(o) > 1 ? 0 : (1 - CMath::abs(o))*(1 - CMath::abs(o));
			u++;
		}
		else
		{
			F2 += y*o > 1 ? 0 : (1 - y*o)*(1 - y*o);
			l++;
		}
	}
	float64_t F;
	F = 0.5*(lambda*normWeights + lambda_u*F1/u + F2/l);
	return F;
}

float64_t entropy(const float64_t *p, int32_t u)
{
	float64_t h = 0.0;
	float64_t q = 0.0;
	for (int32_t i = 0; i < u; i++)
	{
		q = p[i];
		if (q > 0 && q < 1)
			h += -(q*CMath::log2(q) + (1 - q)*CMath::log2(1 - q));
	}
	return h/u;
}

float64_t KL(const float64_t *p, const float64_t *q, int32_t u)
{
	float64_t h = 0.0;
	float64_t p1 = 0.0;
	float64_t q1 = 0.0;
	float64_t g = 0.0;
	for (int32_t i = 0; i < u; i++)
	{
		p1 = p[i];
		q1 = q[i];
		// clamp away from 0 and 1 so the logs stay finite
		if (p1 > 1 - 1e-8) p1 -= 1e-8;
		if (p1 < 1e-8) p1 += 1e-8;
		if (q1 > 1 - 1e-8) q1 -= 1e-8;
		if (q1 < 1e-8) q1 += 1e-8;
		g = (p1*CMath::log2(p1/q1) + (1 - p1)*CMath::log2((1 - p1)/(1 - q1)));
		if (CMath::abs(g) < 1e-12 || CMath::is_nan(g)) g = 0.0;
		h += g;
	}
	return h/u;
}

/********************** UTILITIES ********************/
float64_t norm_square(const vector_double *A)
{
	float64_t x = 0.0, t = 0.0;
	for (int32_t i = 0; i < A->d; i++)
	{
		t = A->vec[i];
		x += t*t;
	}
	return x;
}

void initialize(struct vector_double *A, int32_t k, float64_t a)
{
	float64_t *vec = new float64_t[k];
	for (int32_t i = 0; i < k; i++)
		vec[i] = a;
	A->vec = vec;
	A->d = k;
}

void initialize(struct vector_int *A, int32_t k)
{
	int32_t *vec = new int32_t[k];
	for (int32_t i = 0; i < k; i++)
		vec[i] = i;
	A->vec = vec;
	A->d = k;
}
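/* GetLabeledData: intended to extract the labeled examples (Y != 0) from
 * Data into D with per-example cost 1/l. The body below still uses the
 * original sparse-matrix representation (val/colind/rowptr) and is disabled
 * pending a port to CDotFeatures (see the FIXME). */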
void GetLabeledData(struct data *D, const struct data *Data)
{
	/* FIXME
	int32_t *J = new int[Data->l];
	D->C = new float64_t[Data->l];
	D->Y = new float64_t[Data->l];
	int32_t nz=0;
	int32_t k=0;
	int32_t rowptrs_=Data->l;
	for(int32_t i=0;i<Data->m;i++)
	{
		if(Data->Y[i]!=0.0)
		{
			J[k]=i;
			D->Y[k]=Data->Y[i];
			D->C[k]=1.0/Data->l;
			nz+=(Data->rowptr[i+1] - Data->rowptr[i]);
			k++;
		}
	}
	D->val = new float64_t[nz];
	D->colind = new int32_t[nz];
	D->rowptr = new int32_t[rowptrs_+1];
	nz=0;
	for(int32_t i=0;i<Data->l;i++)
	{
		D->rowptr[i]=nz;
		for(int32_t j=Data->rowptr[J[i]];j<Data->rowptr[J[i]+1];j++)
		{
			D->val[nz] = Data->val[j];
			D->colind[nz] = Data->colind[j];
			nz++;
		}
	}
	D->rowptr[rowptrs_]=nz;
	D->nz=nz;
	D->l=Data->l;
	D->m=Data->l;
	D->n=Data->n;
	D->u=0;
	delete [] J;
	*/
}
}