SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Gunnar Raetsch 00008 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "lib/io.h" 00013 #include "kernel/SalzbergWordStringKernel.h" 00014 #include "features/Features.h" 00015 #include "features/StringFeatures.h" 00016 #include "features/Labels.h" 00017 #include "classifier/PluginEstimate.h" 00018 00019 using namespace shogun; 00020 00021 CSalzbergWordStringKernel::CSalzbergWordStringKernel() 00022 : CStringKernel<uint16_t>(0) 00023 { 00024 init(); 00025 } 00026 00027 CSalzbergWordStringKernel::CSalzbergWordStringKernel(int32_t size, CPluginEstimate* pie, CLabels* labels) 00028 : CStringKernel<uint16_t>(size) 00029 { 00030 init(); 00031 estimate=pie; 00032 00033 if (labels) 00034 set_prior_probs_from_labels(labels); 00035 } 00036 00037 CSalzbergWordStringKernel::CSalzbergWordStringKernel( 00038 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, 00039 CPluginEstimate* pie, CLabels* labels) 00040 : CStringKernel<uint16_t>(10),estimate(pie) 00041 { 00042 init(); 00043 estimate=pie; 00044 00045 if (labels) 00046 set_prior_probs_from_labels(labels); 00047 00048 init(l, r); 00049 } 00050 00051 CSalzbergWordStringKernel::~CSalzbergWordStringKernel() 00052 { 00053 cleanup(); 00054 } 00055 00056 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r) 00057 { 00058 CStringKernel<uint16_t>::init(p_l,p_r); 00059 CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l; 00060 ASSERT(l); 00061 CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r; 00062 ASSERT(r); 00063 00064 int32_t i; 00065 initialized=false; 00066 00067 if (sqrtdiag_lhs!=sqrtdiag_rhs) 00068 delete[] sqrtdiag_rhs; 00069 sqrtdiag_rhs=NULL; 00070 delete[] sqrtdiag_lhs; 00071 sqrtdiag_lhs=NULL; 00072 if (ld_mean_lhs!=ld_mean_rhs) 00073 delete[] ld_mean_rhs; 00074 ld_mean_rhs=NULL; 00075 delete[] ld_mean_lhs; 00076 ld_mean_lhs=NULL; 00077 00078 sqrtdiag_lhs=new float64_t[l->get_num_vectors()]; 00079 ld_mean_lhs=new float64_t[l->get_num_vectors()]; 00080 00081 for (i=0; i<l->get_num_vectors(); i++) 00082 sqrtdiag_lhs[i]=1; 00083 00084 if (l==r) 00085 { 00086 sqrtdiag_rhs=sqrtdiag_lhs; 00087 ld_mean_rhs=ld_mean_lhs; 00088 } 00089 else 00090 { 00091 sqrtdiag_rhs=new float64_t[r->get_num_vectors()]; 00092 for (i=0; i<r->get_num_vectors(); i++) 00093 sqrtdiag_rhs[i]=1; 00094 00095 ld_mean_rhs=new float64_t[r->get_num_vectors()]; 00096 } 00097 00098 float64_t* l_ld_mean_lhs=ld_mean_lhs; 00099 float64_t* l_ld_mean_rhs=ld_mean_rhs; 00100 00101 //from our knowledge first normalize variance to 1 and then norm=1 does the job 00102 if (!initialized) 00103 { 00104 int32_t num_vectors=l->get_num_vectors(); 00105 num_symbols=(int32_t) l->get_num_symbols(); 00106 int32_t llen=l->get_vector_length(0); 00107 int32_t rlen=r->get_vector_length(0); 00108 num_params=(int32_t) llen*l->get_num_symbols(); 00109 int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols(); 00110 if ((!estimate) || (!estimate->check_models())) 00111 { 00112 SG_ERROR( "no estimate available\n"); 00113 return false ; 00114 } ; 00115 if (num_params2!=estimate->get_num_params()) 00116 { 00117 SG_ERROR( "number of parameters of estimate and feature representation do not match\n"); 00118 return false ; 00119 } ; 00120 00121 delete[] variance; 00122 delete[] mean; 00123 mean=new float64_t[num_params]; 00124 ASSERT(mean); 00125 variance=new float64_t[num_params]; 00126 ASSERT(variance); 00127 00128 for (i=0; i<num_params; i++) 00129 { 00130 mean[i]=0; 00131 variance[i]=0; 00132 } 00133 00134 00135 // compute mean 00136 for (i=0; i<num_vectors; i++) 00137 { 00138 int32_t len; 00139 bool free_vec; 00140 uint16_t* vec=l->get_feature_vector(i, len, free_vec); 00141 00142 for (int32_t j=0; j<len; j++) 00143 { 00144 int32_t idx=compute_index(j, vec[j]); 00145 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ; 00146 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ; 00147 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00148 00149 mean[idx] += value/num_vectors ; 00150 } 00151 l->free_feature_vector(vec, i, free_vec); 00152 } 00153 00154 // compute variance 00155 for (i=0; i<num_vectors; i++) 00156 { 00157 int32_t len; 00158 bool free_vec; 00159 uint16_t* vec=l->get_feature_vector(i, len, free_vec); 00160 00161 for (int32_t j=0; j<len; j++) 00162 { 00163 for (int32_t k=0; k<4; k++) 00164 { 00165 int32_t idx=compute_index(j, k); 00166 if (k!=vec[j]) 00167 variance[idx]+=mean[idx]*mean[idx]/num_vectors; 00168 else 00169 { 00170 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ; 00171 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ; 00172 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00173 00174 variance[idx] += CMath::sq(value-mean[idx])/num_vectors; 00175 } 00176 } 00177 } 00178 l->free_feature_vector(vec, i, free_vec); 00179 } 00180 00181 00182 // compute sum_i m_i^2/s_i^2 00183 sum_m2_s2=0 ; 00184 for (i=0; i<num_params; i++) 00185 { 00186 if (variance[i]<1e-14) // then it is likely to be numerical inaccuracy 00187 variance[i]=1 ; 00188 00189 //fprintf(stderr, "%i: mean=%1.2e std=%1.2e\n", i, mean[i], std[i]) ; 00190 sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ; 00191 } ; 00192 } 00193 00194 // compute sum of 00195 //result -= feature*mean[a_idx]/variance[a_idx] ; 00196 00197 for (i=0; i<l->get_num_vectors(); i++) 00198 { 00199 int32_t alen ; 00200 bool free_avec; 00201 uint16_t* avec=l->get_feature_vector(i, alen, free_avec); 00202 float64_t result=0 ; 00203 for (int32_t j=0; j<alen; j++) 00204 { 00205 int32_t a_idx = compute_index(j, avec[j]) ; 00206 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ; 00207 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ; 00208 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00209 00210 if (variance[a_idx]!=0) 00211 result-=value*mean[a_idx]/variance[a_idx]; 00212 } 00213 ld_mean_lhs[i]=result ; 00214 00215 l->free_feature_vector(avec, i, free_avec); 00216 } 00217 00218 if (ld_mean_lhs!=ld_mean_rhs) 00219 { 00220 // compute sum of 00221 //result -= feature*mean[b_idx]/variance[b_idx] ; 00222 for (i=0; i<r->get_num_vectors(); i++) 00223 { 00224 int32_t alen; 00225 bool free_avec; 00226 uint16_t* avec=r->get_feature_vector(i, alen, free_avec); 00227 float64_t result=0; 00228 00229 for (int32_t j=0; j<alen; j++) 00230 { 00231 int32_t a_idx = compute_index(j, avec[j]) ; 00232 float64_t theta_p=1/estimate->log_derivative_pos_obsolete( 00233 avec[j], j) ; 00234 float64_t theta_n=1/estimate->log_derivative_neg_obsolete( 00235 avec[j], j) ; 00236 float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n)); 00237 00238 result -= value*mean[a_idx]/variance[a_idx] ; 00239 } 00240 00241 ld_mean_rhs[i]=result; 00242 l->free_feature_vector(avec, i, free_avec); 00243 } 00244 } 00245 00246 //warning hacky 00247 // 00248 this->lhs=l; 00249 this->rhs=l; 00250 ld_mean_lhs = l_ld_mean_lhs ; 00251 ld_mean_rhs = l_ld_mean_lhs ; 00252 00253 //compute normalize to 1 values 00254 for (i=0; i<lhs->get_num_vectors(); i++) 00255 { 00256 sqrtdiag_lhs[i]=sqrt(compute(i,i)); 00257 00258 //trap divide by zero exception 00259 if (sqrtdiag_lhs[i]==0) 00260 sqrtdiag_lhs[i]=1e-16; 00261 } 00262 00263 // if lhs is different from rhs (train/test data) 00264 // compute also the normalization for rhs 00265 if (sqrtdiag_lhs!=sqrtdiag_rhs) 00266 { 00267 this->lhs=r; 00268 this->rhs=r; 00269 ld_mean_lhs = l_ld_mean_rhs ; 00270 ld_mean_rhs = l_ld_mean_rhs ; 00271 00272 //compute normalize to 1 values 00273 for (i=0; i<rhs->get_num_vectors(); i++) 00274 { 00275 sqrtdiag_rhs[i]=sqrt(compute(i,i)); 00276 00277 //trap divide by zero exception 00278 if (sqrtdiag_rhs[i]==0) 00279 sqrtdiag_rhs[i]=1e-16; 00280 } 00281 } 00282 00283 this->lhs=l; 00284 this->rhs=r; 00285 ld_mean_lhs = l_ld_mean_lhs ; 00286 ld_mean_rhs = l_ld_mean_rhs ; 00287 00288 initialized = true ; 00289 return init_normalizer(); 00290 } 00291 00292 void CSalzbergWordStringKernel::cleanup() 00293 { 00294 delete[] variance; 00295 variance=NULL; 00296 00297 delete[] mean; 00298 mean=NULL; 00299 00300 if (sqrtdiag_lhs != sqrtdiag_rhs) 00301 delete[] sqrtdiag_rhs; 00302 sqrtdiag_rhs=NULL; 00303 00304 delete[] sqrtdiag_lhs; 00305 sqrtdiag_lhs=NULL; 00306 00307 if (ld_mean_lhs!=ld_mean_rhs) 00308 delete[] ld_mean_rhs ; 00309 ld_mean_rhs=NULL; 00310 00311 delete[] ld_mean_lhs ; 00312 ld_mean_lhs=NULL; 00313 00314 CKernel::cleanup(); 00315 } 00316 00317 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b) 00318 { 00319 int32_t alen, blen; 00320 bool free_avec, free_bvec; 00321 uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00322 uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00323 // can only deal with strings of same length 00324 ASSERT(alen==blen); 00325 00326 float64_t result = sum_m2_s2 ; // does not contain 0-th element 00327 00328 for (int32_t i=0; i<alen; i++) 00329 { 00330 if (avec[i]==bvec[i]) 00331 { 00332 int32_t a_idx = compute_index(i, avec[i]) ; 00333 00334 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ; 00335 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ; 00336 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ; 00337 00338 result += value*value/variance[a_idx] ; 00339 } 00340 } 00341 result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ; 00342 00343 ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00344 ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00345 00346 if (initialized) 00347 result /= (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ; 00348 00349 return result; 00350 } 00351 00352 void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels) 00353 { 00354 ASSERT(labels); 00355 00356 int32_t num_pos=0, num_neg=0; 00357 for (int32_t i=0; i<labels->get_num_labels(); i++) 00358 { 00359 if (labels->get_int_label(i)==1) 00360 num_pos++; 00361 if (labels->get_int_label(i)==-1) 00362 num_neg++; 00363 } 00364 00365 SG_INFO("priors: pos=%1.3f (%i) neg=%1.3f (%i)\n", 00366 (float64_t) num_pos/(num_pos+num_neg), num_pos, 00367 (float64_t) num_neg/(num_pos+num_neg), num_neg); 00368 00369 set_prior_probs( 00370 (float64_t)num_pos/(num_pos+num_neg), 00371 (float64_t)num_neg/(num_pos+num_neg)); 00372 } 00373 00374 void CSalzbergWordStringKernel::init() 00375 { 00376 estimate=NULL; 00377 mean=NULL; 00378 variance=NULL; 00379 00380 sqrtdiag_lhs=NULL; 00381 sqrtdiag_rhs=NULL; 00382 00383 ld_mean_lhs=NULL; 00384 ld_mean_rhs=NULL; 00385 00386 num_params=0; 00387 num_symbols=0; 00388 sum_m2_s2=0; 00389 pos_prior=0.5; 00390 00391 neg_prior=0.5; 00392 initialized=false; 00393 }