00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/common.h"
00012 #include "lib/io.h"
00013 #include "kernel/SalzbergWordStringKernel.h"
00014 #include "features/Features.h"
00015 #include "features/StringFeatures.h"
00016 #include "features/Labels.h"
00017 #include "classifier/PluginEstimate.h"
00018
00019 using namespace shogun;
00020
00021 CSalzbergWordStringKernel::CSalzbergWordStringKernel(int32_t size, CPluginEstimate* pie, CLabels* labels)
00022 : CStringKernel<uint16_t>(size), estimate(pie), mean(NULL), variance(NULL),
00023 sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL),
00024 ld_mean_lhs(NULL), ld_mean_rhs(NULL),
00025 num_params(0), num_symbols(0), sum_m2_s2(0), pos_prior(0.5),
00026 neg_prior(0.5), initialized(false)
00027 {
00028 if (labels)
00029 set_prior_probs_from_labels(labels);
00030 }
00031
00032 CSalzbergWordStringKernel::CSalzbergWordStringKernel(
00033 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r,
00034 CPluginEstimate* pie, CLabels* labels)
00035 : CStringKernel<uint16_t>(10),estimate(pie), mean(NULL), variance(NULL),
00036 sqrtdiag_lhs(NULL), sqrtdiag_rhs(NULL),
00037 ld_mean_lhs(NULL), ld_mean_rhs(NULL),
00038 num_params(0), num_symbols(0), sum_m2_s2(0), pos_prior(0.5),
00039 neg_prior(0.5), initialized(false)
00040 {
00041 if (labels)
00042 set_prior_probs_from_labels(labels);
00043
00044 init(l, r);
00045 }
00046
00047 CSalzbergWordStringKernel::~CSalzbergWordStringKernel()
00048 {
00049 cleanup();
00050 }
00051
00052 bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r)
00053 {
00054 CStringKernel<uint16_t>::init(p_l,p_r);
00055 CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l;
00056 ASSERT(l);
00057 CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r;
00058 ASSERT(r);
00059
00060 int32_t i;
00061 initialized=false;
00062
00063 if (sqrtdiag_lhs!=sqrtdiag_rhs)
00064 delete[] sqrtdiag_rhs;
00065 sqrtdiag_rhs=NULL;
00066 delete[] sqrtdiag_lhs;
00067 sqrtdiag_lhs=NULL;
00068 if (ld_mean_lhs!=ld_mean_rhs)
00069 delete[] ld_mean_rhs;
00070 ld_mean_rhs=NULL;
00071 delete[] ld_mean_lhs;
00072 ld_mean_lhs=NULL;
00073
00074 sqrtdiag_lhs=new float64_t[l->get_num_vectors()];
00075 ld_mean_lhs=new float64_t[l->get_num_vectors()];
00076
00077 for (i=0; i<l->get_num_vectors(); i++)
00078 sqrtdiag_lhs[i]=1;
00079
00080 if (l==r)
00081 {
00082 sqrtdiag_rhs=sqrtdiag_lhs;
00083 ld_mean_rhs=ld_mean_lhs;
00084 }
00085 else
00086 {
00087 sqrtdiag_rhs=new float64_t[r->get_num_vectors()];
00088 for (i=0; i<r->get_num_vectors(); i++)
00089 sqrtdiag_rhs[i]=1;
00090
00091 ld_mean_rhs=new float64_t[r->get_num_vectors()];
00092 }
00093
00094 float64_t* l_ld_mean_lhs=ld_mean_lhs;
00095 float64_t* l_ld_mean_rhs=ld_mean_rhs;
00096
00097
00098 if (!initialized)
00099 {
00100 int32_t num_vectors=l->get_num_vectors();
00101 num_symbols=(int32_t) l->get_num_symbols();
00102 int32_t llen=l->get_vector_length(0);
00103 int32_t rlen=r->get_vector_length(0);
00104 num_params=(int32_t) llen*l->get_num_symbols();
00105 int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols();
00106 if ((!estimate) || (!estimate->check_models()))
00107 {
00108 SG_ERROR( "no estimate available\n");
00109 return false ;
00110 } ;
00111 if (num_params2!=estimate->get_num_params())
00112 {
00113 SG_ERROR( "number of parameters of estimate and feature representation do not match\n");
00114 return false ;
00115 } ;
00116
00117 delete[] variance;
00118 delete[] mean;
00119 mean=new float64_t[num_params];
00120 ASSERT(mean);
00121 variance=new float64_t[num_params];
00122 ASSERT(variance);
00123
00124 for (i=0; i<num_params; i++)
00125 {
00126 mean[i]=0;
00127 variance[i]=0;
00128 }
00129
00130
00131
00132 for (i=0; i<num_vectors; i++)
00133 {
00134 int32_t len;
00135 bool free_vec;
00136 uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00137
00138 for (int32_t j=0; j<len; j++)
00139 {
00140 int32_t idx=compute_index(j, vec[j]);
00141 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00142 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00143 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00144
00145 mean[idx] += value/num_vectors ;
00146 }
00147 l->free_feature_vector(vec, i, free_vec);
00148 }
00149
00150
00151 for (i=0; i<num_vectors; i++)
00152 {
00153 int32_t len;
00154 bool free_vec;
00155 uint16_t* vec=l->get_feature_vector(i, len, free_vec);
00156
00157 for (int32_t j=0; j<len; j++)
00158 {
00159 for (int32_t k=0; k<4; k++)
00160 {
00161 int32_t idx=compute_index(j, k);
00162 if (k!=vec[j])
00163 variance[idx]+=mean[idx]*mean[idx]/num_vectors;
00164 else
00165 {
00166 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(vec[j], j) ;
00167 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(vec[j], j) ;
00168 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00169
00170 variance[idx] += CMath::sq(value-mean[idx])/num_vectors;
00171 }
00172 }
00173 }
00174 l->free_feature_vector(vec, i, free_vec);
00175 }
00176
00177
00178
00179 sum_m2_s2=0 ;
00180 for (i=0; i<num_params; i++)
00181 {
00182 if (variance[i]<1e-14)
00183 variance[i]=1 ;
00184
00185
00186 sum_m2_s2 += mean[i]*mean[i]/(variance[i]) ;
00187 } ;
00188 }
00189
00190
00191
00192
00193 for (i=0; i<l->get_num_vectors(); i++)
00194 {
00195 int32_t alen ;
00196 bool free_avec;
00197 uint16_t* avec=l->get_feature_vector(i, alen, free_avec);
00198 float64_t result=0 ;
00199 for (int32_t j=0; j<alen; j++)
00200 {
00201 int32_t a_idx = compute_index(j, avec[j]) ;
00202 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[j], j) ;
00203 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[j], j) ;
00204 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00205
00206 if (variance[a_idx]!=0)
00207 result-=value*mean[a_idx]/variance[a_idx];
00208 }
00209 ld_mean_lhs[i]=result ;
00210
00211 l->free_feature_vector(avec, i, free_avec);
00212 }
00213
00214 if (ld_mean_lhs!=ld_mean_rhs)
00215 {
00216
00217
00218 for (i=0; i<r->get_num_vectors(); i++)
00219 {
00220 int32_t alen;
00221 bool free_avec;
00222 uint16_t* avec=r->get_feature_vector(i, alen, free_avec);
00223 float64_t result=0;
00224
00225 for (int32_t j=0; j<alen; j++)
00226 {
00227 int32_t a_idx = compute_index(j, avec[j]) ;
00228 float64_t theta_p=1/estimate->log_derivative_pos_obsolete(
00229 avec[j], j) ;
00230 float64_t theta_n=1/estimate->log_derivative_neg_obsolete(
00231 avec[j], j) ;
00232 float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));
00233
00234 result -= value*mean[a_idx]/variance[a_idx] ;
00235 }
00236
00237 ld_mean_rhs[i]=result;
00238 l->free_feature_vector(avec, i, free_avec);
00239 }
00240 }
00241
00242
00243
00244 this->lhs=l;
00245 this->rhs=l;
00246 ld_mean_lhs = l_ld_mean_lhs ;
00247 ld_mean_rhs = l_ld_mean_lhs ;
00248
00249
00250 for (i=0; i<lhs->get_num_vectors(); i++)
00251 {
00252 sqrtdiag_lhs[i]=sqrt(compute(i,i));
00253
00254
00255 if (sqrtdiag_lhs[i]==0)
00256 sqrtdiag_lhs[i]=1e-16;
00257 }
00258
00259
00260
00261 if (sqrtdiag_lhs!=sqrtdiag_rhs)
00262 {
00263 this->lhs=r;
00264 this->rhs=r;
00265 ld_mean_lhs = l_ld_mean_rhs ;
00266 ld_mean_rhs = l_ld_mean_rhs ;
00267
00268
00269 for (i=0; i<rhs->get_num_vectors(); i++)
00270 {
00271 sqrtdiag_rhs[i]=sqrt(compute(i,i));
00272
00273
00274 if (sqrtdiag_rhs[i]==0)
00275 sqrtdiag_rhs[i]=1e-16;
00276 }
00277 }
00278
00279 this->lhs=l;
00280 this->rhs=r;
00281 ld_mean_lhs = l_ld_mean_lhs ;
00282 ld_mean_rhs = l_ld_mean_rhs ;
00283
00284 initialized = true ;
00285 return init_normalizer();
00286 }
00287
00288 void CSalzbergWordStringKernel::cleanup()
00289 {
00290 delete[] variance;
00291 variance=NULL;
00292
00293 delete[] mean;
00294 mean=NULL;
00295
00296 if (sqrtdiag_lhs != sqrtdiag_rhs)
00297 delete[] sqrtdiag_rhs;
00298 sqrtdiag_rhs=NULL;
00299
00300 delete[] sqrtdiag_lhs;
00301 sqrtdiag_lhs=NULL;
00302
00303 if (ld_mean_lhs!=ld_mean_rhs)
00304 delete[] ld_mean_rhs ;
00305 ld_mean_rhs=NULL;
00306
00307 delete[] ld_mean_lhs ;
00308 ld_mean_lhs=NULL;
00309
00310 CKernel::cleanup();
00311 }
00312
00313 float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b)
00314 {
00315 int32_t alen, blen;
00316 bool free_avec, free_bvec;
00317 uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00318 uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00319
00320 ASSERT(alen==blen);
00321
00322 float64_t result = sum_m2_s2 ;
00323
00324 for (int32_t i=0; i<alen; i++)
00325 {
00326 if (avec[i]==bvec[i])
00327 {
00328 int32_t a_idx = compute_index(i, avec[i]) ;
00329
00330 float64_t theta_p = 1/estimate->log_derivative_pos_obsolete(avec[i], i) ;
00331 float64_t theta_n = 1/estimate->log_derivative_neg_obsolete(avec[i], i) ;
00332 float64_t value = (theta_p/(pos_prior*theta_p+neg_prior*theta_n)) ;
00333
00334 result += value*value/variance[a_idx] ;
00335 }
00336 }
00337 result += ld_mean_lhs[idx_a] + ld_mean_rhs[idx_b] ;
00338
00339 ((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00340 ((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00341
00342 if (initialized)
00343 result /= (sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]) ;
00344
00345 return result;
00346 }
00347
00348 void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels)
00349 {
00350 ASSERT(labels);
00351
00352 int32_t num_pos=0, num_neg=0;
00353 for (int32_t i=0; i<labels->get_num_labels(); i++)
00354 {
00355 if (labels->get_int_label(i)==1)
00356 num_pos++;
00357 if (labels->get_int_label(i)==-1)
00358 num_neg++;
00359 }
00360
00361 SG_INFO("priors: pos=%1.3f (%i) neg=%1.3f (%i)\n",
00362 (float64_t) num_pos/(num_pos+num_neg), num_pos,
00363 (float64_t) num_neg/(num_pos+num_neg), num_neg);
00364
00365 set_prior_probs(
00366 (float64_t)num_pos/(num_pos+num_neg),
00367 (float64_t)num_neg/(num_pos+num_neg));
00368 }