SHOGUN  v1.1.0
SalzbergWordStringKernel.cpp
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2009 Gunnar Raetsch
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#include <shogun/lib/common.h>
#include <shogun/io/SGIO.h>
#include <shogun/kernel/SalzbergWordStringKernel.h>
#include <shogun/features/StringFeatures.h>
#include <shogun/features/Labels.h>
#include <shogun/classifier/PluginEstimate.h>

using namespace shogun;

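// Overview: the Salzberg word-string kernel compares two equal-length word
// (uint16_t) strings position by position.  Roughly, each position j with
// observed symbol s contributes a log-odds style feature
//     value = theta_p/(pos_prior*theta_p + neg_prior*theta_n),
// where theta_p and theta_n are derived from a trained CPluginEstimate.
// init() standardizes these features by their empirical mean and variance and
// caches per-vector linear terms; compute() evaluates the kernel and, once
// initialized, divides by sqrtdiag_lhs[a]*sqrtdiag_rhs[b] so that
// self-similarities are (approximately) one.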
CSalzbergWordStringKernel::CSalzbergWordStringKernel()
: CStringKernel<uint16_t>(0)
{
	init();
}

CSalzbergWordStringKernel::CSalzbergWordStringKernel(
	int32_t size, CPluginEstimate* pie, CLabels* labels)
: CStringKernel<uint16_t>(size)
{
	init();
	estimate=pie;

	if (labels)
		set_prior_probs_from_labels(labels);
}

CSalzbergWordStringKernel::CSalzbergWordStringKernel(
	CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r,
	CPluginEstimate* pie, CLabels* labels)
: CStringKernel<uint16_t>(10),estimate(pie)
{
	init();
	estimate=pie;

	if (labels)
		set_prior_probs_from_labels(labels);

	init(l, r);
}

CSalzbergWordStringKernel::~CSalzbergWordStringKernel()
{
	cleanup();
}

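// init() precomputes everything compute() needs: the per-parameter mean and
// variance of the feature values over the lhs vectors, the cached linear
// terms ld_mean_lhs/ld_mean_rhs, the constant sum_m2_s2, and finally the
// sqrtdiag_lhs/sqrtdiag_rhs normalizers taken from the kernel diagonal.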
bool CSalzbergWordStringKernel::init(CFeatures* p_l, CFeatures* p_r)
{
	CStringKernel<uint16_t>::init(p_l, p_r);
	CStringFeatures<uint16_t>* l=(CStringFeatures<uint16_t>*) p_l;
	ASSERT(l);
	CStringFeatures<uint16_t>* r=(CStringFeatures<uint16_t>*) p_r;
	ASSERT(r);

	int32_t i;
	initialized=false;

	if (sqrtdiag_lhs!=sqrtdiag_rhs)
		SG_FREE(sqrtdiag_rhs);
	sqrtdiag_rhs=NULL;
	SG_FREE(sqrtdiag_lhs);
	sqrtdiag_lhs=NULL;
	if (ld_mean_lhs!=ld_mean_rhs)
		SG_FREE(ld_mean_rhs);
	ld_mean_rhs=NULL;
	SG_FREE(ld_mean_lhs);
	ld_mean_lhs=NULL;

	sqrtdiag_lhs=SG_MALLOC(float64_t, l->get_num_vectors());
	ld_mean_lhs=SG_MALLOC(float64_t, l->get_num_vectors());

	for (i=0; i<l->get_num_vectors(); i++)
		sqrtdiag_lhs[i]=1;

	if (l==r)
	{
		sqrtdiag_rhs=sqrtdiag_lhs;
		ld_mean_rhs=ld_mean_lhs;
	}
	else
	{
		sqrtdiag_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
		for (i=0; i<r->get_num_vectors(); i++)
			sqrtdiag_rhs[i]=1;

		ld_mean_rhs=SG_MALLOC(float64_t, r->get_num_vectors());
	}

	float64_t* l_ld_mean_lhs=ld_mean_lhs;
	float64_t* l_ld_mean_rhs=ld_mean_rhs;

	// first normalize the per-parameter variance to 1; then normalizing the
	// norm to 1 does the job
	if (!initialized)
	{
		int32_t num_vectors=l->get_num_vectors();
		num_symbols=(int32_t) l->get_num_symbols();
		int32_t llen=l->get_vector_length(0);
		int32_t rlen=r->get_vector_length(0);
		num_params=(int32_t) llen*l->get_num_symbols();
		int32_t num_params2=(int32_t) llen*l->get_num_symbols()+rlen*r->get_num_symbols();
		if ((!estimate) || (!estimate->check_models()))
		{
			SG_ERROR("no estimate available\n");
			return false;
		}
		if (num_params2!=estimate->get_num_params())
		{
			SG_ERROR("number of parameters of estimate and feature representation do not match\n");
			return false;
		}

		SG_FREE(variance);
		SG_FREE(mean);
		mean=SG_MALLOC(float64_t, num_params);
		ASSERT(mean);
		variance=SG_MALLOC(float64_t, num_params);
		ASSERT(variance);

		for (i=0; i<num_params; i++)
		{
			mean[i]=0;
			variance[i]=0;
		}

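		// In the two passes below, each position j with observed symbol
		// vec[j] is mapped to value = theta_p/(pos_prior*theta_p +
		// neg_prior*theta_n), where theta_p and theta_n are the reciprocals
		// of the (obsolete) positive/negative log-derivatives of the plugin
		// estimate; mean[] and variance[] collect its per-parameter statistics.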
		// compute mean
		for (i=0; i<num_vectors; i++)
		{
			int32_t len;
			bool free_vec;
			uint16_t* vec=l->get_feature_vector(i, len, free_vec);

			for (int32_t j=0; j<len; j++)
			{
				int32_t idx=compute_index(j, vec[j]);
				float64_t theta_p=1/estimate->log_derivative_pos_obsolete(vec[j], j);
				float64_t theta_n=1/estimate->log_derivative_neg_obsolete(vec[j], j);
				float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));

				mean[idx]+=value/num_vectors;
			}
			l->free_feature_vector(vec, i, free_vec);
		}

		// compute variance
		for (i=0; i<num_vectors; i++)
		{
			int32_t len;
			bool free_vec;
			uint16_t* vec=l->get_feature_vector(i, len, free_vec);

			for (int32_t j=0; j<len; j++)
			{
				for (int32_t k=0; k<4; k++)
				{
					int32_t idx=compute_index(j, k);
					if (k!=vec[j])
						variance[idx]+=mean[idx]*mean[idx]/num_vectors;
					else
					{
						float64_t theta_p=1/estimate->log_derivative_pos_obsolete(vec[j], j);
						float64_t theta_n=1/estimate->log_derivative_neg_obsolete(vec[j], j);
						float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));

						variance[idx]+=CMath::sq(value-mean[idx])/num_vectors;
					}
				}
			}
			l->free_feature_vector(vec, i, free_vec);
		}

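		// sum_m2_s2 collects sum_i mean_i^2/variance_i; compute() uses it as
		// the constant starting value of every kernel evaluation.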

		// compute sum_i mean_i^2/variance_i
		sum_m2_s2=0;
		for (i=0; i<num_params; i++)
		{
			if (variance[i]<1e-14) // then it is likely to be numerical inaccuracy
				variance[i]=1;

			//fprintf(stderr, "%i: mean=%1.2e std=%1.2e\n", i, mean[i], std[i]) ;
			sum_m2_s2+=mean[i]*mean[i]/variance[i];
		}
	}

	// precompute, for each lhs vector i, the linear term
	//   ld_mean_lhs[i] = -sum_j value_j*mean[idx_j]/variance[idx_j]

	for (i=0; i<l->get_num_vectors(); i++)
	{
		int32_t alen;
		bool free_avec;
		uint16_t* avec=l->get_feature_vector(i, alen, free_avec);
		float64_t result=0;
		for (int32_t j=0; j<alen; j++)
		{
			int32_t a_idx=compute_index(j, avec[j]);
			float64_t theta_p=1/estimate->log_derivative_pos_obsolete(avec[j], j);
			float64_t theta_n=1/estimate->log_derivative_neg_obsolete(avec[j], j);
			float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));

			if (variance[a_idx]!=0)
				result-=value*mean[a_idx]/variance[a_idx];
		}
		ld_mean_lhs[i]=result;

		l->free_feature_vector(avec, i, free_avec);
	}

	if (ld_mean_lhs!=ld_mean_rhs)
	{
		// precompute, for each rhs vector i, the linear term
		//   ld_mean_rhs[i] = -sum_j value_j*mean[idx_j]/variance[idx_j]
		for (i=0; i<r->get_num_vectors(); i++)
		{
			int32_t alen;
			bool free_avec;
			uint16_t* avec=r->get_feature_vector(i, alen, free_avec);
			float64_t result=0;

			for (int32_t j=0; j<alen; j++)
			{
				int32_t a_idx=compute_index(j, avec[j]);
				float64_t theta_p=1/estimate->log_derivative_pos_obsolete(avec[j], j);
				float64_t theta_n=1/estimate->log_derivative_neg_obsolete(avec[j], j);
				float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));

				result-=value*mean[a_idx]/variance[a_idx];
			}

			ld_mean_rhs[i]=result;
			r->free_feature_vector(avec, i, free_avec);
		}
	}

	// warning: hacky -- temporarily point lhs/rhs at l so that compute(i,i)
	// below uses the lhs statistics while the normalizers are filled in
	this->lhs=l;
	this->rhs=l;
	ld_mean_lhs=l_ld_mean_lhs;
	ld_mean_rhs=l_ld_mean_lhs;

	//compute normalize to 1 values
	for (i=0; i<lhs->get_num_vectors(); i++)
	{
		sqrtdiag_lhs[i]=sqrt(compute(i,i));

		//trap divide by zero exception
		if (sqrtdiag_lhs[i]==0)
			sqrtdiag_lhs[i]=1e-16;
	}

	// if lhs is different from rhs (train/test data)
	// compute also the normalization for rhs
	if (sqrtdiag_lhs!=sqrtdiag_rhs)
	{
		this->lhs=r;
		this->rhs=r;
		ld_mean_lhs=l_ld_mean_rhs;
		ld_mean_rhs=l_ld_mean_rhs;

		//compute normalize to 1 values
		for (i=0; i<rhs->get_num_vectors(); i++)
		{
			sqrtdiag_rhs[i]=sqrt(compute(i,i));

			//trap divide by zero exception
			if (sqrtdiag_rhs[i]==0)
				sqrtdiag_rhs[i]=1e-16;
		}
	}

	// restore the true lhs/rhs and cached linear terms
	this->lhs=l;
	this->rhs=r;
	ld_mean_lhs=l_ld_mean_lhs;
	ld_mean_rhs=l_ld_mean_rhs;

	initialized=true;
	return init_normalizer();
}


void CSalzbergWordStringKernel::cleanup()
{
	SG_FREE(variance);
	variance=NULL;

	SG_FREE(mean);
	mean=NULL;

	if (sqrtdiag_lhs != sqrtdiag_rhs)
		SG_FREE(sqrtdiag_rhs);
	sqrtdiag_rhs=NULL;

	SG_FREE(sqrtdiag_lhs);
	sqrtdiag_lhs=NULL;

	if (ld_mean_lhs!=ld_mean_rhs)
		SG_FREE(ld_mean_rhs);
	ld_mean_rhs=NULL;

	SG_FREE(ld_mean_lhs);
	ld_mean_lhs=NULL;

	CKernel::cleanup();
}

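// compute() evaluates the kernel between lhs vector idx_a and rhs vector
// idx_b: it starts from the constant sum_m2_s2, adds value^2/variance at
// every position where the two strings agree, adds the cached linear terms
// ld_mean_lhs[idx_a] and ld_mean_rhs[idx_b], and, once initialized, divides
// by sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b].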
float64_t CSalzbergWordStringKernel::compute(int32_t idx_a, int32_t idx_b)
{
	int32_t alen, blen;
	bool free_avec, free_bvec;
	uint16_t* avec=((CStringFeatures<uint16_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
	uint16_t* bvec=((CStringFeatures<uint16_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
	// can only deal with strings of same length
	ASSERT(alen==blen);

	float64_t result=sum_m2_s2; // does not contain 0-th element

	for (int32_t i=0; i<alen; i++)
	{
		if (avec[i]==bvec[i])
		{
			int32_t a_idx=compute_index(i, avec[i]);

			float64_t theta_p=1/estimate->log_derivative_pos_obsolete(avec[i], i);
			float64_t theta_n=1/estimate->log_derivative_neg_obsolete(avec[i], i);
			float64_t value=(theta_p/(pos_prior*theta_p+neg_prior*theta_n));

			result+=value*value/variance[a_idx];
		}
	}
	result+=ld_mean_lhs[idx_a]+ld_mean_rhs[idx_b];

	((CStringFeatures<uint16_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
	((CStringFeatures<uint16_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);

	if (initialized)
		result/=(sqrtdiag_lhs[idx_a]*sqrtdiag_rhs[idx_b]);

	return result;
}

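// Estimates the class priors as the relative frequencies of +1 and -1 labels;
// they enter the feature values above as pos_prior and neg_prior.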
void CSalzbergWordStringKernel::set_prior_probs_from_labels(CLabels* labels)
{
	ASSERT(labels);

	int32_t num_pos=0, num_neg=0;
	for (int32_t i=0; i<labels->get_num_labels(); i++)
	{
		if (labels->get_int_label(i)==1)
			num_pos++;
		if (labels->get_int_label(i)==-1)
			num_neg++;
	}

	SG_INFO("priors: pos=%1.3f (%i) neg=%1.3f (%i)\n",
		(float64_t) num_pos/(num_pos+num_neg), num_pos,
		(float64_t) num_neg/(num_pos+num_neg), num_neg);

	set_prior_probs(
		(float64_t)num_pos/(num_pos+num_neg),
		(float64_t)num_neg/(num_pos+num_neg));
}


void CSalzbergWordStringKernel::init()
{
	estimate=NULL;
	mean=NULL;
	variance=NULL;

	sqrtdiag_lhs=NULL;
	sqrtdiag_rhs=NULL;

	ld_mean_lhs=NULL;
	ld_mean_rhs=NULL;

	num_params=0;
	num_symbols=0;
	sum_m2_s2=0;
	pos_prior=0.5;
	neg_prior=0.5;
	initialized=false;
}
