00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026
00027
00028 #ifdef HAVE_BOOST_SERIALIZATION
00029 #include <boost/serialization/export.hpp>
00030 BOOST_CLASS_EXPORT(shogun::CWeightedDegreeStringKernel);
00031 #endif //HAVE_BOOST_SERIALIZATION
00032
00033 using namespace shogun;
00034
00035 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00036 struct S_THREAD_PARAM
00037 {
00038
00039 int32_t* vec;
00040 float64_t* result;
00041 float64_t* weights;
00042 CWeightedDegreeStringKernel* kernel;
00043 CTrie<DNATrie>* tries;
00044 float64_t factor;
00045 int32_t j;
00046 int32_t start;
00047 int32_t end;
00048 int32_t length;
00049 int32_t* vec_idx;
00050 };
00051 #endif // DOXYGEN_SHOULD_SKIP_THIS
00052
00053
00054 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00055 int32_t degree_, EWDKernType type_)
00056 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00057 weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00058 max_mismatch(0), seq_length(0), block_computation(true),
00059 num_block_weights_external(0), block_weights_external(NULL),
00060 block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00061 tree_initialized(false), alphabet(NULL)
00062 {
00063 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00064 lhs=NULL;
00065 rhs=NULL;
00066
00067 if (type!=E_EXTERNAL)
00068 set_wd_weights_by_type(type);
00069
00070 set_normalizer(new CFirstElementKernelNormalizer());
00071 }
00072
00073 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00074 float64_t *weights_, int32_t degree_)
00075 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00076 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00077 max_mismatch(0), seq_length(0), block_computation(true),
00078 num_block_weights_external(0), block_weights_external(NULL),
00079 block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00080 tree_initialized(false), alphabet(NULL)
00081 {
00082 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00083 lhs=NULL;
00084 rhs=NULL;
00085
00086 weights=new float64_t[degree*(1+max_mismatch)];
00087 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00088 weights[i]=weights_[i];
00089 set_normalizer(new CFirstElementKernelNormalizer());
00090 }
00091
00092 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00093 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00094 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00095 weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00096 max_mismatch(0), seq_length(0), block_computation(true),
00097 num_block_weights_external(0), block_weights_external(NULL),
00098 block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00099 tree_initialized(false), alphabet(NULL)
00100 {
00101 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00102
00103 set_wd_weights_by_type(type);
00104 set_normalizer(new CFirstElementKernelNormalizer());
00105 init(l, r);
00106 }
00107
00108 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00109 {
00110 cleanup();
00111
00112 delete[] weights;
00113 weights=NULL;
00114
00115 delete[] block_weights;
00116 block_weights=NULL;
00117
00118 delete[] position_weights;
00119 position_weights=NULL;
00120
00121 delete[] weights_buffer;
00122 weights_buffer=NULL;
00123 }
00124
00125
00126 void CWeightedDegreeStringKernel::remove_lhs()
00127 {
00128 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00129 delete_optimization();
00130
00131 if (tries!=NULL)
00132 tries->destroy();
00133
00134 CKernel::remove_lhs();
00135 }
00136
00137 void CWeightedDegreeStringKernel::create_empty_tries()
00138 {
00139 ASSERT(lhs);
00140
00141 seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00142
00143 if (tries!=NULL)
00144 {
00145 tries->destroy() ;
00146 tries->create(seq_length, max_mismatch==0) ;
00147 }
00148 }
00149
00150 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00151 {
00152 int32_t lhs_changed=(lhs!=l);
00153 int32_t rhs_changed=(rhs!=r);
00154
00155 CStringKernel<char>::init(l,r);
00156
00157 SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00158 SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00159
00160 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00161 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00162
00163 int32_t len=sf_l->get_max_vector_length();
00164 if (lhs_changed && !sf_l->have_same_length(len))
00165 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00166
00167 if (rhs_changed && !sf_r->have_same_length(len))
00168 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00169
00170 SG_UNREF(alphabet);
00171 alphabet=sf_l->get_alphabet();
00172 CAlphabet* ralphabet=sf_r->get_alphabet();
00173
00174 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00175 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00176
00177 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00178 SG_UNREF(ralphabet);
00179
00180 if (tries!=NULL) {
00181 tries->delete_trees(max_mismatch==0);
00182 SG_UNREF(tries);
00183 }
00184 tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00185 create_empty_tries();
00186
00187 init_block_weights();
00188
00189 return init_normalizer();
00190 }
00191
00192 void CWeightedDegreeStringKernel::cleanup()
00193 {
00194 SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00195 delete_optimization();
00196
00197 delete[] block_weights;
00198 block_weights=NULL;
00199
00200 if (tries!=NULL)
00201 {
00202 tries->destroy();
00203 SG_UNREF(tries);
00204 tries=NULL;
00205 }
00206
00207 seq_length=0;
00208 tree_initialized = false;
00209
00210 SG_UNREF(alphabet);
00211 alphabet=NULL;
00212
00213 CKernel::cleanup();
00214 }
00215
00216 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00217 {
00218 if (tree_num<0)
00219 SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00220
00221 delete_optimization();
00222
00223 if (tree_num<0)
00224 SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00225
00226 for (int32_t i=0; i<count; i++)
00227 {
00228 if (tree_num<0)
00229 {
00230 if ( (i % (count/10+1)) == 0)
00231 SG_PROGRESS(i, 0, count);
00232
00233 if (max_mismatch==0)
00234 add_example_to_tree(IDX[i], alphas[i]) ;
00235 else
00236 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00237
00238
00239 }
00240 else
00241 {
00242 if (max_mismatch==0)
00243 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00244 else
00245 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00246 }
00247 }
00248
00249 if (tree_num<0)
00250 SG_DONE();
00251
00252
00253
00254 set_is_initialized(true) ;
00255 return true ;
00256 }
00257
00258 bool CWeightedDegreeStringKernel::delete_optimization()
00259 {
00260 if (get_is_initialized())
00261 {
00262 if (tries!=NULL)
00263 tries->delete_trees(max_mismatch==0);
00264 set_is_initialized(false);
00265 return true;
00266 }
00267
00268 return false;
00269 }
00270
00271
00272 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00273 char* avec, int32_t alen, char* bvec, int32_t blen)
00274 {
00275 float64_t sum = 0.0;
00276
00277 for (int32_t i=0; i<alen; i++)
00278 {
00279 float64_t sumi = 0.0;
00280 int32_t mismatches=0;
00281
00282 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00283 {
00284 if (avec[i+j]!=bvec[i+j])
00285 {
00286 mismatches++ ;
00287 if (mismatches>max_mismatch)
00288 break ;
00289 } ;
00290 sumi += weights[j+degree*mismatches];
00291 }
00292 if (position_weights!=NULL)
00293 sum+=position_weights[i]*sumi ;
00294 else
00295 sum+=sumi ;
00296 }
00297 return sum ;
00298 }
00299
00300 float64_t CWeightedDegreeStringKernel::compute_using_block(
00301 char* avec, int32_t alen, char* bvec, int32_t blen)
00302 {
00303 ASSERT(alen==blen);
00304
00305 float64_t sum=0;
00306 int32_t match_len=-1;
00307
00308 for (int32_t i=0; i<alen; i++)
00309 {
00310 if (avec[i]==bvec[i])
00311 match_len++;
00312 else
00313 {
00314 if (match_len>=0)
00315 sum+=block_weights[match_len];
00316 match_len=-1;
00317 }
00318 }
00319
00320 if (match_len>=0)
00321 sum+=block_weights[match_len];
00322
00323 return sum;
00324 }
00325
00326 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00327 char* avec, int32_t alen, char* bvec, int32_t blen)
00328 {
00329 float64_t sum = 0.0;
00330
00331 for (int32_t i=0; i<alen; i++)
00332 {
00333 float64_t sumi = 0.0;
00334
00335 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00336 {
00337 if (avec[i+j]!=bvec[i+j])
00338 break ;
00339 sumi += weights[j];
00340 }
00341 if (position_weights!=NULL)
00342 sum+=position_weights[i]*sumi ;
00343 else
00344 sum+=sumi ;
00345 }
00346 return sum ;
00347 }
00348
00349 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00350 char* avec, int32_t alen, char* bvec, int32_t blen)
00351 {
00352 float64_t sum = 0.0;
00353
00354 for (int32_t i=0; i<alen; i++)
00355 {
00356 float64_t sumi=0.0;
00357 for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00358 {
00359 if (avec[i+j]!=bvec[i+j])
00360 break;
00361 sumi += weights[i*degree+j];
00362 }
00363 if (position_weights!=NULL)
00364 sum += position_weights[i]*sumi ;
00365 else
00366 sum += sumi ;
00367 }
00368
00369 return sum ;
00370 }
00371
00372
00373 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00374 {
00375 int32_t alen, blen;
00376 bool free_avec, free_bvec;
00377 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00378 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00379 float64_t result=0;
00380
00381 if (max_mismatch==0 && length==0 && block_computation)
00382 result=compute_using_block(avec, alen, bvec, blen);
00383 else
00384 {
00385 if (max_mismatch>0)
00386 result=compute_with_mismatch(avec, alen, bvec, blen);
00387 else if (length==0)
00388 result=compute_without_mismatch(avec, alen, bvec, blen);
00389 else
00390 result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00391 }
00392 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00393 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00394
00395 return result;
00396 }
00397
00398
00399 void CWeightedDegreeStringKernel::add_example_to_tree(
00400 int32_t idx, float64_t alpha)
00401 {
00402 ASSERT(alphabet);
00403 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00404
00405 int32_t len=0;
00406 bool free_vec;
00407 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00408 ASSERT(max_mismatch==0);
00409 int32_t *vec=new int32_t[len];
00410
00411 for (int32_t i=0; i<len; i++)
00412 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00413 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00414
00415 if (length == 0 || max_mismatch > 0)
00416 {
00417 for (int32_t i=0; i<len; i++)
00418 {
00419 float64_t alpha_pw=alpha;
00420
00421
00422 if (alpha_pw==0.0)
00423 continue;
00424 ASSERT(tries);
00425 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00426 }
00427 }
00428 else
00429 {
00430 for (int32_t i=0; i<len; i++)
00431 {
00432 float64_t alpha_pw=alpha;
00433
00434
00435 if (alpha_pw==0.0)
00436 continue ;
00437 ASSERT(tries);
00438 tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00439 }
00440 }
00441 delete[] vec ;
00442 tree_initialized=true ;
00443 }
00444
00445 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00446 int32_t idx, float64_t alpha, int32_t tree_num)
00447 {
00448 ASSERT(alphabet);
00449 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00450
00451 int32_t len;
00452 bool free_vec;
00453 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00454 ASSERT(max_mismatch==0);
00455 int32_t *vec = new int32_t[len] ;
00456
00457 for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00458 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00459 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00460
00461
00462 ASSERT(tries);
00463 if (alpha!=0.0)
00464 tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00465
00466 delete[] vec ;
00467 tree_initialized=true ;
00468 }
00469
00470 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00471 {
00472 ASSERT(tries);
00473 ASSERT(alphabet);
00474 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00475
00476 int32_t len ;
00477 bool free_vec;
00478 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00479
00480 int32_t *vec = new int32_t[len] ;
00481
00482 for (int32_t i=0; i<len; i++)
00483 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00484 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00485
00486 for (int32_t i=0; i<len; i++)
00487 {
00488 if (alpha!=0.0)
00489 tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00490 }
00491
00492 delete[] vec ;
00493 tree_initialized=true ;
00494 }
00495
00496 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00497 int32_t idx, float64_t alpha, int32_t tree_num)
00498 {
00499 ASSERT(tries);
00500 ASSERT(alphabet);
00501 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00502
00503 int32_t len=0;
00504 bool free_vec;
00505 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00506 int32_t *vec=new int32_t[len];
00507
00508 for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00509 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00510 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00511
00512 if (alpha!=0.0)
00513 {
00514 tries->add_example_to_tree_mismatch_recursion(
00515 NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00516 0, 0, max_mismatch, weights);
00517 }
00518
00519 delete[] vec;
00520 tree_initialized=true;
00521 }
00522
00523
00524 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00525 {
00526 ASSERT(alphabet);
00527 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00528
00529 int32_t len=0;
00530 bool free_vec;
00531 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00532 ASSERT(char_vec && len>0);
00533 int32_t *vec=new int32_t[len];
00534
00535 for (int32_t i=0; i<len; i++)
00536 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00537 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00538
00539 float64_t sum=0;
00540 ASSERT(tries);
00541 for (int32_t i=0; i<len; i++)
00542 sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00543
00544 delete[] vec;
00545 return normalizer->normalize_rhs(sum, idx);
00546 }
00547
00548 void CWeightedDegreeStringKernel::compute_by_tree(
00549 int32_t idx, float64_t* LevelContrib)
00550 {
00551 ASSERT(alphabet);
00552 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00553
00554 int32_t len ;
00555 bool free_vec;
00556 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00557
00558 int32_t *vec = new int32_t[len] ;
00559
00560 for (int32_t i=0; i<len; i++)
00561 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00562 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00563
00564 ASSERT(tries);
00565 for (int32_t i=0; i<len; i++)
00566 {
00567 tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00568 normalizer->normalize_rhs(1.0, idx),
00569 mkl_stepsize, weights, (length!=0));
00570 }
00571
00572 delete[] vec ;
00573 }
00574
00575 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00576 {
00577 ASSERT(tries);
00578 return tries->compute_abs_weights(len);
00579 }
00580
00581 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00582 {
00583 ASSERT(degree>0);
00584 ASSERT(p_type==E_WD);
00585
00586 delete[] weights;
00587 weights=new float64_t[degree];
00588 if (weights)
00589 {
00590 int32_t i;
00591 float64_t sum=0;
00592 for (i=0; i<degree; i++)
00593 {
00594 weights[i]=degree-i;
00595 sum+=weights[i];
00596 }
00597 for (i=0; i<degree; i++)
00598 weights[i]/=sum;
00599
00600 for (i=0; i<degree; i++)
00601 {
00602 for (int32_t j=1; j<=max_mismatch; j++)
00603 {
00604 if (j<i+1)
00605 {
00606 int32_t nk=CMath::nchoosek(i+1, j);
00607 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00608 }
00609 else
00610 weights[i+j*degree]= 0;
00611 }
00612 }
00613
00614 if (which_degree>=0)
00615 {
00616 ASSERT(which_degree<degree);
00617 for (i=0; i<degree; i++)
00618 {
00619 if (i!=which_degree)
00620 weights[i]=0;
00621 else
00622 weights[i]=1;
00623 }
00624 }
00625 return true;
00626 }
00627 else
00628 return false;
00629 }
00630
00631 bool CWeightedDegreeStringKernel::set_weights(
00632 float64_t* ws, int32_t d, int32_t len)
00633 {
00634 if (d!=degree || len<1)
00635 SG_ERROR("Dimension mismatch (should be de(seq_length | 1) x degree)\n");
00636
00637 length=len;
00638
00639 if (length==0)
00640 length=1;
00641
00642 int32_t num_weights=degree*(length+max_mismatch);
00643 delete[] weights;
00644 weights=new float64_t[num_weights];
00645
00646 if (weights)
00647 {
00648 for (int32_t i=0; i<num_weights; i++) {
00649 if (ws[i])
00650 weights[i]=ws[i];
00651 }
00652 return true;
00653 }
00654 else
00655 return false;
00656 }
00657
00658 bool CWeightedDegreeStringKernel::set_position_weights(
00659 float64_t* pws, int32_t len)
00660 {
00661 if (len==0)
00662 {
00663 delete[] position_weights;
00664 position_weights=NULL;
00665 ASSERT(tries);
00666 tries->set_position_weights(position_weights);
00667 }
00668
00669 if (seq_length!=len)
00670 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00671
00672 delete[] position_weights;
00673 position_weights=new float64_t[len];
00674 ASSERT(tries);
00675 tries->set_position_weights(position_weights);
00676
00677 if (position_weights)
00678 {
00679 for (int32_t i=0; i<len; i++)
00680 position_weights[i]=pws[i];
00681 return true;
00682 }
00683 else
00684 return false;
00685 }
00686
00687 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00688 {
00689 delete[] block_weights;
00690 block_weights=new float64_t[CMath::max(seq_length,degree)];
00691
00692 if (block_weights)
00693 {
00694 int32_t k;
00695 float64_t d=degree;
00696
00697 for (k=0; k<degree; k++)
00698 block_weights[k]=
00699 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00700 for (k=degree; k<seq_length; k++)
00701 block_weights[k]=(-d+3*k+4)/3;
00702 }
00703
00704 return (block_weights!=NULL);
00705 }
00706
00707 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00708 {
00709 ASSERT(weights);
00710 delete[] block_weights;
00711 block_weights=new float64_t[CMath::max(seq_length,degree)];
00712
00713 if (block_weights)
00714 {
00715 int32_t i=0;
00716 block_weights[0]=weights[0];
00717 for (i=1; i<CMath::max(seq_length,degree); i++)
00718 block_weights[i]=0;
00719
00720 for (i=1; i<CMath::max(seq_length,degree); i++)
00721 {
00722 block_weights[i]=block_weights[i-1];
00723
00724 float64_t contrib=0;
00725 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00726 contrib+=weights[j];
00727
00728 block_weights[i]+=contrib;
00729 }
00730 }
00731
00732 return (block_weights!=NULL);
00733 }
00734
00735 bool CWeightedDegreeStringKernel::init_block_weights_const()
00736 {
00737 delete[] block_weights;
00738 block_weights=new float64_t[seq_length];
00739
00740 if (block_weights)
00741 {
00742 for (int32_t i=1; i<seq_length+1 ; i++)
00743 block_weights[i-1]=1.0/seq_length;
00744 }
00745
00746 return (block_weights!=NULL);
00747 }
00748
00749 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00750 {
00751 delete[] block_weights;
00752 block_weights=new float64_t[seq_length];
00753
00754 if (block_weights)
00755 {
00756 for (int32_t i=1; i<seq_length+1 ; i++)
00757 block_weights[i-1]=degree*i;
00758 }
00759
00760 return (block_weights!=NULL);
00761 }
00762
00763 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00764 {
00765 delete[] block_weights;
00766 block_weights=new float64_t[seq_length];
00767
00768 if (block_weights)
00769 {
00770 for (int32_t i=1; i<degree+1 ; i++)
00771 block_weights[i-1]=((float64_t) i)*i;
00772
00773 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00774 block_weights[i-1]=i;
00775 }
00776
00777 return (block_weights!=NULL);
00778 }
00779
00780 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00781 {
00782 delete[] block_weights;
00783 block_weights=new float64_t[seq_length];
00784
00785 if (block_weights)
00786 {
00787 for (int32_t i=1; i<degree+1 ; i++)
00788 block_weights[i-1]=((float64_t) i)*i*i;
00789
00790 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00791 block_weights[i-1]=i;
00792 }
00793
00794 return (block_weights!=NULL);
00795 }
00796
00797 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00798 {
00799 delete[] block_weights;
00800 block_weights=new float64_t[seq_length];
00801
00802 if (block_weights)
00803 {
00804 for (int32_t i=1; i<degree+1 ; i++)
00805 block_weights[i-1]=exp(((float64_t) i/10.0));
00806
00807 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00808 block_weights[i-1]=i;
00809 }
00810
00811 return (block_weights!=NULL);
00812 }
00813
00814 bool CWeightedDegreeStringKernel::init_block_weights_log()
00815 {
00816 delete[] block_weights;
00817 block_weights=new float64_t[seq_length];
00818
00819 if (block_weights)
00820 {
00821 for (int32_t i=1; i<degree+1 ; i++)
00822 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00823
00824 for (int32_t i=degree+1; i<seq_length+1 ; i++)
00825 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00826 }
00827
00828 return (block_weights!=NULL);
00829 }
00830
00831 bool CWeightedDegreeStringKernel::init_block_weights_external()
00832 {
00833 if (block_weights_external && (seq_length == num_block_weights_external) )
00834 {
00835 delete[] block_weights;
00836 block_weights=new float64_t[seq_length];
00837
00838 if (block_weights)
00839 {
00840 for (int32_t i=0; i<seq_length; i++)
00841 block_weights[i]=block_weights_external[i];
00842 }
00843 }
00844 else {
00845 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00846 }
00847 return (block_weights!=NULL);
00848 }
00849
00850 bool CWeightedDegreeStringKernel::init_block_weights()
00851 {
00852 switch (type)
00853 {
00854 case E_WD:
00855 return init_block_weights_from_wd();
00856 case E_EXTERNAL:
00857 return init_block_weights_from_wd_external();
00858 case E_BLOCK_CONST:
00859 return init_block_weights_const();
00860 case E_BLOCK_LINEAR:
00861 return init_block_weights_linear();
00862 case E_BLOCK_SQPOLY:
00863 return init_block_weights_sqpoly();
00864 case E_BLOCK_CUBICPOLY:
00865 return init_block_weights_cubicpoly();
00866 case E_BLOCK_EXP:
00867 return init_block_weights_exp();
00868 case E_BLOCK_LOG:
00869 return init_block_weights_log();
00870 case E_BLOCK_EXTERNAL:
00871 return init_block_weights_external();
00872 default:
00873 return false;
00874 };
00875 }
00876
00877
00878 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00879 {
00880 S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00881 int32_t j=params->j;
00882 CWeightedDegreeStringKernel* wd=params->kernel;
00883 CTrie<DNATrie>* tries=params->tries;
00884 float64_t* weights=params->weights;
00885 int32_t length=params->length;
00886 int32_t* vec=params->vec;
00887 float64_t* result=params->result;
00888 float64_t factor=params->factor;
00889 int32_t* vec_idx=params->vec_idx;
00890
00891 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00892 CAlphabet* alpha=wd->alphabet;
00893
00894 for (int32_t i=params->start; i<params->end; i++)
00895 {
00896 int32_t len=0;
00897 bool free_vec;
00898 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00899 for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00900 vec[k]=alpha->remap_to_bin(char_vec[k]);
00901 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00902
00903 ASSERT(tries);
00904
00905 result[i]+=factor*
00906 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00907 }
00908
00909 SG_UNREF(rhs_feat);
00910
00911 return NULL;
00912 }
00913
00914 void CWeightedDegreeStringKernel::compute_batch(
00915 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00916 int32_t* IDX, float64_t* alphas, float64_t factor)
00917 {
00918 ASSERT(tries);
00919 ASSERT(alphabet);
00920 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00921 ASSERT(rhs);
00922 ASSERT(num_vec<=rhs->get_num_vectors());
00923 ASSERT(num_vec>0);
00924 ASSERT(vec_idx);
00925 ASSERT(result);
00926 create_empty_tries();
00927
00928 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00929 ASSERT(num_feat>0);
00930 int32_t num_threads=parallel->get_num_threads();
00931 ASSERT(num_threads>0);
00932 int32_t* vec=new int32_t[num_threads*num_feat];
00933
00934 if (num_threads < 2)
00935 {
00936 #ifdef CYGWIN
00937 for (int32_t j=0; j<num_feat; j++)
00938 #else
00939 CSignal::clear_cancel();
00940 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00941 #endif
00942 {
00943 init_optimization(num_suppvec, IDX, alphas, j);
00944 S_THREAD_PARAM params;
00945 params.vec=vec;
00946 params.result=result;
00947 params.weights=weights;
00948 params.kernel=this;
00949 params.tries=tries;
00950 params.factor=factor;
00951 params.j=j;
00952 params.start=0;
00953 params.end=num_vec;
00954 params.length=length;
00955 params.vec_idx=vec_idx;
00956 compute_batch_helper((void*) ¶ms);
00957
00958 SG_PROGRESS(j,0,num_feat);
00959 }
00960 }
00961 #ifndef WIN32
00962 else
00963 {
00964 CSignal::clear_cancel();
00965 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00966 {
00967 init_optimization(num_suppvec, IDX, alphas, j);
00968 pthread_t* threads = new pthread_t[num_threads-1];
00969 S_THREAD_PARAM* params = new S_THREAD_PARAM[num_threads];
00970 int32_t step= num_vec/num_threads;
00971 int32_t t;
00972
00973 for (t=0; t<num_threads-1; t++)
00974 {
00975 params[t].vec=&vec[num_feat*t];
00976 params[t].result=result;
00977 params[t].weights=weights;
00978 params[t].kernel=this;
00979 params[t].tries=tries;
00980 params[t].factor=factor;
00981 params[t].j=j;
00982 params[t].start = t*step;
00983 params[t].end = (t+1)*step;
00984 params[t].length=length;
00985 params[t].vec_idx=vec_idx;
00986 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)¶ms[t]);
00987 }
00988 params[t].vec=&vec[num_feat*t];
00989 params[t].result=result;
00990 params[t].weights=weights;
00991 params[t].kernel=this;
00992 params[t].tries=tries;
00993 params[t].factor=factor;
00994 params[t].j=j;
00995 params[t].start=t*step;
00996 params[t].end=num_vec;
00997 params[t].length=length;
00998 params[t].vec_idx=vec_idx;
00999 compute_batch_helper((void*) ¶ms[t]);
01000
01001 for (t=0; t<num_threads-1; t++)
01002 pthread_join(threads[t], NULL);
01003 SG_PROGRESS(j,0,num_feat);
01004
01005 delete[] params;
01006 delete[] threads;
01007 }
01008 }
01009 #endif
01010
01011 delete[] vec;
01012
01013
01014
01015 create_empty_tries();
01016 }
01017
01018 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
01019 {
01020 if (type==E_EXTERNAL && max!=0) {
01021 return false;
01022 }
01023
01024 max_mismatch=max;
01025
01026 if (lhs!=NULL && rhs!=NULL)
01027 return init(lhs, rhs);
01028 else
01029 return true;
01030 }