00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017
00018 #include "kernel/WeightedDegreePositionStringKernel.h"
00019 #include "kernel/SqrtDiagKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022
00023 #include "classifier/svm/SVM.h"
00024
00025 #ifndef WIN32
00026 #include <pthread.h>
00027 #endif
00028
00029 using namespace shogun;
00030
00031 #define TRIES(X) ((use_poim_tries) ? (poim_tries.X) : (tries.X))
00032
00033 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00034 template <class Trie> struct S_THREAD_PARAM
00035 {
00036 int32_t* vec;
00037 float64_t* result;
00038 float64_t* weights;
00039 CWeightedDegreePositionStringKernel* kernel;
00040 CTrie<Trie>* tries;
00041 float64_t factor;
00042 int32_t j;
00043 int32_t start;
00044 int32_t end;
00045 int32_t length;
00046 int32_t max_shift;
00047 int32_t* shift;
00048 int32_t* vec_idx;
00049 };
00050 #endif // DOXYGEN_SHOULD_SKIP_THIS
00051
00052 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00053 int32_t size, int32_t d, int32_t mm, int32_t mkls)
00054 : CStringKernel<char>(size), weights(NULL), position_weights(NULL),
00055 position_weights_lhs(NULL), position_weights_rhs(NULL),
00056 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00057 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00058 num_block_weights_external(0), block_weights_external(NULL),
00059 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00060 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00061 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00062 alphabet(NULL)
00063 {
00064 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00065 set_wd_weights();
00066 ASSERT(weights);
00067
00068 set_normalizer(new CSqrtDiagKernelNormalizer());
00069 }
00070
00071 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00072 int32_t size, float64_t* w, int32_t d, int32_t mm, int32_t* s, int32_t sl,
00073 int32_t mkls)
00074 : CStringKernel<char>(size), weights(NULL), position_weights(NULL),
00075 position_weights_lhs(NULL), position_weights_rhs(NULL),
00076 weights_buffer(NULL), mkl_stepsize(mkls), degree(d), length(0),
00077 max_mismatch(mm), seq_length(0), shift(NULL), shift_len(0),
00078 num_block_weights_external(0), block_weights_external(NULL),
00079 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00080 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00081 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00082 alphabet(NULL)
00083 {
00084 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00085
00086 weights=new float64_t[d*(1+max_mismatch)];
00087 for (int32_t i=0; i<d*(1+max_mismatch); i++)
00088 weights[i]=w[i];
00089
00090 set_shifts(s, sl);
00091
00092 set_normalizer(new CSqrtDiagKernelNormalizer());
00093 }
00094
00095 CWeightedDegreePositionStringKernel::CWeightedDegreePositionStringKernel(
00096 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t d)
00097 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00098 position_weights_lhs(NULL), position_weights_rhs(NULL),
00099 weights_buffer(NULL), mkl_stepsize(1), degree(d), length(0),
00100 max_mismatch(0), seq_length(0), shift(NULL), shift_len(0),
00101 num_block_weights_external(0), block_weights_external(NULL),
00102 block_weights(NULL), type(E_EXTERNAL), tries(d), poim_tries(d),
00103 tree_initialized(false), use_poim_tries(false), m_poim_distrib(NULL),
00104 m_poim(NULL), m_poim_num_sym(0), m_poim_num_feat(0), m_poim_result_len(0),
00105 alphabet(NULL)
00106 {
00107 properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00108 set_wd_weights();
00109 ASSERT(weights);
00110 set_normalizer(new CSqrtDiagKernelNormalizer());
00111
00112 init(l, r);
00113 }
00114
00115
00116 CWeightedDegreePositionStringKernel::~CWeightedDegreePositionStringKernel()
00117 {
00118 cleanup();
00119 cleanup_POIM2();
00120
00121 delete[] shift;
00122 shift=NULL;
00123
00124 delete[] weights;
00125 weights=NULL ;
00126
00127 delete[] block_weights;
00128 block_weights=NULL;
00129
00130 delete[] position_weights;
00131 position_weights=NULL;
00132
00133 delete[] position_weights_lhs;
00134 position_weights_lhs=NULL;
00135
00136 delete[] position_weights_rhs;
00137 position_weights_rhs=NULL;
00138
00139 delete[] weights_buffer;
00140 weights_buffer=NULL;
00141 }
00142
00143 void CWeightedDegreePositionStringKernel::remove_lhs()
00144 {
00145 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00146 delete_optimization();
00147
00148 tries.destroy();
00149 poim_tries.destroy();
00150
00151 CKernel::remove_lhs();
00152 }
00153
00154 void CWeightedDegreePositionStringKernel::create_empty_tries()
00155 {
00156 ASSERT(lhs);
00157 seq_length = ((CStringFeatures<char>*) lhs)->get_max_vector_length();
00158
00159 if (opt_type==SLOWBUTMEMEFFICIENT)
00160 {
00161 tries.create(seq_length, true);
00162 poim_tries.create(seq_length, true);
00163 }
00164 else if (opt_type==FASTBUTMEMHUNGRY)
00165 {
00166 tries.create(seq_length, false);
00167 poim_tries.create(seq_length, false);
00168 }
00169 else
00170 SG_ERROR( "unknown optimization type\n");
00171 }
00172
00173 bool CWeightedDegreePositionStringKernel::init(CFeatures* l, CFeatures* r)
00174 {
00175 int32_t lhs_changed = (lhs!=l) ;
00176 int32_t rhs_changed = (rhs!=r) ;
00177
00178 CStringKernel<char>::init(l,r);
00179
00180 SG_DEBUG( "lhs_changed: %i\n", lhs_changed) ;
00181 SG_DEBUG( "rhs_changed: %i\n", rhs_changed) ;
00182
00183 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00184 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00185
00186
00187 if (shift_len==0) {
00188 shift_len=sf_l->get_vector_length(0);
00189 int32_t *shifts=new int32_t[shift_len];
00190 for (int32_t i=0; i<shift_len; i++) {
00191 shifts[i]=1;
00192 }
00193 set_shifts(shifts, shift_len);
00194 delete[] shifts;
00195 }
00196
00197
00198 int32_t len=sf_l->get_max_vector_length();
00199 if (lhs_changed && !sf_l->have_same_length(len))
00200 SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00201
00202 if (rhs_changed && !sf_r->have_same_length(len))
00203 SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00204
00205 SG_UNREF(alphabet);
00206 alphabet= sf_l->get_alphabet();
00207 CAlphabet* ralphabet=sf_r->get_alphabet();
00208
00209 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00210 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00211
00212 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00213 SG_UNREF(ralphabet);
00214
00215
00216 create_empty_tries();
00217 init_block_weights();
00218
00219 return init_normalizer();
00220 }
00221
00222 void CWeightedDegreePositionStringKernel::cleanup()
00223 {
00224 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00225 delete_optimization();
00226
00227 delete[] block_weights;
00228 block_weights=NULL;
00229
00230 tries.destroy();
00231 poim_tries.destroy();
00232
00233 seq_length = 0;
00234 tree_initialized = false;
00235
00236 SG_UNREF(alphabet);
00237 alphabet=NULL;
00238
00239 CKernel::cleanup();
00240 }
00241
00242 bool CWeightedDegreePositionStringKernel::init_optimization(
00243 int32_t p_count, int32_t * IDX, float64_t * alphas, int32_t tree_num,
00244 int32_t upto_tree)
00245 {
00246 ASSERT(position_weights_lhs==NULL);
00247 ASSERT(position_weights_rhs==NULL);
00248
00249 if (upto_tree<0)
00250 upto_tree=tree_num;
00251
00252 if (max_mismatch!=0)
00253 {
00254 SG_ERROR( "CWeightedDegreePositionStringKernel optimization not implemented for mismatch!=0\n");
00255 return false ;
00256 }
00257
00258 if (tree_num<0)
00259 SG_DEBUG( "deleting CWeightedDegreePositionStringKernel optimization\n");
00260
00261 delete_optimization();
00262
00263 if (tree_num<0)
00264 SG_DEBUG( "initializing CWeightedDegreePositionStringKernel optimization\n") ;
00265
00266 for (int32_t i=0; i<p_count; i++)
00267 {
00268 if (tree_num<0)
00269 {
00270 if ( (i % (p_count/10+1)) == 0)
00271 SG_PROGRESS(i,0,p_count);
00272 add_example_to_tree(IDX[i], alphas[i]);
00273 }
00274 else
00275 {
00276 for (int32_t t=tree_num; t<=upto_tree; t++)
00277 add_example_to_single_tree(IDX[i], alphas[i], t);
00278 }
00279 }
00280
00281 if (tree_num<0)
00282 SG_DONE();
00283
00284 set_is_initialized(true) ;
00285 return true ;
00286 }
00287
00288 bool CWeightedDegreePositionStringKernel::delete_optimization()
00289 {
00290 if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00291 {
00292 tries.set_use_compact_terminal_nodes(false) ;
00293 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00294 }
00295
00296 if (get_is_initialized())
00297 {
00298 if (opt_type==SLOWBUTMEMEFFICIENT)
00299 tries.delete_trees(true);
00300 else if (opt_type==FASTBUTMEMHUNGRY)
00301 tries.delete_trees(false);
00302 else {
00303 SG_ERROR( "unknown optimization type\n");
00304 }
00305 set_is_initialized(false);
00306
00307 return true;
00308 }
00309
00310 return false;
00311 }
00312
00313 float64_t CWeightedDegreePositionStringKernel::compute_with_mismatch(
00314 char* avec, int32_t alen, char* bvec, int32_t blen)
00315 {
00316 float64_t* max_shift_vec= new float64_t[max_shift];
00317 float64_t sum0=0 ;
00318 for (int32_t i=0; i<max_shift; i++)
00319 max_shift_vec[i]=0 ;
00320
00321
00322 for (int32_t i=0; i<alen; i++)
00323 {
00324 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00325 continue ;
00326
00327 int32_t mismatches=0;
00328 float64_t sumi = 0.0 ;
00329 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00330 {
00331 if (avec[i+j]!=bvec[i+j])
00332 {
00333 mismatches++ ;
00334 if (mismatches>max_mismatch)
00335 break ;
00336 } ;
00337 sumi += weights[j+degree*mismatches];
00338 }
00339 if (position_weights!=NULL)
00340 sum0 += position_weights[i]*sumi ;
00341 else
00342 sum0 += sumi ;
00343 } ;
00344
00345 for (int32_t i=0; i<alen; i++)
00346 {
00347 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00348 {
00349 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00350 continue ;
00351
00352 float64_t sumi1 = 0.0 ;
00353
00354 int32_t mismatches=0;
00355 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00356 {
00357 if (avec[i+j+k]!=bvec[i+j])
00358 {
00359 mismatches++ ;
00360 if (mismatches>max_mismatch)
00361 break ;
00362 } ;
00363 sumi1 += weights[j+degree*mismatches];
00364 }
00365 float64_t sumi2 = 0.0 ;
00366
00367 mismatches=0;
00368 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00369 {
00370 if (avec[i+j]!=bvec[i+j+k])
00371 {
00372 mismatches++ ;
00373 if (mismatches>max_mismatch)
00374 break ;
00375 } ;
00376 sumi2 += weights[j+degree*mismatches];
00377 }
00378 if (position_weights!=NULL)
00379 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00380 else
00381 max_shift_vec[k-1] += sumi1 + sumi2 ;
00382 } ;
00383 }
00384
00385 float64_t result = sum0 ;
00386 for (int32_t i=0; i<max_shift; i++)
00387 result += max_shift_vec[i]/(2*(i+1)) ;
00388
00389 delete[] max_shift_vec;
00390 return result ;
00391 }
00392
00393 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch(
00394 char* avec, int32_t alen, char* bvec, int32_t blen)
00395 {
00396 float64_t* max_shift_vec = new float64_t[max_shift];
00397 float64_t sum0=0 ;
00398 for (int32_t i=0; i<max_shift; i++)
00399 max_shift_vec[i]=0 ;
00400
00401
00402 for (int32_t i=0; i<alen; i++)
00403 {
00404 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00405 continue ;
00406
00407 float64_t sumi = 0.0 ;
00408 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00409 {
00410 if (avec[i+j]!=bvec[i+j])
00411 break ;
00412 sumi += weights[j];
00413 }
00414 if (position_weights!=NULL)
00415 sum0 += position_weights[i]*sumi ;
00416 else
00417 sum0 += sumi ;
00418 } ;
00419
00420 for (int32_t i=0; i<alen; i++)
00421 {
00422 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00423 {
00424 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00425 continue ;
00426
00427 float64_t sumi1 = 0.0 ;
00428
00429 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00430 {
00431 if (avec[i+j+k]!=bvec[i+j])
00432 break ;
00433 sumi1 += weights[j];
00434 }
00435 float64_t sumi2 = 0.0 ;
00436
00437 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00438 {
00439 if (avec[i+j]!=bvec[i+j+k])
00440 break ;
00441 sumi2 += weights[j];
00442 }
00443 if (position_weights!=NULL)
00444 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00445 else
00446 max_shift_vec[k-1] += sumi1 + sumi2 ;
00447 } ;
00448 }
00449
00450 float64_t result = sum0 ;
00451 for (int32_t i=0; i<max_shift; i++)
00452 result += max_shift_vec[i]/(2*(i+1)) ;
00453
00454 delete[] max_shift_vec;
00455 return result ;
00456 }
00457
00458 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_matrix(
00459 char* avec, int32_t alen, char* bvec, int32_t blen)
00460 {
00461 float64_t* max_shift_vec = new float64_t[max_shift];
00462 float64_t sum0=0 ;
00463 for (int32_t i=0; i<max_shift; i++)
00464 max_shift_vec[i]=0 ;
00465
00466
00467 for (int32_t i=0; i<alen; i++)
00468 {
00469 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00470 continue ;
00471 float64_t sumi = 0.0 ;
00472 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00473 {
00474 if (avec[i+j]!=bvec[i+j])
00475 break ;
00476 sumi += weights[i*degree+j];
00477 }
00478 if (position_weights!=NULL)
00479 sum0 += position_weights[i]*sumi ;
00480 else
00481 sum0 += sumi ;
00482 } ;
00483
00484 for (int32_t i=0; i<alen; i++)
00485 {
00486 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00487 {
00488 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00489 continue ;
00490
00491 float64_t sumi1 = 0.0 ;
00492
00493 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00494 {
00495 if (avec[i+j+k]!=bvec[i+j])
00496 break ;
00497 sumi1 += weights[i*degree+j];
00498 }
00499 float64_t sumi2 = 0.0 ;
00500
00501 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00502 {
00503 if (avec[i+j]!=bvec[i+j+k])
00504 break ;
00505 sumi2 += weights[i*degree+j];
00506 }
00507 if (position_weights!=NULL)
00508 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00509 else
00510 max_shift_vec[k-1] += sumi1 + sumi2 ;
00511 } ;
00512 }
00513
00514 float64_t result = sum0 ;
00515 for (int32_t i=0; i<max_shift; i++)
00516 result += max_shift_vec[i]/(2*(i+1)) ;
00517
00518 delete[] max_shift_vec;
00519 return result ;
00520 }
00521
00522 float64_t CWeightedDegreePositionStringKernel::compute_without_mismatch_position_weights(
00523 char* avec, float64_t* pos_weights_lhs, int32_t alen, char* bvec,
00524 float64_t* pos_weights_rhs, int32_t blen)
00525 {
00526
00527 float64_t* max_shift_vec = new float64_t[max_shift];
00528 float64_t sum0=0 ;
00529 for (int32_t i=0; i<max_shift; i++)
00530 max_shift_vec[i]=0 ;
00531
00532
00533 for (int32_t i=0; i<alen; i++)
00534 {
00535 if ((position_weights!=NULL) && (position_weights[i]==0.0))
00536 continue ;
00537
00538 float64_t sumi = 0.0 ;
00539 float64_t posweight_lhs = 0.0 ;
00540 float64_t posweight_rhs = 0.0 ;
00541 for (int32_t j=0; (j<degree) && (i+j<alen); j++)
00542 {
00543 posweight_lhs += pos_weights_lhs[i+j] ;
00544 posweight_rhs += pos_weights_rhs[i+j] ;
00545
00546 if (avec[i+j]!=bvec[i+j])
00547 break ;
00548 sumi += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00549 }
00550 if (position_weights!=NULL)
00551 sum0 += position_weights[i]*sumi ;
00552 else
00553 sum0 += sumi ;
00554 } ;
00555
00556 for (int32_t i=0; i<alen; i++)
00557 {
00558 for (int32_t k=1; (k<=shift[i]) && (i+k<alen); k++)
00559 {
00560 if ((position_weights!=NULL) && (position_weights[i]==0.0) && (position_weights[i+k]==0.0))
00561 continue ;
00562
00563
00564 float64_t sumi1 = 0.0 ;
00565 float64_t posweight_lhs = 0.0 ;
00566 float64_t posweight_rhs = 0.0 ;
00567 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00568 {
00569 posweight_lhs += pos_weights_lhs[i+j+k] ;
00570 posweight_rhs += pos_weights_rhs[i+j] ;
00571 if (avec[i+j+k]!=bvec[i+j])
00572 break ;
00573 sumi1 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00574 }
00575
00576 float64_t sumi2 = 0.0 ;
00577 posweight_lhs = 0.0 ;
00578 posweight_rhs = 0.0 ;
00579 for (int32_t j=0; (j<degree) && (i+j+k<alen); j++)
00580 {
00581 posweight_lhs += pos_weights_lhs[i+j] ;
00582 posweight_rhs += pos_weights_rhs[i+j+k] ;
00583 if (avec[i+j]!=bvec[i+j+k])
00584 break ;
00585 sumi2 += weights[j]*(posweight_lhs/(j+1))*(posweight_rhs/(j+1)) ;
00586 }
00587 if (position_weights!=NULL)
00588 max_shift_vec[k-1] += position_weights[i]*sumi1 + position_weights[i+k]*sumi2 ;
00589 else
00590 max_shift_vec[k-1] += sumi1 + sumi2 ;
00591 } ;
00592 }
00593
00594 float64_t result = sum0 ;
00595 for (int32_t i=0; i<max_shift; i++)
00596 result += max_shift_vec[i]/(2*(i+1)) ;
00597
00598 delete[] max_shift_vec;
00599 return result ;
00600 }
00601
00602
00603 float64_t CWeightedDegreePositionStringKernel::compute(
00604 int32_t idx_a, int32_t idx_b)
00605 {
00606 int32_t alen, blen;
00607 bool free_avec, free_bvec;
00608
00609 char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00610 char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00611
00612 ASSERT(alen==blen);
00613 ASSERT(shift_len==alen);
00614
00615 float64_t result = 0 ;
00616 if (position_weights_lhs!=NULL || position_weights_rhs!=NULL)
00617 {
00618 ASSERT(max_mismatch==0);
00619 float64_t* position_weights_rhs_ = position_weights_rhs ;
00620 if (lhs==rhs)
00621 {
00622
00623 position_weights_rhs_ = position_weights_lhs ;
00624 }
00625 else
00626 {
00627
00628 }
00629
00630 result = compute_without_mismatch_position_weights(avec, &position_weights_lhs[idx_a*alen], alen, bvec, &position_weights_rhs_[idx_b*blen], blen) ;
00631 }
00632 else if (max_mismatch > 0)
00633 result = compute_with_mismatch(avec, alen, bvec, blen) ;
00634 else if (length==0)
00635 result = compute_without_mismatch(avec, alen, bvec, blen) ;
00636 else
00637 result = compute_without_mismatch_matrix(avec, alen, bvec, blen) ;
00638
00639 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00640 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00641
00642 return result ;
00643 }
00644
00645
00646 void CWeightedDegreePositionStringKernel::add_example_to_tree(
00647 int32_t idx, float64_t alpha)
00648 {
00649 ASSERT(position_weights_lhs==NULL);
00650 ASSERT(position_weights_rhs==NULL);
00651 ASSERT(alphabet);
00652 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00653
00654 int32_t len=0;
00655 bool free_vec;
00656 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00657 ASSERT(max_mismatch==0);
00658 int32_t *vec = new int32_t[len] ;
00659
00660 for (int32_t i=0; i<len; i++)
00661 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00662 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00663
00664 if (opt_type==FASTBUTMEMHUNGRY)
00665 {
00666
00667 ASSERT(!TRIES(get_use_compact_terminal_nodes()));
00668 }
00669
00670 for (int32_t i=0; i<len; i++)
00671 {
00672 int32_t max_s=-1;
00673
00674 if (opt_type==SLOWBUTMEMEFFICIENT)
00675 max_s=0;
00676 else if (opt_type==FASTBUTMEMHUNGRY)
00677 max_s=shift[i];
00678 else {
00679 SG_ERROR( "unknown optimization type\n");
00680 }
00681
00682 for (int32_t s=max_s; s>=0; s--)
00683 {
00684 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00685 TRIES(add_to_trie(i, s, vec, alpha_pw, weights, (length!=0))) ;
00686
00687
00688 if ((s==0) || (i+s>=len))
00689 continue;
00690
00691 TRIES(add_to_trie(i+s, -s, vec, alpha_pw, weights, (length!=0))) ;
00692
00693 }
00694 }
00695
00696 delete[] vec ;
00697 tree_initialized=true ;
00698 }
00699
00700 void CWeightedDegreePositionStringKernel::add_example_to_single_tree(
00701 int32_t idx, float64_t alpha, int32_t tree_num)
00702 {
00703 ASSERT(position_weights_lhs==NULL);
00704 ASSERT(position_weights_rhs==NULL);
00705 ASSERT(alphabet);
00706 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00707
00708 int32_t len=0;
00709 bool free_vec;
00710 char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00711 ASSERT(max_mismatch==0);
00712 int32_t *vec=new int32_t[len];
00713 int32_t max_s=-1;
00714
00715 if (opt_type==SLOWBUTMEMEFFICIENT)
00716 max_s=0;
00717 else if (opt_type==FASTBUTMEMHUNGRY)
00718 {
00719 ASSERT(!tries.get_use_compact_terminal_nodes());
00720 max_s=shift[tree_num];
00721 }
00722 else {
00723 SG_ERROR( "unknown optimization type\n");
00724 }
00725 for (int32_t i=CMath::max(0,tree_num-max_shift);
00726 i<CMath::min(len,tree_num+degree+max_shift); i++)
00727 {
00728 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00729 }
00730 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00731
00732 for (int32_t s=max_s; s>=0; s--)
00733 {
00734 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00735 tries.add_to_trie(tree_num, s, vec, alpha_pw, weights, (length!=0)) ;
00736
00737 }
00738
00739 if (opt_type==FASTBUTMEMHUNGRY)
00740 {
00741 for (int32_t i=CMath::max(0,tree_num-max_shift); i<CMath::min(len,tree_num+max_shift+1); i++)
00742 {
00743 int32_t s=tree_num-i;
00744 if ((i+s<len) && (s>=1) && (s<=shift[i]))
00745 {
00746 float64_t alpha_pw = normalizer->normalize_lhs((s==0) ? (alpha) : (alpha/(2.0*s)), idx);
00747 tries.add_to_trie(tree_num, -s, vec, alpha_pw, weights, (length!=0)) ;
00748
00749 }
00750 }
00751 }
00752 delete[] vec ;
00753 tree_initialized=true ;
00754 }
00755
00756 float64_t CWeightedDegreePositionStringKernel::compute_by_tree(int32_t idx)
00757 {
00758 ASSERT(position_weights_lhs==NULL);
00759 ASSERT(position_weights_rhs==NULL);
00760 ASSERT(alphabet);
00761 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00762
00763 float64_t sum=0;
00764 int32_t len=0;
00765 bool free_vec;
00766 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00767 ASSERT(max_mismatch==0);
00768 int32_t *vec=new int32_t[len];
00769
00770 for (int32_t i=0; i<len; i++)
00771 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00772
00773 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00774
00775 for (int32_t i=0; i<len; i++)
00776 sum += tries.compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0)) ;
00777
00778 if (opt_type==SLOWBUTMEMEFFICIENT)
00779 {
00780 for (int32_t i=0; i<len; i++)
00781 {
00782 for (int32_t s=1; (s<=shift[i]) && (i+s<len); s++)
00783 {
00784 sum+=tries.compute_by_tree_helper(vec, len, i, i+s, i, weights, (length!=0))/(2*s) ;
00785 sum+=tries.compute_by_tree_helper(vec, len, i+s, i, i+s, weights, (length!=0))/(2*s) ;
00786 }
00787 }
00788 }
00789
00790 delete[] vec ;
00791
00792 return normalizer->normalize_rhs(sum, idx);
00793 }
00794
00795 void CWeightedDegreePositionStringKernel::compute_by_tree(
00796 int32_t idx, float64_t* LevelContrib)
00797 {
00798 ASSERT(position_weights_lhs==NULL);
00799 ASSERT(position_weights_rhs==NULL);
00800 ASSERT(alphabet);
00801 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00802
00803 int32_t len=0;
00804 bool free_vec;
00805 char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00806 ASSERT(max_mismatch==0);
00807 int32_t *vec=new int32_t[len];
00808
00809 for (int32_t i=0; i<len; i++)
00810 vec[i]=alphabet->remap_to_bin(char_vec[i]);
00811
00812 ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00813
00814 for (int32_t i=0; i<len; i++)
00815 {
00816 tries.compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00817 normalizer->normalize_rhs(1.0, idx), mkl_stepsize, weights,
00818 (length!=0));
00819 }
00820
00821 if (opt_type==SLOWBUTMEMEFFICIENT)
00822 {
00823 for (int32_t i=0; i<len; i++)
00824 for (int32_t k=1; (k<=shift[i]) && (i+k<len); k++)
00825 {
00826 tries.compute_by_tree_helper(vec, len, i, i+k, i, LevelContrib,
00827 normalizer->normalize_rhs(1.0/(2*k), idx), mkl_stepsize,
00828 weights, (length!=0)) ;
00829 tries.compute_by_tree_helper(vec, len, i+k, i, i+k,
00830 LevelContrib, normalizer->normalize_rhs(1.0/(2*k), idx),
00831 mkl_stepsize, weights, (length!=0)) ;
00832 }
00833 }
00834
00835 delete[] vec ;
00836 }
00837
00838 float64_t* CWeightedDegreePositionStringKernel::compute_abs_weights(
00839 int32_t &len)
00840 {
00841 return tries.compute_abs_weights(len);
00842 }
00843
00844 bool CWeightedDegreePositionStringKernel::set_shifts(
00845 int32_t* shift_, int32_t shift_len_)
00846 {
00847 delete[] shift;
00848
00849 shift_len = shift_len_ ;
00850 shift = new int32_t[shift_len] ;
00851
00852 if (shift)
00853 {
00854 max_shift = 0 ;
00855
00856 for (int32_t i=0; i<shift_len; i++)
00857 {
00858 shift[i] = shift_[i] ;
00859 if (shift[i]>max_shift)
00860 max_shift = shift[i] ;
00861 }
00862
00863 ASSERT(max_shift>=0 && max_shift<=shift_len);
00864 }
00865
00866 return false;
00867 }
00868
00869 bool CWeightedDegreePositionStringKernel::set_wd_weights()
00870 {
00871 ASSERT(degree>0);
00872
00873 delete[] weights;
00874 weights=new float64_t[degree];
00875 if (weights)
00876 {
00877 int32_t i;
00878 float64_t sum=0;
00879 for (i=0; i<degree; i++)
00880 {
00881 weights[i]=degree-i;
00882 sum+=weights[i];
00883 }
00884 for (i=0; i<degree; i++)
00885 weights[i]/=sum;
00886
00887 for (i=0; i<degree; i++)
00888 {
00889 for (int32_t j=1; j<=max_mismatch; j++)
00890 {
00891 if (j<i+1)
00892 {
00893 int32_t nk=CMath::nchoosek(i+1, j);
00894 weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00895 }
00896 else
00897 weights[i+j*degree]= 0;
00898 }
00899 }
00900
00901 return true;
00902 }
00903 else
00904 return false;
00905 }
00906
00907 bool CWeightedDegreePositionStringKernel::set_weights(
00908 float64_t* ws, int32_t d, int32_t len)
00909 {
00910 SG_DEBUG("degree = %i d=%i\n", degree, d);
00911 degree=d;
00912 length=len;
00913
00914 if (len <= 0)
00915 len=1;
00916
00917 delete[] weights;
00918 weights=new float64_t[d*len];
00919
00920 if (weights)
00921 {
00922 for (int32_t i=0; i<degree*len; i++)
00923 weights[i]=ws[i];
00924 return true;
00925 }
00926 else
00927 return false;
00928 }
00929
00930 bool CWeightedDegreePositionStringKernel::set_position_weights(
00931 float64_t* pws, int32_t len)
00932 {
00933 if (seq_length==0)
00934 seq_length=len;
00935
00936 if (seq_length!=len)
00937 {
00938 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00939 return false;
00940 }
00941 delete[] position_weights;
00942 position_weights=new float64_t[len];
00943 tries.set_position_weights(position_weights);
00944
00945 if (position_weights)
00946 {
00947 for (int32_t i=0; i<len; i++)
00948 position_weights[i]=pws[i];
00949 return true;
00950 }
00951 else
00952 return false;
00953 }
00954
00955 bool CWeightedDegreePositionStringKernel::set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num)
00956 {
00957 fprintf(stderr, "set_position_weights_lhs %i %i %i\n", len, num, seq_length);
00958
00959 if (position_weights_rhs==position_weights_lhs)
00960 position_weights_rhs=NULL;
00961 else
00962 delete_position_weights_rhs();
00963
00964 if (len==0)
00965 {
00966 return delete_position_weights_lhs();
00967 }
00968
00969 if (seq_length!=len)
00970 {
00971 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00972 return false;
00973 }
00974 if (0)
00975 {
00976
00977 if (!lhs)
00978 {
00979 SG_ERROR("lhs=NULL\n");
00980 return false ;
00981 }
00982 if (lhs->get_num_vectors()!=num)
00983 {
00984 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num);
00985 return false;
00986 }
00987 }
00988
00989 delete[] position_weights_lhs;
00990 position_weights_lhs=new float64_t[len*num];
00991 if (position_weights_lhs)
00992 {
00993 for (int32_t i=0; i<len*num; i++)
00994 position_weights_lhs[i]=pws[i];
00995 return true;
00996 }
00997 else
00998 return false;
00999 }
01000
01001 bool CWeightedDegreePositionStringKernel::set_position_weights_rhs(
01002 float64_t* pws, int32_t len, int32_t num)
01003 {
01004 fprintf(stderr, "set_position_weights_rhs %i %i %i\n", len, num, seq_length);
01005 if (len==0)
01006 {
01007 if (position_weights_rhs==position_weights_lhs)
01008 {
01009 position_weights_rhs=NULL;
01010 return true;
01011 }
01012 return delete_position_weights_rhs();
01013 }
01014
01015 if (seq_length!=len)
01016 {
01017 SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
01018 return false;
01019 }
01020 if (0)
01021 {
01022
01023 if (!rhs)
01024 {
01025 if (!lhs)
01026 {
01027 SG_ERROR("rhs=NULL and lhs=NULL\n");
01028 return false;
01029 }
01030 if (lhs->get_num_vectors()!=num)
01031 {
01032 SG_ERROR("lhs->get_num_vectors()=%i, num=%i\n", lhs->get_num_vectors(), num);
01033 return false;
01034 }
01035 } else
01036 if (rhs->get_num_vectors()!=num)
01037 {
01038 SG_ERROR("rhs->get_num_vectors()=%i, num=%i\n", rhs->get_num_vectors(), num);
01039 return false;
01040 }
01041 }
01042
01043 delete[] position_weights_rhs;
01044 position_weights_rhs=new float64_t[len*num];
01045 if (position_weights_rhs)
01046 {
01047 for (int32_t i=0; i<len*num; i++)
01048 position_weights_rhs[i]=pws[i];
01049 return true;
01050 }
01051 else
01052 return false;
01053 }
01054
01055 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd()
01056 {
01057 delete[] block_weights;
01058 block_weights=new float64_t[CMath::max(seq_length,degree)];
01059
01060 if (block_weights)
01061 {
01062 int32_t k;
01063 float64_t d=degree;
01064
01065 for (k=0; k<degree; k++)
01066 block_weights[k]=
01067 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
01068 for (k=degree; k<seq_length; k++)
01069 block_weights[k]=(-d+3*k+4)/3;
01070 }
01071
01072 return (block_weights!=NULL);
01073 }
01074
01075 bool CWeightedDegreePositionStringKernel::init_block_weights_from_wd_external()
01076 {
01077 ASSERT(weights);
01078 delete[] block_weights;
01079 block_weights=new float64_t[CMath::max(seq_length,degree)];
01080
01081 if (block_weights)
01082 {
01083 int32_t i=0;
01084 block_weights[0]=weights[0];
01085 for (i=1; i<CMath::max(seq_length,degree); i++)
01086 block_weights[i]=0;
01087
01088 for (i=1; i<CMath::max(seq_length,degree); i++)
01089 {
01090 block_weights[i]=block_weights[i-1];
01091
01092 float64_t contrib=0;
01093 for (int32_t j=0; j<CMath::min(degree,i+1); j++)
01094 contrib+=weights[j];
01095
01096 block_weights[i]+=contrib;
01097 }
01098 }
01099
01100 return (block_weights!=NULL);
01101 }
01102
01103 bool CWeightedDegreePositionStringKernel::init_block_weights_const()
01104 {
01105 delete[] block_weights;
01106 block_weights=new float64_t[seq_length];
01107
01108 if (block_weights)
01109 {
01110 for (int32_t i=1; i<seq_length+1 ; i++)
01111 block_weights[i-1]=1.0/seq_length;
01112 }
01113
01114 return (block_weights!=NULL);
01115 }
01116
01117 bool CWeightedDegreePositionStringKernel::init_block_weights_linear()
01118 {
01119 delete[] block_weights;
01120 block_weights=new float64_t[seq_length];
01121
01122 if (block_weights)
01123 {
01124 for (int32_t i=1; i<seq_length+1 ; i++)
01125 block_weights[i-1]=degree*i;
01126 }
01127
01128 return (block_weights!=NULL);
01129 }
01130
01131 bool CWeightedDegreePositionStringKernel::init_block_weights_sqpoly()
01132 {
01133 delete[] block_weights;
01134 block_weights=new float64_t[seq_length];
01135
01136 if (block_weights)
01137 {
01138 for (int32_t i=1; i<degree+1 ; i++)
01139 block_weights[i-1]=((float64_t) i)*i;
01140
01141 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01142 block_weights[i-1]=i;
01143 }
01144
01145 return (block_weights!=NULL);
01146 }
01147
01148 bool CWeightedDegreePositionStringKernel::init_block_weights_cubicpoly()
01149 {
01150 delete[] block_weights;
01151 block_weights=new float64_t[seq_length];
01152
01153 if (block_weights)
01154 {
01155 for (int32_t i=1; i<degree+1 ; i++)
01156 block_weights[i-1]=((float64_t) i)*i*i;
01157
01158 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01159 block_weights[i-1]=i;
01160 }
01161
01162 return (block_weights!=NULL);
01163 }
01164
01165 bool CWeightedDegreePositionStringKernel::init_block_weights_exp()
01166 {
01167 delete[] block_weights;
01168 block_weights=new float64_t[seq_length];
01169
01170 if (block_weights)
01171 {
01172 for (int32_t i=1; i<degree+1 ; i++)
01173 block_weights[i-1]=exp(((float64_t) i/10.0));
01174
01175 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01176 block_weights[i-1]=i;
01177 }
01178
01179 return (block_weights!=NULL);
01180 }
01181
01182 bool CWeightedDegreePositionStringKernel::init_block_weights_log()
01183 {
01184 delete[] block_weights;
01185 block_weights=new float64_t[seq_length];
01186
01187 if (block_weights)
01188 {
01189 for (int32_t i=1; i<degree+1 ; i++)
01190 block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
01191
01192 for (int32_t i=degree+1; i<seq_length+1 ; i++)
01193 block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
01194 }
01195
01196 return (block_weights!=NULL);
01197 }
01198
01199 bool CWeightedDegreePositionStringKernel::init_block_weights_external()
01200 {
01201 if (block_weights_external && (seq_length == num_block_weights_external) )
01202 {
01203 delete[] block_weights;
01204 block_weights=new float64_t[seq_length];
01205
01206 if (block_weights)
01207 {
01208 for (int32_t i=0; i<seq_length; i++)
01209 block_weights[i]=block_weights_external[i];
01210 }
01211 }
01212 else {
01213 SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
01214 }
01215 return (block_weights!=NULL);
01216 }
01217
01218 bool CWeightedDegreePositionStringKernel::init_block_weights()
01219 {
01220 switch (type)
01221 {
01222 case E_WD:
01223 return init_block_weights_from_wd();
01224 case E_EXTERNAL:
01225 return init_block_weights_from_wd_external();
01226 case E_BLOCK_CONST:
01227 return init_block_weights_const();
01228 case E_BLOCK_LINEAR:
01229 return init_block_weights_linear();
01230 case E_BLOCK_SQPOLY:
01231 return init_block_weights_sqpoly();
01232 case E_BLOCK_CUBICPOLY:
01233 return init_block_weights_cubicpoly();
01234 case E_BLOCK_EXP:
01235 return init_block_weights_exp();
01236 case E_BLOCK_LOG:
01237 return init_block_weights_log();
01238 case E_BLOCK_EXTERNAL:
01239 return init_block_weights_external();
01240 default:
01241 return false;
01242 };
01243 }
01244
01245
01246
01247 void* CWeightedDegreePositionStringKernel::compute_batch_helper(void* p)
01248 {
01249 S_THREAD_PARAM<DNATrie>* params = (S_THREAD_PARAM<DNATrie>*) p;
01250 int32_t j=params->j;
01251 CWeightedDegreePositionStringKernel* wd=params->kernel;
01252 CTrie<DNATrie>* tries=params->tries;
01253 float64_t* weights=params->weights;
01254 int32_t length=params->length;
01255 int32_t max_shift=params->max_shift;
01256 int32_t* vec=params->vec;
01257 float64_t* result=params->result;
01258 float64_t factor=params->factor;
01259 int32_t* shift=params->shift;
01260 int32_t* vec_idx=params->vec_idx;
01261
01262 for (int32_t i=params->start; i<params->end; i++)
01263 {
01264 int32_t len=0;
01265 CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
01266 CAlphabet* alpha=wd->alphabet;
01267
01268 bool free_vec;
01269 char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
01270 for (int32_t k=CMath::max(0,j-max_shift); k<CMath::min(len,j+wd->get_degree()+max_shift); k++)
01271 vec[k]=alpha->remap_to_bin(char_vec[k]);
01272 rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
01273
01274 SG_UNREF(rhs_feat);
01275
01276 result[i] += factor*wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
01277
01278 if (wd->get_optimization_type()==SLOWBUTMEMEFFICIENT)
01279 {
01280 for (int32_t q=CMath::max(0,j-max_shift); q<CMath::min(len,j+max_shift+1); q++)
01281 {
01282 int32_t s=j-q ;
01283 if ((s>=1) && (s<=shift[q]) && (q+s<len))
01284 {
01285 result[i] +=
01286 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01287 len, q, q+s, q, weights, (length!=0)),
01288 vec_idx[i])/(2.0*s);
01289 }
01290 }
01291
01292 for (int32_t s=1; (s<=shift[j]) && (j+s<len); s++)
01293 {
01294 result[i] +=
01295 wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec,
01296 len, j+s, j, j+s, weights, (length!=0)),
01297 vec_idx[i])/(2.0*s);
01298 }
01299 }
01300 }
01301
01302 return NULL;
01303 }
01304
01305 void CWeightedDegreePositionStringKernel::compute_batch(
01306 int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
01307 int32_t* IDX, float64_t* alphas, float64_t factor)
01308 {
01309 ASSERT(alphabet);
01310 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01311 ASSERT(position_weights_lhs==NULL);
01312 ASSERT(position_weights_rhs==NULL);
01313 ASSERT(rhs);
01314 ASSERT(num_vec<=rhs->get_num_vectors());
01315 ASSERT(num_vec>0);
01316 ASSERT(vec_idx);
01317 ASSERT(result);
01318 create_empty_tries();
01319
01320 int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01321 ASSERT(num_feat>0);
01322 int32_t num_threads=parallel->get_num_threads();
01323 ASSERT(num_threads>0);
01324 int32_t* vec=new int32_t[num_threads*num_feat];
01325
01326 if (num_threads < 2)
01327 {
01328 #ifdef WIN32
01329 for (int32_t j=0; j<num_feat; j++)
01330 #else
01331 CSignal::clear_cancel();
01332 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01333 #endif
01334 {
01335 init_optimization(num_suppvec, IDX, alphas, j);
01336 S_THREAD_PARAM<DNATrie> params;
01337 params.vec=vec;
01338 params.result=result;
01339 params.weights=weights;
01340 params.kernel=this;
01341 params.tries=&tries;
01342 params.factor=factor;
01343 params.j=j;
01344 params.start=0;
01345 params.end=num_vec;
01346 params.length=length;
01347 params.max_shift=max_shift;
01348 params.shift=shift;
01349 params.vec_idx=vec_idx;
01350 compute_batch_helper((void*) ¶ms);
01351
01352 SG_PROGRESS(j,0,num_feat);
01353 }
01354 }
01355 #ifndef WIN32
01356 else
01357 {
01358
01359 CSignal::clear_cancel();
01360 for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
01361 {
01362 init_optimization(num_suppvec, IDX, alphas, j);
01363 pthread_t* threads = new pthread_t[num_threads-1];
01364 S_THREAD_PARAM<DNATrie>* params = new S_THREAD_PARAM<DNATrie>[num_threads];
01365 int32_t step= num_vec/num_threads;
01366 int32_t t;
01367
01368 for (t=0; t<num_threads-1; t++)
01369 {
01370 params[t].vec=&vec[num_feat*t];
01371 params[t].result=result;
01372 params[t].weights=weights;
01373 params[t].kernel=this;
01374 params[t].tries=&tries;
01375 params[t].factor=factor;
01376 params[t].j=j;
01377 params[t].start = t*step;
01378 params[t].end = (t+1)*step;
01379 params[t].length=length;
01380 params[t].max_shift=max_shift;
01381 params[t].shift=shift;
01382 params[t].vec_idx=vec_idx;
01383 pthread_create(&threads[t], NULL, CWeightedDegreePositionStringKernel::compute_batch_helper, (void*)¶ms[t]);
01384 }
01385
01386 params[t].vec=&vec[num_feat*t];
01387 params[t].result=result;
01388 params[t].weights=weights;
01389 params[t].kernel=this;
01390 params[t].tries=&tries;
01391 params[t].factor=factor;
01392 params[t].j=j;
01393 params[t].start=t*step;
01394 params[t].end=num_vec;
01395 params[t].length=length;
01396 params[t].max_shift=max_shift;
01397 params[t].shift=shift;
01398 params[t].vec_idx=vec_idx;
01399 compute_batch_helper((void*) ¶ms[t]);
01400
01401 for (t=0; t<num_threads-1; t++)
01402 pthread_join(threads[t], NULL);
01403 SG_PROGRESS(j,0,num_feat);
01404
01405 delete[] params;
01406 delete[] threads;
01407 }
01408 }
01409 #endif
01410
01411 delete[] vec;
01412
01413
01414
01415 create_empty_tries();
01416 }
01417
01418 float64_t* CWeightedDegreePositionStringKernel::compute_scoring(
01419 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* result,
01420 int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01421 {
01422 ASSERT(position_weights_lhs==NULL);
01423 ASSERT(position_weights_rhs==NULL);
01424
01425 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01426 ASSERT(num_feat>0);
01427 ASSERT(alphabet);
01428 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01429
01430 num_sym=4;
01431
01432 ASSERT(max_degree>0);
01433
01434
01435 int32_t* nofsKmers=new int32_t[max_degree];
01436 float64_t** C=new float64_t*[max_degree];
01437 float64_t** L=new float64_t*[max_degree];
01438 float64_t** R=new float64_t*[max_degree];
01439
01440 int32_t i;
01441 int32_t k;
01442
01443
01444 int32_t bigtabSize=0;
01445 for (k=0; k<max_degree; ++k )
01446 {
01447 nofsKmers[k]=(int32_t) CMath::pow(num_sym, k+1);
01448 const int32_t tabSize=nofsKmers[k]*num_feat;
01449 bigtabSize+=tabSize;
01450 }
01451 result=new float64_t[bigtabSize];
01452
01453
01454 int32_t tabOffs=0;
01455 for( k = 0; k < max_degree; ++k )
01456 {
01457 const int32_t tabSize = nofsKmers[k] * num_feat;
01458 C[k] = &result[tabOffs];
01459 L[k] = new float64_t[ tabSize ];
01460 R[k] = new float64_t[ tabSize ];
01461 tabOffs+=tabSize;
01462 for(i = 0; i < tabSize; i++ )
01463 {
01464 C[k][i] = 0.0;
01465 L[k][i] = 0.0;
01466 R[k][i] = 0.0;
01467 }
01468 }
01469
01470
01471 float64_t* margFactors=new float64_t[degree];
01472
01473 int32_t* x = new int32_t[ degree+1 ];
01474 int32_t* substrs = new int32_t[ degree+1 ];
01475
01476 margFactors[0] = 1.0;
01477 substrs[0] = 0;
01478 for( k=1; k < degree; ++k ) {
01479 margFactors[k] = 0.25 * margFactors[k-1];
01480 substrs[k] = -1;
01481 }
01482 substrs[degree] = -1;
01483
01484 struct TreeParseInfo info;
01485 info.num_sym = num_sym;
01486 info.num_feat = num_feat;
01487 info.p = -1;
01488 info.k = -1;
01489 info.nofsKmers = nofsKmers;
01490 info.margFactors = margFactors;
01491 info.x = x;
01492 info.substrs = substrs;
01493 info.y0 = 0;
01494 info.C_k = NULL;
01495 info.L_k = NULL;
01496 info.R_k = NULL;
01497
01498
01499 i = 0;
01500 for( k = 0; k < max_degree; ++k )
01501 {
01502 const int32_t nofKmers = nofsKmers[ k ];
01503 info.C_k = C[k];
01504 info.L_k = L[k];
01505 info.R_k = R[k];
01506
01507
01508 for(int32_t p = 0; p < num_feat; ++p )
01509 {
01510 init_optimization( num_suppvec, IDX, alphas, p );
01511 int32_t tree = p ;
01512 for(int32_t j = 0; j < degree+1; j++ ) {
01513 x[j] = -1;
01514 }
01515 tries.traverse( tree, p, info, 0, x, k );
01516 SG_PROGRESS(i++,0,num_feat*max_degree);
01517 }
01518
01519
01520 if( k > 0 ) {
01521 const int32_t j = k - 1;
01522 const int32_t nofJmers = (int32_t) CMath::pow( num_sym, j+1 );
01523 for(int32_t p = 0; p < num_feat; ++p ) {
01524 const int32_t offsetJ = nofJmers * p;
01525 const int32_t offsetJ1 = nofJmers * (p+1);
01526 const int32_t offsetK = nofKmers * p;
01527 int32_t y;
01528 int32_t sym;
01529 for( y = 0; y < nofJmers; ++y ) {
01530 for( sym = 0; sym < num_sym; ++sym ) {
01531 const int32_t y_sym = num_sym*y + sym;
01532 const int32_t sym_y = nofJmers*sym + y;
01533 ASSERT(0<=y_sym && y_sym<nofKmers);
01534 ASSERT(0<=sym_y && sym_y<nofKmers);
01535 C[k][ y_sym + offsetK ] += L[j][ y + offsetJ ];
01536 if( p < num_feat-1 ) {
01537 C[k][ sym_y + offsetK ] += R[j][ y + offsetJ1 ];
01538 }
01539 }
01540 }
01541 }
01542 }
01543
01544
01545
01546
01547
01548
01549
01550
01551
01552
01553
01554 }
01555
01556
01557 num_feat=1;
01558 num_sym = bigtabSize;
01559
01560 delete[] nofsKmers;
01561 delete[] margFactors;
01562 delete[] substrs;
01563 delete[] x;
01564 delete[] C;
01565 for( k = 0; k < max_degree; ++k ) {
01566 delete[] L[k];
01567 delete[] R[k];
01568 }
01569 delete[] L;
01570 delete[] R;
01571 return result;
01572 }
01573
01574 char* CWeightedDegreePositionStringKernel::compute_consensus(
01575 int32_t &num_feat, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01576 {
01577 ASSERT(position_weights_lhs==NULL);
01578 ASSERT(position_weights_rhs==NULL);
01579
01580 ASSERT(degree<=32);
01581 ASSERT(!tries.get_use_compact_terminal_nodes());
01582 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01583 ASSERT(num_feat>0);
01584 ASSERT(alphabet);
01585 ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
01586
01587
01588 char* result=new char[num_feat];
01589
01590
01591 int32_t num_tables=CMath::max(1,num_feat-degree+1);
01592 CDynamicArray<ConsensusEntry>** table=new CDynamicArray<ConsensusEntry>*[num_tables];
01593
01594 for (int32_t i=0; i<num_tables; i++)
01595 table[i]=new CDynamicArray<ConsensusEntry>(num_suppvec/10);
01596
01597
01598 for (int32_t i=0; i<num_tables; i++)
01599 {
01600 bool cumulative=false;
01601
01602 if (i<num_tables-1)
01603 init_optimization(num_suppvec, IDX, alphas, i);
01604 else
01605 {
01606 init_optimization(num_suppvec, IDX, alphas, i, num_feat-1);
01607 cumulative=true;
01608 }
01609
01610 if (i==0)
01611 tries.fill_backtracking_table(i, NULL, table[i], cumulative, weights);
01612 else
01613 tries.fill_backtracking_table(i, table[i-1], table[i], cumulative, weights);
01614
01615 SG_PROGRESS(i,0,num_feat);
01616 }
01617
01618
01619
01620
01621
01622
01623
01624
01625
01626
01627
01628
01629
01630
01631
01632
01633
01634
01635
01636
01637
01638
01639
01640 const char* acgt="ACGT";
01641
01642
01643 int32_t max_idx=-1;
01644 float32_t max_score=0;
01645 int32_t num_elements=table[num_tables-1]->get_num_elements();
01646
01647 for (int32_t i=0; i<num_elements; i++)
01648 {
01649 float64_t sc=table[num_tables-1]->get_element(i).score;
01650 if (sc>max_score || max_idx==-1)
01651 {
01652 max_idx=i;
01653 max_score=sc;
01654 }
01655 }
01656 uint64_t endstr=table[num_tables-1]->get_element(max_idx).string;
01657
01658 SG_INFO("max_idx:%d num_el:%d num_feat:%d num_tables:%d max_score:%f\n", max_idx, num_elements, num_feat, num_tables, max_score);
01659
01660 for (int32_t i=0; i<degree; i++)
01661 result[num_feat-1-i]=acgt[(endstr >> (2*i)) & 3];
01662
01663 if (num_tables>1)
01664 {
01665 for (int32_t i=num_tables-1; i>=0; i--)
01666 {
01667
01668 result[i]=acgt[table[i]->get_element(max_idx).string >> (2*(degree-1)) & 3];
01669 max_idx=table[i]->get_element(max_idx).bt;
01670 }
01671 }
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683 for (int32_t i=0; i<num_tables; i++)
01684 delete table[i];
01685
01686 delete[] table;
01687 return result;
01688 }
01689
01690
01691 float64_t* CWeightedDegreePositionStringKernel::extract_w(
01692 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01693 float64_t* w_result, int32_t num_suppvec, int32_t* IDX, float64_t* alphas)
01694 {
01695 delete_optimization();
01696 use_poim_tries=true;
01697 poim_tries.delete_trees(false);
01698
01699
01700 ASSERT(position_weights_lhs==NULL);
01701 ASSERT(position_weights_rhs==NULL);
01702 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01703 ASSERT(num_feat>0);
01704 ASSERT(alphabet->get_alphabet()==DNA);
01705 ASSERT(max_degree>0);
01706
01707
01708 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01709 const int32_t seqLen = num_feat;
01710 float64_t** subs;
01711 int32_t i;
01712 int32_t k;
01713
01714
01715
01716
01717 int32_t* offsets;
01718 int32_t offset;
01719 offsets = new int32_t[ max_degree ];
01720 offset = 0;
01721 for( k = 0; k < max_degree; ++k ) {
01722 offsets[k] = offset;
01723 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01724 const int32_t tabSize = nofsKmers * seqLen;
01725 offset += tabSize;
01726 }
01727
01728 const int32_t bigTabSize = offset;
01729 w_result=new float64_t[bigTabSize];
01730 for (i=0; i<bigTabSize; ++i)
01731 w_result[i]=0;
01732
01733
01734 subs = new float64_t*[ max_degree ];
01735 ASSERT( subs != NULL );
01736 for( k = 0; k < max_degree; ++k ) {
01737 subs[k] = &w_result[ offsets[k] ];
01738 }
01739 delete[] offsets;
01740
01741
01742 init_optimization( num_suppvec, IDX, alphas, -1);
01743 poim_tries.POIMs_extract_W( subs, max_degree );
01744
01745
01746 delete[] subs;
01747 num_feat = 1;
01748 num_sym = bigTabSize;
01749 use_poim_tries=false;
01750 poim_tries.delete_trees(false);
01751 return w_result;
01752 }
01753
01754 float64_t* CWeightedDegreePositionStringKernel::compute_POIM(
01755 int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
01756 float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
01757 float64_t* alphas, float64_t* distrib )
01758 {
01759 delete_optimization();
01760 use_poim_tries=true;
01761 poim_tries.delete_trees(false);
01762
01763
01764 ASSERT(position_weights_lhs==NULL);
01765 ASSERT(position_weights_rhs==NULL);
01766 num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
01767 ASSERT(num_feat>0);
01768 ASSERT(alphabet->get_alphabet()==DNA);
01769 ASSERT(max_degree!=0);
01770 ASSERT(distrib);
01771
01772
01773 static const int32_t NUM_SYMS = poim_tries.NUM_SYMS;
01774 const int32_t seqLen = num_feat;
01775 float64_t** subs;
01776 int32_t i;
01777 int32_t k;
01778
01779
01780
01781
01782
01783
01784
01785
01786
01787
01788
01789
01790 const int32_t debug = ( max_degree < 0 ) ? ( abs(max_degree) % 4 + 1 ) : 0;
01791 if( debug ) {
01792 max_degree = abs(max_degree) / 4;
01793 switch( debug ) {
01794 case 1: {
01795 printf( "POIM DEBUGGING: substring only (max order=%d)\n", max_degree );
01796 break;
01797 }
01798 case 2: {
01799 printf( "POIM DEBUGGING: superstring only (max order=%d)\n", max_degree );
01800 break;
01801 }
01802 case 3: {
01803 printf( "POIM DEBUGGING: left overlap only (max order=%d)\n", max_degree );
01804 break;
01805 }
01806 case 4: {
01807 printf( "POIM DEBUGGING: right overlap only (max order=%d)\n", max_degree );
01808 break;
01809 }
01810 default: {
01811 printf( "POIM DEBUGGING: something is wrong (max order=%d)\n", max_degree );
01812 ASSERT(0);
01813 break;
01814 }
01815 }
01816 }
01817
01818
01819 int32_t* offsets;
01820 int32_t offset;
01821 offsets = new int32_t[ max_degree ];
01822 offset = 0;
01823 for( k = 0; k < max_degree; ++k ) {
01824 offsets[k] = offset;
01825 const int32_t nofsKmers = (int32_t) CMath::pow( NUM_SYMS, k+1 );
01826 const int32_t tabSize = nofsKmers * seqLen;
01827 offset += tabSize;
01828 }
01829
01830 const int32_t bigTabSize=offset;
01831 poim_result=new float64_t[bigTabSize];
01832 for (i=0; i<bigTabSize; ++i )
01833 poim_result[i]=0;
01834
01835
01836 subs=new float64_t*[max_degree];
01837 for (k=0; k<max_degree; ++k)
01838 subs[k]=&poim_result[offsets[k]];
01839
01840 delete[] offsets;
01841
01842
01843 init_optimization( num_suppvec, IDX, alphas, -1);
01844 poim_tries.POIMs_precalc_SLR( distrib );
01845
01846
01847 if( debug==0 || debug==1 ) {
01848 poim_tries.POIMs_extract_W( subs, max_degree );
01849 for( k = 1; k < max_degree; ++k ) {
01850 const int32_t nofKmers2 = ( k > 1 ) ? (int32_t) CMath::pow(NUM_SYMS,k-1) : 0;
01851 const int32_t nofKmers1 = (int32_t) CMath::pow( NUM_SYMS, k );
01852 const int32_t nofKmers0 = nofKmers1 * NUM_SYMS;
01853 for( i = 0; i < seqLen; ++i ) {
01854 float64_t* const subs_k2i1 = ( k>1 && i<seqLen-1 ) ? &subs[k-2][(i+1)*nofKmers2] : NULL;
01855 float64_t* const subs_k1i1 = ( i < seqLen-1 ) ? &subs[k-1][(i+1)*nofKmers1] : NULL;
01856 float64_t* const subs_k1i0 = & subs[ k-1 ][ i*nofKmers1 ];
01857 float64_t* const subs_k0i = & subs[ k-0 ][ i*nofKmers0 ];
01858 int32_t y0;
01859 for( y0 = 0; y0 < nofKmers0; ++y0 ) {
01860 const int32_t y1l = y0 / NUM_SYMS;
01861 const int32_t y1r = y0 % nofKmers1;
01862 const int32_t y2 = y1r / NUM_SYMS;
01863 subs_k0i[ y0 ] += subs_k1i0[ y1l ];
01864 if( i < seqLen-1 ) {
01865 subs_k0i[ y0 ] += subs_k1i1[ y1r ];
01866 if( k > 1 ) {
01867 subs_k0i[ y0 ] -= subs_k2i1[ y2 ];
01868 }
01869 }
01870 }
01871 }
01872 }
01873 }
01874
01875
01876 poim_tries.POIMs_add_SLR( subs, max_degree, debug );
01877
01878
01879 delete[] subs;
01880 num_feat = 1;
01881 num_sym = bigTabSize;
01882
01883 use_poim_tries=false;
01884 poim_tries.delete_trees(false);
01885
01886 return poim_result;
01887 }
01888
01889
01890 void CWeightedDegreePositionStringKernel::prepare_POIM2(
01891 float64_t* distrib, int32_t num_sym, int32_t num_feat)
01892 {
01893 free(m_poim_distrib);
01894 m_poim_distrib=(float64_t*)malloc(num_sym*num_feat*sizeof(float64_t));
01895 ASSERT(m_poim_distrib);
01896
01897 memcpy(m_poim_distrib, distrib, num_sym*num_feat*sizeof(float64_t));
01898 m_poim_num_sym=num_sym;
01899 m_poim_num_feat=num_feat;
01900 }
01901
01902 void CWeightedDegreePositionStringKernel::compute_POIM2(
01903 int32_t max_degree, CSVM* svm)
01904 {
01905 ASSERT(svm);
01906 int32_t num_suppvec=svm->get_num_support_vectors();
01907 int32_t* sv_idx=new int32_t[num_suppvec];
01908 float64_t* sv_weight=new float64_t[num_suppvec];
01909
01910 for (int32_t i=0; i<num_suppvec; i++)
01911 {
01912 sv_idx[i]=svm->get_support_vector(i);
01913 sv_weight[i]=svm->get_alpha(i);
01914 }
01915
01916 if ((max_degree < 1) || (max_degree > 12))
01917 {
01918
01919 SG_WARNING( "max_degree out of range 1..12 (%d). setting to 1.\n", max_degree);
01920 max_degree=1;
01921 }
01922
01923 int32_t num_feat = m_poim_num_feat;
01924 int32_t num_sym = m_poim_num_sym;
01925 free(m_poim);
01926
01927 m_poim = compute_POIM(max_degree, num_feat, num_sym, NULL, num_suppvec, sv_idx,
01928 sv_weight, m_poim_distrib);
01929
01930 ASSERT(num_feat==1);
01931 m_poim_result_len=num_sym;
01932
01933 delete[] sv_weight;
01934 delete[] sv_idx;
01935 }
01936
01937 void CWeightedDegreePositionStringKernel::get_POIM2(
01938 float64_t** poim, int32_t* result_len)
01939 {
01940 *poim=(float64_t*) malloc(m_poim_result_len*sizeof(float64_t));
01941 ASSERT(*poim);
01942 memcpy(*poim, m_poim, m_poim_result_len*sizeof(float64_t)) ;
01943 *result_len=m_poim_result_len ;
01944 }
01945
01946 void CWeightedDegreePositionStringKernel::cleanup_POIM2()
01947 {
01948 free(m_poim) ;
01949 m_poim=NULL ;
01950 free(m_poim_distrib) ;
01951 m_poim_distrib=NULL ;
01952 m_poim_num_sym=0 ;
01953 m_poim_num_sym=0 ;
01954 m_poim_result_len=0 ;
01955 }
01956