WeightedDegreeStringKernel.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "lib/common.h"
00013 #include "lib/io.h"
00014 #include "lib/Signal.h"
00015 #include "lib/Trie.h"
00016 #include "base/Parallel.h"
00017 
00018 #include "kernel/WeightedDegreeStringKernel.h"
00019 #include "kernel/FirstElementKernelNormalizer.h"
00020 #include "features/Features.h"
00021 #include "features/StringFeatures.h"
00022 
00023 #ifndef WIN32
00024 #include <pthread.h>
00025 #endif
00026 
00027 
00028 #ifdef HAVE_BOOST_SERIALIZATION
00029 #include <boost/serialization/export.hpp>
00030 BOOST_CLASS_EXPORT(shogun::CWeightedDegreeStringKernel);
00031 #endif //HAVE_BOOST_SERIALIZATION
00032 
00033 using namespace shogun;
00034 
00035 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00036 struct S_THREAD_PARAM
00037 {
00038 
00039     int32_t* vec;
00040     float64_t* result;
00041     float64_t* weights;
00042     CWeightedDegreeStringKernel* kernel;
00043     CTrie<DNATrie>* tries;
00044     float64_t factor;
00045     int32_t j;
00046     int32_t start;
00047     int32_t end;
00048     int32_t length;
00049     int32_t* vec_idx;
00050 };
00051 #endif // DOXYGEN_SHOULD_SKIP_THIS
00052 
00053 
00054 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00055     int32_t degree_, EWDKernType type_)
00056 : CStringKernel<char>(10),weights(NULL),position_weights(NULL),
00057     weights_buffer(NULL), mkl_stepsize(1),degree(degree_), length(0),
00058     max_mismatch(0), seq_length(0), block_computation(true),
00059     num_block_weights_external(0), block_weights_external(NULL),
00060     block_weights(NULL), type(type_), which_degree(-1), tries(NULL),
00061     tree_initialized(false), alphabet(NULL)
00062 {
00063     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00064     lhs=NULL;
00065     rhs=NULL;
00066 
00067     if (type!=E_EXTERNAL)
00068         set_wd_weights_by_type(type);
00069 
00070     set_normalizer(new CFirstElementKernelNormalizer());
00071 }
00072 
00073 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel (
00074     float64_t *weights_, int32_t degree_)
00075 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00076     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00077     max_mismatch(0), seq_length(0), block_computation(true),
00078     num_block_weights_external(0), block_weights_external(NULL),
00079     block_weights(NULL), type(E_EXTERNAL), which_degree(-1), tries(NULL),
00080     tree_initialized(false), alphabet(NULL)
00081 {
00082     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00083     lhs=NULL;
00084     rhs=NULL;
00085 
00086     weights=new float64_t[degree*(1+max_mismatch)];
00087     for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00088         weights[i]=weights_[i];
00089     set_normalizer(new CFirstElementKernelNormalizer());
00090 }
00091 
00092 CWeightedDegreeStringKernel::CWeightedDegreeStringKernel(
00093     CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree_)
00094 : CStringKernel<char>(10), weights(NULL), position_weights(NULL),
00095     weights_buffer(NULL), mkl_stepsize(1), degree(degree_), length(0),
00096     max_mismatch(0), seq_length(0), block_computation(true),
00097     num_block_weights_external(0), block_weights_external(NULL),
00098     block_weights(NULL), type(E_WD), which_degree(-1), tries(NULL),
00099     tree_initialized(false), alphabet(NULL)
00100 {
00101     properties |= KP_LINADD | KP_KERNCOMBINATION | KP_BATCHEVALUATION;
00102 
00103     set_wd_weights_by_type(type);
00104     set_normalizer(new CFirstElementKernelNormalizer());
00105     init(l, r);
00106 }
00107 
00108 CWeightedDegreeStringKernel::~CWeightedDegreeStringKernel()
00109 {
00110     cleanup();
00111 
00112     delete[] weights;
00113     weights=NULL;
00114 
00115     delete[] block_weights;
00116     block_weights=NULL;
00117 
00118     delete[] position_weights;
00119     position_weights=NULL;
00120 
00121     delete[] weights_buffer;
00122     weights_buffer=NULL;
00123 }
00124 
00125 
00126 void CWeightedDegreeStringKernel::remove_lhs()
00127 {
00128     SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00129     delete_optimization();
00130 
00131     if (tries!=NULL)
00132         tries->destroy();
00133 
00134     CKernel::remove_lhs();
00135 }
00136 
00137 void CWeightedDegreeStringKernel::create_empty_tries()
00138 {
00139     ASSERT(lhs);
00140 
00141     seq_length=((CStringFeatures<char>*) lhs)->get_max_vector_length();
00142 
00143     if (tries!=NULL)
00144     {
00145         tries->destroy() ;
00146         tries->create(seq_length, max_mismatch==0) ;
00147     }
00148 }
00149 
00150 bool CWeightedDegreeStringKernel::init(CFeatures* l, CFeatures* r)
00151 {
00152     int32_t lhs_changed=(lhs!=l);
00153     int32_t rhs_changed=(rhs!=r);
00154 
00155     CStringKernel<char>::init(l,r);
00156 
00157     SG_DEBUG("lhs_changed: %i\n", lhs_changed);
00158     SG_DEBUG("rhs_changed: %i\n", rhs_changed);
00159 
00160     CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l;
00161     CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r;
00162 
00163     int32_t len=sf_l->get_max_vector_length();
00164     if (lhs_changed && !sf_l->have_same_length(len))
00165         SG_ERROR("All strings in WD kernel must have same length (lhs wrong)!\n");
00166 
00167     if (rhs_changed && !sf_r->have_same_length(len))
00168         SG_ERROR("All strings in WD kernel must have same length (rhs wrong)!\n");
00169 
00170     SG_UNREF(alphabet);
00171     alphabet=sf_l->get_alphabet();
00172     CAlphabet* ralphabet=sf_r->get_alphabet();
00173 
00174     if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA)))
00175         properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION);
00176 
00177     ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet());
00178     SG_UNREF(ralphabet);
00179 
00180     if (tries!=NULL) {
00181         tries->delete_trees(max_mismatch==0);
00182         SG_UNREF(tries);
00183     }
00184     tries=new CTrie<DNATrie>(degree, max_mismatch==0);
00185     create_empty_tries();
00186 
00187     init_block_weights();
00188 
00189     return init_normalizer();
00190 }
00191 
00192 void CWeightedDegreeStringKernel::cleanup()
00193 {
00194     SG_DEBUG("deleting CWeightedDegreeStringKernel optimization\n");
00195     delete_optimization();
00196 
00197     delete[] block_weights;
00198     block_weights=NULL;
00199 
00200     if (tries!=NULL)
00201     {
00202         tries->destroy();
00203         SG_UNREF(tries);
00204         tries=NULL;
00205     }
00206 
00207     seq_length=0;
00208     tree_initialized = false;
00209 
00210     SG_UNREF(alphabet);
00211     alphabet=NULL;
00212 
00213     CKernel::cleanup();
00214 }
00215 
00216 bool CWeightedDegreeStringKernel::init_optimization(int32_t count, int32_t* IDX, float64_t* alphas, int32_t tree_num)
00217 {
00218     if (tree_num<0)
00219         SG_DEBUG( "deleting CWeightedDegreeStringKernel optimization\n");
00220 
00221     delete_optimization();
00222 
00223     if (tree_num<0)
00224         SG_DEBUG( "initializing CWeightedDegreeStringKernel optimization\n") ;
00225 
00226     for (int32_t i=0; i<count; i++)
00227     {
00228         if (tree_num<0)
00229         {
00230             if ( (i % (count/10+1)) == 0)
00231                 SG_PROGRESS(i, 0, count);
00232 
00233             if (max_mismatch==0)
00234                 add_example_to_tree(IDX[i], alphas[i]) ;
00235             else
00236                 add_example_to_tree_mismatch(IDX[i], alphas[i]) ;
00237 
00238             //SG_DEBUG( "number of used trie nodes: %i\n", tries.get_num_used_nodes()) ;
00239         }
00240         else
00241         {
00242             if (max_mismatch==0)
00243                 add_example_to_single_tree(IDX[i], alphas[i], tree_num) ;
00244             else
00245                 add_example_to_single_tree_mismatch(IDX[i], alphas[i], tree_num) ;
00246         }
00247     }
00248 
00249     if (tree_num<0)
00250         SG_DONE();
00251 
00252     //tries.compact_nodes(NO_CHILD, 0, weights) ;
00253 
00254     set_is_initialized(true) ;
00255     return true ;
00256 }
00257 
00258 bool CWeightedDegreeStringKernel::delete_optimization()
00259 {
00260     if (get_is_initialized())
00261     {
00262         if (tries!=NULL)
00263             tries->delete_trees(max_mismatch==0);
00264         set_is_initialized(false);
00265         return true;
00266     }
00267 
00268     return false;
00269 }
00270 
00271 
00272 float64_t CWeightedDegreeStringKernel::compute_with_mismatch(
00273     char* avec, int32_t alen, char* bvec, int32_t blen)
00274 {
00275     float64_t sum = 0.0;
00276 
00277     for (int32_t i=0; i<alen; i++)
00278     {
00279         float64_t sumi = 0.0;
00280         int32_t mismatches=0;
00281 
00282         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00283         {
00284             if (avec[i+j]!=bvec[i+j])
00285             {
00286                 mismatches++ ;
00287                 if (mismatches>max_mismatch)
00288                     break ;
00289             } ;
00290             sumi += weights[j+degree*mismatches];
00291         }
00292         if (position_weights!=NULL)
00293             sum+=position_weights[i]*sumi ;
00294         else
00295             sum+=sumi ;
00296     }
00297     return sum ;
00298 }
00299 
00300 float64_t CWeightedDegreeStringKernel::compute_using_block(
00301     char* avec, int32_t alen, char* bvec, int32_t blen)
00302 {
00303     ASSERT(alen==blen);
00304 
00305     float64_t sum=0;
00306     int32_t match_len=-1;
00307 
00308     for (int32_t i=0; i<alen; i++)
00309     {
00310         if (avec[i]==bvec[i])
00311             match_len++;
00312         else
00313         {
00314             if (match_len>=0)
00315                 sum+=block_weights[match_len];
00316             match_len=-1;
00317         }
00318     }
00319 
00320     if (match_len>=0)
00321         sum+=block_weights[match_len];
00322 
00323     return sum;
00324 }
00325 
00326 float64_t CWeightedDegreeStringKernel::compute_without_mismatch(
00327     char* avec, int32_t alen, char* bvec, int32_t blen)
00328 {
00329     float64_t sum = 0.0;
00330 
00331     for (int32_t i=0; i<alen; i++)
00332     {
00333         float64_t sumi = 0.0;
00334 
00335         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00336         {
00337             if (avec[i+j]!=bvec[i+j])
00338                 break ;
00339             sumi += weights[j];
00340         }
00341         if (position_weights!=NULL)
00342             sum+=position_weights[i]*sumi ;
00343         else
00344             sum+=sumi ;
00345     }
00346     return sum ;
00347 }
00348 
00349 float64_t CWeightedDegreeStringKernel::compute_without_mismatch_matrix(
00350     char* avec, int32_t alen, char* bvec, int32_t blen)
00351 {
00352     float64_t sum = 0.0;
00353 
00354     for (int32_t i=0; i<alen; i++)
00355     {
00356         float64_t sumi=0.0;
00357         for (int32_t j=0; (i+j<alen) && (j<degree); j++)
00358         {
00359             if (avec[i+j]!=bvec[i+j])
00360                 break;
00361             sumi += weights[i*degree+j];
00362         }
00363         if (position_weights!=NULL)
00364             sum += position_weights[i]*sumi ;
00365         else
00366             sum += sumi ;
00367     }
00368 
00369     return sum ;
00370 }
00371 
00372 
00373 float64_t CWeightedDegreeStringKernel::compute(int32_t idx_a, int32_t idx_b)
00374 {
00375     int32_t alen, blen;
00376     bool free_avec, free_bvec;
00377     char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec);
00378     char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec);
00379     float64_t result=0;
00380 
00381     if (max_mismatch==0 && length==0 && block_computation)
00382         result=compute_using_block(avec, alen, bvec, blen);
00383     else
00384     {
00385         if (max_mismatch>0)
00386             result=compute_with_mismatch(avec, alen, bvec, blen);
00387         else if (length==0)
00388             result=compute_without_mismatch(avec, alen, bvec, blen);
00389         else
00390             result=compute_without_mismatch_matrix(avec, alen, bvec, blen);
00391     }
00392     ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec);
00393     ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec);
00394 
00395     return result;
00396 }
00397 
00398 
00399 void CWeightedDegreeStringKernel::add_example_to_tree(
00400     int32_t idx, float64_t alpha)
00401 {
00402     ASSERT(alphabet);
00403     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00404 
00405     int32_t len=0;
00406     bool free_vec;
00407     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00408     ASSERT(max_mismatch==0);
00409     int32_t *vec=new int32_t[len];
00410 
00411     for (int32_t i=0; i<len; i++)
00412         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00413     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00414 
00415     if (length == 0 || max_mismatch > 0)
00416     {
00417         for (int32_t i=0; i<len; i++)
00418         {
00419             float64_t alpha_pw=alpha;
00420             /*if (position_weights!=NULL)
00421               alpha_pw *= position_weights[i] ;*/
00422             if (alpha_pw==0.0)
00423                 continue;
00424             ASSERT(tries);
00425             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00426         }
00427     }
00428     else
00429     {
00430         for (int32_t i=0; i<len; i++)
00431         {
00432             float64_t alpha_pw=alpha;
00433             /*if (position_weights!=NULL)
00434               alpha_pw = alpha*position_weights[i] ;*/
00435             if (alpha_pw==0.0)
00436                 continue ;
00437             ASSERT(tries);
00438             tries->add_to_trie(i, 0, vec, normalizer->normalize_lhs(alpha_pw, idx), weights, (length!=0));
00439         }
00440     }
00441     delete[] vec ;
00442     tree_initialized=true ;
00443 }
00444 
00445 void CWeightedDegreeStringKernel::add_example_to_single_tree(
00446     int32_t idx, float64_t alpha, int32_t tree_num)
00447 {
00448     ASSERT(alphabet);
00449     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00450 
00451     int32_t len;
00452     bool free_vec;
00453     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00454     ASSERT(max_mismatch==0);
00455     int32_t *vec = new int32_t[len] ;
00456 
00457     for (int32_t i=tree_num; i<tree_num+degree && i<len; i++)
00458         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00459     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00460 
00461 
00462     ASSERT(tries);
00463     if (alpha!=0.0)
00464         tries->add_to_trie(tree_num, 0, vec, normalizer->normalize_lhs(alpha, idx), weights, (length!=0));
00465 
00466     delete[] vec ;
00467     tree_initialized=true ;
00468 }
00469 
00470 void CWeightedDegreeStringKernel::add_example_to_tree_mismatch(int32_t idx, float64_t alpha)
00471 {
00472     ASSERT(tries);
00473     ASSERT(alphabet);
00474     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00475 
00476     int32_t len ;
00477     bool free_vec;
00478     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00479 
00480     int32_t *vec = new int32_t[len] ;
00481 
00482     for (int32_t i=0; i<len; i++)
00483         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00484     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00485 
00486     for (int32_t i=0; i<len; i++)
00487     {
00488         if (alpha!=0.0)
00489             tries->add_example_to_tree_mismatch_recursion(NO_CHILD, i, normalizer->normalize_lhs(alpha, idx), &vec[i], len-i, 0, 0, max_mismatch, weights);
00490     }
00491 
00492     delete[] vec ;
00493     tree_initialized=true ;
00494 }
00495 
00496 void CWeightedDegreeStringKernel::add_example_to_single_tree_mismatch(
00497     int32_t idx, float64_t alpha, int32_t tree_num)
00498 {
00499     ASSERT(tries);
00500     ASSERT(alphabet);
00501     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00502 
00503     int32_t len=0;
00504     bool free_vec;
00505     char* char_vec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx, len, free_vec);
00506     int32_t *vec=new int32_t[len];
00507 
00508     for (int32_t i=tree_num; i<len && i<tree_num+degree; i++)
00509         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00510     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00511 
00512     if (alpha!=0.0)
00513     {
00514         tries->add_example_to_tree_mismatch_recursion(
00515             NO_CHILD, tree_num, normalizer->normalize_lhs(alpha, idx), &vec[tree_num], len-tree_num,
00516             0, 0, max_mismatch, weights);
00517     }
00518 
00519     delete[] vec;
00520     tree_initialized=true;
00521 }
00522 
00523 
00524 float64_t CWeightedDegreeStringKernel::compute_by_tree(int32_t idx)
00525 {
00526     ASSERT(alphabet);
00527     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00528 
00529     int32_t len=0;
00530     bool free_vec;
00531     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00532     ASSERT(char_vec && len>0);
00533     int32_t *vec=new int32_t[len];
00534 
00535     for (int32_t i=0; i<len; i++)
00536         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00537     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00538 
00539     float64_t sum=0;
00540     ASSERT(tries);
00541     for (int32_t i=0; i<len; i++)
00542         sum+=tries->compute_by_tree_helper(vec, len, i, i, i, weights, (length!=0));
00543 
00544     delete[] vec;
00545     return normalizer->normalize_rhs(sum, idx);
00546 }
00547 
00548 void CWeightedDegreeStringKernel::compute_by_tree(
00549     int32_t idx, float64_t* LevelContrib)
00550 {
00551     ASSERT(alphabet);
00552     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00553 
00554     int32_t len ;
00555     bool free_vec;
00556     char* char_vec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx, len, free_vec);
00557 
00558     int32_t *vec = new int32_t[len] ;
00559 
00560     for (int32_t i=0; i<len; i++)
00561         vec[i]=alphabet->remap_to_bin(char_vec[i]);
00562     ((CStringFeatures<char>*) lhs)->free_feature_vector(char_vec, idx, free_vec);
00563 
00564     ASSERT(tries);
00565     for (int32_t i=0; i<len; i++)
00566     {
00567         tries->compute_by_tree_helper(vec, len, i, i, i, LevelContrib,
00568                 normalizer->normalize_rhs(1.0, idx),
00569                 mkl_stepsize, weights, (length!=0));
00570     }
00571 
00572     delete[] vec ;
00573 }
00574 
00575 float64_t *CWeightedDegreeStringKernel::compute_abs_weights(int32_t &len)
00576 {
00577     ASSERT(tries);
00578     return tries->compute_abs_weights(len);
00579 }
00580 
00581 bool CWeightedDegreeStringKernel::set_wd_weights_by_type(EWDKernType p_type)
00582 {
00583     ASSERT(degree>0);
00584     ASSERT(p_type==E_WD); 
00585 
00586     delete[] weights;
00587     weights=new float64_t[degree];
00588     if (weights)
00589     {
00590         int32_t i;
00591         float64_t sum=0;
00592         for (i=0; i<degree; i++)
00593         {
00594             weights[i]=degree-i;
00595             sum+=weights[i];
00596         }
00597         for (i=0; i<degree; i++)
00598             weights[i]/=sum;
00599 
00600         for (i=0; i<degree; i++)
00601         {
00602             for (int32_t j=1; j<=max_mismatch; j++)
00603             {
00604                 if (j<i+1)
00605                 {
00606                     int32_t nk=CMath::nchoosek(i+1, j);
00607                     weights[i+j*degree]=weights[i]/(nk*CMath::pow(3.0,j));
00608                 }
00609                 else
00610                     weights[i+j*degree]= 0;
00611             }
00612         }
00613 
00614         if (which_degree>=0)
00615         {
00616             ASSERT(which_degree<degree);
00617             for (i=0; i<degree; i++)
00618             {
00619                 if (i!=which_degree)
00620                     weights[i]=0;
00621                 else
00622                     weights[i]=1;
00623             }
00624         }
00625         return true;
00626     }
00627     else
00628         return false;
00629 }
00630 
00631 bool CWeightedDegreeStringKernel::set_weights(
00632     float64_t* ws, int32_t d, int32_t len)
00633 {
00634     if (d!=degree || len<1)
00635         SG_ERROR("Dimension mismatch (should be de(seq_length | 1) x degree)\n");
00636 
00637     length=len;
00638 
00639     if (length==0)
00640         length=1;
00641 
00642     int32_t num_weights=degree*(length+max_mismatch);
00643     delete[] weights;
00644     weights=new float64_t[num_weights];
00645 
00646     if (weights)
00647     {
00648         for (int32_t i=0; i<num_weights; i++) {
00649             if (ws[i]) // len(ws) might be != num_weights?
00650                 weights[i]=ws[i];
00651         }
00652         return true;
00653     }
00654     else
00655         return false;
00656 }
00657 
00658 bool CWeightedDegreeStringKernel::set_position_weights(
00659     float64_t* pws, int32_t len)
00660 {
00661     if (len==0)
00662     {
00663         delete[] position_weights;
00664         position_weights=NULL;
00665         ASSERT(tries);
00666         tries->set_position_weights(position_weights);
00667     }
00668 
00669     if (seq_length!=len)
00670         SG_ERROR("seq_length = %i, position_weights_length=%i\n", seq_length, len);
00671 
00672     delete[] position_weights;
00673     position_weights=new float64_t[len];
00674     ASSERT(tries);
00675     tries->set_position_weights(position_weights);
00676 
00677     if (position_weights)
00678     {
00679         for (int32_t i=0; i<len; i++)
00680             position_weights[i]=pws[i];
00681         return true;
00682     }
00683     else
00684         return false;
00685 }
00686 
00687 bool CWeightedDegreeStringKernel::init_block_weights_from_wd()
00688 {
00689     delete[] block_weights;
00690     block_weights=new float64_t[CMath::max(seq_length,degree)];
00691 
00692     if (block_weights)
00693     {
00694         int32_t k;
00695         float64_t d=degree; // use float to evade rounding errors below
00696 
00697         for (k=0; k<degree; k++)
00698             block_weights[k]=
00699                 (-CMath::pow(k, 3)+(3*d-3)*CMath::pow(k, 2)+(9*d-2)*k+6*d)/(3*d*(d+1));
00700         for (k=degree; k<seq_length; k++)
00701             block_weights[k]=(-d+3*k+4)/3;
00702     }
00703 
00704     return (block_weights!=NULL);
00705 }
00706 
00707 bool CWeightedDegreeStringKernel::init_block_weights_from_wd_external()
00708 {
00709     ASSERT(weights);
00710     delete[] block_weights;
00711     block_weights=new float64_t[CMath::max(seq_length,degree)];
00712 
00713     if (block_weights)
00714     {
00715         int32_t i=0;
00716         block_weights[0]=weights[0];
00717         for (i=1; i<CMath::max(seq_length,degree); i++)
00718             block_weights[i]=0;
00719 
00720         for (i=1; i<CMath::max(seq_length,degree); i++)
00721         {
00722             block_weights[i]=block_weights[i-1];
00723 
00724             float64_t contrib=0;
00725             for (int32_t j=0; j<CMath::min(degree,i+1); j++)
00726                 contrib+=weights[j];
00727 
00728             block_weights[i]+=contrib;
00729         }
00730     }
00731 
00732     return (block_weights!=NULL);
00733 }
00734 
00735 bool CWeightedDegreeStringKernel::init_block_weights_const()
00736 {
00737     delete[] block_weights;
00738     block_weights=new float64_t[seq_length];
00739 
00740     if (block_weights)
00741     {
00742         for (int32_t i=1; i<seq_length+1 ; i++)
00743             block_weights[i-1]=1.0/seq_length;
00744     }
00745 
00746     return (block_weights!=NULL);
00747 }
00748 
00749 bool CWeightedDegreeStringKernel::init_block_weights_linear()
00750 {
00751     delete[] block_weights;
00752     block_weights=new float64_t[seq_length];
00753 
00754     if (block_weights)
00755     {
00756         for (int32_t i=1; i<seq_length+1 ; i++)
00757             block_weights[i-1]=degree*i;
00758     }
00759 
00760     return (block_weights!=NULL);
00761 }
00762 
00763 bool CWeightedDegreeStringKernel::init_block_weights_sqpoly()
00764 {
00765     delete[] block_weights;
00766     block_weights=new float64_t[seq_length];
00767 
00768     if (block_weights)
00769     {
00770         for (int32_t i=1; i<degree+1 ; i++)
00771             block_weights[i-1]=((float64_t) i)*i;
00772 
00773         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00774             block_weights[i-1]=i;
00775     }
00776 
00777     return (block_weights!=NULL);
00778 }
00779 
00780 bool CWeightedDegreeStringKernel::init_block_weights_cubicpoly()
00781 {
00782     delete[] block_weights;
00783     block_weights=new float64_t[seq_length];
00784 
00785     if (block_weights)
00786     {
00787         for (int32_t i=1; i<degree+1 ; i++)
00788             block_weights[i-1]=((float64_t) i)*i*i;
00789 
00790         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00791             block_weights[i-1]=i;
00792     }
00793 
00794     return (block_weights!=NULL);
00795 }
00796 
00797 bool CWeightedDegreeStringKernel::init_block_weights_exp()
00798 {
00799     delete[] block_weights;
00800     block_weights=new float64_t[seq_length];
00801 
00802     if (block_weights)
00803     {
00804         for (int32_t i=1; i<degree+1 ; i++)
00805             block_weights[i-1]=exp(((float64_t) i/10.0));
00806 
00807         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00808             block_weights[i-1]=i;
00809     }
00810 
00811     return (block_weights!=NULL);
00812 }
00813 
00814 bool CWeightedDegreeStringKernel::init_block_weights_log()
00815 {
00816     delete[] block_weights;
00817     block_weights=new float64_t[seq_length];
00818 
00819     if (block_weights)
00820     {
00821         for (int32_t i=1; i<degree+1 ; i++)
00822             block_weights[i-1]=CMath::pow(CMath::log((float64_t) i),2);
00823 
00824         for (int32_t i=degree+1; i<seq_length+1 ; i++)
00825             block_weights[i-1]=i-degree+1+CMath::pow(CMath::log(degree+1.0),2);
00826     }
00827 
00828     return (block_weights!=NULL);
00829 }
00830 
00831 bool CWeightedDegreeStringKernel::init_block_weights_external()
00832 {
00833     if (block_weights_external && (seq_length == num_block_weights_external) )
00834     {
00835         delete[] block_weights;
00836         block_weights=new float64_t[seq_length];
00837 
00838         if (block_weights)
00839         {
00840             for (int32_t i=0; i<seq_length; i++)
00841                 block_weights[i]=block_weights_external[i];
00842         }
00843     }
00844     else {
00845       SG_ERROR( "sequence longer then weights (seqlen:%d, wlen:%d)\n", seq_length, block_weights_external);
00846    }
00847     return (block_weights!=NULL);
00848 }
00849 
00850 bool CWeightedDegreeStringKernel::init_block_weights()
00851 {
00852     switch (type)
00853     {
00854         case E_WD:
00855             return init_block_weights_from_wd();
00856         case E_EXTERNAL:
00857             return init_block_weights_from_wd_external();
00858         case E_BLOCK_CONST:
00859             return init_block_weights_const();
00860         case E_BLOCK_LINEAR:
00861             return init_block_weights_linear();
00862         case E_BLOCK_SQPOLY:
00863             return init_block_weights_sqpoly();
00864         case E_BLOCK_CUBICPOLY:
00865             return init_block_weights_cubicpoly();
00866         case E_BLOCK_EXP:
00867             return init_block_weights_exp();
00868         case E_BLOCK_LOG:
00869             return init_block_weights_log();
00870         case E_BLOCK_EXTERNAL:
00871             return init_block_weights_external();
00872         default:
00873             return false;
00874     };
00875 }
00876 
00877 
00878 void* CWeightedDegreeStringKernel::compute_batch_helper(void* p)
00879 {
00880     S_THREAD_PARAM* params = (S_THREAD_PARAM*) p;
00881     int32_t j=params->j;
00882     CWeightedDegreeStringKernel* wd=params->kernel;
00883     CTrie<DNATrie>* tries=params->tries;
00884     float64_t* weights=params->weights;
00885     int32_t length=params->length;
00886     int32_t* vec=params->vec;
00887     float64_t* result=params->result;
00888     float64_t factor=params->factor;
00889     int32_t* vec_idx=params->vec_idx;
00890 
00891     CStringFeatures<char>* rhs_feat=((CStringFeatures<char>*) wd->get_rhs());
00892     CAlphabet* alpha=wd->alphabet;
00893 
00894     for (int32_t i=params->start; i<params->end; i++)
00895     {
00896         int32_t len=0;
00897         bool free_vec;
00898         char* char_vec=rhs_feat->get_feature_vector(vec_idx[i], len, free_vec);
00899         for (int32_t k=j; k<CMath::min(len,j+wd->get_degree()); k++)
00900             vec[k]=alpha->remap_to_bin(char_vec[k]);
00901         rhs_feat->free_feature_vector(char_vec, vec_idx[i], free_vec);
00902 
00903         ASSERT(tries);
00904 
00905         result[i]+=factor*
00906             wd->normalizer->normalize_rhs(tries->compute_by_tree_helper(vec, len, j, j, j, weights, (length!=0)), vec_idx[i]);
00907     }
00908 
00909     SG_UNREF(rhs_feat);
00910 
00911     return NULL;
00912 }
00913 
00914 void CWeightedDegreeStringKernel::compute_batch(
00915     int32_t num_vec, int32_t* vec_idx, float64_t* result, int32_t num_suppvec,
00916     int32_t* IDX, float64_t* alphas, float64_t factor)
00917 {
00918     ASSERT(tries);
00919     ASSERT(alphabet);
00920     ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00921     ASSERT(rhs);
00922     ASSERT(num_vec<=rhs->get_num_vectors());
00923     ASSERT(num_vec>0);
00924     ASSERT(vec_idx);
00925     ASSERT(result);
00926     create_empty_tries();
00927 
00928     int32_t num_feat=((CStringFeatures<char>*) rhs)->get_max_vector_length();
00929     ASSERT(num_feat>0);
00930     int32_t num_threads=parallel->get_num_threads();
00931     ASSERT(num_threads>0);
00932     int32_t* vec=new int32_t[num_threads*num_feat];
00933 
00934     if (num_threads < 2)
00935     {
00936 #ifdef CYGWIN
00937         for (int32_t j=0; j<num_feat; j++)
00938 #else
00939         CSignal::clear_cancel();
00940         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00941 #endif
00942         {
00943             init_optimization(num_suppvec, IDX, alphas, j);
00944             S_THREAD_PARAM params;
00945             params.vec=vec;
00946             params.result=result;
00947             params.weights=weights;
00948             params.kernel=this;
00949             params.tries=tries;
00950             params.factor=factor;
00951             params.j=j;
00952             params.start=0;
00953             params.end=num_vec;
00954             params.length=length;
00955             params.vec_idx=vec_idx;
00956             compute_batch_helper((void*) &params);
00957 
00958             SG_PROGRESS(j,0,num_feat);
00959         }
00960     }
00961 #ifndef WIN32
00962     else
00963     {
00964         CSignal::clear_cancel();
00965         for (int32_t j=0; j<num_feat && !CSignal::cancel_computations(); j++)
00966         {
00967             init_optimization(num_suppvec, IDX, alphas, j);
00968             pthread_t* threads = new pthread_t[num_threads-1];
00969             S_THREAD_PARAM* params = new S_THREAD_PARAM[num_threads];
00970             int32_t step= num_vec/num_threads;
00971             int32_t t;
00972 
00973             for (t=0; t<num_threads-1; t++)
00974             {
00975                 params[t].vec=&vec[num_feat*t];
00976                 params[t].result=result;
00977                 params[t].weights=weights;
00978                 params[t].kernel=this;
00979                 params[t].tries=tries;
00980                 params[t].factor=factor;
00981                 params[t].j=j;
00982                 params[t].start = t*step;
00983                 params[t].end = (t+1)*step;
00984                 params[t].length=length;
00985                 params[t].vec_idx=vec_idx;
00986                 pthread_create(&threads[t], NULL, CWeightedDegreeStringKernel::compute_batch_helper, (void*)&params[t]);
00987             }
00988             params[t].vec=&vec[num_feat*t];
00989             params[t].result=result;
00990             params[t].weights=weights;
00991             params[t].kernel=this;
00992             params[t].tries=tries;
00993             params[t].factor=factor;
00994             params[t].j=j;
00995             params[t].start=t*step;
00996             params[t].end=num_vec;
00997             params[t].length=length;
00998             params[t].vec_idx=vec_idx;
00999             compute_batch_helper((void*) &params[t]);
01000 
01001             for (t=0; t<num_threads-1; t++)
01002                 pthread_join(threads[t], NULL);
01003             SG_PROGRESS(j,0,num_feat);
01004 
01005             delete[] params;
01006             delete[] threads;
01007         }
01008     }
01009 #endif
01010 
01011     delete[] vec;
01012 
01013     //really also free memory as this can be huge on testing especially when
01014     //using the combined kernel
01015     create_empty_tries();
01016 }
01017 
01018 bool CWeightedDegreeStringKernel::set_max_mismatch(int32_t max)
01019 {
01020     if (type==E_EXTERNAL && max!=0) {
01021         return false;
01022     }
01023 
01024     max_mismatch=max;
01025 
01026     if (lhs!=NULL && rhs!=NULL)
01027         return init(lhs, rhs);
01028     else
01029         return true;
01030 }

SHOGUN Machine Learning Toolbox - Documentation