WeightedDegreePositionStringKernel.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___
00014 
00015 #include "lib/common.h"
00016 #include "kernel/StringKernel.h"
00017 #include "kernel/WeightedDegreeStringKernel.h"
00018 #include "lib/Trie.h"
00019 
00020 namespace shogun
00021 {
00022 
00023 class CSVM;
00024 
00048 class CWeightedDegreePositionStringKernel: public CStringKernel<char>
00049 {
00050     public:
00058         CWeightedDegreePositionStringKernel(
00059             int32_t size, int32_t degree,
00060             int32_t max_mismatch=0, int32_t mkl_stepsize=1);
00061 
00072         CWeightedDegreePositionStringKernel(
00073             int32_t size, float64_t* weights, int32_t degree,
00074             int32_t max_mismatch, int32_t* shift, int32_t shift_len,
00075             int32_t mkl_stepsize=1);
00076 
00083         CWeightedDegreePositionStringKernel(
00084             CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00085 
00086         virtual ~CWeightedDegreePositionStringKernel();
00087 
00094         virtual bool init(CFeatures* l, CFeatures* r);
00095 
00097         virtual void cleanup();
00098 
00103         virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREEPOS; }
00104 
00109         virtual const char* get_name() const { return "WeightedDegreePos"; }
00110 
00118         inline virtual bool init_optimization(
00119             int32_t p_count, int32_t *IDX, float64_t * alphas)
00120         { 
00121             return init_optimization(p_count, IDX, alphas, -1);
00122         }
00123 
00135         virtual bool init_optimization(
00136             int32_t count, int32_t *IDX, float64_t * alphas, int32_t tree_num,
00137             int32_t upto_tree=-1);
00138 
00143         virtual bool delete_optimization();
00144 
00150         inline virtual float64_t compute_optimized(int32_t idx)
00151         { 
00152             ASSERT(get_is_initialized());
00153             ASSERT(alphabet);
00154             ASSERT(alphabet->get_alphabet()==DNA || alphabet->get_alphabet()==RNA);
00155             return compute_by_tree(idx);
00156         }
00157 
00162         static void* compute_batch_helper(void* p);
00163 
00174         virtual void compute_batch(
00175             int32_t num_vec, int32_t* vec_idx, float64_t* target,
00176             int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00177             float64_t factor=1.0);
00178 
00182         inline virtual void clear_normal()
00183         {
00184             if ((opt_type==FASTBUTMEMHUNGRY) && (tries.get_use_compact_terminal_nodes()))
00185             {
00186                 tries.set_use_compact_terminal_nodes(false) ;
00187                 SG_DEBUG( "disabling compact trie nodes with FASTBUTMEMHUNGRY\n") ;
00188             }
00189 
00190             if (get_is_initialized())
00191             {
00192                 if (opt_type==SLOWBUTMEMEFFICIENT)
00193                     tries.delete_trees(true); 
00194                 else if (opt_type==FASTBUTMEMHUNGRY)
00195                     tries.delete_trees(false);  // still buggy
00196                 else
00197                     SG_ERROR( "unknown optimization type\n");
00198 
00199                 set_is_initialized(false);
00200             }
00201         }
00202 
00208         inline virtual void add_to_normal(int32_t idx, float64_t weight)
00209         {
00210             add_example_to_tree(idx, weight);
00211             set_is_initialized(true);
00212         }
00213 
00218         inline virtual int32_t get_num_subkernels()
00219         {
00220             if (position_weights!=NULL)
00221                 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00222             if (length==0)
00223                 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00224             return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00225         }
00226 
00232         inline void compute_by_subkernel(
00233             int32_t idx, float64_t * subkernel_contrib)
00234         { 
00235             if (get_is_initialized())
00236             {
00237                 compute_by_tree(idx, subkernel_contrib);
00238                 return ;
00239             }
00240 
00241             SG_ERROR( "CWeightedDegreePositionStringKernel optimization not initialized\n") ;
00242         }
00243 
00249         inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00250         {
00251             num_weights = get_num_subkernels() ;
00252 
00253             delete[] weights_buffer ;
00254             weights_buffer = new float64_t[num_weights] ;
00255 
00256             if (position_weights!=NULL)
00257                 for (int32_t i=0; i<num_weights; i++)
00258                     weights_buffer[i] = position_weights[i*mkl_stepsize] ;
00259             else
00260                 for (int32_t i=0; i<num_weights; i++)
00261                     weights_buffer[i] = weights[i*mkl_stepsize] ;
00262 
00263             return weights_buffer ;
00264         }
00265 
00271         inline void set_subkernel_weights(
00272             float64_t* weights2, int32_t num_weights2)
00273         {
00274             int32_t num_weights = get_num_subkernels() ;
00275             if (num_weights!=num_weights2)
00276                 SG_ERROR( "number of weights do not match\n") ;
00277 
00278             if (position_weights!=NULL)
00279                 for (int32_t i=0; i<num_weights; i++)
00280                     for (int32_t j=0; j<mkl_stepsize; j++)
00281                     {
00282                         if (i*mkl_stepsize+j<seq_length)
00283                             position_weights[i*mkl_stepsize+j] = weights2[i] ;
00284                     }
00285             else if (length==0)
00286             {
00287                 for (int32_t i=0; i<num_weights; i++)
00288                     for (int32_t j=0; j<mkl_stepsize; j++)
00289                         if (i*mkl_stepsize+j<get_degree())
00290                             weights[i*mkl_stepsize+j] = weights2[i] ;
00291             }
00292             else
00293             {
00294                 for (int32_t i=0; i<num_weights; i++)
00295                     for (int32_t j=0; j<mkl_stepsize; j++)
00296                         if (i*mkl_stepsize+j<get_degree()*length)
00297                             weights[i*mkl_stepsize+j] = weights2[i] ;
00298             }
00299         }
00300 
00301         // other kernel tree operations
00307         float64_t* compute_abs_weights(int32_t & len);
00308 
00313         bool is_tree_initialized() { return tree_initialized; }
00314 
00319         inline int32_t get_max_mismatch() { return max_mismatch; }
00320 
00325         inline int32_t get_degree() { return degree; }
00326 
00332         inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00333         {
00334             d=degree;
00335             len=length;
00336             return weights;
00337         }
00338 
00344         inline float64_t *get_weights(int32_t& num_weights)
00345         {
00346             if (position_weights!=NULL)
00347             {
00348                 num_weights = seq_length ;
00349                 return position_weights ;
00350             }
00351             if (length==0)
00352                 num_weights = degree ;
00353             else
00354                 num_weights = degree*length ;
00355             return weights;
00356         }
00357 
00363         inline float64_t *get_position_weights(int32_t& len)
00364         {
00365             len=seq_length;
00366             return position_weights;
00367         }
00368 
00374         bool set_shifts(int32_t* shifts, int32_t len);
00375 
00382         virtual bool set_weights(float64_t* weights, int32_t d, int32_t len=0);
00383 
00388         virtual bool set_wd_weights();
00389 
00396         virtual bool set_position_weights(float64_t* pws, int32_t len);
00397 
00405         bool set_position_weights_lhs(float64_t* pws, int32_t len, int32_t num);
00406 
00414         bool set_position_weights_rhs(float64_t* pws, int32_t len, int32_t num);
00415 
00420         bool init_block_weights();
00421 
00426         bool init_block_weights_from_wd();
00427 
00432         bool init_block_weights_from_wd_external();
00433 
00438         bool init_block_weights_const();
00439 
00444         bool init_block_weights_linear();
00445 
00450         bool init_block_weights_sqpoly();
00451 
00456         bool init_block_weights_cubicpoly();
00457 
00462         bool init_block_weights_exp();
00463 
00468         bool init_block_weights_log();
00469 
00474         bool init_block_weights_external();
00475 
00480         bool delete_position_weights()
00481         {
00482             delete[] position_weights;
00483             position_weights=NULL;
00484             return true;
00485         }
00486 
00491         bool delete_position_weights_lhs()
00492         {
00493             delete[] position_weights_lhs;
00494             position_weights_lhs=NULL;
00495             return true;
00496         }
00497 
00502         bool delete_position_weights_rhs()
00503         {
00504             delete[] position_weights_rhs;
00505             position_weights_rhs=NULL;
00506             return true;
00507         }
00508 
00514         virtual float64_t compute_by_tree(int32_t idx);
00515 
00521         virtual void compute_by_tree(int32_t idx, float64_t* LevelContrib);
00522 
00535         float64_t* compute_scoring(
00536             int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
00537             float64_t* target, int32_t num_suppvec, int32_t* IDX,
00538             float64_t* weights);
00539 
00548         char* compute_consensus(
00549             int32_t &num_feat, int32_t num_suppvec, int32_t* IDX,
00550             float64_t* alphas);
00551 
00563         float64_t* extract_w(
00564             int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
00565             float64_t* w_result, int32_t num_suppvec, int32_t* IDX,
00566             float64_t* alphas);
00567 
00580         float64_t* compute_POIM(
00581             int32_t max_degree, int32_t& num_feat, int32_t& num_sym,
00582             float64_t* poim_result, int32_t num_suppvec, int32_t* IDX,
00583             float64_t* alphas, float64_t* distrib);
00584 
00591         void prepare_POIM2(
00592             float64_t* distrib, int32_t num_sym, int32_t num_feat);
00593 
00600         void compute_POIM2(int32_t max_degree, CSVM* svm);
00601 
00607         void get_POIM2(float64_t** poim, int32_t* result_len);
00608 
00610         void cleanup_POIM2();
00611         
00612     protected:
00614         void create_empty_tries();
00615 
00621         virtual void add_example_to_tree(
00622             int32_t idx, float64_t weight);
00623 
00630         void add_example_to_single_tree(
00631             int32_t idx, float64_t weight, int32_t tree_num);
00632 
00641         virtual float64_t compute(int32_t idx_a, int32_t idx_b);
00642 
00651         float64_t compute_with_mismatch(
00652             char* avec, int32_t alen, char* bvec, int32_t blen);
00653 
00662         float64_t compute_without_mismatch(
00663             char* avec, int32_t alen, char* bvec, int32_t blen);
00664 
00673         float64_t compute_without_mismatch_matrix(
00674             char* avec, int32_t alen, char* bvec, int32_t blen);
00675 
00686         float64_t compute_without_mismatch_position_weights(
00687             char* avec, float64_t *posweights_lhs, int32_t alen,
00688             char* bvec, float64_t *posweights_rhs, int32_t blen);
00689 
00691         virtual void remove_lhs();
00692 
00693     protected:
00695         float64_t* weights;
00697         float64_t* position_weights;
00699         float64_t* position_weights_lhs;
00701         float64_t* position_weights_rhs;
00703         bool* position_mask;
00704 
00706         float64_t* weights_buffer;
00708         int32_t mkl_stepsize;
00709 
00711         int32_t degree;
00713         int32_t length;
00714 
00716         int32_t max_mismatch;
00718         int32_t seq_length;
00719 
00721         int32_t *shift;
00723         int32_t shift_len;
00725         int32_t max_shift;
00726 
00728         bool block_computation;
00729 
00731         int32_t num_block_weights_external;
00733         float64_t* block_weights_external;
00734 
00736         float64_t* block_weights;
00738         EWDKernType type;
00740         int32_t which_degree;
00741 
00743         CTrie<DNATrie> tries;
00745         CTrie<POIMTrie> poim_tries;
00746 
00748         bool tree_initialized;
00750         bool use_poim_tries;
00751 
00753         float64_t* m_poim_distrib;
00755         float64_t* m_poim;
00756 
00758         int32_t m_poim_num_sym;
00760         int32_t m_poim_num_feat;
00762         int32_t m_poim_result_len;
00763 
00765         CAlphabet* alphabet;
00766 };
00767 }
00768 #endif /* _WEIGHTEDDEGREEPOSITIONSTRINGKERNEL_H___ */

SHOGUN Machine Learning Toolbox - Documentation