00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _WEIGHTEDDEGREESTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREESTRINGKERNEL_H___
00014
00015 #include "lib/common.h"
00016 #include "lib/Trie.h"
00017 #include "kernel/StringKernel.h"
00018 #include "features/StringFeatures.h"
00019
00020 namespace shogun
00021 {
00022
/** Weighting schemes understood by CWeightedDegreeStringKernel.
 *
 * The numeric values are written to archives by the kernel's serialization
 * code, so they are spelled out explicitly and must stay stable.
 *
 * NOTE(review): the E_BLOCK_* names mirror the init_block_weights_*()
 * members of the kernel class; presumably each value selects the matching
 * initializer -- confirm against the implementation file.
 */
enum EWDKernType
{
    E_WD              = 0,
    E_EXTERNAL        = 1,

    E_BLOCK_CONST     = 2,
    E_BLOCK_LINEAR    = 3,
    E_BLOCK_SQPOLY    = 4,
    E_BLOCK_CUBICPOLY = 5,
    E_BLOCK_EXP       = 6,
    E_BLOCK_LOG       = 7,
    E_BLOCK_EXTERNAL  = 8
};
00036
00037
00052 class CWeightedDegreeStringKernel: public CStringKernel<char>
00053 {
00054 public:
00055
00061 CWeightedDegreeStringKernel(int32_t degree, EWDKernType type=E_WD);
00062
00068 CWeightedDegreeStringKernel(float64_t* weights, int32_t degree);
00069
00076 CWeightedDegreeStringKernel(
00077 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00078
00079 virtual ~CWeightedDegreeStringKernel();
00080
00087 virtual bool init(CFeatures* l, CFeatures* r);
00088
00090 virtual void cleanup();
00091
00099 EWDKernType get_type() const
00100 {
00101 return type;
00102 }
00103
00108 int32_t get_degree() const
00109 {
00110 return degree;
00111 }
00112
00118 int32_t get_max_mismatch() const
00119 {
00120 return max_mismatch;
00121 }
00122
00127 virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREE; }
00128
00133 virtual const char* get_name() const { return "WeightedDegree"; }
00134
00142 inline virtual bool init_optimization(
00143 int32_t count, int32_t *IDX, float64_t* alphas)
00144 {
00145 return init_optimization(count, IDX, alphas, -1);
00146 }
00147
00158 virtual bool init_optimization(
00159 int32_t count, int32_t *IDX, float64_t* alphas, int32_t tree_num);
00160
00165 virtual bool delete_optimization();
00166
00172 virtual float64_t compute_optimized(int32_t idx)
00173 {
00174 if (get_is_initialized())
00175 return compute_by_tree(idx);
00176
00177 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00178 return 0;
00179 }
00180
00185 static void* compute_batch_helper(void* p);
00186
00197 virtual void compute_batch(
00198 int32_t num_vec, int32_t* vec_idx, float64_t* target,
00199 int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00200 float64_t factor=1.0);
00201
00205 inline virtual void clear_normal()
00206 {
00207 if (get_is_initialized())
00208 {
00209 tries->delete_trees(max_mismatch==0);
00210 set_is_initialized(false);
00211 }
00212 }
00213
00219 inline virtual void add_to_normal(int32_t idx, float64_t weight)
00220 {
00221 if (max_mismatch==0)
00222 add_example_to_tree(idx, weight);
00223 else
00224 add_example_to_tree_mismatch(idx, weight);
00225
00226 set_is_initialized(true);
00227 }
00228
00233 inline virtual int32_t get_num_subkernels()
00234 {
00235 if (position_weights!=NULL)
00236 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00237 if (length==0)
00238 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00239 return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00240 }
00241
00247 inline void compute_by_subkernel(
00248 int32_t idx, float64_t * subkernel_contrib)
00249 {
00250 if (get_is_initialized())
00251 {
00252 compute_by_tree(idx, subkernel_contrib);
00253 return ;
00254 }
00255
00256 SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00257 }
00258
00264 inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00265 {
00266 num_weights = get_num_subkernels();
00267
00268 delete[] weights_buffer ;
00269 weights_buffer = new float64_t[num_weights];
00270
00271 if (position_weights!=NULL)
00272 for (int32_t i=0; i<num_weights; i++)
00273 weights_buffer[i] = position_weights[i*mkl_stepsize];
00274 else
00275 for (int32_t i=0; i<num_weights; i++)
00276 weights_buffer[i] = weights[i*mkl_stepsize];
00277
00278 return weights_buffer;
00279 }
00280
00286 inline void set_subkernel_weights(
00287 float64_t* weights2, int32_t num_weights2)
00288 {
00289 int32_t num_weights = get_num_subkernels();
00290 if (num_weights!=num_weights2)
00291 SG_ERROR( "number of weights do not match\n");
00292
00293 if (position_weights!=NULL)
00294 {
00295 for (int32_t i=0; i<num_weights; i++)
00296 {
00297 for (int32_t j=0; j<mkl_stepsize; j++)
00298 {
00299 if (i*mkl_stepsize+j<seq_length)
00300 position_weights[i*mkl_stepsize+j] = weights2[i];
00301 }
00302 }
00303 }
00304 else if (length==0)
00305 {
00306 for (int32_t i=0; i<num_weights; i++)
00307 {
00308 for (int32_t j=0; j<mkl_stepsize; j++)
00309 {
00310 if (i*mkl_stepsize+j<get_degree())
00311 weights[i*mkl_stepsize+j] = weights2[i];
00312 }
00313 }
00314 }
00315 else
00316 {
00317 for (int32_t i=0; i<num_weights; i++)
00318 {
00319 for (int32_t j=0; j<mkl_stepsize; j++)
00320 {
00321 if (i*mkl_stepsize+j<get_degree()*length)
00322 weights[i*mkl_stepsize+j] = weights2[i];
00323 }
00324 }
00325 }
00326 }
00327
00328
00334 float64_t *compute_abs_weights(int32_t & len);
00335
00342 void compute_by_tree(int32_t idx, float64_t *LevelContrib);
00343
00348 bool is_tree_initialized() { return tree_initialized; }
00349
00355 inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00356 {
00357 d=degree;
00358 len=length;
00359 return weights;
00360 }
00361
00367 inline float64_t *get_weights(int32_t& num_weights)
00368 {
00369 if (position_weights!=NULL)
00370 {
00371 num_weights = seq_length ;
00372 return position_weights ;
00373 }
00374 if (length==0)
00375 num_weights = degree ;
00376 else
00377 num_weights = degree*length ;
00378 return weights;
00379 }
00380
00386 inline float64_t *get_position_weights(int32_t& len)
00387 {
00388 len=seq_length;
00389 return position_weights;
00390 }
00391
00397 bool set_wd_weights_by_type(EWDKernType type);
00398
00405 void set_wd_weights(float64_t* p_weights, int32_t d)
00406 {
00407 set_weights(p_weights,d,0);
00408 }
00409
00416 bool set_weights(float64_t* weights, int32_t d, int32_t len);
00417
00424 bool set_position_weights(float64_t* position_weights, int32_t len=0);
00425
00430 bool init_block_weights();
00431
00436 bool init_block_weights_from_wd();
00437
00442 bool init_block_weights_from_wd_external();
00443
00448 bool init_block_weights_const();
00449
00454 bool init_block_weights_linear();
00455
00460 bool init_block_weights_sqpoly();
00461
00466 bool init_block_weights_cubicpoly();
00467
00472 bool init_block_weights_exp();
00473
00478 bool init_block_weights_log();
00479
00484 bool init_block_weights_external();
00485
00490 bool delete_position_weights()
00491 {
00492 delete[] position_weights;
00493 position_weights=NULL;
00494 return true;
00495 }
00496
00502 bool set_max_mismatch(int32_t max);
00503
00508 inline int32_t get_max_mismatch() { return max_mismatch; }
00509
00515 inline bool set_degree(int32_t deg) { degree=deg; return true; }
00516
00521 inline int32_t get_degree() { return degree; }
00522
00528 inline bool set_use_block_computation(bool block)
00529 {
00530 block_computation=block;
00531 return true;
00532 }
00533
00538 inline bool get_use_block_computation() { return block_computation; }
00539
00545 inline bool set_mkl_stepsize(int32_t step)
00546 {
00547 if (step<1)
00548 SG_ERROR("Stepsize must be a positive integer\n");
00549 mkl_stepsize=step;
00550 return true;
00551 }
00552
00557 inline int32_t get_mkl_stepsize() { return mkl_stepsize; }
00558
00564 inline bool set_which_degree(int32_t which)
00565 {
00566 which_degree=which;
00567 return true;
00568 }
00569
00574 inline int32_t get_which_degree() { return which_degree; }
00575
00576 protected:
00578 void create_empty_tries();
00579
00585 void add_example_to_tree(int32_t idx, float64_t weight);
00586
00593 void add_example_to_single_tree(
00594 int32_t idx, float64_t weight, int32_t tree_num);
00595
00601 void add_example_to_tree_mismatch(int32_t idx, float64_t weight);
00602
00609 void add_example_to_single_tree_mismatch(
00610 int32_t idx, float64_t weight, int32_t tree_num);
00611
00617 float64_t compute_by_tree(int32_t idx);
00618
00627 float64_t compute(int32_t idx_a, int32_t idx_b);
00628
00637 float64_t compute_with_mismatch(
00638 char* avec, int32_t alen, char* bvec, int32_t blen);
00639
00648 float64_t compute_without_mismatch(
00649 char* avec, int32_t alen, char* bvec, int32_t blen);
00650
00659 float64_t compute_without_mismatch_matrix(
00660 char* avec, int32_t alen, char* bvec, int32_t blen);
00661
00670 float64_t compute_using_block(char* avec, int32_t alen,
00671 char* bvec, int32_t blen);
00672
00674 virtual void remove_lhs();
00675
00676
00677 #ifdef HAVE_BOOST_SERIALIZATION
00678 private:
00679
00680
00681
00682
00683
00684
00685
00686
00687
00688
00689
00690
00691
00692
00693
00694
00695
00696
00697
00698 friend class ::boost::serialization::access;
00699
00700
00701
00702 template<class Archive>
00703 void save(Archive & ar, const unsigned int archive_version) const
00704 {
00705
00706 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00707
00708 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00709
00710
00713 ar & mkl_stepsize ;
00714 ar & degree;
00715 ar & length;
00716 ar & max_mismatch ;
00717
00718 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00719 ar & weights[i];
00720
00721
00722
00723
00724
00725 ar & seq_length ;
00726
00727 ar & initialized ;
00728 ar & block_computation;
00729
00730
00731
00732
00733 ar & num_block_weights_external;
00734
00735 for (int32_t i=0; i < num_block_weights_external; ++i)
00736 {
00737 ar & block_weights_external[i];
00738 }
00739
00740
00741
00742 ar & type;
00743 ar & which_degree;
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00755
00756 }
00757
00758 template<class Archive>
00759 void load(Archive & ar, const unsigned int archive_version)
00760 {
00761 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00762
00763 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00764
00765
00768 ar & mkl_stepsize ;
00769 ar & degree;
00770 ar & length;
00771 ar & max_mismatch ;
00772
00773 weights=new float64_t[degree*(1+max_mismatch)];
00774 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00775 ar & weights[i];
00776
00777
00778
00779
00780
00781
00782 ar & seq_length ;
00783
00784 ar & initialized ;
00785 ar & block_computation;
00786
00787
00788
00789
00790 ar & num_block_weights_external;
00791
00792 block_weights_external = new float64_t[num_block_weights_external];
00793 for (int32_t i=0; i < num_block_weights_external; ++i)
00794 {
00795 ar & block_weights_external[i];
00796 }
00797
00798
00799
00800 ar & type;
00801 ar & which_degree;
00802
00803
00804
00805
00806
00807 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00808
00809 }
00810
00811 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
00812
00813
00814 public:
00815
00816 virtual std::string toString() const
00817 {
00818 std::ostringstream s;
00819
00820 ::boost::archive::text_oarchive oa(s);
00821
00822 oa << *this;
00823
00824 return s.str();
00825 }
00826
00827 virtual void fromString(std::string str)
00828 {
00829
00830 std::istringstream is(str);
00831
00832 ::boost::archive::text_iarchive ia(is);
00833
00834 ia >> *this;
00835
00836 }
00837
00838 #endif //HAVE_BOOST_SERIALIZATION
00839
00840
00841 protected:
00845 float64_t* weights;
00847 float64_t* position_weights;
00849 float64_t* weights_buffer;
00851 int32_t mkl_stepsize;
00853 int32_t degree;
00855 int32_t length;
00856
00858 int32_t max_mismatch;
00860 int32_t seq_length;
00861
00863 bool initialized;
00864
00866 bool block_computation;
00867
00869 int32_t num_block_weights_external;
00871 float64_t* block_weights_external;
00872
00874 float64_t* block_weights;
00876 EWDKernType type;
00878 int32_t which_degree;
00879
00881 CTrie<DNATrie>* tries;
00882
00884 bool tree_initialized;
00885
00887 CAlphabet* alphabet;
00888 };
00889
00890 }
00891
00892 #ifdef HAVE_BOOST_SERIALIZATION
00893
00894 namespace boost
00895 {
00896 namespace serialization
00897 {
00898 template<class Archive>
00899 inline void save_construct_data(Archive & ar, const shogun::CWeightedDegreeStringKernel* const t, const unsigned int file_version)
00900 {
00901
00902
00903 int32_t size = 10;
00904 ar << size;
00905
00906 shogun::EWDKernType type = t->get_type();
00907 ar << type;
00908
00909 int32_t degree = t->get_degree();
00910 ar << degree;
00911
00912 int32_t max_mismatch = t->get_max_mismatch();
00913 ar << max_mismatch;
00914
00915
00916
00917
00918
00919
00920
00921 ar.register_type(static_cast<shogun::CStringFeatures<char> *>(NULL));
00922
00923 const shogun::CStringFeatures<char>* const lhs = dynamic_cast<shogun::CStringFeatures<char>* >(const_cast<shogun::CWeightedDegreeStringKernel*>(t)->get_lhs());
00924
00925 const shogun::CStringFeatures<char>* const rhs = dynamic_cast<shogun::CStringFeatures<char>* >(const_cast<shogun::CWeightedDegreeStringKernel*>(t)->get_rhs());
00926
00927
00928
00929
00930
00931
00932 ar << lhs;
00933 ar << rhs;
00934
00935
00936
00937
00938
00939 }
00940
00941 template<class Archive>
00942 inline void load_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel * t, const unsigned int file_version)
00943 {
00944
00945 std::cout << "loading WDK from non-defaultconstruct data works" << std::endl;
00946
00947
00948 int32_t size;
00949 shogun::EWDKernType type;
00950 int32_t degree;
00951 int32_t max_mismatch;
00952
00953
00954
00955
00956 ar >> size;
00957 ar >> type;
00958 ar >> degree;
00959 ar >> max_mismatch;
00960
00961
00962
00963 shogun::CStringFeatures<char>* lhs;
00964 shogun::CStringFeatures<char>* rhs;
00965
00966
00967 ar >> lhs;
00968 ar >> rhs;
00969
00970 ::new(t)shogun::CWeightedDegreeStringKernel(lhs, rhs, degree);
00971
00972
00973
00974 std::cout << "done loading WDK from non-defaultconstruct data" << std::endl;
00975 }
00976 }
00977 }
00978 #endif //HAVE_BOOST_SERIALIZATION
00979
00980
00981
00982 #endif