WeightedDegreeStringKernel.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _WEIGHTEDDEGREESTRINGKERNEL_H___
00013 #define _WEIGHTEDDEGREESTRINGKERNEL_H___
00014 
00015 #include "lib/common.h"
00016 #include "lib/Trie.h"
00017 #include "kernel/StringKernel.h"
00018 #include "features/StringFeatures.h"
00019 
00020 namespace shogun
00021 {
00022 
/** Selects the weighting scheme of the weighted degree kernel.
 *
 * The numeric values are spelled out explicitly because the enum value is
 * written to / read from archives by the kernel's serialization code, so
 * they must not shift if entries are ever reordered.
 *
 * NOTE(review): judging by their names, the E_BLOCK_* entries pair up with
 * the CWeightedDegreeStringKernel::init_block_weights_*() methods
 * (const, linear, sqpoly, ...) - confirm against the implementation file.
 */
enum EWDKernType
{
    /** plain weighted degree weighting */
    E_WD              = 0,
    /** externally supplied weighting */
    E_EXTERNAL        = 1,

    /** block weighting: const */
    E_BLOCK_CONST     = 2,
    /** block weighting: linear */
    E_BLOCK_LINEAR    = 3,
    /** block weighting: sqpoly */
    E_BLOCK_SQPOLY    = 4,
    /** block weighting: cubicpoly */
    E_BLOCK_CUBICPOLY = 5,
    /** block weighting: exp */
    E_BLOCK_EXP       = 6,
    /** block weighting: log */
    E_BLOCK_LOG       = 7,
    /** block weighting: external */
    E_BLOCK_EXTERNAL  = 8
};
00036 
00037 
00052 class CWeightedDegreeStringKernel: public CStringKernel<char>
00053 {
00054     public:
00055 
00061         CWeightedDegreeStringKernel(int32_t degree, EWDKernType type=E_WD);
00062 
00068         CWeightedDegreeStringKernel(float64_t* weights, int32_t degree);
00069 
00076         CWeightedDegreeStringKernel(
00077             CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t degree);
00078 
00079         virtual ~CWeightedDegreeStringKernel();
00080 
00087         virtual bool init(CFeatures* l, CFeatures* r);
00088 
00090         virtual void cleanup();
00091 
00099         EWDKernType get_type() const
00100         {
00101             return type;
00102         }
00103 
00108         int32_t get_degree() const
00109         {
00110             return degree;
00111         }
00112 
00118         int32_t get_max_mismatch() const
00119         {
00120             return max_mismatch;
00121         }
00122 
00127         virtual EKernelType get_kernel_type() { return K_WEIGHTEDDEGREE; }
00128 
00133         virtual const char* get_name() const { return "WeightedDegree"; }
00134 
00142         inline virtual bool init_optimization(
00143             int32_t count, int32_t *IDX, float64_t* alphas)
00144         {
00145             return init_optimization(count, IDX, alphas, -1);
00146         }
00147 
00158         virtual bool init_optimization(
00159             int32_t count, int32_t *IDX, float64_t* alphas, int32_t tree_num);
00160 
00165         virtual bool delete_optimization();
00166 
00172         virtual float64_t compute_optimized(int32_t idx)
00173         {
00174             if (get_is_initialized())
00175                 return compute_by_tree(idx);
00176 
00177             SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00178             return 0;
00179         }
00180 
00185         static void* compute_batch_helper(void* p);
00186 
00197         virtual void compute_batch(
00198             int32_t num_vec, int32_t* vec_idx, float64_t* target,
00199             int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00200             float64_t factor=1.0);
00201 
00205         inline virtual void clear_normal()
00206         {
00207             if (get_is_initialized())
00208             {
00209                 tries->delete_trees(max_mismatch==0);
00210                 set_is_initialized(false);
00211             }
00212         }
00213 
00219         inline virtual void add_to_normal(int32_t idx, float64_t weight)
00220         {
00221             if (max_mismatch==0)
00222                 add_example_to_tree(idx, weight);
00223             else
00224                 add_example_to_tree_mismatch(idx, weight);
00225 
00226             set_is_initialized(true);
00227         }
00228 
00233         inline virtual int32_t get_num_subkernels()
00234         {
00235             if (position_weights!=NULL)
00236                 return (int32_t) ceil(1.0*seq_length/mkl_stepsize) ;
00237             if (length==0)
00238                 return (int32_t) ceil(1.0*get_degree()/mkl_stepsize);
00239             return (int32_t) ceil(1.0*get_degree()*length/mkl_stepsize) ;
00240         }
00241 
00247         inline void compute_by_subkernel(
00248             int32_t idx, float64_t * subkernel_contrib)
00249         {
00250             if (get_is_initialized())
00251             {
00252                 compute_by_tree(idx, subkernel_contrib);
00253                 return ;
00254             }
00255 
00256             SG_ERROR( "CWeightedDegreeStringKernel optimization not initialized\n");
00257         }
00258 
00264         inline const float64_t* get_subkernel_weights(int32_t& num_weights)
00265         {
00266             num_weights = get_num_subkernels();
00267 
00268             delete[] weights_buffer ;
00269             weights_buffer = new float64_t[num_weights];
00270 
00271             if (position_weights!=NULL)
00272                 for (int32_t i=0; i<num_weights; i++)
00273                     weights_buffer[i] = position_weights[i*mkl_stepsize];
00274             else
00275                 for (int32_t i=0; i<num_weights; i++)
00276                     weights_buffer[i] = weights[i*mkl_stepsize];
00277 
00278             return weights_buffer;
00279         }
00280 
00286         inline void set_subkernel_weights(
00287             float64_t* weights2, int32_t num_weights2)
00288         {
00289             int32_t num_weights = get_num_subkernels();
00290             if (num_weights!=num_weights2)
00291                 SG_ERROR( "number of weights do not match\n");
00292 
00293             if (position_weights!=NULL)
00294             {
00295                 for (int32_t i=0; i<num_weights; i++)
00296                 {
00297                     for (int32_t j=0; j<mkl_stepsize; j++)
00298                     {
00299                         if (i*mkl_stepsize+j<seq_length)
00300                             position_weights[i*mkl_stepsize+j] = weights2[i];
00301                     }
00302                 }
00303             }
00304             else if (length==0)
00305             {
00306                 for (int32_t i=0; i<num_weights; i++)
00307                 {
00308                     for (int32_t j=0; j<mkl_stepsize; j++)
00309                     {
00310                         if (i*mkl_stepsize+j<get_degree())
00311                             weights[i*mkl_stepsize+j] = weights2[i];
00312                     }
00313                 }
00314             }
00315             else
00316             {
00317                 for (int32_t i=0; i<num_weights; i++)
00318                 {
00319                     for (int32_t j=0; j<mkl_stepsize; j++)
00320                     {
00321                         if (i*mkl_stepsize+j<get_degree()*length)
00322                             weights[i*mkl_stepsize+j] = weights2[i];
00323                     }
00324                 }
00325             }
00326         }
00327 
00328         // other kernel tree operations
00334         float64_t *compute_abs_weights(int32_t & len);
00335 
00342         void compute_by_tree(int32_t idx, float64_t *LevelContrib);
00343 
00348         bool is_tree_initialized() { return tree_initialized; }
00349 
00355         inline float64_t *get_degree_weights(int32_t& d, int32_t& len)
00356         {
00357             d=degree;
00358             len=length;
00359             return weights;
00360         }
00361 
00367         inline float64_t *get_weights(int32_t& num_weights)
00368         {
00369             if (position_weights!=NULL)
00370             {
00371                 num_weights = seq_length ;
00372                 return position_weights ;
00373             }
00374             if (length==0)
00375                 num_weights = degree ;
00376             else
00377                 num_weights = degree*length ;
00378             return weights;
00379         }
00380 
00386         inline float64_t *get_position_weights(int32_t& len)
00387         {
00388             len=seq_length;
00389             return position_weights;
00390         }
00391 
00397         bool set_wd_weights_by_type(EWDKernType type);
00398 
00405         void set_wd_weights(float64_t* p_weights, int32_t d)
00406         {
00407             set_weights(p_weights,d,0);
00408         }
00409 
00416         bool set_weights(float64_t* weights, int32_t d, int32_t len);
00417 
00424         bool set_position_weights(float64_t* position_weights, int32_t len=0);
00425 
00430         bool init_block_weights();
00431 
00436         bool init_block_weights_from_wd();
00437 
00442         bool init_block_weights_from_wd_external();
00443 
00448         bool init_block_weights_const();
00449 
00454         bool init_block_weights_linear();
00455 
00460         bool init_block_weights_sqpoly();
00461 
00466         bool init_block_weights_cubicpoly();
00467 
00472         bool init_block_weights_exp();
00473 
00478         bool init_block_weights_log();
00479 
00484         bool init_block_weights_external();
00485 
00490         bool delete_position_weights()
00491         {
00492             delete[] position_weights;
00493             position_weights=NULL;
00494             return true;
00495         }
00496 
00502         bool set_max_mismatch(int32_t max);
00503 
00508         inline int32_t get_max_mismatch() { return max_mismatch; }
00509 
00515         inline bool set_degree(int32_t deg) { degree=deg; return true; }
00516 
00521         inline int32_t get_degree() { return degree; }
00522 
00528         inline bool set_use_block_computation(bool block)
00529         {
00530             block_computation=block;
00531             return true;
00532         }
00533 
00538         inline bool get_use_block_computation() { return block_computation; }
00539 
00545         inline bool set_mkl_stepsize(int32_t step)
00546         {
00547             if (step<1)
00548                 SG_ERROR("Stepsize must be a positive integer\n");
00549             mkl_stepsize=step;
00550             return true;
00551         }
00552 
00557         inline int32_t get_mkl_stepsize() { return mkl_stepsize; }
00558 
00564         inline bool set_which_degree(int32_t which)
00565         {
00566             which_degree=which;
00567             return true;
00568         }
00569 
00574         inline int32_t get_which_degree() { return which_degree; }
00575 
00576     protected:
00578         void create_empty_tries();
00579 
00585         void add_example_to_tree(int32_t idx, float64_t weight);
00586 
00593         void add_example_to_single_tree(
00594             int32_t idx, float64_t weight, int32_t tree_num);
00595 
00601         void add_example_to_tree_mismatch(int32_t idx, float64_t weight);
00602 
00609         void add_example_to_single_tree_mismatch(
00610             int32_t idx, float64_t weight, int32_t tree_num);
00611 
00617         float64_t compute_by_tree(int32_t idx);
00618 
00627         float64_t compute(int32_t idx_a, int32_t idx_b);
00628 
00637         float64_t compute_with_mismatch(
00638             char* avec, int32_t alen, char* bvec, int32_t blen);
00639 
00648         float64_t compute_without_mismatch(
00649             char* avec, int32_t alen, char* bvec, int32_t blen);
00650 
00659         float64_t compute_without_mismatch_matrix(
00660             char* avec, int32_t alen, char* bvec, int32_t blen);
00661 
00670         float64_t compute_using_block(char* avec, int32_t alen,
00671             char* bvec, int32_t blen);
00672 
00674         virtual void remove_lhs();
00675 
00676 
00677 #ifdef HAVE_BOOST_SERIALIZATION
00678     private:
00679         /*
00680            friend class ::boost::serialization::access;
00681 
00682            template<class Archive>
00683            void serialize(Archive & ar, const unsigned int archive_version)
00684            {
00685 
00686            SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00687 
00688            ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00689 
00690            SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00691 
00692            }
00693 
00694 */
00695         // serialization needs to split up in save/load because
00696         // the serialization of pointers to natives (int* & friends)
00697         // requires a workaround
00698         friend class ::boost::serialization::access;
00699         //  friend std::ostream & operator<<(std::ostream &os, const CWeightedDegreeStringKernel &gp);
00700         //template<class Archive>
00701         //friend void ::boost::serialization::save_construct_data(Archive & ar, const CWeightedDegreeStringKernel* t, const unsigned int file_version);
00702         template<class Archive>
00703             void save(Archive & ar, const unsigned int archive_version) const
00704             {
00705 
00706                 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00707 
00708                 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00709 
00710 
00713                 ar & mkl_stepsize ;
00714                 ar & degree;
00715                 ar & length;
00716                 ar & max_mismatch ;
00717 
00718                 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00719                     ar & weights[i];
00720 
00721                 //TODO how long?
00722                 //float64_t* position_weights ;
00723                 //float64_t* weights_buffer ;
00724 
00725                 ar & seq_length ;
00726 
00727                 ar & initialized ;
00728                 ar & block_computation;
00729                 //ar & use_normalization ;
00730 
00731                 //ar & normalization_const;
00732 
00733                 ar & num_block_weights_external;
00734                 //float64_t* block_weights_external;
00735                 for (int32_t i=0; i < num_block_weights_external; ++i)
00736                 {
00737                     ar & block_weights_external[i];
00738                 }
00739 
00740                 //TODO how long
00741                 //float64_t* block_weights;
00742                 ar & type;
00743                 ar & which_degree;
00744 
00745                 //TODO implement
00746                 //CTrie<DNATrie> tries ;
00747                 //ar & tree_initialized ;
00748 
00749 
00750                 //CWeightedDegreeStringKernel* tmp = const_cast<CWeightedDegreeStringKernel*>(this);
00751                 //tmp->create_empty_tries();
00752                 //create_empty_tries();
00753 
00754                 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00755 
00756             }
00757 
00758         template<class Archive>
00759             void load(Archive & ar, const unsigned int archive_version)
00760             {
00761                 SG_DEBUG("archiving CWeightedDegreeStringKernel\n");
00762 
00763                 ar & ::boost::serialization::base_object<CStringKernel<char> >(*this);
00764 
00765 
00768                 ar & mkl_stepsize ;
00769                 ar & degree;
00770                 ar & length;
00771                 ar & max_mismatch ;
00772 
00773                 weights=new float64_t[degree*(1+max_mismatch)];
00774                 for (int32_t i=0; i<degree*(1+max_mismatch); i++)
00775                     ar & weights[i];
00776 
00777 
00778                 //TODO how long?
00779                 //float64_t* position_weights ;
00780                 //float64_t* weights_buffer ;
00781 
00782                 ar & seq_length ;
00783 
00784                 ar & initialized ;
00785                 ar & block_computation;
00786                 //ar & use_normalization ;
00787 
00788                 //ar & normalization_const;
00789 
00790                 ar & num_block_weights_external;
00791                 //float64_t* block_weights_external;
00792                 block_weights_external = new float64_t[num_block_weights_external];
00793                 for (int32_t i=0; i < num_block_weights_external; ++i)
00794                 {
00795                     ar & block_weights_external[i];
00796                 }
00797 
00798                 //TODO how long
00799                 //float64_t* block_weights;
00800                 ar & type;
00801                 ar & which_degree;
00802 
00803                 //TODO implement
00804                 //CTrie<DNATrie> tries ;
00805                 //ar & tree_initialized ;
00806 
00807                 SG_DEBUG("done with CWeightedDegreeStringKernel\n");
00808 
00809             }
00810 
00811         GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
00812 
00813 
00814     public:
00815 
00816         virtual std::string toString() const
00817         {
00818             std::ostringstream s;
00819 
00820             ::boost::archive::text_oarchive oa(s);
00821 
00822             oa << *this;
00823 
00824             return s.str();
00825         }
00826 
00827         virtual void fromString(std::string str)
00828         {
00829 
00830             std::istringstream is(str);
00831 
00832             ::boost::archive::text_iarchive ia(is);
00833 
00834             ia >> *this;
00835 
00836         }
00837 
00838 #endif //HAVE_BOOST_SERIALIZATION
00839 
00840 
00841     protected:
00845         float64_t* weights;
00847         float64_t* position_weights;
00849         float64_t* weights_buffer;
00851         int32_t mkl_stepsize;
00853         int32_t degree;
00855         int32_t length;
00856 
00858         int32_t max_mismatch;
00860         int32_t seq_length;
00861 
00863         bool initialized;
00864 
00866         bool block_computation;
00867 
00869         int32_t num_block_weights_external;
00871         float64_t* block_weights_external;
00872 
00874         float64_t* block_weights;
00876         EWDKernType type;
00878         int32_t which_degree;
00879 
00881         CTrie<DNATrie>* tries;
00882 
00884         bool tree_initialized;
00885 
00887         CAlphabet* alphabet;
00888 };
00889 
00890 }
00891 
00892 #ifdef HAVE_BOOST_SERIALIZATION
00893 //http://www.koders.com/cpp/fidB8C82A2BBA651A5E4EEC668EDE70B86EA017E937.aspx
00894 namespace boost
00895 {
00896     namespace serialization
00897     {
00898         template<class Archive>
00899             inline void save_construct_data(Archive & ar, const shogun::CWeightedDegreeStringKernel* const t, const unsigned int file_version)
00900             {
00901                 //TODO it has to be possible to access protected fields directly
00902                 //CWeightedDegreeStringKernel(INT size, EWDKernType type, INT degree, INT max_mismatch, bool use_normalization=true, bool block_computation=false, INT mkl_stepsize=1, INT which_deg=-1) ;
00903                 int32_t size = 10;
00904                 ar << size;
00905 
00906                 shogun::EWDKernType type = t->get_type();
00907                 ar << type;
00908 
00909                 int32_t degree = t->get_degree();
00910                 ar << degree;
00911 
00912                 int32_t max_mismatch = t->get_max_mismatch();
00913                 ar << max_mismatch;
00914 
00915                 //   TODO solution to the problem is that create_empty_tries has to be called
00916                 //   _after_ lhs, and rhs are set.
00917 
00918                 //   other solution -> serialize tree
00919 
00920 
00921                 ar.register_type(static_cast<shogun::CStringFeatures<char> *>(NULL));
00922 
00923                 const shogun::CStringFeatures<char>* const lhs = dynamic_cast<shogun::CStringFeatures<char>* >(const_cast<shogun::CWeightedDegreeStringKernel*>(t)->get_lhs());
00924 
00925                 const shogun::CStringFeatures<char>* const rhs = dynamic_cast<shogun::CStringFeatures<char>* >(const_cast<shogun::CWeightedDegreeStringKernel*>(t)->get_rhs());
00926                 //CStringFeatures<char>* lhs = (CStringFeatures<char>*) (const_cast<CWeightedDegreeStringKernel*>(t)->get_lhs());
00927                 //CStringFeatures<char>* rhs = (CStringFeatures<char>*) (const_cast<CWeightedDegreeStringKernel*>(t)->get_rhs());
00928 
00929                 //    const CFeatures* const lhs = t->get_lhs();
00930                 //    const CFeatures* const rhs = t->get_rhs();
00931 
00932                 ar << lhs;
00933                 ar << rhs;
00934 
00935                 //ar << dynamic_cast<CStringFeatures<char>*>(rhs);
00936                 //ar << t->get_lhs();
00937                 //ar << t->get_rhs();
00938 
00939             }
00940 
00941         template<class Archive>
00942             inline void load_construct_data(Archive & ar, shogun::CWeightedDegreeStringKernel * t, const unsigned int file_version)
00943             {
00944 
00945                 std::cout << "loading WDK from non-defaultconstruct data works" << std::endl;
00946 
00947 
00948                 int32_t size;
00949                 shogun::EWDKernType type;
00950                 int32_t degree;
00951                 int32_t max_mismatch;
00952 
00953                 //      CStringFeatures<char>* lhs;
00954                 //      CStringFeatures<char>* rhs;
00955 
00956                 ar >> size;
00957                 ar >> type;
00958                 ar >> degree;
00959                 ar >> max_mismatch;
00960 
00961                 //      ::new(t)CWeightedDegreeStringKernel(size, type, degree, max_mismatch);
00962 
00963                 shogun::CStringFeatures<char>* lhs;
00964                 shogun::CStringFeatures<char>* rhs;
00965 
00966 
00967                 ar >> lhs;
00968                 ar >> rhs;
00969 
00970                 ::new(t)shogun::CWeightedDegreeStringKernel(lhs, rhs, degree);
00971 
00972                 //t->set_max_mismatch(max_mismatch);
00973 
00974                 std::cout << "done loading WDK from non-defaultconstruct data" << std::endl;
00975             }
00976     } // serialization
00977 } // namespace boost
00978 #endif //HAVE_BOOST_SERIALIZATION
00979 
00980 
00981 
00982 #endif /* _WEIGHTEDDEGREESTRINGKERNEL_H___ */

SHOGUN Machine Learning Toolbox - Documentation