Kernel.h

/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2009 Soeren Sonnenburg
 * Written (W) 1999-2008 Gunnar Raetsch
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#ifndef _KERNEL_H___
#define _KERNEL_H___

#include "lib/common.h"
#include "lib/Signal.h"
#include "lib/Mathematics.h"
#include "base/SGObject.h"
#include "features/Features.h"
#include "kernel/KernelNormalizer.h"

namespace shogun
{
    class CFeatures;
    class CKernelNormalizer;
    enum EFeatureType;
    enum EFeatureClass;

#ifdef USE_SHORTREAL_KERNELCACHE
    typedef float32_t KERNELCACHE_ELEM;
#else
    typedef float64_t KERNELCACHE_ELEM;
#endif

typedef int64_t KERNELCACHE_IDX;


enum EOptimizationType
{
    FASTBUTMEMHUNGRY,
    SLOWBUTMEMEFFICIENT
};

enum EKernelType
{
    K_UNKNOWN = 0,
    K_LINEAR = 10,
    K_SPARSELINEAR = 11,
    K_POLY = 20,
    K_GAUSSIAN = 30,
    K_SPARSEGAUSSIAN = 31,
    K_GAUSSIANSHIFT = 32,
    K_HISTOGRAM = 40,
    K_SALZBERG = 41,
    K_LOCALITYIMPROVED = 50,
    K_SIMPLELOCALITYIMPROVED = 60,
    K_FIXEDDEGREE = 70,
    K_WEIGHTEDDEGREE = 80,
    K_WEIGHTEDDEGREEPOS = 81,
    K_WEIGHTEDCOMMWORDSTRING = 90,
    K_POLYMATCH = 100,
    K_ALIGNMENT = 110,
    K_COMMWORDSTRING = 120,
    K_COMMULONGSTRING = 121,
    K_COMBINED = 140,
    K_AUC = 150,
    K_CUSTOM = 160,
    K_SIGMOID = 170,
    K_CHI2 = 180,
    K_DIAG = 190,
    K_CONST = 200,
    K_DISTANCE = 220,
    K_LOCALALIGNMENT = 230,
    K_PYRAMIDCHI2 = 240,
    K_OLIGO = 250,
    K_MATCHWORD = 260,
    K_TPPK = 270,
    K_REGULATORYMODULES = 280
};

enum EKernelProperty
{
    KP_NONE = 0,
    KP_LINADD = 1,  // kernels whose linadd optimization works via updates to the normal vector, w += dw
    KP_KERNCOMBINATION = 2, // kernels that are in fact a linear combination of subkernels K=\sum_i b_i*K_i
    KP_BATCHEVALUATION = 4  // kernels that can generate normals on the fly in linadd and process batches of examples more quickly and memory-efficiently than single examples
};
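
/* Illustrative sketch (not part of the original header): the property values
 * above are bit flags, so a kernel implementation can combine and query them
 * through CKernel::set_property() and CKernel::has_property() defined below,
 * e.g.
 *
 * @code
 * set_property(KP_LINADD);
 * set_property(KP_BATCHEVALUATION);
 * // ...
 * if (has_property(KP_LINADD))
 * {
 *     // linadd optimization (init_optimization/compute_optimized) is available
 * }
 * @endcode
 */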

/** parameter struct handed to each thread spawned by get_kernel_matrix() */
template <class T> struct K_THREAD_PARAM
{
    CKernel* kernel;     /**< kernel object to evaluate */
    int32_t start;       /**< first row this thread computes */
    int32_t end;         /**< one past the last row this thread computes */
    int64_t total_start; /**< index of the first matrix entry assigned to this thread */
    int64_t total_end;   /**< one past the last matrix entry assigned to this thread */
    int32_t m;           /**< number of rows of the full kernel matrix */
    int32_t n;           /**< number of columns of the full kernel matrix */
    T* result;           /**< target buffer (column-major, m x n) */
    bool symmetric;      /**< true if lhs==rhs, so that k(i,j)=k(j,i) */
    bool verbose;        /**< whether this thread reports progress */
};

class CSVM;

/** @brief Base class for all kernels.
 *
 * A kernel defines a similarity measure k(x,y) between pairs of feature
 * vectors taken from the features attached to its left-hand and right-hand
 * side; computed values are passed through the attached CKernelNormalizer.
 */
class CKernel : public CSGObject
{
    friend class CVarianceKernelNormalizer;
    friend class CSqrtDiagKernelNormalizer;
    friend class CAvgDiagKernelNormalizer;
    friend class CRidgeKernelNormalizer;
    friend class CFirstElementKernelNormalizer;
    friend class CTanimotoKernelNormalizer;
    friend class CDiceKernelNormalizer;

    public:

        /** default constructor */
        CKernel();

        /** constructor
         * @param size kernel cache size
         */
        CKernel(int32_t size);

        /** constructor
         * @param l features for the left-hand side
         * @param r features for the right-hand side
         * @param size kernel cache size
         */
        CKernel(CFeatures* l, CFeatures* r, int32_t size);

        virtual ~CKernel();

        /** compute the normalized kernel value k(idx_a, idx_b)
         * @param idx_a index of the feature vector on the left-hand side
         * @param idx_b index of the feature vector on the right-hand side
         * @return normalized kernel value
         */
        inline float64_t kernel(int32_t idx_a, int32_t idx_b)
        {
            if (idx_a<0 || idx_b<0 || idx_a>=num_lhs || idx_b>=num_rhs)
            {
                SG_ERROR("Index out of range: idx_a=%d/%d idx_b=%d/%d\n",
                        idx_a,num_lhs, idx_b,num_rhs);
            }

            return normalizer->normalize(compute(idx_a, idx_b), idx_a, idx_b);
        }
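
        /* Illustrative usage (not part of the original header): once features
         * have been attached via init(), a single normalized kernel value can
         * be read off directly; kernel, lhs_features and rhs_features below
         * are placeholders.
         *
         * @code
         * kernel->init(lhs_features, rhs_features);
         * float64_t v=kernel->kernel(2, 5); // k(x_2, z_5), normalized
         * @endcode
         */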

        void get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n);

        template <class T>
        T* get_kernel_matrix(int32_t &m, int32_t &n, T* target)
        {
            T* result = NULL;

            if (!has_features())
                SG_ERROR( "no features assigned to kernel\n");

            if (target && (m!=get_num_vec_lhs() ||
                        n!=get_num_vec_rhs()) )
            {
                SG_ERROR( "kernel matrix size mismatch\n");
            }

            m=get_num_vec_lhs();
            n=get_num_vec_rhs();

            int64_t total_num = int64_t(m)*n;

            // if lhs == rhs and sizes match assume k(i,j)=k(j,i)
            bool symmetric= (lhs && lhs==rhs && m==n);

            SG_DEBUG( "returning kernel matrix of size %dx%d\n", m, n);

            if (target)
                result=target;
            else
                result=new T[total_num];

            int32_t num_threads=parallel->get_num_threads();
            if (num_threads < 2)
            {
                K_THREAD_PARAM<T> params;
                params.kernel=this;
                params.result=result;
                params.start=0;
                params.end=m;
                params.total_start=0;
                params.total_end=total_num;
                params.n=n;
                params.m=m;
                params.symmetric=symmetric;
                params.verbose=true;
                get_kernel_matrix_helper<T>((void*) &params);
            }
            else
            {
                pthread_t* threads = new pthread_t[num_threads-1];
                K_THREAD_PARAM<T>* params = new K_THREAD_PARAM<T>[num_threads];
                int64_t step= total_num/num_threads;

                int32_t t;

                for (t=0; t<num_threads-1; t++)
                {
                    params[t].kernel = this;
                    params[t].result = result;
                    params[t].start = compute_row_start(t*step, n, symmetric);
                    params[t].end = compute_row_start((t+1)*step, n, symmetric);
                    params[t].total_start=t*step;
                    params[t].total_end=(t+1)*step;
                    params[t].n=n;
                    params[t].m=m;
                    params[t].symmetric=symmetric;
                    params[t].verbose=false;
                    pthread_create(&threads[t], NULL,
                            CKernel::get_kernel_matrix_helper<T>, (void*)&params[t]);
                }

                params[t].kernel = this;
                params[t].result = result;
                params[t].start = compute_row_start(t*step, n, symmetric);
                params[t].end = m;
                params[t].total_start=t*step;
                params[t].total_end=total_num;
                params[t].n=n;
                params[t].m=m;
                params[t].symmetric=symmetric;
                params[t].verbose=true;
                get_kernel_matrix_helper<T>(&params[t]);

                for (t=0; t<num_threads-1; t++)
                    pthread_join(threads[t], NULL);

                delete[] params;
                delete[] threads;
            }

            SG_DONE();

            return result;
        }
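
        /* Illustrative usage sketch (not part of the original header): with
         * any concrete CKernel subclass that has features attached, the full
         * kernel matrix can be pulled out via the template above. The names
         * kernel, lfeat and rfeat are placeholders.
         *
         * @code
         * kernel->init(lfeat, rfeat);
         * int32_t m=0, n=0;
         * float64_t* K=kernel->get_kernel_matrix<float64_t>(m, n, NULL); // newly allocated
         * float64_t k_ij=K[2+5*m]; // column-major storage: entry (i=2, j=5)
         * delete[] K;              // the caller owns the returned buffer
         * @endcode
         */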

        virtual bool init(CFeatures* lhs, CFeatures* rhs);

        virtual bool set_normalizer(CKernelNormalizer* normalizer);

        virtual CKernelNormalizer* get_normalizer();

        virtual bool init_normalizer();

        virtual void cleanup();

        bool load(char* fname);

        bool save(char* fname);

        inline CFeatures* get_lhs() { SG_REF(lhs); return lhs; }

        inline CFeatures* get_rhs() { SG_REF(rhs); return rhs; }

        virtual inline int32_t get_num_vec_lhs()
        {
            return num_lhs;
        }

        virtual inline int32_t get_num_vec_rhs()
        {
            return num_rhs;
        }

        virtual inline bool has_features()
        {
            return lhs && rhs;
        }

        inline bool lhs_equals_rhs()
        {
            return lhs==rhs;
        }

        virtual void remove_lhs_and_rhs();

        virtual void remove_lhs();

        virtual void remove_rhs();

        virtual EKernelType get_kernel_type()=0;

        virtual EFeatureType get_feature_type()=0;

        virtual EFeatureClass get_feature_class()=0;
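
        /* Sketch of a minimal subclass (illustrative only; F_ANY and C_ANY
         * are assumed to be the generic feature type/class constants from
         * the features headers):
         *
         * @code
         * class CMyKernel : public CKernel
         * {
         *     public:
         *         CMyKernel() : CKernel(10) {}
         *         virtual EKernelType get_kernel_type() { return K_UNKNOWN; }
         *         virtual EFeatureType get_feature_type() { return F_ANY; }
         *         virtual EFeatureClass get_feature_class() { return C_ANY; }
         *
         *     protected:
         *         // dummy similarity: 1 on the diagonal, 0 elsewhere
         *         virtual float64_t compute(int32_t x, int32_t y)
         *         {
         *             return x==y ? 1.0 : 0.0;
         *         }
         * };
         * @endcode
         */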

        inline void set_cache_size(int32_t size)
        {
            cache_size = size;
        }

        inline int32_t get_cache_size() { return cache_size; }

        void list_kernel();

        inline bool has_property(EKernelProperty p) { return (properties & p) != 0; }

        virtual void clear_normal();

        virtual void add_to_normal(int32_t vector_idx, float64_t weight);

        inline EOptimizationType get_optimization_type() { return opt_type; }

        virtual inline void set_optimization_type(EOptimizationType t) { opt_type=t; }

        inline bool get_is_initialized() { return optimization_initialized; }

        virtual bool init_optimization(
            int32_t count, int32_t *IDX, float64_t *weights);

        virtual bool delete_optimization();

        bool init_optimization_svm(CSVM* svm);

        virtual float64_t compute_optimized(int32_t vector_idx);

        virtual void compute_batch(
            int32_t num_vec, int32_t* vec_idx, float64_t* target,
            int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
            float64_t factor=1.0);
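
        /* Illustrative linadd workflow (not part of the original header): for
         * kernels with KP_LINADD, a weighted set of support vectors can be
         * folded into a normal vector once, after which per-example
         * evaluation is a single call. num_sv, sv_idx, alphas and i are
         * placeholders for a trained SVM's support vector count, indices,
         * weights and a test example index.
         *
         * @code
         * if (kernel->has_property(KP_LINADD))
         * {
         *     kernel->init_optimization(num_sv, sv_idx, alphas); // build the normal
         *     float64_t out=kernel->compute_optimized(i);        // uses the normal
         *     kernel->delete_optimization();                     // free it again
         * }
         * @endcode
         */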

        inline float64_t get_combined_kernel_weight() { return combined_kernel_weight; }

        inline void set_combined_kernel_weight(float64_t nw) { combined_kernel_weight=nw; }

        virtual int32_t get_num_subkernels();

        virtual void compute_by_subkernel(
            int32_t vector_idx, float64_t* subkernel_contrib);

        virtual const float64_t* get_subkernel_weights(int32_t& num_weights);

        virtual void set_subkernel_weights(
            float64_t* weights, int32_t num_weights);

    protected:
        inline void set_property(EKernelProperty p)
        {
            properties |= p;
        }

        inline void unset_property(EKernelProperty p)
        {
            // clear the bits of p, i.e. properties &= ~p
            properties &= (properties | p) ^ p;
        }

        inline void set_is_initialized(bool p_init) { optimization_initialized=p_init; }

        virtual float64_t compute(int32_t x, int32_t y)=0;


        int32_t compute_row_start(int64_t offs, int32_t n, bool symmetric)
        {
            int32_t i_start;

            if (symmetric)
                i_start=(int32_t) CMath::floor(n-CMath::sqrt(CMath::sq((float64_t) n)-offs));
            else
                i_start=(int32_t) (offs/int64_t(n));

            return i_start;
        }
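
        /* Note (added for clarity, not in the original header): in the
         * symmetric case each row i accounts for 2*(n-i)-1 of the m*n matrix
         * entries (its upper-triangle entries plus their mirrors, counting
         * the diagonal once), so rows 0..i-1 cover offs = 2*i*n - i^2
         * entries. Solving i^2 - 2*n*i + offs = 0 for i gives
         * i = n - sqrt(n^2 - offs), which is the formula used above. In the
         * non-symmetric case each row holds exactly n entries, hence offs/n.
         */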


        template <class T>
        static void* get_kernel_matrix_helper(void* p)
        {
            K_THREAD_PARAM<T>* params= (K_THREAD_PARAM<T>*) p;
            int32_t i_start=params->start;
            int32_t i_end=params->end;
            CKernel* k=params->kernel;
            T* result=params->result;
            bool symmetric=params->symmetric;
            int32_t n=params->n;
            int32_t m=params->m;
            bool verbose=params->verbose;
            int64_t total_start=params->total_start;
            int64_t total_end=params->total_end;
            int64_t total=total_start;

            for (int32_t i=i_start; i<i_end; i++)
            {
                int32_t j_start=0;

                if (symmetric)
                    j_start=i;

                for (int32_t j=j_start; j<n; j++)
                {
                    float64_t v=k->kernel(i,j);
                    result[i+j*m]=v;

                    if (symmetric && i!=j)
                        result[j+i*m]=v;

                    if (verbose)
                    {
                        total++;

                        if (symmetric && i!=j)
                            total++;

                        if (total%100 == 0)
                            k->SG_PROGRESS(total, total_start, total_end);

                        if (CSignal::cancel_computations())
                            break;
                    }
                }

            }

            return NULL;
        }


#ifdef HAVE_BOOST_SERIALIZATION
    private:

        friend class ::boost::serialization::access;
        template<class Archive>
            void serialize(Archive & ar, const unsigned int archive_version)
            {
                SG_DEBUG("archiving CKernel\n");

                ar & ::boost::serialization::base_object<CSGObject>(*this);

                ar & cache_size;

                //TODO
                //KERNELCACHE_ELEM* kernel_matrix;

                //TODO
                //SHORTREAL * precomputed_matrix ;
                //ar & precompute_subkernel_matrix ;
                //ar & precompute_matrix ;

                ar & rhs;
                ar & lhs;

                ar & combined_kernel_weight;

                ar & optimization_initialized;

                ar & opt_type;

                ar & properties;

                SG_DEBUG("done with CKernel\n");
            }

#endif //HAVE_BOOST_SERIALIZATION


    protected:
        /** kernel cache size */
        int32_t cache_size;

        /** cached kernel matrix */
        KERNELCACHE_ELEM* kernel_matrix;

        /** features of the left-hand side */
        CFeatures* lhs;
        /** features of the right-hand side */
        CFeatures* rhs;

        /** number of feature vectors on the left-hand side */
        int32_t num_lhs;
        /** number of feature vectors on the right-hand side */
        int32_t num_rhs;

        /** this kernel's weight when used inside a combined kernel */
        float64_t combined_kernel_weight;

        /** whether the linadd optimization has been initialized */
        bool optimization_initialized;
        /** optimization type (FASTBUTMEMHUNGRY or SLOWBUTMEMEFFICIENT) */
        EOptimizationType opt_type;

        /** bitfield of EKernelProperty flags */
        uint64_t properties;

        /** normalizer applied to the computed kernel values */
        CKernelNormalizer* normalizer;
};

}
#endif /* _KERNEL_H___ */
