SHOGUN v0.9.0
Kernel.h
Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _KERNEL_H___
00013 #define _KERNEL_H___
00014 
00015 #include "lib/common.h"
00016 #include "lib/Signal.h"
00017 #include "lib/File.h"
00018 #include "lib/Mathematics.h"
00019 #include "base/SGObject.h"
00020 #include "features/Features.h"
00021 #include "kernel/KernelNormalizer.h"
00022 
00023 #include <vector>
00024 #include <set>
00025 #include <string>
00026 
00027 namespace shogun
00028 {
00029     class CFile;
00030     class CFeatures;
00031     class CKernelNormalizer;
00032     enum EFeatureType;
00033     enum EFeatureClass;
00034 
00035 #ifdef USE_SHORTREAL_KERNELCACHE
00036     typedef float32_t KERNELCACHE_ELEM;
00037 #else
00038     typedef float64_t KERNELCACHE_ELEM;
00039 #endif
00040 
00041 typedef int64_t KERNELCACHE_IDX;
00042 
00043 
00044 enum EOptimizationType
00045 {
00046     FASTBUTMEMHUNGRY,
00047     SLOWBUTMEMEFFICIENT
00048 };
00049 
00050 enum EKernelType
00051 {
00052     K_UNKNOWN = 0,
00053     K_LINEAR = 10,
00054     K_POLY = 20,
00055     K_GAUSSIAN = 30,
00056     K_GAUSSIANSHIFT = 32,
00057     K_GAUSSIANMATCH = 33,
00058     K_HISTOGRAM = 40,
00059     K_SALZBERG = 41,
00060     K_LOCALITYIMPROVED = 50,
00061     K_SIMPLELOCALITYIMPROVED = 60,
00062     K_FIXEDDEGREE = 70,
00063     K_WEIGHTEDDEGREE =    80,
00064     K_WEIGHTEDDEGREEPOS = 81,
00065     K_WEIGHTEDDEGREERBF = 82,
00066     K_WEIGHTEDCOMMWORDSTRING = 90,
00067     K_POLYMATCH = 100,
00068     K_ALIGNMENT = 110,
00069     K_COMMWORDSTRING = 120,
00070     K_COMMULONGSTRING = 121,
00071     K_SPECTRUMMISMATCHRBF = 122,
00072     K_COMBINED = 140,
00073     K_AUC = 150,
00074     K_CUSTOM = 160,
00075     K_SIGMOID = 170,
00076     K_CHI2 = 180,
00077     K_DIAG = 190,
00078     K_CONST = 200,
00079     K_DISTANCE = 220,
00080     K_LOCALALIGNMENT = 230,
00081     K_PYRAMIDCHI2 = 240,
00082     K_OLIGO = 250,
00083     K_MATCHWORD = 260,
00084     K_TPPK = 270,
00085     K_REGULATORYMODULES = 280,
00086     K_SPARSESPATIALSAMPLE = 290,
00087     K_HISTOGRAMINTERSECTION = 300
00088 };
00089 
00090 enum EKernelProperty
00091 {
00092     KP_NONE = 0,
00093     KP_LINADD = 1,  // Kernels that can be optimized via doing normal updates w + dw
00094     KP_KERNCOMBINATION = 2, // Kernels that are infact a linear combination of subkernels K=\sum_i b_i*K_i
00095     KP_BATCHEVALUATION = 4  // Kernels that can on the fly generate normals in linadd and more quickly/memory efficient process batches instead of single examples
00096 };
00097 
00099 template <class T> struct K_THREAD_PARAM
00100 {
00102     CKernel* kernel;
00104     int32_t start;
00106     int32_t end;
00108     int32_t total_start;
00110     int32_t total_end;
00112     int32_t m;
00114     int32_t n;
00116     T* result;
00118     bool symmetric;
00120     bool verbose;
00121 };
00122 
00123 class CSVM;
00124 
00150 class CKernel : public CSGObject
00151 {
00152     friend class CVarianceKernelNormalizer;
00153     friend class CSqrtDiagKernelNormalizer;
00154     friend class CAvgDiagKernelNormalizer;
00155     friend class CRidgeKernelNormalizer;
00156     friend class CFirstElementKernelNormalizer;
00157     friend class CMultitaskKernelNormalizer;
00158     friend class CMultitaskKernelMklNormalizer;
00159     friend class CMultitaskKernelMaskNormalizer;
00160     friend class CMultitaskKernelMaskPairNormalizer;
00161     friend class CTanimotoKernelNormalizer;
00162     friend class CDiceKernelNormalizer;
00163     friend class CZeroMeanCenterKernelNormalizer;
00164 
00165     public:
00166 
00170         CKernel();
00171 
00172 
00177         CKernel(int32_t size);
00178 
00185         CKernel(CFeatures* l, CFeatures* r, int32_t size);
00186 
00187         virtual ~CKernel();
00188 
00196         inline float64_t kernel(int32_t idx_a, int32_t idx_b)
00197         {
00198             if (idx_a<0 || idx_b<0 || idx_a>=num_lhs || idx_b>=num_rhs)
00199             {
00200                 SG_ERROR("Index out of Range: idx_a=%d/%d idx_b=%d/%d\n",
00201                         idx_a,num_lhs, idx_b,num_rhs);
00202             }
00203 
00204             return normalizer->normalize(compute(idx_a, idx_b), idx_a, idx_b);
00205         }
00206 
00213         void get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n);
00214 
00215 
00221         virtual std::vector<float64_t> get_kernel_col(int32_t j)
00222         {
00223 
00224             std::vector<float64_t> col = std::vector<float64_t>(num_rhs);
00225 
00226             for (int32_t i=0; i!=num_rhs; i++)
00227             {
00228                 col[i] = kernel(i,j);
00229             }
00230 
00231             return col;
00232 
00233         }
00234 
00235 
00241         virtual std::vector<float64_t> get_kernel_row(int32_t i)
00242         {
00243 
00244             std::vector<float64_t> row = std::vector<float64_t>(num_lhs);
00245 
00246             for (int32_t j=0; j!=num_lhs; j++)
00247             {
00248                 row[j] = kernel(i,j);
00249             }
00250 
00251             return row;
00252 
00253         }
00254 
00255 
00263         template <class T>
00264         T* get_kernel_matrix(int32_t &m, int32_t &n, T* target)
00265         {
00266             T* result = NULL;
00267 
00268             if (!has_features())
00269                 SG_ERROR( "no features assigned to kernel\n");
00270 
00271             if (target && (m!=get_num_vec_lhs() ||
00272                         n!=get_num_vec_rhs()) )
00273             {
00274                 SG_ERROR( "kernel matrix size mismatch\n");
00275             }
00276 
00277             m=get_num_vec_lhs();
00278             n=get_num_vec_rhs();
00279 
00280             int64_t total_num = int64_t(m)*n;
00281 
00282             // if lhs == rhs and sizes match assume k(i,j)=k(j,i)
00283             bool symmetric= (lhs && lhs==rhs && m==n);
00284 
00285             SG_DEBUG( "returning kernel matrix of size %dx%d\n", m, n);
00286 
00287             if (target)
00288                 result=target;
00289             else
00290                 result=new T[total_num];
00291 
00292             int32_t num_threads=parallel->get_num_threads();
00293             if (num_threads < 2)
00294             {
00295                 K_THREAD_PARAM<T> params;
00296                 params.kernel=this;
00297                 params.result=result;
00298                 params.start=0;
00299                 params.end=m;
00300                 params.total_start=0;
00301                 params.total_end=total_num;
00302                 params.n=n;
00303                 params.m=m;
00304                 params.symmetric=symmetric;
00305                 params.verbose=true;
00306                 get_kernel_matrix_helper<T>((void*) &params);
00307             }
00308             else
00309             {
00310                 pthread_t* threads = new pthread_t[num_threads-1];
00311                 K_THREAD_PARAM<T>* params = new K_THREAD_PARAM<T>[num_threads];
00312                 int64_t step= total_num/num_threads;
00313 
00314                 int32_t t;
00315 
00316                 for (t=0; t<num_threads-1; t++)
00317                 {
00318                     params[t].kernel = this;
00319                     params[t].result = result;
00320                     params[t].start = compute_row_start(t*step, n, symmetric);
00321                     params[t].end = compute_row_start((t+1)*step, n, symmetric);
00322                     params[t].total_start=t*step;
00323                     params[t].total_end=(t+1)*step;
00324                     params[t].n=n;
00325                     params[t].m=m;
00326                     params[t].symmetric=symmetric;
00327                     params[t].verbose=false;
00328                     pthread_create(&threads[t], NULL,
00329                             CKernel::get_kernel_matrix_helper<T>, (void*)&params[t]);
00330                 }
00331 
00332                 params[t].kernel = this;
00333                 params[t].result = result;
00334                 params[t].start = compute_row_start(t*step, n, symmetric);
00335                 params[t].end = m;
00336                 params[t].total_start=t*step;
00337                 params[t].total_end=total_num;
00338                 params[t].n=n;
00339                 params[t].m=m;
00340                 params[t].symmetric=symmetric;
00341                 params[t].verbose=true;
00342                 get_kernel_matrix_helper<T>(&params[t]);
00343 
00344                 for (t=0; t<num_threads-1; t++)
00345                     pthread_join(threads[t], NULL);
00346 
00347                 delete[] params;
00348                 delete[] threads;
00349             }
00350 
00351             SG_DONE();
00352 
00353             return result;
00354         }
00355 
00356 
00367         virtual bool init(CFeatures* lhs, CFeatures* rhs);
00368 
00373         virtual bool set_normalizer(CKernelNormalizer* normalizer);
00374 
00379         virtual CKernelNormalizer* get_normalizer();
00380 
00384         virtual bool init_normalizer();
00385 
00392         virtual void cleanup();
00393 
00398         void load(CFile* loader);
00399 
00404         void save(CFile* writer);
00405 
00410         inline CFeatures* get_lhs() { SG_REF(lhs); return lhs; }
00411 
00416         inline CFeatures* get_rhs() { SG_REF(rhs); return rhs; }
00417 
00422         virtual inline int32_t get_num_vec_lhs()
00423         {
00424             return num_lhs;
00425         }
00426 
00431         virtual inline int32_t get_num_vec_rhs()
00432         {
00433             return num_rhs;
00434         }
00435 
00440         virtual inline bool has_features()
00441         {
00442             return lhs && rhs;
00443         }
00444 
00449         inline bool get_lhs_equals_rhs()
00450         {
00451             return lhs_equals_rhs;
00452         }
00453 
00455         virtual void remove_lhs_and_rhs();
00456 
00458         virtual void remove_lhs();
00459 
00461         virtual void remove_rhs();
00462 
00470         virtual EKernelType get_kernel_type()=0 ;
00471 
00478         virtual EFeatureType get_feature_type()=0;
00479 
00486         virtual EFeatureClass get_feature_class()=0;
00487 
00492         inline void set_cache_size(int32_t size)
00493         {
00494             cache_size = size;
00495 
00496         }
00497 
00502         inline int32_t get_cache_size() { return cache_size; }
00503 
00504 
00505 
00507         void list_kernel();
00508 
00514         inline bool has_property(EKernelProperty p) { return (properties & p) != 0; }
00515 
00519         virtual void clear_normal();
00520 
00526         virtual void add_to_normal(int32_t vector_idx, float64_t weight);
00527 
00532         inline EOptimizationType get_optimization_type() { return opt_type; }
00533 
00538         virtual inline void set_optimization_type(EOptimizationType t) { opt_type=t;}
00539 
00544         inline bool get_is_initialized() { return optimization_initialized; }
00545 
00553         virtual bool init_optimization(
00554             int32_t count, int32_t *IDX, float64_t *weights);
00555 
00560         virtual bool delete_optimization();
00561 
00567         bool init_optimization_svm(CSVM * svm) ;
00568 
00574         virtual float64_t compute_optimized(int32_t vector_idx);
00575 
00584         virtual void compute_batch(
00585             int32_t num_vec, int32_t* vec_idx, float64_t* target,
00586             int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
00587             float64_t factor=1.0);
00588 
00593         inline float64_t get_combined_kernel_weight() { return combined_kernel_weight; }
00594 
00599         inline void set_combined_kernel_weight(float64_t nw) { combined_kernel_weight=nw; }
00600 
00605         virtual int32_t get_num_subkernels();
00606 
00612         virtual void compute_by_subkernel(
00613             int32_t vector_idx, float64_t * subkernel_contrib);
00614 
00620         virtual const float64_t* get_subkernel_weights(int32_t& num_weights);
00621 
00627         virtual void set_subkernel_weights(
00628             float64_t* weights, int32_t num_weights);
00629 
00630     protected:
00635         inline void set_property(EKernelProperty p)
00636         {
00637             properties |= p;
00638         }
00639 
00644         inline void unset_property(EKernelProperty p)
00645         {
00646             properties &= (properties | p) ^ p;
00647         }
00648 
00653         inline void set_is_initialized(bool p_init) { optimization_initialized=p_init; }
00654 
00665         virtual float64_t compute(int32_t x, int32_t y)=0;
00666 
00673         int32_t compute_row_start(int64_t offs, int32_t n, bool symmetric)
00674         {
00675             int32_t i_start;
00676 
00677             if (symmetric)
00678                 i_start=(int32_t) CMath::floor(n-CMath::sqrt(CMath::sq((float64_t) n)-offs));
00679             else
00680                 i_start=(int32_t) (offs/int64_t(n));
00681 
00682             return i_start;
00683         }
00684 
00689         template <class T>
00690         static void* get_kernel_matrix_helper(void* p)
00691         {
00692             K_THREAD_PARAM<T>* params= (K_THREAD_PARAM<T>*) p;
00693             int32_t i_start=params->start;
00694             int32_t i_end=params->end;
00695             CKernel* k=params->kernel;
00696             T* result=params->result;
00697             bool symmetric=params->symmetric;
00698             int32_t n=params->n;
00699             int32_t m=params->m;
00700             bool verbose=params->verbose;
00701             int64_t total_start=params->total_start;
00702             int64_t total_end=params->total_end;
00703             int64_t total=total_start;
00704 
00705             for (int32_t i=i_start; i<i_end; i++)
00706             {
00707                 int32_t j_start=0;
00708 
00709                 if (symmetric)
00710                     j_start=i;
00711 
00712                 for (int32_t j=j_start; j<n; j++)
00713                 {
00714                     float64_t v=k->kernel(i,j);
00715                     result[i+j*m]=v;
00716 
00717                     if (symmetric && i!=j)
00718                         result[j+i*m]=v;
00719 
00720                     if (verbose)
00721                     {
00722                         total++;
00723 
00724                         if (symmetric && i!=j)
00725                             total++;
00726 
00727                         if (total%100 == 0)
00728                             k->SG_PROGRESS(total, total_start, total_end);
00729 
00730                         if (CSignal::cancel_computations())
00731                             break;
00732                     }
00733                 }
00734 
00735             }
00736 
00737             return NULL;
00738         }
00739 
00748         virtual void load_serializable_post() throw (ShogunException);
00749 
00758         virtual void save_serializable_pre() throw (ShogunException);
00759 
00768         virtual void save_serializable_post() throw (ShogunException);
00769 
00770     private:
00773         void init();
00774 
00775 
00776 
00778 
00779     protected:
00781         int32_t cache_size;
00782 
00783 
00784 
00787         KERNELCACHE_ELEM* kernel_matrix;
00788 
00790         CFeatures* lhs;
00792         CFeatures* rhs;
00793 
00795         bool lhs_equals_rhs;
00796 
00798         int32_t num_lhs;
00800         int32_t num_rhs;
00801 
00803         float64_t combined_kernel_weight;
00804 
00806         bool optimization_initialized;
00810         EOptimizationType opt_type;
00811 
00813         uint64_t  properties;
00814 
00817         CKernelNormalizer* normalizer;
00818 };
00819 
00820 }
00821 #endif /* _KERNEL_H__ */

SHOGUN Machine Learning Toolbox - Documentation