SHOGUN v0.9.0
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * Written (W) 1999-2009 Soeren Sonnenburg
 * Written (W) 1999-2008 Gunnar Raetsch
 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
 */

#ifndef _KERNEL_H___
#define _KERNEL_H___

#include "lib/common.h"
#include "lib/Signal.h"
#include "lib/File.h"
#include "lib/Mathematics.h"
#include "base/SGObject.h"
#include "features/Features.h"
#include "kernel/KernelNormalizer.h"

#include <vector>
#include <set>
#include <string>

namespace shogun
{
class CFile;
class CFeatures;
class CKernelNormalizer;
enum EFeatureType;
enum EFeatureClass;

#ifdef USE_SHORTREAL_KERNELCACHE
typedef float32_t KERNELCACHE_ELEM;
#else
typedef float64_t KERNELCACHE_ELEM;
#endif

typedef int64_t KERNELCACHE_IDX;


enum EOptimizationType
{
    FASTBUTMEMHUNGRY,
    SLOWBUTMEMEFFICIENT
};

enum EKernelType
{
    K_UNKNOWN = 0,
    K_LINEAR = 10,
    K_POLY = 20,
    K_GAUSSIAN = 30,
    K_GAUSSIANSHIFT = 32,
    K_GAUSSIANMATCH = 33,
    K_HISTOGRAM = 40,
    K_SALZBERG = 41,
    K_LOCALITYIMPROVED = 50,
    K_SIMPLELOCALITYIMPROVED = 60,
    K_FIXEDDEGREE = 70,
    K_WEIGHTEDDEGREE = 80,
    K_WEIGHTEDDEGREEPOS = 81,
    K_WEIGHTEDDEGREERBF = 82,
    K_WEIGHTEDCOMMWORDSTRING = 90,
    K_POLYMATCH = 100,
    K_ALIGNMENT = 110,
    K_COMMWORDSTRING = 120,
    K_COMMULONGSTRING = 121,
    K_SPECTRUMMISMATCHRBF = 122,
    K_COMBINED = 140,
    K_AUC = 150,
    K_CUSTOM = 160,
    K_SIGMOID = 170,
    K_CHI2 = 180,
    K_DIAG = 190,
    K_CONST = 200,
    K_DISTANCE = 220,
    K_LOCALALIGNMENT = 230,
    K_PYRAMIDCHI2 = 240,
    K_OLIGO = 250,
    K_MATCHWORD = 260,
    K_TPPK = 270,
    K_REGULATORYMODULES = 280,
    K_SPARSESPATIALSAMPLE = 290,
    K_HISTOGRAMINTERSECTION = 300
};

enum EKernelProperty
{
    KP_NONE = 0,
    KP_LINADD = 1,          // kernels that can be optimized via normal updates w + dw
    KP_KERNCOMBINATION = 2, // kernels that are in fact a linear combination of subkernels K=\sum_i b_i*K_i
    KP_BATCHEVALUATION = 4  // kernels that can generate normals on the fly in linadd and process batches more quickly/memory-efficiently than single examples
};
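/** Parameters handed to each get_kernel_matrix_helper() worker thread (see
 * below): the thread fills rows [start, end) of the m x n matrix `result`
 * (stored column-major), while [total_start, total_end) is the slice of
 * matrix entries it accounts for when reporting progress. `symmetric`
 * enables the k(i,j)=k(j,i) shortcut, `verbose` enables progress output.
 */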
template <class T> struct K_THREAD_PARAM
{
    CKernel* kernel;
    int32_t start;
    int32_t end;
    int32_t total_start;
    int32_t total_end;
    int32_t m;
    int32_t n;
    T* result;
    bool symmetric;
    bool verbose;
};

class CSVM;

class CKernel : public CSGObject
{
    friend class CVarianceKernelNormalizer;
    friend class CSqrtDiagKernelNormalizer;
    friend class CAvgDiagKernelNormalizer;
    friend class CRidgeKernelNormalizer;
    friend class CFirstElementKernelNormalizer;
    friend class CMultitaskKernelNormalizer;
    friend class CMultitaskKernelMklNormalizer;
    friend class CMultitaskKernelMaskNormalizer;
    friend class CMultitaskKernelMaskPairNormalizer;
    friend class CTanimotoKernelNormalizer;
    friend class CDiceKernelNormalizer;
    friend class CZeroMeanCenterKernelNormalizer;

    public:

        CKernel();

        CKernel(int32_t size);

        CKernel(CFeatures* l, CFeatures* r, int32_t size);

        virtual ~CKernel();

        inline float64_t kernel(int32_t idx_a, int32_t idx_b)
        {
            if (idx_a<0 || idx_b<0 || idx_a>=num_lhs || idx_b>=num_rhs)
            {
                SG_ERROR("Index out of Range: idx_a=%d/%d idx_b=%d/%d\n",
                        idx_a,num_lhs, idx_b,num_rhs);
            }

            return normalizer->normalize(compute(idx_a, idx_b), idx_a, idx_b);
        }

        void get_kernel_matrix(float64_t** dst, int32_t* m, int32_t* n);

        virtual std::vector<float64_t> get_kernel_col(int32_t j)
        {
            // note: the loop bound assumes num_lhs == num_rhs (i.e. lhs == rhs)
            std::vector<float64_t> col = std::vector<float64_t>(num_rhs);

            for (int32_t i=0; i!=num_rhs; i++)
            {
                col[i] = kernel(i,j);
            }

            return col;
        }

        virtual std::vector<float64_t> get_kernel_row(int32_t i)
        {
            // note: the loop bound assumes num_lhs == num_rhs (i.e. lhs == rhs)
            std::vector<float64_t> row = std::vector<float64_t>(num_lhs);

            for (int32_t j=0; j!=num_lhs; j++)
            {
                row[j] = kernel(i,j);
            }

            return row;
        }
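        /** get the full kernel matrix
         *
         * The result is an m x n buffer in column-major order: entry (i,j),
         * i.e. k(lhs[i], rhs[j]), is stored at result[i+j*m]. If target is
         * NULL, a new T[m*n] array is allocated and ownership passes to the
         * caller; otherwise target is filled and m/n must already match the
         * number of lhs/rhs vectors. Work is split across
         * parallel->get_num_threads() pthreads, and when lhs==rhs only the
         * upper triangle is computed and mirrored.
         *
         * @param m returns the number of rows (lhs vectors)
         * @param n returns the number of columns (rhs vectors)
         * @param target preallocated buffer to fill, or NULL to allocate one
         * @return the kernel matrix
         */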
        template <class T>
        T* get_kernel_matrix(int32_t &m, int32_t &n, T* target)
        {
            T* result = NULL;

            if (!has_features())
                SG_ERROR("no features assigned to kernel\n");

            if (target && (m!=get_num_vec_lhs() ||
                        n!=get_num_vec_rhs()))
            {
                SG_ERROR("kernel matrix size mismatch\n");
            }

            m=get_num_vec_lhs();
            n=get_num_vec_rhs();

            int64_t total_num = int64_t(m)*n;

            // if lhs == rhs and sizes match assume k(i,j)=k(j,i)
            bool symmetric= (lhs && lhs==rhs && m==n);

            SG_DEBUG("returning kernel matrix of size %dx%d\n", m, n);

            if (target)
                result=target;
            else
                result=new T[total_num];

            int32_t num_threads=parallel->get_num_threads();
            if (num_threads < 2)
            {
                K_THREAD_PARAM<T> params;
                params.kernel=this;
                params.result=result;
                params.start=0;
                params.end=m;
                params.total_start=0;
                params.total_end=total_num;
                params.n=n;
                params.m=m;
                params.symmetric=symmetric;
                params.verbose=true;
                get_kernel_matrix_helper<T>((void*) &params);
            }
            else
            {
                pthread_t* threads = new pthread_t[num_threads-1];
                K_THREAD_PARAM<T>* params = new K_THREAD_PARAM<T>[num_threads];
                int64_t step= total_num/num_threads;

                int32_t t;

                for (t=0; t<num_threads-1; t++)
                {
                    params[t].kernel = this;
                    params[t].result = result;
                    params[t].start = compute_row_start(t*step, n, symmetric);
                    params[t].end = compute_row_start((t+1)*step, n, symmetric);
                    params[t].total_start=t*step;
                    params[t].total_end=(t+1)*step;
                    params[t].n=n;
                    params[t].m=m;
                    params[t].symmetric=symmetric;
                    params[t].verbose=false;
                    pthread_create(&threads[t], NULL,
                            CKernel::get_kernel_matrix_helper<T>, (void*)&params[t]);
                }

                // the calling thread handles the last chunk itself
                params[t].kernel = this;
                params[t].result = result;
                params[t].start = compute_row_start(t*step, n, symmetric);
                params[t].end = m;
                params[t].total_start=t*step;
                params[t].total_end=total_num;
                params[t].n=n;
                params[t].m=m;
                params[t].symmetric=symmetric;
                params[t].verbose=true;
                get_kernel_matrix_helper<T>(&params[t]);

                for (t=0; t<num_threads-1; t++)
                    pthread_join(threads[t], NULL);

                delete[] params;
                delete[] threads;
            }

            SG_DONE();

            return result;
        }

        virtual bool init(CFeatures* lhs, CFeatures* rhs);

        virtual bool set_normalizer(CKernelNormalizer* normalizer);

        virtual CKernelNormalizer* get_normalizer();

        virtual bool init_normalizer();

        virtual void cleanup();

        void load(CFile* loader);

        void save(CFile* writer);

        inline CFeatures* get_lhs() { SG_REF(lhs); return lhs; }

        inline CFeatures* get_rhs() { SG_REF(rhs); return rhs; }

        virtual inline int32_t get_num_vec_lhs()
        {
            return num_lhs;
        }

        virtual inline int32_t get_num_vec_rhs()
        {
            return num_rhs;
        }

        virtual inline bool has_features()
        {
            return lhs && rhs;
        }

        inline bool get_lhs_equals_rhs()
        {
            return lhs_equals_rhs;
        }

        virtual void remove_lhs_and_rhs();

        virtual void remove_lhs();

        virtual void remove_rhs();

        virtual EKernelType get_kernel_type()=0;

        virtual EFeatureType get_feature_type()=0;

        virtual EFeatureClass get_feature_class()=0;

        inline void set_cache_size(int32_t size)
        {
            cache_size = size;
        }

        inline int32_t get_cache_size() { return cache_size; }

        void list_kernel();

        inline bool has_property(EKernelProperty p) { return (properties & p) != 0; }

        virtual void clear_normal();

        virtual void add_to_normal(int32_t vector_idx, float64_t weight);

        inline EOptimizationType get_optimization_type() { return opt_type; }

        virtual inline void set_optimization_type(EOptimizationType t) { opt_type=t; }

        inline bool get_is_initialized() { return optimization_initialized; }

        virtual bool init_optimization(
            int32_t count, int32_t* IDX, float64_t* weights);

        virtual bool delete_optimization();

        bool init_optimization_svm(CSVM* svm);

        virtual float64_t compute_optimized(int32_t vector_idx);

        virtual void compute_batch(
            int32_t num_vec, int32_t* vec_idx, float64_t* target,
            int32_t num_suppvec, int32_t* IDX, float64_t* alphas,
            float64_t factor=1.0);

        inline float64_t get_combined_kernel_weight() { return combined_kernel_weight; }

        inline void set_combined_kernel_weight(float64_t nw) { combined_kernel_weight=nw; }

        virtual int32_t get_num_subkernels();

        virtual void compute_by_subkernel(
            int32_t vector_idx, float64_t* subkernel_contrib);

        virtual const float64_t* get_subkernel_weights(int32_t& num_weights);

        virtual void set_subkernel_weights(
            float64_t* weights, int32_t num_weights);

    protected:
        inline void set_property(EKernelProperty p)
        {
            properties |= p;
        }

        inline void unset_property(EKernelProperty p)
        {
            // equivalent to properties &= ~p: clears exactly the bits set in p
            properties &= (properties | p) ^ p;
        }

        inline void set_is_initialized(bool p_init) { optimization_initialized=p_init; }

        virtual float64_t compute(int32_t x, int32_t y)=0;
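        /** map a linear offset into the kernel matrix to the row at which a
         * worker thread should start.
         *
         * offs counts entries on the full 0..m*n scale. In the non-symmetric
         * case each row holds n entries, so the starting row is simply
         * offs/n. In the symmetric case only the upper triangle (j>=i) is
         * computed; rows 0..i-1 of the triangle hold roughly i*n - i^2/2
         * entries, so equating the fraction of triangle work done before row
         * i with offs/n^2 gives 2*i*n - i^2 = offs, whose solution
         * i = n - sqrt(n^2 - offs) is the formula used below.
         *
         * @param offs offset into the matrix (entry count)
         * @param n number of columns
         * @param symmetric whether only the upper triangle is computed
         * @return row index to start at
         */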
        int32_t compute_row_start(int64_t offs, int32_t n, bool symmetric)
        {
            int32_t i_start;

            if (symmetric)
                i_start=(int32_t) CMath::floor(n-CMath::sqrt(CMath::sq((float64_t) n)-offs));
            else
                i_start=(int32_t) (offs/int64_t(n));

            return i_start;
        }

        template <class T>
        static void* get_kernel_matrix_helper(void* p)
        {
            K_THREAD_PARAM<T>* params= (K_THREAD_PARAM<T>*) p;
            int32_t i_start=params->start;
            int32_t i_end=params->end;
            CKernel* k=params->kernel;
            T* result=params->result;
            bool symmetric=params->symmetric;
            int32_t n=params->n;
            int32_t m=params->m;
            bool verbose=params->verbose;
            int64_t total_start=params->total_start;
            int64_t total_end=params->total_end;
            int64_t total=total_start;

            for (int32_t i=i_start; i<i_end; i++)
            {
                int32_t j_start=0;

                if (symmetric)
                    j_start=i;

                for (int32_t j=j_start; j<n; j++)
                {
                    float64_t v=k->kernel(i,j);
                    result[i+j*m]=v;

                    if (symmetric && i!=j)
                        result[j+i*m]=v;

                    if (verbose)
                    {
                        total++;

                        if (symmetric && i!=j)
                            total++;

                        if (total%100 == 0)
                            k->SG_PROGRESS(total, total_start, total_end);

                        if (CSignal::cancel_computations())
                            break;
                    }
                }
            }

            return NULL;
        }

        virtual void load_serializable_post() throw (ShogunException);

        virtual void save_serializable_pre() throw (ShogunException);

        virtual void save_serializable_post() throw (ShogunException);

    private:
        void init();

    protected:
        /// kernel cache size
        int32_t cache_size;

        KERNELCACHE_ELEM* kernel_matrix;

        /// feature vectors appearing on the left hand side
        CFeatures* lhs;
        /// feature vectors appearing on the right hand side
        CFeatures* rhs;

        /// true if lhs and rhs are the same object
        bool lhs_equals_rhs;

        /// number of feature vectors on the left hand side
        int32_t num_lhs;
        /// number of feature vectors on the right hand side
        int32_t num_rhs;

        /// weight of this kernel when used inside a combined kernel
        float64_t combined_kernel_weight;

        /// whether the (linadd) optimization is initialized
        bool optimization_initialized;
        /// optimization type (FASTBUTMEMHUNGRY or SLOWBUTMEMEFFICIENT)
        EOptimizationType opt_type;

        /// kernel properties (bitmask of EKernelProperty flags)
        uint64_t properties;

        /// normalizer applied to every kernel value
        CKernelNormalizer* normalizer;
};

}
#endif /* _KERNEL_H___ */
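Usage sketch (not part of Kernel.h): the snippet below shows how the CKernel interface above is typically driven, assuming the dense feature class CSimpleFeatures<float64_t> (features/SimpleFeatures.h) and the Gaussian kernel CGaussianKernel (kernel/GaussianKernel.h) shipped with this release; the exact constructor signatures should be checked against those headers, and program setup (e.g. init_shogun()) is omitted.

#include <cstdio>
#include "features/SimpleFeatures.h"
#include "kernel/GaussianKernel.h"

using namespace shogun;

void kernel_example()
{
    // 2-dimensional features, 3 vectors, stored column by column
    float64_t* data = new float64_t[2*3];
    for (int32_t i=0; i<6; i++)
        data[i] = i;

    // assumed API: the feature object takes the matrix via set_feature_matrix()
    // (see SimpleFeatures.h for the ownership semantics of `data`)
    CSimpleFeatures<float64_t>* feats = new CSimpleFeatures<float64_t>(0);
    feats->set_feature_matrix(data, 2, 3);

    // assumed constructor arguments: (cache size, kernel width)
    CKernel* k = new CGaussianKernel(10, 2.0);
    k->init(feats, feats); // lhs == rhs, so the matrix is symmetric

    // single kernel value k(x_0, x_1), normalized by the kernel's CKernelNormalizer
    printf("k(0,1)=%f\n", k->kernel(0, 1));

    // full kernel matrix; caller owns the returned buffer,
    // entry (i,j) lives at K[i + j*m] (column-major)
    int32_t m=0, n=0;
    float64_t* K = k->get_kernel_matrix<float64_t>(m, n, NULL);

    delete[] K;
    SG_UNREF(k); // release the kernel via CSGObject reference counting
}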