SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2008 Gunnar Raetsch 00008 * Written (W) 1999-2009 Soeren Sonnenburg 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "preproc/PruneVarSubMean.h" 00013 #include "preproc/SimplePreProc.h" 00014 #include "features/Features.h" 00015 #include "features/SimpleFeatures.h" 00016 #include "lib/io.h" 00017 #include "lib/Mathematics.h" 00018 00019 using namespace shogun; 00020 00021 CPruneVarSubMean::CPruneVarSubMean(bool divide) 00022 : CSimplePreProc<float64_t>("PruneVarSubMean","PVSM"), idx(NULL), mean(NULL), 00023 std(NULL), num_idx(0), divide_by_std(divide), initialized(false) 00024 { 00025 } 00026 00027 CPruneVarSubMean::~CPruneVarSubMean() 00028 { 00029 cleanup(); 00030 } 00031 00033 bool CPruneVarSubMean::init(CFeatures* p_f) 00034 { 00035 if (!initialized) 00036 { 00037 ASSERT(p_f->get_feature_class()==C_SIMPLE); 00038 ASSERT(p_f->get_feature_type()==F_DREAL); 00039 00040 CSimpleFeatures<float64_t> *f=(CSimpleFeatures<float64_t>*) p_f; 00041 int32_t num_examples=f->get_num_vectors(); 00042 int32_t num_features=((CSimpleFeatures<float64_t>*)f)->get_num_features(); 00043 00044 delete[] mean; 00045 delete[] idx; 00046 delete[] std; 00047 mean=NULL; 00048 idx=NULL; 00049 std=NULL; 00050 00051 mean=new float64_t[num_features]; 00052 float64_t* var=new float64_t[num_features]; 00053 int32_t i,j; 00054 00055 for (i=0; i<num_features; i++) 00056 { 00057 mean[i]=0; 00058 var[i]=0 ; 00059 } 00060 00061 // compute mean 00062 for (i=0; i<num_examples; i++) 00063 { 00064 int32_t len ; bool free ; 00065 float64_t* feature=f->get_feature_vector(i, len, free) ; 00066 00067 for (j=0; j<len; j++) 00068 mean[j]+=feature[j]; 00069 00070 f->free_feature_vector(feature, i, free) ; 00071 } 00072 00073 for (j=0; j<num_features; j++) 00074 mean[j]/=num_examples ; 00075 00076 // compute var 00077 for (i=0; i<num_examples; i++) 00078 { 00079 int32_t len ; bool free ; 00080 float64_t* feature=f->get_feature_vector(i, len, free) ; 00081 00082 for (j=0; j<num_features; j++) 00083 var[j]+=(mean[j]-feature[j])*(mean[j]-feature[j]) ; 00084 00085 f->free_feature_vector(feature, i, free) ; 00086 } 00087 00088 int32_t num_ok=0; 00089 int32_t* idx_ok=new int[num_features]; 00090 00091 for (j=0; j<num_features; j++) 00092 { 00093 var[j]/=num_examples ; 00094 00095 if (var[j]>=1e-14) 00096 { 00097 idx_ok[num_ok]=j ; 00098 num_ok++ ; 00099 } 00100 } 00101 00102 SG_INFO( "Reducing number of features from %i to %i\n", num_features, num_ok) ; 00103 00104 delete[] idx ; 00105 idx=new int[num_ok]; 00106 float64_t* new_mean=new float64_t[num_ok]; 00107 std=new float64_t[num_ok]; 00108 00109 for (j=0; j<num_ok; j++) 00110 { 00111 idx[j]=idx_ok[j] ; 00112 new_mean[j]=mean[idx_ok[j]]; 00113 std[j]=sqrt(var[idx_ok[j]]); 00114 } 00115 num_idx=num_ok ; 00116 delete[] idx_ok ; 00117 delete[] mean; 00118 delete[] var; 00119 mean=new_mean; 00120 00121 initialized=true; 00122 return true ; 00123 } 00124 else 00125 return false; 00126 } 00127 00129 void CPruneVarSubMean::cleanup() 00130 { 00131 delete[] idx; 00132 idx=NULL; 00133 delete[] mean; 00134 mean=NULL; 00135 delete[] std; 00136 std=NULL; 00137 } 00138 00142 float64_t* CPruneVarSubMean::apply_to_feature_matrix(CFeatures* f) 00143 { 00144 ASSERT(initialized); 00145 00146 int32_t num_vectors=0; 00147 int32_t num_features=0; 00148 float64_t* m=((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors); 00149 00150 SG_INFO( "get Feature matrix: %ix%i\n", num_vectors, num_features); 00151 SG_INFO( "Preprocessing feature matrix\n"); 00152 for (int32_t vec=0; vec<num_vectors; vec++) 00153 { 00154 float64_t* v_src=&m[num_features*vec]; 00155 float64_t* v_dst=&m[num_idx*vec]; 00156 00157 if (divide_by_std) 00158 { 00159 for (int32_t feat=0; feat<num_idx; feat++) 00160 v_dst[feat]=(v_src[idx[feat]]-mean[feat])/std[feat]; 00161 } 00162 else 00163 { 00164 for (int32_t feat=0; feat<num_idx; feat++) 00165 v_dst[feat]=(v_src[idx[feat]]-mean[feat]); 00166 } 00167 } 00168 00169 ((CSimpleFeatures<float64_t>*) f)->set_num_features(num_idx); 00170 ((CSimpleFeatures<float64_t>*) f)->get_feature_matrix(num_features, num_vectors); 00171 SG_INFO( "new Feature matrix: %ix%i\n", num_vectors, num_features); 00172 00173 return m; 00174 } 00175 00178 float64_t* CPruneVarSubMean::apply_to_feature_vector(float64_t* f, int32_t &len) 00179 { 00180 float64_t* ret=NULL; 00181 00182 if (initialized) 00183 { 00184 ret=new float64_t[num_idx] ; 00185 00186 if (divide_by_std) 00187 { 00188 for (int32_t i=0; i<num_idx; i++) 00189 ret[i]=(f[idx[i]]-mean[i])/std[i]; 00190 } 00191 else 00192 { 00193 for (int32_t i=0; i<num_idx; i++) 00194 ret[i]=(f[idx[i]]-mean[i]); 00195 } 00196 len=num_idx ; 00197 } 00198 else 00199 { 00200 ret=new float64_t[len] ; 00201 for (int32_t i=0; i<len; i++) 00202 ret[i]=f[i]; 00203 } 00204 00205 return ret; 00206 }