SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "kernel/WeightedCommWordStringKernel.h" 00013 #include "features/StringFeatures.h" 00014 #include "lib/io.h" 00015 00016 using namespace shogun; 00017 00018 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel() 00019 : CCommWordStringKernel(0, false) 00020 { 00021 init(); 00022 } 00023 00024 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel( 00025 int32_t size, bool us) 00026 : CCommWordStringKernel(size, us) 00027 { 00028 ASSERT(us==false); 00029 init(); 00030 } 00031 00032 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel( 00033 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us, 00034 int32_t size) 00035 : CCommWordStringKernel(size, us) 00036 { 00037 ASSERT(us==false); 00038 init(); 00039 00040 init(l,r); 00041 } 00042 00043 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel() 00044 { 00045 delete[] weights; 00046 } 00047 00048 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r) 00049 { 00050 ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() == 00051 ((CStringFeatures<uint16_t>*) r)->get_order()); 00052 degree=((CStringFeatures<uint16_t>*) l)->get_order(); 00053 set_wd_weights(); 00054 00055 CCommWordStringKernel::init(l,r); 00056 return init_normalizer(); 00057 } 00058 00059 void CWeightedCommWordStringKernel::cleanup() 00060 { 00061 delete[] weights; 00062 weights=NULL; 00063 00064 CCommWordStringKernel::cleanup(); 00065 } 00066 00067 bool CWeightedCommWordStringKernel::set_wd_weights() 00068 { 00069 delete[] weights; 00070 weights=new float64_t[degree]; 00071 00072 int32_t i; 00073 float64_t sum=0; 00074 for (i=0; i<degree; i++) 00075 { 00076 weights[i]=degree-i; 00077 sum+=weights[i]; 00078 } 00079 for (i=0; i<degree; i++) 00080 weights[i]=CMath::sqrt(weights[i]/sum); 00081 00082 return weights!=NULL; 00083 } 00084 00085 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d) 00086 { 00087 ASSERT(d==degree); 00088 00089 delete[] weights; 00090 weights=new float64_t[degree]; 00091 for (int32_t i=0; i<degree; i++) 00092 weights[i]=CMath::sqrt(w[i]); 00093 return true; 00094 } 00095 00096 float64_t CWeightedCommWordStringKernel::compute_helper( 00097 int32_t idx_a, int32_t idx_b, bool do_sort) 00098 { 00099 int32_t alen, blen; 00100 bool free_avec, free_bvec; 00101 00102 CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs; 00103 CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs; 00104 00105 uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec); 00106 uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec); 00107 00108 uint16_t* avec=av; 00109 uint16_t* bvec=bv; 00110 00111 if (do_sort) 00112 { 00113 if (alen>0) 00114 { 00115 avec=new uint16_t[alen]; 00116 memcpy(avec, av, sizeof(uint16_t)*alen); 00117 CMath::radix_sort(avec, alen); 00118 } 00119 else 00120 avec=NULL; 00121 00122 if (blen>0) 00123 { 00124 bvec=new uint16_t[blen]; 00125 memcpy(bvec, bv, sizeof(uint16_t)*blen); 00126 CMath::radix_sort(bvec, blen); 00127 } 00128 else 00129 bvec=NULL; 00130 } 00131 else 00132 { 00133 if ( (l->get_num_preproc() != l->get_num_preprocessed()) || 00134 (r->get_num_preproc() != r->get_num_preprocessed())) 00135 { 00136 SG_ERROR("not all preprocessors have been applied to training (%d/%d)" 00137 " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(), 00138 r->get_num_preprocessed(), r->get_num_preproc()); 00139 } 00140 } 00141 00142 float64_t result=0; 00143 uint8_t mask=0; 00144 00145 for (int32_t d=0; d<degree; d++) 00146 { 00147 mask = mask | (1 << (degree-d-1)); 00148 uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask); 00149 00150 int32_t left_idx=0; 00151 int32_t right_idx=0; 00152 float64_t weight=weights[d]*weights[d]; 00153 00154 while (left_idx < alen && right_idx < blen) 00155 { 00156 uint16_t lsym=avec[left_idx] & masked; 00157 uint16_t rsym=bvec[right_idx] & masked; 00158 00159 if (lsym == rsym) 00160 { 00161 int32_t old_left_idx=left_idx; 00162 int32_t old_right_idx=right_idx; 00163 00164 while (left_idx<alen && (avec[left_idx] & masked) ==lsym) 00165 left_idx++; 00166 00167 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym) 00168 right_idx++; 00169 00170 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx); 00171 } 00172 else if (lsym<rsym) 00173 left_idx++; 00174 else 00175 right_idx++; 00176 } 00177 } 00178 00179 if (do_sort) 00180 { 00181 delete[] avec; 00182 delete[] bvec; 00183 } 00184 00185 l->free_feature_vector(av, idx_a, free_avec); 00186 r->free_feature_vector(bv, idx_b, free_bvec); 00187 00188 return result; 00189 } 00190 00191 void CWeightedCommWordStringKernel::add_to_normal( 00192 int32_t vec_idx, float64_t weight) 00193 { 00194 int32_t len=-1; 00195 bool free_vec; 00196 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs; 00197 uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec); 00198 00199 if (len>0) 00200 { 00201 for (int32_t j=0; j<len; j++) 00202 { 00203 uint8_t mask=0; 00204 int32_t offs=0; 00205 for (int32_t d=0; d<degree; d++) 00206 { 00207 mask = mask | (1 << (degree-d-1)); 00208 int32_t idx=s->get_masked_symbols(vec[j], mask); 00209 idx=s->shift_symbol(idx, degree-d-1); 00210 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx); 00211 offs+=s->shift_offset(1,d+1); 00212 } 00213 } 00214 00215 set_is_initialized(true); 00216 } 00217 00218 s->free_feature_vector(vec, vec_idx, free_vec); 00219 } 00220 00221 void CWeightedCommWordStringKernel::merge_normal() 00222 { 00223 ASSERT(get_is_initialized()); 00224 ASSERT(use_sign==false); 00225 00226 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs; 00227 uint32_t num_symbols=(uint32_t) s->get_num_symbols(); 00228 int32_t dic_size=1<<(sizeof(uint16_t)*8); 00229 float64_t* dic=new float64_t[dic_size]; 00230 memset(dic, 0, sizeof(float64_t)*dic_size); 00231 00232 for (uint32_t sym=0; sym<num_symbols; sym++) 00233 { 00234 float64_t result=0; 00235 uint8_t mask=0; 00236 int32_t offs=0; 00237 for (int32_t d=0; d<degree; d++) 00238 { 00239 mask = mask | (1 << (degree-d-1)); 00240 int32_t idx=s->get_masked_symbols(sym, mask); 00241 idx=s->shift_symbol(idx, degree-d-1); 00242 result += dictionary_weights[offs + idx]; 00243 offs+=s->shift_offset(1,d+1); 00244 } 00245 dic[sym]=result; 00246 } 00247 00248 init_dictionary(1<<(sizeof(uint16_t)*8)); 00249 memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size); 00250 delete[] dic; 00251 } 00252 00253 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i) 00254 { 00255 if (!get_is_initialized()) 00256 SG_ERROR( "CCommWordStringKernel optimization not initialized\n"); 00257 00258 ASSERT(use_sign==false); 00259 00260 float64_t result=0; 00261 bool free_vec; 00262 int32_t len=-1; 00263 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs; 00264 uint16_t* vec=s->get_feature_vector(i, len, free_vec); 00265 00266 if (vec && len>0) 00267 { 00268 for (int32_t j=0; j<len; j++) 00269 { 00270 uint8_t mask=0; 00271 int32_t offs=0; 00272 for (int32_t d=0; d<degree; d++) 00273 { 00274 mask = mask | (1 << (degree-d-1)); 00275 int32_t idx=s->get_masked_symbols(vec[j], mask); 00276 idx=s->shift_symbol(idx, degree-d-1); 00277 result += dictionary_weights[offs + idx]*weights[d]; 00278 offs+=s->shift_offset(1,d+1); 00279 } 00280 } 00281 00282 result=normalizer->normalize_rhs(result, i); 00283 } 00284 s->free_feature_vector(vec, i, free_vec); 00285 return result; 00286 } 00287 00288 float64_t* CWeightedCommWordStringKernel::compute_scoring( 00289 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target, 00290 int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init) 00291 { 00292 if (do_init) 00293 CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas); 00294 00295 int32_t dic_size=1<<(sizeof(uint16_t)*9); 00296 float64_t* dic=new float64_t[dic_size]; 00297 memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size); 00298 00299 merge_normal(); 00300 float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat, 00301 num_sym, target, num_suppvec, IDX, alphas, false); 00302 00303 init_dictionary(1<<(sizeof(uint16_t)*9)); 00304 memcpy(dictionary_weights,dic, sizeof(float64_t)*dic_size); 00305 delete[] dic; 00306 00307 return result; 00308 } 00309 00310 void CWeightedCommWordStringKernel::init() 00311 { 00312 degree=0; 00313 weights=NULL; 00314 00315 init_dictionary(1<<(sizeof(uint16_t)*9)); 00316 00317 m_parameters->add_vector(&weights, °ree, "weights", 00318 "weights for each of the subkernels of degree 1...d"); 00319 }