SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "kernel/CommUlongStringKernel.h" 00013 #include "kernel/SqrtDiagKernelNormalizer.h" 00014 #include "features/StringFeatures.h" 00015 #include "lib/io.h" 00016 00017 using namespace shogun; 00018 00019 CCommUlongStringKernel::CCommUlongStringKernel(int32_t size, bool us) 00020 : CStringKernel<uint64_t>(size), use_sign(us) 00021 { 00022 properties |= KP_LINADD; 00023 clear_normal(); 00024 00025 set_normalizer(new CSqrtDiagKernelNormalizer()); 00026 } 00027 00028 CCommUlongStringKernel::CCommUlongStringKernel( 00029 CStringFeatures<uint64_t>* l, CStringFeatures<uint64_t>* r, bool us, 00030 int32_t size) 00031 : CStringKernel<uint64_t>(size), use_sign(us) 00032 { 00033 properties |= KP_LINADD; 00034 clear_normal(); 00035 set_normalizer(new CSqrtDiagKernelNormalizer()); 00036 init(l,r); 00037 } 00038 00039 CCommUlongStringKernel::~CCommUlongStringKernel() 00040 { 00041 cleanup(); 00042 } 00043 00044 void CCommUlongStringKernel::remove_lhs() 00045 { 00046 delete_optimization(); 00047 00048 #ifdef SVMLIGHT 00049 if (lhs) 00050 cache_reset(); 00051 #endif 00052 00053 lhs = NULL ; 00054 rhs = NULL ; 00055 } 00056 00057 void CCommUlongStringKernel::remove_rhs() 00058 { 00059 #ifdef SVMLIGHT 00060 if (rhs) 00061 cache_reset(); 00062 #endif 00063 00064 rhs = lhs; 00065 } 00066 00067 bool CCommUlongStringKernel::init(CFeatures* l, CFeatures* r) 00068 { 00069 CStringKernel<uint64_t>::init(l,r); 00070 return init_normalizer(); 00071 } 00072 00073 void CCommUlongStringKernel::cleanup() 00074 { 00075 delete_optimization(); 00076 clear_normal(); 00077 CKernel::cleanup(); 00078 } 00079 00080 float64_t CCommUlongStringKernel::compute(int32_t idx_a, int32_t idx_b) 00081 { 00082 int32_t alen, blen; 00083 bool free_avec, free_bvec; 00084 uint64_t* avec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00085 uint64_t* bvec=((CStringFeatures<uint64_t>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00086 00087 float64_t result=0; 00088 00089 int32_t left_idx=0; 00090 int32_t right_idx=0; 00091 00092 if (use_sign) 00093 { 00094 while (left_idx < alen && right_idx < blen) 00095 { 00096 if (avec[left_idx]==bvec[right_idx]) 00097 { 00098 uint64_t sym=avec[left_idx]; 00099 00100 while (left_idx< alen && avec[left_idx]==sym) 00101 left_idx++; 00102 00103 while (right_idx< blen && bvec[right_idx]==sym) 00104 right_idx++; 00105 00106 result++; 00107 } 00108 else if (avec[left_idx]<bvec[right_idx]) 00109 left_idx++; 00110 else 00111 right_idx++; 00112 } 00113 } 00114 else 00115 { 00116 while (left_idx < alen && right_idx < blen) 00117 { 00118 if (avec[left_idx]==bvec[right_idx]) 00119 { 00120 int32_t old_left_idx=left_idx; 00121 int32_t old_right_idx=right_idx; 00122 00123 uint64_t sym=avec[left_idx]; 00124 00125 while (left_idx< alen && avec[left_idx]==sym) 00126 left_idx++; 00127 00128 while (right_idx< blen && bvec[right_idx]==sym) 00129 right_idx++; 00130 00131 result+=((float64_t) (left_idx-old_left_idx)) * ((float64_t) (right_idx-old_right_idx)); 00132 } 00133 else if (avec[left_idx]<bvec[right_idx]) 00134 left_idx++; 00135 else 00136 right_idx++; 00137 } 00138 } 00139 ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00140 ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00141 00142 return result; 00143 } 00144 00145 void CCommUlongStringKernel::add_to_normal(int32_t vec_idx, float64_t weight) 00146 { 00147 int32_t t=0; 00148 int32_t j=0; 00149 int32_t k=0; 00150 int32_t last_j=0; 00151 int32_t len=-1; 00152 bool free_vec; 00153 uint64_t* vec=((CStringFeatures<uint64_t>*) lhs)->get_feature_vector(vec_idx, len, free_vec); 00154 00155 if (vec && len>0) 00156 { 00157 //use malloc not new [] as DynamicArray uses it 00158 uint64_t* dic= (uint64_t*) malloc( 00159 (len+dictionary.get_num_elements())*sizeof(uint64_t)); 00160 float64_t* dic_weights= (float64_t*) malloc( 00161 (len+dictionary.get_num_elements())*sizeof(float64_t)); 00162 00163 if (use_sign) 00164 { 00165 for (j=1; j<len; j++) 00166 { 00167 if (vec[j]==vec[j-1]) 00168 continue; 00169 00170 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx); 00171 } 00172 00173 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight, vec_idx); 00174 00175 while (k<dictionary.get_num_elements()) 00176 { 00177 dic[t]=dictionary[k]; 00178 dic_weights[t]=dictionary_weights[k]; 00179 t++; 00180 k++; 00181 } 00182 } 00183 else 00184 { 00185 for (j=1; j<len; j++) 00186 { 00187 if (vec[j]==vec[j-1]) 00188 continue; 00189 00190 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx); 00191 last_j = j; 00192 } 00193 00194 merge_dictionaries(t, j, k, vec, dic, dic_weights, weight*(j-last_j), vec_idx); 00195 00196 while (k<dictionary.get_num_elements()) 00197 { 00198 dic[t]=dictionary[k]; 00199 dic_weights[t]=dictionary_weights[k]; 00200 t++; 00201 k++; 00202 } 00203 } 00204 00205 dictionary.set_array(dic, t, len+dictionary.get_num_elements()); 00206 dictionary_weights.set_array(dic_weights, t, len+dictionary.get_num_elements()); 00207 } 00208 ((CStringFeatures<uint64_t>*) lhs)->free_feature_vector(vec, vec_idx, free_vec); 00209 00210 set_is_initialized(true); 00211 } 00212 00213 void CCommUlongStringKernel::clear_normal() 00214 { 00215 dictionary.resize_array(0); 00216 dictionary_weights.resize_array(0); 00217 set_is_initialized(false); 00218 } 00219 00220 bool CCommUlongStringKernel::init_optimization( 00221 int32_t count, int32_t *IDX, float64_t * weights) 00222 { 00223 clear_normal(); 00224 00225 if (count<=0) 00226 { 00227 set_is_initialized(true); 00228 SG_DEBUG( "empty set of SVs\n"); 00229 return true; 00230 } 00231 00232 SG_DEBUG( "initializing CCommUlongStringKernel optimization\n"); 00233 00234 for (int32_t i=0; i<count; i++) 00235 { 00236 if ( (i % (count/10+1)) == 0) 00237 SG_PROGRESS(i, 0, count); 00238 00239 add_to_normal(IDX[i], weights[i]); 00240 } 00241 00242 SG_PRINT( "Done. \n"); 00243 00244 set_is_initialized(true); 00245 return true; 00246 } 00247 00248 bool CCommUlongStringKernel::delete_optimization() 00249 { 00250 SG_DEBUG( "deleting CCommUlongStringKernel optimization\n"); 00251 clear_normal(); 00252 return true; 00253 } 00254 00255 // binary search for each feature. trick: as features are sorted save last found idx in old_idx and 00256 // only search in the remainder of the dictionary 00257 float64_t CCommUlongStringKernel::compute_optimized(int32_t i) 00258 { 00259 float64_t result = 0; 00260 int32_t j, last_j=0; 00261 int32_t old_idx = 0; 00262 00263 if (!get_is_initialized()) 00264 { 00265 SG_ERROR( "CCommUlongStringKernel optimization not initialized\n"); 00266 return 0 ; 00267 } 00268 00269 00270 00271 int32_t alen = -1; 00272 bool free_avec; 00273 uint64_t* avec=((CStringFeatures<uint64_t>*) rhs)-> 00274 get_feature_vector(i, alen, free_avec); 00275 00276 if (avec && alen>0) 00277 { 00278 if (use_sign) 00279 { 00280 for (j=1; j<alen; j++) 00281 { 00282 if (avec[j]==avec[j-1]) 00283 continue; 00284 00285 int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[j-1]); 00286 00287 if (idx!=-1) 00288 { 00289 if (dictionary[idx+old_idx] == avec[j-1]) 00290 result += dictionary_weights[idx+old_idx]; 00291 00292 old_idx+=idx; 00293 } 00294 } 00295 00296 int32_t idx = CMath::binary_search(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[alen-1]); 00297 if (idx!=-1) 00298 result += dictionary_weights[idx+old_idx]; 00299 } 00300 else 00301 { 00302 for (j=1; j<alen; j++) 00303 { 00304 if (avec[j]==avec[j-1]) 00305 continue; 00306 00307 int32_t idx = CMath::binary_search_max_lower_equal(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[j-1]); 00308 00309 if (idx!=-1) 00310 { 00311 if (dictionary[idx+old_idx] == avec[j-1]) 00312 result += dictionary_weights[idx+old_idx]*(j-last_j); 00313 00314 old_idx+=idx; 00315 } 00316 00317 last_j = j; 00318 } 00319 00320 int32_t idx = CMath::binary_search(&(dictionary.get_array()[old_idx]), dictionary.get_num_elements()-old_idx, avec[alen-1]); 00321 if (idx!=-1) 00322 result += dictionary_weights[idx+old_idx]*(alen-last_j); 00323 } 00324 } 00325 00326 ((CStringFeatures<uint64_t>*) rhs)->free_feature_vector(avec, i, free_avec); 00327 00328 return normalizer->normalize_rhs(result, i); 00329 }