SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include "features/HashedWDFeaturesTransposed.h" 00012 #include "lib/io.h" 00013 #include "lib/Signal.h" 00014 #include "base/Parallel.h" 00015 00016 #ifndef WIN32 00017 #include <pthread.h> 00018 #endif 00019 00020 using namespace shogun; 00021 00022 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00023 struct HASHEDWD_THREAD_PARAM 00024 { 00025 CHashedWDFeaturesTransposed* hf; 00026 int32_t* sub_index; 00027 float64_t* output; 00028 int32_t start; 00029 int32_t stop; 00030 float64_t* alphas; 00031 float64_t* vec; 00032 float64_t bias; 00033 bool progress; 00034 uint32_t* index; 00035 }; 00036 #endif // DOXYGEN_SHOULD_SKIP_THIS 00037 00038 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(void) 00039 :CDotFeatures() 00040 { 00041 SG_UNSTABLE( 00042 "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(void)", 00043 "\n"); 00044 00045 strings = NULL; 00046 transposed_strings = NULL; 00047 00048 degree = 0; 00049 start_degree = 0; 00050 from_degree = 0; 00051 string_length = 0; 00052 num_strings = 0; 00053 alphabet_size = 0; 00054 w_dim = 0; 00055 partial_w_dim = 0; 00056 wd_weights = NULL; 00057 mask = 0; 00058 m_hash_bits = 0; 00059 00060 normalization_const = 0.0; 00061 } 00062 00063 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(CStringFeatures<uint8_t>* str, 00064 int32_t start_order, int32_t order, int32_t from_order, 00065 int32_t hash_bits) : CDotFeatures() 00066 { 00067 ASSERT(start_order>=0); 00068 ASSERT(start_order<order); 00069 ASSERT(order<=from_order); 00070 ASSERT(hash_bits>0); 00071 ASSERT(str); 00072 ASSERT(str->have_same_length()); 00073 SG_REF(str); 00074 00075 strings=str; 00076 int32_t transposed_num_feat=0; 00077 int32_t transposed_num_vec=0; 00078 transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec); 00079 00080 string_length=str->get_max_vector_length(); 00081 num_strings=str->get_num_vectors(); 00082 ASSERT(transposed_num_feat==num_strings); 00083 ASSERT(transposed_num_vec==string_length); 00084 00085 CAlphabet* alpha=str->get_alphabet(); 00086 alphabet_size=alpha->get_num_symbols(); 00087 SG_UNREF(alpha); 00088 00089 degree=order; 00090 start_degree=start_order; 00091 if (start_degree!=0) 00092 SG_NOTIMPLEMENTED; 00093 from_degree=from_order; 00094 m_hash_bits=hash_bits; 00095 set_wd_weights(); 00096 set_normalization_const(); 00097 } 00098 00099 CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed(const CHashedWDFeaturesTransposed& orig) 00100 : CDotFeatures(orig), strings(orig.strings), transposed_strings(transposed_strings), 00101 degree(orig.degree), start_degree(orig.start_degree), 00102 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits), 00103 normalization_const(orig.normalization_const) 00104 { 00105 SG_REF(strings); 00106 string_length=strings->get_max_vector_length(); 00107 num_strings=strings->get_num_vectors(); 00108 CAlphabet* alpha=strings->get_alphabet(); 00109 alphabet_size=alpha->get_num_symbols(); 00110 SG_UNREF(alpha); 00111 00112 set_wd_weights(); 00113 } 00114 00115 CHashedWDFeaturesTransposed::~CHashedWDFeaturesTransposed() 00116 { 00117 for (int32_t i=0; i<string_length; i++) 00118 delete[] transposed_strings[i].string; 00119 delete[] transposed_strings; 00120 00121 SG_UNREF(strings); 00122 delete[] wd_weights; 00123 } 00124 00125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00126 { 00127 ASSERT(df); 00128 ASSERT(df->get_feature_type() == get_feature_type()); 00129 ASSERT(df->get_feature_class() == get_feature_class()); 00130 CHashedWDFeaturesTransposed* wdf = (CHashedWDFeaturesTransposed*) df; 00131 00132 int32_t len1, len2; 00133 bool free_vec1, free_vec2; 00134 00135 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00136 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00137 00138 ASSERT(len1==len2); 00139 00140 float64_t sum=0.0; 00141 00142 for (int32_t i=0; i<len1; i++) 00143 { 00144 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00145 { 00146 if (vec1[i+j]!=vec2[i+j]) 00147 break; 00148 if (j>=start_degree) 00149 sum += wd_weights[j]*wd_weights[j]; 00150 } 00151 } 00152 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00153 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00154 return sum/CMath::sq(normalization_const); 00155 } 00156 00157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 00158 { 00159 if (vec2_len != w_dim) 00160 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00161 00162 float64_t sum=0; 00163 int32_t len; 00164 bool free_vec1; 00165 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00166 uint32_t* val=new uint32_t[len]; 00167 00168 uint32_t offs=0; 00169 00170 CMath::fill_vector(val, len, 0xDEADBEAF); 00171 00172 for (int32_t i=0; i < len; i++) 00173 { 00174 uint32_t o=offs; 00175 for (int32_t k=0; k<degree && i+k<len; k++) 00176 { 00177 const float64_t wd = wd_weights[k]; 00178 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00179 val[i]=h; 00180 #ifdef DEBUG_HASHEDWD 00181 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00182 #endif 00183 sum+=vec2[o+(h & mask)]*wd; 00184 o+=partial_w_dim; 00185 } 00186 offs+=partial_w_dim*degree; 00187 } 00188 delete[] val; 00189 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00190 00191 return sum/normalization_const; 00192 } 00193 00194 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00195 { 00196 ASSERT(output); 00197 // write access is internally between output[start..stop] so the following 00198 // line is necessary to write to output[0...(stop-start-1)] 00199 output-=start; 00200 ASSERT(start>=0); 00201 ASSERT(start<stop); 00202 ASSERT(stop<=get_num_vectors()); 00203 uint32_t* index=new uint32_t[stop]; 00204 00205 int32_t num_vectors=stop-start; 00206 ASSERT(num_vectors>0); 00207 00208 int32_t num_threads=parallel->get_num_threads(); 00209 ASSERT(num_threads>0); 00210 00211 CSignal::clear_cancel(); 00212 00213 if (dim != w_dim) 00214 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim); 00215 00216 #ifndef WIN32 00217 if (num_threads < 2) 00218 { 00219 #endif 00220 HASHEDWD_THREAD_PARAM params; 00221 params.hf=this; 00222 params.sub_index=NULL; 00223 params.output=output; 00224 params.start=start; 00225 params.stop=stop; 00226 params.alphas=alphas; 00227 params.vec=vec; 00228 params.bias=b; 00229 params.progress=false; //true; 00230 params.index=index; 00231 dense_dot_range_helper((void*) ¶ms); 00232 #ifndef WIN32 00233 } 00234 else 00235 { 00236 pthread_t* threads = new pthread_t[num_threads-1]; 00237 HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads]; 00238 int32_t step= num_vectors/num_threads; 00239 00240 int32_t t; 00241 00242 for (t=0; t<num_threads-1; t++) 00243 { 00244 params[t].hf = this; 00245 params[t].sub_index=NULL; 00246 params[t].output = output; 00247 params[t].start = start+t*step; 00248 params[t].stop = start+(t+1)*step; 00249 params[t].alphas=alphas; 00250 params[t].vec=vec; 00251 params[t].bias=b; 00252 params[t].progress = false; 00253 params[t].index=index; 00254 pthread_create(&threads[t], NULL, 00255 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]); 00256 } 00257 00258 params[t].hf = this; 00259 params[t].sub_index=NULL; 00260 params[t].output = output; 00261 params[t].start = start+t*step; 00262 params[t].stop = stop; 00263 params[t].alphas=alphas; 00264 params[t].vec=vec; 00265 params[t].bias=b; 00266 params[t].progress = false; //true; 00267 params[t].index=index; 00268 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]); 00269 00270 for (t=0; t<num_threads-1; t++) 00271 pthread_join(threads[t], NULL); 00272 00273 delete[] params; 00274 delete[] threads; 00275 delete[] index; 00276 } 00277 #endif 00278 00279 #ifndef WIN32 00280 if ( CSignal::cancel_computations() ) 00281 SG_INFO( "prematurely stopped. \n"); 00282 #endif 00283 } 00284 00285 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b) 00286 { 00287 ASSERT(sub_index); 00288 ASSERT(output); 00289 00290 uint32_t* index=new uint32_t[num]; 00291 00292 int32_t num_threads=parallel->get_num_threads(); 00293 ASSERT(num_threads>0); 00294 00295 CSignal::clear_cancel(); 00296 00297 if (dim != w_dim) 00298 SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim); 00299 00300 #ifndef WIN32 00301 if (num_threads < 2) 00302 { 00303 #endif 00304 HASHEDWD_THREAD_PARAM params; 00305 params.hf=this; 00306 params.sub_index=sub_index; 00307 params.output=output; 00308 params.start=0; 00309 params.stop=num; 00310 params.alphas=alphas; 00311 params.vec=vec; 00312 params.bias=b; 00313 params.progress=false; //true; 00314 params.index=index; 00315 dense_dot_range_helper((void*) ¶ms); 00316 #ifndef WIN32 00317 } 00318 else 00319 { 00320 pthread_t* threads = new pthread_t[num_threads-1]; 00321 HASHEDWD_THREAD_PARAM* params = new HASHEDWD_THREAD_PARAM[num_threads]; 00322 int32_t step= num/num_threads; 00323 00324 int32_t t; 00325 00326 for (t=0; t<num_threads-1; t++) 00327 { 00328 params[t].hf = this; 00329 params[t].sub_index=sub_index; 00330 params[t].output = output; 00331 params[t].start = t*step; 00332 params[t].stop = (t+1)*step; 00333 params[t].alphas=alphas; 00334 params[t].vec=vec; 00335 params[t].bias=b; 00336 params[t].progress = false; 00337 params[t].index=index; 00338 pthread_create(&threads[t], NULL, 00339 CHashedWDFeaturesTransposed::dense_dot_range_helper, (void*)¶ms[t]); 00340 } 00341 00342 params[t].hf = this; 00343 params[t].sub_index=sub_index; 00344 params[t].output = output; 00345 params[t].start = t*step; 00346 params[t].stop = num; 00347 params[t].alphas=alphas; 00348 params[t].vec=vec; 00349 params[t].bias=b; 00350 params[t].progress = false; //true; 00351 params[t].index=index; 00352 CHashedWDFeaturesTransposed::dense_dot_range_helper((void*) ¶ms[t]); 00353 00354 for (t=0; t<num_threads-1; t++) 00355 pthread_join(threads[t], NULL); 00356 00357 delete[] params; 00358 delete[] threads; 00359 delete[] index; 00360 } 00361 #endif 00362 00363 #ifndef WIN32 00364 if ( CSignal::cancel_computations() ) 00365 SG_INFO( "prematurely stopped. \n"); 00366 #endif 00367 } 00368 00369 void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p) 00370 { 00371 HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p; 00372 CHashedWDFeaturesTransposed* hf=par->hf; 00373 int32_t* sub_index=par->sub_index; 00374 float64_t* output=par->output; 00375 int32_t start=par->start; 00376 int32_t stop=par->stop; 00377 float64_t* alphas=par->alphas; 00378 float64_t* vec=par->vec; 00379 float64_t bias=par->bias; 00380 bool progress=par->progress; 00381 uint32_t* index=par->index; 00382 int32_t string_length=hf->string_length; 00383 int32_t degree=hf->degree; 00384 float64_t* wd_weights=hf->wd_weights; 00385 TString<uint8_t>* transposed_strings=hf->transposed_strings; 00386 uint32_t mask=hf->mask; 00387 int32_t partial_w_dim=hf->partial_w_dim; 00388 float64_t normalization_const=hf->normalization_const; 00389 00390 if (sub_index) 00391 { 00392 for (int32_t j=start; j<stop; j++) 00393 output[j]=0.0; 00394 00395 uint32_t offs=0; 00396 for (int32_t i=0; i<string_length; i++) 00397 { 00398 uint32_t o=offs; 00399 for (int32_t k=0; k<degree && i+k<string_length; k++) 00400 { 00401 const float64_t wd = wd_weights[k]; 00402 uint8_t* dim=transposed_strings[i+k].string; 00403 uint32_t h; 00404 00405 for (int32_t j=start; j<stop; j++) 00406 { 00407 uint8_t bval=dim[sub_index[j]]; 00408 if (k==0) 00409 h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF); 00410 else 00411 h=CHash::IncrementalMurmurHash2(bval, index[j]); 00412 index[j]=h; 00413 output[j]+=vec[o + (h & mask)]*wd; 00414 } 00415 o+=partial_w_dim; 00416 } 00417 offs+=partial_w_dim*degree; 00418 00419 if (progress) 00420 hf->io->progress(i, 0,string_length, i); 00421 } 00422 00423 for (int32_t j=start; j<stop; j++) 00424 { 00425 if (alphas) 00426 output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias; 00427 else 00428 output[j]=output[j]/normalization_const+bias; 00429 } 00430 } 00431 else 00432 { 00433 CMath::fill_vector(&output[start], stop-start, 0.0); 00434 00435 uint32_t offs=0; 00436 for (int32_t i=0; i<string_length; i++) 00437 { 00438 uint32_t o=offs; 00439 for (int32_t k=0; k<degree && i+k<string_length; k++) 00440 { 00441 const float64_t wd = wd_weights[k]; 00442 uint8_t* dim=transposed_strings[i+k].string; 00443 uint32_t h; 00444 00445 for (int32_t j=start; j<stop; j++) 00446 { 00447 if (k==0) 00448 h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF); 00449 else 00450 h=CHash::IncrementalMurmurHash2(dim[j], index[j]); 00451 index[j]=h; 00452 output[j]+=vec[o + (h & mask)]*wd; 00453 } 00454 o+=partial_w_dim; 00455 } 00456 offs+=partial_w_dim*degree; 00457 00458 if (progress) 00459 hf->io->progress(i, 0,string_length, i); 00460 } 00461 00462 for (int32_t j=start; j<stop; j++) 00463 { 00464 if (alphas) 00465 output[j]=output[j]*alphas[j]/normalization_const+bias; 00466 else 00467 output[j]=output[j]/normalization_const+bias; 00468 } 00469 } 00470 00471 return NULL; 00472 } 00473 00474 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00475 { 00476 if (vec2_len != w_dim) 00477 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00478 00479 int32_t len; 00480 bool free_vec1; 00481 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00482 uint32_t* val=new uint32_t[len]; 00483 00484 uint32_t offs=0; 00485 float64_t factor=alpha/normalization_const; 00486 if (abs_val) 00487 factor=CMath::abs(factor); 00488 00489 CMath::fill_vector(val, len, 0xDEADBEAF); 00490 00491 for (int32_t i=0; i<len; i++) 00492 { 00493 uint32_t o=offs; 00494 for (int32_t k=0; k<degree && i+k<len; k++) 00495 { 00496 float64_t wd = wd_weights[k]*factor; 00497 00498 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00499 val[i]=h; 00500 00501 #ifdef DEBUG_HASHEDWD 00502 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h); 00503 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00504 #endif 00505 vec2[o+(h & mask)]+=wd; 00506 o+=partial_w_dim; 00507 } 00508 offs+=partial_w_dim*degree; 00509 } 00510 00511 delete[] val; 00512 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00513 } 00514 00515 void CHashedWDFeaturesTransposed::set_wd_weights() 00516 { 00517 ASSERT(degree>0); 00518 00519 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00520 partial_w_dim=1<<m_hash_bits; 00521 w_dim=partial_w_dim*string_length*(degree-start_degree); 00522 00523 wd_weights=new float64_t[degree]; 00524 00525 for (int32_t i=0; i<degree; i++) 00526 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00527 00528 SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, " 00529 "dim=%d partial_dim=%d num=%d, len=%d\n", 00530 degree, from_degree, alphabet_size, 00531 w_dim, partial_w_dim, num_strings, string_length); 00532 } 00533 00534 00535 void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n) 00536 { 00537 if (n==0) 00538 { 00539 normalization_const=0; 00540 for (int32_t i=0; i<degree; i++) 00541 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00542 00543 normalization_const=CMath::sqrt(normalization_const); 00544 } 00545 else 00546 normalization_const=n; 00547 00548 SG_DEBUG("normalization_const:%f\n", normalization_const); 00549 } 00550 00551 CFeatures* CHashedWDFeaturesTransposed::duplicate() const 00552 { 00553 return new CHashedWDFeaturesTransposed(*this); 00554 }