SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 */ 00010 00011 #include "features/HashedWDFeatures.h" 00012 #include "lib/io.h" 00013 00014 using namespace shogun; 00015 00016 CHashedWDFeatures::CHashedWDFeatures(void) :CDotFeatures() 00017 { 00018 SG_UNSTABLE("CHashedWDFeatures::CHashedWDFeatures(void)", "\n"); 00019 00020 strings = NULL; 00021 00022 degree = 0; 00023 start_degree = 0; 00024 from_degree = 0; 00025 string_length = 0; 00026 num_strings = 0; 00027 alphabet_size = 0; 00028 w_dim = 0; 00029 partial_w_dim = 0; 00030 wd_weights = NULL; 00031 mask = 0; 00032 m_hash_bits = 0; 00033 00034 normalization_const = 0.0; 00035 } 00036 00037 CHashedWDFeatures::CHashedWDFeatures(CStringFeatures<uint8_t>* str, 00038 int32_t start_order, int32_t order, int32_t from_order, 00039 int32_t hash_bits) : CDotFeatures() 00040 { 00041 ASSERT(start_order>=0); 00042 ASSERT(start_order<order); 00043 ASSERT(order<=from_order); 00044 ASSERT(hash_bits>0); 00045 ASSERT(str); 00046 ASSERT(str->have_same_length()); 00047 SG_REF(str); 00048 00049 strings=str; 00050 string_length=str->get_max_vector_length(); 00051 num_strings=str->get_num_vectors(); 00052 CAlphabet* alpha=str->get_alphabet(); 00053 alphabet_size=alpha->get_num_symbols(); 00054 SG_UNREF(alpha); 00055 00056 degree=order; 00057 start_degree=start_order; 00058 from_degree=from_order; 00059 m_hash_bits=hash_bits; 00060 set_wd_weights(); 00061 set_normalization_const(); 00062 } 00063 00064 CHashedWDFeatures::CHashedWDFeatures(const CHashedWDFeatures& orig) 00065 : CDotFeatures(orig), strings(orig.strings), 00066 degree(orig.degree), start_degree(orig.start_degree), 00067 from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits), 00068 normalization_const(orig.normalization_const) 00069 { 00070 SG_REF(strings); 00071 string_length=strings->get_max_vector_length(); 00072 num_strings=strings->get_num_vectors(); 00073 CAlphabet* alpha=strings->get_alphabet(); 00074 alphabet_size=alpha->get_num_symbols(); 00075 SG_UNREF(alpha); 00076 00077 set_wd_weights(); 00078 } 00079 00080 CHashedWDFeatures::~CHashedWDFeatures() 00081 { 00082 SG_UNREF(strings); 00083 delete[] wd_weights; 00084 } 00085 00086 float64_t CHashedWDFeatures::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 00087 { 00088 ASSERT(df); 00089 ASSERT(df->get_feature_type() == get_feature_type()); 00090 ASSERT(df->get_feature_class() == get_feature_class()); 00091 CHashedWDFeatures* wdf = (CHashedWDFeatures*) df; 00092 00093 int32_t len1, len2; 00094 bool free_vec1, free_vec2; 00095 00096 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1); 00097 uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2); 00098 00099 ASSERT(len1==len2); 00100 00101 float64_t sum=0.0; 00102 00103 for (int32_t i=0; i<len1; i++) 00104 { 00105 for (int32_t j=0; (i+j<len1) && (j<degree); j++) 00106 { 00107 if (vec1[i+j]!=vec2[i+j]) 00108 break; 00109 if (j>=start_degree) 00110 sum += wd_weights[j]*wd_weights[j]; 00111 } 00112 } 00113 strings->free_feature_vector(vec1, vec_idx1, free_vec1); 00114 wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2); 00115 return sum/CMath::sq(normalization_const); 00116 } 00117 00118 float64_t CHashedWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 00119 { 00120 if (vec2_len != w_dim) 00121 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00122 00123 float64_t sum=0; 00124 int32_t lim=CMath::min(degree, string_length); 00125 int32_t len; 00126 bool free_vec1; 00127 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00128 uint32_t* val=new uint32_t[len]; 00129 00130 uint32_t offs=0; 00131 00132 if (start_degree>0) 00133 { 00134 // compute hash for strings of length start_degree-1 00135 for (int32_t i=0; i+start_degree < len; i++) 00136 val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF); 00137 } 00138 else 00139 CMath::fill_vector(val, len, 0xDEADBEAF); 00140 00141 for (int32_t k=start_degree; k<lim; k++) 00142 { 00143 float64_t wd = wd_weights[k]; 00144 00145 uint32_t o=offs; 00146 for (int32_t i=0; i+k < len; i++) 00147 { 00148 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00149 val[i]=h; 00150 #ifdef DEBUG_HASHEDWD 00151 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00152 #endif 00153 sum+=vec2[o+(h & mask)]*wd; 00154 o+=partial_w_dim; 00155 } 00156 offs+=partial_w_dim*len; 00157 } 00158 delete[] val; 00159 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00160 00161 return sum/normalization_const; 00162 } 00163 00164 void CHashedWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00165 { 00166 if (vec2_len != w_dim) 00167 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00168 00169 int32_t lim=CMath::min(degree, string_length); 00170 int32_t len; 00171 bool free_vec1; 00172 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00173 uint32_t* val=new uint32_t[len]; 00174 00175 uint32_t offs=0; 00176 00177 if (start_degree>0) 00178 { 00179 // compute hash for strings of length start_degree-1 00180 for (int32_t i=0; i+start_degree < len; i++) 00181 val[i]=CHash::MurmurHash2(&vec[i], start_degree, 0xDEADBEAF); 00182 } 00183 else 00184 CMath::fill_vector(val, len, 0xDEADBEAF); 00185 00186 for (int32_t k=start_degree; k<lim; k++) 00187 { 00188 float64_t wd = alpha*wd_weights[k]/normalization_const; 00189 00190 if (abs_val) 00191 wd=CMath::abs(wd); 00192 00193 uint32_t o=offs; 00194 for (int32_t i=0; i+k < len; i++) 00195 { 00196 const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]); 00197 val[i]=h; 00198 00199 #ifdef DEBUG_HASHEDWD 00200 SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h); 00201 SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h); 00202 #endif 00203 vec2[o+(h & mask)]+=wd; 00204 o+=partial_w_dim; 00205 } 00206 offs+=partial_w_dim*len; 00207 } 00208 00209 delete[] val; 00210 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00211 } 00212 00213 void CHashedWDFeatures::set_wd_weights() 00214 { 00215 ASSERT(degree>0); 00216 00217 mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1; 00218 partial_w_dim=1<<m_hash_bits; 00219 w_dim=partial_w_dim*string_length*(degree-start_degree); 00220 00221 wd_weights=new float64_t[degree]; 00222 00223 for (int32_t i=0; i<degree; i++) 00224 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1))); 00225 00226 SG_DEBUG("created HashedWDFeatures with d=%d (%d), alphabetsize=%d, " 00227 "dim=%d partial_dim=%d num=%d, len=%d\n", 00228 degree, from_degree, alphabet_size, 00229 w_dim, partial_w_dim, num_strings, string_length); 00230 } 00231 00232 00233 void CHashedWDFeatures::set_normalization_const(float64_t n) 00234 { 00235 if (n==0) 00236 { 00237 normalization_const=0; 00238 for (int32_t i=0; i<degree; i++) 00239 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i]; 00240 00241 normalization_const=CMath::sqrt(normalization_const); 00242 } 00243 else 00244 normalization_const=n; 00245 00246 SG_DEBUG("normalization_const:%f\n", normalization_const); 00247 } 00248 00249 CFeatures* CHashedWDFeatures::duplicate() const 00250 { 00251 return new CHashedWDFeatures(*this); 00252 }