00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "lib/common.h"
00012 #include "kernel/WeightedCommWordStringKernel.h"
00013 #include "features/StringFeatures.h"
00014 #include "lib/io.h"
00015
00016 using namespace shogun;
00017
00018 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00019 int32_t size, bool us)
00020 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00021 {
00022 init_dictionary(1<<(sizeof(uint16_t)*9));
00023 ASSERT(us==false);
00024 }
00025
00026 CWeightedCommWordStringKernel::CWeightedCommWordStringKernel(
00027 CStringFeatures<uint16_t>* l, CStringFeatures<uint16_t>* r, bool us,
00028 int32_t size)
00029 : CCommWordStringKernel(size, us), degree(0), weights(NULL)
00030 {
00031 init_dictionary(1<<(sizeof(uint16_t)*9));
00032 ASSERT(us==false);
00033
00034 init(l,r);
00035 }
00036
00037 CWeightedCommWordStringKernel::~CWeightedCommWordStringKernel()
00038 {
00039 delete[] weights;
00040 }
00041
00042 bool CWeightedCommWordStringKernel::init(CFeatures* l, CFeatures* r)
00043 {
00044 ASSERT(((CStringFeatures<uint16_t>*) l)->get_order() ==
00045 ((CStringFeatures<uint16_t>*) r)->get_order());
00046 degree=((CStringFeatures<uint16_t>*) l)->get_order();
00047 set_wd_weights();
00048
00049 CCommWordStringKernel::init(l,r);
00050 return init_normalizer();
00051 }
00052
00053 void CWeightedCommWordStringKernel::cleanup()
00054 {
00055 delete[] weights;
00056 weights=NULL;
00057
00058 CCommWordStringKernel::cleanup();
00059 }
00060
00061 bool CWeightedCommWordStringKernel::set_wd_weights()
00062 {
00063 delete[] weights;
00064 weights=new float64_t[degree];
00065
00066 int32_t i;
00067 float64_t sum=0;
00068 for (i=0; i<degree; i++)
00069 {
00070 weights[i]=degree-i;
00071 sum+=weights[i];
00072 }
00073 for (i=0; i<degree; i++)
00074 weights[i]=CMath::sqrt(weights[i]/sum);
00075
00076 return weights!=NULL;
00077 }
00078
00079 bool CWeightedCommWordStringKernel::set_weights(float64_t* w, int32_t d)
00080 {
00081 ASSERT(d==degree);
00082
00083 delete[] weights;
00084 weights=new float64_t[degree];
00085 for (int32_t i=0; i<degree; i++)
00086 weights[i]=CMath::sqrt(w[i]);
00087 return true;
00088 }
00089
00090 float64_t CWeightedCommWordStringKernel::compute_helper(
00091 int32_t idx_a, int32_t idx_b, bool do_sort)
00092 {
00093 int32_t alen, blen;
00094 bool free_avec, free_bvec;
00095
00096 CStringFeatures<uint16_t>* l = (CStringFeatures<uint16_t>*) lhs;
00097 CStringFeatures<uint16_t>* r = (CStringFeatures<uint16_t>*) rhs;
00098
00099 uint16_t* av=l->get_feature_vector(idx_a, alen, free_avec);
00100 uint16_t* bv=r->get_feature_vector(idx_b, blen, free_bvec);
00101
00102 uint16_t* avec=av;
00103 uint16_t* bvec=bv;
00104
00105 if (do_sort)
00106 {
00107 if (alen>0)
00108 {
00109 avec=new uint16_t[alen];
00110 memcpy(avec, av, sizeof(uint16_t)*alen);
00111 CMath::radix_sort(avec, alen);
00112 }
00113 else
00114 avec=NULL;
00115
00116 if (blen>0)
00117 {
00118 bvec=new uint16_t[blen];
00119 memcpy(bvec, bv, sizeof(uint16_t)*blen);
00120 CMath::radix_sort(bvec, blen);
00121 }
00122 else
00123 bvec=NULL;
00124 }
00125 else
00126 {
00127 if ( (l->get_num_preproc() != l->get_num_preprocessed()) ||
00128 (r->get_num_preproc() != r->get_num_preprocessed()))
00129 {
00130 SG_ERROR("not all preprocessors have been applied to training (%d/%d)"
00131 " or test (%d/%d) data\n", l->get_num_preprocessed(), l->get_num_preproc(),
00132 r->get_num_preprocessed(), r->get_num_preproc());
00133 }
00134 }
00135
00136 float64_t result=0;
00137 uint8_t mask=0;
00138
00139 for (int32_t d=0; d<degree; d++)
00140 {
00141 mask = mask | (1 << (degree-d-1));
00142 uint16_t masked=((CStringFeatures<uint16_t>*) lhs)->get_masked_symbols(0xffff, mask);
00143
00144 int32_t left_idx=0;
00145 int32_t right_idx=0;
00146 float64_t weight=weights[d]*weights[d];
00147
00148 while (left_idx < alen && right_idx < blen)
00149 {
00150 uint16_t lsym=avec[left_idx] & masked;
00151 uint16_t rsym=bvec[right_idx] & masked;
00152
00153 if (lsym == rsym)
00154 {
00155 int32_t old_left_idx=left_idx;
00156 int32_t old_right_idx=right_idx;
00157
00158 while (left_idx<alen && (avec[left_idx] & masked) ==lsym)
00159 left_idx++;
00160
00161 while (right_idx<blen && (bvec[right_idx] & masked) ==lsym)
00162 right_idx++;
00163
00164 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00165 }
00166 else if (lsym<rsym)
00167 left_idx++;
00168 else
00169 right_idx++;
00170 }
00171 }
00172
00173 if (do_sort)
00174 {
00175 delete[] avec;
00176 delete[] bvec;
00177 }
00178
00179 l->free_feature_vector(av, idx_a, free_avec);
00180 r->free_feature_vector(bv, idx_b, free_bvec);
00181
00182 return result;
00183 }
00184
00185 void CWeightedCommWordStringKernel::add_to_normal(
00186 int32_t vec_idx, float64_t weight)
00187 {
00188 int32_t len=-1;
00189 bool free_vec;
00190 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) lhs;
00191 uint16_t* vec=s->get_feature_vector(vec_idx, len, free_vec);
00192
00193 if (len>0)
00194 {
00195 for (int32_t j=0; j<len; j++)
00196 {
00197 uint8_t mask=0;
00198 int32_t offs=0;
00199 for (int32_t d=0; d<degree; d++)
00200 {
00201 mask = mask | (1 << (degree-d-1));
00202 int32_t idx=s->get_masked_symbols(vec[j], mask);
00203 idx=s->shift_symbol(idx, degree-d-1);
00204 dictionary_weights[offs + idx] += normalizer->normalize_lhs(weight*weights[d], vec_idx);
00205 offs+=s->shift_offset(1,d+1);
00206 }
00207 }
00208
00209 set_is_initialized(true);
00210 }
00211
00212 s->free_feature_vector(vec, vec_idx, free_vec);
00213 }
00214
00215 void CWeightedCommWordStringKernel::merge_normal()
00216 {
00217 ASSERT(get_is_initialized());
00218 ASSERT(use_sign==false);
00219
00220 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00221 uint32_t num_symbols=(uint32_t) s->get_num_symbols();
00222 int32_t dic_size=1<<(sizeof(uint16_t)*8);
00223 float64_t* dic=new float64_t[dic_size];
00224 memset(dic, 0, sizeof(float64_t)*dic_size);
00225
00226 for (uint32_t sym=0; sym<num_symbols; sym++)
00227 {
00228 float64_t result=0;
00229 uint8_t mask=0;
00230 int32_t offs=0;
00231 for (int32_t d=0; d<degree; d++)
00232 {
00233 mask = mask | (1 << (degree-d-1));
00234 int32_t idx=s->get_masked_symbols(sym, mask);
00235 idx=s->shift_symbol(idx, degree-d-1);
00236 result += dictionary_weights[offs + idx];
00237 offs+=s->shift_offset(1,d+1);
00238 }
00239 dic[sym]=result;
00240 }
00241
00242 init_dictionary(1<<(sizeof(uint16_t)*8));
00243 memcpy(dictionary_weights, dic, sizeof(float64_t)*dic_size);
00244 delete[] dic;
00245 }
00246
00247 float64_t CWeightedCommWordStringKernel::compute_optimized(int32_t i)
00248 {
00249 if (!get_is_initialized())
00250 SG_ERROR( "CCommWordStringKernel optimization not initialized\n");
00251
00252 ASSERT(use_sign==false);
00253
00254 float64_t result=0;
00255 bool free_vec;
00256 int32_t len=-1;
00257 CStringFeatures<uint16_t>* s=(CStringFeatures<uint16_t>*) rhs;
00258 uint16_t* vec=s->get_feature_vector(i, len, free_vec);
00259
00260 if (vec && len>0)
00261 {
00262 for (int32_t j=0; j<len; j++)
00263 {
00264 uint8_t mask=0;
00265 int32_t offs=0;
00266 for (int32_t d=0; d<degree; d++)
00267 {
00268 mask = mask | (1 << (degree-d-1));
00269 int32_t idx=s->get_masked_symbols(vec[j], mask);
00270 idx=s->shift_symbol(idx, degree-d-1);
00271 result += dictionary_weights[offs + idx]*weights[d];
00272 offs+=s->shift_offset(1,d+1);
00273 }
00274 }
00275
00276 result=normalizer->normalize_rhs(result, i);
00277 }
00278 s->free_feature_vector(vec, i, free_vec);
00279 return result;
00280 }
00281
00282 float64_t* CWeightedCommWordStringKernel::compute_scoring(
00283 int32_t max_degree, int32_t& num_feat, int32_t& num_sym, float64_t* target,
00284 int32_t num_suppvec, int32_t* IDX, float64_t* alphas, bool do_init)
00285 {
00286 if (do_init)
00287 CCommWordStringKernel::init_optimization(num_suppvec, IDX, alphas);
00288
00289 int32_t dic_size=1<<(sizeof(uint16_t)*9);
00290 float64_t* dic=new float64_t[dic_size];
00291 memcpy(dic, dictionary_weights, sizeof(float64_t)*dic_size);
00292
00293 merge_normal();
00294 float64_t* result=CCommWordStringKernel::compute_scoring(max_degree, num_feat,
00295 num_sym, target, num_suppvec, IDX, alphas, false);
00296
00297 init_dictionary(1<<(sizeof(uint16_t)*9));
00298 memcpy(dictionary_weights,dic, sizeof(float64_t)*dic_size);
00299 delete[] dic;
00300
00301 return result;
00302 }