WDFeatures.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2009 Soeren Sonnenburg
00008  * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include "features/WDFeatures.h"
00012 #include "lib/io.h"
00013 
00014 using namespace shogun;
00015 
00016 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00017         int32_t order, int32_t from_order) : CDotFeatures()
00018 {
00019     ASSERT(str);
00020     ASSERT(str->have_same_length());
00021     SG_REF(str);
00022 
00023     strings=str;
00024     string_length=str->get_max_vector_length();
00025     num_strings=str->get_num_vectors();
00026     CAlphabet* alpha=str->get_alphabet();
00027     alphabet_size=alpha->get_num_symbols();
00028     SG_UNREF(alpha);
00029 
00030     degree=order;
00031     from_degree=from_order;
00032     set_wd_weights();
00033     set_normalization_const();
00034 
00035 }
00036 
00037 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00038     : CDotFeatures(orig), strings(orig.strings),
00039     degree(orig.degree), from_degree(orig.from_degree)
00040 {
00041     SG_REF(strings);
00042     string_length=strings->get_max_vector_length();
00043     num_strings=strings->get_num_vectors();
00044     CAlphabet* alpha=strings->get_alphabet();
00045     alphabet_size=alpha->get_num_symbols();
00046     SG_UNREF(alpha);
00047 
00048     set_wd_weights();
00049     set_normalization_const();
00050 }
00051 
00052 CWDFeatures::~CWDFeatures()
00053 {
00054     SG_UNREF(strings);
00055 }
00056 
00057 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00058 {
00059     int32_t len1, len2;
00060     bool free_vec1, free_vec2;
00061 
00062     uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00063     uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00064 
00065     ASSERT(len1==len2);
00066 
00067     float64_t sum=0.0;
00068 
00069     for (int32_t i=0; i<len1; i++)
00070     {
00071         for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00072         {
00073             if (vec1[i+j]!=vec2[i+j])
00074                 break ;
00075             sum += wd_weights[j]*wd_weights[j];
00076         }
00077     }
00078     strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00079     strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00080     return sum;
00081 }
00082 
00083 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00084 {
00085     if (vec2_len != w_dim)
00086         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00087 
00088     float64_t sum=0;
00089     int32_t lim=CMath::min(degree, string_length);
00090     int32_t len;
00091     bool free_vec1;
00092     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00093     int32_t* val=new int32_t[len];
00094     CMath::fill_vector(val, len, 0);
00095 
00096     int32_t asize=alphabet_size;
00097     int32_t asizem1=1;
00098     int32_t offs=0;
00099 
00100     for (int32_t k=0; k<lim; k++)
00101     {
00102         float64_t wd = wd_weights[k];
00103 
00104         int32_t o=offs;
00105         for (int32_t i=0; i+k < len; i++) 
00106         {
00107             val[i]+=asizem1*vec[i+k];
00108             sum+=vec2[val[i]+o]*wd;
00109             o+=asize;
00110         }
00111         offs+=asize*len;
00112         asize*=alphabet_size;
00113         asizem1*=alphabet_size;
00114     }
00115     delete[] val;
00116     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00117 
00118     return sum/normalization_const;
00119 }
00120 
00121 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00122 {
00123     if (vec2_len != w_dim)
00124         SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00125 
00126     int32_t lim=CMath::min(degree, string_length);
00127     int32_t len;
00128     bool free_vec1;
00129     uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00130     int32_t* val=new int32_t[len];
00131     CMath::fill_vector(val, len, 0);
00132 
00133     int32_t asize=alphabet_size;
00134     int32_t asizem1=1;
00135     int32_t offs=0;
00136 
00137     for (int32_t k=0; k<lim; k++)
00138     {
00139         float64_t wd = alpha*wd_weights[k]/normalization_const;
00140 
00141         if (abs_val)
00142             wd=CMath::abs(wd);
00143 
00144         int32_t o=offs;
00145         for (int32_t i=0; i+k < len; i++) 
00146         {
00147             val[i]+=asizem1*vec[i+k];
00148             vec2[val[i]+o]+=wd;
00149             o+=asize;
00150         }
00151         offs+=asize*len;
00152         asize*=alphabet_size;
00153         asizem1*=alphabet_size;
00154     }
00155     delete[] val;
00156 
00157     strings->free_feature_vector(vec, vec_idx1, free_vec1);
00158 }
00159 
00160 void CWDFeatures::set_wd_weights()
00161 {
00162     ASSERT(degree>0 && degree<=8);
00163     wd_weights=new float64_t[degree];
00164     w_dim=0;
00165 
00166     for (int32_t i=0; i<degree; i++)
00167     {
00168         w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00169         wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00170     }
00171     SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00172 }
00173 
00174 
00175 void CWDFeatures::set_normalization_const()
00176 {
00177     normalization_const=0;
00178     for (int32_t i=0; i<degree; i++)
00179         normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00180 
00181     normalization_const=CMath::sqrt(normalization_const);
00182     SG_DEBUG("normalization_const:%f\n", normalization_const);
00183 }
00184 
00185 CFeatures* CWDFeatures::duplicate() const
00186 {
00187     return new CWDFeatures(*this);
00188 }

SHOGUN Machine Learning Toolbox - Documentation