00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/WDFeatures.h"
00012 #include "lib/io.h"
00013
00014 using namespace shogun;
00015
00016 CWDFeatures::CWDFeatures(CStringFeatures<uint8_t>* str,
00017 int32_t order, int32_t from_order) : CDotFeatures()
00018 {
00019 ASSERT(str);
00020 ASSERT(str->have_same_length());
00021 SG_REF(str);
00022
00023 strings=str;
00024 string_length=str->get_max_vector_length();
00025 num_strings=str->get_num_vectors();
00026 CAlphabet* alpha=str->get_alphabet();
00027 alphabet_size=alpha->get_num_symbols();
00028 SG_UNREF(alpha);
00029
00030 degree=order;
00031 from_degree=from_order;
00032 set_wd_weights();
00033 set_normalization_const();
00034
00035 }
00036
00037 CWDFeatures::CWDFeatures(const CWDFeatures& orig)
00038 : CDotFeatures(orig), strings(orig.strings),
00039 degree(orig.degree), from_degree(orig.from_degree)
00040 {
00041 SG_REF(strings);
00042 string_length=strings->get_max_vector_length();
00043 num_strings=strings->get_num_vectors();
00044 CAlphabet* alpha=strings->get_alphabet();
00045 alphabet_size=alpha->get_num_symbols();
00046 SG_UNREF(alpha);
00047
00048 set_wd_weights();
00049 set_normalization_const();
00050 }
00051
00052 CWDFeatures::~CWDFeatures()
00053 {
00054 SG_UNREF(strings);
00055 }
00056
00057 float64_t CWDFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00058 {
00059 int32_t len1, len2;
00060 bool free_vec1, free_vec2;
00061
00062 uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00063 uint8_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00064
00065 ASSERT(len1==len2);
00066
00067 float64_t sum=0.0;
00068
00069 for (int32_t i=0; i<len1; i++)
00070 {
00071 for (int32_t j=0; (i+j<len1) && (j<degree); j++)
00072 {
00073 if (vec1[i+j]!=vec2[i+j])
00074 break ;
00075 sum += wd_weights[j]*wd_weights[j];
00076 }
00077 }
00078 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00079 strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00080 return sum;
00081 }
00082
00083 float64_t CWDFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00084 {
00085 if (vec2_len != w_dim)
00086 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00087
00088 float64_t sum=0;
00089 int32_t lim=CMath::min(degree, string_length);
00090 int32_t len;
00091 bool free_vec1;
00092 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00093 int32_t* val=new int32_t[len];
00094 CMath::fill_vector(val, len, 0);
00095
00096 int32_t asize=alphabet_size;
00097 int32_t asizem1=1;
00098 int32_t offs=0;
00099
00100 for (int32_t k=0; k<lim; k++)
00101 {
00102 float64_t wd = wd_weights[k];
00103
00104 int32_t o=offs;
00105 for (int32_t i=0; i+k < len; i++)
00106 {
00107 val[i]+=asizem1*vec[i+k];
00108 sum+=vec2[val[i]+o]*wd;
00109 o+=asize;
00110 }
00111 offs+=asize*len;
00112 asize*=alphabet_size;
00113 asizem1*=alphabet_size;
00114 }
00115 delete[] val;
00116 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00117
00118 return sum/normalization_const;
00119 }
00120
00121 void CWDFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00122 {
00123 if (vec2_len != w_dim)
00124 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
00125
00126 int32_t lim=CMath::min(degree, string_length);
00127 int32_t len;
00128 bool free_vec1;
00129 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
00130 int32_t* val=new int32_t[len];
00131 CMath::fill_vector(val, len, 0);
00132
00133 int32_t asize=alphabet_size;
00134 int32_t asizem1=1;
00135 int32_t offs=0;
00136
00137 for (int32_t k=0; k<lim; k++)
00138 {
00139 float64_t wd = alpha*wd_weights[k]/normalization_const;
00140
00141 if (abs_val)
00142 wd=CMath::abs(wd);
00143
00144 int32_t o=offs;
00145 for (int32_t i=0; i+k < len; i++)
00146 {
00147 val[i]+=asizem1*vec[i+k];
00148 vec2[val[i]+o]+=wd;
00149 o+=asize;
00150 }
00151 offs+=asize*len;
00152 asize*=alphabet_size;
00153 asizem1*=alphabet_size;
00154 }
00155 delete[] val;
00156
00157 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00158 }
00159
00160 void CWDFeatures::set_wd_weights()
00161 {
00162 ASSERT(degree>0 && degree<=8);
00163 wd_weights=new float64_t[degree];
00164 w_dim=0;
00165
00166 for (int32_t i=0; i<degree; i++)
00167 {
00168 w_dim+=CMath::pow(alphabet_size, i+1)*string_length;
00169 wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
00170 }
00171 SG_DEBUG("created WDFeatures with d=%d (%d), alphabetsize=%d, dim=%d num=%d, len=%d\n", degree, from_degree, alphabet_size, w_dim, num_strings, string_length);
00172 }
00173
00174
00175 void CWDFeatures::set_normalization_const()
00176 {
00177 normalization_const=0;
00178 for (int32_t i=0; i<degree; i++)
00179 normalization_const+=(string_length-i)*wd_weights[i]*wd_weights[i];
00180
00181 normalization_const=CMath::sqrt(normalization_const);
00182 SG_DEBUG("normalization_const:%f\n", normalization_const);
00183 }
00184
00185 CFeatures* CWDFeatures::duplicate() const
00186 {
00187 return new CWDFeatures(*this);
00188 }