ImplicitWeightedSpecFeatures.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include "features/ImplicitWeightedSpecFeatures.h"
00012 #include "lib/io.h"
00013
00014 using namespace shogun;
00015
00016 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(CStringFeatures<uint16_t>* str, bool normalize) : CDotFeatures()
00017 {
00018 ASSERT(str);
00019 strings=str;
00020 SG_REF(strings)
00021 normalization_factors=NULL;
00022 spec_weights=NULL;
00023 num_strings = str->get_num_vectors();
00024 alphabet_size = str->get_original_num_symbols();
00025 degree=str->get_order();
00026 set_wd_weights();
00027
00028 SG_DEBUG("WEIGHTED SPEC alphasz=%d, size=%d, num_str=%d\n", alphabet_size,
00029 spec_size, num_strings);
00030
00031 if (normalize)
00032 compute_normalization_const();
00033 }
00034
00035 void CImplicitWeightedSpecFeatures::compute_normalization_const()
00036 {
00037 float64_t* factors=new float64_t[num_strings];
00038
00039 for (int32_t i=0; i<num_strings; i++)
00040 factors[i]=1.0/CMath::sqrt(dot(i,i));
00041
00042 normalization_factors=factors;
00043
00044 }
00045
00046 bool CImplicitWeightedSpecFeatures::set_wd_weights()
00047 {
00048 delete[] spec_weights;
00049 spec_weights=new float64_t[degree];
00050
00051 int32_t i;
00052 float64_t sum=0;
00053 spec_size=0;
00054
00055 for (i=0; i<degree; i++)
00056 {
00057 spec_size+=CMath::pow(alphabet_size, i+1);
00058 spec_weights[i]=degree-i;
00059 sum+=spec_weights[i];
00060 }
00061 for (i=0; i<degree; i++)
00062 spec_weights[i]=CMath::sqrt(spec_weights[i]/sum);
00063
00064 return spec_weights!=NULL;
00065 }
00066
00067 bool CImplicitWeightedSpecFeatures::set_weights(float64_t* w, int32_t d)
00068 {
00069 ASSERT(d==degree);
00070
00071 delete[] spec_weights;
00072 spec_weights=new float64_t[degree];
00073 for (int32_t i=0; i<degree; i++)
00074 spec_weights[i]=CMath::sqrt(w[i]);
00075 return true;
00076 }
00077
00078 CImplicitWeightedSpecFeatures::CImplicitWeightedSpecFeatures(const CImplicitWeightedSpecFeatures& orig) : CDotFeatures(orig),
00079 num_strings(orig.num_strings),
00080 alphabet_size(orig.alphabet_size), spec_size(orig.spec_size)
00081 {
00082 SG_NOTIMPLEMENTED;
00083 SG_REF(strings);
00084 }
00085
00086 CImplicitWeightedSpecFeatures::~CImplicitWeightedSpecFeatures()
00087 {
00088 SG_UNREF(strings);
00089 delete[] spec_weights;
00090 delete[] normalization_factors;
00091 }
00092
00093 float64_t CImplicitWeightedSpecFeatures::dot(int32_t vec_idx1, int32_t vec_idx2)
00094 {
00095 ASSERT(vec_idx1 < num_strings);
00096 ASSERT(vec_idx2 < num_strings);
00097
00098 int32_t len1=-1;
00099 int32_t len2=-1;
00100 bool free_vec1;
00101 bool free_vec2;
00102 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00103 uint16_t* vec2=strings->get_feature_vector(vec_idx2, len2, free_vec2);
00104
00105 float64_t result=0;
00106 uint8_t mask=0;
00107
00108 for (int32_t d=0; d<degree; d++)
00109 {
00110 mask = mask | (1 << (degree-d-1));
00111 uint16_t masked=strings->get_masked_symbols(0xffff, mask);
00112
00113 int32_t left_idx=0;
00114 int32_t right_idx=0;
00115 float64_t weight=spec_weights[d]*spec_weights[d];
00116
00117 while (left_idx < len1 && right_idx < len2)
00118 {
00119 uint16_t lsym=vec1[left_idx] & masked;
00120 uint16_t rsym=vec2[right_idx] & masked;
00121
00122 if (lsym == rsym)
00123 {
00124 int32_t old_left_idx=left_idx;
00125 int32_t old_right_idx=right_idx;
00126
00127 while (left_idx<len1 && (vec1[left_idx] & masked) ==lsym)
00128 left_idx++;
00129
00130 while (right_idx<len2 && (vec2[right_idx] & masked) ==lsym)
00131 right_idx++;
00132
00133 result+=weight*(left_idx-old_left_idx)*(right_idx-old_right_idx);
00134 }
00135 else if (lsym<rsym)
00136 left_idx++;
00137 else
00138 right_idx++;
00139 }
00140 }
00141
00142 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00143 strings->free_feature_vector(vec2, vec_idx2, free_vec2);
00144
00145 if (normalization_factors)
00146 return result*normalization_factors[vec_idx1]*normalization_factors[vec_idx2];
00147 else
00148 return result;
00149 }
00150
00151 float64_t CImplicitWeightedSpecFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
00152 {
00153 ASSERT(vec2_len == spec_size);
00154 ASSERT(vec_idx1 < num_strings);
00155
00156 float64_t result=0;
00157 int32_t len1=-1;
00158 bool free_vec1;
00159 uint16_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00160
00161 if (vec1 && len1>0)
00162 {
00163 for (int32_t j=0; j<len1; j++)
00164 {
00165 uint8_t mask=0;
00166 int32_t offs=0;
00167 uint16_t v=*vec1++;
00168
00169 for (int32_t d=0; d<degree; d++)
00170 {
00171 mask = mask | (1 << (degree-d-1));
00172 int32_t idx=strings->get_masked_symbols(v, mask);
00173 idx=strings->shift_symbol(idx, degree-d-1);
00174 result += vec2[offs + idx]*spec_weights[d];
00175 offs+=strings->shift_offset(1,d+1);
00176 }
00177 }
00178
00179 strings->free_feature_vector(vec1, vec_idx1, free_vec1);
00180
00181 if (normalization_factors)
00182 result*=normalization_factors[vec_idx1];
00183 }
00184 else
00185 SG_ERROR("huh?\n");
00186
00187 return result;
00188 }
00189
00190 void CImplicitWeightedSpecFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
00191 {
00192 int32_t len1=-1;
00193 bool free_vec1;
00194 uint16_t* vec=strings->get_feature_vector(vec_idx1, len1, free_vec1);
00195
00196 if (normalization_factors)
00197 alpha*=normalization_factors[vec_idx1];
00198
00199 if (vec && len1>0)
00200 {
00201 for (int32_t j=0; j<len1; j++)
00202 {
00203 uint8_t mask=0;
00204 int32_t offs=0;
00205 for (int32_t d=0; d<degree; d++)
00206 {
00207 mask = mask | (1 << (degree-d-1));
00208 int32_t idx=strings->get_masked_symbols(vec[j], mask);
00209 idx=strings->shift_symbol(idx, degree-d-1);
00210 if (abs_val)
00211 vec2[offs + idx] += CMath::abs(alpha*spec_weights[d]);
00212 else
00213 vec2[offs + idx] += alpha*spec_weights[d];
00214 offs+=strings->shift_offset(1,d+1);
00215 }
00216 }
00217 }
00218
00219 strings->free_feature_vector(vec, vec_idx1, free_vec1);
00220 }
00221
00222 CFeatures* CImplicitWeightedSpecFeatures::duplicate() const
00223 {
00224 return new CImplicitWeightedSpecFeatures(*this);
00225 }