LinearHMM.cpp
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #include "distributions/LinearHMM.h"
00013 #include "lib/common.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016
00017 using namespace shogun;
00018
00019 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f)
00020 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL)
00021 {
00022 features=f;
00023 sequence_length = f->get_vector_length(0);
00024 num_symbols = (int32_t) f->get_num_symbols();
00025 num_params = sequence_length*num_symbols;
00026 }
00027
00028 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols)
00029 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL)
00030 {
00031 sequence_length = p_num_features;
00032 num_symbols = p_num_symbols;
00033 num_params = sequence_length*num_symbols;
00034 }
00035
00036 CLinearHMM::~CLinearHMM()
00037 {
00038 delete[] transition_probs;
00039 delete[] log_transition_probs;
00040 }
00041
00042 bool CLinearHMM::train(CFeatures* data)
00043 {
00044 if (data)
00045 {
00046 if (data->get_feature_class() != C_STRING ||
00047 data->get_feature_type() != F_WORD)
00048 {
00049 SG_ERROR("Expected features of class string type word!\n");
00050 }
00051 set_features(data);
00052 }
00053 delete[] transition_probs;
00054 delete[] log_transition_probs;
00055 int32_t* int_transition_probs=new int32_t[num_params];
00056
00057 int32_t vec;
00058 int32_t i;
00059
00060 for (i=0; i< num_params; i++)
00061 int_transition_probs[i]=0;
00062
00063 for (vec=0; vec<features->get_num_vectors(); vec++)
00064 {
00065 int32_t len;
00066 bool free_vec;
00067
00068 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00069 get_feature_vector(vec, len, free_vec);
00070
00071
00072 for (int32_t feat=0; feat<len ; feat++)
00073 int_transition_probs[feat*num_symbols+vector[feat]]++;
00074
00075 ((CStringFeatures<uint16_t>*) features)->
00076 free_feature_vector(vector, vec, free_vec);
00077 }
00078
00079
00080 transition_probs=new float64_t[num_params];
00081 log_transition_probs=new float64_t[num_params];
00082
00083 for (i=0;i<sequence_length;i++)
00084 {
00085 for (int32_t j=0; j<num_symbols; j++)
00086 {
00087 float64_t sum=0;
00088 int32_t offs=i*num_symbols+
00089 ((CStringFeatures<uint16_t> *) features)->
00090 get_masked_symbols((uint16_t)j,(uint8_t) 254);
00091 int32_t original_num_symbols=(int32_t)
00092 ((CStringFeatures<uint16_t> *) features)->
00093 get_original_num_symbols();
00094
00095 for (int32_t k=0; k<original_num_symbols; k++)
00096 sum+=int_transition_probs[offs+k];
00097
00098 transition_probs[i*num_symbols+j]=
00099 (int_transition_probs[i*num_symbols+j]+pseudo_count)/
00100 (sum+((CStringFeatures<uint16_t> *) features)->
00101 get_original_num_symbols()*pseudo_count);
00102 log_transition_probs[i*num_symbols+j]=
00103 log(transition_probs[i*num_symbols+j]);
00104 }
00105 }
00106
00107 delete[] int_transition_probs;
00108 return true;
00109 }
00110
00111 bool CLinearHMM::train(
00112 const int32_t* indizes, int32_t num_indizes, float64_t pseudo)
00113 {
00114 delete[] transition_probs;
00115 delete[] log_transition_probs;
00116 int32_t* int_transition_probs=new int32_t[num_params];
00117 int32_t vec;
00118 int32_t i;
00119
00120 for (i=0; i< num_params; i++)
00121 int_transition_probs[i]=0;
00122
00123 for (vec=0; vec<num_indizes; vec++)
00124 {
00125 int32_t len;
00126 bool free_vec;
00127
00128 ASSERT(indizes[vec]>=0 &&
00129 indizes[vec]<((CStringFeatures<uint16_t>*) features)->
00130 get_num_vectors());
00131 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00132 get_feature_vector(indizes[vec], len, free_vec);
00133 ((CStringFeatures<uint16_t>*) features)->
00134 free_feature_vector(vector, indizes[vec], free_vec);
00135
00136
00137
00138 for (int32_t feat=0; feat<len ; feat++)
00139 int_transition_probs[feat*num_symbols+vector[feat]]++;
00140 }
00141
00142
00143 transition_probs=new float64_t[num_params];
00144 log_transition_probs=new float64_t[num_params];
00145
00146 for (i=0;i<sequence_length;i++)
00147 {
00148 for (int32_t j=0; j<num_symbols; j++)
00149 {
00150 float64_t sum=0;
00151 int32_t original_num_symbols=(int32_t)
00152 ((CStringFeatures<uint16_t> *) features)->
00153 get_original_num_symbols();
00154 for (int32_t k=0; k<original_num_symbols; k++)
00155 {
00156 sum+=int_transition_probs[i*num_symbols+
00157 ((CStringFeatures<uint16_t>*) features)->
00158 get_masked_symbols((uint16_t)j,(uint8_t) 254)+k];
00159 }
00160
00161 transition_probs[i*num_symbols+j]=
00162 (int_transition_probs[i*num_symbols+j]+pseudo)/
00163 (sum+((CStringFeatures<uint16_t>*) features)->
00164 get_original_num_symbols()*pseudo);
00165 log_transition_probs[i*num_symbols+j]=
00166 log(transition_probs[i*num_symbols+j]);
00167 }
00168 }
00169
00170 delete[] int_transition_probs;
00171 return true;
00172 }
00173
00174 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len)
00175 {
00176 float64_t result=log_transition_probs[vector[0]];
00177
00178 for (int32_t i=1; i<len; i++)
00179 result+=log_transition_probs[i*num_symbols+vector[i]];
00180
00181 return result;
00182 }
00183
00184 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example)
00185 {
00186 int32_t len;
00187 bool free_vec;
00188 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00189 get_feature_vector(num_example, len, free_vec);
00190 float64_t result=log_transition_probs[vector[0]];
00191
00192 for (int32_t i=1; i<len; i++)
00193 result+=log_transition_probs[i*num_symbols+vector[i]];
00194
00195 ((CStringFeatures<uint16_t>*) features)->
00196 free_feature_vector(vector, num_example, free_vec);
00197
00198 return result;
00199 }
00200
00201 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len)
00202 {
00203 float64_t result=transition_probs[vector[0]];
00204
00205 for (int32_t i=1; i<len; i++)
00206 result*=transition_probs[i*num_symbols+vector[i]];
00207
00208 return result;
00209 }
00210
00211 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example)
00212 {
00213 int32_t len;
00214 bool free_vec;
00215 uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00216 get_feature_vector(num_example, len, free_vec);
00217 float64_t result=0;
00218 int32_t position=num_param/num_symbols;
00219 ASSERT(position>=0 && position<len);
00220 uint16_t sym=(uint16_t) (num_param-position*num_symbols);
00221
00222 if (vector[position]==sym && transition_probs[num_param]!=0)
00223 result=1.0/transition_probs[num_param];
00224 ((CStringFeatures<uint16_t>*) features)->
00225 free_feature_vector(vector, num_example, free_vec);
00226
00227 return result;
00228 }
00229
00230 void CLinearHMM::get_transition_probs(float64_t** dst, int32_t* num)
00231 {
00232 *num=num_params;
00233 size_t sz=sizeof(*transition_probs)*(*num);
00234 *dst=(float64_t*) malloc(sz);
00235 ASSERT(dst);
00236
00237 memcpy(*dst, transition_probs, sz);
00238 }
00239
00240 bool CLinearHMM::set_transition_probs(const float64_t* src, int32_t num)
00241 {
00242 if (num!=-1)
00243 ASSERT(num==num_params);
00244
00245 if (!log_transition_probs)
00246 log_transition_probs=new float64_t[num_params];
00247
00248 if (!transition_probs)
00249 transition_probs=new float64_t[num_params];
00250
00251 for (int32_t i=0; i<num_params; i++)
00252 {
00253 transition_probs[i]=src[i];
00254 log_transition_probs[i]=log(transition_probs[i]);
00255 }
00256
00257 return true;
00258 }
00259
00260 void CLinearHMM::get_log_transition_probs(float64_t** dst, int32_t* num)
00261 {
00262 *num=num_params;
00263 size_t sz=sizeof(*log_transition_probs)*(*num);
00264 *dst=(float64_t*) malloc(sz);
00265 ASSERT(dst);
00266
00267 memcpy(*dst, log_transition_probs, sz);
00268 }
00269
00270 bool CLinearHMM::set_log_transition_probs(const float64_t* src, int32_t num)
00271 {
00272 if (num!=-1)
00273 ASSERT(num==num_params);
00274
00275 if (!log_transition_probs)
00276 log_transition_probs=new float64_t[num_params];
00277
00278 if (!transition_probs)
00279 transition_probs=new float64_t[num_params];
00280
00281 for (int32_t i=0; i< num_params; i++)
00282 {
00283 log_transition_probs[i]=src[i];
00284 transition_probs[i]=exp(log_transition_probs[i]);
00285 }
00286
00287 return true;
00288 }
00289
00290
00291
00292