LinearHMM.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #include "distributions/LinearHMM.h"
00013 #include "lib/common.h"
00014 #include "features/StringFeatures.h"
00015 #include "lib/io.h"
00016 
00017 using namespace shogun;
00018 
00019 CLinearHMM::CLinearHMM(CStringFeatures<uint16_t>* f)
00020 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL)
00021 {
00022     features=f;
00023     sequence_length = f->get_vector_length(0);
00024     num_symbols     = (int32_t) f->get_num_symbols();
00025     num_params      = sequence_length*num_symbols;
00026 }
00027 
00028 CLinearHMM::CLinearHMM(int32_t p_num_features, int32_t p_num_symbols)
00029 : CDistribution(), transition_probs(NULL), log_transition_probs(NULL)
00030 {
00031     sequence_length = p_num_features;
00032     num_symbols     = p_num_symbols;
00033     num_params      = sequence_length*num_symbols;
00034 }
00035 
00036 CLinearHMM::~CLinearHMM()
00037 {
00038     delete[] transition_probs;
00039     delete[] log_transition_probs;
00040 }
00041 
00042 bool CLinearHMM::train(CFeatures* data)
00043 {
00044     if (data)
00045     {
00046         if (data->get_feature_class() != C_STRING ||
00047                 data->get_feature_type() != F_WORD)
00048         {
00049             SG_ERROR("Expected features of class string type word!\n");
00050         }
00051         set_features(data);
00052     }
00053     delete[] transition_probs;
00054     delete[] log_transition_probs;
00055     int32_t* int_transition_probs=new int32_t[num_params];
00056 
00057     int32_t vec;
00058     int32_t i;
00059 
00060     for (i=0; i< num_params; i++)
00061         int_transition_probs[i]=0;
00062 
00063     for (vec=0; vec<features->get_num_vectors(); vec++)
00064     {
00065         int32_t len;
00066         bool free_vec;
00067 
00068         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00069             get_feature_vector(vec, len, free_vec);
00070 
00071         //just count the symbols per position -> transition_probsogram
00072         for (int32_t feat=0; feat<len ; feat++)
00073             int_transition_probs[feat*num_symbols+vector[feat]]++;
00074 
00075         ((CStringFeatures<uint16_t>*) features)->
00076             free_feature_vector(vector, vec, free_vec);
00077     }
00078 
00079     //trade memory for speed
00080     transition_probs=new float64_t[num_params];
00081     log_transition_probs=new float64_t[num_params];
00082 
00083     for (i=0;i<sequence_length;i++)
00084     {
00085         for (int32_t j=0; j<num_symbols; j++)
00086         {
00087             float64_t sum=0;
00088             int32_t offs=i*num_symbols+
00089                 ((CStringFeatures<uint16_t> *) features)->
00090                     get_masked_symbols((uint16_t)j,(uint8_t) 254);
00091             int32_t original_num_symbols=(int32_t)
00092                 ((CStringFeatures<uint16_t> *) features)->
00093                     get_original_num_symbols();
00094 
00095             for (int32_t k=0; k<original_num_symbols; k++)
00096                 sum+=int_transition_probs[offs+k];
00097 
00098             transition_probs[i*num_symbols+j]=
00099                 (int_transition_probs[i*num_symbols+j]+pseudo_count)/
00100                 (sum+((CStringFeatures<uint16_t> *) features)->
00101                     get_original_num_symbols()*pseudo_count);
00102             log_transition_probs[i*num_symbols+j]=
00103                 log(transition_probs[i*num_symbols+j]);
00104         }
00105     }
00106 
00107     delete[] int_transition_probs;
00108     return true;
00109 }
00110 
00111 bool CLinearHMM::train(
00112     const int32_t* indizes, int32_t num_indizes, float64_t pseudo)
00113 {
00114     delete[] transition_probs;
00115     delete[] log_transition_probs;
00116     int32_t* int_transition_probs=new int32_t[num_params];
00117     int32_t vec;
00118     int32_t i;
00119 
00120     for (i=0; i< num_params; i++)
00121         int_transition_probs[i]=0;
00122 
00123     for (vec=0; vec<num_indizes; vec++)
00124     {
00125         int32_t len;
00126         bool free_vec;
00127 
00128         ASSERT(indizes[vec]>=0 &&
00129             indizes[vec]<((CStringFeatures<uint16_t>*) features)->
00130                 get_num_vectors());
00131         uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00132             get_feature_vector(indizes[vec], len, free_vec);
00133         ((CStringFeatures<uint16_t>*) features)->
00134             free_feature_vector(vector, indizes[vec], free_vec);
00135 
00136         //just count the symbols per position -> transition_probsogram
00137         //
00138         for (int32_t feat=0; feat<len ; feat++)
00139             int_transition_probs[feat*num_symbols+vector[feat]]++;
00140     }
00141 
00142     //trade memory for speed
00143     transition_probs=new float64_t[num_params];
00144     log_transition_probs=new float64_t[num_params];
00145 
00146     for (i=0;i<sequence_length;i++)
00147     {
00148         for (int32_t j=0; j<num_symbols; j++)
00149         {
00150             float64_t sum=0;
00151             int32_t original_num_symbols=(int32_t)
00152                 ((CStringFeatures<uint16_t> *) features)->
00153                     get_original_num_symbols();
00154             for (int32_t k=0; k<original_num_symbols; k++)
00155             {
00156                 sum+=int_transition_probs[i*num_symbols+
00157                     ((CStringFeatures<uint16_t>*) features)->
00158                         get_masked_symbols((uint16_t)j,(uint8_t) 254)+k];
00159             }
00160 
00161             transition_probs[i*num_symbols+j]=
00162                 (int_transition_probs[i*num_symbols+j]+pseudo)/
00163                 (sum+((CStringFeatures<uint16_t>*) features)->
00164                     get_original_num_symbols()*pseudo);
00165             log_transition_probs[i*num_symbols+j]=
00166                 log(transition_probs[i*num_symbols+j]);
00167         }
00168     }
00169 
00170     delete[] int_transition_probs;
00171     return true;
00172 }
00173 
00174 float64_t CLinearHMM::get_log_likelihood_example(uint16_t* vector, int32_t len)
00175 {
00176     float64_t result=log_transition_probs[vector[0]];
00177 
00178     for (int32_t i=1; i<len; i++)
00179         result+=log_transition_probs[i*num_symbols+vector[i]];
00180     
00181     return result;
00182 }
00183 
00184 float64_t CLinearHMM::get_log_likelihood_example(int32_t num_example)
00185 {
00186     int32_t len;
00187     bool free_vec;
00188     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00189         get_feature_vector(num_example, len, free_vec);
00190     float64_t result=log_transition_probs[vector[0]];
00191 
00192     for (int32_t i=1; i<len; i++)
00193         result+=log_transition_probs[i*num_symbols+vector[i]];
00194 
00195     ((CStringFeatures<uint16_t>*) features)->
00196         free_feature_vector(vector, num_example, free_vec);
00197 
00198     return result;
00199 }
00200 
00201 float64_t CLinearHMM::get_likelihood_example(uint16_t* vector, int32_t len)
00202 {
00203     float64_t result=transition_probs[vector[0]];
00204 
00205     for (int32_t i=1; i<len; i++)
00206         result*=transition_probs[i*num_symbols+vector[i]];
00207     
00208     return result;
00209 }
00210 
00211 float64_t CLinearHMM::get_log_derivative(int32_t num_param, int32_t num_example)
00212 {
00213     int32_t len;
00214     bool free_vec;
00215     uint16_t* vector=((CStringFeatures<uint16_t>*) features)->
00216         get_feature_vector(num_example, len, free_vec);
00217     float64_t result=0;
00218     int32_t position=num_param/num_symbols;
00219     ASSERT(position>=0 && position<len);
00220     uint16_t sym=(uint16_t) (num_param-position*num_symbols);
00221 
00222     if (vector[position]==sym && transition_probs[num_param]!=0)
00223         result=1.0/transition_probs[num_param];
00224     ((CStringFeatures<uint16_t>*) features)->
00225         free_feature_vector(vector, num_example, free_vec);
00226 
00227     return result;
00228 }
00229 
00230 void CLinearHMM::get_transition_probs(float64_t** dst, int32_t* num)
00231 {
00232     *num=num_params;
00233     size_t sz=sizeof(*transition_probs)*(*num);
00234     *dst=(float64_t*) malloc(sz);
00235     ASSERT(dst);
00236 
00237     memcpy(*dst, transition_probs, sz);
00238 }
00239 
00240 bool CLinearHMM::set_transition_probs(const float64_t* src, int32_t num)
00241 {
00242     if (num!=-1)
00243         ASSERT(num==num_params);
00244 
00245     if (!log_transition_probs)
00246         log_transition_probs=new float64_t[num_params];
00247 
00248     if (!transition_probs)
00249         transition_probs=new float64_t[num_params];
00250 
00251     for (int32_t i=0; i<num_params; i++)
00252     {
00253         transition_probs[i]=src[i];
00254         log_transition_probs[i]=log(transition_probs[i]);
00255     }
00256 
00257     return true;
00258 }
00259 
00260 void CLinearHMM::get_log_transition_probs(float64_t** dst, int32_t* num)
00261 {
00262     *num=num_params;
00263     size_t sz=sizeof(*log_transition_probs)*(*num);
00264     *dst=(float64_t*) malloc(sz);
00265     ASSERT(dst);
00266 
00267     memcpy(*dst, log_transition_probs, sz);
00268 }
00269 
00270 bool CLinearHMM::set_log_transition_probs(const float64_t* src, int32_t num)
00271 {
00272     if (num!=-1)
00273         ASSERT(num==num_params);
00274 
00275     if (!log_transition_probs)
00276         log_transition_probs=new float64_t[num_params];
00277 
00278     if (!transition_probs)
00279         transition_probs=new float64_t[num_params];
00280 
00281     for (int32_t i=0; i< num_params; i++)
00282     {
00283         log_transition_probs[i]=src[i];
00284         transition_probs[i]=exp(log_transition_probs[i]);
00285     }
00286 
00287     return true;
00288 }
00289 
00290 
00291 
00292 

SHOGUN Machine Learning Toolbox - Documentation