SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include "distributions/Histogram.h" 00013 #include "lib/common.h" 00014 #include "features/StringFeatures.h" 00015 #include "lib/io.h" 00016 #include "lib/Mathematics.h" 00017 00018 using namespace shogun; 00019 00020 CHistogram::CHistogram() 00021 : CDistribution() 00022 { 00023 hist=new float64_t[1<<16]; 00024 } 00025 00026 CHistogram::CHistogram(CStringFeatures<uint16_t> *f) 00027 : CDistribution() 00028 { 00029 hist=new float64_t[1<<16]; 00030 features=f; 00031 } 00032 00033 CHistogram::~CHistogram() 00034 { 00035 delete[] hist; 00036 } 00037 00038 bool CHistogram::train(CFeatures* data) 00039 { 00040 int32_t vec; 00041 int32_t feat; 00042 int32_t i; 00043 00044 if (data) 00045 { 00046 if (data->get_feature_class() != C_STRING || 00047 data->get_feature_type() != F_WORD) 00048 { 00049 SG_ERROR("Expected features of class string type word\n"); 00050 } 00051 set_features(data); 00052 } 00053 00054 ASSERT(features); 00055 ASSERT(features->get_feature_class()==C_STRING); 00056 ASSERT(features->get_feature_type()==F_WORD); 00057 00058 for (i=0; i< (int32_t) (1<<16); i++) 00059 hist[i]=0; 00060 00061 for (vec=0; vec<features->get_num_vectors(); vec++) 00062 { 00063 int32_t len; 00064 bool free_vec; 00065 00066 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00067 get_feature_vector(vec, len, free_vec); 00068 00069 for (feat=0; feat<len ; feat++) 00070 hist[vector[feat]]++; 00071 00072 ((CStringFeatures<uint16_t>*) features)-> 00073 free_feature_vector(vector, vec, free_vec); 00074 } 00075 00076 for (i=0; i< (int32_t) (1<<16); i++) 00077 hist[i]=log(hist[i]); 00078 00079 return true; 00080 } 00081 00082 float64_t CHistogram::get_log_likelihood_example(int32_t num_example) 00083 { 00084 ASSERT(features); 00085 ASSERT(features->get_feature_class()==C_STRING); 00086 ASSERT(features->get_feature_type()==F_WORD); 00087 00088 int32_t len; 00089 bool free_vec; 00090 float64_t loglik=0; 00091 00092 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00093 get_feature_vector(num_example, len, free_vec); 00094 00095 for (int32_t i=0; i<len; i++) 00096 loglik+=hist[vector[i]]; 00097 00098 ((CStringFeatures<uint16_t>*) features)-> 00099 free_feature_vector(vector, num_example, free_vec); 00100 00101 return loglik; 00102 } 00103 00104 float64_t CHistogram::get_log_derivative(int32_t num_param, int32_t num_example) 00105 { 00106 if (hist[num_param] < CMath::ALMOST_NEG_INFTY) 00107 return -CMath::INFTY; 00108 else 00109 { 00110 ASSERT(features); 00111 ASSERT(features->get_feature_class()==C_STRING); 00112 ASSERT(features->get_feature_type()==F_WORD); 00113 00114 int32_t len; 00115 bool free_vec; 00116 float64_t deriv=0; 00117 00118 uint16_t* vector=((CStringFeatures<uint16_t>*) features)-> 00119 get_feature_vector(num_example, len, free_vec); 00120 00121 int32_t num_occurences=0; 00122 00123 for (int32_t i=0; i<len; i++) 00124 { 00125 deriv+=hist[vector[i]]; 00126 00127 if (vector[i]==num_param) 00128 num_occurences++; 00129 } 00130 00131 ((CStringFeatures<uint16_t>*) features)-> 00132 free_feature_vector(vector, num_example, free_vec); 00133 00134 if (num_occurences>0) 00135 deriv+=CMath::log((float64_t) num_occurences)-hist[num_param]; 00136 else 00137 deriv=-CMath::INFTY; 00138 00139 return deriv; 00140 } 00141 } 00142 00143 float64_t CHistogram::get_log_model_parameter(int32_t num_param) 00144 { 00145 return hist[num_param]; 00146 } 00147 00148 bool CHistogram::set_histogram(float64_t* src, int32_t num) 00149 { 00150 ASSERT(num==get_num_model_parameters()); 00151 00152 delete[] hist; 00153 hist=new float64_t[num]; 00154 for (int32_t i=0; i<num; i++) { 00155 hist[i]=src[i]; 00156 } 00157 00158 return true; 00159 } 00160 00161 void CHistogram::get_histogram(float64_t** dst, int32_t* num) 00162 { 00163 *num=get_num_model_parameters(); 00164 size_t sz=sizeof(*hist)*(*num); 00165 *dst=(float64_t*) malloc(sz); 00166 ASSERT(dst); 00167 00168 memcpy(*dst, hist, sz); 00169 } 00170