SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include "features/SNPFeatures.h" 00012 #include "lib/io.h" 00013 #include "features/Alphabet.h" 00014 00015 using namespace shogun; 00016 00017 CSNPFeatures::CSNPFeatures(void) 00018 { 00019 SG_UNSTABLE("CSNPFeatures::CSNPFeatures(void)", "\n"); 00020 00021 strings = NULL; 00022 00023 string_length = 0; 00024 num_strings = 0; 00025 w_dim = 0; 00026 00027 normalization_const = 0.0; 00028 00029 m_str_min = NULL; 00030 m_str_maj = NULL; 00031 } 00032 00033 CSNPFeatures::CSNPFeatures(CStringFeatures<uint8_t>* str) : CDotFeatures(), 00034 m_str_min(NULL), m_str_maj(NULL) 00035 { 00036 ASSERT(str); 00037 ASSERT(str->have_same_length()); 00038 SG_REF(str); 00039 00040 strings=str; 00041 string_length=str->get_max_vector_length(); 00042 ASSERT((string_length & 1) == 0); // length divisible by 2 00043 w_dim=3*string_length/2; 00044 num_strings=str->get_num_vectors(); 00045 CAlphabet* alpha=str->get_alphabet(); 00046 ASSERT(alpha->get_alphabet()==SNP); 00047 SG_UNREF(alpha); 00048 00049 obtain_base_strings(); 00050 set_normalization_const(); 00051 00052 } 00053 00054 CSNPFeatures::CSNPFeatures(const CSNPFeatures& orig) 00055 : CDotFeatures(orig), strings(orig.strings), 00056 normalization_const(orig.normalization_const), 00057 m_str_min(NULL), m_str_maj(NULL) 00058 { 00059 SG_REF(strings); 00060 string_length=strings->get_max_vector_length(); 00061 ASSERT((string_length & 1) == 0); // length divisible by 2 00062 w_dim=3*string_length; 00063 num_strings=strings->get_num_vectors(); 00064 CAlphabet* alpha=strings->get_alphabet(); 00065 SG_UNREF(alpha); 00066 00067 obtain_base_strings(); 00068 } 00069 00070 CSNPFeatures::~CSNPFeatures() 00071 { 00072 SG_UNREF(strings); 00073 } 00074 00075 float64_t CSNPFeatures::dot(int32_t idx_a, CDotFeatures* df, int32_t idx_b) 00076 { 00077 ASSERT(df); 00078 ASSERT(df->get_feature_type() == get_feature_type()); 00079 ASSERT(df->get_feature_class() == get_feature_class()); 00080 CSNPFeatures* sf=(CSNPFeatures*) df; 00081 00082 int32_t alen, blen; 00083 bool free_avec, free_bvec; 00084 00085 uint8_t* avec = strings->get_feature_vector(idx_a, alen, free_avec); 00086 uint8_t* bvec = sf->strings->get_feature_vector(idx_b, blen, free_bvec); 00087 00088 ASSERT(alen==blen); 00089 if (alen!=string_length) 00090 SG_ERROR("alen (%d) !=string_length (%d)\n", alen, string_length); 00091 ASSERT(m_str_min); 00092 ASSERT(m_str_maj); 00093 00094 float64_t total=0; 00095 00096 for (int32_t i = 0; i<alen-1; i+=2) 00097 { 00098 int32_t sumaa=0; 00099 int32_t sumbb=0; 00100 int32_t sumab=0; 00101 00102 uint8_t a1=avec[i]; 00103 uint8_t a2=avec[i+1]; 00104 uint8_t b1=bvec[i]; 00105 uint8_t b2=bvec[i+1]; 00106 00107 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00108 sumab++; 00109 else if (a1==a2 && b1==b2) 00110 { 00111 if (a1!=b1) 00112 continue; 00113 00114 if (a1==m_str_min[i]) 00115 sumaa++; 00116 else if (a1==m_str_maj[i]) 00117 sumbb++; 00118 else 00119 { 00120 SG_ERROR("The impossible happened i=%d a1=%c " 00121 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, a1,a2, b1,b2, m_str_min[i], m_str_maj[i]); 00122 } 00123 00124 } 00125 total+=sumaa+sumbb+sumab; 00126 } 00127 00128 strings->free_feature_vector(avec, idx_a, free_avec); 00129 sf->strings->free_feature_vector(bvec, idx_b, free_bvec); 00130 return total; 00131 } 00132 00133 float64_t CSNPFeatures::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 00134 { 00135 if (vec2_len != w_dim) 00136 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00137 00138 float64_t sum=0; 00139 int32_t len; 00140 bool free_vec1; 00141 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00142 int32_t offs=0; 00143 00144 for (int32_t i=0; i<len; i+=2) 00145 { 00146 int32_t dim=0; 00147 00148 char a1=vec[i]; 00149 char a2=vec[i+1]; 00150 00151 if (a1==a2 && a1!='0' && a2!='0') 00152 { 00153 if (a1==m_str_min[i]) 00154 dim=1; 00155 else if (a1==m_str_maj[i]) 00156 dim=2; 00157 else 00158 { 00159 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00160 i, a1,a2, m_str_min[i], m_str_maj[i]); 00161 } 00162 } 00163 00164 sum+=vec2[offs+dim]; 00165 offs+=3; 00166 } 00167 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00168 00169 return sum/normalization_const; 00170 } 00171 00172 void CSNPFeatures::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val) 00173 { 00174 if (vec2_len != w_dim) 00175 SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim); 00176 00177 int32_t len; 00178 bool free_vec1; 00179 uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1); 00180 int32_t offs=0; 00181 00182 if (abs_val) 00183 alpha=CMath::abs(alpha); 00184 00185 for (int32_t i=0; i<len; i+=2) 00186 { 00187 int32_t dim=0; 00188 00189 char a1=vec[i]; 00190 char a2=vec[i+1]; 00191 00192 if (a1==a2 && a1!='0' && a2!='0') 00193 { 00194 if (a1==m_str_min[i]) 00195 dim=1; 00196 else if (a1==m_str_maj[i]) 00197 dim=2; 00198 else 00199 { 00200 SG_ERROR("The impossible happened i=%d a1=%c a2=%c min=%c maj=%c\n", 00201 i, a1,a2, m_str_min[i], m_str_maj[i]); 00202 } 00203 } 00204 00205 vec2[offs+dim]+=alpha; 00206 offs+=3; 00207 } 00208 strings->free_feature_vector(vec, vec_idx1, free_vec1); 00209 } 00210 00211 void CSNPFeatures::obtain_base_strings() 00212 { 00213 for (int32_t i=0; i<num_strings; i++) 00214 { 00215 int32_t len; 00216 bool free_vec; 00217 uint8_t* vec = ((CStringFeatures<uint8_t>*) strings)->get_feature_vector(i, len, free_vec); 00218 ASSERT(string_length==len); 00219 00220 if (i==0) 00221 { 00222 size_t tlen=(len+1)*sizeof(uint8_t); 00223 m_str_min=(uint8_t*) malloc(tlen); 00224 m_str_maj=(uint8_t*) malloc(tlen); 00225 memset(m_str_min, 0, tlen); 00226 memset(m_str_maj, 0, tlen); 00227 } 00228 00229 for (int32_t j=0; j<len; j++) 00230 { 00231 // skip sequencing errors 00232 if (vec[j]=='0') 00233 continue; 00234 00235 if (m_str_min[j]==0) 00236 m_str_min[j]=vec[j]; 00237 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j]) 00238 m_str_maj[j]=vec[j]; 00239 } 00240 00241 ((CStringFeatures<uint8_t>*) strings)->free_feature_vector(vec, i, free_vec); 00242 } 00243 00244 for (int32_t j=0; j<string_length; j++) 00245 { 00246 // if only one symbol occurs use 0 00247 if (m_str_min[j]==0) 00248 m_str_min[j]='0'; 00249 if (m_str_maj[j]==0) 00250 m_str_maj[j]='0'; 00251 00252 if (m_str_min[j]>m_str_maj[j]) 00253 CMath::swap(m_str_min[j], m_str_maj[j]); 00254 } 00255 } 00256 00257 void CSNPFeatures::set_normalization_const(float64_t n) 00258 { 00259 if (n==0) 00260 { 00261 normalization_const=string_length; 00262 normalization_const=CMath::sqrt(normalization_const); 00263 } 00264 else 00265 normalization_const=n; 00266 00267 SG_DEBUG("normalization_const:%f\n", normalization_const); 00268 } 00269 00270 void* CSNPFeatures::get_feature_iterator(int32_t vector_index) 00271 { 00272 return NULL; 00273 } 00274 00275 bool CSNPFeatures::get_next_feature(int32_t& index, float64_t& value, void* iterator) 00276 { 00277 return false; 00278 } 00279 00280 void CSNPFeatures::free_feature_iterator(void* iterator) 00281 { 00282 } 00283 00284 CFeatures* CSNPFeatures::duplicate() const 00285 { 00286 return new CSNPFeatures(*this); 00287 }