SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #include <vector> 00013 00014 #include "lib/common.h" 00015 #include "lib/io.h" 00016 #include "lib/Signal.h" 00017 #include "lib/Trie.h" 00018 #include "base/Parallel.h" 00019 00020 #include "kernel/SpectrumRBFKernel.h" 00021 #include "features/Features.h" 00022 #include "features/StringFeatures.h" 00023 #include <math.h> 00024 00025 #include <vector> 00026 #include <string> 00027 #include <fstream> 00028 #include <cmath> 00029 00030 #include <assert.h> 00031 00032 #ifndef WIN32 00033 #include <pthread.h> 00034 #endif 00035 00036 00037 using namespace shogun; 00038 00039 CSpectrumRBFKernel::CSpectrumRBFKernel(void) 00040 : CStringKernel<char>(0) 00041 { 00042 SG_UNSTABLE("CSpectrumRBFKernel::CSpectrumRBFKernel(void)", "\n"); 00043 00044 alphabet = NULL; 00045 degree = 0; 00046 AA_matrix = NULL; 00047 width = 0.0; 00048 sequences = NULL; 00049 string_features = NULL; 00050 nof_sequences = 0; 00051 max_sequence_length = 0; 00052 00053 initialized = false; 00054 00055 max_mismatch = 0; 00056 target_letter_0 = 0; 00057 } 00058 00059 CSpectrumRBFKernel::CSpectrumRBFKernel (int32_t size, float64_t *AA_matrix_, int32_t degree_, float64_t width_) 00060 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00061 { 00062 lhs=NULL; 00063 rhs=NULL; 00064 00065 target_letter_0=-1 ; 00066 00067 AA_matrix=new float64_t[128*128]; 00068 00069 00070 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00071 00072 read_profiles_and_sequences(); 00073 00074 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00075 string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, IUPAC_AMINO_ACID); 00076 init(string_features, string_features); 00077 } 00078 00079 CSpectrumRBFKernel::CSpectrumRBFKernel( 00080 CStringFeatures<char>* l, CStringFeatures<char>* r, int32_t size, float64_t* AA_matrix_, int32_t degree_, float64_t width_) 00081 : CStringKernel<char>(size), alphabet(NULL), degree(degree_), width(width_), sequences(NULL), string_features(NULL), nof_sequences(0), max_sequence_length(0) 00082 { 00083 target_letter_0=-1 ; 00084 00085 AA_matrix=new float64_t[128*128]; 00086 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00087 00088 init(l, r); 00089 } 00090 00091 CSpectrumRBFKernel::~CSpectrumRBFKernel() 00092 { 00093 cleanup(); 00094 delete[] AA_matrix ; 00095 } 00096 00097 00098 void CSpectrumRBFKernel::remove_lhs() 00099 { 00100 00101 CKernel::remove_lhs(); 00102 } 00103 00104 void CSpectrumRBFKernel::read_profiles_and_sequences() 00105 { 00106 00107 int32_t aa_to_index[128];//profile 00108 aa_to_index['A'] = 0; 00109 aa_to_index['R'] = 1; 00110 aa_to_index['N'] = 2; 00111 aa_to_index['D'] = 3; 00112 aa_to_index['C'] = 4; 00113 aa_to_index['Q'] = 5; 00114 aa_to_index['E'] = 6; 00115 aa_to_index['G'] = 7; 00116 aa_to_index['H'] = 8; 00117 aa_to_index['I'] = 9; 00118 aa_to_index['L'] = 10; 00119 aa_to_index['K'] = 11; 00120 aa_to_index['M'] = 12; 00121 aa_to_index['F'] = 13; 00122 aa_to_index['P'] = 14; 00123 aa_to_index['S'] = 15; 00124 aa_to_index['T'] = 16; 00125 aa_to_index['W'] = 17; 00126 aa_to_index['Y'] = 18; 00127 aa_to_index['V'] = 19; 00128 SG_DEBUG("initializing background\n"); 00129 double background[20]; // profile 00130 background[0]=0.0799912015849807; //A 00131 background[1]=0.0484482507611578;//R 00132 background[2]=0.044293531582512;//N 00133 background[3]=0.0578891399707563;//D 00134 background[4]=0.0171846021407367;//C 00135 background[5]=0.0380578923048682;//Q 00136 background[6]=0.0638169929675978;//E 00137 background[7]=0.0760659374742852;//G 00138 background[8]=0.0223465499452473;//H 00139 background[9]=0.0550905793661343;//I 00140 background[10]=0.0866897071203864;//L 00141 background[11]=0.060458245507428;//K 00142 background[12]=0.0215379186368154;//M 00143 background[13]=0.0396348024787477;//F 00144 background[14]=0.0465746314476874;//P 00145 background[15]=0.0630028230885602;//S 00146 background[16]=0.0580394726014824;//T 00147 background[17]=0.0144991866213453;//W 00148 background[18]=0.03635438623143;//Y 00149 background[19]=0.0700241481678408;//V 00150 00151 00152 std::vector<std::string> seqs; 00153 //int32_t nof_sequences = 7329; 00154 00155 double C = 0.8; 00156 const char *filename="/fml/ag-raetsch/home/toussaint/scp/aawd_compbio_workshop/code_nora/data/profile/profiles"; 00157 std::ifstream fin(filename); 00158 00159 SG_DEBUG("Reading profiles from %s\n", filename); 00160 std::string line; 00161 while (!fin.eof()) 00162 { 00163 std::getline(fin, line); 00164 00165 if (line[0] == '>') // new sequence 00166 { 00167 int idx = line.find_first_of(' '); 00168 sequence_labels.push_back(line.substr(1,idx-1)); 00169 std::getline(fin, line); 00170 std::string orig_sequence = line; 00171 std::string sequence=""; 00172 00173 int len_line = line.length(); 00174 00175 // skip 3 lines 00176 00177 std::getline(fin, line); 00178 std::getline(fin, line); 00179 std::getline(fin, line); 00180 00181 profiles.push_back(std::vector<double>()); 00182 00183 std::vector<double>& curr_profile = profiles.back(); 00184 for (int i=0; i < len_line; ++i) 00185 { 00186 std::getline(fin, line); 00187 int a = line.find_first_not_of(' '); // index position 00188 int b = line.find_first_of(' ', a); // index position 00189 a = line.find_first_not_of(' ', b); // aa position 00190 b = line.find_first_of(' ', a); // aa position 00191 std::string aa=line.substr(a,b-a); 00192 if (0) //(aa =="B" || aa == "X" || aa == "Z") 00193 { 00194 int pos = seqs.size()+1; 00195 SG_DEBUG("Skipping aa in sequence %d\n", pos); 00196 continue; 00197 } 00198 else 00199 { 00200 sequence += aa; 00201 00202 a = line.find_first_not_of(' ', b); // beginning of block to ignore 00203 b = line.find_first_of(' ', a); // aa position 00204 00205 for (int j=0; j < 19; ++j) 00206 { 00207 a = line.find_first_not_of(' ', b); 00208 b = line.find_first_of(' ', a); 00209 } 00210 00211 int all_zeros = 1; 00212 // interesting block 00213 for (int j=0; j < 20; ++j) 00214 { 00215 a = line.find_first_not_of(' ', b); 00216 b = line.find_first_of(' ', a); 00217 double p = atof(line.substr(a, b-a).c_str()); 00218 if (p > 0) 00219 { 00220 all_zeros = 0; 00221 } 00222 double value = -1* std::log(C*(p/100)+(1-C)*background[j]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00223 curr_profile.push_back(value); 00224 //SG_DEBUG("seq %d aa %d value %f p %f bg %f\n", i, j, value,p, background[j]); 00225 } 00226 00227 if (all_zeros) 00228 { 00229 SG_DEBUG(">>>>>>>>>>>>>>> all zeros"); 00230 if (aa !="B" && aa != "X" && aa != "Z") 00231 { 00232 //profile[i][temp_profile_index]=-log(C+(1-C)*background[re_candidate[temp_profile_index]]); 00233 int32_t aa_index = aa_to_index[(int)aa.c_str()[0]]; 00234 double value = -1* std::log(C+(1-C)*background[aa_index]); // taken from Leslie's example, C actually corresponds to 1/(1+C) 00235 SG_DEBUG("before %f\n", profiles.back()[(i-1) * 20 + aa_index]); 00236 curr_profile[(i*20) + aa_index] = value; 00237 SG_DEBUG(">>> aa %c \t %d \t %f\n", aa.c_str()[0], aa_index, value); 00238 00239 /* 00240 for (int z=0; z <20; ++z) 00241 { 00242 SG_DEBUG(" %d \t %f\t", z, curr_profile[z]); 00243 } 00244 SG_DEBUG("\n"); 00245 */ 00246 } 00247 } 00248 } 00249 } 00250 00251 if (curr_profile.size() != 20 * sequence.length()) 00252 { 00253 SG_ERROR("Something's wrong with the profile.\n"); 00254 break; 00255 } 00256 00257 seqs.push_back(sequence); 00258 00259 00260 /* 00261 // 6 irrelevant lines 00262 for (int i=0; i < 6; ++i) 00263 { 00264 std::getline(fin, line); 00265 } 00266 // 00267 */ 00268 } 00269 } 00270 00271 fin.close(); 00272 00273 nof_sequences = seqs.size(); 00274 sequences = new TString<char>[nof_sequences]; 00275 00276 int max_len = 0; 00277 for (int i=0; i < nof_sequences; ++i) 00278 { 00279 int len = seqs[i].length(); 00280 sequences[i].string = new char[len+1]; 00281 sequences[i].length = len; 00282 strcpy(sequences[i].string, seqs[i].c_str()); 00283 00284 if (len > max_len) max_len = len; 00285 } 00286 00287 max_sequence_length = max_len; 00288 //string_features = new CStringFeatures<char>(sequences, nof_sequences, max_sequence_length, PROTEIN); 00289 00290 } 00291 00292 bool CSpectrumRBFKernel::init(CFeatures* l, CFeatures* r) 00293 { 00294 // >> profile 00295 /* 00296 read_profiles_and_sequences(); 00297 l = string_features; 00298 r = string_features; 00299 */ 00300 // << profile 00301 00302 int32_t lhs_changed=(lhs!=l); 00303 int32_t rhs_changed=(rhs!=r); 00304 00305 CStringKernel<char>::init(l,r); 00306 00307 SG_DEBUG("lhs_changed: %i\n", lhs_changed); 00308 SG_DEBUG("rhs_changed: %i\n", rhs_changed); 00309 00310 CStringFeatures<char>* sf_l=(CStringFeatures<char>*) l; 00311 CStringFeatures<char>* sf_r=(CStringFeatures<char>*) r; 00312 00313 SG_UNREF(alphabet); 00314 alphabet=sf_l->get_alphabet(); 00315 CAlphabet* ralphabet=sf_r->get_alphabet(); 00316 00317 if (!((alphabet->get_alphabet()==DNA) || (alphabet->get_alphabet()==RNA))) 00318 properties &= ((uint64_t) (-1)) ^ (KP_LINADD | KP_BATCHEVALUATION); 00319 00320 ASSERT(ralphabet->get_alphabet()==alphabet->get_alphabet()); 00321 SG_UNREF(ralphabet); 00322 00323 00324 return init_normalizer(); 00325 } 00326 00327 void CSpectrumRBFKernel::cleanup() 00328 { 00329 00330 SG_UNREF(alphabet); 00331 alphabet=NULL; 00332 00333 CKernel::cleanup(); 00334 } 00335 00336 inline bool isaa(char c) 00337 { 00338 if (c<65 || c>89 || c=='B' || c=='J' || c=='O' || c=='U' || c=='X' || c=='Z') 00339 return false ; 00340 return true ; 00341 } 00342 00343 float64_t CSpectrumRBFKernel::AA_helper(const char* path, const int seq_degree, const char* joint_seq, unsigned int index) 00344 { 00345 //const char* AA = "ARNDCQEGHILKMFPSTWYV"; 00346 float64_t diff=0.0 ; 00347 00348 for (int i=0; i<seq_degree; i++) 00349 { 00350 if (!isaa(path[i])||!isaa(joint_seq[index+i])) 00351 diff+=1.4 ; 00352 else 00353 { 00354 diff += AA_matrix[ (path[i]-1)*128 + path[i] - 1] ; 00355 diff -= 2*AA_matrix[ (path[i]-1)*128 + joint_seq[index+i] - 1] ; 00356 diff += AA_matrix[ (joint_seq[index+i]-1)*128 + joint_seq[index+i] - 1] ; 00357 if (CMath::is_nan(diff)) 00358 fprintf(stderr, "nan occurred: '%c' '%c'\n", path[i], joint_seq[index+i]) ; 00359 } 00360 } 00361 00362 return exp( - diff/width) ; 00363 } 00364 00365 float64_t CSpectrumRBFKernel::compute(int32_t idx_a, int32_t idx_b) 00366 { 00367 int32_t alen, blen; 00368 bool afree, bfree; 00369 00370 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, afree); 00371 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, bfree); 00372 00373 float64_t result=0; 00374 for (int32_t i=0; i<alen; i++) 00375 { 00376 for (int32_t j=0; j<blen; j++) 00377 { 00378 if ((i+degree<=alen) && (j+degree<=blen)) 00379 result += AA_helper(&(avec[i]), degree, bvec, j) ; 00380 } 00381 } 00382 00383 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, afree); 00384 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, bfree); 00385 return result; 00386 } 00387 00388 bool CSpectrumRBFKernel::set_AA_matrix( 00389 float64_t* AA_matrix_) 00390 { 00391 00392 if (AA_matrix_) 00393 { 00394 SG_DEBUG("Setting AA_matrix\n") ; 00395 memcpy(AA_matrix, AA_matrix_, 128*128*sizeof(float64_t)) ; 00396 return true ; 00397 } 00398 00399 return false; 00400 }