SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Soeren Sonnenburg 00008 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #include <string.h> 00012 #include <math.h> 00013 00014 #include "features/Alphabet.h" 00015 #include "lib/io.h" 00016 #include "base/Parameter.h" 00017 00018 using namespace shogun; 00019 00020 //define numbers for the bases 00021 const uint8_t CAlphabet::B_A=0; 00022 const uint8_t CAlphabet::B_C=1; 00023 const uint8_t CAlphabet::B_G=2; 00024 const uint8_t CAlphabet::B_T=3; 00025 const uint8_t CAlphabet::B_0=4; 00026 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff; 00027 const char* CAlphabet::alphabet_names[18]={ 00028 "DNA","RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM", 00029 "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", 00030 "NONE", "DIGIT", "DIGIT2", "RAWDIGIT", "RAWDIGIT2", "UNKNOWN", 00031 "SNP", "RAWSNP"}; 00032 00033 00034 CAlphabet::CAlphabet() 00035 : CSGObject() 00036 { 00037 init(); 00038 } 00039 00040 CAlphabet::CAlphabet(char* al, int32_t len) 00041 : CSGObject() 00042 { 00043 init(); 00044 00045 EAlphabet alpha=NONE; 00046 00047 if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA"))) 00048 alpha = DNA; 00049 else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA"))) 00050 alpha = RAWDNA; 00051 else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA"))) 00052 alpha = RNA; 00053 else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN"))) 00054 alpha = PROTEIN; 00055 else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY"))) 00056 alpha = BINARY; 00057 else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM"))) 00058 alpha = ALPHANUM; 00059 else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE"))) 00060 alpha = CUBE; 00061 else if (len>=(int32_t) strlen("DIGIT2") && !strncmp(al, "DIGIT2", strlen("DIGIT2"))) 00062 alpha = DIGIT2; 00063 else if (len>=(int32_t) strlen("DIGIT") && !strncmp(al, "DIGIT", strlen("DIGIT"))) 00064 alpha = DIGIT; 00065 else if (len>=(int32_t) strlen("RAWDIGIT2") && !strncmp(al, "RAWDIGIT2", strlen("RAWDIGIT2"))) 00066 alpha = RAWDIGIT2; 00067 else if (len>=(int32_t) strlen("RAWDIGIT") && !strncmp(al, "RAWDIGIT", strlen("RAWDIGIT"))) 00068 alpha = RAWDIGIT; 00069 else if (len>=(int32_t) strlen("SNP") && !strncmp(al, "SNP", strlen("SNP"))) 00070 alpha = SNP; 00071 else if (len>=(int32_t) strlen("RAWSNP") && !strncmp(al, "RAWSNP", strlen("RAWSNP"))) 00072 alpha = RAWSNP; 00073 else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) || 00074 (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW")))) 00075 alpha = RAWBYTE; 00076 else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID"))) 00077 alpha = IUPAC_NUCLEIC_ACID; 00078 else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID"))) 00079 alpha = IUPAC_AMINO_ACID; 00080 else { 00081 SG_ERROR( "unknown alphabet %s\n", al); 00082 } 00083 00084 set_alphabet(alpha); 00085 } 00086 00087 CAlphabet::CAlphabet(EAlphabet alpha) 00088 : CSGObject() 00089 { 00090 init(); 00091 set_alphabet(alpha); 00092 } 00093 00094 CAlphabet::CAlphabet(CAlphabet* a) 00095 : CSGObject() 00096 { 00097 init(); 00098 ASSERT(a); 00099 set_alphabet(a->get_alphabet()); 00100 copy_histogram(a); 00101 } 00102 00103 CAlphabet::~CAlphabet() 00104 { 00105 } 00106 00107 bool CAlphabet::set_alphabet(EAlphabet alpha) 00108 { 00109 bool result=true; 00110 alphabet=alpha; 00111 00112 switch (alphabet) 00113 { 00114 case DNA: 00115 case RAWDNA: 00116 num_symbols = 4; 00117 break; 00118 case RNA: 00119 num_symbols = 4; 00120 break; 00121 case PROTEIN: 00122 num_symbols = 26; 00123 break; 00124 case BINARY: 00125 num_symbols = 2; 00126 break; 00127 case ALPHANUM: 00128 num_symbols = 36; 00129 break; 00130 case CUBE: 00131 num_symbols = 6; 00132 break; 00133 case RAWBYTE: 00134 num_symbols = 256; 00135 break; 00136 case IUPAC_NUCLEIC_ACID: 00137 num_symbols = 16; 00138 break; 00139 case IUPAC_AMINO_ACID: 00140 num_symbols = 23; 00141 break; 00142 case NONE: 00143 num_symbols = 0; 00144 break; 00145 case DIGIT2: 00146 num_symbols = 3; 00147 break; 00148 case DIGIT: 00149 num_symbols = 10; 00150 break; 00151 case RAWDIGIT2: 00152 num_symbols = 3; 00153 break; 00154 case RAWDIGIT: 00155 num_symbols = 10; 00156 break; 00157 case SNP: 00158 num_symbols = 5; 00159 break; 00160 case RAWSNP: 00161 num_symbols = 5; 00162 break; 00163 default: 00164 num_symbols = 0; 00165 result=false; 00166 break; 00167 } 00168 00169 num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2)); 00170 init_map_table(); 00171 clear_histogram(); 00172 00173 SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet)); 00174 00175 return result; 00176 } 00177 00178 void CAlphabet::init_map_table() 00179 { 00180 for (int32_t i=0; i<(1<<(8*sizeof(uint8_t))); i++) 00181 { 00182 maptable_to_bin[i] = MAPTABLE_UNDEF; 00183 maptable_to_char[i] = MAPTABLE_UNDEF; 00184 valid_chars[i] = false; 00185 } 00186 00187 switch (alphabet) 00188 { 00189 case RAWDIGIT: 00190 for (uint8_t i=0; i<=9; i++) 00191 { 00192 valid_chars[i]=true; 00193 maptable_to_bin[i]=i; 00194 maptable_to_char[i]=i; 00195 } 00196 break; 00197 00198 case RAWDIGIT2: 00199 for (uint8_t i=0; i<=2; i++) 00200 { 00201 valid_chars[i]=true; 00202 maptable_to_bin[i]=i; 00203 maptable_to_char[i]=i; 00204 } 00205 break; 00206 00207 case DIGIT: 00208 valid_chars[(uint8_t) '0']=true; 00209 valid_chars[(uint8_t) '1']=true; 00210 valid_chars[(uint8_t) '2']=true; 00211 valid_chars[(uint8_t) '3']=true; 00212 valid_chars[(uint8_t) '4']=true; 00213 valid_chars[(uint8_t) '5']=true; 00214 valid_chars[(uint8_t) '6']=true; 00215 valid_chars[(uint8_t) '7']=true; 00216 valid_chars[(uint8_t) '8']=true; 00217 valid_chars[(uint8_t) '9']=true; //Translation '0-9' -> 0-9 00218 00219 maptable_to_bin[(uint8_t) '0']=0; 00220 maptable_to_bin[(uint8_t) '1']=1; 00221 maptable_to_bin[(uint8_t) '2']=2; 00222 maptable_to_bin[(uint8_t) '3']=3; 00223 maptable_to_bin[(uint8_t) '4']=4; 00224 maptable_to_bin[(uint8_t) '5']=5; 00225 maptable_to_bin[(uint8_t) '6']=6; 00226 maptable_to_bin[(uint8_t) '7']=7; 00227 maptable_to_bin[(uint8_t) '8']=8; 00228 maptable_to_bin[(uint8_t) '9']=9; //Translation '0-9' -> 0-9 00229 00230 maptable_to_char[(uint8_t) 0]='0'; 00231 maptable_to_char[(uint8_t) 1]='1'; 00232 maptable_to_char[(uint8_t) 2]='2'; 00233 maptable_to_char[(uint8_t) 3]='3'; 00234 maptable_to_char[(uint8_t) 4]='4'; 00235 maptable_to_char[(uint8_t) 5]='5'; 00236 maptable_to_char[(uint8_t) 6]='6'; 00237 maptable_to_char[(uint8_t) 7]='7'; 00238 maptable_to_char[(uint8_t) 8]='8'; 00239 maptable_to_char[(uint8_t) 9]='9'; //Translation 0-9 -> '0-9' 00240 break; 00241 00242 case DIGIT2: 00243 valid_chars[(uint8_t) '0']=true; 00244 valid_chars[(uint8_t) '1']=true; 00245 valid_chars[(uint8_t) '2']=true; //Translation '0-2' -> 0-2 00246 00247 maptable_to_bin[(uint8_t) '0']=0; 00248 maptable_to_bin[(uint8_t) '1']=1; 00249 maptable_to_bin[(uint8_t) '2']=2; //Translation '0-2' -> 0-2 00250 00251 maptable_to_char[(uint8_t) 0]='0'; 00252 maptable_to_char[(uint8_t) 1]='1'; 00253 maptable_to_char[(uint8_t) 2]='2'; //Translation 0-2 -> '0-2' 00254 break; 00255 00256 case CUBE: 00257 valid_chars[(uint8_t) '1']=true; 00258 valid_chars[(uint8_t) '2']=true; 00259 valid_chars[(uint8_t) '3']=true; 00260 valid_chars[(uint8_t) '4']=true; 00261 valid_chars[(uint8_t) '5']=true; 00262 valid_chars[(uint8_t) '6']=true; //Translation '123456' -> 012345 00263 00264 maptable_to_bin[(uint8_t) '1']=0; 00265 maptable_to_bin[(uint8_t) '2']=1; 00266 maptable_to_bin[(uint8_t) '3']=2; 00267 maptable_to_bin[(uint8_t) '4']=3; 00268 maptable_to_bin[(uint8_t) '5']=4; 00269 maptable_to_bin[(uint8_t) '6']=5; //Translation '123456' -> 012345 00270 00271 maptable_to_char[(uint8_t) 0]='1'; 00272 maptable_to_char[(uint8_t) 1]='2'; 00273 maptable_to_char[(uint8_t) 2]='3'; 00274 maptable_to_char[(uint8_t) 3]='4'; 00275 maptable_to_char[(uint8_t) 4]='5'; 00276 maptable_to_char[(uint8_t) 5]='6'; //Translation 012345->'123456' 00277 break; 00278 00279 case PROTEIN: 00280 { 00281 int32_t skip=0 ; 00282 for (int32_t i=0; i<21; i++) 00283 { 00284 if (i==1) skip++ ; 00285 if (i==8) skip++ ; 00286 if (i==12) skip++ ; 00287 if (i==17) skip++ ; 00288 valid_chars['A'+i+skip]=true; 00289 maptable_to_bin['A'+i+skip]=i ; 00290 maptable_to_char[i]='A'+i+skip ; 00291 } ; //Translation 012345->acde...xy -- the protein code 00292 } ; 00293 break; 00294 00295 case BINARY: 00296 valid_chars[(uint8_t) '0']=true; 00297 valid_chars[(uint8_t) '1']=true; 00298 00299 maptable_to_bin[(uint8_t) '0']=0; 00300 maptable_to_bin[(uint8_t) '1']=1; 00301 00302 maptable_to_char[0]=(uint8_t) '0'; 00303 maptable_to_char[1]=(uint8_t) '1'; 00304 break; 00305 00306 case ALPHANUM: 00307 { 00308 for (int32_t i=0; i<26; i++) 00309 { 00310 valid_chars['A'+i]=true; 00311 maptable_to_bin['A'+i]=i ; 00312 maptable_to_char[i]='A'+i ; 00313 } ; 00314 for (int32_t i=0; i<10; i++) 00315 { 00316 valid_chars['0'+i]=true; 00317 maptable_to_bin['0'+i]=26+i ; 00318 maptable_to_char[26+i]='0'+i ; 00319 } ; //Translation 012345->acde...xy0123456789 00320 } ; 00321 break; 00322 00323 case RAWBYTE: 00324 { 00325 //identity 00326 for (int32_t i=0; i<256; i++) 00327 { 00328 valid_chars[i]=true; 00329 maptable_to_bin[i]=i; 00330 maptable_to_char[i]=i; 00331 } 00332 } 00333 break; 00334 00335 case DNA: 00336 valid_chars[(uint8_t) 'A']=true; 00337 valid_chars[(uint8_t) 'C']=true; 00338 valid_chars[(uint8_t) 'G']=true; 00339 valid_chars[(uint8_t) 'T']=true; 00340 00341 maptable_to_bin[(uint8_t) 'A']=B_A; 00342 maptable_to_bin[(uint8_t) 'C']=B_C; 00343 maptable_to_bin[(uint8_t) 'G']=B_G; 00344 maptable_to_bin[(uint8_t) 'T']=B_T; 00345 00346 maptable_to_char[B_A]='A'; 00347 maptable_to_char[B_C]='C'; 00348 maptable_to_char[B_G]='G'; 00349 maptable_to_char[B_T]='T'; 00350 break; 00351 case RAWDNA: 00352 { 00353 //identity 00354 for (int32_t i=0; i<4; i++) 00355 { 00356 valid_chars[i]=true; 00357 maptable_to_bin[i]=i; 00358 maptable_to_char[i]=i; 00359 } 00360 } 00361 break; 00362 00363 case SNP: 00364 valid_chars[(uint8_t) 'A']=true; 00365 valid_chars[(uint8_t) 'C']=true; 00366 valid_chars[(uint8_t) 'G']=true; 00367 valid_chars[(uint8_t) 'T']=true; 00368 valid_chars[(uint8_t) '0']=true; 00369 00370 maptable_to_bin[(uint8_t) 'A']=B_A; 00371 maptable_to_bin[(uint8_t) 'C']=B_C; 00372 maptable_to_bin[(uint8_t) 'G']=B_G; 00373 maptable_to_bin[(uint8_t) 'T']=B_T; 00374 maptable_to_bin[(uint8_t) '0']=B_0; 00375 00376 maptable_to_char[B_A]='A'; 00377 maptable_to_char[B_C]='C'; 00378 maptable_to_char[B_G]='G'; 00379 maptable_to_char[B_T]='T'; 00380 maptable_to_char[B_0]='0'; 00381 break; 00382 case RAWSNP: 00383 { 00384 //identity 00385 for (int32_t i=0; i<5; i++) 00386 { 00387 valid_chars[i]=true; 00388 maptable_to_bin[i]=i; 00389 maptable_to_char[i]=i; 00390 } 00391 } 00392 break; 00393 00394 case RNA: 00395 valid_chars[(uint8_t) 'A']=true; 00396 valid_chars[(uint8_t) 'C']=true; 00397 valid_chars[(uint8_t) 'G']=true; 00398 valid_chars[(uint8_t) 'U']=true; 00399 00400 maptable_to_bin[(uint8_t) 'A']=B_A; 00401 maptable_to_bin[(uint8_t) 'C']=B_C; 00402 maptable_to_bin[(uint8_t) 'G']=B_G; 00403 maptable_to_bin[(uint8_t) 'U']=B_T; 00404 00405 maptable_to_char[B_A]='A'; 00406 maptable_to_char[B_C]='C'; 00407 maptable_to_char[B_G]='G'; 00408 maptable_to_char[B_T]='U'; 00409 break; 00410 00411 case IUPAC_NUCLEIC_ACID: 00412 valid_chars[(uint8_t) 'A']=true; // A Adenine 00413 valid_chars[(uint8_t) 'C']=true; // C Cytosine 00414 valid_chars[(uint8_t) 'G']=true; // G Guanine 00415 valid_chars[(uint8_t) 'T']=true; // T Thymine 00416 valid_chars[(uint8_t) 'U']=true; // U Uracil 00417 valid_chars[(uint8_t) 'R']=true; // R Purine (A or G) 00418 valid_chars[(uint8_t) 'Y']=true; // Y Pyrimidine (C, T, or U) 00419 valid_chars[(uint8_t) 'M']=true; // M C or A 00420 valid_chars[(uint8_t) 'K']=true; // K T, U, or G 00421 valid_chars[(uint8_t) 'W']=true; // W T, U, or A 00422 valid_chars[(uint8_t) 'S']=true; // S C or G 00423 valid_chars[(uint8_t) 'B']=true; // B C, T, U, or G (not A) 00424 valid_chars[(uint8_t) 'D']=true; // D A, T, U, or G (not C) 00425 valid_chars[(uint8_t) 'H']=true; // H A, T, U, or C (not G) 00426 valid_chars[(uint8_t) 'V']=true; // V A, C, or G (not T, not U) 00427 valid_chars[(uint8_t) 'N']=true; // N Any base (A, C, G, T, or U) 00428 00429 maptable_to_bin[(uint8_t) 'A']=0; // A Adenine 00430 maptable_to_bin[(uint8_t) 'C']=1; // C Cytosine 00431 maptable_to_bin[(uint8_t) 'G']=2; // G Guanine 00432 maptable_to_bin[(uint8_t) 'T']=3; // T Thymine 00433 maptable_to_bin[(uint8_t) 'U']=4; // U Uracil 00434 maptable_to_bin[(uint8_t) 'R']=5; // R Purine (A or G) 00435 maptable_to_bin[(uint8_t) 'Y']=6; // Y Pyrimidine (C, T, or U) 00436 maptable_to_bin[(uint8_t) 'M']=7; // M C or A 00437 maptable_to_bin[(uint8_t) 'K']=8; // K T, U, or G 00438 maptable_to_bin[(uint8_t) 'W']=9; // W T, U, or A 00439 maptable_to_bin[(uint8_t) 'S']=10; // S C or G 00440 maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A) 00441 maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C) 00442 maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G) 00443 maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U) 00444 maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U) 00445 00446 maptable_to_char[0]=(uint8_t) 'A'; // A Adenine 00447 maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine 00448 maptable_to_char[2]=(uint8_t) 'G'; // G Guanine 00449 maptable_to_char[3]=(uint8_t) 'T'; // T Thymine 00450 maptable_to_char[4]=(uint8_t) 'U'; // U Uracil 00451 maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G) 00452 maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U) 00453 maptable_to_char[7]=(uint8_t) 'M'; // M C or A 00454 maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G 00455 maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A 00456 maptable_to_char[10]=(uint8_t) 'S'; // S C or G 00457 maptable_to_char[11]=(uint8_t) 'B'; // B C, T, U, or G (not A) 00458 maptable_to_char[12]=(uint8_t) 'D'; // D A, T, U, or G (not C) 00459 maptable_to_char[13]=(uint8_t) 'H'; // H A, T, U, or C (not G) 00460 maptable_to_char[14]=(uint8_t) 'V'; // V A, C, or G (not T, not U) 00461 maptable_to_char[15]=(uint8_t) 'N'; // N Any base (A, C, G, T, or U) 00462 break; 00463 00464 case IUPAC_AMINO_ACID: 00465 valid_chars[(uint8_t) 'A']=true; //A Ala Alanine 00466 valid_chars[(uint8_t) 'R']=true; //R Arg Arginine 00467 valid_chars[(uint8_t) 'N']=true; //N Asn Asparagine 00468 valid_chars[(uint8_t) 'D']=true; //D Asp Aspartic acid 00469 valid_chars[(uint8_t) 'C']=true; //C Cys Cysteine 00470 valid_chars[(uint8_t) 'Q']=true; //Q Gln Glutamine 00471 valid_chars[(uint8_t) 'E']=true; //E Glu Glutamic acid 00472 valid_chars[(uint8_t) 'G']=true; //G Gly Glycine 00473 valid_chars[(uint8_t) 'H']=true; //H His Histidine 00474 valid_chars[(uint8_t) 'I']=true; //I Ile Isoleucine 00475 valid_chars[(uint8_t) 'L']=true; //L Leu Leucine 00476 valid_chars[(uint8_t) 'K']=true; //K Lys Lysine 00477 valid_chars[(uint8_t) 'M']=true; //M Met Methionine 00478 valid_chars[(uint8_t) 'F']=true; //F Phe Phenylalanine 00479 valid_chars[(uint8_t) 'P']=true; //P Pro Proline 00480 valid_chars[(uint8_t) 'S']=true; //S Ser Serine 00481 valid_chars[(uint8_t) 'T']=true; //T Thr Threonine 00482 valid_chars[(uint8_t) 'W']=true; //W Trp Tryptophan 00483 valid_chars[(uint8_t) 'Y']=true; //Y Tyr Tyrosine 00484 valid_chars[(uint8_t) 'V']=true; //V Val Valine 00485 valid_chars[(uint8_t) 'B']=true; //B Asx Aspartic acid or Asparagine 00486 valid_chars[(uint8_t) 'Z']=true; //Z Glx Glutamine or Glutamic acid 00487 valid_chars[(uint8_t) 'X']=true; //X Xaa Any amino acid 00488 00489 maptable_to_bin[(uint8_t) 'A']=0; //A Ala Alanine 00490 maptable_to_bin[(uint8_t) 'R']=1; //R Arg Arginine 00491 maptable_to_bin[(uint8_t) 'N']=2; //N Asn Asparagine 00492 maptable_to_bin[(uint8_t) 'D']=3; //D Asp Aspartic acid 00493 maptable_to_bin[(uint8_t) 'C']=4; //C Cys Cysteine 00494 maptable_to_bin[(uint8_t) 'Q']=5; //Q Gln Glutamine 00495 maptable_to_bin[(uint8_t) 'E']=6; //E Glu Glutamic acid 00496 maptable_to_bin[(uint8_t) 'G']=7; //G Gly Glycine 00497 maptable_to_bin[(uint8_t) 'H']=8; //H His Histidine 00498 maptable_to_bin[(uint8_t) 'I']=9; //I Ile Isoleucine 00499 maptable_to_bin[(uint8_t) 'L']=10; //L Leu Leucine 00500 maptable_to_bin[(uint8_t) 'K']=11; //K Lys Lysine 00501 maptable_to_bin[(uint8_t) 'M']=12; //M Met Methionine 00502 maptable_to_bin[(uint8_t) 'F']=13; //F Phe Phenylalanine 00503 maptable_to_bin[(uint8_t) 'P']=14; //P Pro Proline 00504 maptable_to_bin[(uint8_t) 'S']=15; //S Ser Serine 00505 maptable_to_bin[(uint8_t) 'T']=16; //T Thr Threonine 00506 maptable_to_bin[(uint8_t) 'W']=17; //W Trp Tryptophan 00507 maptable_to_bin[(uint8_t) 'Y']=18; //Y Tyr Tyrosine 00508 maptable_to_bin[(uint8_t) 'V']=19; //V Val Valine 00509 maptable_to_bin[(uint8_t) 'B']=20; //B Asx Aspartic acid or Asparagine 00510 maptable_to_bin[(uint8_t) 'Z']=21; //Z Glx Glutamine or Glutamic acid 00511 maptable_to_bin[(uint8_t) 'X']=22; //X Xaa Any amino acid 00512 00513 maptable_to_char[0]=(uint8_t) 'A'; //A Ala Alanine 00514 maptable_to_char[1]=(uint8_t) 'R'; //R Arg Arginine 00515 maptable_to_char[2]=(uint8_t) 'N'; //N Asn Asparagine 00516 maptable_to_char[3]=(uint8_t) 'D'; //D Asp Aspartic acid 00517 maptable_to_char[4]=(uint8_t) 'C'; //C Cys Cysteine 00518 maptable_to_char[5]=(uint8_t) 'Q'; //Q Gln Glutamine 00519 maptable_to_char[6]=(uint8_t) 'E'; //E Glu Glutamic acid 00520 maptable_to_char[7]=(uint8_t) 'G'; //G Gly Glycine 00521 maptable_to_char[8]=(uint8_t) 'H'; //H His Histidine 00522 maptable_to_char[9]=(uint8_t) 'I'; //I Ile Isoleucine 00523 maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine 00524 maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine 00525 maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine 00526 maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine 00527 maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline 00528 maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine 00529 maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine 00530 maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan 00531 maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine 00532 maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine 00533 maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine 00534 maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid 00535 maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid 00536 break; 00537 00538 default: 00539 break; //leave uninitialised 00540 }; 00541 } 00542 00543 void CAlphabet::clear_histogram() 00544 { 00545 memset(histogram, 0, sizeof(histogram)); 00546 print_histogram(); 00547 } 00548 00549 int32_t CAlphabet::get_max_value_in_histogram() 00550 { 00551 int32_t max_sym=-1; 00552 for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--) 00553 { 00554 if (histogram[i]) 00555 { 00556 max_sym=i; 00557 break; 00558 } 00559 } 00560 00561 return max_sym; 00562 } 00563 00564 int32_t CAlphabet::get_num_symbols_in_histogram() 00565 { 00566 int32_t num_sym=0; 00567 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00568 { 00569 if (histogram[i]) 00570 num_sym++; 00571 } 00572 00573 return num_sym; 00574 } 00575 00576 int32_t CAlphabet::get_num_bits_in_histogram() 00577 { 00578 int32_t num_sym=get_num_symbols_in_histogram(); 00579 if (num_sym>0) 00580 return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2)); 00581 else 00582 return 0; 00583 } 00584 00585 void CAlphabet::print_histogram() 00586 { 00587 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00588 { 00589 if (histogram[i]) 00590 SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]); 00591 } 00592 } 00593 00594 bool CAlphabet::check_alphabet(bool print_error) 00595 { 00596 bool result = true; 00597 00598 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++) 00599 { 00600 if (histogram[i]>0 && valid_chars[i]==0) 00601 { 00602 result=false; 00603 break; 00604 } 00605 } 00606 00607 if (!result && print_error) 00608 { 00609 print_histogram(); 00610 SG_ERROR( "ALPHABET does not contain all symbols in histogram\n"); 00611 } 00612 00613 return result; 00614 } 00615 00616 bool CAlphabet::check_alphabet_size(bool print_error) 00617 { 00618 if (get_num_bits_in_histogram() > get_num_bits()) 00619 { 00620 if (print_error) 00621 { 00622 print_histogram(); 00623 fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ; 00624 SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n"); 00625 } 00626 return false; 00627 } 00628 else 00629 return true; 00630 00631 } 00632 00633 void CAlphabet::copy_histogram(CAlphabet* a) 00634 { 00635 memcpy(histogram, a->get_histogram(), sizeof(histogram)); 00636 } 00637 00638 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet) 00639 { 00640 00641 int32_t idx; 00642 switch (alphabet) 00643 { 00644 case DNA: 00645 idx=0; 00646 break; 00647 case RAWDNA: 00648 idx=1; 00649 break; 00650 case RNA: 00651 idx=2; 00652 break; 00653 case PROTEIN: 00654 idx=3; 00655 break; 00656 case BINARY: 00657 idx=4; 00658 break; 00659 case ALPHANUM: 00660 idx=5; 00661 break; 00662 case CUBE: 00663 idx=6; 00664 break; 00665 case RAWBYTE: 00666 idx=7; 00667 break; 00668 case IUPAC_NUCLEIC_ACID: 00669 idx=8; 00670 break; 00671 case IUPAC_AMINO_ACID: 00672 idx=9; 00673 break; 00674 case NONE: 00675 idx=10; 00676 break; 00677 case DIGIT: 00678 idx=11; 00679 break; 00680 case DIGIT2: 00681 idx=12; 00682 break; 00683 default: 00684 idx=13; 00685 break; 00686 } 00687 return alphabet_names[idx]; 00688 } 00689 00690 void CAlphabet::init() 00691 { 00692 alphabet = NONE; 00693 num_symbols = 0; 00694 num_bits = 0; 00695 00696 memset(valid_chars, 0, sizeof (valid_chars)); 00697 memset(maptable_to_bin, 0, sizeof (maptable_to_bin)); 00698 memset(maptable_to_char, 0, sizeof (maptable_to_char)); 00699 memset(histogram, 0, sizeof (histogram)); 00700 00701 00702 m_parameters->add((machine_int_t*) &alphabet, "alphabet", 00703 "Alphabet enum."); 00704 m_parameters->add(&num_symbols, "num_symbols", 00705 "Number of symbols."); 00706 m_parameters->add(&num_bits, "num_bits", 00707 "Number of bits."); 00708 00709 /* We don't need to serialize the mapping tables / they can be computed 00710 * after de-serializing. Lets not serialize the histogram for now. Doesn't 00711 * really make sense. */ 00712 00713 /* m_parameters->add_histogram(&histogram, sizeof(histogram), 00714 "histogram", 00715 "Histogram."); */ 00716 } 00717 00718 void CAlphabet::load_serializable_post(void) throw (ShogunException) 00719 { 00720 CSGObject::load_serializable_post(); 00721 00722 init_map_table(); 00723 }