00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <string.h>
00012 #include <math.h>
00013
00014 #include "features/Alphabet.h"
00015 #include "lib/io.h"
00016
00017 using namespace shogun;
00018
00019
00020 const uint8_t CAlphabet::B_A=0;
00021 const uint8_t CAlphabet::B_C=1;
00022 const uint8_t CAlphabet::B_G=2;
00023 const uint8_t CAlphabet::B_T=3;
00024 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00025 const char* CAlphabet::alphabet_names[12]={"DNA", "RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM", "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", "NONE", "UNKNOWN"};
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035 CAlphabet::CAlphabet(char* al, int32_t len)
00036 : CSGObject()
00037 {
00038 EAlphabet alpha=NONE;
00039
00040 if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00041 alpha = DNA;
00042 else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00043 alpha = RAWDNA;
00044 else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00045 alpha = RNA;
00046 else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00047 alpha = PROTEIN;
00048 else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00049 alpha = BINARY;
00050 else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00051 alpha = ALPHANUM;
00052 else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00053 alpha = CUBE;
00054 else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00055 (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00056 alpha = RAWBYTE;
00057 else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00058 alpha = IUPAC_NUCLEIC_ACID;
00059 else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00060 alpha = IUPAC_AMINO_ACID;
00061 else {
00062 SG_ERROR( "unknown alphabet %s\n", al);
00063 }
00064
00065 set_alphabet(alpha);
00066 }
00067
00068 CAlphabet::CAlphabet(EAlphabet alpha)
00069 : CSGObject()
00070 {
00071 set_alphabet(alpha);
00072 }
00073
00074 CAlphabet::CAlphabet(CAlphabet* a)
00075 : CSGObject()
00076 {
00077 ASSERT(a);
00078 set_alphabet(a->get_alphabet());
00079 copy_histogram(a);
00080 }
00081
00082 CAlphabet::~CAlphabet()
00083 {
00084 }
00085
00086 bool CAlphabet::set_alphabet(EAlphabet alpha)
00087 {
00088 bool result=true;
00089 alphabet=alpha;
00090
00091 switch (alphabet)
00092 {
00093 case DNA:
00094 case RAWDNA:
00095 num_symbols = 4;
00096 break;
00097 case RNA:
00098 num_symbols = 4;
00099 break;
00100 case PROTEIN:
00101 num_symbols = 26;
00102 break;
00103 case BINARY:
00104 num_symbols = 2;
00105 break;
00106 case ALPHANUM:
00107 num_symbols = 36;
00108 break;
00109 case CUBE:
00110 num_symbols = 6;
00111 break;
00112 case RAWBYTE:
00113 num_symbols = 256;
00114 break;
00115 case IUPAC_NUCLEIC_ACID:
00116 num_symbols = 16;
00117 break;
00118 case IUPAC_AMINO_ACID:
00119 num_symbols = 23;
00120 break;
00121 case NONE:
00122 num_symbols = 0;
00123 break;
00124 default:
00125 num_symbols = 0;
00126 result=false;
00127 break;
00128 }
00129
00130 num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00131 init_map_table();
00132 clear_histogram();
00133
00134 SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00135
00136 return result;
00137 }
00138
00139 void CAlphabet::init_map_table()
00140 {
00141 int32_t i;
00142 for (i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00143 {
00144 maptable_to_bin[i] = MAPTABLE_UNDEF;
00145 maptable_to_char[i] = MAPTABLE_UNDEF;
00146 valid_chars[i] = false;
00147 }
00148
00149 switch (alphabet)
00150 {
00151 case CUBE:
00152 valid_chars[(uint8_t) '1']=true;
00153 valid_chars[(uint8_t) '2']=true;
00154 valid_chars[(uint8_t) '3']=true;
00155 valid_chars[(uint8_t) '4']=true;
00156 valid_chars[(uint8_t) '5']=true;
00157 valid_chars[(uint8_t) '6']=true;
00158
00159 maptable_to_bin[(uint8_t) '1']=0;
00160 maptable_to_bin[(uint8_t) '2']=1;
00161 maptable_to_bin[(uint8_t) '3']=2;
00162 maptable_to_bin[(uint8_t) '4']=3;
00163 maptable_to_bin[(uint8_t) '5']=4;
00164 maptable_to_bin[(uint8_t) '6']=5;
00165
00166 maptable_to_char[(uint8_t) 0]='1';
00167 maptable_to_char[(uint8_t) 1]='2';
00168 maptable_to_char[(uint8_t) 2]='3';
00169 maptable_to_char[(uint8_t) 3]='4';
00170 maptable_to_char[(uint8_t) 4]='5';
00171 maptable_to_char[(uint8_t) 5]='6';
00172 break;
00173
00174 case PROTEIN:
00175 {
00176 int32_t skip=0 ;
00177 for (i=0; i<21; i++)
00178 {
00179 if (i==1) skip++ ;
00180 if (i==8) skip++ ;
00181 if (i==12) skip++ ;
00182 if (i==17) skip++ ;
00183 valid_chars['A'+i+skip]=true;
00184 maptable_to_bin['A'+i+skip]=i ;
00185 maptable_to_char[i]='A'+i+skip ;
00186 } ;
00187 } ;
00188 break;
00189
00190 case BINARY:
00191 valid_chars[(uint8_t) '0']=true;
00192 valid_chars[(uint8_t) '1']=true;
00193
00194 maptable_to_bin[(uint8_t) '0']=0;
00195 maptable_to_bin[(uint8_t) '1']=1;
00196
00197 maptable_to_char[0]=(uint8_t) '0';
00198 maptable_to_char[1]=(uint8_t) '1';
00199 break;
00200
00201 case ALPHANUM:
00202 {
00203 for (i=0; i<26; i++)
00204 {
00205 valid_chars['A'+i]=true;
00206 maptable_to_bin['A'+i]=i ;
00207 maptable_to_char[i]='A'+i ;
00208 } ;
00209 for (i=0; i<10; i++)
00210 {
00211 valid_chars['0'+i]=true;
00212 maptable_to_bin['0'+i]=26+i ;
00213 maptable_to_char[26+i]='0'+i ;
00214 } ;
00215 } ;
00216 break;
00217
00218 case RAWBYTE:
00219 {
00220
00221 for (i=0; i<256; i++)
00222 {
00223 valid_chars[i]=true;
00224 maptable_to_bin[i]=i;
00225 maptable_to_char[i]=i;
00226 }
00227 }
00228 break;
00229
00230 case DNA:
00231 valid_chars[(uint8_t) 'A']=true;
00232 valid_chars[(uint8_t) 'C']=true;
00233 valid_chars[(uint8_t) 'G']=true;
00234 valid_chars[(uint8_t) 'T']=true;
00235
00236 maptable_to_bin[(uint8_t) 'A']=B_A;
00237 maptable_to_bin[(uint8_t) 'C']=B_C;
00238 maptable_to_bin[(uint8_t) 'G']=B_G;
00239 maptable_to_bin[(uint8_t) 'T']=B_T;
00240
00241 maptable_to_char[B_A]='A';
00242 maptable_to_char[B_C]='C';
00243 maptable_to_char[B_G]='G';
00244 maptable_to_char[B_T]='T';
00245 break;
00246 case RAWDNA:
00247 {
00248
00249 for (i=0; i<4; i++)
00250 {
00251 valid_chars[i]=true;
00252 maptable_to_bin[i]=i;
00253 maptable_to_char[i]=i;
00254 }
00255 }
00256 break;
00257
00258 case RNA:
00259 valid_chars[(uint8_t) 'A']=true;
00260 valid_chars[(uint8_t) 'C']=true;
00261 valid_chars[(uint8_t) 'G']=true;
00262 valid_chars[(uint8_t) 'U']=true;
00263
00264 maptable_to_bin[(uint8_t) 'A']=B_A;
00265 maptable_to_bin[(uint8_t) 'C']=B_C;
00266 maptable_to_bin[(uint8_t) 'G']=B_G;
00267 maptable_to_bin[(uint8_t) 'U']=B_T;
00268
00269 maptable_to_char[B_A]='A';
00270 maptable_to_char[B_C]='C';
00271 maptable_to_char[B_G]='G';
00272 maptable_to_char[B_T]='U';
00273 break;
00274
00275 case IUPAC_NUCLEIC_ACID:
00276 valid_chars[(uint8_t) 'A']=true;
00277 valid_chars[(uint8_t) 'C']=true;
00278 valid_chars[(uint8_t) 'G']=true;
00279 valid_chars[(uint8_t) 'T']=true;
00280 valid_chars[(uint8_t) 'U']=true;
00281 valid_chars[(uint8_t) 'R']=true;
00282 valid_chars[(uint8_t) 'Y']=true;
00283 valid_chars[(uint8_t) 'M']=true;
00284 valid_chars[(uint8_t) 'K']=true;
00285 valid_chars[(uint8_t) 'W']=true;
00286 valid_chars[(uint8_t) 'S']=true;
00287 valid_chars[(uint8_t) 'B']=true;
00288 valid_chars[(uint8_t) 'D']=true;
00289 valid_chars[(uint8_t) 'H']=true;
00290 valid_chars[(uint8_t) 'V']=true;
00291 valid_chars[(uint8_t) 'N']=true;
00292
00293 maptable_to_bin[(uint8_t) 'A']=0;
00294 maptable_to_bin[(uint8_t) 'C']=1;
00295 maptable_to_bin[(uint8_t) 'G']=2;
00296 maptable_to_bin[(uint8_t) 'T']=3;
00297 maptable_to_bin[(uint8_t) 'U']=4;
00298 maptable_to_bin[(uint8_t) 'R']=5;
00299 maptable_to_bin[(uint8_t) 'Y']=6;
00300 maptable_to_bin[(uint8_t) 'M']=7;
00301 maptable_to_bin[(uint8_t) 'K']=8;
00302 maptable_to_bin[(uint8_t) 'W']=9;
00303 maptable_to_bin[(uint8_t) 'S']=10;
00304 maptable_to_bin[(uint8_t) 'B']=11;
00305 maptable_to_bin[(uint8_t) 'D']=12;
00306 maptable_to_bin[(uint8_t) 'H']=13;
00307 maptable_to_bin[(uint8_t) 'V']=14;
00308 maptable_to_bin[(uint8_t) 'N']=15;
00309
00310 maptable_to_char[0]=(uint8_t) 'A';
00311 maptable_to_char[1]=(uint8_t) 'C';
00312 maptable_to_char[2]=(uint8_t) 'G';
00313 maptable_to_char[3]=(uint8_t) 'T';
00314 maptable_to_char[4]=(uint8_t) 'U';
00315 maptable_to_char[5]=(uint8_t) 'R';
00316 maptable_to_char[6]=(uint8_t) 'Y';
00317 maptable_to_char[7]=(uint8_t) 'M';
00318 maptable_to_char[8]=(uint8_t) 'K';
00319 maptable_to_char[9]=(uint8_t) 'W';
00320 maptable_to_char[10]=(uint8_t) 'S';
00321 maptable_to_char[11]=(uint8_t) 'B';
00322 maptable_to_char[12]=(uint8_t) 'D';
00323 maptable_to_char[13]=(uint8_t) 'H';
00324 maptable_to_char[14]=(uint8_t) 'V';
00325 maptable_to_char[15]=(uint8_t) 'N';
00326 break;
00327
00328 case IUPAC_AMINO_ACID:
00329 valid_chars[(uint8_t) 'A']=true;
00330 valid_chars[(uint8_t) 'R']=true;
00331 valid_chars[(uint8_t) 'N']=true;
00332 valid_chars[(uint8_t) 'D']=true;
00333 valid_chars[(uint8_t) 'C']=true;
00334 valid_chars[(uint8_t) 'Q']=true;
00335 valid_chars[(uint8_t) 'E']=true;
00336 valid_chars[(uint8_t) 'G']=true;
00337 valid_chars[(uint8_t) 'H']=true;
00338 valid_chars[(uint8_t) 'I']=true;
00339 valid_chars[(uint8_t) 'L']=true;
00340 valid_chars[(uint8_t) 'K']=true;
00341 valid_chars[(uint8_t) 'M']=true;
00342 valid_chars[(uint8_t) 'F']=true;
00343 valid_chars[(uint8_t) 'P']=true;
00344 valid_chars[(uint8_t) 'S']=true;
00345 valid_chars[(uint8_t) 'T']=true;
00346 valid_chars[(uint8_t) 'W']=true;
00347 valid_chars[(uint8_t) 'Y']=true;
00348 valid_chars[(uint8_t) 'V']=true;
00349 valid_chars[(uint8_t) 'B']=true;
00350 valid_chars[(uint8_t) 'Z']=true;
00351 valid_chars[(uint8_t) 'X']=true;
00352
00353 maptable_to_bin[(uint8_t) 'A']=0;
00354 maptable_to_bin[(uint8_t) 'R']=1;
00355 maptable_to_bin[(uint8_t) 'N']=2;
00356 maptable_to_bin[(uint8_t) 'D']=3;
00357 maptable_to_bin[(uint8_t) 'C']=4;
00358 maptable_to_bin[(uint8_t) 'Q']=5;
00359 maptable_to_bin[(uint8_t) 'E']=6;
00360 maptable_to_bin[(uint8_t) 'G']=7;
00361 maptable_to_bin[(uint8_t) 'H']=8;
00362 maptable_to_bin[(uint8_t) 'I']=9;
00363 maptable_to_bin[(uint8_t) 'L']=10;
00364 maptable_to_bin[(uint8_t) 'K']=11;
00365 maptable_to_bin[(uint8_t) 'M']=12;
00366 maptable_to_bin[(uint8_t) 'F']=13;
00367 maptable_to_bin[(uint8_t) 'P']=14;
00368 maptable_to_bin[(uint8_t) 'S']=15;
00369 maptable_to_bin[(uint8_t) 'T']=16;
00370 maptable_to_bin[(uint8_t) 'W']=17;
00371 maptable_to_bin[(uint8_t) 'Y']=18;
00372 maptable_to_bin[(uint8_t) 'V']=19;
00373 maptable_to_bin[(uint8_t) 'B']=20;
00374 maptable_to_bin[(uint8_t) 'Z']=21;
00375 maptable_to_bin[(uint8_t) 'X']=22;
00376
00377 maptable_to_char[0]=(uint8_t) 'A';
00378 maptable_to_char[1]=(uint8_t) 'R';
00379 maptable_to_char[2]=(uint8_t) 'N';
00380 maptable_to_char[3]=(uint8_t) 'D';
00381 maptable_to_char[4]=(uint8_t) 'C';
00382 maptable_to_char[5]=(uint8_t) 'Q';
00383 maptable_to_char[6]=(uint8_t) 'E';
00384 maptable_to_char[7]=(uint8_t) 'G';
00385 maptable_to_char[8]=(uint8_t) 'H';
00386 maptable_to_char[9]=(uint8_t) 'I';
00387 maptable_to_char[10]=(uint8_t) 'L';
00388 maptable_to_char[11]=(uint8_t) 'K';
00389 maptable_to_char[12]=(uint8_t) 'M';
00390 maptable_to_char[13]=(uint8_t) 'F';
00391 maptable_to_char[14]=(uint8_t) 'P';
00392 maptable_to_char[15]=(uint8_t) 'S';
00393 maptable_to_char[16]=(uint8_t) 'T';
00394 maptable_to_char[17]=(uint8_t) 'W';
00395 maptable_to_char[18]=(uint8_t) 'Y';
00396 maptable_to_char[19]=(uint8_t) 'V';
00397 maptable_to_char[20]=(uint8_t) 'B';
00398 maptable_to_char[21]=(uint8_t) 'Z';
00399 maptable_to_char[22]=(uint8_t) 'X';
00400 break;
00401
00402 default:
00403 break;
00404 };
00405 }
00406
00407 void CAlphabet::clear_histogram()
00408 {
00409 memset(histogram, 0, sizeof(histogram));
00410 print_histogram();
00411 }
00412
00413 int32_t CAlphabet::get_max_value_in_histogram()
00414 {
00415 int32_t max_sym=-1;
00416 for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00417 {
00418 if (histogram[i])
00419 {
00420 max_sym=i;
00421 break;
00422 }
00423 }
00424
00425 return max_sym;
00426 }
00427
00428 int32_t CAlphabet::get_num_symbols_in_histogram()
00429 {
00430 int32_t num_sym=0;
00431 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00432 {
00433 if (histogram[i])
00434 num_sym++;
00435 }
00436
00437 return num_sym;
00438 }
00439
00440 int32_t CAlphabet::get_num_bits_in_histogram()
00441 {
00442 int32_t num_sym=get_num_symbols_in_histogram();
00443 if (num_sym>0)
00444 return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00445 else
00446 return 0;
00447 }
00448
00449 void CAlphabet::print_histogram()
00450 {
00451 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00452 {
00453 if (histogram[i])
00454 SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00455 }
00456 }
00457
00458 bool CAlphabet::check_alphabet(bool print_error)
00459 {
00460 bool result = true;
00461
00462 for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00463 {
00464 if (histogram[i]>0 && valid_chars[i]==0)
00465 {
00466 result=false;
00467 break;
00468 }
00469 }
00470
00471 if (!result && print_error)
00472 {
00473 print_histogram();
00474 SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00475 }
00476
00477 return result;
00478 }
00479
00480 bool CAlphabet::check_alphabet_size(bool print_error)
00481 {
00482 if (get_num_bits_in_histogram() > get_num_bits())
00483 {
00484 if (print_error)
00485 {
00486 print_histogram();
00487 fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00488 SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00489 }
00490 return false;
00491 }
00492 else
00493 return true;
00494
00495 }
00496
00497 void CAlphabet::copy_histogram(CAlphabet* a)
00498 {
00499 memcpy(histogram, a->get_histogram(), sizeof(histogram));
00500 }
00501
00502 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00503 {
00504
00505 int32_t idx;
00506 switch (alphabet)
00507 {
00508 case DNA:
00509 idx=0;
00510 break;
00511 case RAWDNA:
00512 idx=1;
00513 break;
00514 case RNA:
00515 idx=2;
00516 break;
00517 case PROTEIN:
00518 idx=3;
00519 break;
00520 case BINARY:
00521 idx=4;
00522 break;
00523 case ALPHANUM:
00524 idx=5;
00525 break;
00526 case CUBE:
00527 idx=6;
00528 break;
00529 case RAWBYTE:
00530 idx=7;
00531 break;
00532 case IUPAC_NUCLEIC_ACID:
00533 idx=8;
00534 break;
00535 case IUPAC_AMINO_ACID:
00536 idx=9;
00537 break;
00538 case NONE:
00539 idx=10;
00540 break;
00541 default:
00542 idx=11;
00543 break;
00544 }
00545 return alphabet_names[idx];
00546 }
00547
00548 #ifdef HAVE_BOOST_SERIALIZATION
00549 std::string CAlphabet::to_string() const
00550 {
00551 std::ostringstream s;
00552 ::boost::archive::text_oarchive oa(s);
00553 oa << this;
00554 return s.str();
00555 }
00556
00557 void CAlphabet::from_string(std::string str)
00558 {
00559 std::istringstream is(str);
00560 ::boost::archive::text_iarchive ia(is);
00561
00562
00563 CAlphabet* tmp = const_cast<CAlphabet*>(this);
00564
00565 ia >> tmp;
00566 *this = *tmp;
00567 }
00568
00569 void CAlphabet::to_file(std::string filename) const
00570 {
00571 std::ofstream os(filename.c_str(), std::ios::binary);
00572 ::boost::archive::binary_oarchive oa(os);
00573 oa << this;
00574 }
00575
00576 void CAlphabet::from_file(std::string filename)
00577 {
00578 std::ifstream is(filename.c_str(), std::ios::binary);
00579 ::boost::archive::binary_iarchive ia(is);
00580 CAlphabet* tmp= const_cast<CAlphabet*>(this);
00581 ia >> tmp;
00582 }
00583 #endif //HAVE_BOOST_SERIALIZATION