Alphabet.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <math.h>
00013 
00014 #include "features/Alphabet.h"
00015 #include "lib/io.h"
00016 
00017 using namespace shogun;
00018 
00019 //define numbers for the bases
00020 const uint8_t CAlphabet::B_A=0;
00021 const uint8_t CAlphabet::B_C=1;
00022 const uint8_t CAlphabet::B_G=2;
00023 const uint8_t CAlphabet::B_T=3;
00024 const uint8_t CAlphabet::MAPTABLE_UNDEF=0xff;
00025 const char* CAlphabet::alphabet_names[12]={"DNA", "RAWDNA", "RNA", "PROTEIN", "BINARY", "ALPHANUM", "CUBE", "RAW", "IUPAC_NUCLEIC_ACID", "IUPAC_AMINO_ACID", "NONE", "UNKNOWN"};
00026 
00027 /*
00028 CAlphabet::CAlphabet()
00029 : CSGObject()
00030 {
00031 
00032 }
00033 */
00034 
00035 CAlphabet::CAlphabet(char* al, int32_t len)
00036 : CSGObject()
00037 {
00038     EAlphabet alpha=NONE;
00039 
00040     if (len>=(int32_t) strlen("DNA") && !strncmp(al, "DNA", strlen("DNA")))
00041         alpha = DNA;
00042     else if (len>=(int32_t) strlen("RAWDNA") && !strncmp(al, "RAWDNA", strlen("RAWDNA")))
00043         alpha = RAWDNA;
00044     else if (len>=(int32_t) strlen("RNA") && !strncmp(al, "RNA", strlen("RNA")))
00045         alpha = RNA;
00046     else if (len>=(int32_t) strlen("PROTEIN") && !strncmp(al, "PROTEIN", strlen("PROTEIN")))
00047         alpha = PROTEIN;
00048     else if (len>=(int32_t) strlen("BINARY") && !strncmp(al, "BINARY", strlen("IBINARY")))
00049         alpha = BINARY;
00050     else if (len>=(int32_t) strlen("ALPHANUM") && !strncmp(al, "ALPHANUM", strlen("ALPHANUM")))
00051         alpha = ALPHANUM;
00052     else if (len>=(int32_t) strlen("CUBE") && !strncmp(al, "CUBE", strlen("CUBE")))
00053         alpha = CUBE;
00054     else if ((len>=(int32_t) strlen("BYTE") && !strncmp(al, "BYTE", strlen("BYTE"))) ||
00055             (len>=(int32_t) strlen("RAW") && !strncmp(al, "RAW", strlen("RAW"))))
00056         alpha = RAWBYTE;
00057     else if (len>=(int32_t) strlen("IUPAC_NUCLEIC_ACID") && !strncmp(al, "IUPAC_NUCLEIC_ACID", strlen("IUPAC_NUCLEIC_ACID")))
00058         alpha = IUPAC_NUCLEIC_ACID;
00059     else if (len>=(int32_t) strlen("IUPAC_AMINO_ACID") && !strncmp(al, "IUPAC_AMINO_ACID", strlen("IUPAC_AMINO_ACID")))
00060         alpha = IUPAC_AMINO_ACID;
00061     else {
00062       SG_ERROR( "unknown alphabet %s\n", al);
00063    }
00064 
00065     set_alphabet(alpha);
00066 }
00067 
00068 CAlphabet::CAlphabet(EAlphabet alpha)
00069 : CSGObject()
00070 {
00071     set_alphabet(alpha);
00072 }
00073 
00074 CAlphabet::CAlphabet(CAlphabet* a)
00075 : CSGObject()
00076 {
00077     ASSERT(a);
00078     set_alphabet(a->get_alphabet());
00079     copy_histogram(a);
00080 }
00081 
00082 CAlphabet::~CAlphabet()
00083 {
00084 }
00085 
00086 bool CAlphabet::set_alphabet(EAlphabet alpha)
00087 {
00088     bool result=true;
00089     alphabet=alpha;
00090 
00091     switch (alphabet)
00092     {
00093         case DNA:
00094         case RAWDNA:
00095             num_symbols = 4;
00096             break;
00097         case RNA:
00098             num_symbols = 4;
00099             break;
00100         case PROTEIN:
00101             num_symbols = 26;
00102             break;
00103         case BINARY:
00104             num_symbols = 2;
00105             break;
00106         case ALPHANUM:
00107             num_symbols = 36;
00108             break;
00109         case CUBE:
00110             num_symbols = 6;
00111             break;
00112         case RAWBYTE:
00113             num_symbols = 256;
00114             break;
00115         case IUPAC_NUCLEIC_ACID:
00116             num_symbols = 16;
00117             break;
00118         case IUPAC_AMINO_ACID:
00119             num_symbols = 23;
00120             break;
00121         case NONE:
00122             num_symbols = 0;
00123             break;
00124         default:
00125             num_symbols = 0;
00126             result=false;
00127             break;
00128     }
00129 
00130     num_bits=(int32_t) ceil(log((float64_t) num_symbols)/log((float64_t) 2));
00131     init_map_table();
00132     clear_histogram();
00133 
00134     SG_DEBUG( "initialised alphabet %s\n", get_alphabet_name(alphabet));
00135 
00136     return result;
00137 }
00138 
00139 void CAlphabet::init_map_table()
00140 {
00141     int32_t i;
00142     for (i=0; i<(1<<(8*sizeof(uint8_t))); i++)
00143     {
00144         maptable_to_bin[i] = MAPTABLE_UNDEF;
00145         maptable_to_char[i] = MAPTABLE_UNDEF;
00146         valid_chars[i] = false;
00147     }
00148 
00149     switch (alphabet)
00150     {
00151         case CUBE:
00152             valid_chars[(uint8_t) '1']=true;
00153             valid_chars[(uint8_t) '2']=true;
00154             valid_chars[(uint8_t) '3']=true;
00155             valid_chars[(uint8_t) '4']=true;
00156             valid_chars[(uint8_t) '5']=true;
00157             valid_chars[(uint8_t) '6']=true;    //Translation '123456' -> 012345
00158 
00159             maptable_to_bin[(uint8_t) '1']=0;
00160             maptable_to_bin[(uint8_t) '2']=1;
00161             maptable_to_bin[(uint8_t) '3']=2;
00162             maptable_to_bin[(uint8_t) '4']=3;
00163             maptable_to_bin[(uint8_t) '5']=4;
00164             maptable_to_bin[(uint8_t) '6']=5;   //Translation '123456' -> 012345
00165 
00166             maptable_to_char[(uint8_t) 0]='1';
00167             maptable_to_char[(uint8_t) 1]='2';
00168             maptable_to_char[(uint8_t) 2]='3';
00169             maptable_to_char[(uint8_t) 3]='4';
00170             maptable_to_char[(uint8_t) 4]='5';
00171             maptable_to_char[(uint8_t) 5]='6';  //Translation 012345->'123456'
00172             break;
00173 
00174         case PROTEIN:
00175             {
00176                 int32_t skip=0 ;
00177                 for (i=0; i<21; i++)
00178                 {
00179                     if (i==1) skip++ ;
00180                     if (i==8) skip++ ;
00181                     if (i==12) skip++ ;
00182                     if (i==17) skip++ ;
00183                     valid_chars['A'+i+skip]=true;
00184                     maptable_to_bin['A'+i+skip]=i ;
00185                     maptable_to_char[i]='A'+i+skip ;
00186                 } ;                   //Translation 012345->acde...xy -- the protein code
00187             } ;
00188             break;
00189 
00190         case BINARY:
00191             valid_chars[(uint8_t) '0']=true;
00192             valid_chars[(uint8_t) '1']=true;
00193 
00194             maptable_to_bin[(uint8_t) '0']=0;
00195             maptable_to_bin[(uint8_t) '1']=1;
00196 
00197             maptable_to_char[0]=(uint8_t) '0';
00198             maptable_to_char[1]=(uint8_t) '1';
00199             break;
00200 
00201         case ALPHANUM:
00202             {
00203                 for (i=0; i<26; i++)
00204                 {
00205                     valid_chars['A'+i]=true;
00206                     maptable_to_bin['A'+i]=i ;
00207                     maptable_to_char[i]='A'+i ;
00208                 } ;
00209                 for (i=0; i<10; i++)
00210                 {
00211                     valid_chars['0'+i]=true;
00212                     maptable_to_bin['0'+i]=26+i ;
00213                     maptable_to_char[26+i]='0'+i ;
00214                 } ;        //Translation 012345->acde...xy0123456789
00215             } ;
00216             break;
00217 
00218         case RAWBYTE:
00219             {
00220                 //identity
00221                 for (i=0; i<256; i++)
00222                 {
00223                     valid_chars[i]=true;
00224                     maptable_to_bin[i]=i;
00225                     maptable_to_char[i]=i;
00226                 }
00227             }
00228             break;
00229 
00230         case DNA:
00231             valid_chars[(uint8_t) 'A']=true;
00232             valid_chars[(uint8_t) 'C']=true;
00233             valid_chars[(uint8_t) 'G']=true;
00234             valid_chars[(uint8_t) 'T']=true;
00235 
00236             maptable_to_bin[(uint8_t) 'A']=B_A;
00237             maptable_to_bin[(uint8_t) 'C']=B_C;
00238             maptable_to_bin[(uint8_t) 'G']=B_G;
00239             maptable_to_bin[(uint8_t) 'T']=B_T;
00240 
00241             maptable_to_char[B_A]='A';
00242             maptable_to_char[B_C]='C';
00243             maptable_to_char[B_G]='G';
00244             maptable_to_char[B_T]='T';
00245             break;
00246         case RAWDNA:
00247             {
00248                 //identity
00249                 for (i=0; i<4; i++)
00250                 {
00251                     valid_chars[i]=true;
00252                     maptable_to_bin[i]=i;
00253                     maptable_to_char[i]=i;
00254                 }
00255             }
00256             break;
00257 
00258         case RNA:
00259             valid_chars[(uint8_t) 'A']=true;
00260             valid_chars[(uint8_t) 'C']=true;
00261             valid_chars[(uint8_t) 'G']=true;
00262             valid_chars[(uint8_t) 'U']=true;
00263 
00264             maptable_to_bin[(uint8_t) 'A']=B_A;
00265             maptable_to_bin[(uint8_t) 'C']=B_C;
00266             maptable_to_bin[(uint8_t) 'G']=B_G;
00267             maptable_to_bin[(uint8_t) 'U']=B_T;
00268 
00269             maptable_to_char[B_A]='A';
00270             maptable_to_char[B_C]='C';
00271             maptable_to_char[B_G]='G';
00272             maptable_to_char[B_T]='U';
00273             break;
00274 
00275         case IUPAC_NUCLEIC_ACID:
00276             valid_chars[(uint8_t) 'A']=true; // A   Adenine
00277             valid_chars[(uint8_t) 'C']=true; // C   Cytosine
00278             valid_chars[(uint8_t) 'G']=true; // G   Guanine
00279             valid_chars[(uint8_t) 'T']=true; // T   Thymine
00280             valid_chars[(uint8_t) 'U']=true; // U   Uracil
00281             valid_chars[(uint8_t) 'R']=true; // R   Purine (A or G)
00282             valid_chars[(uint8_t) 'Y']=true; // Y   Pyrimidine (C, T, or U)
00283             valid_chars[(uint8_t) 'M']=true; // M   C or A
00284             valid_chars[(uint8_t) 'K']=true; // K   T, U, or G
00285             valid_chars[(uint8_t) 'W']=true; // W   T, U, or A
00286             valid_chars[(uint8_t) 'S']=true; // S   C or G
00287             valid_chars[(uint8_t) 'B']=true; // B   C, T, U, or G (not A)
00288             valid_chars[(uint8_t) 'D']=true; // D   A, T, U, or G (not C)
00289             valid_chars[(uint8_t) 'H']=true; // H   A, T, U, or C (not G)
00290             valid_chars[(uint8_t) 'V']=true; // V   A, C, or G (not T, not U)
00291             valid_chars[(uint8_t) 'N']=true; // N   Any base (A, C, G, T, or U)
00292 
00293             maptable_to_bin[(uint8_t) 'A']=0; // A  Adenine
00294             maptable_to_bin[(uint8_t) 'C']=1; // C  Cytosine
00295             maptable_to_bin[(uint8_t) 'G']=2; // G  Guanine
00296             maptable_to_bin[(uint8_t) 'T']=3; // T  Thymine
00297             maptable_to_bin[(uint8_t) 'U']=4; // U  Uracil
00298             maptable_to_bin[(uint8_t) 'R']=5; // R  Purine (A or G)
00299             maptable_to_bin[(uint8_t) 'Y']=6; // Y  Pyrimidine (C, T, or U)
00300             maptable_to_bin[(uint8_t) 'M']=7; // M  C or A
00301             maptable_to_bin[(uint8_t) 'K']=8; // K  T, U, or G
00302             maptable_to_bin[(uint8_t) 'W']=9; // W  T, U, or A
00303             maptable_to_bin[(uint8_t) 'S']=10; // S C or G
00304             maptable_to_bin[(uint8_t) 'B']=11; // B C, T, U, or G (not A)
00305             maptable_to_bin[(uint8_t) 'D']=12; // D A, T, U, or G (not C)
00306             maptable_to_bin[(uint8_t) 'H']=13; // H A, T, U, or C (not G)
00307             maptable_to_bin[(uint8_t) 'V']=14; // V A, C, or G (not T, not U)
00308             maptable_to_bin[(uint8_t) 'N']=15; // N Any base (A, C, G, T, or U)
00309 
00310             maptable_to_char[0]=(uint8_t) 'A'; // A Adenine
00311             maptable_to_char[1]=(uint8_t) 'C'; // C Cytosine
00312             maptable_to_char[2]=(uint8_t) 'G'; // G Guanine
00313             maptable_to_char[3]=(uint8_t) 'T'; // T Thymine
00314             maptable_to_char[4]=(uint8_t) 'U'; // U Uracil
00315             maptable_to_char[5]=(uint8_t) 'R'; // R Purine (A or G)
00316             maptable_to_char[6]=(uint8_t) 'Y'; // Y Pyrimidine (C, T, or U)
00317             maptable_to_char[7]=(uint8_t) 'M'; // M C or A
00318             maptable_to_char[8]=(uint8_t) 'K'; // K T, U, or G
00319             maptable_to_char[9]=(uint8_t) 'W'; // W T, U, or A
00320             maptable_to_char[10]=(uint8_t) 'S'; // S    C or G
00321             maptable_to_char[11]=(uint8_t) 'B'; // B    C, T, U, or G (not A)
00322             maptable_to_char[12]=(uint8_t) 'D'; // D    A, T, U, or G (not C)
00323             maptable_to_char[13]=(uint8_t) 'H'; // H    A, T, U, or C (not G)
00324             maptable_to_char[14]=(uint8_t) 'V'; // V    A, C, or G (not T, not U)
00325             maptable_to_char[15]=(uint8_t) 'N'; // N    Any base (A, C, G, T, or U)
00326             break;
00327 
00328         case IUPAC_AMINO_ACID:
00329             valid_chars[(uint8_t) 'A']=true; //A    Ala Alanine
00330             valid_chars[(uint8_t) 'R']=true; //R    Arg Arginine
00331             valid_chars[(uint8_t) 'N']=true; //N    Asn Asparagine
00332             valid_chars[(uint8_t) 'D']=true; //D    Asp Aspartic acid
00333             valid_chars[(uint8_t) 'C']=true; //C    Cys Cysteine
00334             valid_chars[(uint8_t) 'Q']=true; //Q    Gln Glutamine
00335             valid_chars[(uint8_t) 'E']=true; //E    Glu Glutamic acid
00336             valid_chars[(uint8_t) 'G']=true; //G    Gly Glycine
00337             valid_chars[(uint8_t) 'H']=true; //H    His Histidine
00338             valid_chars[(uint8_t) 'I']=true; //I    Ile Isoleucine
00339             valid_chars[(uint8_t) 'L']=true; //L    Leu Leucine
00340             valid_chars[(uint8_t) 'K']=true; //K    Lys Lysine
00341             valid_chars[(uint8_t) 'M']=true; //M    Met Methionine
00342             valid_chars[(uint8_t) 'F']=true; //F    Phe Phenylalanine
00343             valid_chars[(uint8_t) 'P']=true; //P    Pro Proline
00344             valid_chars[(uint8_t) 'S']=true; //S    Ser Serine
00345             valid_chars[(uint8_t) 'T']=true; //T    Thr Threonine
00346             valid_chars[(uint8_t) 'W']=true; //W    Trp Tryptophan
00347             valid_chars[(uint8_t) 'Y']=true; //Y    Tyr Tyrosine
00348             valid_chars[(uint8_t) 'V']=true; //V    Val Valine
00349             valid_chars[(uint8_t) 'B']=true; //B    Asx Aspartic acid or Asparagine
00350             valid_chars[(uint8_t) 'Z']=true; //Z    Glx Glutamine or Glutamic acid
00351             valid_chars[(uint8_t) 'X']=true; //X    Xaa Any amino acid
00352 
00353             maptable_to_bin[(uint8_t) 'A']=0;  //A  Ala Alanine
00354             maptable_to_bin[(uint8_t) 'R']=1;  //R  Arg Arginine
00355             maptable_to_bin[(uint8_t) 'N']=2;  //N  Asn Asparagine
00356             maptable_to_bin[(uint8_t) 'D']=3;  //D  Asp Aspartic acid
00357             maptable_to_bin[(uint8_t) 'C']=4;  //C  Cys Cysteine
00358             maptable_to_bin[(uint8_t) 'Q']=5;  //Q  Gln Glutamine
00359             maptable_to_bin[(uint8_t) 'E']=6;  //E  Glu Glutamic acid
00360             maptable_to_bin[(uint8_t) 'G']=7;  //G  Gly Glycine
00361             maptable_to_bin[(uint8_t) 'H']=8;  //H  His Histidine
00362             maptable_to_bin[(uint8_t) 'I']=9;  //I  Ile Isoleucine
00363             maptable_to_bin[(uint8_t) 'L']=10; //L  Leu Leucine
00364             maptable_to_bin[(uint8_t) 'K']=11; //K  Lys Lysine
00365             maptable_to_bin[(uint8_t) 'M']=12; //M  Met Methionine
00366             maptable_to_bin[(uint8_t) 'F']=13; //F  Phe Phenylalanine
00367             maptable_to_bin[(uint8_t) 'P']=14; //P  Pro Proline
00368             maptable_to_bin[(uint8_t) 'S']=15; //S  Ser Serine
00369             maptable_to_bin[(uint8_t) 'T']=16; //T  Thr Threonine
00370             maptable_to_bin[(uint8_t) 'W']=17; //W  Trp Tryptophan
00371             maptable_to_bin[(uint8_t) 'Y']=18; //Y  Tyr Tyrosine
00372             maptable_to_bin[(uint8_t) 'V']=19; //V  Val Valine
00373             maptable_to_bin[(uint8_t) 'B']=20; //B  Asx Aspartic acid or Asparagine
00374             maptable_to_bin[(uint8_t) 'Z']=21; //Z  Glx Glutamine or Glutamic acid
00375             maptable_to_bin[(uint8_t) 'X']=22; //X  Xaa Any amino acid
00376 
00377             maptable_to_char[0]=(uint8_t) 'A';  //A Ala Alanine
00378             maptable_to_char[1]=(uint8_t) 'R';  //R Arg Arginine
00379             maptable_to_char[2]=(uint8_t) 'N';  //N Asn Asparagine
00380             maptable_to_char[3]=(uint8_t) 'D';  //D Asp Aspartic acid
00381             maptable_to_char[4]=(uint8_t) 'C';  //C Cys Cysteine
00382             maptable_to_char[5]=(uint8_t) 'Q';  //Q Gln Glutamine
00383             maptable_to_char[6]=(uint8_t) 'E';  //E Glu Glutamic acid
00384             maptable_to_char[7]=(uint8_t) 'G';  //G Gly Glycine
00385             maptable_to_char[8]=(uint8_t) 'H';  //H His Histidine
00386             maptable_to_char[9]=(uint8_t) 'I';  //I Ile Isoleucine
00387             maptable_to_char[10]=(uint8_t) 'L'; //L Leu Leucine
00388             maptable_to_char[11]=(uint8_t) 'K'; //K Lys Lysine
00389             maptable_to_char[12]=(uint8_t) 'M'; //M Met Methionine
00390             maptable_to_char[13]=(uint8_t) 'F'; //F Phe Phenylalanine
00391             maptable_to_char[14]=(uint8_t) 'P'; //P Pro Proline
00392             maptable_to_char[15]=(uint8_t) 'S'; //S Ser Serine
00393             maptable_to_char[16]=(uint8_t) 'T'; //T Thr Threonine
00394             maptable_to_char[17]=(uint8_t) 'W'; //W Trp Tryptophan
00395             maptable_to_char[18]=(uint8_t) 'Y'; //Y Tyr Tyrosine
00396             maptable_to_char[19]=(uint8_t) 'V'; //V Val Valine
00397             maptable_to_char[20]=(uint8_t) 'B'; //B Asx Aspartic acid or Asparagine
00398             maptable_to_char[21]=(uint8_t) 'Z'; //Z Glx Glutamine or Glutamic acid
00399             maptable_to_char[22]=(uint8_t) 'X'; //X Xaa Any amino acid
00400             break;
00401 
00402         default:
00403             break; //leave uninitialised
00404     };
00405 }
00406 
00407 void CAlphabet::clear_histogram()
00408 {
00409     memset(histogram, 0, sizeof(histogram));
00410     print_histogram();
00411 }
00412 
00413 int32_t CAlphabet::get_max_value_in_histogram()
00414 {
00415     int32_t max_sym=-1;
00416     for (int32_t i=(int32_t) (1 <<(sizeof(uint8_t)*8))-1;i>=0; i--)
00417     {
00418         if (histogram[i])
00419         {
00420             max_sym=i;
00421             break;
00422         }
00423     }
00424 
00425     return max_sym;
00426 }
00427 
00428 int32_t CAlphabet::get_num_symbols_in_histogram()
00429 {
00430     int32_t num_sym=0;
00431     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00432     {
00433         if (histogram[i])
00434             num_sym++;
00435     }
00436 
00437     return num_sym;
00438 }
00439 
00440 int32_t CAlphabet::get_num_bits_in_histogram()
00441 {
00442     int32_t num_sym=get_num_symbols_in_histogram();
00443     if (num_sym>0)
00444         return (int32_t) ceil(log((float64_t) num_sym)/log((float64_t) 2));
00445     else
00446         return 0;
00447 }
00448 
00449 void CAlphabet::print_histogram()
00450 {
00451     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00452     {
00453         if (histogram[i])
00454             SG_PRINT( "hist[%d]=%lld\n", i, histogram[i]);
00455     }
00456 }
00457 
00458 bool CAlphabet::check_alphabet(bool print_error)
00459 {
00460     bool result = true;
00461 
00462     for (int32_t i=0; i<(int32_t) (1 <<(sizeof(uint8_t)*8)); i++)
00463     {
00464         if (histogram[i]>0 && valid_chars[i]==0)
00465         {
00466             result=false;
00467             break;
00468         }
00469     }
00470 
00471     if (!result && print_error)
00472     {
00473         print_histogram();
00474         SG_ERROR( "ALPHABET does not contain all symbols in histogram\n");
00475     }
00476 
00477     return result;
00478 }
00479 
00480 bool CAlphabet::check_alphabet_size(bool print_error)
00481 {
00482     if (get_num_bits_in_histogram() > get_num_bits())
00483     {
00484         if (print_error)
00485         {
00486             print_histogram();
00487             fprintf(stderr, "get_num_bits_in_histogram()=%i > get_num_bits()=%i\n", get_num_bits_in_histogram(), get_num_bits()) ;
00488          SG_ERROR( "ALPHABET too small to contain all symbols in histogram\n");
00489         }
00490         return false;
00491     }
00492     else
00493         return true;
00494 
00495 }
00496 
00497 void CAlphabet::copy_histogram(CAlphabet* a)
00498 {
00499     memcpy(histogram, a->get_histogram(), sizeof(histogram));
00500 }
00501 
00502 const char* CAlphabet::get_alphabet_name(EAlphabet alphabet)
00503 {
00504 
00505     int32_t idx;
00506     switch (alphabet)
00507     {
00508         case DNA:
00509             idx=0;
00510             break;
00511         case RAWDNA:
00512             idx=1;
00513             break;
00514         case RNA:
00515             idx=2;
00516             break;
00517         case PROTEIN:
00518             idx=3;
00519             break;
00520         case BINARY:
00521             idx=4;
00522             break;
00523         case ALPHANUM:
00524             idx=5;
00525             break;
00526         case CUBE:
00527             idx=6;
00528             break;
00529         case RAWBYTE:
00530             idx=7;
00531             break;
00532         case IUPAC_NUCLEIC_ACID:
00533             idx=8;
00534             break;
00535         case IUPAC_AMINO_ACID:
00536             idx=9;
00537             break;
00538         case NONE:
00539             idx=10;
00540             break;
00541         default:
00542             idx=11;
00543             break;
00544     }
00545     return alphabet_names[idx];
00546 }
00547 
00548 #ifdef HAVE_BOOST_SERIALIZATION
00549 std::string CAlphabet::to_string() const
00550 {
00551     std::ostringstream s;
00552     ::boost::archive::text_oarchive oa(s);
00553     oa << this;
00554     return s.str();
00555 }
00556 
00557 void CAlphabet::from_string(std::string str)
00558 {
00559     std::istringstream is(str);
00560     ::boost::archive::text_iarchive ia(is);
00561 
00562     //cast away constness
00563     CAlphabet* tmp = const_cast<CAlphabet*>(this);
00564 
00565     ia >> tmp;
00566     *this = *tmp;
00567 }
00568 
00569 void CAlphabet::to_file(std::string filename) const
00570 {
00571     std::ofstream os(filename.c_str(), std::ios::binary);
00572     ::boost::archive::binary_oarchive oa(os);
00573     oa << this;
00574 }
00575 
00576 void CAlphabet::from_file(std::string filename)
00577 {
00578     std::ifstream is(filename.c_str(), std::ios::binary);
00579     ::boost::archive::binary_iarchive ia(is);
00580     CAlphabet* tmp= const_cast<CAlphabet*>(this);
00581     ia >> tmp;
00582 }
00583 #endif //HAVE_BOOST_SERIALIZATION

SHOGUN Machine Learning Toolbox - Documentation