Alphabet.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 2006-2009 Soeren Sonnenburg
00008  * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013 
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017 
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024     DNA=0,
00025 
00027     RAWDNA=1,
00028 
00030     RNA=2,
00031 
00033     PROTEIN=3,
00034 
00035     // BINARY just 0 and 1
00036     BINARY=4,
00037 
00039     ALPHANUM=5,
00040 
00042     CUBE=6,
00043 
00045     RAWBYTE=7,
00046 
00048     IUPAC_NUCLEIC_ACID=8,
00049 
00051     IUPAC_AMINO_ACID=9,
00052 
00054     NONE=10,
00055 
00057     UNKNOWN=11
00058 };
00059 
00060 
00071 class CAlphabet : public CSGObject
00072 {
00073     public:
00074 
00078         //CAlphabet();
00079 
00080 
00086         CAlphabet(char* alpha, int32_t len);
00087 
00092         CAlphabet(EAlphabet alpha);
00093 
00098         CAlphabet(CAlphabet* alpha);
00099         virtual ~CAlphabet();
00100 
00105         bool set_alphabet(EAlphabet alpha);
00106 
00111         inline EAlphabet get_alphabet() const
00112         {
00113             return alphabet;
00114         }
00115 
00120         inline int32_t get_num_symbols() const
00121         {
00122             return num_symbols;
00123         }
00124 
00130         inline int32_t get_num_bits() const
00131         {
00132             return num_bits;
00133         }
00134 
00140         inline uint8_t remap_to_bin(uint8_t c)
00141         {
00142             return maptable_to_bin[c];
00143         }
00144 
00150         inline uint8_t remap_to_char(uint8_t c)
00151         {
00152             return maptable_to_char[c];
00153         }
00154 
00156         void clear_histogram();
00157 
00163         template <class T>
00164         void add_string_to_histogram(T* p, int64_t len)
00165         {
00166             for (int64_t i=0; i<len; i++)
00167                 add_byte_to_histogram((uint8_t) (p[i]));
00168         }
00169 
00174         inline void add_byte_to_histogram(uint8_t p)
00175         {
00176             histogram[p]++;
00177         }
00178 
00180         void print_histogram();
00181 
00187         inline void get_hist(int64_t** h, int32_t* len)
00188         {
00189             int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00190             ASSERT(h && len);
00191             *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00192             ASSERT(*h);
00193             *len=hist_size;
00194             ASSERT(*len);
00195             memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00196         }
00197 
00199         inline const int64_t* get_histogram()
00200         {
00201             return &histogram[0];
00202         }
00203 
00210         bool check_alphabet(bool print_error=true);
00211 
00218         inline bool is_valid(uint8_t c)
00219         {
00220             return valid_chars[c];
00221         }
00222 
00228         bool check_alphabet_size(bool print_error=true);
00229 
00234         int32_t get_num_symbols_in_histogram();
00235 
00240         int32_t get_max_value_in_histogram();
00241 
00248         int32_t get_num_bits_in_histogram();
00249 
00254         static const char* get_alphabet_name(EAlphabet alphabet);
00255 
00256 
00258         inline virtual const char* get_name() const { return "Alphabet"; }
00259 
00268         template <class ST>
00269         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00270         {
00271             int32_t i,j;
00272             ST value=0;
00273 
00274             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00275             {
00276                 value=0;
00277                 for (j=i; j>=i-p_order+1; j--)
00278                     value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00279 
00280                 obs[i]= (ST) value;
00281             }
00282 
00283             for (i=p_order-2;i>=0;i--)
00284             {
00285                 if (i>=sequence_length)
00286                     continue;
00287 
00288                 value=0;
00289                 for (j=i; j>=i-p_order+1; j--)
00290                 {
00291                     value= (value >> max_val);
00292                     if (j>=0 && j<sequence_length)
00293                         value|=obs[j] << (max_val * (p_order-1));
00294                 }
00295                 obs[i]=value;
00296             }
00297 
00298             // TODO we should get rid of this loop!
00299             if (start>0)
00300             {
00301                 for (i=start; i<sequence_length; i++)
00302                     obs[i-start]=obs[i];
00303             }
00304         }
00305 
00314         template <class ST>
00315         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00316         {
00317             int32_t i,j;
00318             ST value=0;
00319 
00320             for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T
00321             {
00322                 value=0;
00323                 for (j=i; j>=i-p_order+1; j--)
00324                     value= (value << max_val) | obs[j];
00325 
00326                 obs[i]= (ST) value;
00327             }
00328 
00329             for (i=p_order-2;i>=0;i--)
00330             {
00331                 if (i>=sequence_length)
00332                     continue;
00333 
00334                 value=0;
00335                 for (j=i; j>=i-p_order+1; j--)
00336                 {
00337                     value= (value << max_val);
00338                     if (j>=0 && j<sequence_length)
00339                         value|=obs[j];
00340                 }
00341                 obs[i]=value;
00342             }
00343 
00344             // TODO we should get rid of this loop!
00345             if (start>0)
00346             {
00347                 for (i=start; i<sequence_length; i++)
00348                     obs[i-start]=obs[i];
00349             }
00350         }
00351 
00361         template <class ST>
00362         static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00363         {
00364             ASSERT(gap>=0);
00365 
00366             const int32_t start_gap=(p_order-gap)/2;
00367             const int32_t end_gap=start_gap+gap;
00368 
00369             int32_t i,j;
00370             ST value=0;
00371 
00372             // almost all positions
00373             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00374             {
00375                 value=0;
00376                 for (j=i; j>=i-p_order+1; j--)
00377                 {
00378                     if (i-j<start_gap)
00379                     {
00380                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00381                     }
00382                     else if (i-j>=end_gap)
00383                     {
00384                         value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00385                     }
00386                 }
00387                 obs[i]= (ST) value;
00388             }
00389 
00390             // the remaining `order` positions
00391             for (i=p_order-2;i>=0;i--)
00392             {
00393                 if (i>=sequence_length)
00394                     continue;
00395 
00396                 value=0;
00397                 for (j=i; j>=i-p_order+1; j--)
00398                 {
00399                     if (i-j<start_gap)
00400                     {
00401                         value= (value >> max_val);
00402                         if (j>=0 && j<sequence_length)
00403                             value|=obs[j] << (max_val * (p_order-1-gap));
00404                     }
00405                     else if (i-j>=end_gap)
00406                     {
00407                         value= (value >> max_val);
00408                         if (j>=0 && j<sequence_length)
00409                             value|=obs[j] << (max_val * (p_order-1-gap));
00410                     }
00411                 }
00412                 obs[i]=value;
00413             }
00414 
00415             // TODO we should get rid of this loop!
00416             if (start>0)
00417             {
00418                 for (i=start; i<sequence_length; i++)
00419                     obs[i-start]=obs[i];
00420             }
00421         }
00422 
00432         template <class ST>
00433         static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00434         {
00435             ASSERT(gap>=0);
00436 
00437             const int32_t start_gap=(p_order-gap)/2;
00438             const int32_t end_gap=start_gap+gap;
00439 
00440             int32_t i,j;
00441             ST value=0;
00442 
00443             // almost all positions
00444             for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T
00445             {
00446                 value=0;
00447                 for (j=i; j>=i-p_order+1; j--)
00448                 {
00449                     if (i-j<start_gap)
00450                         value= (value << max_val) | obs[j];
00451                     else if (i-j>=end_gap)
00452                         value= (value << max_val) | obs[j];
00453                 }
00454                 obs[i]= (ST) value;
00455             }
00456 
00457             // the remaining `order` positions
00458             for (i=p_order-2;i>=0;i--)
00459             {
00460                 if (i>=sequence_length)
00461                     continue;
00462 
00463                 value=0;
00464                 for (j=i; j>=i-p_order+1; j--)
00465                 {
00466                     if (i-j<start_gap)
00467                     {
00468                         value= value << max_val;
00469                         if (j>=0 && j<sequence_length)
00470                             value|=obs[j];
00471                     }
00472                     else if (i-j>=end_gap)
00473                     {
00474                         value= value << max_val;
00475                         if (j>=0 && j<sequence_length)
00476                             value|=obs[j];
00477                     }
00478                 }
00479                 obs[i]=value;
00480             }
00481 
00482             // TODO we should get rid of this loop!
00483             if (start>0)
00484             {
00485                 for (i=start; i<sequence_length; i++)
00486                     obs[i-start]=obs[i];
00487             }
00488         }
00489 
00490 
00491     protected:
00493         void init_map_table();
00494 
00499         void copy_histogram(CAlphabet* src);
00500 
00501 
00502 
00503 #ifdef HAVE_BOOST_SERIALIZATION
00504     private:
00505 
00506         friend class ::boost::serialization::access;
00507 
00508         template<class Archive>
00509             void serialize(Archive & ar, const unsigned int archive_version)
00510             {
00511 
00512                 SG_DEBUG("archiving CAlphabet\n");
00513 
00514                 ar & ::boost::serialization::base_object<CSGObject>(*this);
00515 
00516                 ar & alphabet;
00517                 ar & num_symbols;
00518                 ar & num_bits;
00519 
00520                 SG_DEBUG("done with CAlphabet\n");
00521 
00522             }
00523 
00528         virtual std::string to_string() const;
00529 
00534         virtual void from_string(std::string str);
00535 
00540         virtual void to_file(std::string filename) const;
00541 
00546         virtual void from_file(std::string filename);
00547 
00548 #endif //HAVE_BOOST_SERIALIZATION
00549 
00550 
00551     public:
00553         static const uint8_t B_A;
00555         static const uint8_t B_C;
00557         static const uint8_t B_G;
00559         static const uint8_t B_T;
00561         static const uint8_t MAPTABLE_UNDEF;
00563         static const char* alphabet_names[12];
00564 
00565     protected:
00567         EAlphabet alphabet;
00569         int32_t num_symbols;
00571         int32_t num_bits;
00573         bool valid_chars[1 << (sizeof(uint8_t)*8)];
00575         uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00577         uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00579         int64_t histogram[1 << (sizeof(uint8_t)*8)];
00580 };
00581 
00582 
00583 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00584 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00585 {
00586 }
00587 
00588 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00589 {
00590 }
00591 
00592 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00593 {
00594 }
00595 
00596 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00597 {
00598 }
00599 
00600 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00601 {
00602 }
00603 
00604 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00605 {
00606 }
00607 #endif
00608 
00609 }
00610 
00611 #ifdef HAVE_BOOST_SERIALIZATION
00612 //http://www.koders.com/cpp/fidB8C82A2BBA651A5E4EEC668EDE70B86EA017E937.aspx
00613 namespace boost {
00614 namespace serialization {
00615    template<class Archive>
00616     inline void save_construct_data(Archive & ar, const shogun::CAlphabet* t, const unsigned int archive_version)
00617     {
00618       shogun::EAlphabet a = t->get_alphabet();
00619       ar << a;
00620 
00621     }
00622 
00623     template<class Archive>
00624     inline void load_construct_data(Archive & ar, shogun::CAlphabet * t, const unsigned int archive_version)
00625     {
00626 
00627       shogun::EAlphabet a;
00628       ar >> a;
00629       new(t)shogun::CAlphabet(a);
00630 
00631     }
00632 } // serialization
00633 } // namespace boost
00634 #endif //HAVE_BOOST_SERIALIZATION
00635 
00636 
00637 
00638 #endif

SHOGUN Machine Learning Toolbox - Documentation