00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #ifndef _CALPHABET__H__
00012 #define _CALPHABET__H__
00013
00014 #include "base/SGObject.h"
00015 #include "lib/Mathematics.h"
00016 #include "lib/common.h"
00017
00018 namespace shogun
00019 {
00021 enum EAlphabet
00022 {
00024 DNA=0,
00025
00027 RAWDNA=1,
00028
00030 RNA=2,
00031
00033 PROTEIN=3,
00034
00035
00036 BINARY=4,
00037
00039 ALPHANUM=5,
00040
00042 CUBE=6,
00043
00045 RAWBYTE=7,
00046
00048 IUPAC_NUCLEIC_ACID=8,
00049
00051 IUPAC_AMINO_ACID=9,
00052
00054 NONE=10,
00055
00057 UNKNOWN=11
00058 };
00059
00060
00071 class CAlphabet : public CSGObject
00072 {
00073 public:
00074
00078
00079
00080
00086 CAlphabet(char* alpha, int32_t len);
00087
00092 CAlphabet(EAlphabet alpha);
00093
00098 CAlphabet(CAlphabet* alpha);
00099 virtual ~CAlphabet();
00100
00105 bool set_alphabet(EAlphabet alpha);
00106
00111 inline EAlphabet get_alphabet() const
00112 {
00113 return alphabet;
00114 }
00115
00120 inline int32_t get_num_symbols() const
00121 {
00122 return num_symbols;
00123 }
00124
00130 inline int32_t get_num_bits() const
00131 {
00132 return num_bits;
00133 }
00134
00140 inline uint8_t remap_to_bin(uint8_t c)
00141 {
00142 return maptable_to_bin[c];
00143 }
00144
00150 inline uint8_t remap_to_char(uint8_t c)
00151 {
00152 return maptable_to_char[c];
00153 }
00154
00156 void clear_histogram();
00157
00163 template <class T>
00164 void add_string_to_histogram(T* p, int64_t len)
00165 {
00166 for (int64_t i=0; i<len; i++)
00167 add_byte_to_histogram((uint8_t) (p[i]));
00168 }
00169
00174 inline void add_byte_to_histogram(uint8_t p)
00175 {
00176 histogram[p]++;
00177 }
00178
00180 void print_histogram();
00181
00187 inline void get_hist(int64_t** h, int32_t* len)
00188 {
00189 int32_t hist_size=(1 << (sizeof(uint8_t)*8));
00190 ASSERT(h && len);
00191 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size);
00192 ASSERT(*h);
00193 *len=hist_size;
00194 ASSERT(*len);
00195 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size);
00196 }
00197
00199 inline const int64_t* get_histogram()
00200 {
00201 return &histogram[0];
00202 }
00203
00210 bool check_alphabet(bool print_error=true);
00211
00218 inline bool is_valid(uint8_t c)
00219 {
00220 return valid_chars[c];
00221 }
00222
00228 bool check_alphabet_size(bool print_error=true);
00229
00234 int32_t get_num_symbols_in_histogram();
00235
00240 int32_t get_max_value_in_histogram();
00241
00248 int32_t get_num_bits_in_histogram();
00249
00254 static const char* get_alphabet_name(EAlphabet alphabet);
00255
00256
00258 inline virtual const char* get_name() const { return "Alphabet"; }
00259
00268 template <class ST>
00269 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00270 {
00271 int32_t i,j;
00272 ST value=0;
00273
00274 for (i=sequence_length-1; i>= p_order-1; i--)
00275 {
00276 value=0;
00277 for (j=i; j>=i-p_order+1; j--)
00278 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1)));
00279
00280 obs[i]= (ST) value;
00281 }
00282
00283 for (i=p_order-2;i>=0;i--)
00284 {
00285 if (i>=sequence_length)
00286 continue;
00287
00288 value=0;
00289 for (j=i; j>=i-p_order+1; j--)
00290 {
00291 value= (value >> max_val);
00292 if (j>=0 && j<sequence_length)
00293 value|=obs[j] << (max_val * (p_order-1));
00294 }
00295 obs[i]=value;
00296 }
00297
00298
00299 if (start>0)
00300 {
00301 for (i=start; i<sequence_length; i++)
00302 obs[i-start]=obs[i];
00303 }
00304 }
00305
00314 template <class ST>
00315 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val)
00316 {
00317 int32_t i,j;
00318 ST value=0;
00319
00320 for (i=sequence_length-1; i>= p_order-1; i--)
00321 {
00322 value=0;
00323 for (j=i; j>=i-p_order+1; j--)
00324 value= (value << max_val) | obs[j];
00325
00326 obs[i]= (ST) value;
00327 }
00328
00329 for (i=p_order-2;i>=0;i--)
00330 {
00331 if (i>=sequence_length)
00332 continue;
00333
00334 value=0;
00335 for (j=i; j>=i-p_order+1; j--)
00336 {
00337 value= (value << max_val);
00338 if (j>=0 && j<sequence_length)
00339 value|=obs[j];
00340 }
00341 obs[i]=value;
00342 }
00343
00344
00345 if (start>0)
00346 {
00347 for (i=start; i<sequence_length; i++)
00348 obs[i-start]=obs[i];
00349 }
00350 }
00351
00361 template <class ST>
00362 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00363 {
00364 ASSERT(gap>=0);
00365
00366 const int32_t start_gap=(p_order-gap)/2;
00367 const int32_t end_gap=start_gap+gap;
00368
00369 int32_t i,j;
00370 ST value=0;
00371
00372
00373 for (i=sequence_length-1; i>=p_order-1; i--)
00374 {
00375 value=0;
00376 for (j=i; j>=i-p_order+1; j--)
00377 {
00378 if (i-j<start_gap)
00379 {
00380 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00381 }
00382 else if (i-j>=end_gap)
00383 {
00384 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap)));
00385 }
00386 }
00387 obs[i]= (ST) value;
00388 }
00389
00390
00391 for (i=p_order-2;i>=0;i--)
00392 {
00393 if (i>=sequence_length)
00394 continue;
00395
00396 value=0;
00397 for (j=i; j>=i-p_order+1; j--)
00398 {
00399 if (i-j<start_gap)
00400 {
00401 value= (value >> max_val);
00402 if (j>=0 && j<sequence_length)
00403 value|=obs[j] << (max_val * (p_order-1-gap));
00404 }
00405 else if (i-j>=end_gap)
00406 {
00407 value= (value >> max_val);
00408 if (j>=0 && j<sequence_length)
00409 value|=obs[j] << (max_val * (p_order-1-gap));
00410 }
00411 }
00412 obs[i]=value;
00413 }
00414
00415
00416 if (start>0)
00417 {
00418 for (i=start; i<sequence_length; i++)
00419 obs[i-start]=obs[i];
00420 }
00421 }
00422
00432 template <class ST>
00433 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00434 {
00435 ASSERT(gap>=0);
00436
00437 const int32_t start_gap=(p_order-gap)/2;
00438 const int32_t end_gap=start_gap+gap;
00439
00440 int32_t i,j;
00441 ST value=0;
00442
00443
00444 for (i=sequence_length-1; i>=p_order-1; i--)
00445 {
00446 value=0;
00447 for (j=i; j>=i-p_order+1; j--)
00448 {
00449 if (i-j<start_gap)
00450 value= (value << max_val) | obs[j];
00451 else if (i-j>=end_gap)
00452 value= (value << max_val) | obs[j];
00453 }
00454 obs[i]= (ST) value;
00455 }
00456
00457
00458 for (i=p_order-2;i>=0;i--)
00459 {
00460 if (i>=sequence_length)
00461 continue;
00462
00463 value=0;
00464 for (j=i; j>=i-p_order+1; j--)
00465 {
00466 if (i-j<start_gap)
00467 {
00468 value= value << max_val;
00469 if (j>=0 && j<sequence_length)
00470 value|=obs[j];
00471 }
00472 else if (i-j>=end_gap)
00473 {
00474 value= value << max_val;
00475 if (j>=0 && j<sequence_length)
00476 value|=obs[j];
00477 }
00478 }
00479 obs[i]=value;
00480 }
00481
00482
00483 if (start>0)
00484 {
00485 for (i=start; i<sequence_length; i++)
00486 obs[i-start]=obs[i];
00487 }
00488 }
00489
00490
00491 protected:
00493 void init_map_table();
00494
00499 void copy_histogram(CAlphabet* src);
00500
00501
00502
00503 #ifdef HAVE_BOOST_SERIALIZATION
00504 private:
00505
00506 friend class ::boost::serialization::access;
00507
00508 template<class Archive>
00509 void serialize(Archive & ar, const unsigned int archive_version)
00510 {
00511
00512 SG_DEBUG("archiving CAlphabet\n");
00513
00514 ar & ::boost::serialization::base_object<CSGObject>(*this);
00515
00516 ar & alphabet;
00517 ar & num_symbols;
00518 ar & num_bits;
00519
00520 SG_DEBUG("done with CAlphabet\n");
00521
00522 }
00523
00528 virtual std::string to_string() const;
00529
00534 virtual void from_string(std::string str);
00535
00540 virtual void to_file(std::string filename) const;
00541
00546 virtual void from_file(std::string filename);
00547
00548 #endif //HAVE_BOOST_SERIALIZATION
00549
00550
00551 public:
00553 static const uint8_t B_A;
00555 static const uint8_t B_C;
00557 static const uint8_t B_G;
00559 static const uint8_t B_T;
00561 static const uint8_t MAPTABLE_UNDEF;
00563 static const char* alphabet_names[12];
00564
00565 protected:
00567 EAlphabet alphabet;
00569 int32_t num_symbols;
00571 int32_t num_bits;
00573 bool valid_chars[1 << (sizeof(uint8_t)*8)];
00575 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)];
00577 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)];
00579 int64_t histogram[1 << (sizeof(uint8_t)*8)];
00580 };
00581
00582
00583 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00584 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00585 {
00586 }
00587
00588 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00589 {
00590 }
00591
00592 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00593 {
00594 }
00595
00596 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00597 {
00598 }
00599
00600 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00601 {
00602 }
00603
00604 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap)
00605 {
00606 }
00607 #endif
00608
00609 }
00610
00611 #ifdef HAVE_BOOST_SERIALIZATION
00612
00613 namespace boost {
00614 namespace serialization {
00615 template<class Archive>
00616 inline void save_construct_data(Archive & ar, const shogun::CAlphabet* t, const unsigned int archive_version)
00617 {
00618 shogun::EAlphabet a = t->get_alphabet();
00619 ar << a;
00620
00621 }
00622
00623 template<class Archive>
00624 inline void load_construct_data(Archive & ar, shogun::CAlphabet * t, const unsigned int archive_version)
00625 {
00626
00627 shogun::EAlphabet a;
00628 ar >> a;
00629 new(t)shogun::CAlphabet(a);
00630
00631 }
00632 }
00633 }
00634 #endif //HAVE_BOOST_SERIALIZATION
00635
00636
00637
00638 #endif