SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2006-2009 Soeren Sonnenburg 00008 * Copyright (C) 2006-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00009 */ 00010 00011 #ifndef _CALPHABET__H__ 00012 #define _CALPHABET__H__ 00013 00014 #include "base/SGObject.h" 00015 #include "lib/Mathematics.h" 00016 #include "lib/common.h" 00017 00018 namespace shogun 00019 { 00021 enum EAlphabet 00022 { 00024 DNA=0, 00025 00027 RAWDNA=1, 00028 00030 RNA=2, 00031 00033 PROTEIN=3, 00034 00035 // BINARY just 0 and 1 00036 BINARY=4, 00037 00039 ALPHANUM=5, 00040 00042 CUBE=6, 00043 00045 RAWBYTE=7, 00046 00048 IUPAC_NUCLEIC_ACID=8, 00049 00051 IUPAC_AMINO_ACID=9, 00052 00054 NONE=10, 00055 00057 DIGIT=11, 00058 00060 DIGIT2=12, 00061 00063 RAWDIGIT=13, 00064 00066 RAWDIGIT2=14, 00067 00069 UNKNOWN=15, 00070 00072 SNP=16, 00073 00075 RAWSNP=17 00076 }; 00077 00078 00089 class CAlphabet : public CSGObject 00090 { 00091 public: 00092 00096 CAlphabet(); 00097 00103 CAlphabet(char* alpha, int32_t len); 00104 00109 CAlphabet(EAlphabet alpha); 00110 00115 CAlphabet(CAlphabet* alpha); 00116 virtual ~CAlphabet(); 00117 00122 bool set_alphabet(EAlphabet alpha); 00123 00128 inline EAlphabet get_alphabet() const 00129 { 00130 return alphabet; 00131 } 00132 00137 inline int32_t get_num_symbols() const 00138 { 00139 return num_symbols; 00140 } 00141 00147 inline int32_t get_num_bits() const 00148 { 00149 return num_bits; 00150 } 00151 00157 inline uint8_t remap_to_bin(uint8_t c) 00158 { 00159 return maptable_to_bin[c]; 00160 } 00161 00167 inline uint8_t remap_to_char(uint8_t c) 00168 { 00169 return maptable_to_char[c]; 00170 } 00171 00173 void clear_histogram(); 00174 00180 template <class T> 00181 void add_string_to_histogram(T* p, int64_t len) 00182 { 00183 for (int64_t i=0; i<len; i++) 00184 add_byte_to_histogram((uint8_t) (p[i])); 00185 } 00186 00191 inline void add_byte_to_histogram(uint8_t p) 00192 { 00193 histogram[p]++; 00194 } 00195 00197 void print_histogram(); 00198 00204 inline void get_hist(int64_t** h, int32_t* len) 00205 { 00206 int32_t hist_size=(1 << (sizeof(uint8_t)*8)); 00207 ASSERT(h && len); 00208 *h=(int64_t*) malloc(sizeof(int64_t)*hist_size); 00209 ASSERT(*h); 00210 *len=hist_size; 00211 ASSERT(*len); 00212 memcpy(*h, &histogram[0], sizeof(int64_t)*hist_size); 00213 } 00214 00216 inline const int64_t* get_histogram() 00217 { 00218 return &histogram[0]; 00219 } 00220 00227 bool check_alphabet(bool print_error=true); 00228 00235 inline bool is_valid(uint8_t c) 00236 { 00237 return valid_chars[c]; 00238 } 00239 00245 bool check_alphabet_size(bool print_error=true); 00246 00251 int32_t get_num_symbols_in_histogram(); 00252 00257 int32_t get_max_value_in_histogram(); 00258 00265 int32_t get_num_bits_in_histogram(); 00266 00271 static const char* get_alphabet_name(EAlphabet alphabet); 00272 00273 00275 inline virtual const char* get_name() const { return "Alphabet"; } 00276 00285 template <class ST> 00286 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) 00287 { 00288 int32_t i,j; 00289 ST value=0; 00290 00291 for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T 00292 { 00293 value=0; 00294 for (j=i; j>=i-p_order+1; j--) 00295 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1))); 00296 00297 obs[i]= (ST) value; 00298 } 00299 00300 for (i=p_order-2;i>=0;i--) 00301 { 00302 if (i>=sequence_length) 00303 continue; 00304 00305 value=0; 00306 for (j=i; j>=i-p_order+1; j--) 00307 { 00308 value= (value >> max_val); 00309 if (j>=0 && j<sequence_length) 00310 value|=obs[j] << (max_val * (p_order-1)); 00311 } 00312 obs[i]=value; 00313 } 00314 00315 // TODO we should get rid of this loop! 00316 if (start>0) 00317 { 00318 for (i=start; i<sequence_length; i++) 00319 obs[i-start]=obs[i]; 00320 } 00321 } 00322 00331 template <class ST> 00332 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val) 00333 { 00334 int32_t i,j; 00335 ST value=0; 00336 00337 for (i=sequence_length-1; i>= p_order-1; i--) //convert interval of size T 00338 { 00339 value=0; 00340 for (j=i; j>=i-p_order+1; j--) 00341 value= (value << max_val) | obs[j]; 00342 00343 obs[i]= (ST) value; 00344 } 00345 00346 for (i=p_order-2;i>=0;i--) 00347 { 00348 if (i>=sequence_length) 00349 continue; 00350 00351 value=0; 00352 for (j=i; j>=i-p_order+1; j--) 00353 { 00354 value= (value << max_val); 00355 if (j>=0 && j<sequence_length) 00356 value|=obs[j]; 00357 } 00358 obs[i]=value; 00359 } 00360 00361 // TODO we should get rid of this loop! 00362 if (start>0) 00363 { 00364 for (i=start; i<sequence_length; i++) 00365 obs[i-start]=obs[i]; 00366 } 00367 } 00368 00378 template <class ST> 00379 static void translate_from_single_order(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00380 { 00381 ASSERT(gap>=0); 00382 00383 const int32_t start_gap=(p_order-gap)/2; 00384 const int32_t end_gap=start_gap+gap; 00385 00386 int32_t i,j; 00387 ST value=0; 00388 00389 // almost all positions 00390 for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T 00391 { 00392 value=0; 00393 for (j=i; j>=i-p_order+1; j--) 00394 { 00395 if (i-j<start_gap) 00396 { 00397 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap))); 00398 } 00399 else if (i-j>=end_gap) 00400 { 00401 value= (value >> max_val) | (obs[j] << (max_val * (p_order-1-gap))); 00402 } 00403 } 00404 obs[i]= (ST) value; 00405 } 00406 00407 // the remaining `order` positions 00408 for (i=p_order-2;i>=0;i--) 00409 { 00410 if (i>=sequence_length) 00411 continue; 00412 00413 value=0; 00414 for (j=i; j>=i-p_order+1; j--) 00415 { 00416 if (i-j<start_gap) 00417 { 00418 value= (value >> max_val); 00419 if (j>=0 && j<sequence_length) 00420 value|=obs[j] << (max_val * (p_order-1-gap)); 00421 } 00422 else if (i-j>=end_gap) 00423 { 00424 value= (value >> max_val); 00425 if (j>=0 && j<sequence_length) 00426 value|=obs[j] << (max_val * (p_order-1-gap)); 00427 } 00428 } 00429 obs[i]=value; 00430 } 00431 00432 // TODO we should get rid of this loop! 00433 if (start>0) 00434 { 00435 for (i=start; i<sequence_length; i++) 00436 obs[i-start]=obs[i]; 00437 } 00438 } 00439 00449 template <class ST> 00450 static void translate_from_single_order_reversed(ST* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00451 { 00452 ASSERT(gap>=0); 00453 00454 const int32_t start_gap=(p_order-gap)/2; 00455 const int32_t end_gap=start_gap+gap; 00456 00457 int32_t i,j; 00458 ST value=0; 00459 00460 // almost all positions 00461 for (i=sequence_length-1; i>=p_order-1; i--) //convert interval of size T 00462 { 00463 value=0; 00464 for (j=i; j>=i-p_order+1; j--) 00465 { 00466 if (i-j<start_gap) 00467 value= (value << max_val) | obs[j]; 00468 else if (i-j>=end_gap) 00469 value= (value << max_val) | obs[j]; 00470 } 00471 obs[i]= (ST) value; 00472 } 00473 00474 // the remaining `order` positions 00475 for (i=p_order-2;i>=0;i--) 00476 { 00477 if (i>=sequence_length) 00478 continue; 00479 00480 value=0; 00481 for (j=i; j>=i-p_order+1; j--) 00482 { 00483 if (i-j<start_gap) 00484 { 00485 value= value << max_val; 00486 if (j>=0 && j<sequence_length) 00487 value|=obs[j]; 00488 } 00489 else if (i-j>=end_gap) 00490 { 00491 value= value << max_val; 00492 if (j>=0 && j<sequence_length) 00493 value|=obs[j]; 00494 } 00495 } 00496 obs[i]=value; 00497 } 00498 00499 // TODO we should get rid of this loop! 00500 if (start>0) 00501 { 00502 for (i=start; i<sequence_length; i++) 00503 obs[i-start]=obs[i]; 00504 } 00505 } 00506 00507 private: 00510 void init(); 00511 00512 protected: 00514 void init_map_table(); 00515 00520 void copy_histogram(CAlphabet* src); 00521 00522 public: 00524 static const uint8_t B_A; 00526 static const uint8_t B_C; 00528 static const uint8_t B_G; 00530 static const uint8_t B_T; 00532 static const uint8_t B_0; 00534 static const uint8_t MAPTABLE_UNDEF; 00536 static const char* alphabet_names[18]; 00537 00538 protected: 00547 virtual void load_serializable_post(void) throw (ShogunException); 00548 00549 protected: 00551 EAlphabet alphabet; 00553 int32_t num_symbols; 00555 int32_t num_bits; 00557 bool valid_chars[1 << (sizeof(uint8_t)*8)]; 00559 uint8_t maptable_to_bin[1 << (sizeof(uint8_t)*8)]; 00561 uint8_t maptable_to_char[1 << (sizeof(uint8_t)*8)]; 00563 int64_t histogram[1 << (sizeof(uint8_t)*8)]; 00564 }; 00565 00566 00567 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00568 template<> inline void CAlphabet::translate_from_single_order(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00569 { 00570 } 00571 00572 template<> inline void CAlphabet::translate_from_single_order(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00573 { 00574 } 00575 00576 template<> inline void CAlphabet::translate_from_single_order(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00577 { 00578 } 00579 00580 template<> inline void CAlphabet::translate_from_single_order_reversed(float32_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00581 { 00582 } 00583 00584 template<> inline void CAlphabet::translate_from_single_order_reversed(float64_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00585 { 00586 } 00587 00588 template<> inline void CAlphabet::translate_from_single_order_reversed(floatmax_t* obs, int32_t sequence_length, int32_t start, int32_t p_order, int32_t max_val, int32_t gap) 00589 { 00590 } 00591 #endif 00592 00593 } 00594 #endif