// SHOGUN v0.9.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Gunnar Raetsch 00008 * Written (W) 1999-2009 Soeren Sonnenburg 00009 * Written (W) 2008-2009 Jonas Behr 00010 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00011 */ 00012 00013 #ifndef __CDYNPROG_H__ 00014 #define __CDYNPROG_H__ 00015 00016 #include "lib/Mathematics.h" 00017 #include "lib/common.h" 00018 #include "base/SGObject.h" 00019 #include "lib/io.h" 00020 #include "lib/config.h" 00021 #include "structure/PlifMatrix.h" 00022 #include "structure/PlifBase.h" 00023 #include "structure/Plif.h" 00024 #include "structure/IntronList.h" 00025 #include "structure/SegmentLoss.h" 00026 #include "features/StringFeatures.h" 00027 #include "features/SparseFeatures.h" 00028 #include "distributions/Distribution.h" 00029 #include "lib/DynamicArray.h" 00030 #include "lib/Array.h" 00031 #include "lib/Array2.h" 00032 #include "lib/Array3.h" 00033 #include "lib/Time.h" 00034 00035 #include <stdio.h> 00036 #include <limits.h> 00037 00038 namespace shogun 00039 { 00040 template <class T> class CSparseFeatures; 00041 class CIntronList; 00042 class CPlifMatrix; 00043 class CSegmentLoss; 00044 template <class T> class CArray; 00045 00046 //#define DYNPROG_TIMING 00047 00048 #ifdef USE_BIGSTATES 00049 typedef uint16_t T_STATES ; 00050 #else 00051 typedef uint8_t T_STATES ; 00052 #endif 00053 typedef T_STATES* P_STATES ; 00054 00056 struct segment_loss_struct 00057 { 00059 int32_t maxlookback; 00061 int32_t seqlen; 00063 int32_t *segments_changed; 00065 float64_t *num_segment_id; 00067 int32_t *length_segment_id ; 00068 }; 00069 00075 class CDynProg : public CSGObject 00076 { 00077 public: 00082 CDynProg(int32_t p_num_svms=8); 00083 virtual 
~CDynProg(); 00084 00085 // model related functions 00091 void set_num_states(int32_t N); 00092 00094 int32_t get_num_states(); 00095 00097 int32_t get_num_svms(); 00098 00104 void init_content_svm_value_array(const int32_t p_num_svms); 00105 00113 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes); 00114 00121 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs); 00122 00127 void resize_lin_feat(int32_t num_new_feat); 00133 void set_p_vector(float64_t* p, int32_t N); 00134 00140 void set_q_vector(float64_t* q, int32_t N); 00141 00148 void set_a(float64_t* a, int32_t M, int32_t N); 00149 00156 void set_a_id(int32_t *a, int32_t M, int32_t N); 00157 00164 void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N); 00165 00172 void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns); 00173 00179 bool check_svm_arrays(); 00180 00187 void set_observation_matrix(float64_t* seq, int32_t* dims, int32_t ndims); 00188 00195 int32_t get_num_positions(); 00196 00208 void set_content_type_array(float64_t* seg_path, int32_t rows, int32_t cols); 00209 00215 void set_pos(int32_t* pos, int32_t seq_len); 00216 00224 void set_orf_info(int32_t* orf_info, int32_t m, int32_t n); 00225 00231 void set_gene_string(char* genestr, int32_t genestr_len); 00232 00233 00240 void set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n); 00241 00248 void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2); 00249 00256 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m); 00257 00259 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2); 00260 00265 void set_plif_matrices(CPlifMatrix* pm); 00266 00267 // best_path result retrieval functions 00273 void get_scores(float64_t **scores, int32_t 
*n); 00274 00281 void get_states(int32_t **states, int32_t *m, int32_t *n); 00282 00289 void get_positions(int32_t **positions, int32_t *m, int32_t *n); 00290 00291 00300 void compute_nbest_paths(int32_t max_num_signals, 00301 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences); 00302 00304 00316 void best_path_trans_deriv( 00317 int32_t* my_state_seq, int32_t *my_pos_seq, 00318 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals); 00319 00320 // additional best_path_trans_deriv functions 00325 void set_my_state_seq(int32_t* my_state_seq); 00326 00331 void set_my_pos_seq(int32_t* my_pos_seq); 00332 00340 void get_path_scores(float64_t** my_scores, int32_t* seq_len); 00341 00349 void get_path_losses(float64_t** my_losses, int32_t* seq_len); 00350 00351 00353 inline T_STATES get_N() const 00354 { 00355 return m_N ; 00356 } 00357 00362 inline void set_q(T_STATES offset, float64_t value) 00363 { 00364 m_end_state_distribution_q[offset]=value; 00365 } 00366 00371 inline void set_p(T_STATES offset, float64_t value) 00372 { 00373 m_initial_state_distribution_p[offset]=value; 00374 } 00375 00382 inline void set_a(T_STATES line_, T_STATES column, float64_t value) 00383 { 00384 m_transition_matrix_a.element(line_,column)=value; // look also best_path! 
00385 } 00386 00392 inline float64_t get_q(T_STATES offset) const 00393 { 00394 return m_end_state_distribution_q[offset]; 00395 } 00396 00402 inline float64_t get_q_deriv(T_STATES offset) const 00403 { 00404 return m_end_state_distribution_q_deriv[offset]; 00405 } 00406 00412 inline float64_t get_p(T_STATES offset) const 00413 { 00414 return m_initial_state_distribution_p[offset]; 00415 } 00416 00422 inline float64_t get_p_deriv(T_STATES offset) const 00423 { 00424 return m_initial_state_distribution_p_deriv[offset]; 00425 } 00426 00430 void precompute_content_values(); 00431 00438 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 00439 { 00440 m_lin_feat.get_array_size(dim1, dim2); 00441 return m_lin_feat.get_array(); 00442 } 00451 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 00452 { 00453 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true); 00454 } 00459 void create_word_string(); 00460 00463 void precompute_stop_codons(); 00464 00471 inline float64_t get_a(T_STATES line_, T_STATES column) const 00472 { 00473 return m_transition_matrix_a.element(line_, column); // look also best_path()! 00474 } 00475 00482 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const 00483 { 00484 return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()! 
00485 } 00487 00492 void set_intron_list(CIntronList* intron_list, int32_t num_plifs); 00493 00495 CSegmentLoss* get_segment_loss_object() 00496 { 00497 return m_seg_loss_obj; 00498 } 00499 00506 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len) 00507 { 00508 m_long_transitions = use_long_transitions; 00509 m_long_transition_threshold = threshold; 00510 SG_DEBUG("ignoring max_len\n") ; 00511 //m_long_transition_max = max_len; 00512 } 00513 00514 protected: 00515 00516 /* helper functions */ 00517 00527 void lookup_content_svm_values(const int32_t from_state, 00528 const int32_t to_state, const int32_t from_pos, const int32_t to_pos, 00529 float64_t* svm_values, int32_t frame); 00530 00538 inline void lookup_tiling_plif_values(const int32_t from_state, 00539 const int32_t to_state, const int32_t len, float64_t* svm_values); 00540 00545 inline int32_t find_frame(const int32_t from_state); 00546 00555 inline int32_t raw_intensities_interval_query( 00556 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type); 00557 00558 #ifndef DOXYGEN_SHOULD_SKIP_THIS 00559 00560 struct svm_values_struct 00561 { 00563 int32_t maxlookback; 00565 int32_t seqlen; 00566 00568 int32_t* start_pos; 00570 float64_t ** svm_values_unnormalized; 00572 float64_t * svm_values; 00574 bool *** word_used; 00576 int32_t **num_unique_words; 00577 }; 00578 #endif // DOXYGEN_SHOULD_SKIP_THIS 00579 00588 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to); 00589 00591 inline virtual const char* get_name() const { return "DynProg"; } 00592 00593 private: 00594 00595 T_STATES trans_list_len; 00596 T_STATES **trans_list_forward; 00597 T_STATES *trans_list_forward_cnt; 00598 float64_t **trans_list_forward_val; 00599 int32_t **trans_list_forward_id; 00600 bool mem_initialized; 00601 00602 #ifdef DYNPROG_TIMING 00603 CTime MyTime; 00604 CTime MyTime2; 00605 CTime MyTime3; 00606 00607 float64_t 
segment_init_time; 00608 float64_t segment_pos_time; 00609 float64_t segment_clean_time; 00610 float64_t segment_extend_time; 00611 float64_t orf_time; 00612 float64_t content_time; 00613 float64_t content_penalty_time; 00614 float64_t content_svm_values_time ; 00615 float64_t content_plifs_time ; 00616 float64_t svm_init_time; 00617 float64_t svm_pos_time; 00618 float64_t inner_loop_time; 00619 float64_t inner_loop_max_time ; 00620 float64_t svm_clean_time; 00621 float64_t long_transition_time ; 00622 #endif 00623 00624 00625 protected: 00630 00631 int32_t m_N; 00632 00634 CArray2<int32_t> m_transition_matrix_a_id; 00635 CArray2<float64_t> m_transition_matrix_a; 00636 CArray2<float64_t> m_transition_matrix_a_deriv; 00637 00639 CArray<float64_t> m_initial_state_distribution_p; 00640 CArray<float64_t> m_initial_state_distribution_p_deriv; 00641 00643 CArray<float64_t> m_end_state_distribution_q; 00644 CArray<float64_t> m_end_state_distribution_q_deriv; 00645 00647 00649 int32_t m_num_degrees; 00651 int32_t m_num_svms; 00652 00654 CArray<int32_t> m_word_degree; 00656 CArray<int32_t> m_cum_num_words; 00658 int32_t * m_cum_num_words_array; 00660 CArray<int32_t> m_num_words; 00662 int32_t* m_num_words_array; 00664 CArray2<int32_t> m_mod_words; 00666 int32_t* m_mod_words_array; 00668 CArray<bool> m_sign_words; 00670 bool* m_sign_words_array; 00672 CArray<int32_t> m_string_words; 00674 int32_t* m_string_words_array; 00675 00677 // CArray<int32_t> m_svm_pos_start; 00679 CArray<int32_t> m_num_unique_words; 00681 bool m_svm_arrays_clean; 00683 int32_t m_max_a_id; 00684 00685 // input arguments 00687 CArray3<float64_t> m_observation_matrix; 00689 CArray<int32_t> m_pos; 00691 int32_t m_seq_len; 00693 CArray2<int32_t> m_orf_info; 00695 CArray2<float64_t> m_segment_sum_weights; 00697 CArray<CPlifBase*> m_plif_list; 00699 CArray2<CPlifBase*> m_PEN; 00701 CArray2<CPlifBase*> m_PEN_state_signals; 00703 CArray<char> m_genestr; 00718 uint16_t*** m_wordstr; 00720 CArray2<float64_t> 
m_dict_weights; 00722 CArray3<float64_t> m_segment_loss; 00724 CArray<int32_t> m_segment_ids; 00726 CArray<float64_t> m_segment_mask; 00728 CArray<int32_t> m_my_state_seq; 00730 CArray<int32_t> m_my_pos_seq; 00732 CArray<float64_t> m_my_scores; 00734 CArray<float64_t> m_my_losses; 00735 00738 CSegmentLoss* m_seg_loss_obj; 00739 00740 // output arguments 00742 CArray<float64_t> m_scores; 00744 CArray2<int32_t> m_states; 00746 CArray2<int32_t> m_positions; 00747 00749 CSparseFeatures<float64_t>* m_seq_sparse1; 00751 CSparseFeatures<float64_t>* m_seq_sparse2; 00753 CPlifMatrix* m_plif_matrices; 00754 00758 CArray<bool> m_genestr_stop; 00759 00762 CIntronList* m_intron_list; 00763 00765 int32_t m_num_intron_plifs; 00766 00771 CArray2<float64_t> m_lin_feat; 00772 00774 float64_t *m_raw_intensities; 00776 int32_t* m_probe_pos; 00778 int32_t* m_num_probes_cum; 00780 int32_t* m_num_lin_feat_plifs_cum; 00782 int32_t m_num_raw_data; 00783 00785 bool m_long_transitions ; 00788 int32_t m_long_transition_threshold ; 00793 //int32_t m_long_transition_max ; 00794 00798 static int32_t word_degree_default[4]; 00799 00803 static int32_t cum_num_words_default[5]; 00804 00807 static int32_t frame_plifs[3]; 00808 00811 static int32_t num_words_default[4]; 00812 00814 static int32_t mod_words_default[32]; 00815 00817 static bool sign_words_default[16]; 00818 00820 static int32_t string_words_default[16]; 00821 }; 00822 } 00823 #endif