00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015
00016 #include "lib/Mathematics.h"
00017 #include "lib/common.h"
00018 #include "base/SGObject.h"
00019 #include "lib/io.h"
00020 #include "lib/config.h"
00021 #include "structure/PlifMatrix.h"
00022 #include "structure/PlifBase.h"
00023 #include "structure/Plif.h"
00024 #include "structure/IntronList.h"
00025 #include "structure/SegmentLoss.h"
00026 #include "features/StringFeatures.h"
00027 #include "features/SparseFeatures.h"
00028 #include "distributions/Distribution.h"
00029 #include "lib/DynamicArray.h"
00030 #include "lib/Array.h"
00031 #include "lib/Array2.h"
00032 #include "lib/Array3.h"
00033 #include "lib/Time.h"
00034
00035 #include <stdio.h>
00036 #include <limits.h>
00037
00038 namespace shogun
00039 {
// Forward declarations of types that CDynProg below refers to only through
// pointers (CSparseFeatures<float64_t>*, CIntronList*, CPlifMatrix*,
// CSegmentLoss*).
00040 template <class T> class CSparseFeatures;
00041 class CIntronList;
00042 class CPlifMatrix;
00043 class CSegmentLoss;
00044
00045
00046
// T_STATES is the integer type used for state indices by the inline
// accessors of CDynProg: 16 bits wide when USE_BIGSTATES is defined,
// 8 bits wide otherwise. The chosen width bounds the largest state index
// that can be represented (65535 or 255 respectively).
00047 #ifdef USE_BIGSTATES
00048 typedef uint16_t T_STATES ;
00049 #else
00050 typedef uint8_t T_STATES ;
00051 #endif
// P_STATES: pointer to T_STATES.
00052 typedef T_STATES* P_STATES ;
00053
/**
 * CDynProg, derived from CSGObject.
 *
 * The class stores a state model (m_transition_matrix_a,
 * m_initial_state_distribution_p, m_end_state_distribution_q and their
 * *_deriv counterparts), sequence/feature data and result arrays
 * (m_scores, m_states, m_positions), and declares the routines that
 * operate on them.
 *
 * Only the declarations are visible in this header. Apart from the small
 * inline accessors, every member function is defined elsewhere, so the
 * notes on those functions are limited to what their signatures show;
 * anything beyond that is marked as an assumption to be confirmed.
 */
00059 class CDynProg : public CSGObject
00060 {
00061 public:
/**
 * Constructor; p_num_svms defaults to 8.
 * NOTE(review): the constructor is not marked explicit, so an int32_t
 * converts implicitly to a CDynProg -- confirm this is intended.
 */
00066 CDynProg(int32_t p_num_svms=8);
/** Virtual destructor (definition not visible here). */
00067 virtual ~CDynProg();
00068
/**
 * Declared only. Takes an iteration limit, an int32_t reference best_iter
 * and an int32_t buffer my_path; returns a float64_t.
 * Presumably best_iter and my_path are outputs -- verify in the definition.
 */
00077 float64_t best_path_no_b(int32_t max_iter, int32_t & best_iter, int32_t *my_path);
00078
/**
 * Declared only. Variant taking nbest plus the buffers prob_nbest and
 * my_paths; returns nothing, so results presumably go through
 * max_best_iter, prob_nbest and my_paths -- verify in the definition.
 */
00087 void best_path_no_b_trans(int32_t max_iter, int32_t & max_best_iter, int16_t nbest, float64_t *prob_nbest, int32_t *my_paths);
00088
00089
/** Declared only; takes the state count N as int32_t. */
00095 void set_num_states(int32_t N);
00096
/** Declared only; returns an int32_t (presumably m_N -- TODO confirm). */
00098 int32_t get_num_states();
00099
/** Declared only; returns an int32_t (presumably m_num_svms -- TODO confirm). */
00101 int32_t get_num_svms();
00102
/** Declared only; takes the number of SVMs. */
00108 void init_content_svm_value_array(const int32_t p_num_svms);
00109
/**
 * Declared only; takes probe positions, intensities and their count.
 * Whether the arrays are copied or the pointers retained is not visible
 * from this header -- TODO confirm.
 */
00117 void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00118
/** Declared only; takes an array of CPlif pointers and an array of plif ids. */
00125 void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);
00126
/** Declared only; takes a feature count (presumably resizes m_lin_feat -- TODO confirm). */
00131 void resize_lin_feat(int32_t num_new_feat);
/**
 * The setters below take raw arrays together with their dimensions.
 * They are declared only; whether the data is copied, and which member
 * each one fills, must be checked against the definitions.
 */
00137 void set_p_vector(float64_t* p, int32_t N);
00138
00144 void set_q_vector(float64_t* q, int32_t N);
00145
/** Array overload of set_a (an element-wise inline overload follows further down). */
00152 void set_a(float64_t* a, int32_t M, int32_t N);
00153
00160 void set_a_id(int32_t *a, int32_t M, int32_t N);
00161
00168 void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00169
00176 void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00177
/** Declared only; returns a bool (the condition it reports is not visible here). */
00183 bool check_svm_arrays();
00184
/** Declared only; takes a flat float64_t array plus an array of ndims dimension sizes. */
00191 void set_observation_matrix(float64_t* seq, int32_t* dims, int32_t ndims);
00192
/** Declared only; returns an int32_t. */
00199 int32_t get_num_positions();
00200
00212 void set_content_type_array(float64_t* seg_path, int32_t rows, int32_t cols);
00213
00219 void set_pos(int32_t* pos, int32_t seq_len);
00220
00228 void set_orf_info(int32_t* orf_info, int32_t m, int32_t n);
00229
00235 void set_gene_string(char* genestr, int32_t genestr_len);
00236
00237
00244 void set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00245
00252 void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2);
00253
00260 void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00261
/** Declared only; takes two CSparseFeatures<float64_t> pointers (ownership not visible here). */
00263 void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00264
/** Declared only; takes a CPlifMatrix pointer (ownership not visible here). */
00269 void set_plif_matrices(CPlifMatrix* pm);
00270
00271
/**
 * The three getters below return data through pointer-to-pointer and
 * size output parameters. Declared only; who owns the returned buffers
 * is not visible from this header -- TODO confirm before freeing them.
 */
00277 void get_scores(float64_t **scores, int32_t *n);
00278
00285 void get_states(int32_t **states, int32_t *m, int32_t *n);
00286
00293 void get_positions(int32_t **positions, int32_t *m, int32_t *n);
00294
00295
/**
 * Declared only. Takes max_num_signals, nbest and three boolean switches
 * (use_orf, with_loss, with_multiple_sequences); returns nothing.
 */
00304 void compute_nbest_paths(int32_t max_num_signals,
00305 bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00306
00308
/**
 * Declared only. Takes a state sequence and a position sequence of
 * length my_seq_len, a read-only seq_array and max_num_signals.
 */
00320 void best_path_trans_deriv(
00321 int32_t* my_state_seq, int32_t *my_pos_seq,
00322 int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00323
00324
/** Declared only; no length parameter is passed, so the expected length must come from elsewhere -- TODO confirm. */
00329 void set_my_state_seq(int32_t* my_state_seq);
00330
/** Declared only; same remark about the missing length parameter applies. */
00335 void set_my_pos_seq(int32_t* my_pos_seq);
00336
/** Declared only; output through my_scores / seq_len (buffer ownership not visible here). */
00344 void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00345
/** Declared only; output through my_losses / seq_len (buffer ownership not visible here). */
00353 void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00354
00355
/**
 * Returns m_N converted to T_STATES.
 * NOTE(review): m_N is declared int32_t while T_STATES is uint8_t
 * (uint16_t with USE_BIGSTATES), so the return narrows the value --
 * confirm m_N always fits in T_STATES.
 */
00357 inline T_STATES get_N() const
00358 {
00359 return m_N ;
00360 }
00361
/** Stores value at index offset of m_end_state_distribution_q. */
00366 inline void set_q(T_STATES offset, float64_t value)
00367 {
00368 m_end_state_distribution_q[offset]=value;
00369 }
00370
/** Stores value at index offset of m_initial_state_distribution_p. */
00375 inline void set_p(T_STATES offset, float64_t value)
00376 {
00377 m_initial_state_distribution_p[offset]=value;
00378 }
00379
/** Stores value in m_transition_matrix_a at element (line_, column). */
00386 inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00387 {
00388 m_transition_matrix_a.element(line_,column)=value;
00389 }
00390
/** Returns the entry of m_end_state_distribution_q at index offset. */
00396 inline float64_t get_q(T_STATES offset) const
00397 {
00398 return m_end_state_distribution_q[offset];
00399 }
00400
/** Returns the entry of m_end_state_distribution_q_deriv at index offset. */
00406 inline float64_t get_q_deriv(T_STATES offset) const
00407 {
00408 return m_end_state_distribution_q_deriv[offset];
00409 }
00410
/** Returns the entry of m_initial_state_distribution_p at index offset. */
00416 inline float64_t get_p(T_STATES offset) const
00417 {
00418 return m_initial_state_distribution_p[offset];
00419 }
00420
/** Returns the entry of m_initial_state_distribution_p_deriv at index offset. */
00426 inline float64_t get_p_deriv(T_STATES offset) const
00427 {
00428 return m_initial_state_distribution_p_deriv[offset];
00429 }
00430
/** Declared only; takes no arguments and returns nothing. */
00434 void precompute_content_values();
00435
/**
 * Writes the two dimensions of m_lin_feat into dim1 and dim2 (via
 * get_array_size) and returns the pointer obtained from
 * m_lin_feat.get_array().
 */
00442 inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2)
00443 {
00444 m_lin_feat.get_array_size(dim1, dim2);
00445 return m_lin_feat.get_array();
00446 }
/**
 * Forwards p_lin_feat with dimensions p_num_svms x p_seq_len to
 * m_lin_feat.set_array, passing true for both trailing flags.
 * The meaning of those flags is defined by CArray2::set_array --
 * presumably copy/free switches; TODO confirm.
 */
00455 inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len)
00456 {
00457 m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00458 }
/** Declared only; takes no arguments and returns nothing. */
00463 void create_word_string();
00464
/** Declared only; takes no arguments and returns nothing. */
00467 void precompute_stop_codons();
00468
/** Returns the element (line_, column) of m_transition_matrix_a. */
00475 inline float64_t get_a(T_STATES line_, T_STATES column) const
00476 {
00477 return m_transition_matrix_a.element(line_, column);
00478 }
00479
/** Returns the element (line_, column) of m_transition_matrix_a_deriv. */
00486 inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00487 {
00488 return m_transition_matrix_a_deriv.element(line_, column);
00489 }
00491
/** Declared only; takes a CIntronList pointer and a plif count (ownership not visible here). */
00496 void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00497
/** Returns the stored m_seg_loss_obj pointer unchanged. */
00499 CSegmentLoss* get_segment_loss_object()
00500 {
00501 return m_seg_loss_obj;
00502 }
00503
/**
 * Stores the three arguments in m_long_transitions,
 * m_long_transition_threshold and m_long_transition_max respectively.
 * No validation is performed here.
 */
00510 void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00511 {
00512 m_long_transitions = use_long_transitions;
00513 m_long_transition_threshold = threshold;
00514 m_long_transition_max = max_len;
00515 }
00516
00517 protected:
00518
00519
00520
/**
 * Declared only. Takes a state pair, a position pair, an output-looking
 * svm_values buffer and a frame index.
 */
00530 void lookup_content_svm_values(const int32_t from_state,
00531 const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00532 float64_t* svm_values, int32_t frame);
00533
// NOTE(review): the next three helpers are declared inline but have no
// body in this header; an inline function must be defined in every
// translation unit that uses it -- confirm where the definitions live.
00541 inline void lookup_tiling_plif_values(const int32_t from_state,
00542 const int32_t to_state, const int32_t len, float64_t* svm_values);
00543
00548 inline int32_t find_frame(const int32_t from_state);
00549
00558 inline int32_t raw_intensities_interval_query(
00559 const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00560
00561 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00562
/**
 * Plain aggregate of two int32_t values and several raw pointer buffers.
 * It is excluded from generated documentation by the surrounding
 * DOXYGEN_SHOULD_SKIP_THIS guard. How the buffers are sized and who
 * frees them is not visible from this header.
 */
00563 struct svm_values_struct
00564 {
00566 int32_t maxlookback;
00568 int32_t seqlen;
00569
00571 int32_t* start_pos;
00573 float64_t ** svm_values_unnormalized;
00575 float64_t * svm_values;
00577 bool *** word_used;
00579 int32_t **num_unique_words;
00580 };
00581 #endif // DOXYGEN_SHOULD_SKIP_THIS
00582
/** Declared only; returns a bool and may update last_pos, which is passed by reference. */
00591 bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00592
/**
 * Returns the literal "DynProg".
 * NOTE(review): this sits in the protected section -- confirm that the
 * reduced access is intended.
 */
00594 inline virtual const char* get_name() const { return "DynProg"; }
00595
00596 private:
00597
// Per-state lists named trans_list_forward*: an array of T_STATES arrays,
// a per-entry count array, and parallel value / id arrays. How they are
// filled and sized is not visible from this header.
00598 T_STATES trans_list_len;
00599 T_STATES **trans_list_forward;
00600 T_STATES *trans_list_forward_cnt;
00601 float64_t **trans_list_forward_val;
00602 int32_t **trans_list_forward_id;
00603 bool mem_initialized;
00604
// Timers and accumulated float64_t timing fields; they exist only when
// DYNPROG_TIMING is defined.
00605 #ifdef DYNPROG_TIMING
00606 CTime MyTime;
00607 CTime MyTime2;
00608 CTime MyTime3;
00609
00610 float64_t segment_init_time;
00611 float64_t segment_pos_time;
00612 float64_t segment_clean_time;
00613 float64_t segment_extend_time;
00614 float64_t orf_time;
00615 float64_t content_time;
00616 float64_t content_penalty_time;
00617 float64_t content_svm_values_time ;
00618 float64_t content_plifs_time ;
00619 float64_t svm_init_time;
00620 float64_t svm_pos_time;
00621 float64_t inner_loop_time;
00622 float64_t inner_loop_max_time ;
00623 float64_t svm_clean_time;
00624 float64_t long_transition_time ;
00625 #endif
00626
00627
00628 protected:
00633
// State count as a full int32_t; get_N() returns it narrowed to T_STATES.
00634 int32_t m_N;
00635
// Matrices read and written by the inline set_a / get_a / get_a_deriv
// accessors, plus an int32_t id matrix.
00637 CArray2<int32_t> m_transition_matrix_a_id;
00638 CArray2<float64_t> m_transition_matrix_a;
00639 CArray2<float64_t> m_transition_matrix_a_deriv;
00640
// Vector accessed by set_p / get_p, and the one accessed by get_p_deriv.
00642 CArray<float64_t> m_initial_state_distribution_p;
00643 CArray<float64_t> m_initial_state_distribution_p_deriv;
00644
// Vector accessed by set_q / get_q, and the one accessed by get_q_deriv.
00646 CArray<float64_t> m_end_state_distribution_q;
00647 CArray<float64_t> m_end_state_distribution_q_deriv;
00648
00650
00652 int32_t m_num_degrees;
00654 int32_t m_num_svms;
00655
// Each CArray member below is paired with a raw pointer of the same
// element type (suffix _array). Presumably the raw pointer aliases the
// CArray's storage -- TODO confirm in the constructor.
00657 CArray<int32_t> m_word_degree;
00659 CArray<int32_t> m_cum_num_words;
00661 int32_t * m_cum_num_words_array;
00663 CArray<int32_t> m_num_words;
00665 int32_t* m_num_words_array;
00667 CArray2<int32_t> m_mod_words;
00669 int32_t* m_mod_words_array;
00671 CArray<bool> m_sign_words;
00673 bool* m_sign_words_array;
00675 CArray<int32_t> m_string_words;
00677 int32_t* m_string_words_array;
00678
00680
00682 CArray<int32_t> m_num_unique_words;
00684 bool m_svm_arrays_clean;
00686 int32_t m_max_a_id;
00687
00688
// Input data containers; presumably filled by the set_* functions with
// matching names above -- TODO confirm in the definitions.
00690 CArray3<float64_t> m_observation_matrix;
00692 CArray<int32_t> m_pos;
00694 int32_t m_seq_len;
00696 CArray2<int32_t> m_orf_info;
00698 CArray2<float64_t> m_segment_sum_weights;
00700 CArray<CPlifBase*> m_plif_list;
00702 CArray2<CPlifBase*> m_PEN;
00704 CArray2<CPlifBase*> m_PEN_state_signals;
00706 CArray<char> m_genestr;
// Triple raw pointer to uint16_t; allocation and layout are not visible here.
00721 uint16_t*** m_wordstr;
00723 CArray2<float64_t> m_dict_weights;
00725 CArray3<float64_t> m_segment_loss;
00727 CArray<int32_t> m_segment_ids;
00729 CArray<float64_t> m_segment_mask;
00731 CArray<int32_t> m_my_state_seq;
00733 CArray<int32_t> m_my_pos_seq;
00735 CArray<float64_t> m_my_scores;
00737 CArray<float64_t> m_my_losses;
00738
// Pointer handed out unchanged by get_segment_loss_object().
00741 CSegmentLoss* m_seg_loss_obj;
00742
00743
// Result containers; presumably what get_scores / get_states /
// get_positions expose -- TODO confirm in the definitions.
00745 CArray<float64_t> m_scores;
00747 CArray2<int32_t> m_states;
00749 CArray2<int32_t> m_positions;
00750
// Raw pointers to externally declared objects; ownership is not visible here.
00752 CSparseFeatures<float64_t>* m_seq_sparse1;
00754 CSparseFeatures<float64_t>* m_seq_sparse2;
00756 CPlifMatrix* m_plif_matrices;
00757
00761 CArray<bool> m_genestr_stop;
00762
00765 CIntronList* m_intron_list;
00766
00768 int32_t m_num_intron_plifs;
00769
// Two-dimensional array exposed through get_lin_feat / set_lin_feat.
00774 CArray2<float64_t> m_lin_feat;
00775
// Raw buffers with no visible owner in this header.
00777 float64_t *m_raw_intensities;
00779 int32_t* m_probe_pos;
00781 int32_t* m_num_probes_cum;
00783 int32_t* m_num_lin_feat_plifs_cum;
00785 int32_t m_num_raw_data;
00786
// The three values written by long_transition_settings().
00788 bool m_long_transitions ;
00791 int32_t m_long_transition_threshold ;
00796 int32_t m_long_transition_max ;
00797
// Fixed-size static tables; they are only declared here, their
// initialisers live in the implementation file.
00801 static int32_t word_degree_default[4];
00802
00806 static int32_t cum_num_words_default[5];
00807
00810 static int32_t frame_plifs[3];
00811
00814 static int32_t num_words_default[4];
00815
00817 static int32_t mod_words_default[32];
00818
00820 static bool sign_words_default[16];
00821
00823 static int32_t string_words_default[16];
00824 };
00825 }
00826 #endif