DynProg.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Gunnar Raetsch
00008  * Written (W) 1999-2009 Soeren Sonnenburg
00009  * Written (W) 2008-2009 Jonas Behr
00010  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00011  */
00012 
00013 #ifndef __CDYNPROG_H__
00014 #define __CDYNPROG_H__
00015 
00016 #include "lib/Mathematics.h"
00017 #include "lib/common.h"
00018 #include "base/SGObject.h"
00019 #include "lib/io.h"
00020 #include "lib/config.h"
00021 #include "structure/PlifMatrix.h"
00022 #include "structure/PlifBase.h"
00023 #include "structure/Plif.h"
00024 #include "structure/IntronList.h"
00025 #include "structure/SegmentLoss.h"
00026 #include "features/StringFeatures.h"
00027 #include "features/SparseFeatures.h"
00028 #include "distributions/Distribution.h"
00029 #include "lib/DynamicArray.h"
00030 #include "lib/Array.h"
00031 #include "lib/Array2.h"
00032 #include "lib/Array3.h"
00033 #include "lib/Time.h"
00034 
00035 #include <stdio.h>
00036 #include <limits.h>
00037 
00038 namespace shogun
00039 {
// Forward declarations of types that CDynProg below refers to only through
// pointers (CSparseFeatures<float64_t>*, CIntronList*, CPlifMatrix*,
// CSegmentLoss*). The corresponding headers are also included above.
00040     template <class T> class CSparseFeatures;
00041     class CIntronList;
00042     class CPlifMatrix;
00043     class CSegmentLoss;
00044 
      // Left commented out: defining DYNPROG_TIMING compiles in the CTime
      // objects and float64_t timing members guarded by
      // "#ifdef DYNPROG_TIMING" inside CDynProg.
00045 //#define DYNPROG_TIMING
00046 
      // T_STATES is the integer type CDynProg's inline accessors take as a
      // state index (get_p, get_q, get_a, ...) and that get_N() returns.
      // It is 16 bits wide when USE_BIGSTATES is defined, 8 bits otherwise.
      // NOTE(review): USE_BIGSTATES is not defined in this header; presumably
      // it comes from the build configuration -- confirm.
00047 #ifdef USE_BIGSTATES
00048 typedef uint16_t T_STATES ;
00049 #else
00050 typedef uint8_t T_STATES ;
00051 #endif
      // Pointer to T_STATES.
00052 typedef T_STATES* P_STATES ;
00053 
/**
 * Dynamic-programming class derived from CSGObject; get_name() reports
 * "DynProg".
 *
 * Only the short inline accessors have bodies in this header. Every other
 * member function is a declaration whose definition lives outside this
 * file, so the notes on those are limited to what the signature shows and
 * are hedged where they go further.
 *
 * NOTE(review): every line carries the listing's own line number as a
 * prefix; the gaps in that numbering presumably mark where the original
 * Doxygen comments were dropped -- confirm against the real DynProg.h.
 */
00059 class CDynProg : public CSGObject
00060 {
00061 public:
          /**
           * Constructor.
           * @param p_num_svms defaults to 8; presumably the number of SVMs
           *        the object is set up for -- TODO confirm in the .cpp
           */
00066     CDynProg(int32_t p_num_svms=8);
          /** Virtual destructor. */
00067     virtual ~CDynProg();
00068 
          /**
           * Declaration only.
           * @param max_iter  iteration limit (by name -- TODO confirm)
           * @param best_iter non-const reference, so presumably an output
           * @param my_path   caller-supplied int32_t buffer, presumably
           *                  receives the path -- TODO confirm required size
           * @return a float64_t; presumably the score of the best path
           */
00077     float64_t best_path_no_b(int32_t max_iter, int32_t & best_iter, int32_t *my_path);
00078 
          /**
           * Declaration only; returns nothing, so results presumably come
           * back through max_best_iter, prob_nbest and my_paths.
           * @param nbest number of paths requested (by name -- TODO confirm)
           */
00087     void best_path_no_b_trans(int32_t max_iter, int32_t & max_best_iter, int16_t nbest, float64_t *prob_nbest, int32_t *my_paths);
00088     
00089     // model related functions
          /** Declaration only; presumably sets the number of states to N. */
00095     void set_num_states(int32_t N);
00096 
          /** Declaration only; returns an int32_t. Not declared const. */
00098     int32_t get_num_states();
00099 
          /** Declaration only; returns an int32_t. Not declared const. */
00101     int32_t get_num_svms();
00102 
          /** Declaration only; takes the SVM count by value. */
00108     void init_content_svm_value_array(const int32_t p_num_svms);
00109 
          /**
           * Declaration only.
           * NOTE(review): the class also holds raw pointers m_probe_pos and
           * m_raw_intensities; whether this call copies or keeps the caller's
           * buffers is not visible here -- confirm ownership.
           */
00117     void init_tiling_data(int32_t* probe_pos, float64_t* intensities, const int32_t num_probes);
00118 
          /**
           * Declaration only.
           * @param PEN              array of CPlif pointers
           * @param tiling_plif_ids  read-only id array
           * @param num_tiling_plifs presumably the length of tiling_plif_ids
           */
00125     void precompute_tiling_plifs(CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs);  
00126 
          /** Declaration only; presumably resizes m_lin_feat -- TODO confirm. */
00131     void resize_lin_feat(int32_t num_new_feat);
          /** Declaration only; bulk setter, see also the element-wise set_p(). */
00137     void set_p_vector(float64_t* p, int32_t N);
00138 
          /** Declaration only; bulk setter, see also the element-wise set_q(). */
00144     void set_q_vector(float64_t* q, int32_t N);
00145     
          /**
           * Declaration only; pointer-plus-dimensions overload of set_a.
           * The element-wise overload set_a(T_STATES, T_STATES, float64_t)
           * is defined inline further down.
           */
00152     void set_a(float64_t* a, int32_t M, int32_t N);
00153     
          /** Declaration only; int32_t counterpart of set_a (cf. m_transition_matrix_a_id). */
00160     void set_a_id(int32_t *a, int32_t M, int32_t N);
00161     
          /** Declaration only. */
00168     void set_a_trans_matrix(float64_t *a_trans, int32_t num_trans, int32_t N);
00169 
          /** Declaration only. */
00176     void init_mod_words_array(int32_t * p_mod_words_array, int32_t num_elem, int32_t num_columns);
00177 
          /** Declaration only; returns a bool (cf. m_svm_arrays_clean) -- semantics not visible here. */
00183     bool check_svm_arrays();
00184 
          /**
           * Declaration only.
           * @param dims  presumably holds ndims extents -- TODO confirm
           * NOTE(review): m_observation_matrix is a CArray3, which suggests
           * ndims is expected to be 3 -- confirm.
           */
00191     void set_observation_matrix(float64_t* seq, int32_t* dims, int32_t ndims);
00192 
          /** Declaration only; returns an int32_t. Not declared const. */
00199     int32_t get_num_positions();
00200 
          /** Declaration only. */
00212     void set_content_type_array(float64_t* seg_path, int32_t rows, int32_t cols);
00213 
          /** Declaration only; cf. members m_pos and m_seq_len. */
00219     void set_pos(int32_t* pos, int32_t seq_len);
00220 
          /** Declaration only; cf. member m_orf_info (CArray2<int32_t>). */
00228     void set_orf_info(int32_t* orf_info, int32_t m, int32_t n);
00229 
          /** Declaration only; cf. member m_genestr (CArray<char>). */
00235     void set_gene_string(char* genestr, int32_t genestr_len);
00236 
00237 
          /** Declaration only; cf. member m_dict_weights (CArray2<float64_t>). */
00244     void set_dict_weights(float64_t* dictionary_weights, int32_t dict_len, int32_t n);
00245 
          /** Declaration only; cf. member m_segment_loss. */
00252     void best_path_set_segment_loss(float64_t * segment_loss, int32_t num_segment_id1, int32_t num_segment_id2);
00253 
          /**
           * Declaration only; cf. members m_segment_ids and m_segment_mask.
           * @param m presumably the common length of both arrays -- TODO confirm
           */
00260     void best_path_set_segment_ids_mask(int32_t* segment_ids, float64_t* segment_mask, int32_t m);
00261 
          /** Declaration only; cf. members m_seq_sparse1 and m_seq_sparse2. */
00263     void set_sparse_features(CSparseFeatures<float64_t>* seq_sparse1, CSparseFeatures<float64_t>* seq_sparse2);
00264 
          /** Declaration only; cf. member m_plif_matrices. */
00269     void set_plif_matrices(CPlifMatrix* pm);
00270 
00271     // best_path result retrieval functions
          /**
           * Declaration only; the pointer-to-pointer plus size pointer shape
           * suggests an array handed back through the arguments.
           * NOTE(review): who owns *scores afterwards is not visible -- confirm.
           */
00277     void get_scores(float64_t **scores, int32_t *n);
00278 
          /** Declaration only; same output-argument shape as get_scores, with two extents m and n. */
00285     void get_states(int32_t **states, int32_t *m, int32_t *n);
00286 
          /** Declaration only; same output-argument shape as get_states. */
00293     void get_positions(int32_t **positions, int32_t *m, int32_t *n);
00294 
00295 
          /**
           * Declaration only; returns nothing, so results are presumably
           * read back through get_scores/get_states/get_positions -- confirm.
           */
00304     void compute_nbest_paths(int32_t max_num_signals,
00305                          bool use_orf, int16_t nbest, bool with_loss, bool with_multiple_sequences);
00306 
          /**
           * Declaration only.
           * @param my_seq_len presumably the length of my_state_seq and
           *                   my_pos_seq -- TODO confirm
           * @param seq_array  read-only float64_t data
           */
00320     void best_path_trans_deriv(
00321             int32_t* my_state_seq, int32_t *my_pos_seq,
00322             int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals);
00323 
00324     // additional best_path_trans_deriv functions
          /**
           * Declaration only; cf. member m_my_state_seq.
           * NOTE(review): no length argument -- the expected length is not
           * visible here.
           */
00329     void set_my_state_seq(int32_t* my_state_seq);
00330 
          /** Declaration only; cf. member m_my_pos_seq. No length argument either. */
00335     void set_my_pos_seq(int32_t* my_pos_seq);
00336 
          /** Declaration only; same output-argument shape as get_scores. */
00344     void get_path_scores(float64_t** my_scores, int32_t* seq_len);
00345 
          /** Declaration only; same output-argument shape as get_scores. */
00353     void get_path_losses(float64_t** my_losses, int32_t* seq_len);
00354 
00355 
          /**
           * Returns m_N converted to T_STATES.
           * NOTE(review): m_N is an int32_t while T_STATES is uint8_t (or
           * uint16_t with USE_BIGSTATES), so the return narrows; a value of
           * m_N outside the T_STATES range would not come back unchanged.
           */
00357     inline T_STATES get_N() const
00358     {
00359         return m_N ;
00360     }
00361     
          /**
           * Stores value at index offset of m_end_state_distribution_q.
           * No range check is made here.
           */
00366     inline void set_q(T_STATES offset, float64_t value)
00367     {
00368         m_end_state_distribution_q[offset]=value;
00369     }
00370 
          /**
           * Stores value at index offset of m_initial_state_distribution_p.
           * No range check is made here.
           */
00375     inline void set_p(T_STATES offset, float64_t value)
00376     {
00377         m_initial_state_distribution_p[offset]=value;
00378     }
00379 
          /**
           * Stores value into m_transition_matrix_a at (line_, column) via
           * CArray2::element. Element-wise overload of set_a(float64_t*, ...).
           */
00386     inline void set_a(T_STATES line_, T_STATES column, float64_t value)
00387     {
00388       m_transition_matrix_a.element(line_,column)=value; // look also best_path!
00389     }
00390 
          /** Returns the entry at index offset of m_end_state_distribution_q. */
00396     inline float64_t get_q(T_STATES offset) const
00397     {
00398         return m_end_state_distribution_q[offset];
00399     }
00400 
          /** Returns the entry at index offset of m_end_state_distribution_q_deriv. */
00406     inline float64_t get_q_deriv(T_STATES offset) const
00407     {
00408         return m_end_state_distribution_q_deriv[offset];
00409     }
00410 
          /** Returns the entry at index offset of m_initial_state_distribution_p. */
00416     inline float64_t get_p(T_STATES offset) const
00417     {
00418         return m_initial_state_distribution_p[offset];
00419     }
00420 
          /** Returns the entry at index offset of m_initial_state_distribution_p_deriv. */
00426     inline float64_t get_p_deriv(T_STATES offset) const
00427     {
00428         return m_initial_state_distribution_p_deriv[offset];
00429     }
00430     
          /** Declaration only; takes no arguments, so it works on member state. */
00434     void precompute_content_values();
00435 
          /**
           * Passes dim1 and dim2 to m_lin_feat.get_array_size() and returns
           * m_lin_feat.get_array().
           * @param dim1 reference handed to get_array_size; presumably
           *             receives the first extent -- TODO confirm in Array2.h
           * @param dim2 likewise for the second extent
           * @return whatever pointer CArray2 exposes; presumably the internal
           *         buffer rather than a copy -- TODO confirm
           */
00442     inline float64_t* get_lin_feat(int32_t & dim1, int32_t & dim2) 
00443     {
00444         m_lin_feat.get_array_size(dim1, dim2);
00445         return m_lin_feat.get_array();
00446     }
          /**
           * Forwards to m_lin_feat.set_array() with p_num_svms and p_seq_len
           * as the two extents.
           * NOTE(review): the two trailing true arguments are CArray2 flags
           * whose meaning is not visible in this file -- check Array2.h
           * before changing them.
           */
00455     inline void set_lin_feat(float64_t* p_lin_feat, int32_t p_num_svms, int32_t p_seq_len) 
00456     {
00457       m_lin_feat.set_array(p_lin_feat, p_num_svms, p_seq_len, true, true);
00458     }
          /** Declaration only; cf. member m_wordstr -- TODO confirm the link. */
00463     void create_word_string();
00464 
          /** Declaration only; cf. member m_genestr_stop -- TODO confirm the link. */
00467     void precompute_stop_codons();
00468 
          /** Returns the entry of m_transition_matrix_a at (line_, column). */
00475     inline float64_t get_a(T_STATES line_, T_STATES column) const
00476     {
00477       return m_transition_matrix_a.element(line_, column); // look also best_path()!
00478     }
00479 
          /** Returns the entry of m_transition_matrix_a_deriv at (line_, column). */
00486     inline float64_t get_a_deriv(T_STATES line_, T_STATES column) const
00487     {
00488       return m_transition_matrix_a_deriv.element(line_, column); // look also best_path()!
00489     }
00491 
          /** Declaration only; cf. members m_intron_list and m_num_intron_plifs. */
00496     void set_intron_list(CIntronList* intron_list, int32_t num_plifs);
00497 
          /**
           * Returns the stored m_seg_loss_obj pointer as is.
           * NOTE(review): no reference counting happens in this accessor;
           * whether callers are expected to ref/unref is not visible here.
           */
00499     CSegmentLoss* get_segment_loss_object()
00500     {
00501         return m_seg_loss_obj;
00502     }
00503 
          /**
           * Stores the three arguments into m_long_transitions,
           * m_long_transition_threshold and m_long_transition_max
           * respectively; nothing is validated.
           */
00510     void long_transition_settings(bool use_long_transitions, int32_t threshold, int32_t max_len)
00511     {
00512         m_long_transitions = use_long_transitions;
00513         m_long_transition_threshold = threshold;
00514         m_long_transition_max = max_len;
00515     }
00516         
00517 protected:
00518 
00519     /* helper functions */
00520 
          /**
           * Declaration only.
           * @param svm_values caller-supplied float64_t buffer, presumably an
           *                   output -- TODO confirm required length
           */
00530     void lookup_content_svm_values(const int32_t from_state,
00531         const int32_t to_state, const int32_t from_pos, const int32_t to_pos,
00532         float64_t* svm_values, int32_t frame);
00533 
          /**
           * Declared inline but not defined in this header.
           * NOTE(review): an inline function must be defined in every
           * translation unit that calls it, so this presumably is only
           * called from the file that defines it -- confirm.
           */
00541     inline void lookup_tiling_plif_values(const int32_t from_state,
00542         const int32_t to_state, const int32_t len, float64_t* svm_values);
00543 
          /** Declared inline but not defined in this header; returns an int32_t. */
00548     inline int32_t find_frame(const int32_t from_state);
00549 
          /** Declared inline but not defined in this header; returns an int32_t. */
00558     inline int32_t raw_intensities_interval_query(
00559         const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type);
00560 
00561 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00562 
          /**
           * Plain aggregate of lookup state. It is not referenced anywhere
           * else in this header, so its use is in code outside this file.
           * All pointer fields are raw; ownership is not visible here.
           */
00563     struct svm_values_struct
00564     {
00566         int32_t maxlookback;
00568         int32_t seqlen;
00569 
00571         int32_t* start_pos;
00573         float64_t ** svm_values_unnormalized;
00575         float64_t * svm_values;
00577         bool *** word_used;
00579         int32_t **num_unique_words;
00580     };
00581 #endif // DOXYGEN_SHOULD_SKIP_THIS
00582 
          /**
           * Declaration only; returns a bool.
           * @param last_pos non-const reference, so presumably updated by
           *                 the call -- TODO confirm
           */
00591     bool extend_orf(int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos, int32_t to);
00592 
          /** Returns the string literal "DynProg". */
00594     inline virtual const char* get_name() const { return "DynProg"; }
00595 
00596 private:
00597 
          // Transition lists held as raw pointers. trans_list_len is
          // presumably the outer length of the arrays below -- TODO confirm.
          // Allocation and release are not visible in this header.
00598     T_STATES trans_list_len;
00599     T_STATES **trans_list_forward;
00600     T_STATES *trans_list_forward_cnt;
00601     float64_t **trans_list_forward_val;
00602     int32_t **trans_list_forward_id;
          // Presumably records whether the buffers above have been
          // allocated -- TODO confirm.
00603     bool mem_initialized;
00604 
          // Timing instrumentation, compiled in only when DYNPROG_TIMING is
          // defined.
00605 #ifdef DYNPROG_TIMING
00606     CTime MyTime;
00607     CTime MyTime2;
00608     CTime MyTime3;
00609     
00610     float64_t segment_init_time;
00611     float64_t segment_pos_time;
00612     float64_t segment_clean_time;
00613     float64_t segment_extend_time;
00614     float64_t orf_time;
00615     float64_t content_time;
00616     float64_t content_penalty_time;
00617     float64_t content_svm_values_time ;
00618     float64_t content_plifs_time ;  
00619     float64_t svm_init_time;
00620     float64_t svm_pos_time;
00621     float64_t inner_loop_time;
00622     float64_t inner_loop_max_time ; 
00623     float64_t svm_clean_time;
00624     float64_t long_transition_time ;
00625 #endif
00626     
00627 
00628 protected:
00633 
          // Value returned (narrowed to T_STATES) by get_N(); presumably the
          // number of states -- TODO confirm.
00634     int32_t m_N;
00635 
          // Matrices read and written by get_a()/set_a()/get_a_deriv().
00637     CArray2<int32_t> m_transition_matrix_a_id;
00638     CArray2<float64_t> m_transition_matrix_a;
00639     CArray2<float64_t> m_transition_matrix_a_deriv;
00640 
          // Arrays behind get_p()/set_p()/get_p_deriv().
00642     CArray<float64_t> m_initial_state_distribution_p;
00643     CArray<float64_t> m_initial_state_distribution_p_deriv;
00644 
          // Arrays behind get_q()/set_q()/get_q_deriv().
00646     CArray<float64_t> m_end_state_distribution_q;
00647     CArray<float64_t> m_end_state_distribution_q_deriv;
00648 
00650         
00652     int32_t m_num_degrees;
00654     int32_t m_num_svms;
00655 
          // Each CArray below is declared next to a raw pointer with the same
          // base name plus "_array". NOTE(review): presumably the raw pointer
          // aliases the CArray's storage -- confirm before freeing either.
00657     CArray<int32_t> m_word_degree;
00659     CArray<int32_t> m_cum_num_words;
00661     int32_t * m_cum_num_words_array;
00663     CArray<int32_t> m_num_words;
00665     int32_t* m_num_words_array;
00667     CArray2<int32_t> m_mod_words;
00669     int32_t* m_mod_words_array;
00671     CArray<bool> m_sign_words;
00673     bool* m_sign_words_array;
00675     CArray<int32_t> m_string_words;
00677     int32_t* m_string_words_array;
00678 
00680 //  CArray<int32_t> m_svm_pos_start;
00682     CArray<int32_t> m_num_unique_words;
00684     bool m_svm_arrays_clean;
00686     int32_t m_max_a_id;
00687     
00688     // input arguments
00690     CArray3<float64_t> m_observation_matrix;
00692     CArray<int32_t> m_pos;
00694     int32_t m_seq_len; 
00696     CArray2<int32_t> m_orf_info;
00698     CArray2<float64_t> m_segment_sum_weights;
          // Containers of CPlifBase pointers; who owns the pointees is not
          // visible in this header.
00700     CArray<CPlifBase*> m_plif_list;
00702     CArray2<CPlifBase*> m_PEN;
00704     CArray2<CPlifBase*> m_PEN_state_signals;
00706     CArray<char> m_genestr;
          // Triple raw pointer to uint16_t; extents and ownership are not
          // visible here.
00721     uint16_t*** m_wordstr;
00723     CArray2<float64_t> m_dict_weights;
00725     CArray3<float64_t> m_segment_loss;
00727     CArray<int32_t> m_segment_ids;
00729     CArray<float64_t> m_segment_mask;
00731     CArray<int32_t> m_my_state_seq;
00733     CArray<int32_t> m_my_pos_seq;
00735     CArray<float64_t> m_my_scores;
00737     CArray<float64_t> m_my_losses;
00738 
          // Pointer handed out unchanged by get_segment_loss_object().
00741     CSegmentLoss* m_seg_loss_obj;
00742 
00743     // output arguments
00745     CArray<float64_t> m_scores;
00747     CArray2<int32_t> m_states;
00749     CArray2<int32_t> m_positions;
00750 
00752     CSparseFeatures<float64_t>* m_seq_sparse1;
00754     CSparseFeatures<float64_t>* m_seq_sparse2;
00756     CPlifMatrix* m_plif_matrices;
00757 
00761     CArray<bool> m_genestr_stop;
00762 
00765     CIntronList* m_intron_list;
00766 
00768     int32_t m_num_intron_plifs;
00769 
          // Array exposed through get_lin_feat()/set_lin_feat().
00774     CArray2<float64_t> m_lin_feat;
00775 
          // Raw pointers; allocation and release are not visible here.
00777     float64_t *m_raw_intensities;
00779     int32_t* m_probe_pos;
00781     int32_t* m_num_probes_cum;
00783     int32_t* m_num_lin_feat_plifs_cum;
00785     int32_t m_num_raw_data;
00786 
          // The three values written by long_transition_settings().
00788     bool m_long_transitions ;
00791     int32_t m_long_transition_threshold  ;
00796     int32_t m_long_transition_max ;
00797 
          // Static tables: declared here with fixed sizes, defined (and
          // given their values) outside this header.
00801     static int32_t word_degree_default[4];
00802 
00806     static int32_t cum_num_words_default[5];
00807 
00810     static int32_t frame_plifs[3];
00811 
00814     static int32_t num_words_default[4];
00815 
00817     static int32_t mod_words_default[32];
00818 
00820     static bool sign_words_default[16];
00821 
00823     static int32_t string_words_default[16];
00824 };
00825 }
00826 #endif

SHOGUN Machine Learning Toolbox - Documentation