SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
DynProg.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 2 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 1999-2007 Soeren Sonnenburg
8  * Written (W) 1999-2007 Gunnar Raetsch
9  * Written (W) 2008-2009 Jonas Behr
10  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
11  */
12 
15 #include <shogun/io/SGIO.h>
16 #include <shogun/lib/config.h>
19 #include <shogun/structure/Plif.h>
21 #include <shogun/lib/Array.h>
22 #include <shogun/lib/Array2.h>
23 #include <shogun/lib/Array3.h>
24 
25 #include <stdlib.h>
26 #include <stdio.h>
27 #include <time.h>
28 #include <ctype.h>
29 
30 using namespace shogun;
31 
32 //#define USE_TMP_ARRAYCLASS
33 //#define DYNPROG_DEBUG
34 
35 int32_t CDynProg::word_degree_default[4]={3,4,5,6} ;
36 int32_t CDynProg::cum_num_words_default[5]={0,64,320,1344,5440} ;
37 int32_t CDynProg::frame_plifs[3]={4,5,6};
38 int32_t CDynProg::num_words_default[4]= {64,256,1024,4096} ;
39 int32_t CDynProg::mod_words_default[32] = {1,1,1,1,1,1,1,1,
40  1,1,1,1,1,1,1,1,
41  0,0,0,0,0,0,0,0,
42  0,0,0,0,0,0,0,0} ;
43 bool CDynProg::sign_words_default[16] = {true,true,true,true,true,true,true,true,
44  false,false,false,false,false,false,false,false} ; // whether to use counts or signum of counts
45 int32_t CDynProg::string_words_default[16] = {0,0,0,0,0,0,0,0,
46  1,1,1,1,1,1,1,1} ; // which string should be used
47 
48 CDynProg::CDynProg(int32_t num_svms /*= 8 */)
49 : CSGObject(), m_transition_matrix_a_id(1,1), m_transition_matrix_a(1,1),
50  m_transition_matrix_a_deriv(1,1), m_initial_state_distribution_p(1),
51  m_initial_state_distribution_p_deriv(1), m_end_state_distribution_q(1),
52  m_end_state_distribution_q_deriv(1),
53 
54  // multi svm
55  m_num_degrees(4),
56  m_num_svms(num_svms),
57  m_word_degree(word_degree_default, m_num_degrees, true, true),
58  m_cum_num_words(cum_num_words_default, m_num_degrees+1, true, true),
59  m_cum_num_words_array(m_cum_num_words.get_array()),
60  m_num_words(num_words_default, m_num_degrees, true, true),
61  m_num_words_array(m_num_words.get_array()),
62  m_mod_words(mod_words_default, m_num_svms, 2, true, true),
63  m_mod_words_array(m_mod_words.get_array()),
64  m_sign_words(sign_words_default, m_num_svms, true, true),
65  m_sign_words_array(m_sign_words.get_array()),
66  m_string_words(string_words_default, m_num_svms, true, true),
67  m_string_words_array(m_string_words.get_array()),
68  //m_svm_pos_start(m_num_degrees),
69  m_num_unique_words(m_num_degrees),
70  m_svm_arrays_clean(true),
71 
72  m_max_a_id(0), m_observation_matrix(1,1,1),
73  m_pos(1),
74  m_seq_len(0),
75  m_orf_info(1,2),
76  m_plif_list(1),
77  m_PEN(1,1),
78  m_genestr(1), m_wordstr(NULL), m_dict_weights(1,1), m_segment_loss(1,1,2),
79  m_segment_ids(1),
80  m_segment_mask(1),
81  m_my_state_seq(1),
82  m_my_pos_seq(1),
83  m_my_scores(1),
84  m_my_losses(1),
85  m_scores(1),
86  m_states(1,1),
87  m_positions(1,1),
88 
89  m_seq_sparse1(NULL),
90  m_seq_sparse2(NULL),
91  m_plif_matrices(NULL),
92 
93  m_genestr_stop(1),
94  m_intron_list(NULL),
95  m_num_intron_plifs(0),
96  m_lin_feat(1,1), //by Jonas
97  m_raw_intensities(NULL),
98  m_probe_pos(NULL),
99  m_num_probes_cum(NULL),
100  m_num_lin_feat_plifs_cum(NULL),
101  m_num_raw_data(0),
102 
103  m_long_transitions(true),
104  m_long_transition_threshold(1000)
105 {
106  trans_list_forward = NULL ;
107  trans_list_forward_cnt = NULL ;
108  trans_list_forward_val = NULL ;
109  trans_list_forward_id = NULL ;
110  trans_list_len = 0 ;
111 
112  mem_initialized = true ;
113 
114  m_N=1;
115 
116  m_raw_intensities = NULL;
117  m_probe_pos = NULL;
118  m_num_probes_cum = SG_MALLOC(int32_t, 100);
119  m_num_probes_cum[0] = 0;
120  //m_use_tiling=false;
121  m_num_lin_feat_plifs_cum = SG_MALLOC(int32_t, 100);
123  m_num_raw_data = 0;
124 #ifdef ARRAY_STATISTICS
125  m_word_degree.set_array_name("word_degree");
126 #endif
127 
128  m_transition_matrix_a_id.set_array_name("transition_matrix_a_id");
129  m_transition_matrix_a.set_array_name("transition_matrix_a");
130  m_transition_matrix_a_deriv.set_array_name("transition_matrix_a_deriv");
131  m_mod_words.set_array_name("mod_words");
132  m_orf_info.set_array_name("orf_info");
133  m_segment_sum_weights.set_array_name("segment_sum_weights");
134  m_PEN.set_array_name("PEN");
135  m_PEN_state_signals.set_array_name("PEN_state_signals");
136  m_dict_weights.set_array_name("dict_weights");
137  m_states.set_array_name("states");
138  m_positions.set_array_name("positions");
139  m_lin_feat.set_array_name("lin_feat");
140 
141 
142  m_observation_matrix.set_array_name("m_observation_matrix");
143  m_segment_loss.set_array_name("m_segment_loss");
145 }
146 
148 {
149  if (trans_list_forward_cnt)
150  SG_FREE(trans_list_forward_cnt);
151  if (trans_list_forward)
152  {
153  for (int32_t i=0; i<trans_list_len; i++)
154  {
155  if (trans_list_forward[i])
156  SG_FREE(trans_list_forward[i]);
157  }
158  SG_FREE(trans_list_forward);
159  }
160  if (trans_list_forward_val)
161  {
162  for (int32_t i=0; i<trans_list_len; i++)
163  {
164  if (trans_list_forward_val[i])
165  SG_FREE(trans_list_forward_val[i]);
166  }
167  SG_FREE(trans_list_forward_val);
168  }
169  if (trans_list_forward_id)
170  {
171  for (int32_t i=0; i<trans_list_len; i++)
172  {
173  if (trans_list_forward_id[i])
174  SG_FREE(trans_list_forward_id[i]);
175  }
176  SG_FREE(trans_list_forward_id);
177  }
178  if (m_raw_intensities)
180  if (m_probe_pos)
182  if (m_num_probes_cum)
186 
187  delete m_intron_list;
188 
192 }
193 
196 {
197  return m_num_svms;
198 }
199 
201 {
202  int32_t length=m_genestr.get_dim1();
203 
204  m_genestr_stop.resize_array(length) ;
205  m_genestr_stop.zero() ;
206  m_genestr_stop.set_array_name("genestr_stop") ;
207  {
208  for (int32_t i=0; i<length-2; i++)
209  if ((m_genestr[i]=='t' || m_genestr[i]=='T') &&
210  (((m_genestr[i+1]=='a' || m_genestr[i+1]=='A') &&
211  (m_genestr[i+2]=='a' || m_genestr[i+2]=='g' || m_genestr[i+2]=='A' || m_genestr[i+2]=='G')) ||
212  ((m_genestr[i+1]=='g'||m_genestr[i+1]=='G') && (m_genestr[i+2]=='a' || m_genestr[i+2]=='A') )))
213  {
214  m_genestr_stop.element(i)=true ;
215  }
216  else
217  m_genestr_stop.element(i)=false ;
218  m_genestr_stop.element(length-2)=false ;
219  m_genestr_stop.element(length-1)=false ;
220  }
221 }
222 
223 void CDynProg::set_num_states(int32_t p_N)
224 {
225  m_N=p_N ;
226 
234 
237 }
238 
240 {
241  return m_N;
242 }
243 
245  int32_t* probe_pos, float64_t* intensities, const int32_t num_probes)
246 {
247  m_num_raw_data++;
249 
250  int32_t* tmp_probe_pos = SG_MALLOC(int32_t, m_num_probes_cum[m_num_raw_data]);
251  float64_t* tmp_raw_intensities = SG_MALLOC(float64_t, m_num_probes_cum[m_num_raw_data]);
252 
253 
254  if (m_num_raw_data==1){
255  memcpy(tmp_probe_pos, probe_pos, num_probes*sizeof(int32_t));
256  memcpy(tmp_raw_intensities, intensities, num_probes*sizeof(float64_t));
257  //SG_PRINT("raw_intens:%f \n",*tmp_raw_intensities+2);
258  }else{
259  memcpy(tmp_probe_pos, m_probe_pos, m_num_probes_cum[m_num_raw_data-1]*sizeof(int32_t));
260  memcpy(tmp_raw_intensities, m_raw_intensities, m_num_probes_cum[m_num_raw_data-1]*sizeof(float64_t));
261  memcpy(tmp_probe_pos+m_num_probes_cum[m_num_raw_data-1], probe_pos, num_probes*sizeof(int32_t));
262  memcpy(tmp_raw_intensities+m_num_probes_cum[m_num_raw_data-1], intensities, num_probes*sizeof(float64_t));
263  }
266  m_probe_pos = tmp_probe_pos; //SG_MALLOC(int32_t, num_probes);
267  m_raw_intensities = tmp_raw_intensities;//SG_MALLOC(float64_t, num_probes);
268 
269  //memcpy(m_probe_pos, probe_pos, num_probes*sizeof(int32_t));
270  //memcpy(m_raw_intensities, intensities, num_probes*sizeof(float64_t));
271 
272 }
273 
274 void CDynProg::init_content_svm_value_array(const int32_t p_num_svms)
275 {
276  m_lin_feat.resize_array(p_num_svms, m_seq_len);
277 
278  // initialize array
279  for (int s=0; s<p_num_svms; s++)
280  for (int p=0; p<m_seq_len; p++)
281  m_lin_feat.set_element(0.0, s, p) ;
282 }
283 
284 void CDynProg::resize_lin_feat(const int32_t num_new_feat)
285 {
286  int32_t dim1, dim2;
287  m_lin_feat.get_array_size(dim1, dim2);
289  ASSERT(dim2==m_seq_len); // == number of candidate positions
290 
291 
292 
293  float64_t* arr = m_lin_feat.get_array();
294  float64_t* tmp = SG_MALLOC(float64_t, (dim1+num_new_feat)*dim2);
295  memset(tmp, 0, (dim1+num_new_feat)*dim2*sizeof(float64_t)) ;
296  for(int32_t j=0;j<m_seq_len;j++)
297  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data-1];k++)
298  tmp[j*(dim1+num_new_feat)+k] = arr[j*dim1+k];
299 
300  m_lin_feat.set_array(tmp, dim1+num_new_feat,dim2, true, true);// copy array and free it later
301  SG_FREE(tmp);
302 
303  /*for(int32_t j=0;j<5;j++)
304  {
305  for(int32_t k=0;k<m_num_lin_feat_plifs_cum[m_num_raw_data];k++)
306  {
307  SG_PRINT("(%i,%i)%f ",k,j,m_lin_feat.get_element(k,j));
308  }
309  SG_PRINT("\n");
310  }
311  m_lin_feat.get_array_size(dim1,dim2);
312  SG_PRINT("resize_lin_feat: dim1:%i, dim2:%i\n",dim1,dim2);*/
313 
314  //SG_PRINT("resize_lin_feat: done\n");
315 }
316 
318  CPlif** PEN, const int32_t* tiling_plif_ids, const int32_t num_tiling_plifs)
319 {
321  float64_t* tiling_plif = SG_MALLOC(float64_t, num_tiling_plifs);
324  svm_value[i]=0.0;
325  int32_t* tiling_rows = SG_MALLOC(int32_t, num_tiling_plifs);
326  for (int32_t i=0; i<num_tiling_plifs; i++)
327  {
328  tiling_plif[i]=0.0;
329  CPlif * plif = PEN[tiling_plif_ids[i]];
330  tiling_rows[i] = plif->get_use_svm();
331 
332  ASSERT(tiling_rows[i]-1==m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i)
333  }
334  resize_lin_feat(num_tiling_plifs);
335 
336 
337  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[m_num_raw_data-1]];
338  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[m_num_raw_data-1]];
339  int32_t num=m_num_probes_cum[m_num_raw_data-1];
340 
341  for (int32_t pos_idx=0;pos_idx<m_seq_len;pos_idx++)
342  {
343  while (num<m_num_probes_cum[m_num_raw_data]&&*p_tiling_pos<m_pos[pos_idx])
344  {
345  for (int32_t i=0; i<num_tiling_plifs; i++)
346  {
347  svm_value[m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i]=*p_tiling_data;
348  CPlif * plif = PEN[tiling_plif_ids[i]];
349  ASSERT(m_num_lin_feat_plifs_cum[m_num_raw_data-1]+i==plif->get_use_svm()-1);
350  plif->set_do_calc(true);
351  tiling_plif[i]+=plif->lookup_penalty(0,svm_value);
352  plif->set_do_calc(false);
353  }
354  p_tiling_data++;
355  p_tiling_pos++;
356  num++;
357  }
358  for (int32_t i=0; i<num_tiling_plifs; i++)
359  m_lin_feat.set_element(tiling_plif[i],tiling_rows[i]-1,pos_idx);
360  }
361  SG_FREE(svm_value);
362  SG_FREE(tiling_plif);
363  SG_FREE(tiling_rows);
364 }
365 
367 {
369  m_wordstr=SG_MALLOC(uint16_t**, 5440);
370  int32_t k=0;
371  int32_t genestr_len=m_genestr.get_dim1();
372 
373  m_wordstr[k]=SG_MALLOC(uint16_t*, m_num_degrees);
374  for (int32_t j=0; j<m_num_degrees; j++)
375  {
376  m_wordstr[k][j]=NULL ;
377  {
378  m_wordstr[k][j]=SG_MALLOC(uint16_t, genestr_len);
379  for (int32_t i=0; i<genestr_len; i++)
380  switch (m_genestr[i])
381  {
382  case 'A':
383  case 'a': m_wordstr[k][j][i]=0 ; break ;
384  case 'C':
385  case 'c': m_wordstr[k][j][i]=1 ; break ;
386  case 'G':
387  case 'g': m_wordstr[k][j][i]=2 ; break ;
388  case 'T':
389  case 't': m_wordstr[k][j][i]=3 ; break ;
390  default: ASSERT(0) ;
391  }
393  }
394  }
395 }
396 
398 {
399  for (int32_t s=0; s<m_num_svms; s++)
400  m_lin_feat.set_element(0.0, s, 0);
401 
402  for (int32_t p=0 ; p<m_seq_len-1 ; p++)
403  {
404  int32_t from_pos = m_pos[p];
405  int32_t to_pos = m_pos[p+1];
406  float64_t* my_svm_values_unnormalized = SG_MALLOC(float64_t, m_num_svms);
407  //SG_PRINT("%i(%i->%i) ",p,from_pos, to_pos);
408 
409  ASSERT(from_pos<=m_genestr.get_dim1());
410  ASSERT(to_pos<=m_genestr.get_dim1());
411 
412  for (int32_t s=0; s<m_num_svms; s++)
413  my_svm_values_unnormalized[s]=0.0;//precomputed_svm_values.element(s,p);
414 
415  for (int32_t i=from_pos; i<to_pos; i++)
416  {
417  for (int32_t j=0; j<m_num_degrees; j++)
418  {
419  uint16_t word = m_wordstr[0][j][i] ;
420  for (int32_t s=0; s<m_num_svms; s++)
421  {
422  // check if this k-mer should be considered for this SVM
423  if (m_mod_words.get_element(s,0)==3 && i%3!=m_mod_words.get_element(s,1))
424  continue;
425  my_svm_values_unnormalized[s] += m_dict_weights[(word+m_cum_num_words_array[j])+s*m_cum_num_words_array[m_num_degrees]] ;
426  }
427  }
428  }
429  for (int32_t s=0; s<m_num_svms; s++)
430  {
431  float64_t prev = m_lin_feat.get_element(s, p);
432  //SG_PRINT("elem (%i, %i, %f)\n", s, p, prev) ;
433  if (prev<-1e20 || prev>1e20)
434  {
435  SG_ERROR("initialization missing (%i, %i, %f)\n", s, p, prev) ;
436  prev=0 ;
437  }
438  m_lin_feat.set_element(prev + my_svm_values_unnormalized[s], s, p+1);
439  }
440  SG_FREE(my_svm_values_unnormalized);
441  }
442  //for (int32_t j=0; j<m_num_degrees; j++)
443  // SG_FREE(m_wordstr[0][j]);
444  //SG_FREE(m_wordstr[0]);
445 }
446 
448 {
449  if (!(p.vlen==m_N))
450  SG_ERROR("length of start prob vector p (%i) is not equal to the number of states (%i), N: %i\n",p.vlen, m_N);
451 
453 }
454 
456 {
457  if (!(q.vlen==m_N))
458  SG_ERROR("length of end prob vector q (%i) is not equal to the number of states (%i), N: %i\n",q.vlen, m_N);
460 }
461 
463 {
464  ASSERT(a.num_cols==m_N);
465  ASSERT(a.num_rows==m_N);
466  m_transition_matrix_a.set_array(a.matrix, m_N, m_N, true, true);
468 }
469 
471 {
472  ASSERT(a.num_cols==m_N);
473  ASSERT(a.num_rows==m_N);
475  m_max_a_id = 0;
476  for (int32_t i=0; i<m_N; i++)
477  {
478  for (int32_t j=0; j<m_N; j++)
480  }
481 }
482 
484 {
485  int32_t num_trans=a_trans.num_rows;
486  int32_t num_cols=a_trans.num_cols;
487 
488  //CMath::display_matrix(a_trans.matrix,num_trans, num_cols,"a_trans");
489 
490  if (!((num_cols==3) || (num_cols==4)))
491  SG_ERROR("!((num_cols==3) || (num_cols==4)), num_cols: %i\n",num_cols);
492 
493  SG_FREE(trans_list_forward);
494  SG_FREE(trans_list_forward_cnt);
495  SG_FREE(trans_list_forward_val);
496  SG_FREE(trans_list_forward_id);
497 
498  trans_list_forward = NULL ;
499  trans_list_forward_cnt = NULL ;
500  trans_list_forward_val = NULL ;
501  trans_list_len = 0 ;
502 
505 
506  mem_initialized = true ;
507 
508  trans_list_forward_cnt=NULL ;
509  trans_list_len = m_N ;
510  trans_list_forward = SG_MALLOC(T_STATES*, m_N);
511  trans_list_forward_cnt = SG_MALLOC(T_STATES, m_N);
512  trans_list_forward_val = SG_MALLOC(float64_t*, m_N);
513  trans_list_forward_id = SG_MALLOC(int32_t*, m_N);
514 
515  int32_t start_idx=0;
516  for (int32_t j=0; j<m_N; j++)
517  {
518  int32_t old_start_idx=start_idx;
519 
520  while (start_idx<num_trans && a_trans.matrix[start_idx+num_trans]==j)
521  {
522  start_idx++;
523 
524  if (start_idx>1 && start_idx<num_trans)
525  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans]);
526  }
527 
528  if (start_idx>1 && start_idx<num_trans)
529  ASSERT(a_trans.matrix[start_idx+num_trans-1] <= a_trans.matrix[start_idx+num_trans]);
530 
531  int32_t len=start_idx-old_start_idx;
532  ASSERT(len>=0);
533 
534  trans_list_forward_cnt[j] = 0 ;
535 
536  if (len>0)
537  {
538  trans_list_forward[j] = SG_MALLOC(T_STATES, len);
539  trans_list_forward_val[j] = SG_MALLOC(float64_t, len);
540  trans_list_forward_id[j] = SG_MALLOC(int32_t, len);
541  }
542  else
543  {
544  trans_list_forward[j] = NULL;
545  trans_list_forward_val[j] = NULL;
546  trans_list_forward_id[j] = NULL;
547  }
548  }
549 
550  for (int32_t i=0; i<num_trans; i++)
551  {
552  int32_t from_state = (int32_t)a_trans.matrix[i] ;
553  int32_t to_state = (int32_t)a_trans.matrix[i+num_trans] ;
554  float64_t val = a_trans.matrix[i+num_trans*2] ;
555  int32_t id = 0 ;
556  if (num_cols==4)
557  id = (int32_t)a_trans.matrix[i+num_trans*3] ;
558  //SG_DEBUG( "id=%i\n", id) ;
559 
560  ASSERT(to_state>=0 && to_state<m_N);
561  ASSERT(from_state>=0 && from_state<m_N);
562 
563  trans_list_forward[to_state][trans_list_forward_cnt[to_state]]=from_state ;
564  trans_list_forward_val[to_state][trans_list_forward_cnt[to_state]]=val ;
565  trans_list_forward_id[to_state][trans_list_forward_cnt[to_state]]=id ;
566  trans_list_forward_cnt[to_state]++ ;
567  m_transition_matrix_a.element(from_state, to_state) = val ;
568  m_transition_matrix_a_id.element(from_state, to_state) = id ;
569  //SG_PRINT("from_state:%i to_state:%i trans_matrix_a_id:%i \n",from_state, to_state,m_transition_matrix_a_id.element(from_state, to_state));
570  } ;
571 
572  m_max_a_id = 0 ;
573  for (int32_t i=0; i<m_N; i++)
574  for (int32_t j=0; j<m_N; j++)
575  {
576  //if (m_transition_matrix_a_id.element(i,j))
577  //SG_DEBUG( "(%i,%i)=%i\n", i,j, m_transition_matrix_a_id.element(i,j)) ;
579  }
580  //SG_DEBUG( "m_max_a_id=%i\n", m_max_a_id) ;
581 }
582 
583 
585 {
586  //for (int32_t i=0; i<mod_words_input.num_cols; i++)
587  //{
588  // for (int32_t j=0; j<mod_words_input.num_rows; j++)
589  // SG_PRINT("%i ",mod_words_input[i*mod_words_input.num_rows+j]);
590  // SG_PRINT("\n");
591  //}
592  m_svm_arrays_clean=false ;
593 
594  ASSERT(m_num_svms==mod_words_input.num_rows);
595  ASSERT(mod_words_input.num_cols==2);
596 
597  m_mod_words.set_array(mod_words_input.matrix, mod_words_input.num_rows, 2, true, true) ;
599 
600  /*SG_DEBUG( "m_mod_words=[") ;
601  for (int32_t i=0; i<mod_words_input.num_rows; i++)
602  SG_DEBUG( "%i, ", p_mod_words_array[i]) ;
603  SG_DEBUG( "]\n") ;*/
604 }
605 
607 {
608  //SG_DEBUG( "wd_dim1=%d, m_cum_num_words=%d, m_num_words=%d, m_svm_pos_start=%d, num_uniq_w=%d, mod_words_dims=(%d,%d), sign_w=%d,string_w=%d\n m_num_degrees=%d, m_num_svms=%d, m_num_strings=%d", m_word_degree.get_dim1(), m_cum_num_words.get_dim1(), m_num_words.get_dim1(), m_svm_pos_start.get_dim1(), m_num_unique_words.get_dim1(), m_mod_words.get_dim1(), m_mod_words.get_dim2(), m_sign_words.get_dim1(), m_string_words.get_dim1(), m_num_degrees, m_num_svms, m_num_strings);
612  //(word_used.get_dim1()==m_num_degrees) &&
613  //(word_used.get_dim2()==m_num_words[m_num_degrees-1]) &&
614  //(word_used.get_dim3()==m_num_strings) &&
615  // (svm_values_unnormalized.get_dim1()==m_num_degrees) &&
616  // (svm_values_unnormalized.get_dim2()==m_num_svms) &&
617  //(m_svm_pos_start.get_dim1()==m_num_degrees) &&
620  (m_mod_words.get_dim2()==2) &&
623  {
624  m_svm_arrays_clean=true ;
625  return true ;
626  }
627  else
628  {
631  (m_mod_words.get_dim2()==2) &&
634  SG_PRINT("OK\n") ;
635  else
636  SG_PRINT("not OK\n") ;
637 
639  SG_WARNING("SVM array: word_degree.get_dim1()!=m_num_degrees") ;
641  SG_WARNING("SVM array: m_cum_num_words.get_dim1()!=m_num_degrees+1") ;
643  SG_WARNING("SVM array: m_num_words.get_dim1()==m_num_degrees") ;
644  //if (!(m_svm_pos_start.get_dim1()==m_num_degrees))
645  // SG_WARNING("SVM array: m_svm_pos_start.get_dim1()!=m_num_degrees") ;
647  SG_WARNING("SVM array: m_num_unique_words.get_dim1()!=m_num_degrees") ;
648  if (!(m_mod_words.get_dim1()==m_num_svms))
649  SG_WARNING("SVM array: m_mod_words.get_dim1()!=num_svms") ;
650  if (!(m_mod_words.get_dim2()==2))
651  SG_WARNING("SVM array: m_mod_words.get_dim2()!=2") ;
652  if (!(m_sign_words.get_dim1()==m_num_svms))
653  SG_WARNING("SVM array: m_sign_words.get_dim1()!=num_svms") ;
655  SG_WARNING("SVM array: m_string_words.get_dim1()!=num_svms") ;
656 
657  m_svm_arrays_clean=false ;
658  return false ;
659  }
660 }
661 
663 {
664  if (seq.num_dims!=3)
665  SG_ERROR("Expected 3-dimensional Matrix\n");
666 
667  int32_t N=seq.dims[0];
668  int32_t cand_pos=seq.dims[1];
669  int32_t max_num_features=seq.dims[2];
670 
671  if (!m_svm_arrays_clean)
672  {
673  SG_ERROR( "SVM arrays not clean") ;
674  return ;
675  } ;
676 
677  ASSERT(N==m_N);
678  ASSERT(cand_pos==m_seq_len);
681 
682  m_observation_matrix.set_array(seq.array, N, m_seq_len, max_num_features, true, true) ;
683 }
685 {
686  return m_seq_len;
687 }
688 
690 {
691  ASSERT(seg_path.num_rows==2);
692  ASSERT(seg_path.num_cols==m_seq_len);
693 
694  if (seg_path.matrix!=NULL)
695  {
696  int32_t *segment_ids = SG_MALLOC(int32_t, m_seq_len);
697  float64_t *segment_mask = SG_MALLOC(float64_t, m_seq_len);
698  for (int32_t i=0; i<m_seq_len; i++)
699  {
700  segment_ids[i] = (int32_t)seg_path.matrix[2*i] ;
701  segment_mask[i] = seg_path.matrix[2*i+1] ;
702  }
703  best_path_set_segment_ids_mask(segment_ids, segment_mask, m_seq_len) ;
704  SG_FREE(segment_ids);
705  SG_FREE(segment_mask);
706  }
707  else
708  {
709  int32_t *izeros = SG_MALLOC(int32_t, m_seq_len);
711  for (int32_t i=0; i<m_seq_len; i++)
712  {
713  izeros[i]=0 ;
714  dzeros[i]=0.0 ;
715  }
716  best_path_set_segment_ids_mask(izeros, dzeros, m_seq_len) ;
717  SG_FREE(izeros);
718  SG_FREE(dzeros);
719  }
720 }
721 
723 {
724  m_pos.set_array(pos.vector, pos.vlen, true, true) ;
725  m_seq_len = pos.vlen;
726 }
727 
729 {
730  if (orf_info.num_cols!=2)
731  SG_ERROR( "orf_info size incorrect %i!=2\n", orf_info.num_cols) ;
732 
733  m_orf_info.set_array(orf_info.matrix, orf_info.num_rows, orf_info.num_cols, true, true) ;
734  m_orf_info.set_array_name("orf_info") ;
735 }
736 
738 {
739  if ((!seq_sparse1 && seq_sparse2) || (seq_sparse1 && !seq_sparse2))
740  SG_ERROR("Sparse features must either both be NULL or both NON-NULL\n");
741 
744 
745  m_seq_sparse1=seq_sparse1;
746  m_seq_sparse2=seq_sparse2;
749 }
750 
752 {
754 
755  m_plif_matrices=pm;
756 
758 }
759 
761 {
762  ASSERT(genestr.vector);
763  ASSERT(genestr.vlen>0);
764 
765  m_genestr.set_array(genestr.vector, genestr.vlen, true, true) ;
766 }
767 
768 void CDynProg::set_my_state_seq(int32_t* my_state_seq)
769 {
770  ASSERT(my_state_seq && m_seq_len>0);
772  for (int32_t i=0; i<m_seq_len; i++)
773  m_my_state_seq[i]=my_state_seq[i];
774 }
775 
776 void CDynProg::set_my_pos_seq(int32_t* my_pos_seq)
777 {
778  ASSERT(my_pos_seq && m_seq_len>0);
780  for (int32_t i=0; i<m_seq_len; i++)
781  m_my_pos_seq[i]=my_pos_seq[i];
782 }
783 
785 {
786  if (m_num_svms!=dictionary_weights.num_cols)
787  {
788  SG_ERROR( "m_dict_weights array does not match num_svms=%i!=%i\n",
789  m_num_svms, dictionary_weights.num_cols) ;
790  }
791 
792  m_dict_weights.set_array(dictionary_weights.matrix, dictionary_weights.num_rows, m_num_svms, true, true) ;
793 
794  // initialize, so it does not bother when not used
796  m_segment_loss.zero() ;
799  m_segment_ids.zero() ;
800  m_segment_mask.zero() ;
801 }
802 
804 {
805  int32_t m=segment_loss.num_rows;
806  int32_t n=segment_loss.num_cols;
807  // here we need two matrices. Store it in one: 2N x N
808  if (2*m!=n)
809  SG_ERROR( "segment_loss should be 2 x quadratic matrix: %i!=%i\n", 2*m, n) ;
810 
811  if (m!=m_max_a_id+1)
812  SG_ERROR( "segment_loss size should match m_max_a_id: %i!=%i\n", m, m_max_a_id+1) ;
813 
814  m_segment_loss.set_array(segment_loss.matrix, m, n/2, 2, true, true) ;
815  /*for (int32_t i=0; i<n; i++)
816  for (int32_t j=0; j<n; j++)
817  SG_DEBUG( "loss(%i,%i)=%f\n", i,j, m_segment_loss.element(0,i,j)) ;*/
818 }
819 
821  int32_t* segment_ids, float64_t* segment_mask, int32_t m)
822 {
823 
825  SG_ERROR("size of segment_ids or segment_mask (%i) does not match the size of the feature matrix (%i)", m, m_observation_matrix.get_dim2());
826  int32_t max_id = 0;
827  for (int32_t i=1;i<m;i++)
828  max_id = CMath::max(max_id,segment_ids[i]);
829  //SG_PRINT("max_id: %i, m:%i\n",max_id, m);
830  m_segment_ids.set_array(segment_ids, m, true, true) ;
831  m_segment_ids.set_array_name("m_segment_ids");
832  m_segment_mask.set_array(segment_mask, m, true, true) ;
833  m_segment_mask.set_array_name("m_segment_mask");
834 
838 }
839 
841 {
843  memcpy(scores.vector,m_scores.get_array(), sizeof(float64_t)*(m_scores.get_dim1()));
844 
845  return scores;
846 }
847 
849 {
851 
852  int32_t sz = sizeof(int32_t)*( m_states.get_dim1() * m_states.get_dim2() );
853  memcpy(states.matrix ,m_states.get_array(),sz);
854 
855  return states;
856 }
857 
859 {
861 
862  int32_t sz = sizeof(int32_t)*(m_positions.get_dim1()*m_positions.get_dim2());
863  memcpy(positions.matrix, m_positions.get_array(),sz);
864 
865  return positions;
866 }
867 
868 void CDynProg::get_path_scores(float64_t** scores, int32_t* seq_len)
869 {
870  ASSERT(scores && seq_len);
871 
872  *seq_len=m_my_scores.get_dim1();
873 
874  int32_t sz = sizeof(float64_t)*(*seq_len);
875 
876  *scores = SG_MALLOC(float64_t, *seq_len);
877  ASSERT(*scores);
878 
879  memcpy(*scores,m_my_scores.get_array(),sz);
880 }
881 
882 void CDynProg::get_path_losses(float64_t** losses, int32_t* seq_len)
883 {
884  ASSERT(losses && seq_len);
885 
886  *seq_len=m_my_losses.get_dim1();
887 
888  int32_t sz = sizeof(float64_t)*(*seq_len);
889 
890  *losses = SG_MALLOC(float64_t, *seq_len);
891  ASSERT(*losses);
892 
893  memcpy(*losses,m_my_losses.get_array(),sz);
894 }
895 
897 
899  int32_t orf_from, int32_t orf_to, int32_t start, int32_t &last_pos,
900  int32_t to)
901 {
902 #ifdef DYNPROG_TIMING_DETAIL
903  MyTime.start() ;
904 #endif
905 
906  if (start<0)
907  start=0 ;
908  if (to<0)
909  to=0 ;
910 
911  int32_t orf_target = orf_to-orf_from ;
912  if (orf_target<0) orf_target+=3 ;
913 
914  int32_t pos ;
915  if (last_pos==to)
916  pos = to-orf_to-3 ;
917  else
918  pos=last_pos ;
919 
920  if (pos<0)
921  {
922 #ifdef DYNPROG_TIMING_DETAIL
923  MyTime.stop() ;
924  orf_time += MyTime.time_diff_sec() ;
925 #endif
926  return true ;
927  }
928 
929  for (; pos>=start; pos-=3)
930  if (m_genestr_stop[pos])
931  {
932 #ifdef DYNPROG_TIMING_DETAIL
933  MyTime.stop() ;
934  orf_time += MyTime.time_diff_sec() ;
935 #endif
936  return false ;
937  }
938 
939 
940  last_pos = CMath::min(pos+3,to-orf_to-3) ;
941 
942 #ifdef DYNPROG_TIMING_DETAIL
943  MyTime.stop() ;
944  orf_time += MyTime.time_diff_sec() ;
945 #endif
946  return true ;
947 }
948 
949 void CDynProg::compute_nbest_paths(int32_t max_num_signals, bool use_orf,
950  int16_t nbest, bool with_loss, bool with_multiple_sequences)
951  {
952 
953  //FIXME we need checks here if all the fields are of right size
954  //SG_PRINT("m_seq_len: %i\n", m_seq_len);
955  //SG_PRINT("m_pos[0]: %i\n", m_pos[0]);
956  //SG_PRINT("\n");
957 
958  //FIXME these variables can go away when compute_nbest_paths uses them
959  //instead of the local pointers below
960  const float64_t* seq_array = m_observation_matrix.get_array();
961  m_scores.resize_array(nbest) ;
964 
965  for (int32_t i=0; i<nbest; i++)
966  {
967  m_scores[i]=-1;
968  for (int32_t j=0; j<m_observation_matrix.get_dim2(); j++)
969  {
970  m_states.element(i,j)=-1;
971  m_positions.element(i,j)=-1;
972  }
973  }
974  float64_t* prob_nbest=m_scores.get_array();
975  int32_t* my_state_seq=m_states.get_array();
976  int32_t* my_pos_seq=m_positions.get_array();
977 
978  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
979  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
980  //END FIXME
981 
982 
983 #ifdef DYNPROG_TIMING
984  segment_init_time = 0.0 ;
985  segment_pos_time = 0.0 ;
986  segment_extend_time = 0.0 ;
987  segment_clean_time = 0.0 ;
988  orf_time = 0.0 ;
989  svm_init_time = 0.0 ;
990  svm_pos_time = 0.0 ;
991  svm_clean_time = 0.0 ;
992  inner_loop_time = 0.0 ;
993  content_svm_values_time = 0.0 ;
994  content_plifs_time = 0.0 ;
995  inner_loop_max_time = 0.0 ;
996  long_transition_time = 0.0 ;
997 
998  MyTime2.start() ;
999 #endif
1000 
1001  if (!m_svm_arrays_clean)
1002  {
1003  SG_ERROR( "SVM arrays not clean") ;
1004  return ;
1005  }
1006 
1007 #ifdef DYNPROG_DEBUG
1008  m_transition_matrix_a.set_array_name("transition_matrix");
1013  //SG_PRINT("use_orf = %i\n", use_orf) ;
1014 #endif
1015 
1016  int32_t max_look_back = 1000 ;
1017  bool use_svm = false ;
1018 
1019  SG_DEBUG("m_N:%i, m_seq_len:%i, max_num_signals:%i\n",m_N, m_seq_len, max_num_signals) ;
1020 
1021  //for (int32_t i=0;i<m_N*m_seq_len*max_num_signals;i++)
1022  // SG_PRINT("(%i)%0.2f ",i,seq_array[i]);
1023 
1024  CArray2<CPlifBase*> PEN(Plif_matrix, m_N, m_N, false, false) ;
1025  PEN.set_array_name("PEN");
1026  CArray2<CPlifBase*> PEN_state_signals(Plif_state_signals, m_N, max_num_signals, false, false) ;
1027  PEN_state_signals.set_array_name("state_signals");
1028 
1030  seq.set_array_name("seq") ;
1031  seq.zero() ;
1032 
1033 #ifdef DYNPROG_DEBUG
1034  SG_PRINT("m_num_raw_data: %i\n",m_num_raw_data);
1035  SG_PRINT("m_num_intron_plifs: %i\n", m_num_intron_plifs);
1036  SG_PRINT("m_num_svms: %i\n", m_num_svms);
1037  SG_PRINT("m_num_lin_feat_plifs_cum: ");
1038  for (int i=0; i<=m_num_raw_data; i++)
1040  SG_PRINT("\n");
1041 #endif
1042 
1044  { // initialize svm_svalue
1046  svm_value[s]=0 ;
1047  }
1048 
1049  { // convert seq_input to seq
1050  // this is independent of the svm values
1051 
1052  //CArray3<float64_t> seq_input(seq_array, m_N, m_seq_len, max_num_signals) ;
1053  CArray3<float64_t> *seq_input=NULL ;
1054  if (seq_array!=NULL)
1055  {
1056  //SG_PRINT("using dense seq_array\n") ;
1057 
1058  seq_input=new CArray3<float64_t>(seq_array, m_N, m_seq_len, max_num_signals) ;
1059  seq_input->set_array_name("seq_input") ;
1060  //seq_input.display_array() ;
1061 
1062  ASSERT(m_seq_sparse1==NULL) ;
1063  ASSERT(m_seq_sparse2==NULL) ;
1064  } else
1065  {
1066  SG_PRINT("using sparse seq_array\n") ;
1067 
1068  ASSERT(m_seq_sparse1!=NULL) ;
1069  ASSERT(m_seq_sparse2!=NULL) ;
1070  ASSERT(max_num_signals==2) ;
1071  }
1072 
1073  for (int32_t i=0; i<m_N; i++)
1074  for (int32_t j=0; j<m_seq_len; j++)
1075  seq.element(i,j) = 0 ;
1076 
1077  for (int32_t i=0; i<m_N; i++)
1078  for (int32_t j=0; j<m_seq_len; j++)
1079  for (int32_t k=0; k<max_num_signals; k++)
1080  {
1081  if ((PEN_state_signals.element(i,k)==NULL) && (k==0))
1082  {
1083  // no plif
1084  if (seq_input!=NULL)
1085  seq.element(i,j) = seq_input->element(i,j,k) ;
1086  else
1087  {
1088  if (k==0)
1089  seq.element(i,j) = m_seq_sparse1->get_feature(i,j) ;
1090  if (k==1)
1091  seq.element(i,j) = m_seq_sparse2->get_feature(i,j) ;
1092  }
1093  break ;
1094  }
1095  if (PEN_state_signals.element(i,k)!=NULL)
1096  {
1097  if (seq_input!=NULL)
1098  {
1099  // just one plif
1100  if (CMath::is_finite(seq_input->element(i,j,k)))
1101  seq.element(i,j) += PEN_state_signals.element(i,k)->lookup_penalty(seq_input->element(i,j,k), svm_value) ;
1102  else
1103  // keep infinity values
1104  seq.element(i,j) = seq_input->element(i, j, k) ;
1105  }
1106  else
1107  {
1108  if (k==0)
1109  {
1110  // just one plif
1112  seq.element(i,j) += PEN_state_signals.element(i,k)->lookup_penalty(m_seq_sparse1->get_feature(i,j), svm_value) ;
1113  else
1114  // keep infinity values
1115  seq.element(i,j) = m_seq_sparse1->get_feature(i, j) ;
1116  }
1117  if (k==1)
1118  {
1119  // just one plif
1121  seq.element(i,j) += PEN_state_signals.element(i,k)->lookup_penalty(m_seq_sparse2->get_feature(i,j), svm_value) ;
1122  else
1123  // keep infinity values
1124  seq.element(i,j) = m_seq_sparse2->get_feature(i, j) ;
1125  }
1126  }
1127  }
1128  else
1129  break ;
1130  }
1131  delete seq_input;
1132  SG_FREE(svm_value);
1133  }
1134 
1135  // allow longer transitions than look_back
1136  bool long_transitions = m_long_transitions ;
1137  CArray2<int32_t> long_transition_content_start_position(m_N,m_N) ;
1138  long_transition_content_start_position.set_array_name("long_transition_content_start_position");
1139 #ifdef DYNPROG_DEBUG
1140  CArray2<int32_t> long_transition_content_end_position(m_N,m_N) ;
1141  long_transition_content_end_position.set_array_name("long_transition_content_end_position");
1142 #endif
1143  CArray2<int32_t> long_transition_content_start(m_N,m_N) ;
1144  long_transition_content_start.set_array_name("long_transition_content_start");
1145  CArray2<float64_t> long_transition_content_scores(m_N,m_N) ;
1146  long_transition_content_scores.set_array_name("long_transition_content_scores");
1147 #ifdef DYNPROG_DEBUG
1148  CArray2<float64_t> long_transition_content_scores_pen(m_N,m_N) ;
1149  long_transition_content_scores_pen.set_array_name("long_transition_content_scores_pen");
1150  CArray2<float64_t> long_transition_content_scores_prev(m_N,m_N) ;
1151  long_transition_content_scores_prev.set_array_name("long_transition_content_scores_prev");
1152  CArray2<float64_t> long_transition_content_scores_elem(m_N,m_N) ;
1153  long_transition_content_scores_elem.set_array_name("long_transition_content_scores_elem");
1154 #endif
1155  CArray2<float64_t> long_transition_content_scores_loss(m_N,m_N) ;
1156  long_transition_content_scores_loss.set_array_name("long_transition_content_scores_loss");
1157 
1158  if (nbest!=1)
1159  {
1160  SG_ERROR("Long transitions are not supported for nbest!=1") ;
1161  long_transitions = false ;
1162  }
1163  long_transition_content_scores.set_const(-CMath::INFTY);
1164 #ifdef DYNPROG_DEBUG
1165  long_transition_content_scores_pen.set_const(0) ;
1166  long_transition_content_scores_elem.set_const(0) ;
1167  long_transition_content_scores_prev.set_const(0) ;
1168 #endif
1169  if (with_loss)
1170  long_transition_content_scores_loss.set_const(0) ;
1171  long_transition_content_start.zero() ;
1172  long_transition_content_start_position.zero() ;
1173 #ifdef DYNPROG_DEBUG
1174  long_transition_content_end_position.zero() ;
1175 #endif
1176 
1178  { // initialize svm_svalue
1180  svm_value[s]=0 ;
1181  }
1182 
1183  CArray2<int32_t> look_back(m_N,m_N) ;
1184  look_back.set_array_name("look_back");
1185  //CArray2<int32_t> look_back_orig(m_N,m_N) ;
1186  //look_back.set_array_name("look_back_orig");
1187 
1188 
1189  { // determine maximal length of look-back
1190  for (int32_t i=0; i<m_N; i++)
1191  for (int32_t j=0; j<m_N; j++)
1192  {
1193  look_back.set_element(INT_MAX, i, j) ;
1194  //look_back_orig.set_element(INT_MAX, i, j) ;
1195  }
1196 
1197  for (int32_t j=0; j<m_N; j++)
1198  {
1199  // only consider transitions that are actually allowed
1200  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1201  const T_STATES *elem_list = trans_list_forward[j] ;
1202 
1203  for (int32_t i=0; i<num_elem; i++)
1204  {
1205  T_STATES ii = elem_list[i] ;
1206 
1207  CPlifBase *penij=PEN.element(j, ii) ;
1208  if (penij==NULL)
1209  {
1210  if (long_transitions)
1211  {
1212  look_back.set_element(m_long_transition_threshold, j, ii) ;
1213  //look_back_orig.set_element(m_long_transition_max, j, ii) ;
1214  }
1215  continue ;
1216  }
1217 
1218  /* if the transition is an ORF or we do computation with loss, we have to disable long transitions */
1219  if ((m_orf_info.element(ii,0)!=-1) || (m_orf_info.element(j,1)!=-1) || (!long_transitions))
1220  {
1221  look_back.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1222  //look_back_orig.set_element(CMath::ceil(penij->get_max_value()), j, ii) ;
1223  if (CMath::ceil(penij->get_max_value()) > max_look_back)
1224  {
1225  SG_DEBUG( "%d %d -> value: %f\n", ii,j,penij->get_max_value());
1226  max_look_back = (int32_t) (CMath::ceil(penij->get_max_value()));
1227  }
1228  }
1229  else
1230  {
1231  look_back.set_element(CMath::min( (int32_t)CMath::ceil(penij->get_max_value()), m_long_transition_threshold ), j, ii) ;
1232  //look_back_orig.set_element( (int32_t)CMath::ceil(penij->get_max_value()), j, ii) ;
1233  }
1234 
1235  if (penij->uses_svm_values())
1236  use_svm=true ;
1237  }
1238  }
1239  /* make sure max_look_back is at least as long as a long transition */
1240  if (long_transitions)
1241  max_look_back = CMath::max(m_long_transition_threshold, max_look_back) ;
1242 
1243  /* make sure max_look_back is not longer than the whole string */
1244  max_look_back = CMath::min(m_genestr.get_dim1(), max_look_back) ;
1245 
1246  int32_t num_long_transitions = 0 ;
1247  for (int32_t i=0; i<m_N; i++)
1248  for (int32_t j=0; j<m_N; j++)
1249  {
1250  if (look_back.get_element(i,j)==m_long_transition_threshold)
1251  num_long_transitions++ ;
1252  if (look_back.get_element(i,j)==INT_MAX)
1253  {
1254  if (long_transitions)
1255  {
1256  look_back.set_element(m_long_transition_threshold, i, j) ;
1257  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1258  }
1259  else
1260  {
1261  look_back.set_element(max_look_back, i, j) ;
1262  //look_back_orig.set_element(m_long_transition_max, i, j) ;
1263  }
1264  }
1265  }
1266  SG_DEBUG("Using %i long transitions\n", num_long_transitions) ;
1267  }
1268  //SG_PRINT("max_look_back: %i \n", max_look_back) ;
1269 
1270  //SG_PRINT("use_svm=%i, genestr_len: \n", use_svm, m_genestr.get_dim1()) ;
1271  SG_DEBUG("use_svm=%i\n", use_svm) ;
1272 
1273  SG_DEBUG("maxlook: %d m_N: %d nbest: %d \n", max_look_back, m_N, nbest);
1274  const int32_t look_back_buflen = (max_look_back*m_N+1)*nbest ;
1275  SG_DEBUG("look_back_buflen=%i\n", look_back_buflen) ;
1276  /*const float64_t mem_use = (float64_t)(m_seq_len*m_N*nbest*(sizeof(T_STATES)+sizeof(int16_t)+sizeof(int32_t))+
1277  look_back_buflen*(2*sizeof(float64_t)+sizeof(int32_t))+
1278  m_seq_len*(sizeof(T_STATES)+sizeof(int32_t))+
1279  m_genestr.get_dim1()*sizeof(bool))/(1024*1024);*/
1280 
1281  //bool is_big = (mem_use>200) || (m_seq_len>5000) ;
1282 
1283  /*if (is_big)
1284  {
1285  SG_DEBUG("calling compute_nbest_paths: m_seq_len=%i, m_N=%i, lookback=%i nbest=%i\n",
1286  m_seq_len, m_N, max_look_back, nbest) ;
1287  SG_DEBUG("allocating %1.2fMB of memory\n",
1288  mem_use) ;
1289  }*/
1290  ASSERT(nbest<32000) ;
1291 
1292 
1293 
1294  CArray3<float64_t> delta(m_seq_len, m_N, nbest) ;
1295  delta.set_array_name("delta");
1296  float64_t* delta_array = delta.get_array() ;
1297  //delta.zero() ;
1298 
1299  CArray3<T_STATES> psi(m_seq_len, m_N, nbest) ;
1300  psi.set_array_name("psi");
1301  //psi.zero() ;
1302 
1303  CArray3<int16_t> ktable(m_seq_len, m_N, nbest) ;
1304  ktable.set_array_name("ktable");
1305  //ktable.zero() ;
1306 
1307  CArray3<int32_t> ptable(m_seq_len, m_N, nbest) ;
1308  ptable.set_array_name("ptable");
1309  //ptable.zero() ;
1310 
1311  CArray<float64_t> delta_end(nbest) ;
1312  delta_end.set_array_name("delta_end");
1313  //delta_end.zero() ;
1314 
1315  CArray<T_STATES> path_ends(nbest) ;
1316  path_ends.set_array_name("path_ends");
1317  //path_ends.zero() ;
1318 
1319  CArray<int16_t> ktable_end(nbest) ;
1320  ktable_end.set_array_name("ktable_end");
1321  //ktable_end.zero() ;
1322 
1323  float64_t * fixedtempvv=SG_MALLOC(float64_t, look_back_buflen);
1324  memset(fixedtempvv, 0, look_back_buflen*sizeof(float64_t)) ;
1325  int32_t * fixedtempii=SG_MALLOC(int32_t, look_back_buflen);
1326  memset(fixedtempii, 0, look_back_buflen*sizeof(int32_t)) ;
1327 
1328  CArray<float64_t> oldtempvv(look_back_buflen) ;
1329  oldtempvv.set_array_name("oldtempvv");
1330  CArray<float64_t> oldtempvv2(look_back_buflen) ;
1331  oldtempvv2.set_array_name("oldtempvv2");
1332  //oldtempvv.zero() ;
1333  //oldtempvv.display_size() ;
1334 
1335  CArray<int32_t> oldtempii(look_back_buflen) ;
1336  oldtempii.set_array_name("oldtempii");
1337  CArray<int32_t> oldtempii2(look_back_buflen) ;
1338  oldtempii2.set_array_name("oldtempii2");
1339  //oldtempii.zero() ;
1340 
1341  CArray<T_STATES> state_seq(m_seq_len) ;
1342  state_seq.set_array_name("state_seq");
1343  //state_seq.zero() ;
1344 
1345  CArray<int32_t> pos_seq(m_seq_len) ;
1346  pos_seq.set_array_name("pos_seq");
1347  //pos_seq.zero() ;
1348 
1349 
1350  m_dict_weights.set_array_name("dict_weights") ;
1351  m_word_degree.set_array_name("word_degree") ;
1352  m_cum_num_words.set_array_name("cum_num_words") ;
1353  m_num_words.set_array_name("num_words") ;
1354  //word_used.set_array_name("word_used") ;
1355  //svm_values_unnormalized.set_array_name("svm_values_unnormalized") ;
1356  //m_svm_pos_start.set_array_name("svm_pos_start") ;
1357  m_num_unique_words.set_array_name("num_unique_words") ;
1358 
1359  PEN.set_array_name("PEN") ;
1360  seq.set_array_name("seq") ;
1361 
1362  delta.set_array_name("delta") ;
1363  psi.set_array_name("psi") ;
1364  ktable.set_array_name("ktable") ;
1365  ptable.set_array_name("ptable") ;
1366  delta_end.set_array_name("delta_end") ;
1367  path_ends.set_array_name("path_ends") ;
1368  ktable_end.set_array_name("ktable_end") ;
1369 
1370 #ifdef USE_TMP_ARRAYCLASS
1371  fixedtempvv.set_array_name("fixedtempvv") ;
1372  fixedtempii.set_array_name("fixedtempvv") ;
1373 #endif
1374 
1375  oldtempvv.set_array_name("oldtempvv") ;
1376  oldtempvv2.set_array_name("oldtempvv2") ;
1377  oldtempii.set_array_name("oldtempii") ;
1378  oldtempii2.set_array_name("oldtempii2") ;
1379 
1380 
1382 
1383 #ifdef DYNPROG_DEBUG
1384  state_seq.display_size() ;
1385  pos_seq.display_size() ;
1386 
1391  //word_used.display_size() ;
1392  //svm_values_unnormalized.display_size() ;
1393  //m_svm_pos_start.display_array() ;
1395 
1396  PEN.display_size() ;
1397  PEN_state_signals.display_size() ;
1398  seq.display_size() ;
1400 
1401  //m_genestr_stop.display_size() ;
1402  delta.display_size() ;
1403  psi.display_size() ;
1404  ktable.display_size() ;
1405  ptable.display_size() ;
1406  delta_end.display_size() ;
1407  path_ends.display_size() ;
1408  ktable_end.display_size() ;
1409 
1410 #ifdef USE_TMP_ARRAYCLASS
1411  fixedtempvv.display_size() ;
1412  fixedtempii.display_size() ;
1413 #endif
1414 
1415  //oldtempvv.display_size() ;
1416  //oldtempii.display_size() ;
1417 
1418  state_seq.display_size() ;
1419  pos_seq.display_size() ;
1420 
1421  //seq.zero() ;
1422 
1423 #endif //DYNPROG_DEBUG
1424 
1426 
1427 
1428 
1429  {
1430  for (int32_t s=0; s<m_num_svms; s++)
1432  }
1433 
1434 
1435  //CArray2<int32_t*> trans_matrix_svms(m_N,m_N);
1436  //CArray2<int32_t> trans_matrix_num_svms(m_N,m_N);
1437 
1438  { // initialization
1439 
1440  for (T_STATES i=0; i<m_N; i++)
1441  {
1442  //delta.element(0, i, 0) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1443  delta.element(delta_array, 0, i, 0, m_seq_len, m_N) = get_p(i) + seq.element(i,0) ; // get_p defined in HMM.h to be equiv to initial_state_distribution
1444  psi.element(0,i,0) = 0 ;
1445  if (nbest>1)
1446  ktable.element(0,i,0) = 0 ;
1447  ptable.element(0,i,0) = 0 ;
1448  for (int16_t k=1; k<nbest; k++)
1449  {
1450  int32_t dim1, dim2, dim3 ;
1451  delta.get_array_size(dim1, dim2, dim3) ;
1452  //SG_DEBUG("i=%i, k=%i -- %i, %i, %i\n", i, k, dim1, dim2, dim3) ;
1453  //delta.element(0, i, k) = -CMath::INFTY ;
1454  delta.element(delta_array, 0, i, k, m_seq_len, m_N) = -CMath::INFTY ;
1455  psi.element(0,i,0) = 0 ; // <--- what's this for?
1456  if (nbest>1)
1457  ktable.element(0,i,k) = 0 ;
1458  ptable.element(0,i,k) = 0 ;
1459  }
1460  /*
1461  for (T_STATES j=0; j<m_N; j++)
1462  {
1463  CPlifBase * penalty = PEN.element(i,j) ;
1464  int32_t num_current_svms=0;
1465  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
1466  if (penalty)
1467  {
1468  SG_PRINT("trans %i -> %i \n",i,j);
1469  penalty->get_used_svms(&num_current_svms, svm_ids);
1470  trans_matrix_svms.set_element(svm_ids,i,j);
1471  for (int32_t l=0;l<num_current_svms;l++)
1472  SG_PRINT("svm_ids[%i]: %i \n",l,svm_ids[l]);
1473  trans_matrix_num_svms.set_element(num_current_svms,i,j);
1474  }
1475  }
1476  */
1477 
1478  }
1479  }
1480 
1481  SG_DEBUG("START_RECURSION \n\n");
1482 
1483  // recursion
1484  for (int32_t t=1; t<m_seq_len; t++)
1485  {
1486  //if (is_big && t%(1+(m_seq_len/1000))==1)
1487  // SG_PROGRESS(t, 0, m_seq_len);
1488  //SG_PRINT("%i\n", t) ;
1489 
1490  for (T_STATES j=0; j<m_N; j++)
1491  {
1492  if (seq.element(j,t)<=-1e20)
1493  { // if we cannot observe the symbol here, then we can omit the rest
1494  for (int16_t k=0; k<nbest; k++)
1495  {
1496  delta.element(delta_array, t, j, k, m_seq_len, m_N) = seq.element(j,t) ;
1497  psi.element(t,j,k) = 0 ;
1498  if (nbest>1)
1499  ktable.element(t,j,k) = 0 ;
1500  ptable.element(t,j,k) = 0 ;
1501  }
1502  }
1503  else
1504  {
1505  const T_STATES num_elem = trans_list_forward_cnt[j] ;
1506  const T_STATES *elem_list = trans_list_forward[j] ;
1507  const float64_t *elem_val = trans_list_forward_val[j] ;
1508  const int32_t *elem_id = trans_list_forward_id[j] ;
1509 
1510  int32_t fixed_list_len = 0 ;
1511  float64_t fixedtempvv_ = CMath::INFTY ;
1512  int32_t fixedtempii_ = 0 ;
1513  bool fixedtemplong = false ;
1514 
1515  for (int32_t i=0; i<num_elem; i++)
1516  {
1517  T_STATES ii = elem_list[i] ;
1518 
1519  const CPlifBase * penalty = PEN.element(j,ii) ;
1520 
1521  /*int32_t look_back = max_look_back ;
1522  if (0)
1523  { // find lookback length
1524  CPlifBase *pen = (CPlifBase*) penalty ;
1525  if (pen!=NULL)
1526  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1527  if (look_back>=1e6)
1528  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen) ;
1529  ASSERT(look_back<1e6);
1530  } */
1531 
1532  int32_t look_back_ = look_back.element(j, ii) ;
1533 
1534  int32_t orf_from = m_orf_info.element(ii,0) ;
1535  int32_t orf_to = m_orf_info.element(j,1) ;
1536  if((orf_from!=-1)!=(orf_to!=-1))
1537  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i]) ;
1538  ASSERT((orf_from!=-1)==(orf_to!=-1)) ;
1539 
1540  int32_t orf_target = -1 ;
1541  if (orf_from!=-1)
1542  {
1543  orf_target=orf_to-orf_from ;
1544  if (orf_target<0)
1545  orf_target+=3 ;
1546  ASSERT(orf_target>=0 && orf_target<3) ;
1547  }
1548 
1549  int32_t orf_last_pos = m_pos[t] ;
1550 #ifdef DYNPROG_TIMING
1551  MyTime3.start() ;
1552 #endif
1553  int32_t num_ok_pos = 0 ;
1554 
1555  for (int32_t ts=t-1; ts>=0 && m_pos[t]-m_pos[ts]<=look_back_; ts--)
1556  {
1557  bool ok ;
1558  //int32_t plen=t-ts;
1559 
1560  /*for (int32_t s=0; s<m_num_svms; s++)
1561  if ((fabs(svs.svm_values[s*svs.seqlen+plen]-svs2.svm_values[s*svs.seqlen+plen])>1e-6) ||
1562  (fabs(svs.svm_values[s*svs.seqlen+plen]-svs3.svm_values[s*svs.seqlen+plen])>1e-6))
1563  {
1564  SG_DEBUG( "s=%i, t=%i, ts=%i, %1.5e, %1.5e, %1.5e\n", s, t, ts, svs.svm_values[s*svs.seqlen+plen], svs2.svm_values[s*svs.seqlen+plen], svs3.svm_values[s*svs.seqlen+plen]);
1565  }*/
1566 
1567  if (orf_target==-1)
1568  ok=true ;
1569  else if (m_pos[ts]!=-1 && (m_pos[t]-m_pos[ts])%3==orf_target)
1570  ok=(!use_orf) || extend_orf(orf_from, orf_to, m_pos[ts], orf_last_pos, m_pos[t]) ;
1571  else
1572  ok=false ;
1573 
1574  if (ok)
1575  {
1576 
1577  float64_t segment_loss = 0.0 ;
1578  if (with_loss)
1579  {
1580  segment_loss = m_seg_loss_obj->get_segment_loss(ts, t, elem_id[i]);
1581  //if (segment_loss!=segment_loss2)
1582  //SG_PRINT("segment_loss:%f segment_loss2:%f\n", segment_loss, segment_loss2);
1583  }
1585  // BEST_PATH_TRANS
1587 
1588  int32_t frame = orf_from;//m_orf_info.element(ii,0);
1589  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1590 
1591  float64_t pen_val = 0.0 ;
1592  if (penalty)
1593  {
1594 #ifdef DYNPROG_TIMING_DETAIL
1595  MyTime.start() ;
1596 #endif
1597  pen_val = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1598 
1599 #ifdef DYNPROG_TIMING_DETAIL
1600  MyTime.stop() ;
1601  content_plifs_time += MyTime.time_diff_sec() ;
1602 #endif
1603  }
1604 
1605 #ifdef DYNPROG_TIMING_DETAIL
1606  MyTime.start() ;
1607 #endif
1608  num_ok_pos++ ;
1609 
1610  if (nbest==1)
1611  {
1612  float64_t val = elem_val[i] + pen_val ;
1613  if (with_loss)
1614  val += segment_loss ;
1615 
1616  float64_t mval = -(val + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N)) ;
1617 
1618  if (mval<fixedtempvv_)
1619  {
1620  fixedtempvv_ = mval ;
1621  fixedtempii_ = ii + ts*m_N;
1622  fixed_list_len = 1 ;
1623  fixedtemplong = false ;
1624  }
1625  }
1626  else
1627  {
1628  for (int16_t diff=0; diff<nbest; diff++)
1629  {
1630  float64_t val = elem_val[i] ;
1631  val += pen_val ;
1632  if (with_loss)
1633  val += segment_loss ;
1634 
1635  float64_t mval = -(val + delta.element(delta_array, ts, ii, diff, m_seq_len, m_N)) ;
1636 
1637  /* only place -val in fixedtempvv if it is one of the nbest lowest values in there */
1638  /* fixedtempvv[i], i=0:nbest-1, is sorted so that fixedtempvv[0] <= fixedtempvv[1] <= ...*/
1639  /* fixed_list_len has the number of elements in fixedtempvv */
1640 
1641  if ((fixed_list_len < nbest) || ((0==fixed_list_len) || (mval < fixedtempvv[fixed_list_len-1])))
1642  {
1643  if ( (fixed_list_len<nbest) && ((0==fixed_list_len) || (mval>fixedtempvv[fixed_list_len-1])) )
1644  {
1645  fixedtempvv[fixed_list_len] = mval ;
1646  fixedtempii[fixed_list_len] = ii + diff*m_N + ts*m_N*nbest;
1647  fixed_list_len++ ;
1648  }
1649  else // must have mval < fixedtempvv[fixed_list_len-1]
1650  {
1651  int32_t addhere = fixed_list_len;
1652  while ((addhere > 0) && (mval < fixedtempvv[addhere-1]))
1653  addhere--;
1654 
1655  // move everything from addhere+1 one forward
1656  for (int32_t jj=fixed_list_len-1; jj>addhere; jj--)
1657  {
1658  fixedtempvv[jj] = fixedtempvv[jj-1];
1659  fixedtempii[jj] = fixedtempii[jj-1];
1660  }
1661 
1662  fixedtempvv[addhere] = mval;
1663  fixedtempii[addhere] = ii + diff*m_N + ts*m_N*nbest;
1664 
1665  if (fixed_list_len < nbest)
1666  fixed_list_len++;
1667  }
1668  }
1669  }
1670  }
1671 #ifdef DYNPROG_TIMING_DETAIL
1672  MyTime.stop() ;
1673  inner_loop_max_time += MyTime.time_diff_sec() ;
1674 #endif
1675  }
1676  }
1677 #ifdef DYNPROG_TIMING
1678  MyTime3.stop() ;
1679  inner_loop_time += MyTime3.time_diff_sec() ;
1680 #endif
1681  }
1682  for (int32_t i=0; i<num_elem; i++)
1683  {
1684  T_STATES ii = elem_list[i] ;
1685 
1686  const CPlifBase * penalty = PEN.element(j,ii) ;
1687 
1688  /*int32_t look_back = max_look_back ;
1689  if (0)
1690  { // find lookback length
1691  CPlifBase *pen = (CPlifBase*) penalty ;
1692  if (pen!=NULL)
1693  look_back=(int32_t) (CMath::ceil(pen->get_max_value()));
1694  if (look_back>=1e6)
1695  SG_PRINT("%i,%i -> %d from %ld\n", j, ii, look_back, (long)pen) ;
1696  ASSERT(look_back<1e6);
1697  } */
1698 
1699  int32_t look_back_ = look_back.element(j, ii) ;
1700  //int32_t look_back_orig_ = look_back_orig.element(j, ii) ;
1701 
1702  int32_t orf_from = m_orf_info.element(ii,0) ;
1703  int32_t orf_to = m_orf_info.element(j,1) ;
1704  if((orf_from!=-1)!=(orf_to!=-1))
1705  SG_DEBUG("j=%i ii=%i orf_from=%i orf_to=%i p=%1.2f\n", j, ii, orf_from, orf_to, elem_val[i]) ;
1706  ASSERT((orf_from!=-1)==(orf_to!=-1)) ;
1707 
1708  int32_t orf_target = -1 ;
1709  if (orf_from!=-1)
1710  {
1711  orf_target=orf_to-orf_from ;
1712  if (orf_target<0)
1713  orf_target+=3 ;
1714  ASSERT(orf_target>=0 && orf_target<3) ;
1715  }
1716 
1717  //int32_t loss_last_pos = t ;
1718  //float64_t last_loss = 0.0 ;
1719 
1720 #ifdef DYNPROG_TIMING
1721  MyTime3.start() ;
1722 #endif
1723 
1724  /* long transition stuff */
1725  /* only do this, if
1726  * this feature is enabled
1727  * this is not a transition with ORF restrictions
1728  * the loss is switched off
1729  * nbest=1
1730  */
1731 #ifdef DYNPROG_TIMING
1732  MyTime3.start() ;
1733 #endif
1734  // long transitions, only when not considering ORFs
1735  if ( long_transitions && orf_target==-1 && look_back_ == m_long_transition_threshold )
1736  {
1737 
1738  // update table for 5' part of the long segment
1739 
1740  int32_t start = long_transition_content_start.get_element(ii, j) ;
1741  int32_t end_5p_part = start ;
1742  for (int32_t start_5p_part=start; m_pos[t]-m_pos[start_5p_part] > m_long_transition_threshold ; start_5p_part++)
1743  {
1744  // find end_5p_part, which is greater than start_5p_part and at least m_long_transition_threshold away
1745  while (end_5p_part<=t && m_pos[end_5p_part+1]-m_pos[start_5p_part]<=m_long_transition_threshold)
1746  end_5p_part++ ;
1747 
1748  ASSERT(m_pos[end_5p_part+1]-m_pos[start_5p_part] > m_long_transition_threshold || end_5p_part==t) ;
1749  ASSERT(m_pos[end_5p_part]-m_pos[start_5p_part] <= m_long_transition_threshold) ;
1750 
1751  float64_t pen_val = 0.0;
1752  /* recompute penalty, if necessary */
1753  if (penalty)
1754  {
1755  int32_t frame = m_orf_info.element(ii,0);
1756  lookup_content_svm_values(start_5p_part, end_5p_part, m_pos[start_5p_part], m_pos[end_5p_part], svm_value, frame); // * t -> end_5p_part
1757  pen_val = penalty->lookup_penalty(m_pos[end_5p_part]-m_pos[start_5p_part], svm_value) ;
1758  }
1759 
1760  /*if (m_pos[start_5p_part]==1003)
1761  {
1762  SG_PRINT("Part1: %i - %i vs %i - %i\n", m_pos[t], m_pos[ts], m_pos[end_5p_part], m_pos[start_5p_part]) ;
1763  SG_PRINT("Part1: ts=%i t=%i start_5p_part=%i m_seq_len=%i\n", m_pos[ts], m_pos[t], m_pos[start_5p_part], m_seq_len) ;
1764  }*/
1765 
1766  float64_t mval_trans = -( elem_val[i] + pen_val*0.5 + delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N) ) ;
1767  //float64_t mval_trans = -( elem_val[i] + delta.element(delta_array, ts, ii, 0, m_seq_len, m_N) ) ; // enable this for the incomplete extra check
1768 
1769  float64_t segment_loss_part1=0.0 ;
1770  if (with_loss)
1771  { // this is the loss from the start of the long segment (5' part + middle section)
1772 
1773  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(start_5p_part /*long_transition_content_start_position.get_element(ii,j)*/, end_5p_part, elem_id[i]); // * unsure
1774 
1775  mval_trans -= segment_loss_part1 ;
1776  }
1777 
1778 
1779  if (0)//m_pos[end_5p_part] - m_pos[long_transition_content_start_position.get_element(ii, j)] > look_back_orig_/*m_long_transition_max*/)
1780  {
1781  // this restricts the maximal length of segments,
1782  // but the current implementation is not valid since the
1783  // long transition is discarded without loocking if there
1784  // is a second best long transition in between
1785  long_transition_content_scores.set_element(-CMath::INFTY, ii, j) ;
1786  long_transition_content_start_position.set_element(0, ii, j) ;
1787  if (with_loss)
1788  long_transition_content_scores_loss.set_element(0.0, ii, j) ;
1789 #ifdef DYNPROG_DEBUG
1790  long_transition_content_scores_pen.set_element(0.0, ii, j) ;
1791  long_transition_content_scores_elem.set_element(0.0, ii, j) ;
1792  long_transition_content_scores_prev.set_element(0.0, ii, j) ;
1793  long_transition_content_end_position.set_element(0, ii, j) ;
1794 #endif
1795  }
1796  if (with_loss)
1797  {
1798  float64_t old_loss = long_transition_content_scores_loss.get_element(ii, j) ;
1799  float64_t new_loss = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), end_5p_part, elem_id[i]);
1800  float64_t score = long_transition_content_scores.get_element(ii, j) - old_loss + new_loss ;
1801  long_transition_content_scores.set_element(score, ii, j) ;
1802  long_transition_content_scores_loss.set_element(new_loss, ii, j) ;
1803 #ifdef DYNPROG_DEBUG
1804  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1805 #endif
1806 
1807  }
1808  if (-long_transition_content_scores.get_element(ii, j) > mval_trans )
1809  {
1810  /* then the old long transition is either too far away or worse than the current one */
1811  long_transition_content_scores.set_element(-mval_trans, ii, j) ;
1812  long_transition_content_start_position.set_element(start_5p_part, ii, j) ;
1813  if (with_loss)
1814  long_transition_content_scores_loss.set_element(segment_loss_part1, ii, j) ;
1815 #ifdef DYNPROG_DEBUG
1816  long_transition_content_scores_pen.set_element(pen_val*0.5, ii, j) ;
1817  long_transition_content_scores_elem.set_element(elem_val[i], ii, j) ;
1818  long_transition_content_scores_prev.set_element(delta.element(delta_array, start_5p_part, ii, 0, m_seq_len, m_N), ii, j) ;
1819  /*ASSERT(fabs(long_transition_content_scores.get_element(ii, j)-(long_transition_content_scores_pen.get_element(ii, j) +
1820  long_transition_content_scores_elem.get_element(ii, j) +
1821  long_transition_content_scores_prev.get_element(ii, j)))<1e-6) ;*/
1822  long_transition_content_end_position.set_element(end_5p_part, ii, j) ;
1823 #endif
1824  }
1825  //
1826  // this sets the position where the search for better 5'parts is started the next time
1827  // whithout this the prediction takes ages
1828  //
1829  long_transition_content_start.set_element(start_5p_part, ii, j) ;
1830  }
1831 
1832  // consider the 3' part at the end of the long segment:
1833  // * with length = m_long_transition_threshold
1834  // * content prediction and loss only for this part
1835 
1836  // find ts > 0 with distance from m_pos[t] greater m_long_transition_threshold
1837  // precompute: only depends on t
1838  int ts = t;
1839  while (ts>0 && m_pos[t]-m_pos[ts-1] <= m_long_transition_threshold)
1840  ts-- ;
1841 
1842  if (ts>0)
1843  {
1844  ASSERT((m_pos[t]-m_pos[ts-1] > m_long_transition_threshold) && (m_pos[t]-m_pos[ts] <= m_long_transition_threshold)) ;
1845 
1846 
1847  /* only consider this transition, if the right position was found */
1848  float pen_val_3p = 0.0 ;
1849  if (penalty)
1850  {
1851  int32_t frame = orf_from ; //m_orf_info.element(ii, 0);
1852  lookup_content_svm_values(ts, t, m_pos[ts], m_pos[t], svm_value, frame);
1853  pen_val_3p = penalty->lookup_penalty(m_pos[t]-m_pos[ts], svm_value) ;
1854  }
1855 
1856  float64_t mval = -(long_transition_content_scores.get_element(ii, j) + pen_val_3p*0.5) ;
1857 
1858  {
1859 #ifdef DYNPROG_DEBUG
1860  float64_t segment_loss_part2=0.0 ;
1861  float64_t segment_loss_part1=0.0 ;
1862 #endif
1863  float64_t segment_loss_total=0.0 ;
1864 
1865  if (with_loss)
1866  { // this is the loss for the 3' end fragment of the segment
1867  // (the 5' end and the middle section loss is already contained in mval)
1868 
1869 #ifdef DYNPROG_DEBUG
1870  // this is an alternative, which should be identical, if the loss is additive
1871  segment_loss_part2 = m_seg_loss_obj->get_segment_loss_extend(long_transition_content_end_position.get_element(ii,j), t, elem_id[i]);
1872  //mval -= segment_loss_part2 ;
1873  segment_loss_part1 = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), long_transition_content_end_position.get_element(ii,j), elem_id[i]);
1874 #endif
1875  segment_loss_total = m_seg_loss_obj->get_segment_loss(long_transition_content_start_position.get_element(ii,j), t, elem_id[i]);
1876  mval -= (segment_loss_total-long_transition_content_scores_loss.get_element(ii, j)) ;
1877  }
1878 
1879 #ifdef DYNPROG_DEBUG
1880  if (m_pos[t]==10108 ||m_pos[t]==12802 ||m_pos[t]== 12561)
1881  {
1882  SG_PRINT("Part2: %i,%i,%i: val=%1.6f pen_val_3p*0.5=%1.6f (t=%i, ts=%i, ts-1=%i, ts+1=%i); scores=%1.6f (pen=%1.6f,prev=%1.6f,elem=%1.6f,loss=%1.1f), positions=%i,%i,%i, loss=%1.1f/%1.1f (%i,%i)\n",
1883  m_pos[t], j, ii, -mval, 0.5*pen_val_3p, m_pos[t], m_pos[ts], m_pos[ts-1], m_pos[ts+1],
1884  long_transition_content_scores.get_element(ii, j),
1885  long_transition_content_scores_pen.get_element(ii, j),
1886  long_transition_content_scores_prev.get_element(ii, j),
1887  long_transition_content_scores_elem.get_element(ii, j),
1888  long_transition_content_scores_loss.get_element(ii, j),
1889  m_pos[long_transition_content_start_position.get_element(ii,j)],
1890  m_pos[long_transition_content_end_position.get_element(ii,j)],
1891  m_pos[long_transition_content_start.get_element(ii,j)], segment_loss_part2, segment_loss_total, long_transition_content_start_position.get_element(ii,j), t) ;
1892  SG_PRINT("fixedtempvv_: %1.6f, from_state:%i from_pos:%i\n ",-fixedtempvv_, (fixedtempii_%m_N), m_pos[(fixedtempii_-(fixedtempii_%(m_N*nbest)))/(m_N*nbest)] );
1893  }
1894 
1895  if (fabs(segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total)>1e-3)
1896  {
1897  SG_ERROR("LOSS: total=%1.1f (%i-%i) part1=%1.1f/%1.1f (%i-%i) part2=%1.1f (%i-%i) sum=%1.1f diff=%1.1f\n",
1898  segment_loss_total, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[t],
1899  long_transition_content_scores_loss.get_element(ii, j), segment_loss_part1, m_pos[long_transition_content_start_position.get_element(ii,j)], m_pos[long_transition_content_end_position.get_element(ii,j)],
1900  segment_loss_part2, m_pos[long_transition_content_end_position.get_element(ii,j)], m_pos[t],
1901  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j),
1902  segment_loss_part2+long_transition_content_scores_loss.get_element(ii, j) - segment_loss_total) ;
1903  }
1904 #endif
1905  }
1906 
1907  // prefer simpler version to guarantee optimality
1908  //
1909  // original:
1910  /* if ((mval < fixedtempvv_) &&
1911  (m_pos[t] - m_pos[long_transition_content_start_position.get_element(ii, j)])<=look_back_orig_) */
1912  if (mval < fixedtempvv_)
1913  {
1914  /* then the long transition is better than the short one => replace it */
1915  int32_t fromtjk = fixedtempii_ ;
1916  /*SG_PRINT("%i,%i: Long transition (%1.5f=-(%1.5f+%1.5f+%1.5f+%1.5f), %i) to m_pos %i better than short transition (%1.5f,%i) to m_pos %i \n",
1917  m_pos[t], j,
1918  mval, pen_val_3p*0.5, long_transition_content_scores_pen.get_element(ii, j), long_transition_content_scores_elem.get_element(ii, j), long_transition_content_scores_prev.get_element(ii, j), ii,
1919  m_pos[long_transition_content_position.get_element(ii, j)],
1920  fixedtempvv_, (fromtjk%m_N), m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]) ;*/
1921  ASSERT((fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)==0 || m_pos[(fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest)]>=m_pos[long_transition_content_start_position.get_element(ii, j)] || fixedtemplong) ;
1922 
1923  fixedtempvv_ = mval ;
1924  fixedtempii_ = ii + m_N*long_transition_content_start_position.get_element(ii, j) ;
1925  fixed_list_len = 1 ;
1926  fixedtemplong = true ;
1927  }
1928  }
1929  }
1930  }
1931 #ifdef DYNPROG_TIMING
1932  MyTime3.stop() ;
1933  long_transition_time += MyTime3.time_diff_sec() ;
1934 #endif
1935 
1936 
1937  int32_t numEnt = fixed_list_len;
1938 
1939  float64_t minusscore;
1940  int64_t fromtjk;
1941 
1942  for (int16_t k=0; k<nbest; k++)
1943  {
1944  if (k<numEnt)
1945  {
1946  if (nbest==1)
1947  {
1948  minusscore = fixedtempvv_ ;
1949  fromtjk = fixedtempii_ ;
1950  }
1951  else
1952  {
1953  minusscore = fixedtempvv[k];
1954  fromtjk = fixedtempii[k];
1955  }
1956 
1957  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -minusscore + seq.element(j,t);
1958  psi.element(t,j,k) = (fromtjk%m_N) ;
1959  if (nbest>1)
1960  ktable.element(t,j,k) = (fromtjk%(m_N*nbest)-psi.element(t,j,k))/m_N ;
1961  ptable.element(t,j,k) = (fromtjk-(fromtjk%(m_N*nbest)))/(m_N*nbest) ;
1962  }
1963  else
1964  {
1965  delta.element(delta_array, t, j, k, m_seq_len, m_N) = -CMath::INFTY ;
1966  psi.element(t,j,k) = 0 ;
1967  if (nbest>1)
1968  ktable.element(t,j,k) = 0 ;
1969  ptable.element(t,j,k) = 0 ;
1970  }
1971  }
1972  }
1973  }
1974  }
1975  { //termination
1976  int32_t list_len = 0 ;
1977  for (int16_t diff=0; diff<nbest; diff++)
1978  {
1979  for (T_STATES i=0; i<m_N; i++)
1980  {
1981  oldtempvv[list_len] = -(delta.element(delta_array, (m_seq_len-1), i, diff, m_seq_len, m_N)+get_q(i)) ;
1982  oldtempii[list_len] = i + diff*m_N ;
1983  list_len++ ;
1984  }
1985  }
1986 
1987  CMath::nmin(oldtempvv.get_array(), oldtempii.get_array(), list_len, nbest) ;
1988 
1989  for (int16_t k=0; k<nbest; k++)
1990  {
1991  delta_end.element(k) = -oldtempvv[k] ;
1992  path_ends.element(k) = (oldtempii[k]%m_N) ;
1993  if (nbest>1)
1994  ktable_end.element(k) = (oldtempii[k]-path_ends.element(k))/m_N ;
1995  }
1996 
1997 
1998  }
1999 
2000  { //state sequence backtracking
2001  for (int16_t k=0; k<nbest; k++)
2002  {
2003  prob_nbest[k]= delta_end.element(k) ;
2004 
2005  int32_t i = 0 ;
2006  state_seq[i] = path_ends.element(k) ;
2007  int16_t q = 0 ;
2008  if (nbest>1)
2009  q=ktable_end.element(k) ;
2010  pos_seq[i] = m_seq_len-1 ;
2011 
2012  while (pos_seq[i]>0)
2013  {
2014  ASSERT(i+1<m_seq_len);
2015  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q) ;
2016  state_seq[i+1] = psi.element(pos_seq[i], state_seq[i], q);
2017  pos_seq[i+1] = ptable.element(pos_seq[i], state_seq[i], q) ;
2018  if (nbest>1)
2019  q = ktable.element(pos_seq[i], state_seq[i], q) ;
2020  i++ ;
2021  }
2022  //SG_DEBUG("s=%i p=%i q=%i\n", state_seq[i], pos_seq[i], q) ;
2023  int32_t num_states = i+1 ;
2024  for (i=0; i<num_states;i++)
2025  {
2026  my_state_seq[i+k*m_seq_len] = state_seq[num_states-i-1] ;
2027  my_pos_seq[i+k*m_seq_len] = pos_seq[num_states-i-1] ;
2028  }
2029  if (num_states<m_seq_len)
2030  {
2031  my_state_seq[num_states+k*m_seq_len]=-1 ;
2032  my_pos_seq[num_states+k*m_seq_len]=-1 ;
2033  }
2034  }
2035  }
2036 
2037  //if (is_big)
2038  // SG_PRINT( "DONE. \n") ;
2039 
2040 
2041 #ifdef DYNPROG_TIMING
2042  MyTime2.stop() ;
2043 
2044  //if (is_big)
2045  SG_PRINT("Timing: orf=%1.2f s \n Segment_init=%1.2f s Segment_pos=%1.2f s Segment_extend=%1.2f s Segment_clean=%1.2f s\nsvm_init=%1.2f s svm_pos=%1.2f svm_clean=%1.2f\n content_svm_values_time=%1.2f content_plifs_time=%1.2f\ninner_loop_max_time=%1.2f inner_loop=%1.2f long_transition_time=%1.2f\n total=%1.2f\n", orf_time, segment_init_time, segment_pos_time, segment_extend_time, segment_clean_time, svm_init_time, svm_pos_time, svm_clean_time, content_svm_values_time, content_plifs_time, inner_loop_max_time, inner_loop_time, long_transition_time, MyTime2.time_diff_sec()) ;
2046 #endif
2047 
2048  SG_FREE(fixedtempvv);
2049  SG_FREE(fixedtempii);
2050  }
2051 
2052 
2054  int32_t *my_state_seq, int32_t *my_pos_seq,
2055  int32_t my_seq_len, const float64_t *seq_array, int32_t max_num_signals)
2056 {
2060  //m_my_scores.resize_array(m_my_state_seq.get_array_size()) ;
2061  //m_my_losses.resize_array(m_my_state_seq.get_array_size()) ;
2062  m_my_scores.resize_array(my_seq_len);
2063  m_my_losses.resize_array(my_seq_len);
2064  float64_t* my_scores=m_my_scores.get_array();
2065  float64_t* my_losses=m_my_losses.get_array();
2066  CPlifBase** Plif_matrix=m_plif_matrices->get_plif_matrix();
2067  CPlifBase** Plif_state_signals=m_plif_matrices->get_state_signals();
2068 
2069  if (!m_svm_arrays_clean)
2070  {
2071  SG_ERROR( "SVM arrays not clean") ;
2072  return ;
2073  } ;
2074  //SG_PRINT( "genestr_len=%i, genestr_num=%i\n", genestr_len, genestr_num) ;
2075  //m_mod_words.display() ;
2076  //m_sign_words.display() ;
2077  //m_string_words.display() ;
2078 
2079  bool use_svm = false ;
2080 
2081  CArray2<CPlifBase*> PEN(Plif_matrix, m_N, m_N, false, false) ;
2082  PEN.set_array_name("PEN");
2083  CArray2<CPlifBase*> PEN_state_signals(Plif_state_signals, m_N, max_num_signals, false, false) ;
2084  PEN_state_signals.set_array_name("PEN_state_signals");
2085  CArray3<float64_t> seq_input(seq_array, m_N, m_seq_len, max_num_signals) ;
2086  seq_input.set_array_name("seq_input");
2087 
2088  { // determine whether to use svm outputs and clear derivatives
2089  for (int32_t i=0; i<m_N; i++)
2090  for (int32_t j=0; j<m_N; j++)
2091  {
2092  CPlifBase *penij=PEN.element(i,j) ;
2093  if (penij==NULL)
2094  continue ;
2095 
2096  if (penij->uses_svm_values())
2097  use_svm=true ;
2098  penij->penalty_clear_derivative() ;
2099  }
2100  for (int32_t i=0; i<m_N; i++)
2101  for (int32_t j=0; j<max_num_signals; j++)
2102  {
2103  CPlifBase *penij=PEN_state_signals.element(i,j) ;
2104  if (penij==NULL)
2105  continue ;
2106  if (penij->uses_svm_values())
2107  use_svm=true ;
2108  penij->penalty_clear_derivative() ;
2109  }
2110  }
2111 
2112  { // set derivatives of p, q and a to zero
2113 
2114  for (int32_t i=0; i<m_N; i++)
2115  {
2118  for (int32_t j=0; j<m_N; j++)
2120  }
2121  }
2122 
2123  { // clear score vector
2124  for (int32_t i=0; i<my_seq_len; i++)
2125  {
2126  my_scores[i]=0.0 ;
2127  my_losses[i]=0.0 ;
2128  }
2129  }
2130 
2131  //int32_t total_len = 0 ;
2132 
2133  //m_transition_matrix_a.display_array() ;
2134  //m_transition_matrix_a_id.display_array() ;
2135 
2136  // compute derivatives for given path
2141  {
2142  svm_value[s]=0 ;
2143  svm_value_part1[s]=0 ;
2144  svm_value_part2[s]=0 ;
2145  }
2146 
2147  //#ifdef DYNPROG_DEBUG
2148  float64_t total_score = 0.0 ;
2149  float64_t total_loss = 0.0 ;
2150  //#endif
2151 
2152  ASSERT(my_state_seq[0]>=0) ;
2153  m_initial_state_distribution_p_deriv.element(my_state_seq[0])++ ;
2154  my_scores[0] += m_initial_state_distribution_p.element(my_state_seq[0]) ;
2155 
2156  ASSERT(my_state_seq[my_seq_len-1]>=0) ;
2157  m_end_state_distribution_q_deriv.element(my_state_seq[my_seq_len-1])++ ;
2158  my_scores[my_seq_len-1] += m_end_state_distribution_q.element(my_state_seq[my_seq_len-1]);
2159 
2160  //#ifdef DYNPROG_DEBUG
2161  total_score += my_scores[0] + my_scores[my_seq_len-1] ;
2162  //#endif
2163 
2164  SG_DEBUG( "m_seq_len=%i\n", my_seq_len) ;
2165  for (int32_t i=0; i<my_seq_len-1; i++)
2166  {
2167  if (my_state_seq[i+1]==-1)
2168  break ;
2169  int32_t from_state = my_state_seq[i] ;
2170  int32_t to_state = my_state_seq[i+1] ;
2171  int32_t from_pos = my_pos_seq[i] ;
2172  int32_t to_pos = my_pos_seq[i+1] ;
2173 
2174  int32_t elem_id = m_transition_matrix_a_id.element(from_state, to_state) ;
2175  my_losses[i] = m_seg_loss_obj->get_segment_loss(from_pos, to_pos, elem_id);
2176 
2177 #ifdef DYNPROG_DEBUG
2178 
2179 
2180  if (i>0)// test if segment loss is additive
2181  {
2182  float32_t loss1 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i], elem_id);
2183  float32_t loss2 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i], my_pos_seq[i+1], elem_id);
2184  float32_t loss3 = m_seg_loss_obj->get_segment_loss(my_pos_seq[i-1], my_pos_seq[i+1], elem_id);
2185  SG_PRINT("loss1:%f loss2:%f loss3:%f, diff:%f\n", loss1, loss2, loss3, loss1+loss2-loss3);
2186  if (CMath::abs(loss1+loss2-loss3)>0)
2187  {
2188  SG_PRINT( "%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state) ;
2189  }
2190  }
2191  io->set_loglevel(M_DEBUG) ;
2192  SG_DEBUG( "%i. segment loss %f (id=%i): from=%i(%i), to=%i(%i)\n", i, my_losses[i], elem_id, from_pos, from_state, to_pos, to_state) ;
2193 #endif
2194  // increase usage of this transition
2195  m_transition_matrix_a_deriv.element(from_state, to_state)++ ;
2196  my_scores[i] += m_transition_matrix_a.element(from_state, to_state) ;
2197  //SG_PRINT("m_transition_matrix_a.element(%i, %i),%f \n",from_state, to_state, m_transition_matrix_a.element(from_state, to_state));
2198 #ifdef DYNPROG_DEBUG
2199  SG_DEBUG( "%i. scores[i]=%f\n", i, my_scores[i]) ;
2200 #endif
2201 
2202  /*int32_t last_svm_pos[m_num_degrees] ;
2203  for (int32_t qq=0; qq<m_num_degrees; qq++)
2204  last_svm_pos[qq]=-1 ;*/
2205 
2206  bool is_long_transition = false ;
2207  if (m_long_transitions)
2208  {
2209  if (m_pos[to_pos]-m_pos[from_pos]>m_long_transition_threshold)
2210  is_long_transition = true ;
2211  if (m_orf_info.element(from_state,0)!=-1)
2212  is_long_transition = false ;
2213  }
2214 
2215  int32_t from_pos_thresh = from_pos ;
2216  int32_t to_pos_thresh = to_pos ;
2217 
2218  if (use_svm)
2219  {
2220  if (is_long_transition)
2221  {
2222 
2223  while (from_pos_thresh<to_pos && m_pos[from_pos_thresh+1] - m_pos[from_pos] <= m_long_transition_threshold) // *
2224  from_pos_thresh++ ;
2225  ASSERT(from_pos_thresh<to_pos) ;
2226  ASSERT(m_pos[from_pos_thresh] - m_pos[from_pos] <= m_long_transition_threshold); // *
2227  ASSERT(m_pos[from_pos_thresh+1] - m_pos[from_pos] > m_long_transition_threshold);// *
2228 
2229  int32_t frame = m_orf_info.element(from_state,0);
2230  lookup_content_svm_values(from_pos, from_pos_thresh, m_pos[from_pos], m_pos[from_pos_thresh], svm_value_part1, frame);
2231 
2232 #ifdef DYNPROG_DEBUG
2233  SG_PRINT("part1: pos1: %i pos2: %i pos3: %i \nsvm_value_part1: ", m_pos[from_pos], m_pos[from_pos_thresh], m_pos[from_pos_thresh+1]) ;
2235  SG_PRINT("%1.4f ", svm_value_part1[s]);
2236  SG_PRINT("\n");
2237 #endif
2238 
2239  while (to_pos_thresh>0 && m_pos[to_pos] - m_pos[to_pos_thresh-1] <= m_long_transition_threshold) // *
2240  to_pos_thresh-- ;
2241  ASSERT(to_pos_thresh>0) ;
2242  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh] <= m_long_transition_threshold) ; // *
2243  ASSERT(m_pos[to_pos] - m_pos[to_pos_thresh-1] > m_long_transition_threshold) ; // *
2244 
2245  lookup_content_svm_values(to_pos_thresh, to_pos, m_pos[to_pos_thresh], m_pos[to_pos], svm_value_part2, frame);
2246 
2247 #ifdef DYNPROG_DEBUG
2248  SG_PRINT("part2: pos1: %i pos2: %i pos3: %i \nsvm_value_part2: ", m_pos[to_pos], m_pos[to_pos_thresh], m_pos[to_pos_thresh+1]) ;
2250  SG_PRINT("%1.4f ", svm_value_part2[s]);
2251  SG_PRINT("\n");
2252 #endif
2253  }
2254  else
2255  {
2256  /* normal case */
2257 
2258  //SG_PRINT("from_pos: %i; to_pos: %i; m_pos[to_pos]-m_pos[from_pos]: %i \n",from_pos, to_pos, m_pos[to_pos]-m_pos[from_pos]);
2259  int32_t frame = m_orf_info.element(from_state,0);
2260  if (false)//(frame>=0)
2261  {
2262  int32_t num_current_svms=0;
2263  int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2264  SG_PRINT("penalties(%i, %i), frame:%i ", from_state, to_state, frame);
2265  PEN.element(to_state, from_state)->get_used_svms(&num_current_svms, svm_ids);
2266  SG_PRINT("\n");
2267  }
2268 
2269  lookup_content_svm_values(from_pos, to_pos, m_pos[from_pos],m_pos[to_pos], svm_value, frame);
2270 #ifdef DYNPROG_DEBUG
2271  SG_PRINT("part2: pos1: %i pos2: %i \nsvm_values: ", m_pos[from_pos], m_pos[to_pos]) ;
2273  SG_PRINT("%1.4f ", svm_value[s]);
2274  SG_PRINT("\n");
2275 #endif
2276  }
2277  }
2278 
2279  if (PEN.element(to_state, from_state)!=NULL)
2280  {
2281  float64_t nscore = 0 ;
2282  if (is_long_transition)
2283  {
2284  float64_t pen_value_part1 = PEN.element(to_state, from_state)->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1) ;
2285  float64_t pen_value_part2 = PEN.element(to_state, from_state)->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2) ;
2286  nscore= 0.5*pen_value_part1 + 0.5*pen_value_part2 ;
2287  }
2288  else
2289  nscore = PEN.element(to_state, from_state)->lookup_penalty(m_pos[to_pos]-m_pos[from_pos], svm_value) ;
2290 
2291  if (false)//(nscore<-1e9)
2292  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f\n",
2293  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state, nscore) ;
2294 
2295  my_scores[i] += nscore ;
2296 
2297  for (int32_t s=m_num_svms;s<m_num_lin_feat_plifs_cum[m_num_raw_data]; s++)/*set tiling plif values to neutral values (that do not influence derivative calculation)*/
2298  {
2299  svm_value[s]=-CMath::INFTY;
2300  svm_value_part1[s]=-CMath::INFTY;
2301  svm_value_part2[s]=-CMath::INFTY;
2302  }
2303 
2304 #ifdef DYNPROG_DEBUG
2305  //SG_DEBUG( "%i. transition penalty: from_state=%i to_state=%i from_pos=%i [%i] to_pos=%i [%i] value=%i\n", i, from_state, to_state, from_pos, m_pos[from_pos], to_pos, m_pos[to_pos], m_pos[to_pos]-m_pos[from_pos]) ;
2306 #endif
2307  if (is_long_transition)
2308  {
2309 #ifdef DYNPROG_DEBUG
2310  float64_t sum_score = 0.0 ;
2311 
2312  for (int kk=0; kk<i; kk++)
2313  sum_score += my_scores[i] ;
2314 
2315  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), to_pos=%i (%i)=> %1.5f, %1.5f --- 1: %1.6f (%i-%i) 2: %1.6f (%i-%i) \n",
2316  is_long_transition, m_pos[from_pos], from_state, m_pos[to_pos], to_state,
2317  nscore, sum_score,
2318  PEN.element(to_state, from_state)->lookup_penalty(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1)*0.5, m_pos[from_pos], m_pos[from_pos_thresh],
2319  PEN.element(to_state, from_state)->lookup_penalty(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2)*0.5, m_pos[to_pos_thresh], m_pos[to_pos]) ;
2320 #endif
2321  }
2322 
2323  if (is_long_transition)
2324  {
2325  PEN.element(to_state, from_state)->penalty_add_derivative(m_pos[from_pos_thresh]-m_pos[from_pos], svm_value_part1, 0.5) ;
2326  PEN.element(to_state, from_state)->penalty_add_derivative(m_pos[to_pos]-m_pos[to_pos_thresh], svm_value_part2, 0.5) ;
2327  }
2328  else
2329  PEN.element(to_state, from_state)->penalty_add_derivative(m_pos[to_pos]-m_pos[from_pos], svm_value, 1) ;
2330 
2331  //SG_PRINT("m_num_raw_data = %i \n", m_num_raw_data) ;
2332 
2333  // for tiling array and rna-seq data every single measurement must be added to the derivative
2334  // in contrast to the content svm predictions where we have a single value per transition;
2335  // content svm predictions have already been added to the derivative, thus we start with d=1
2336  // instead of d=0
2337  if (is_long_transition)
2338  {
2339  for (int32_t d=1; d<=m_num_raw_data; d++)
2340  {
2341  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2342  svm_value[s]=-CMath::INFTY;
2343  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2344  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[from_pos_thresh],intensities, d);
2345  for (int32_t k=0;k<num_intensities;k++)
2346  {
2347  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2348  svm_value[j]=intensities[k];
2349 
2350  PEN.element(to_state, from_state)->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2351 
2352  }
2353  num_intensities = raw_intensities_interval_query(m_pos[to_pos_thresh], m_pos[to_pos],intensities, d);
2354  for (int32_t k=0;k<num_intensities;k++)
2355  {
2356  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2357  svm_value[j]=intensities[k];
2358 
2359  PEN.element(to_state, from_state)->penalty_add_derivative(-CMath::INFTY, svm_value, 0.5) ;
2360 
2361  }
2362  SG_FREE(intensities);
2363 
2364  }
2365  }
2366  else
2367  {
2368  for (int32_t d=1; d<=m_num_raw_data; d++)
2369  {
2370  for (int32_t s=0;s<m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;s++)
2371  svm_value[s]=-CMath::INFTY;
2372  float64_t* intensities = SG_MALLOC(float64_t, m_num_probes_cum[d]);
2373  int32_t num_intensities = raw_intensities_interval_query(m_pos[from_pos], m_pos[to_pos],intensities, d);
2374  //SG_PRINT("m_pos[from_pos]:%i, m_pos[to_pos]:%i, num_intensities:%i\n",m_pos[from_pos],m_pos[to_pos], num_intensities);
2375  for (int32_t k=0;k<num_intensities;k++)
2376  {
2377  for (int32_t j=m_num_lin_feat_plifs_cum[d-1];j<m_num_lin_feat_plifs_cum[d];j++)
2378  svm_value[j]=intensities[k];
2379 
2380  PEN.element(to_state, from_state)->penalty_add_derivative(-CMath::INFTY, svm_value, 1) ;
2381 
2382  }
2383  SG_FREE(intensities);
2384  }
2385  }
2386 
2387  }
2388 #ifdef DYNPROG_DEBUG
2389  SG_DEBUG( "%i. scores[i]=%f\n", i, my_scores[i]) ;
2390 #endif
2391 
2392  //SG_DEBUG( "emmission penalty skipped: to_state=%i to_pos=%i value=%1.2f score=%1.2f\n", to_state, to_pos, seq_input.element(to_state, to_pos), 0.0) ;
2393  for (int32_t k=0; k<max_num_signals; k++)
2394  {
2395  if ((PEN_state_signals.element(to_state,k)==NULL)&&(k==0))
2396  {
2397 #ifdef DYNPROG_DEBUG
2398  SG_DEBUG( "%i. emmission penalty: to_state=%i to_pos=%i score=%1.2f (no signal plif)\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k)) ;
2399 #endif
2400  my_scores[i] += seq_input.element(to_state, to_pos, k) ;
2401  //if (seq_input.element(to_state, to_pos, k) !=0)
2402  // SG_PRINT("features(%i,%i): %f\n",to_state,to_pos,seq_input.element(to_state, to_pos, k));
2403  break ;
2404  }
2405  if (PEN_state_signals.element(to_state, k)!=NULL)
2406  {
2407  float64_t nscore = PEN_state_signals.element(to_state,k)->lookup_penalty(seq_input.element(to_state, to_pos, k), svm_value) ; // this should be ok for long_transitions (svm_value does not matter)
2408  my_scores[i] += nscore ;
2409 #ifdef DYNPROG_DEBUG
2410  if (false)//(nscore<-1e9)
2411  {
2412  SG_PRINT("is_long_transition=%i (from_pos=%i (%i), from_state=%i, to_pos=%i (%i) to_state=%i=> %1.5f, dim3:%i, seq_input.element(to_state, to_pos, k): %1.4f\n",
2413  is_long_transition, m_pos[from_pos], from_pos, from_state, m_pos[to_pos], to_pos, to_state, nscore, k, seq_input.element(to_state, to_pos, k)) ;
2414  for (int x=0; x<23; x++)
2415  {
2416  for (int i=-10; i<10; i++)
2417  SG_PRINT("%1.4f\t", seq_input.element(x, to_pos+i, k));
2418  SG_PRINT("\n");
2419  }
2420 
2421  }
2422 #endif
2423  //break ;
2424  //int32_t num_current_svms=0;
2425  //int32_t svm_ids[] = {-8, -7, -6, -5, -4, -3, -2, -1};
2426  //SG_PRINT("PEN_state_signals->id: ");
2427  //PEN_state_signals.element(to_state, k)->get_used_svms(&num_current_svms, svm_ids);
2428  //SG_PRINT("\n");
2429  //if (nscore != 0)
2430  //SG_PRINT( "%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k) ;
2431 #ifdef DYNPROG_DEBUG
2432  SG_DEBUG( "%i. emmission penalty: to_state=%i to_pos=%i value=%1.2f score=%1.2f k=%i\n", i, to_state, to_pos, seq_input.element(to_state, to_pos, k), nscore, k) ;
2433 #endif
2434  PEN_state_signals.element(to_state,k)->penalty_add_derivative(seq_input.element(to_state, to_pos, k), svm_value, 1) ; // this should be ok for long_transitions (svm_value does not matter)
2435  } else
2436  break ;
2437  }
2438 
2439  //#ifdef DYNPROG_DEBUG
2440  //SG_PRINT( "scores[%i]=%f (final) \n", i, my_scores[i]) ;
2441  //SG_PRINT( "losses[%i]=%f (final) , total_loss: %f \n", i, my_losses[i], total_loss) ;
2442  total_score += my_scores[i] ;
2443  total_loss += my_losses[i] ;
2444  //#endif
2445  }
2446  //#ifdef DYNPROG_DEBUG
2447  //SG_PRINT( "total score = %f \n", total_score) ;
2448  //SG_PRINT( "total loss = %f \n", total_loss) ;
2449  //#endif
2450  SG_FREE(svm_value);
2451  SG_FREE(svm_value_part1);
2452  SG_FREE(svm_value_part2);
2453 }
2454 
2455 int32_t CDynProg::raw_intensities_interval_query(const int32_t from_pos, const int32_t to_pos, float64_t* intensities, int32_t type)
2456 {
2457  ASSERT(from_pos<to_pos);
2458  int32_t num_intensities = 0;
2459  int32_t* p_tiling_pos = &m_probe_pos[m_num_probes_cum[type-1]];
2460  float64_t* p_tiling_data = &m_raw_intensities[m_num_probes_cum[type-1]];
2461  int32_t last_pos;
2462  int32_t num = m_num_probes_cum[type-1];
2463  while (*p_tiling_pos<to_pos)
2464  {
2465  if (*p_tiling_pos>=from_pos)
2466  {
2467  intensities[num_intensities] = *p_tiling_data;
2468  num_intensities++;
2469  }
2470  num++;
2471  if (num>=m_num_probes_cum[type])
2472  break;
2473  last_pos = *p_tiling_pos;
2474  p_tiling_pos++;
2475  p_tiling_data++;
2476  ASSERT(last_pos<*p_tiling_pos);
2477  }
2478  return num_intensities;
2479 }
2480 
2481 void CDynProg::lookup_content_svm_values(const int32_t from_state, const int32_t to_state, const int32_t from_pos, const int32_t to_pos, float64_t* svm_values, int32_t frame)
2482 {
2483 #ifdef DYNPROG_TIMING_DETAIL
2484  MyTime.start() ;
2485 #endif
2486 // ASSERT(from_state<to_state);
2487 // if (!(from_pos<to_pos))
2488 // SG_ERROR("from_pos!<to_pos, from_pos: %i to_pos: %i \n",from_pos,to_pos);
2489  for (int32_t i=0;i<m_num_svms;i++)
2490  {
2491  float64_t to_val = m_lin_feat.get_element(i, to_state);
2492  float64_t from_val = m_lin_feat.get_element(i, from_state);
2493  svm_values[i] = (to_val-from_val)/(to_pos-from_pos);
2494  }
2495  for (int32_t i=m_num_svms;i<m_num_lin_feat_plifs_cum[m_num_raw_data];i++)
2496  {
2497  float64_t to_val = m_lin_feat.get_element(i, to_state);
2498  float64_t from_val = m_lin_feat.get_element(i, from_state);
2499  svm_values[i] = to_val-from_val ;
2500  }
2501  if (m_intron_list)
2502  {
2503  int32_t* support = SG_MALLOC(int32_t, m_num_intron_plifs);
2504  m_intron_list->get_intron_support(support, from_state, to_state);
2505  int32_t intron_list_start = m_num_lin_feat_plifs_cum[m_num_raw_data];
2506  int32_t intron_list_end = m_num_lin_feat_plifs_cum[m_num_raw_data]+m_num_intron_plifs;
2507  int32_t cnt = 0;
2508  for (int32_t i=intron_list_start; i<intron_list_end;i++)
2509  {
2510  svm_values[i] = (float64_t) (support[cnt]);
2511  cnt++;
2512  }
2513  //if (to_pos>3990 && to_pos<4010)
2514  // SG_PRINT("from_state:%i to_state:%i support[0]:%i support[1]:%i\n",from_state, to_state, support[0], support[1]);
2515  SG_FREE(support);
2516  }
2517  // find the correct row with precomputed frame predictions
2518  if (frame!=-1)
2519  {
2520  svm_values[frame_plifs[0]] = 1e10;
2521  svm_values[frame_plifs[1]] = 1e10;
2522  svm_values[frame_plifs[2]] = 1e10;
2523  int32_t global_frame = from_pos%3;
2524  int32_t row = ((global_frame+frame)%3)+4;
2525  float64_t to_val = m_lin_feat.get_element(row, to_state);
2526  float64_t from_val = m_lin_feat.get_element(row, from_state);
2527  svm_values[frame+frame_plifs[0]] = (to_val-from_val)/(to_pos-from_pos);
2528  }
2529 #ifdef DYNPROG_TIMING_DETAIL
2530  MyTime.stop() ;
2531  content_svm_values_time += MyTime.time_diff_sec() ;
2532 #endif
2533 }
2534 void CDynProg::set_intron_list(CIntronList* intron_list, int32_t num_plifs)
2535 {
2536  m_intron_list = intron_list;
2537  m_num_intron_plifs = num_plifs;
2538 }
2539 

SHOGUN Machine Learning Toolbox - Documentation