SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
LocalAlignmentStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * Compute the local alignment kernel
3  *
4  * Largely based on LAkernel.c (version 0.3)
5  *
6  * Copyright 2003 Jean-Philippe Vert
7  * Copyright 2005 Jean-Philippe Vert, Hiroto Saigo
8  *
9  * Shogun specific adjustments Written (W) 2007-2008,2010 Soeren Sonnenburg
10  *
11  * Reference:
12  * H. Saigo, J.-P. Vert, T. Akutsu and N. Ueda, "Protein homology
13  * detection using string alignment kernels", Bioinformatics,
14  * vol.20, p.1682-1689, 2004.
15  *
16  * This program is free software; you can redistribute it and/or modify
17  * it under the terms of the GNU General Public License as published by
18  * the Free Software Foundation; either version 3 of the License, or
19  * (at your option) any later version.
20  */
21 
22 #include <stdlib.h>
23 #include <stdio.h>
24 #include <math.h>
25 #include <ctype.h>
26 #include <string.h>
28 
29 using namespace shogun;
30 
31 /****************/
32 /* The alphabet */
33 /****************/
34 
35 #define NAA 20 /* Number of amino-acids */
36 #define NLET 26 /* Number of letters in the alphabet */
37 const char* CLocalAlignmentStringKernel::aaList= "ARNDCQEGHILKMFPSTWYV"; /* The list of amino acids */
38 
39 /*****************/
40 /* SW parameters */
41 /*****************/
42 
43 /* mutation matrix */
44 const int32_t CLocalAlignmentStringKernel::blosum[] = {
45  6,
46  -2, 8,
47  -2, -1, 9,
48  -3, -2, 2, 9,
49  -1, -5, -4, -5, 13,
50  -1, 1, 0, 0, -4, 8,
51  -1, 0, 0, 2, -5, 3, 7,
52  0, -3, -1, -2, -4, -3, -3, 8,
53  -2, 0, 1, -2, -4, 1, 0, -3, 11,
54  -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
55  -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
56  -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
57  -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
58  -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
59  -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
60  2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
61  0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
62  -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
63  -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
64  0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
65 
66 /* Index corresponding to the (i,j) entry (i,j=0..19) in the blosum matrix */
67 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
68 
69 /*********************
70  * Kernel parameters *
71  *********************/
72 
73 #define SCALING 0.1 /* Factor to scale all SW parameters */
74 
75 /* If you want to compute the sum over all local alignments (to get a valid kernel), uncomment the following line : */
76 /* If x=log(a) and y=log(b), compute log(a+b) : */
77 /*
78 #define LOGP(x,y) (((x)>(y))?(x)+log1p(exp((y)-(x))):(y)+log1p(exp((x)-(y))))
79 */
80 
81 #define LOGP(x,y) LogSum(x,y)
82 
83 /* OR if you want to compute the score of the best local alignment (to get the SW score by Viterbi), uncomment the following line : */
84 /*
85 #define LOGP(x,y) (((x)>(y))?(x):(y))
86 */
87 
88 /* Usefule constants */
89 #define LOG0 -10000 /* log(0) */
90 #define INTSCALE 1000.0 /* critical for speed and precise computation*/
91 
93 
95 : CStringKernel<char>(size)
96 {
97  init();
98  init_static_variables();
99 }
100 
103  float64_t opening, float64_t extension)
104 : CStringKernel<char>()
105 {
106  init();
107  m_opening=opening;
108  m_extension=extension;
109  init_static_variables();
110  init(l, r);
111 }
112 
114 {
115  cleanup();
116 }
117 
118 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
119 {
121  initialized = true;
122  return init_normalizer();
123 }
124 
126 {
128  scaled_blosum=NULL;
129 
130  SG_FREE(isAA);
131  isAA=NULL;
132  SG_FREE(aaIndex);
133  aaIndex=NULL;
134 
136 }
137 
138 /* LogSum - default log funciotion. fast, but not exact */
139 /* LogSum2 - precise, but slow. Note that these two functions need different figure types */
140 
141 void CLocalAlignmentStringKernel::init_logsum(){
142  int32_t i;
143  for (i = 0; i < LOGSUM_TBL; i++)
144  logsum_lookup[i] = (int32_t) (INTSCALE*
145  (log(1.+exp( (float32_t) -i/INTSCALE))));
146 }
147 
148 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
149 {
150  int32_t diff;
151  static int32_t firsttime=1;
152 
153  if (firsttime)
154  {
155  init_logsum();
156  firsttime =0;
157  }
158 
159  diff=p1-p2;
160  if (diff>=LOGSUM_TBL) return p1;
161  else if (diff<=-LOGSUM_TBL) return p2;
162  else if (diff>0) return p1+logsum_lookup[diff];
163  else return p2+logsum_lookup[-diff];
164 }
165 
166 
167 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
168 {
169  if (p1 > p2)
170  return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
171  else
172  return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
173 }
174 
175 
176 void CLocalAlignmentStringKernel::init_static_variables()
177  /* Initialize all static variables. This function should be called once before computing the first pair HMM score */
178 {
179  register int32_t i;
180 
181  /* Initialization of the array which gives the position of each amino-acid in the set of amino-acid */
182  if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t))) == NULL)
183  SG_ERROR("run out o memory");
184  for (i=0;i<NAA;i++)
185  aaIndex[aaList[i]-'A']=i;
186 
187  /* Initialization of the array which indicates whether a char is an amino-acid */
188  if ((isAA=(int32_t *)calloc(256,sizeof(int32_t))) == NULL)
189  SG_ERROR("run out of memory");
190  for (i=0;i<NAA;i++)
191  isAA[(int32_t)aaList[i]]=1;
192 
193  /* Scale the blossum matrix */
194  for (i=0 ; i<NAA*(NAA+1)/2; i++)
195  scaled_blosum[i] = (int32_t) floor(blosum[i]*SCALING*INTSCALE);
196 
197 
198  /* Scale of gap penalties */
199  m_opening = (int32_t) floor(m_opening * SCALING*INTSCALE);
200  m_extension = (int32_t) floor(m_extension * SCALING*INTSCALE);
201 }
202 
203 
204 
205 /* Implementation of the
206  * convolution kernel which generalizes the Smith-Waterman algorithm
207  */
208 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
209  int32_t* aaX, int32_t* aaY, /* the two amino-acid sequences (as sequences of indexes in [0..NAA-1] indicating the position of the amino-acid in the variable 'aaList') */
210  int32_t nX, int32_t nY /* the lengths of both sequences */)
211 {
212  register int32_t
213  i,j, /* loop indexes */
214  cur, old, /* to indicate the array to use (0 or 1) */
215  curpos, frompos; /* position in an array */
216 
217  int32_t
218  *logX, /* arrays to store the log-values of each state */
219  *logY,
220  *logM,
221  *logX2,
222  *logY2,
223 
224  aux , aux2;/* , aux3 , aux4 , aux5;*/
225  int32_t
226  cl; /* length of a column for the dynamic programming */
227 
228  /*
229  printf("now computing pairHMM between %d and %d:\n",nX,nY);
230  for (i=0;i<nX;printf("%d ",aaX[i++]));
231  printf("\n and \n");
232  for (i=0;i<nY;printf("%d ",aaY[i++]));
233  printf("\n");
234  */
235 
236  /* Initialization of the arrays */
237  /* Each array stores two successive columns of the (nX+1)x(nY+1) table used in dynamic programming */
238  cl = nY+1; /* each column stores the positions in the aaY sequence, plus a position at zero */
239 
240  logM=SG_CALLOC(int32_t, 2*cl);
241  logX=SG_CALLOC(int32_t, 2*cl);
242  logY=SG_CALLOC(int32_t, 2*cl);
243  logX2=SG_CALLOC(int32_t, 2*cl);
244  logY2=SG_CALLOC(int32_t, 2*cl);
245 
246  /************************************************/
247  /* First iteration : initialization of column 0 */
248  /************************************************/
249  /* The log=proabilities of each state are initialized for the first column (x=0,y=0..nY) */
250 
251  for (j=0;j<cl;j++) {
252  logM[j]=LOG0;
253  logX[j]=LOG0;
254  logY[j]=LOG0;
255  logX2[j]=LOG0;
256  logY2[j]=LOG0;
257 
258  }
259 
260  /* Update column order */
261  cur = 1; /* Indexes [0..cl-1] are used to process the next column */
262  old = 0; /* Indexes [cl..2*cl-1] were used for column 0 */
263 
264 
265  /************************************************/
266  /* Next iterations : processing columns 1 .. nX */
267  /************************************************/
268 
269  /* Main loop to vary the position in aaX : i=1..nX */
270  for (i=1;i<=nX;i++) {
271 
272  /* Special update for positions (i=1..nX,j=0) */
273  curpos = cur*cl; /* index of the state (i,0) */
274  logM[curpos] = LOG0;
275  logX[curpos] = LOG0;
276  logY[curpos] = LOG0;
277  logX2[curpos] = LOG0;
278  logY2[curpos] = LOG0;
279 
280  /* Secondary loop to vary the position in aaY : j=1..nY */
281  for (j=1;j<=nY;j++) {
282 
283  curpos = cur*cl + j; /* index of the state (i,j) */
284 
285  /* Update for states which emit X only */
286  /***************************************/
287 
288  frompos = old*cl + j; /* index of the state (i-1,j) */
289 
290  /* State RX */
291  logX[curpos] = LOGP( - m_opening + logM[frompos] , - m_extension + logX[frompos] );
292  /* printf("%.5f\n",logX[curpos]);*/
293  /* printf("%.5f\n",logX_B[curpos]);*/
294  /* State RX2 */
295  logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
296 
297  /* Update for states which emit Y only */
298  /***************************************/
299 
300  frompos = cur*cl + j-1; /* index of the state (i,j-1) */
301 
302  /* State RY */
303  aux = LOGP( - m_opening + logM[frompos] , - m_extension + logY[frompos] );
304  logY[curpos] = LOGP( aux , - m_opening + logX[frompos] );
305 
306  /* State RY2 */
307  aux = LOGP( logM[frompos] , logY2[frompos] );
308  logY2[curpos] = LOGP( aux , logX2[frompos] );
309 
310  /* Update for states which emit X and Y */
311  /****************************************/
312 
313  frompos = old*cl + j-1; /* index of the state (i-1,j-1) */
314 
315  aux = LOGP( logX[frompos] , logY[frompos] );
316  aux2 = LOGP( 0 , logM[frompos] );
317  logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
318 
319  /* printf("i=%d , j=%d\nM=%.5f\nX=%.5f\nY=%.5f\nX2=%.5f\nY2=%.5f\n",i,j,logM[curpos],logX[curpos],logY[curpos],logX2[curpos],logY2[curpos]);
320  */
321 
322  } /* end of j=1:nY loop */
323 
324 
325  /* Update the culumn order */
326  cur = 1-cur;
327  old = 1-old;
328 
329  } /* end of j=1:nX loop */
330 
331 
332  /* Termination */
333  /***************/
334 
335  curpos = old*cl + nY; /* index of the state (nX,nY) */
336  aux = LOGP( logX2[curpos] , logY2[curpos] );
337  aux2 = LOGP( 0 , logM[curpos] );
338  /* kernel_value = LOGP( aux , aux2 );*/
339 
340  /* Memory release */
341  SG_FREE(logM);
342  SG_FREE(logX);
343  SG_FREE(logY);
344  SG_FREE(logX2);
345  SG_FREE(logY2);
346 
347  /* Return the logarithm of the kernel */
348  return (float32_t)LOGP(aux,aux2)/INTSCALE;
349 }
350 
351 /********************/
352 /* Public functions */
353 /********************/
354 
355 
356 /* Return the log-probability of two sequences x and y under a pair HMM model */
357 /* x and y are strings of aminoacid letters, e.g., "AABRS" */
358 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
359 {
360  int32_t *aax,*aay; /* to convert x and y into sequences of amino-acid indexes */
361  int32_t lx=0,ly=0; /* lengths of x and y */
362  int32_t i,j;
363 
364  bool free_x, free_y;
365  char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
366  char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
367  ASSERT(x && y);
368 
369  if ((lx<1) || (ly<1))
370  SG_ERROR("empty chain");
371 
372  /* Create aax and aay */
373 
374  if ((aax=(int32_t *)calloc(lx,sizeof(int32_t))) == NULL)
375  SG_ERROR("run out of memory");
376  if ((aay=(int32_t *)calloc(ly,sizeof(int32_t))) == NULL)
377  SG_ERROR("run out of memory");
378 
379  /* Extract the characters corresponding to aminoacids and keep their indexes */
380 
381  j=0;
382  for (i=0 ; i<lx ; i++)
383  if (isAA[toupper(x[i])])
384  aax[j++] = aaIndex[toupper(x[i])-'A'];
385  lx = j;
386  j=0;
387  for (i=0 ; i<ly ; i++)
388  if (isAA[toupper(y[i])])
389  aay[j++] = aaIndex[toupper(y[i])-'A'];
390  ly = j;
391 
392 
393  /* Compute the pair HMM score */
394  float64_t result=LAkernelcompute(aax,aay,lx,ly);
395 
396  /* Release memory */
397  SG_FREE(aax);
398  SG_FREE(aay);
399 
400  ((CStringFeatures<char>*) lhs)->free_feature_vector(x, idx_x, free_x);
401  ((CStringFeatures<char>*) rhs)->free_feature_vector(y, idx_y, free_y);
402 
403  return result;
404 }
405 
406 void CLocalAlignmentStringKernel::init()
407 {
408  initialized = false;
409  isAA = NULL;
410  aaIndex = NULL;
411 
412  m_opening = 10;
413  m_extension = 2;
414 
415  scaled_blosum=SG_CALLOC(int32_t, sizeof(blosum));
416  init_logsum();
417 
418  m_parameters->add(&initialized, "initialized", "if kernel is initalized");
419  m_parameters->add(&m_opening, "opening", "opening gap opening penalty");
420  m_parameters->add(&m_extension, "extension", "extension gap extension penalty");
421 }

SHOGUN Machine Learning Toolbox - Documentation