00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <math.h>
00025 #include <ctype.h>
00026 #include <string.h>
00027 #include "kernel/LocalAlignmentStringKernel.h"
00028
00029 using namespace shogun;
00030
00031
00032
00033
00034
00035 #define NAA 20
00036 #define NLET 26
00037 const char* CLocalAlignmentStringKernel::aaList= "ARNDCQEGHILKMFPSTWYV";
00038
00039
00040
00041
00042
00043 #define OPENING 12
00044 #define EXTENSION 2
00045
00046
00047 const int32_t CLocalAlignmentStringKernel::blosum[] = {
00048 6,
00049 -2, 8,
00050 -2, -1, 9,
00051 -3, -2, 2, 9,
00052 -1, -5, -4, -5, 13,
00053 -1, 1, 0, 0, -4, 8,
00054 -1, 0, 0, 2, -5, 3, 7,
00055 0, -3, -1, -2, -4, -3, -3, 8,
00056 -2, 0, 1, -2, -4, 1, 0, -3, 11,
00057 -2, -5, -5, -5, -2, -4, -5, -6, -5, 6,
00058 -2, -3, -5, -5, -2, -3, -4, -5, -4, 2, 6,
00059 -1, 3, 0, -1, -5, 2, 1, -2, -1, -4, -4, 7,
00060 -1, -2, -3, -5, -2, -1, -3, -4, -2, 2, 3, -2, 8,
00061 -3, -4, -5, -5, -4, -5, -5, -5, -2, 0, 1, -5, 0, 9,
00062 -1, -3, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -4, -5, 11,
00063 2, -1, 1, 0, -1, 0, 0, 0, -1, -4, -4, 0, -2, -4, -1, 6,
00064 0, -2, 0, -2, -1, -1, -1, -2, -3, -1, -2, -1, -1, -3, -2, 2, 7,
00065 -4, -4, -6, -6, -3, -3, -4, -4, -4, -4, -2, -4, -2, 1, -6, -4, -4, 16,
00066 -3, -3, -3, -5, -4, -2, -3, -5, 3, -2, -2, -3, -1, 4, -4, -3, -2, 3, 10,
00067 0, -4, -4, -5, -1, -3, -4, -5, -5, 4, 1, -3, 1, -1, -4, -2, 0, -4, -2, 6};
00068
00069
00070 #define BINDEX(i,j) (((i)>(j))?(j)+(((i)*(i+1))/2):(i)+(((j)*(j+1))/2))
00071
00072
00073
00074
00075
00076 #define SCALING 0.1
00077
00078
00079
00080
00081
00082
00083
00084 #define LOGP(x,y) LogSum(x,y)
00085
00086
00087
00088
00089
00090
00091
00092 #define LOG0 -10000
00093 #define INTSCALE 1000.0
00094
00095 int32_t CLocalAlignmentStringKernel::logsum_lookup[LOGSUM_TBL];
00096
00097 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(int32_t size)
00098 : CStringKernel<char>(size), initialized(false)
00099 {
00100 scaled_blosum=new int32_t[sizeof(blosum)];
00101 init_logsum();
00102 initialize();
00103 }
00104
00105 CLocalAlignmentStringKernel::CLocalAlignmentStringKernel(
00106 CStringFeatures<char>* l, CStringFeatures<char>* r)
00107 : CStringKernel<char>(10), initialized(false)
00108 {
00109 scaled_blosum=new int32_t[sizeof(blosum)];
00110 init_logsum();
00111 initialize();
00112 init(l, r);
00113 }
00114
00115 CLocalAlignmentStringKernel::~CLocalAlignmentStringKernel()
00116 {
00117 cleanup();
00118 }
00119
00120 bool CLocalAlignmentStringKernel::init(CFeatures* l, CFeatures* r)
00121 {
00122 CStringKernel<char>::init(l, r);
00123 initialized = true;
00124 return init_normalizer();
00125 }
00126
00127 void CLocalAlignmentStringKernel::cleanup()
00128 {
00129 delete[] scaled_blosum;
00130 scaled_blosum=NULL;
00131
00132 free(isAA);
00133 isAA=NULL;
00134 free(aaIndex);
00135 aaIndex=NULL;
00136
00137 CKernel::cleanup();
00138 }
00139
00140
00141
00142
00143 void CLocalAlignmentStringKernel::init_logsum(void){
00144 int32_t i;
00145 for (i = 0; i < LOGSUM_TBL; i++)
00146 logsum_lookup[i] = (int32_t) (INTSCALE*
00147 (log(1.+exp( (float32_t) -i/INTSCALE))));
00148 }
00149
00150 int32_t CLocalAlignmentStringKernel::LogSum(int32_t p1, int32_t p2)
00151 {
00152 int32_t diff;
00153 static int32_t firsttime=1;
00154
00155 if (firsttime)
00156 {
00157 init_logsum();
00158 firsttime =0;
00159 }
00160
00161 diff=p1-p2;
00162 if (diff>=LOGSUM_TBL) return p1;
00163 else if (diff<=-LOGSUM_TBL) return p2;
00164 else if (diff>0) return p1+logsum_lookup[diff];
00165 else return p2+logsum_lookup[-diff];
00166 }
00167
00168
00169 float32_t CLocalAlignmentStringKernel::LogSum2(float32_t p1, float32_t p2)
00170 {
00171 if (p1 > p2)
00172 return (p1-p2 > 50.) ? p1 : p1 + log(1. + exp(p2-p1));
00173 else
00174 return (p2-p1 > 50.) ? p2 : p2 + log(1. + exp(p1-p2));
00175 }
00176
00177
00178 void CLocalAlignmentStringKernel::initialize(void)
00179
00180 {
00181 register int32_t i;
00182
00183
00184 if ((aaIndex=(int32_t *)calloc(NLET,sizeof(int32_t))) == NULL)
00185 SG_ERROR("run out o memory");
00186 for (i=0;i<NAA;i++)
00187 aaIndex[aaList[i]-'A']=i;
00188
00189
00190 if ((isAA=(int32_t *)calloc(256,sizeof(int32_t))) == NULL)
00191 SG_ERROR("run out of memory");
00192 for (i=0;i<NAA;i++)
00193 isAA[(int32_t)aaList[i]]=1;
00194
00195
00196 for (i=0 ; i<NAA*(NAA+1)/2; i++)
00197 scaled_blosum[i] = (int32_t) floor(blosum[i]*SCALING*INTSCALE);
00198
00199
00200
00201 opening = (int32_t) floor(OPENING * SCALING*INTSCALE);
00202 extension = (int32_t) floor(EXTENSION * SCALING*INTSCALE);
00203 }
00204
00205
00206
00207
00208
00209
00210 float64_t CLocalAlignmentStringKernel::LAkernelcompute(
00211 int32_t* aaX, int32_t* aaY,
00212 int32_t nX, int32_t nY )
00213 {
00214 register int32_t
00215 i,j,
00216 cur, old,
00217 curpos, frompos;
00218
00219 int32_t
00220 *logX,
00221 *logY,
00222 *logM,
00223 *logX2,
00224 *logY2,
00225
00226 aux , aux2;
00227 int32_t
00228 cl;
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240 cl = nY+1;
00241
00242 logM=new int32_t[2*cl];
00243 logX=new int32_t[2*cl];
00244 logY=new int32_t[2*cl];
00245 logX2=new int32_t[2*cl];
00246 logY2=new int32_t[2*cl];
00247
00248
00249
00250
00251
00252
00253 for (j=0;j<cl;j++) {
00254 logM[j]=LOG0;
00255 logX[j]=LOG0;
00256 logY[j]=LOG0;
00257 logX2[j]=LOG0;
00258 logY2[j]=LOG0;
00259
00260 }
00261
00262
00263 cur = 1;
00264 old = 0;
00265
00266
00267
00268
00269
00270
00271
00272 for (i=1;i<=nX;i++) {
00273
00274
00275 curpos = cur*cl;
00276 logM[curpos] = LOG0;
00277 logX[curpos] = LOG0;
00278 logY[curpos] = LOG0;
00279 logX2[curpos] = LOG0;
00280 logY2[curpos] = LOG0;
00281
00282
00283 for (j=1;j<=nY;j++) {
00284
00285 curpos = cur*cl + j;
00286
00287
00288
00289
00290 frompos = old*cl + j;
00291
00292
00293 logX[curpos] = LOGP( - opening + logM[frompos] , - extension + logX[frompos] );
00294
00295
00296
00297 logX2[curpos] = LOGP( logM[frompos] , logX2[frompos] );
00298
00299
00300
00301
00302 frompos = cur*cl + j-1;
00303
00304
00305 aux = LOGP( - opening + logM[frompos] , - extension + logY[frompos] );
00306 logY[curpos] = LOGP( aux , - opening + logX[frompos] );
00307
00308
00309 aux = LOGP( logM[frompos] , logY2[frompos] );
00310 logY2[curpos] = LOGP( aux , logX2[frompos] );
00311
00312
00313
00314
00315 frompos = old*cl + j-1;
00316
00317 aux = LOGP( logX[frompos] , logY[frompos] );
00318 aux2 = LOGP( 0 , logM[frompos] );
00319 logM[curpos] = LOGP( aux , aux2 ) + scaled_blosum[ BINDEX( aaX[i-1] , aaY[j-1] ) ];
00320
00321
00322
00323
00324 }
00325
00326
00327
00328 cur = 1-cur;
00329 old = 1-old;
00330
00331 }
00332
00333
00334
00335
00336
00337 curpos = old*cl + nY;
00338 aux = LOGP( logX2[curpos] , logY2[curpos] );
00339 aux2 = LOGP( 0 , logM[curpos] );
00340
00341
00342
00343 delete[] logM;
00344 delete[] logX;
00345 delete[] logY;
00346 delete[] logX2;
00347 delete[] logY2;
00348
00349
00350 return (float32_t)LOGP(aux,aux2)/INTSCALE;
00351 }
00352
00353
00354
00355
00356
00357
00358
00359
00360 float64_t CLocalAlignmentStringKernel::compute(int32_t idx_x, int32_t idx_y)
00361 {
00362 int32_t *aax,*aay;
00363 int32_t lx=0,ly=0;
00364 int32_t i,j;
00365
00366
00367 if (isAA == NULL)
00368 initialize();
00369
00370 bool free_x, free_y;
00371 char* x=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_x, lx, free_x);
00372 char* y=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_y, ly, free_y);
00373 ASSERT(x && y);
00374
00375 if ((lx<1) || (ly<1))
00376 SG_ERROR("empty chain");
00377
00378
00379
00380 if ((aax=(int32_t *)calloc(lx,sizeof(int32_t))) == NULL)
00381 SG_ERROR("run out of memory");
00382 if ((aay=(int32_t *)calloc(ly,sizeof(int32_t))) == NULL)
00383 SG_ERROR("run out of memory");
00384
00385
00386
00387 j=0;
00388 for (i=0 ; i<lx ; i++)
00389 if (isAA[toupper(x[i])])
00390 aax[j++] = aaIndex[toupper(x[i])-'A'];
00391 lx = j;
00392 j=0;
00393 for (i=0 ; i<ly ; i++)
00394 if (isAA[toupper(y[i])])
00395 aay[j++] = aaIndex[toupper(y[i])-'A'];
00396 ly = j;
00397
00398
00399
00400 float64_t result=LAkernelcompute(aax,aay,lx,ly);
00401
00402
00403 free(aax);
00404 free(aay);
00405
00406 ((CStringFeatures<char>*) lhs)->free_feature_vector(x, idx_x, free_x);
00407 ((CStringFeatures<char>*) rhs)->free_feature_vector(y, idx_y, free_y);
00408
00409 return result;
00410 }