SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2010 Soeren Sonnenburg 00008 * Copyright (C) 2010 Berlin Institute of Technology 00009 * 00010 * Based on code from Pavel Kuksa <pkuksa@cs.rutgers.edu> and 00011 * Vladimir Pavlovic <vladimir@cs.rutgers.edu> originally 00012 * released under the new BSD License. 00013 */ 00014 00015 #include "lib/common.h" 00016 #include "lib/io.h" 00017 #include "lib/Mathematics.h" 00018 #include "kernel/SparseSpatialSampleStringKernel.h" 00019 #include "features/StringFeatures.h" 00020 00021 using namespace shogun; 00022 00023 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel() 00024 : CStringKernel<char>(0), t(2), d(5) 00025 { 00026 } 00027 00028 CSparseSpatialSampleStringKernel::CSparseSpatialSampleStringKernel(CStringFeatures<char>* l, 00029 CStringFeatures<char>* r) : CStringKernel<char>(0), t(2), d(5) 00030 { 00031 init(l, r); 00032 } 00033 00034 bool CSparseSpatialSampleStringKernel::init(CFeatures* l, CFeatures* r) 00035 { 00036 CStringKernel<char>::init(l, r); 00037 return init_normalizer(); 00038 } 00039 00040 void CSparseSpatialSampleStringKernel::cleanup() 00041 { 00042 CKernel::cleanup(); 00043 } 00044 00045 CSparseSpatialSampleStringKernel::~CSparseSpatialSampleStringKernel() 00046 { 00047 } 00048 00049 SSKFeatures *CSparseSpatialSampleStringKernel::extractTriple(int **S, int *len, int nStr, int d1, int d2) 00050 { 00051 int i, j; 00052 int n, nfeat; 00053 int *group; 00054 int *features; 00055 int *s; 00056 int c; 00057 SSKFeatures *F; 00058 00059 nfeat = 0; 00060 for (i = 0; i < nStr; ++i) 00061 nfeat += len[i] - d1 -d2; 00062 group = (int *)malloc(nfeat*sizeof(int)); 00063 features = (int *)malloc(nfeat*3*sizeof(int *)); 00064 c = 0; 00065 for (i = 0; i < nStr; ++i) 00066 { 00067 n = len[i] - d1 - d2; 00068 s = S[i]; 00069 for (j = 0; j < n; ++j) 00070 { 00071 features[c] = s[j]; 00072 features[c+nfeat] = s[j+d1]; 00073 features[c+2*nfeat] = s[j+d1+d2]; 00074 group[c] = i; 00075 c++; 00076 } 00077 } 00078 ASSERT(nfeat==c); 00079 F = (SSKFeatures *)malloc(sizeof(SSKFeatures)); 00080 (*F).features = features; 00081 (*F).group = group; 00082 (*F).n = nfeat; 00083 return F; 00084 } 00085 00086 00087 SSKFeatures *CSparseSpatialSampleStringKernel::extractDouble(int **S, int *len, int nStr, int d1) 00088 { 00089 int i, j; 00090 int n, nfeat; 00091 int *group; 00092 int *features; 00093 int *s; 00094 int c; 00095 SSKFeatures *F; 00096 00097 nfeat = 0; 00098 for (i = 0; i < nStr; ++i) 00099 nfeat += len[i] - d1; 00100 group = (int *)malloc(nfeat*sizeof(int)); 00101 features = (int *)malloc(nfeat*2*sizeof(int *)); 00102 c = 0; 00103 for (i = 0; i < nStr; ++i) 00104 { 00105 n = len[i] - d1; 00106 s = S[i]; 00107 for (j = 0; j < n; ++j) 00108 { 00109 features[c] = s[j]; 00110 features[c+nfeat] = s[j+d1]; 00111 group[c] = i; 00112 c++; 00113 } 00114 } 00115 if (nfeat!=c) 00116 printf("Something is wrong...\n"); 00117 F = (SSKFeatures *)malloc(sizeof(SSKFeatures)); 00118 (*F).features = features; 00119 (*F).group = group; 00120 (*F).n = nfeat; 00121 return F; 00122 } 00123 00124 00125 void CSparseSpatialSampleStringKernel::compute_double(int32_t idx_a, int32_t idx_b) 00126 { 00127 int d1, d2; 00128 SSKFeatures *features; 00129 int *sortIdx; 00130 int *features_srt; 00131 int *group_srt; 00132 int maxIdx; 00133 char *kernelfilename; 00134 int **S; 00135 int *len; 00136 int nStr, nfeat; 00137 int i; 00138 int na; 00139 int *K; 00140 00141 for (d1 = 1; d1 <= d; ++d1) 00142 { 00143 if ( isVerbose ) printf("Extracting features..."), fflush( stdout ); 00144 features = extractDouble(S,len,nStr,d1); 00145 nfeat = (*features).n; 00146 printf("d=%d: %d features\n", d1, nfeat); 00147 maxIdx = 0; 00148 for (i = 0; i < nfeat*2; ++i) 00149 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i]; 00150 if (na < maxIdx+1) 00151 { 00152 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na); 00153 printf("\tUsing [0,%d] instead\n", maxIdx); 00154 na = maxIdx+1; 00155 } 00156 if (isVerbose) 00157 { 00158 printf("done.\n"); 00159 printf("Sorting..."); 00160 fflush( stdout ); 00161 } 00162 sortIdx = cntsrtna((*features).features,2,(*features).n,na); 00163 if (isVerbose) printf("done.\n"); 00164 features_srt = (int *)malloc(nfeat*2*sizeof(int *)); 00165 group_srt = (int *)malloc(nfeat*sizeof(int)); 00166 for (i = 0; i < nfeat; ++i) 00167 { 00168 features_srt[i]=(*features).features[sortIdx[i]]; 00169 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat]; 00170 group_srt[i] = (*features).group[sortIdx[i]]; 00171 } 00172 free(sortIdx); 00173 free((*features).features); 00174 free((*features).group); 00175 free(features); 00176 if (isVerbose) 00177 { 00178 printf("Counting..."); 00179 fflush( stdout ); 00180 } 00181 countAndUpdate(K,features_srt,group_srt,2,nfeat,nStr); 00182 free(features_srt); 00183 free(group_srt); 00184 if (isVerbose) 00185 { 00186 printf("done."); 00187 } 00188 } 00189 } 00190 00191 void CSparseSpatialSampleStringKernel::compute_triple(int32_t idx_a, int32_t idx_b) 00192 { 00193 int d1, d2; 00194 SSKFeatures *features; 00195 int *sortIdx; 00196 int *features_srt; 00197 int *group_srt; 00198 int maxIdx; 00199 char *kernelfilename; 00200 int **S; 00201 int *len; 00202 int nStr, nfeat; 00203 int i; 00204 int na; 00205 int *K; 00206 00207 for (d1 = 1; d1 <= d; ++d1) 00208 { 00209 for (d2 = 1; d2 <= d; ++d2) 00210 { 00211 if (isVerbose) 00212 { 00213 printf("Extracting features..."); 00214 fflush( stdout ); 00215 } 00216 features = extractTriple(S,len,nStr,d1,d2); 00217 nfeat = (*features).n; 00218 printf("(%d,%d): %d features\n", d1, d2, nfeat); 00219 maxIdx = 0; 00220 for (i = 0; i < nfeat*3; ++i) 00221 maxIdx = maxIdx > (*features).features[i] ? maxIdx : (*features).features[i]; 00222 if (na < maxIdx+1) 00223 { 00224 printf("WARNING: Sequence elements are outside the specified range [0,%d]\n",na); 00225 printf("\tUsing [0,%d] instead\n", maxIdx); 00226 na = maxIdx+1; 00227 } 00228 if (isVerbose) 00229 { 00230 printf("done.\n"); 00231 printf("Sorting..."); 00232 fflush( stdout ); 00233 } 00234 sortIdx = cntsrtna((*features).features,3,(*features).n,na); 00235 if (isVerbose) 00236 { 00237 printf("done.\n"); 00238 } 00239 features_srt = (int *)malloc(nfeat*3*sizeof(int *)); 00240 group_srt = (int *)malloc(nfeat*sizeof(int)); 00241 for (i = 0; i < nfeat; ++i) 00242 { 00243 features_srt[i]=(*features).features[sortIdx[i]]; 00244 features_srt[i+nfeat]=(*features).features[sortIdx[i]+nfeat]; 00245 features_srt[i+2*nfeat]=(*features).features[sortIdx[i]+2*nfeat]; 00246 group_srt[i] = (*features).group[sortIdx[i]]; 00247 } 00248 free(sortIdx); 00249 free((*features).features); 00250 free((*features).group); 00251 free(features); 00252 if (isVerbose) 00253 { 00254 printf("Counting..."); 00255 fflush( stdout ); 00256 } 00257 countAndUpdate(K,features_srt,group_srt,3,nfeat,nStr); 00258 free(features_srt); 00259 free(group_srt); 00260 if (isVerbose) 00261 { 00262 printf("done.\n"); 00263 } 00264 } 00265 } 00266 } 00267 00268 void CSparseSpatialSampleStringKernel::countAndUpdate(int *outK, int *sx, int *g, int k, int r, int nStr) 00269 { 00270 char same; 00271 int i, j; 00272 int cu; 00273 long int ucnt; 00274 long int startInd, endInd, j1; 00275 int *curfeat, *ucnts, *updind; 00276 00277 curfeat = (int *)malloc(k*sizeof(int)); 00278 ucnts= (int *)malloc(nStr*sizeof(int)); 00279 updind = (int *)malloc(nStr*sizeof(int)); 00280 i = 0; 00281 ucnt = 0; 00282 while (i<r) 00283 { 00284 for (j = 0; j < k; ++j) 00285 curfeat[j]=sx[i+j*r]; 00286 same=1; 00287 for (j = 0;j < k; ++j) 00288 if (curfeat[j]!=sx[i+j*r]) 00289 { 00290 same=0; 00291 break; 00292 } 00293 startInd=i; 00294 while (same && i<r) 00295 { 00296 i++; 00297 if (i >= r) break; 00298 same = 1; 00299 for (j = 0; j < k; ++j) 00300 if (curfeat[j]!=sx[i+j*r]) 00301 { 00302 same=0; 00303 break; 00304 } 00305 } 00306 endInd= (i<r) ? (i-1):(r-1); 00307 ucnt++; 00308 if ((long int)endInd-startInd+1>2*nStr) 00309 { 00310 for (j = 0; j < nStr; ++j) ucnts[j]=0; 00311 for (j = startInd;j <= endInd; ++j) ucnts[g[j]]++; 00312 cu=0; 00313 for (j=0;j<nStr;j++) 00314 { 00315 if (ucnts[j]>0) 00316 { 00317 updind[cu] = j; 00318 cu++; 00319 } 00320 } 00321 for (j=0;j<cu;j++) 00322 for (j1=0;j1<cu;j1++) 00323 outK[updind[j]+updind[j1]*nStr]+=ucnts[updind[j]]*ucnts[updind[j1]]; 00324 } 00325 else 00326 { 00327 for (j = startInd;j <= endInd; ++j) 00328 for (j1 = startInd;j1 <= endInd; ++j1) 00329 outK[ g[j]+nStr*g[j1] ]++; 00330 } 00331 } 00332 free(updind); 00333 free(ucnts); 00334 free(curfeat); 00335 } 00336 00337 int *CSparseSpatialSampleStringKernel::cntsrtna(int *sx, int k, int r, int na) 00338 { 00339 int *sxc, *bc, *sxl, *cc, *regroup; 00340 int i, j; 00341 00342 sxc = (int *)malloc(na*sizeof(int)); 00343 bc = (int *)malloc(na*sizeof(int)); 00344 sxl = (int *)malloc(r*sizeof(int)); 00345 cc = (int *)malloc(r*sizeof(int)); 00346 regroup = (int *)malloc(r*sizeof(int)); 00347 00348 for (i = 0; i < r; ++i) 00349 regroup[i]=i; 00350 for (j = k-1; j >= 0; --j) 00351 { 00352 for(i = 0; i < na; ++i) 00353 sxc[i]=0; 00354 for (i = 0; i < r; ++i) 00355 { 00356 cc[i]=sx[regroup[i]+j*r]; 00357 sxc[ cc[i] ]++; 00358 } 00359 bc[0]=0; 00360 for (i = 1;i < na; ++i) 00361 bc[i]=bc[i-1]+sxc[i-1]; 00362 for (i = 0; i < r; ++i) 00363 sxl[bc[ cc[i] ]++] = regroup[i]; 00364 for (i = 0; i < r; ++i) 00365 regroup[i] = sxl[i]; 00366 } 00367 free(sxl); free(bc); free(sxc); free(cc); 00368 00369 return regroup; 00370 } 00371 00372 float64_t CSparseSpatialSampleStringKernel::compute(int32_t idx_a, int32_t idx_b) 00373 { 00374 if (t==2) 00375 compute_double(idx_a, idx_b); 00376 if (t==3) 00377 compute_triple(idx_a, idx_b); 00378 00379 SG_ERROR("t out of range - shouldn't happen\n"); 00380 return -1; 00381 }