SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Berlin Institute of Technology 00009 */ 00010 00011 #include "lib/common.h" 00012 #include "lib/io.h" 00013 #include "kernel/SNPStringKernel.h" 00014 #include "kernel/SqrtDiagKernelNormalizer.h" 00015 #include "features/Features.h" 00016 #include "features/StringFeatures.h" 00017 00018 using namespace shogun; 00019 00020 CSNPStringKernel::CSNPStringKernel(void) 00021 : CStringKernel<char>(0), 00022 m_degree(0), m_win_len(0), m_inhomogene(false) 00023 { 00024 SG_UNSTABLE("CSNPStringKernel::CSNPStringKernel(void)", "\n"); 00025 00026 m_str_min=NULL; 00027 m_str_maj=NULL; 00028 set_normalizer(new CSqrtDiagKernelNormalizer()); 00029 } 00030 00031 CSNPStringKernel::CSNPStringKernel(int32_t size, 00032 int32_t degree, int32_t win_len, bool inhomogene) 00033 : CStringKernel<char>(size), 00034 m_degree(degree), m_win_len(2*win_len), m_inhomogene(inhomogene) 00035 { 00036 m_str_min=NULL; 00037 m_str_maj=NULL; 00038 set_normalizer(new CSqrtDiagKernelNormalizer()); 00039 } 00040 00041 CSNPStringKernel::CSNPStringKernel( 00042 CStringFeatures<char>* l, CStringFeatures<char>* r, 00043 int32_t degree, int32_t win_len, bool inhomogene) 00044 : CStringKernel<char>(10), m_degree(degree), m_win_len(2*win_len), 00045 m_inhomogene(inhomogene) 00046 { 00047 m_str_min=NULL; 00048 m_str_maj=NULL; 00049 set_normalizer(new CSqrtDiagKernelNormalizer()); 00050 if (l==r) 00051 obtain_base_strings(); 00052 init(l, r); 00053 } 00054 00055 CSNPStringKernel::~CSNPStringKernel() 00056 { 00057 cleanup(); 00058 } 00059 00060 bool CSNPStringKernel::init(CFeatures* l, CFeatures* r) 00061 { 00062 CStringKernel<char>::init(l, r); 00063 return init_normalizer(); 00064 } 00065 00066 void CSNPStringKernel::cleanup() 00067 { 00068 CKernel::cleanup(); 00069 free(m_str_min); 00070 free(m_str_maj); 00071 } 00072 00073 void CSNPStringKernel::obtain_base_strings() 00074 { 00075 //should only be called on training data 00076 ASSERT(lhs==rhs); 00077 00078 m_str_len=0; 00079 00080 for (int32_t i=0; i<num_lhs; i++) 00081 { 00082 int32_t len; 00083 bool free_vec; 00084 char* vec = ((CStringFeatures<char>*) lhs)->get_feature_vector(i, len, free_vec); 00085 00086 if (m_str_len==0) 00087 { 00088 m_str_len=len; 00089 size_t tlen=(len+1)*sizeof(char); 00090 m_str_min=(char*) malloc(tlen); 00091 m_str_maj=(char*) malloc(tlen); 00092 memset(m_str_min, 0, tlen); 00093 memset(m_str_maj, 0, tlen); 00094 } 00095 else 00096 { 00097 ASSERT(m_str_len==len); 00098 } 00099 00100 for (int32_t j=0; j<len; j++) 00101 { 00102 // skip sequencing errors 00103 if (vec[j]=='0') 00104 continue; 00105 00106 if (m_str_min[j]==0) 00107 m_str_min[j]=vec[j]; 00108 else if (m_str_maj[j]==0 && vec[j]!=m_str_min[j]) 00109 m_str_maj[j]=vec[j]; 00110 } 00111 00112 ((CStringFeatures<char>*) lhs)->free_feature_vector(vec, i, free_vec); 00113 } 00114 00115 for (int32_t j=0; j<m_str_len; j++) 00116 { 00117 // if only one one symbol occurs use 0 00118 if (m_str_min[j]==0) 00119 m_str_min[j]='0'; 00120 if (m_str_maj[j]==0) 00121 m_str_maj[j]='0'; 00122 00123 if (m_str_min[j]>m_str_maj[j]) 00124 CMath::swap(m_str_min[j], m_str_maj[j]); 00125 } 00126 } 00127 00128 float64_t CSNPStringKernel::compute(int32_t idx_a, int32_t idx_b) 00129 { 00130 int32_t alen, blen; 00131 bool free_avec, free_bvec; 00132 00133 char* avec = ((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_avec); 00134 char* bvec = ((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_bvec); 00135 00136 ASSERT(alen==blen); 00137 if (alen!=m_str_len) 00138 SG_ERROR("alen (%d) !=m_str_len (%d)\n", alen, m_str_len); 00139 ASSERT(m_str_min); 00140 ASSERT(m_str_maj); 00141 00142 float64_t total=0; 00143 int32_t inhomogene= (m_inhomogene) ? 1 : 0; 00144 00145 for (int32_t i = 0; i<alen-1; i+=2) 00146 { 00147 int32_t sumaa=0; 00148 int32_t sumbb=0; 00149 int32_t sumab=0; 00150 00151 for (int32_t l=0; l<m_win_len && i+l<alen-1; l+=2) 00152 { 00153 char a1=avec[i+l]; 00154 char a2=avec[i+l+1]; 00155 char b1=bvec[i+l]; 00156 char b2=bvec[i+l+1]; 00157 00158 if ((a1!=a2 || a1=='0' || a1=='0') && (b1!=b2 || b1=='0' || b2=='0')) 00159 sumab++; 00160 else if (a1==a2 && b1==b2) 00161 { 00162 if (a1!=b1) 00163 continue; 00164 00165 if (a1==m_str_min[i+l]) 00166 sumaa++; 00167 else if (a1==m_str_maj[i+l]) 00168 sumbb++; 00169 else 00170 { 00171 SG_ERROR("The impossible happened i=%d l=%d a1=%c " 00172 "a2=%c b1=%c b2=%c min=%c maj=%c\n", i, l, a1,a2, b1,b2, m_str_min[i+l], m_str_maj[i+l]); 00173 } 00174 } 00175 00176 } 00177 total+=CMath::pow(float64_t(sumaa+sumbb+sumab+inhomogene), 00178 (int32_t) m_degree); 00179 } 00180 00181 ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_avec); 00182 ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_bvec); 00183 return total; 00184 }