SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
OligoStringKernel.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2008 Christian Igel, Tobias Glasmachers
8  * Copyright (C) 2008 Christian Igel, Tobias Glasmachers
9  *
10  * Shogun adjustments (W) 2008-2009 Soeren Sonnenburg
11  * Copyright (C) 2008-2009 Fraunhofer Institute FIRST and Max-Planck-Society
12  *
13  */
17 
18 #include <map>
19 #include <vector>
20 #include <algorithm>
21 
22 using namespace shogun;
23 
25  : CStringKernel<char>()
26 {
27  init();
28 }
29 
30 COligoStringKernel::COligoStringKernel(int32_t cache_sz, int32_t kmer_len, float64_t w)
31 : CStringKernel<char>(cache_sz)
32 {
33  init();
34 
35  k=kmer_len;
36  width=w;
37 }
38 
40 {
41  cleanup();
42 }
43 
45 {
47  gauss_table=NULL;
49 
51 }
52 
53 bool COligoStringKernel::init(CFeatures* l, CFeatures* r)
54 {
55  cleanup();
56 
58  int32_t max_len=CMath::max(
59  ((CStringFeatures<char>*) l)->get_max_vector_length(),
60  ((CStringFeatures<char>*) r)->get_max_vector_length()
61  );
62 
63  getExpFunctionCache(max_len);
64  return init_normalizer();
65 }
66 
68  const std::string& sequence, uint32_t k_mer_length,
69  const std::string& allowed_characters,
70  std::vector< std::pair<int32_t, float64_t> >& values)
71 {
72  float64_t oligo_value = 0.;
73  float64_t factor = 1.;
74  std::map<std::string::value_type, uint32_t> residue_values;
75  uint32_t counter = 0;
76  uint32_t number_of_residues = allowed_characters.size();
77  uint32_t sequence_length = sequence.size();
78  bool sequence_ok = true;
79 
80  // checking if sequence contains illegal characters
81  for (uint32_t i = 0; i < sequence.size(); ++i)
82  {
83  if (allowed_characters.find(sequence.at(i)) == std::string::npos)
84  sequence_ok = false;
85  }
86 
87  if (sequence_ok && k_mer_length <= sequence_length)
88  {
89  values.resize(sequence_length - k_mer_length + 1,
90  std::pair<int32_t, float64_t>());
91  for (uint32_t i = 0; i < number_of_residues; ++i)
92  {
93  residue_values.insert(std::make_pair(allowed_characters[i], counter));
94  ++counter;
95  }
96  for (int32_t k = k_mer_length - 1; k >= 0; k--)
97  {
98  oligo_value += factor * residue_values[sequence[k]];
99  factor *= number_of_residues;
100  }
101  factor /= number_of_residues;
102  counter = 0;
103  values[counter].first = 1;
104  values[counter].second = oligo_value;
105  ++counter;
106 
107  for (uint32_t j = 1; j < sequence_length - k_mer_length + 1; j++)
108  {
109  oligo_value -= factor * residue_values[sequence[j - 1]];
110  oligo_value = oligo_value * number_of_residues +
111  residue_values[sequence[j + k_mer_length - 1]];
112 
113  values[counter].first = j + 1;
114  values[counter].second = oligo_value ;
115  ++counter;
116  }
117  stable_sort(values.begin(), values.end(), cmpOligos_);
118  }
119  else
120  {
121  values.clear();
122  }
123 }
124 
126  const std::vector<std::string>& sequences, uint32_t k_mer_length,
127  const std::string& allowed_characters,
128  std::vector< std::vector< std::pair<int32_t, float64_t> > >& encoded_sequences)
129 {
130  std::vector< std::pair<int32_t, float64_t> > temp_vector;
131  encoded_sequences.resize(sequences.size(),
132  std::vector< std::pair<int32_t, float64_t> >());
133 
134  for (uint32_t i = 0; i < sequences.size(); ++i)
135  {
136  encodeOligo(sequences[i], k_mer_length, allowed_characters, temp_vector);
137  encoded_sequences[i] = temp_vector;
138  }
139 }
140 
141 void COligoStringKernel::getExpFunctionCache(uint32_t sequence_length)
142 {
144  gauss_table=SG_MALLOC(float64_t, sequence_length);
145 
146  gauss_table[0] = 1;
147  for (uint32_t i = 1; i < sequence_length; i++)
148  gauss_table[i] = exp((-1.0 / (CMath::sq(width))) * CMath::sq((float64_t) i));
149 
150  gauss_table_len=sequence_length;
151 }
152 
154  const std::vector< std::pair<int32_t, float64_t> >& x,
155  const std::vector< std::pair<int32_t, float64_t> >& y,
156  int32_t max_distance)
157 {
158  float64_t result = 0;
159  int32_t i1 = 0;
160  int32_t i2 = 0;
161  int32_t c1 = 0;
162  uint32_t x_size = x.size();
163  uint32_t y_size = y.size();
164 
165  while ((uint32_t) i1 + 1 < x_size && (uint32_t) i2 + 1 < y_size)
166  {
167  if (x[i1].second == y[i2].second)
168  {
169  if (max_distance < 0
170  || (abs(x[i1].first - y[i2].first)) <= max_distance)
171  {
172  result += gauss_table[abs((x[i1].first - y[i2].first))];
173  if (x[i1].second == x[i1 + 1].second)
174  {
175  i1++;
176  c1++;
177  }
178  else if (y[i2].second == y[i2 + 1].second)
179  {
180  i2++;
181  i1 -= c1;
182  c1 = 0;
183  }
184  else
185  {
186  i1++;
187  i2++;
188  }
189  }
190  else
191  {
192  if (x[i1].first < y[i2].first)
193  {
194  if (x[i1].second == x[i1 + 1].second)
195  {
196  i1++;
197  }
198  else if (y[i2].second == y[i2 + 1].second)
199  {
200  while (y[i2].second == y[i2+1].second)
201  i2++;
202  ++i1;
203  c1 = 0;
204  }
205  else
206  {
207  i1++;
208  i2++;
209  c1 = 0;
210  }
211  }
212  else
213  {
214  i2++;
215  i1 -= c1;
216  c1 = 0;
217  }
218  }
219  }
220  else
221  {
222  if (x[i1].second < y[i2].second)
223  i1++;
224  else
225  i2++;
226  c1 = 0;
227  }
228  }
229  return result;
230 }
231 
232 
233 float64_t COligoStringKernel::compute(int32_t idx_a, int32_t idx_b)
234 {
235  int32_t alen, blen;
236  bool free_a, free_b;
237  char* avec=((CStringFeatures<char>*) lhs)->get_feature_vector(idx_a, alen, free_a);
238  char* bvec=((CStringFeatures<char>*) rhs)->get_feature_vector(idx_b, blen, free_b);
239  std::vector< std::pair<int32_t, float64_t> > aenc;
240  std::vector< std::pair<int32_t, float64_t> > benc;
241  encodeOligo(std::string(avec, alen), k, "ACGT", aenc);
242  encodeOligo(std::string(bvec, alen), k, "ACGT", benc);
243  float64_t result=kernelOligoFast(aenc, benc);
244  ((CStringFeatures<char>*) lhs)->free_feature_vector(avec, idx_a, free_a);
245  ((CStringFeatures<char>*) rhs)->free_feature_vector(bvec, idx_b, free_b);
246  return result;
247 }
248 
249 void COligoStringKernel::init()
250 {
251  k=0;
252  width=0.0;
253  gauss_table=NULL;
254  gauss_table_len=0;
255 
257 
258  m_parameters->add(&k, "k", "K-mer length.");
259  m_parameters->add(&width, "width", "Width of Gaussian.");
260  m_parameters->add_vector(&gauss_table, &gauss_table_len, "gauss_table", "Gauss Cache Table.");
261 }

SHOGUN Machine Learning Toolbox - Documentation