SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
StreamingSparseFeatures.cpp
Go to the documentation of this file.
2 namespace shogun
3 {
4 
5 template <class T>
7 {
9  init();
10 }
11 
12 template <class T>
14  bool is_labelled,
15  int32_t size)
17 {
19  init(file, is_labelled, size);
20 }
21 
22 template <class T>
24 {
25  parser.end_parser();
26 }
27 
28 template <class T>
30 {
31  ASSERT(index>=0 && index<current_num_features);
32 
33  T ret=0;
34 
35  if (current_vector)
36  {
37  for (int32_t i=0; i<current_length; i++)
38  if (current_vector[i].feat_index==index)
39  ret += current_vector[i].entry;
40  }
41 
42  return ret;
43 }
44 
45 template <class T>
47 {
48 }
49 
50 template <class T>
52 {
53  int32_t n=current_num_features;
54  ASSERT(n<=num);
55  current_num_features=num;
56  return n;
57 }
58 
59 template <class T>
61 {
62  int32_t dim = get_dim_feature_space();
63  if (dim > len)
64  {
65  vec = SG_REALLOC(float32_t, vec, dim);
66  memset(&vec[len], 0, (dim-len) * sizeof(float32_t));
67  len = dim;
68  }
69 }
70 
71 template <class T>
73 {
74  int32_t dim = get_dim_feature_space();
75  if (dim > len)
76  {
77  vec = SG_REALLOC(float64_t, vec, dim);
78  memset(&vec[len], 0, (dim-len) * sizeof(float64_t));
79  len = dim;
80  }
81 }
82 
83 template <class T>
85 {
86  T result=0;
87 
88  //result remains zero when one of the vectors is non existent
89  if (avec && bvec)
90  {
91  if (alen<=blen)
92  {
93  int32_t j=0;
94  for (int32_t i=0; i<alen; i++)
95  {
96  int32_t a_feat_idx=avec[i].feat_index;
97 
98  while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
99  j++;
100 
101  if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
102  {
103  result+= avec[i].entry * bvec[j].entry;
104  j++;
105  }
106  }
107  }
108  else
109  {
110  int32_t j=0;
111  for (int32_t i=0; i<blen; i++)
112  {
113  int32_t b_feat_idx=bvec[i].feat_index;
114 
115  while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
116  j++;
117 
118  if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
119  {
120  result+= bvec[i].entry * avec[j].entry;
121  j++;
122  }
123  }
124  }
125 
126  result*=alpha;
127  }
128 
129  return result;
130 }
131 
132 template <class T>
133 T CStreamingSparseFeatures<T>::dense_dot(T alpha, T* vec, int32_t dim, T b)
134 {
135  ASSERT(vec);
136  ASSERT(dim>=current_num_features);
137  T result=b;
138 
139  int32_t num_feat=current_length;
140  SGSparseVectorEntry<T>* sv=current_vector;
141 
142  if (sv)
143  {
144  for (int32_t i=0; i<num_feat; i++)
145  result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
146  }
147 
148  return result;
149 }
150 
151 template <class T>
153 {
154  ASSERT(vec2);
155  if (vec2_len < current_num_features)
156  {
157  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
158  vec2_len, current_num_features);
159  }
160 
161  float64_t result=0;
162  if (current_vector)
163  {
164  for (int32_t i=0; i<current_length; i++)
165  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
166  }
167 
168  return result;
169 }
170 
171 template <class T>
173 {
174  ASSERT(vec2);
175  if (vec2_len < current_num_features)
176  {
177  SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n",
178  vec2_len, current_num_features);
179  }
180 
181  float32_t result=0;
182  if (current_vector)
183  {
184  for (int32_t i=0; i<current_length; i++)
185  result+=vec2[current_vector[i].feat_index]*current_vector[i].entry;
186  }
187 
188  return result;
189 }
190 
191 template <class T>
192 void CStreamingSparseFeatures<T>::add_to_dense_vec(float64_t alpha, float64_t* vec2, int32_t vec2_len, bool abs_val)
193 {
194  ASSERT(vec2);
195  if (vec2_len < current_num_features)
196  {
197  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
198  vec2_len, current_num_features);
199  }
200 
201  SGSparseVectorEntry<T>* sv=current_vector;
202  int32_t num_feat=current_length;
203 
204  if (sv)
205  {
206  if (abs_val)
207  {
208  for (int32_t i=0; i<num_feat; i++)
209  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
210  }
211  else
212  {
213  for (int32_t i=0; i<num_feat; i++)
214  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
215  }
216  }
217 }
218 
219 template <class T>
220 void CStreamingSparseFeatures<T>::add_to_dense_vec(float32_t alpha, float32_t* vec2, int32_t vec2_len, bool abs_val)
221 {
222  ASSERT(vec2);
223  if (vec2_len < current_num_features)
224  {
225  SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n",
226  vec2_len, current_num_features);
227  }
228 
229  SGSparseVectorEntry<T>* sv=current_vector;
230  int32_t num_feat=current_length;
231 
232  if (sv)
233  {
234  if (abs_val)
235  {
236  for (int32_t i=0; i<num_feat; i++)
237  vec2[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
238  }
239  else
240  {
241  for (int32_t i=0; i<num_feat; i++)
242  vec2[sv[i].feat_index]+= alpha*sv[i].entry;
243  }
244  }
245 }
246 
247 template <class T>
249 {
250  return current_length;
251 }
252 
253 template <class T>
255 {
256  ASSERT(current_vector);
257 
258  float32_t sq=0;
259 
260  for (int32_t i=0; i<current_length; i++)
261  sq += current_vector[i].entry * current_vector[i].entry;
262 
263  return sq;
264 }
265 
266 template <class T>
268 {
269  ASSERT(current_vector);
270 
271  SGSparseVectorEntry<T>* sf_orig=current_vector;
272  int32_t len=current_length;
273 
274  int32_t* feat_idx=SG_MALLOC(int32_t, len);
275  int32_t* orig_idx=SG_MALLOC(int32_t, len);
276 
277  for (int32_t i=0; i<len; i++)
278  {
279  feat_idx[i]=sf_orig[i].feat_index;
280  orig_idx[i]=i;
281  }
282 
283  CMath::qsort_index(feat_idx, orig_idx, len);
284 
286 
287  for (int32_t i=0; i<len; i++)
288  sf_new[i]=sf_orig[orig_idx[i]];
289 
290  // sanity check
291  for (int32_t i=0; i<len-1; i++)
292  ASSERT(sf_new[i].feat_index<sf_new[i+1].feat_index);
293 
294  // Copy new vector back to original
295  for (int32_t i=0; i<len; i++)
296  sf_orig[i]=sf_new[i];
297 
298  SG_FREE(orig_idx);
299  SG_FREE(feat_idx);
300  SG_FREE(sf_new);
301 }
302 
303 template <class T>
305 {
306  return new CStreamingSparseFeatures<T>(*this);
307 }
308 
309 template <class T>
311 {
312  if (current_vector)
313  return 1;
314  return 0;
315 }
316 
317 template <class T>
319 {
320  return sizeof(T);
321 }
322 
324 {
325  parser.set_read_vector(&CStreamingFile::get_sparse_vector);
326 }
327 
329 {
330  parser.set_read_vector_and_label
332 }
333 
// GET_FEATURE_TYPE(f_type, sg_type) expands to an explicit specialization of
// CStreamingSparseFeatures<sg_type>::get_feature_type() that returns f_type.
// The macro is only needed for the list below and is undefined right after.
334 #define GET_FEATURE_TYPE(f_type, sg_type) \
335 template<> EFeatureType CStreamingSparseFeatures<sg_type>::get_feature_type() \
336 { \
337  return f_type; \
338 }
339 
// One specialization per element type. uint8_t and int8_t both report F_BYTE.
// NOTE(review): the class is also instantiated for bool and char further down,
// but no GET_FEATURE_TYPE line for them is visible here, and the embedded
// listing numbers jump (339->342, 349->353) — some invocations appear to be
// missing from this listing; confirm against the original file.
342 GET_FEATURE_TYPE(F_BYTE, uint8_t)
343 GET_FEATURE_TYPE(F_BYTE, int8_t)
344 GET_FEATURE_TYPE(F_SHORT, int16_t)
345 GET_FEATURE_TYPE(F_WORD, uint16_t)
346 GET_FEATURE_TYPE(F_INT, int32_t)
347 GET_FEATURE_TYPE(F_UINT, uint32_t)
348 GET_FEATURE_TYPE(F_LONG, int64_t)
349 GET_FEATURE_TYPE(F_ULONG, uint64_t)
353 #undef GET_FEATURE_TYPE
354 
355 
356 template <class T>
357 void CStreamingSparseFeatures<T>::init()
358 {
359  working_file=NULL;
360  current_vector=NULL;
361  current_length=-1;
362  current_vec_index=0;
363  current_num_features=-1;
364 }
365 
366 template <class T>
367 void CStreamingSparseFeatures<T>::init(CStreamingFile* file,
368  bool is_labelled,
369  int32_t size)
370 {
371  init();
372  has_labels = is_labelled;
373  working_file = file;
374  parser.init(file, is_labelled, size);
375 }
376 
377 template <class T>
379 {
380  if (!parser.is_running())
381  parser.start_parser();
382 }
383 
384 template <class T>
386 {
387  parser.end_parser();
388 }
389 
// NOTE(review): the signature line of this definition is absent from this
// listing; only the template header and the body are visible.
//
// Asks the parser for the next example, which fills current_vector,
// current_length and current_label. Returns false when the parser call
// evaluates to false; otherwise grows current_num_features from the feature
// indices seen, increments current_vec_index and returns true.
390 template <class T>
392 {
393  bool ret_value;
394  ret_value = (bool) parser.get_next_example(current_vector,
395  current_length,
396  current_label);
397 
398  if (!ret_value)
399  return false;
400 
401  // Update number of features based on highest index
402  for (int32_t i=0; i<current_length; i++)
403  {
// NOTE(review): possible off-by-one. The test uses '>' but the assignment
// stores feat_index+1, so an entry whose feat_index equals
// current_num_features leaves the count unchanged even though that index
// needs feat_index+1 features. '>=' looks intended — confirm.
404  if (current_vector[i].feat_index > current_num_features)
405  current_num_features = current_vector[i].feat_index+1;
406  }
407  current_vec_index++;
408 
409  return true;
410 }
411 
412 template <class T>
414 {
415  current_sgvector.features=current_vector;
416  current_sgvector.num_feat_entries=current_length;
417  current_sgvector.vec_index=current_vec_index;
418 
419  return current_sgvector;
420 }
421 
422 template <class T>
424 {
425  ASSERT(has_labels);
426 
427  return current_label;
428 }
429 
430 template <class T>
432 {
433  parser.finalize_example();
434 }
435 
436 template <class T>
438 {
439  return current_num_features;
440 }
441 
442 template <class T>
444 {
446  return -1;
447 }
448 
449 template <class T>
451 {
452  return current_num_features;
453 }
454 
455 template <class T>
457 {
458  return current_length;
459 }
460 
461 template <class T>
463 {
464  return C_STREAMING_SPARSE;
465 }
466 
// Explicit instantiations of CStreamingSparseFeatures for the element types
// listed here.
// NOTE(review): the embedded listing numbers skip 472, 474 and 476-479, so
// further instantiations may be missing from this listing — confirm against
// the original file.
467 template class CStreamingSparseFeatures<bool>;
468 template class CStreamingSparseFeatures<char>;
469 template class CStreamingSparseFeatures<int8_t>;
470 template class CStreamingSparseFeatures<uint8_t>;
471 template class CStreamingSparseFeatures<int16_t>;
473 template class CStreamingSparseFeatures<int32_t>;
475 template class CStreamingSparseFeatures<int64_t>;
480 }

SHOGUN Machine Learning Toolbox - Documentation