SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
HashedWDFeaturesTransposed.cpp
Go to the documentation of this file.
1 /*
2  * This program is free software; you can redistribute it and/or modify
3  * it under the terms of the GNU General Public License as published by
4  * the Free Software Foundation; either version 3 of the License, or
5  * (at your option) any later version.
6  *
7  * Written (W) 2010 Soeren Sonnenburg
8  * Copyright (C) 2010 Berlin Institute of Technology
9  */
10 
12 #include <shogun/io/SGIO.h>
13 #include <shogun/lib/Signal.h>
14 #include <shogun/base/Parallel.h>
15 
16 #ifdef HAVE_PTHREAD
17 #include <pthread.h>
18 #endif
19 
20 using namespace shogun;
21 
22 #ifndef DOXYGEN_SHOULD_SKIP_THIS
// Argument bundle handed to each dense_dot_range / dense_dot_range_subset
// worker thread (one instance per thread).
// NOTE(review): callers below also assign a `hf` member
// (CHashedWDFeaturesTransposed*); its declaration appears to have been
// dropped from this listing (listing line 25 is missing) — confirm upstream.
23 struct HASHEDWD_THREAD_PARAM
24 {
26  int32_t* sub_index; // optional subset: thread processes rows sub_index[start..stop-1]; NULL = dense range
27  float64_t* output; // result buffer, written at indices [start, stop)
28  int32_t start; // first output index handled by this thread
29  int32_t stop; // one past the last output index
30  float64_t* alphas; // optional per-vector scaling factors (may be NULL)
31  float64_t* vec; // dense weight vector of dimension w_dim
32  float64_t bias; // bias added to every output entry
33  bool progress; // whether this thread reports progress
34  uint32_t* index; // per-vector incremental hash state shared across string positions
35 };
36 #endif // DOXYGEN_SHOULD_SKIP_THIS
37 
// Default constructor: zero-initialises every member (needed e.g. for
// serialization). NOTE(review): listing lines 38 and 41 are missing here —
// they held the constructor signature and the opening of a warning macro
// whose string arguments survive below; confirm against the upstream file.
39  :CDotFeatures()
40 {
42  "CHashedWDFeaturesTransposed::CHashedWDFeaturesTransposed()",
43  "\n");
44 
45  strings = NULL;
46  transposed_strings = NULL;
47 
48  degree = 0;
49  start_degree = 0;
50  from_degree = 0;
51  string_length = 0;
52  num_strings = 0;
53  alphabet_size = 0;
54  w_dim = 0;
55  partial_w_dim = 0;
56  wd_weights = NULL;
57  mask = 0;
58  m_hash_bits = 0;
59 
60  normalization_const = 0.0;
61 }
62 
// Main constructor: takes ownership reference on the string features,
// builds their transposed representation, and records the WD parameters
// (start_order <= order <= from_order, hash_bits > 0).
// NOTE(review): listing line 63 (signature start with the
// CStringFeatures<uint8_t>* parameter) is missing here.
64  int32_t start_order, int32_t order, int32_t from_order,
65  int32_t hash_bits) : CDotFeatures()
66 {
67  ASSERT(start_order>=0);
68  ASSERT(start_order<order);
69  ASSERT(order<=from_order);
70  ASSERT(hash_bits>0);
71  ASSERT(str);
72  ASSERT(str->have_same_length()); // WD features require equal-length strings
73  SG_REF(str);
74 
75  strings=str;
76  int32_t transposed_num_feat=0;
77  int32_t transposed_num_vec=0;
78  transposed_strings=str->get_transposed(transposed_num_feat, transposed_num_vec);
79 
// NOTE(review): listing lines 80-81 are missing — presumably they assign
// string_length and num_strings (used by the asserts below) from `str`; confirm.
82  ASSERT(transposed_num_feat==num_strings);
83  ASSERT(transposed_num_vec==string_length);
84 
85  CAlphabet* alpha=str->get_alphabet();
// NOTE(review): listing line 86 is missing — presumably sets alphabet_size
// from `alpha` before the reference is released.
87  SG_UNREF(alpha);
88 
89  degree=order;
90  start_degree=start_order;
91  if (start_degree!=0)
// NOTE(review): listing line 92 (the body of the if, likely a
// not-implemented guard for start_degree != 0) is missing.
93  from_degree=from_order;
94  m_hash_bits=hash_bits;
// NOTE(review): listing lines 95-96 are missing — presumably calls that set
// up wd_weights and the normalization constant; confirm upstream.
97 }
98 
// Copy constructor: shares the underlying string features (ref-counted via
// SG_REF) and copies all WD parameters, then rebuilds the weight table.
// NOTE(review): listing line 99 (the signature) and lines 106-107/109 are
// missing; 109 presumably sets alphabet_size from `alpha` — confirm upstream.
100  : CDotFeatures(orig), strings(orig.strings), transposed_strings(orig.transposed_strings),
101  degree(orig.degree), start_degree(orig.start_degree),
102  from_degree(orig.from_degree), m_hash_bits(orig.m_hash_bits),
103  normalization_const(orig.normalization_const)
104 {
105  SG_REF(strings);
108  CAlphabet* alpha=strings->get_alphabet();
110  SG_UNREF(alpha);
111 
112  set_wd_weights(); // recompute wd_weights/w_dim for this instance
113 }
114 
// Destructor: releases the per-column transposed string buffers and drops the
// reference on the original string features.
// NOTE(review): listing lines 115 (signature), 119 and 122 are missing; 119/122
// presumably free the transposed_strings array itself and wd_weights — confirm.
116 {
117  for (int32_t i=0; i<string_length; i++)
118  SG_FREE(transposed_strings[i].string);
120 
121  SG_UNREF(strings);
123 }
124 
// WD dot product between string vec_idx1 of this object and string vec_idx2
// of df: every run of matching symbols contributes wd_weights[j]^2 for each
// spanned k-mer order j in [start_degree, degree), normalised by
// normalization_const^2.
// NOTE(review): listing lines 128-130 are missing — upstream they check that
// df has compatible feature type/class and declare the `wdf` pointer used
// below; confirm against the original file.
125 float64_t CHashedWDFeaturesTransposed::dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2)
126 {
127  ASSERT(df);
131 
132  int32_t len1, len2;
133  bool free_vec1, free_vec2;
134 
135  uint8_t* vec1=strings->get_feature_vector(vec_idx1, len1, free_vec1);
136  uint8_t* vec2=wdf->strings->get_feature_vector(vec_idx2, len2, free_vec2);
137 
138  ASSERT(len1==len2); // WD features are only defined for equal-length strings
139 
140  float64_t sum=0.0;
141 
142  for (int32_t i=0; i<len1; i++)
143  {
144  for (int32_t j=0; (i+j<len1) && (j<degree); j++)
145  {
// stop extending the match at the first mismatching symbol
146  if (vec1[i+j]!=vec2[i+j])
147  break;
148  if (j>=start_degree)
149  sum += wd_weights[j]*wd_weights[j];
150  }
151  }
152  strings->free_feature_vector(vec1, vec_idx1, free_vec1);
153  wdf->strings->free_feature_vector(vec2, vec_idx2, free_vec2);
154  return sum/CMath::sq(normalization_const);
155 }
156 
157 float64_t CHashedWDFeaturesTransposed::dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
158 {
159  if (vec2_len != w_dim)
160  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
161 
162  float64_t sum=0;
163  int32_t len;
164  bool free_vec1;
165  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
166  uint32_t* val=SG_MALLOC(uint32_t, len);
167 
168  uint32_t offs=0;
169 
170  CMath::fill_vector(val, len, 0xDEADBEAF);
171 
172  for (int32_t i=0; i < len; i++)
173  {
174  uint32_t o=offs;
175  for (int32_t k=0; k<degree && i+k<len; k++)
176  {
177  const float64_t wd = wd_weights[k];
178  const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
179  val[i]=h;
180 #ifdef DEBUG_HASHEDWD
181  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
182 #endif
183  sum+=vec2[o+(h & mask)]*wd;
184  o+=partial_w_dim;
185  }
186  offs+=partial_w_dim*degree;
187  }
188  SG_FREE(val);
189  strings->free_feature_vector(vec, vec_idx1, free_vec1);
190 
191  return sum/normalization_const;
192 }
193 
// Computes output[j-start] = alphas[j]*dense_dot(j, vec)/normalization + b for
// j in [start, stop), optionally split across pthreads. The actual work is
// done by dense_dot_range_helper.
// NOTE(review): listing lines 211, 255, 268 and 280 are missing — upstream
// 255 passes dense_dot_range_helper as the pthread entry point, 268 runs the
// last chunk in the calling thread, and 280 guards the SG_INFO below with a
// cancellation check; confirm against the original file.
194 void CHashedWDFeaturesTransposed::dense_dot_range(float64_t* output, int32_t start, int32_t stop, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
195 {
196  ASSERT(output);
197  // write access is internally between output[start..stop] so the following
198  // line is necessary to write to output[0...(stop-start-1)]
199  output-=start;
200  ASSERT(start>=0);
201  ASSERT(start<stop);
202  ASSERT(stop<=get_num_vectors());
// sized `stop` (not stop-start) because the helper indexes it with j in [start, stop)
203  uint32_t* index=SG_MALLOC(uint32_t, stop);
204 
205  int32_t num_vectors=stop-start;
206  ASSERT(num_vectors>0);
207 
208  int32_t num_threads=parallel->get_num_threads();
209  ASSERT(num_threads>0);
210 
212 
213  if (dim != w_dim)
214  SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
215 
216 #ifdef HAVE_PTHREAD
217  if (num_threads < 2)
218  {
219 #endif
// single-threaded path: one parameter block covering the whole range
220  HASHEDWD_THREAD_PARAM params;
221  params.hf=this;
222  params.sub_index=NULL;
223  params.output=output;
224  params.start=start;
225  params.stop=stop;
226  params.alphas=alphas;
227  params.vec=vec;
228  params.bias=b;
229  params.progress=false; //true;
230  params.index=index;
231  dense_dot_range_helper((void*) &params);
232 #ifdef HAVE_PTHREAD
233  }
234  else
235  {
// multi-threaded path: num_threads-1 workers plus the calling thread
236  pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
237  HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
238  int32_t step= num_vectors/num_threads;
239 
240  int32_t t;
241 
242  for (t=0; t<num_threads-1; t++)
243  {
244  params[t].hf = this;
245  params[t].sub_index=NULL;
246  params[t].output = output;
247  params[t].start = start+t*step;
248  params[t].stop = start+(t+1)*step;
249  params[t].alphas=alphas;
250  params[t].vec=vec;
251  params[t].bias=b;
252  params[t].progress = false;
253  params[t].index=index;
254  pthread_create(&threads[t], NULL,
256  }
257 
// last chunk runs in the calling thread and absorbs the division remainder
258  params[t].hf = this;
259  params[t].sub_index=NULL;
260  params[t].output = output;
261  params[t].start = start+t*step;
262  params[t].stop = stop;
263  params[t].alphas=alphas;
264  params[t].vec=vec;
265  params[t].bias=b;
266  params[t].progress = false; //true;
267  params[t].index=index;
269 
270  for (t=0; t<num_threads-1; t++)
271  pthread_join(threads[t], NULL);
272 
273  SG_FREE(params);
274  SG_FREE(threads);
275  }
276 #endif
277  SG_FREE(index);
278 
279 #ifndef WIN32
281  SG_INFO( "prematurely stopped. \n");
282 #endif
283 }
284 
// Like dense_dot_range, but only for the `num` vectors whose indices are given
// in sub_index; output[j] corresponds to sub_index[j].
// NOTE(review): listing lines 295, 339, 352 and 364 are missing (see the
// analogous spots in dense_dot_range). Also note: SG_FREE(index) below sits
// inside the multi-threaded branch only — in the single-threaded path (and in
// non-pthread builds) `index` is never freed, unlike dense_dot_range which
// frees it outside the #ifdef. This looks like a leak; confirm upstream.
285 void CHashedWDFeaturesTransposed::dense_dot_range_subset(int32_t* sub_index, int num, float64_t* output, float64_t* alphas, float64_t* vec, int32_t dim, float64_t b)
286 {
287  ASSERT(sub_index);
288  ASSERT(output);
289 
// per-vector incremental hash state, indexed [0, num)
290  uint32_t* index=SG_MALLOC(uint32_t, num);
291 
292  int32_t num_threads=parallel->get_num_threads();
293  ASSERT(num_threads>0);
294 
296 
297  if (dim != w_dim)
298  SG_ERROR("Dimensions don't match, vec_len=%d, w_dim=%d\n", dim, w_dim);
299 
300 #ifdef HAVE_PTHREAD
301  if (num_threads < 2)
302  {
303 #endif
// single-threaded path: process the whole subset in one call
304  HASHEDWD_THREAD_PARAM params;
305  params.hf=this;
306  params.sub_index=sub_index;
307  params.output=output;
308  params.start=0;
309  params.stop=num;
310  params.alphas=alphas;
311  params.vec=vec;
312  params.bias=b;
313  params.progress=false; //true;
314  params.index=index;
315  dense_dot_range_helper((void*) &params);
316 #ifdef HAVE_PTHREAD
317  }
318  else
319  {
320  pthread_t* threads = SG_MALLOC(pthread_t, num_threads-1);
321  HASHEDWD_THREAD_PARAM* params = SG_MALLOC(HASHEDWD_THREAD_PARAM, num_threads);
322  int32_t step= num/num_threads;
323 
324  int32_t t;
325 
326  for (t=0; t<num_threads-1; t++)
327  {
328  params[t].hf = this;
329  params[t].sub_index=sub_index;
330  params[t].output = output;
331  params[t].start = t*step;
332  params[t].stop = (t+1)*step;
333  params[t].alphas=alphas;
334  params[t].vec=vec;
335  params[t].bias=b;
336  params[t].progress = false;
337  params[t].index=index;
338  pthread_create(&threads[t], NULL,
340  }
341 
// last chunk handled by the calling thread, covering the remainder
342  params[t].hf = this;
343  params[t].sub_index=sub_index;
344  params[t].output = output;
345  params[t].start = t*step;
346  params[t].stop = num;
347  params[t].alphas=alphas;
348  params[t].vec=vec;
349  params[t].bias=b;
350  params[t].progress = false; //true;
351  params[t].index=index;
353 
354  for (t=0; t<num_threads-1; t++)
355  pthread_join(threads[t], NULL);
356 
357  SG_FREE(params);
358  SG_FREE(threads);
359  SG_FREE(index);
360  }
361 #endif
362 
363 #ifndef WIN32
365  SG_INFO( "prematurely stopped. \n");
366 #endif
367 }
368 
// Thread worker: computes hashed-WD dot products for all vectors in
// [start, stop) against `vec`, column by column over the transposed strings
// (so each string position is touched once for all vectors). Two symmetric
// code paths: one indexing through sub_index, one over a dense range.
// NOTE(review): listing line 369 (the signature, upstream
// `void* CHashedWDFeaturesTransposed::dense_dot_range_helper(void* p)`) and
// lines 384-385/388 are missing — the latter presumably pull wd_weights,
// transposed_strings and normalization_const out of `hf` into the locals used
// unqualified below; confirm against the original file.
370 {
371  HASHEDWD_THREAD_PARAM* par=(HASHEDWD_THREAD_PARAM*) p;
372  CHashedWDFeaturesTransposed* hf=par->hf;
373  int32_t* sub_index=par->sub_index;
374  float64_t* output=par->output;
375  int32_t start=par->start;
376  int32_t stop=par->stop;
377  float64_t* alphas=par->alphas;
378  float64_t* vec=par->vec;
379  float64_t bias=par->bias;
380  bool progress=par->progress;
381  uint32_t* index=par->index;
382  int32_t string_length=hf->string_length;
383  int32_t degree=hf->degree;
386  uint32_t mask=hf->mask;
387  int32_t partial_w_dim=hf->partial_w_dim;
389 
390  if (sub_index)
391  {
392  for (int32_t j=start; j<stop; j++)
393  output[j]=0.0;
394 
395  uint32_t offs=0;
396  for (int32_t i=0; i<string_length; i++)
397  {
398  uint32_t o=offs;
399  for (int32_t k=0; k<degree && i+k<string_length; k++)
400  {
401  const float64_t wd = wd_weights[k];
// column i+k of the transposed representation: symbol i+k of every string
402  uint8_t* dim=transposed_strings[i+k].string;
403  uint32_t h;
404 
405  for (int32_t j=start; j<stop; j++)
406  {
407  uint8_t bval=dim[sub_index[j]];
// k==0 starts a fresh k-mer hash; k>0 extends the state kept in index[j]
408  if (k==0)
409  h=CHash::IncrementalMurmurHash2(bval, 0xDEADBEAF);
410  else
411  h=CHash::IncrementalMurmurHash2(bval, index[j]);
412  index[j]=h;
413  output[j]+=vec[o + (h & mask)]*wd;
414  }
415  o+=partial_w_dim;
416  }
417  offs+=partial_w_dim*degree;
418 
419  if (progress)
420  hf->io->progress(i, 0,string_length, i);
421  }
422 
// final scaling: optional alpha, normalisation, bias
423  for (int32_t j=start; j<stop; j++)
424  {
425  if (alphas)
426  output[j]=output[j]*alphas[sub_index[j]]/normalization_const+bias;
427  else
428  output[j]=output[j]/normalization_const+bias;
429  }
430  }
431  else
432  {
// dense-range variant: identical structure, j indexes strings directly
433  CMath::fill_vector(&output[start], stop-start, 0.0);
434 
435  uint32_t offs=0;
436  for (int32_t i=0; i<string_length; i++)
437  {
438  uint32_t o=offs;
439  for (int32_t k=0; k<degree && i+k<string_length; k++)
440  {
441  const float64_t wd = wd_weights[k];
442  uint8_t* dim=transposed_strings[i+k].string;
443  uint32_t h;
444 
445  for (int32_t j=start; j<stop; j++)
446  {
447  if (k==0)
448  h=CHash::IncrementalMurmurHash2(dim[j], 0xDEADBEAF);
449  else
450  h=CHash::IncrementalMurmurHash2(dim[j], index[j]);
451  index[j]=h;
452  output[j]+=vec[o + (h & mask)]*wd;
453  }
454  o+=partial_w_dim;
455  }
456  offs+=partial_w_dim*degree;
457 
458  if (progress)
459  hf->io->progress(i, 0,string_length, i);
460  }
461 
462  for (int32_t j=start; j<stop; j++)
463  {
464  if (alphas)
465  output[j]=output[j]*alphas[j]/normalization_const+bias;
466  else
467  output[j]=output[j]/normalization_const+bias;
468  }
469  }
470 
471  return NULL;
472 }
473 
474 void CHashedWDFeaturesTransposed::add_to_dense_vec(float64_t alpha, int32_t vec_idx1, float64_t* vec2, int32_t vec2_len, bool abs_val)
475 {
476  if (vec2_len != w_dim)
477  SG_ERROR("Dimensions don't match, vec2_dim=%d, w_dim=%d\n", vec2_len, w_dim);
478 
479  int32_t len;
480  bool free_vec1;
481  uint8_t* vec = strings->get_feature_vector(vec_idx1, len, free_vec1);
482  uint32_t* val=SG_MALLOC(uint32_t, len);
483 
484  uint32_t offs=0;
485  float64_t factor=alpha/normalization_const;
486  if (abs_val)
487  factor=CMath::abs(factor);
488 
489  CMath::fill_vector(val, len, 0xDEADBEAF);
490 
491  for (int32_t i=0; i<len; i++)
492  {
493  uint32_t o=offs;
494  for (int32_t k=0; k<degree && i+k<len; k++)
495  {
496  float64_t wd = wd_weights[k]*factor;
497 
498  const uint32_t h=CHash::IncrementalMurmurHash2(vec[i+k], val[i]);
499  val[i]=h;
500 
501 #ifdef DEBUG_HASHEDWD
502  SG_PRINT("offs=%d o=%d h=%d \n", offs, o, h);
503  SG_PRINT("vec[i]=%d, k=%d, offs=%d o=%d h=%d \n", vec[i], k,offs, o, h);
504 #endif
505  vec2[o+(h & mask)]+=wd;
506  o+=partial_w_dim;
507  }
508  offs+=partial_w_dim*degree;
509  }
510 
511  SG_FREE(val);
512  strings->free_feature_vector(vec, vec_idx1, free_vec1);
513 }
514 
// Builds the per-order WD weight table and the hash mask.
// NOTE(review): listing lines 515 (signature), 520-521, 523 and 531 are
// missing — upstream they compute partial_w_dim/w_dim from m_hash_bits and
// degree, allocate wd_weights, and supply the remaining SG_DEBUG arguments;
// confirm against the original file.
516 {
517  ASSERT(degree>0);
518 
// low m_hash_bits bits set: maps a 32-bit hash into [0, 2^m_hash_bits)
519  mask=(uint32_t) (((uint64_t) 1)<<m_hash_bits)-1;
522 
524 
// standard WD kernel weighting, decreasing with k-mer order i
525  for (int32_t i=0; i<degree; i++)
526  wd_weights[i]=sqrt(2.0*(from_degree-i)/(from_degree*(from_degree+1)));
527 
528  SG_DEBUG("created HashedWDFeaturesTransposed with d=%d (%d), alphabetsize=%d, "
529  "dim=%d partial_dim=%d num=%d, len=%d\n",
530  degree, from_degree, alphabet_size,
532 }
533 
534 
// Sets the normalisation constant: n==0 selects automatic normalisation
// (derived from the wd_weights), otherwise n is used directly.
// NOTE(review): listing lines 535 (signature, upstream
// `void CHashedWDFeaturesTransposed::set_normalization_const(float64_t n)`),
// 539, 541, 543 and 546 are missing — upstream these accumulate
// wd_weights[i]^2 over the loop, take the square root, and assign `n` in the
// else branch; confirm against the original file.
536 {
537  if (n==0)
538  {
540  for (int32_t i=0; i<degree; i++)
542 
544  }
545  else
547 
548  SG_DEBUG("normalization_const:%f\n", normalization_const);
549 }
550 
// Clones this feature object via the copy constructor.
// NOTE(review): listing line 551 (the signature, upstream a duplicate()
// override returning CFeatures*) is missing from this listing.
552 {
553  return new CHashedWDFeaturesTransposed(*this);
554 }
555 
// Feature iteration is not supported for hashed WD features.
// NOTE(review): listing lines 556 (signature, presumably
// get_feature_iterator) and 558 (presumably a not-implemented notice) are
// missing from this listing; confirm upstream.
557 {
559  return NULL;
560 }
561 
// Feature iteration is not supported; always reports "no next feature".
// NOTE(review): listing line 564 (presumably a not-implemented notice) is
// missing. Returning NULL from a bool function yields false here — upstream
// this should arguably be `return false;`.
562 bool CHashedWDFeaturesTransposed::get_next_feature(int32_t& index, float64_t& value, void* iterator)
563 {
565  return NULL;
566 }
567 
// Empty body of the (unsupported) iterator cleanup hook.
// NOTE(review): listing lines 568 (signature, presumably
// free_feature_iterator) and 570 (presumably a not-implemented notice) are
// missing from this listing; confirm upstream.
569 {
571 }

SHOGUN Machine Learning Toolbox - Documentation