SHOGUN  v1.1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
VwParser.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2009 Yahoo! Inc. All rights reserved. The copyrights
3  * embodied in the content of this file are licensed under the BSD
4  * (revised) open source license.
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Written (W) 2011 Shashwat Lal Das
12  * Adaptation of Vowpal Wabbit v5.1.
13  * Copyright (C) 2011 Berlin Institute of Technology and Max-Planck-Society.
14  */
15 
18 
19 using namespace shogun;
20 
22  : CSGObject()
23 {
24  env = new CVwEnvironment();
26  write_cache = false;
27  cache_writer = NULL;
28 }
29 
31  : CSGObject()
32 {
33  ASSERT(env_to_use);
34 
35  env = env_to_use;
37  write_cache = false;
38  cache_writer = NULL;
39  SG_REF(env);
40 }
41 
43 {
44  SG_FREE(channels.begin);
45  channels.begin = channels.end = channels.end_array = NULL;
46  SG_FREE(words.begin);
47  words.begin = words.end = words.end_array = NULL;
48  SG_FREE(name.begin);
49  name.begin = name.end = name.end_array = NULL;
50 
51  SG_UNREF(env);
53 }
54 
56 {
57  char *line=NULL;
58  int32_t num_chars = buf->read_line(line);
59  if (num_chars == 0)
60  return num_chars;
61 
62  /* Mark begin and end of example in the buffer */
63  substring example_string = {line, line + num_chars};
64 
65  /* Channels containing separate namespaces/label information*/
66  channels.erase();
67 
68  /* Split at '|' character */
69  tokenize('|', example_string, channels);
70 
71  /* If first char is not '|', then the first channel contains label data */
72  substring* feature_start = &channels[1];
73 
74  if (*line == '|')
75  feature_start = &channels[0]; /* Unlabelled data */
76  else
77  {
78  /* First channel has label info */
79  substring label_space = channels[0];
80  char* tab_location = safe_index(label_space.start, '\t', label_space.end);
81  if (tab_location != label_space.end)
82  label_space.start = tab_location+1;
83 
84  /* Split the label space on spaces */
85  tokenize(' ',label_space,words);
86  if (words.index() > 0 && words.last().end == label_space.end) //The last field is a tag, so record and strip it off
87  {
88  substring tag = words.pop();
89  ae->tag.push_many(tag.start, tag.end - tag.start);
90  }
91 
92  ae->ld->parse_label(words);
93  set_minmax(ae->ld->label);
94  }
95 
96  vw_size_t mask = env->mask;
97 
98  /* Now parse the individual channels, i.e., namespaces */
99  for (substring* i = feature_start; i != channels.end; i++)
100  {
101  substring channel = *i;
102 
103  tokenize(' ',channel, words);
104  if (words.begin == words.end)
105  continue;
106 
107  /* Set default scale value for channel */
108  float32_t channel_v = 1.;
109  vw_size_t channel_hash;
110 
111  /* Index by which to refer to the namespace */
112  vw_size_t index = 0;
113  bool new_index = false;
114  vw_size_t feature_offset = 0;
115 
116  if (channel.start[0] != ' ')
117  {
118  /* Nonanonymous namespace specified */
119  feature_offset++;
120  feature_value(words[0], name, channel_v);
121 
122  if (name.index() > 0)
123  {
124  index = (unsigned char)(*name[0].start);
125  if (ae->atomics[index].begin == ae->atomics[index].end)
126  {
127  ae->sum_feat_sq[index] = 0;
128  new_index = true;
129  }
130  }
131  channel_hash = hasher(name[0], hash_base);
132  }
133  else
134  {
135  /* Use default namespace with index below */
136  index = (unsigned char)' ';
137  if (ae->atomics[index].begin == ae->atomics[index].end)
138  {
139  ae->sum_feat_sq[index] = 0;
140  new_index = true;
141  }
142  channel_hash = 0;
143  }
144 
145  for (substring* j = words.begin+feature_offset; j != words.end; j++)
146  {
147  /* Get individual features and multiply by scale value */
148  float32_t v;
149  feature_value(*j, name, v);
150  v *= channel_v;
151 
152  /* Hash feature */
153  vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
154  VwFeature f = {v,word_hash};
155  ae->sum_feat_sq[index] += v*v;
156  ae->atomics[index].push(f);
157  }
158 
159  /* Add index to list of indices if required */
160  if (new_index && ae->atomics[index].begin != ae->atomics[index].end)
161  ae->indices.push(index);
162 
163  }
164 
165  if (write_cache)
167 
168  return num_chars;
169 }
170 
172 {
173  char *line=NULL;
174  int32_t num_chars = buf->read_line(line);
175  if (num_chars == 0)
176  return num_chars;
177 
178  /* Mark begin and end of example in the buffer */
179  substring example_string = {line, line + num_chars};
180 
181  vw_size_t mask = env->mask;
182  tokenize(' ', example_string, words);
183 
184  ae->ld->label = float_of_substring(words[0]);
185  ae->ld->weight = 1.;
186  ae->ld->initial = 0.;
187  set_minmax(ae->ld->label);
188 
189  substring* feature_start = &words[1];
190 
191  vw_size_t index = (unsigned char)' '; // Any default namespace is ok
192  vw_size_t channel_hash = 0;
193  ae->sum_feat_sq[index] = 0;
194  ae->indices.push(index);
195  /* Now parse the individual features */
196  for (substring* i = feature_start; i != words.end; i++)
197  {
198  float32_t v;
199  feature_value(*i, name, v);
200 
201  vw_size_t word_hash = (hasher(name[0], channel_hash)) & mask;
202  VwFeature f = {v,word_hash};
203  ae->sum_feat_sq[index] += v*v;
204  ae->atomics[index].push(f);
205  }
206 
207  if (write_cache)
209 
210  return num_chars;
211 }
212 
214 {
215  char *line=NULL;
216  int32_t num_chars = buf->read_line(line);
217  if (num_chars == 0)
218  return num_chars;
219 
220  // Mark begin and end of example in the buffer
221  substring example_string = {line, line + num_chars};
222 
223  vw_size_t mask = env->mask;
224  tokenize(' ', example_string, words);
225 
226  ae->ld->label = float_of_substring(words[0]);
227  ae->ld->weight = 1.;
228  ae->ld->initial = 0.;
229  set_minmax(ae->ld->label);
230 
231  substring* feature_start = &words[1];
232 
233  vw_size_t index = (unsigned char)' ';
234 
235  ae->sum_feat_sq[index] = 0;
236  ae->indices.push(index);
237  // Now parse individual features
238  int32_t j=0;
239  for (substring* i = feature_start; i != words.end; i++)
240  {
242  vw_size_t word_hash = j & mask;
243  VwFeature f = {v,word_hash};
244  ae->sum_feat_sq[index] += v*v;
245  ae->atomics[index].push(f);
246  j++;
247  }
248 
249  if (write_cache)
251 
252  return num_chars;
253 }
254 
255 void CVwParser::init_cache(char * fname, EVwCacheType type)
256 {
257  char* file_name = fname;
258  char default_cache_name[] = "vw_cache.dat.cache";
259 
260  if (!fname)
261  file_name = default_cache_name;
262 
263  write_cache = true;
264  cache_type = type;
265 
266  switch (type)
267  {
268  case C_NATIVE:
269  cache_writer = new CVwNativeCacheWriter(file_name, env);
270  return;
271  case C_PROTOBUF:
272  SG_ERROR("Protocol buffers cache support is not implemented yet.\n");
273  }
274 
275  SG_ERROR("Unexpected cache type specified!\n");
276 }
277 
279 {
280  // Get the value of the feature in the substring
281  tokenize(':', s, feat_name);
282 
283  switch (feat_name.index())
284  {
285  // If feature value is not specified, assume 1.0
286  case 0:
287  case 1:
288  v = 1.;
289  break;
290  case 2:
291  v = float_of_substring(feat_name[1]);
292  if (isnan(v))
293  SG_SERROR("error NaN value for feature %s! Terminating!\n",
294  c_string_of_substring(feat_name[0]));
295  break;
296  default:
297  SG_SERROR("Examples with a weird name, i.e., '%s'\n",
299  }
300 }
301 
303 {
304  ret.erase();
305  char *last = s.start;
306  for (; s.start != s.end; s.start++)
307  {
308  if (*s.start == delim)
309  {
310  if (s.start != last)
311  {
312  substring temp = {last,s.start};
313  ret.push(temp);
314  }
315  last = s.start+1;
316  }
317  }
318  if (s.start != last)
319  {
320  substring final = {last, s.start};
321  ret.push(final);
322  }
323 }

SHOGUN Machine Learning Toolbox - Documentation