1
2
3
4
5
6
7
8
"""Wrap the shogun package into a very simple class interface."""
10
11 __docformat__ = 'restructuredtext'
12
13
14 _DEV__doc__ = """
15
16 TODOs:
17 * dual-license under GPL for use of SG?
18 * for recent versions add ability to specify/parametrize normalization
19 scheme for the kernel, and reuse 'scale' now for the normalizer
20 * Add support for simplified linear classifiers (which do not require
21 storing all training SVs/samples to make classification in predict())
22 """
23
24 import numpy as N
25
26
27
from mvpa.base import externals, warning
# shogun is mandatory for this module; raiseException=True presumably makes
# a missing shogun surface as an exception here -- TODO confirm in externals
if externals.exists('shogun', raiseException=True):
    import shogun.Features
    import shogun.Classifier
    import shogun.Regression
    import shogun.Kernel
    import shogun.Library

    # Figure out once which names this shogun build uses for its log
    # levels (M_* or MSG_*).  _M_DEBUG/_M_ERROR are read later when the
    # verbosity of individual shogun objects is set; None means "unknown".
    if hasattr(shogun.Kernel, 'M_DEBUG'):
        _M_DEBUG = shogun.Kernel.M_DEBUG
        _M_ERROR = shogun.Kernel.M_ERROR
    elif hasattr(shogun.Kernel, 'MSG_DEBUG'):
        _M_DEBUG = shogun.Kernel.MSG_DEBUG
        _M_ERROR = shogun.Kernel.MSG_ERROR
    else:
        _M_DEBUG, _M_ERROR = None, None
        warning("Could not figure out debug IDs within shogun. "
                "No control over shogun verbosity would be provided")
47
48 import operator
49
50 from mvpa.misc.param import Parameter
51 from mvpa.base import warning
52
53 from mvpa.clfs.meta import MulticlassClassifier
54 from mvpa.clfs._svmbase import _SVM
55 from mvpa.misc.state import StateVariable
56 from mvpa.measures.base import Sensitivity
57
58 from sens import *
59
60 if __debug__:
61 from mvpa.base import debug
62
63
def _setdebug(obj, partname):
    """Helper to set level of debugging output for SG

    Does nothing if the shogun log-level IDs could not be detected at
    import time (`_M_DEBUG` is None).

    :Parameters:
      obj
        In SG debug output seems to be set per every object
      partname : basestring
        For what kind of object we are talking about... could be automated
        later on (TODO)
    """
    if _M_DEBUG is None:
        return
    debugname = "SG_%s" % partname.upper()

    # key -> (shogun log level, its name for the debug message,
    #         prefix of the obj.io.<prefix>_progress method to call)
    switch = {True: (_M_DEBUG, 'M_DEBUG', "enable"),
              False: (_M_ERROR, 'M_ERROR', "disable")}

    # verbose only if the matching debug target (e.g. SG_SVM) is active;
    # `debug` is imported only under __debug__, hence the short-circuit
    key = __debug__ and debugname in debug.active

    sglevel, slevel, progressfunc = switch[key]

    if __debug__:
        debug("SG_", "Setting verbosity for shogun.%s instance: %s to %s" %
              (partname, repr(obj), slevel))
    obj.io.set_loglevel(sglevel)
    try:
        # look the method up by name instead of exec'ing generated code
        getattr(obj.io, "%s_progress" % progressfunc)()
    except Exception:
        # best effort: older shogun builds may lack *_progress methods
        warning("Shogun version installed has no way to enable progress" +
                " reports")
93
94
def _tosg(data):
    """Draft helper function to convert data we have into SG suitable format

    `data` is cast to double, transposed and wrapped into
    `shogun.Features.RealFeatures`, which is returned.

    TODO: Support different datatypes
    """
    if __debug__:
        debug("SG_", "Converting data for shogun into RealFeatures")

    as_double = data.astype('double')
    sg_features = shogun.Features.RealFeatures(as_double.T)

    if __debug__:
        debug("SG__", "Done converting data for shogun into RealFeatures")

    # verbosity is configured per shogun object
    _setdebug(sg_features, 'Features')
    return sg_features
110
111
class SVM(_SVM):
    """Support Vector Machine Classifier(s) based on Shogun

    This is a simple base interface
    """

    num_threads = Parameter(1,
                            min=1,
                            doc='Number of threads to utilize')

    # kernel literal -> (shogun kernel class,
    #                    names of kernel parameters whose values are passed
    #                    positionally to the kernel constructor,
    #                    associated class from `sens` or None)
    _KERNELS = {}
    if externals.exists('shogun', raiseException=True):
        _KERNELS = {"linear": (shogun.Kernel.LinearKernel,
                               ('scale',), LinearSVMWeights),
                    "rbf": (shogun.Kernel.GaussianKernel,
                            ('gamma',), None),
                    "rbfshift": (shogun.Kernel.GaussianShiftKernel,
                                 ('gamma', 'max_shift', 'shift_step'), None),
                    "sigmoid": (shogun.Kernel.SigmoidKernel,
                                ('cache_size', 'gamma', 'coef0'), None),
                    }

    _KNOWN_PARAMS = ['epsilon']
    _KNOWN_KERNEL_PARAMS = []

    _clf_internals = _SVM._clf_internals + ['sg', 'retrainable']

    # from sg 0.6.4 on the linear kernel is constructed without 'scale'
    if externals.exists('sg ge 0.6.4'):
        _KERNELS['linear'] = (shogun.Kernel.LinearKernel, (), LinearSVMWeights)

    # Advice on choosing an implementation (quoted as-is; first person is
    # the original author of the note):
    #
    #   If you'd like to train linear SVMs use SGD or OCAS. These are (I am
    #   serious) the fastest linear SVM-solvers to date. (OCAS cannot do SVMs
    #   with standard additive bias, but will L2 regularize it - though it
    #   should not matter much in practice (although it will give slightly
    #   different solutions)). Note that SGD has no stopping criterion (you
    #   simply have to specify the number of iterations) and that OCAS has a
    #   different stopping condition than svmlight for example which may be
    #   more tight and more loose depending on the problem - I suggest 1e-2
    #   or 1e-3 for epsilon.
    #
    #   If you would like to train kernel SVMs use libsvm/gpdt/svmlight -
    #   depending on the problem one is faster than the other (hard to say
    #   when, I *think* when your dataset is very unbalanced chunking methods
    #   like svmlight/gpdt are better), for smaller problems definitely
    #   libsvm.
    #
    #   If you use string kernels then gpdt/svmlight have a special 'linadd'
    #   speedup for this (requires sg 0.6.2 - there was some inefficiency in
    #   the code for python-modular before that). This is effective for big
    #   datasets and (I trained on 10 million strings based on this).
    #
    #   And yes currently we only implemented parallel training for svmlight,
    #   however all SVMs can be evaluated in parallel.

    # implementation name -> (shogun class, parameter names,
    #                         supported kinds of problems, description)
    _KNOWN_IMPLEMENTATIONS = {}
    if externals.exists('shogun', raiseException=True):
        _KNOWN_IMPLEMENTATIONS = {
            "libsvm": (shogun.Classifier.LibSVM, ('C',),
                       ('multiclass', 'binary'),
                       "LIBSVM's C-SVM (L2 soft-margin SVM)"),
            "gmnp": (shogun.Classifier.GMNPSVM, ('C',),
                     ('multiclass', 'binary'),
                     "Generalized Nearest Point Problem SVM"),

            "gpbt": (shogun.Classifier.GPBTSVM, ('C',), ('binary',),
                     "Gradient Projection Decomposition Technique for "
                     "large-scale SVM problems"),
            "gnpp": (shogun.Classifier.GNPPSVM, ('C',), ('binary',),
                     "Generalized Nearest Point Problem SVM"),

            "libsvr": (shogun.Regression.LibSVR, ('C', 'tube_epsilon',),
                       ('regression',),
                       "LIBSVM's epsilon-SVR"),
            "krr": (shogun.Regression.KRR, ('tau',), ('regression',),
                    "Kernel Ridge Regression"),
            }

    def __init__(self,
                 kernel_type='linear',
                 **kwargs):
        """Interface class to Shogun's classifiers and regressions.

        Default implementation is 'libsvm'.

        :Parameters:
          kernel_type : basestring
            Kernel literal, handed over to `_SVM.__init__`.
          **kwargs
            Handed over to `_SVM.__init__`. `svm_impl` is lower-cased
            here and defaults to 'libsvm' when absent.
        """
        svm_impl = kwargs.get('svm_impl', 'libsvm').lower()
        kwargs['svm_impl'] = svm_impl

        _SVM.__init__(self, kernel_type=kernel_type, **kwargs)

        # Holds the trained svm
        self.__svm = None

        # dataset passed to the most recent training
        self.__traindataset = None

        # shogun-side objects created during training/prediction
        self.__traindata = None
        self.__kernel = None
        self.__kernel_test = None
        self.__testdata = None

    def __condition_kernel(self, kernel):
        """Request a precomputed kernel matrix for SVMLight-based impls."""
        if self._svm_impl in ['svrlight', 'lightsvm']:
            kernel.set_precompute_matrix(True, True)

    # NOTE(review): the `def` line of this method is missing from the
    # extracted source; name and signature `_train(self, dataset)` are
    # assumed from how the body uses `dataset` -- confirm against _SVM.
    def _train(self, dataset):
        """Train SVM

        Maps labels into what shogun expects, (re)creates the kernel and
        the SVM instance where needed (a retrainable classifier reuses
        whatever `_changedData` reports as unchanged), trains, and for
        regressions optionally stores the training confusion.

        :Parameters:
          dataset
            Training data; `samples`, `labels` and `uniquelabels` are used.

        :Raises ValueError: if fewer than 2 unique labels are given to a
          classifier.
        """
        newkernel, newsvm = False, False

        retrainable = self.params.retrainable

        if retrainable:
            # only consulted below behind `not retrainable or ...` guards
            _changedData = self._changedData

        # --- labels ---
        ul = None
        self.__traindataset = dataset

        if __debug__:
            debug("SG_", "Creating labels instance")

        if 'regression' in self._clf_internals:
            labels_ = N.asarray(dataset.labels, dtype='double')
        else:
            ul = dataset.uniquelabels
            ul.sort()

            if len(ul) == 2:
                # binary problem: labels become -1/+1
                _labels_dict = {ul[0]: -1.0, ul[1]: +1.0}
            elif len(ul) < 2:
                raise ValueError(
                    "we do not have 1-class SVM brought into SG yet")
            else:
                # multiclass problem: labels become 0..len(ul)-1
                _labels_dict = dict((label, i)
                                    for i, label in enumerate(ul))

            # reverse mapping, used to map predictions back
            _labels_dict_rev = dict((mapped, label)
                                    for label, mapped in _labels_dict.items())

            self._labels_dict = _labels_dict
            self._labels_dict_rev = _labels_dict_rev

            if __debug__:
                debug("SG__", "Mapping labels using dict %s" % _labels_dict)
            labels_ = N.asarray([_labels_dict[x] for x in dataset.labels],
                                dtype='double')

        labels = shogun.Features.Labels(labels_)
        _setdebug(labels, 'Labels')

        # --- kernel ---
        if not retrainable or _changedData['traindata'] \
               or _changedData['kernel_params']:
            # collect positional arguments for the kernel constructor
            kargs = []
            for arg in self._KERNELS[self._kernel_type_literal][1]:
                value = self.kernel_params[arg].value
                # gamma of 0.0 requests the default gamma for this dataset
                if arg == 'gamma' and value == 0.0:
                    value = self._getDefaultGamma(dataset)
                kargs += [value]

            if retrainable and __debug__:
                if _changedData['traindata']:
                    debug("SG",
                          "Re-Creating kernel since training data has changed")

                if _changedData['kernel_params']:
                    debug("SG",
                          "Re-Creating kernel since params %s has changed" %
                          _changedData['kernel_params'])

            if __debug__:
                debug("SG_", "Converting input data for shogun")
            self.__traindata = _tosg(dataset.samples)

            if __debug__:
                debug("SG", "Creating kernel instance of %s giving arguments %s" %
                      (repr(self._kernel_type), kargs))

            self.__kernel = kernel = \
                self._kernel_type(self.__traindata, self.__traindata,
                                  *kargs)

            if externals.exists('sg ge 0.6.4'):
                kernel.set_normalizer(shogun.Kernel.IdentityKernelNormalizer())

            newkernel = True
            self.kernel_params.reset()
            _setdebug(kernel, 'Kernels')

            self.__condition_kernel(kernel)
            if retrainable:
                if __debug__:
                    debug("SG_", "Resetting test kernel for retrainable SVM")
                self.__kernel_test = None
                # remembered to build the testing kernel in prediction
                self.__kernel_args = kargs

        # --- SVM instance ---
        Cs = None
        if not retrainable or self.__svm is None or _changedData['params']:
            if self.params.isKnown('C'):
                C = self.params.C
                if not operator.isSequenceType(C):
                    # a single C was given
                    C = [C]

                Cs = list(C[:])
                for i, c in enumerate(Cs):
                    if c < 0:
                        # negative C: scale the default C by its magnitude
                        Cs[i] = self._getDefaultC(dataset.samples)*abs(c)
                        if __debug__:
                            debug("SG_", "Default C for %s was computed to be %s" %
                                  (C[i], Cs[i]))

            svm_impl_class = self.__get_implementation(ul)

            if __debug__:
                debug("SG", "Creating SVM instance of %s" %
                      repr(svm_impl_class))

            # constructor signatures differ between implementations
            if self._svm_impl in ['libsvr', 'svrlight']:
                self.__svm = svm_impl_class(Cs[0], self.params.epsilon,
                                            self.__kernel, labels)
            elif self._svm_impl in ['krr']:
                self.__svm = svm_impl_class(self.params.tau,
                                            self.__kernel, labels)
            else:
                self.__svm = svm_impl_class(Cs[0], self.__kernel, labels)
                self.__svm.set_epsilon(self.params.epsilon)
            if Cs is not None and len(Cs) == 2:
                if __debug__:
                    debug("SG_", "Since multiple Cs are provided: %s, assign them" % Cs)
                self.__svm.set_C(Cs[0], Cs[1])

            self.params.reset()
            newsvm = True
            _setdebug(self.__svm, 'SVM')

            if self.params.isKnown('tube_epsilon') and \
                   hasattr(self.__svm, 'set_tube_epsilon'):
                self.__svm.set_tube_epsilon(self.params.tube_epsilon)
            self.__svm.parallel.set_num_threads(self.params.num_threads)
        else:
            # reachable only for a retrainable classifier with an existing
            # SVM and unchanged params: update just what has changed
            if __debug__:
                debug("SG_", "SVM instance is not re-created")
            if _changedData['labels']:
                if __debug__:
                    debug("SG__", "Assigning new labels")
                self.__svm.set_labels(labels)
            if newkernel:
                if __debug__:
                    debug("SG__", "Assigning new kernel")
                self.__svm.set_kernel(self.__kernel)
            assert(_changedData['params'] is False)

        if retrainable:
            self.states.retrained = not newsvm or not newkernel

        # --- train ---
        if __debug__ and 'SG' in debug.active:
            if not self.regression:
                lstr = " with labels %s" % dataset.uniquelabels
            else:
                lstr = ""
            debug("SG", "%sTraining %s on data%s" %
                  (("", "Re-")[retrainable and self.states.retrained],
                   self, lstr))

        self.__svm.train()

        if __debug__:
            debug("SG_", "Done training SG_SVM %s" % self._kernel_type)

        # --- report on training ---
        # classify the training data only if somebody needs the result
        if (__debug__ and 'SG__' in debug.active) or \
           self.states.isEnabled('training_confusion'):
            trained_labels = self.__svm.classify().get_labels()
        else:
            trained_labels = None

        if __debug__ and "SG__" in debug.active:
            debug("SG__", "Original labels: %s, Trained labels: %s" %
                  (dataset.labels, trained_labels))

        # done here only for regressions: for classification the values
        # above are in the mapped label space, not in the original one
        if self.regression and self.states.isEnabled('training_confusion'):
            self.states.training_confusion = self._summaryClass(
                targets=dataset.labels,
                predictions=trained_labels)

    # NOTE(review): the `def` line of this method is missing from the
    # extracted source; name and signature `_predict(self, data)` are
    # assumed from how the body uses `data` -- confirm against _SVM.
    def _predict(self, data):
        """Predict values for the data

        :Parameters:
          data
            Samples to predict; converted with `_tosg`.

        :Returns:
          The raw values for regressions, otherwise a list of labels
          mapped back into the label space seen in training. Raw values
          are also stored in `self.values`.

        :Raises RuntimeError: if shogun's classify() returns None.
        """
        retrainable = self.params.retrainable

        if retrainable:
            changed_testdata = self._changedData['testdata'] or \
                               self.__kernel_test is None

        if not retrainable or changed_testdata:
            testdata = _tosg(data)

        if not retrainable:
            if __debug__:
                debug("SG__",
                      "Initializing SVMs kernel of %s with training/testing samples"
                      % self)
            # re-initialize the training kernel on (train, test) data
            self.__kernel.init(self.__traindata, testdata)
            self.__condition_kernel(self.__kernel)
        else:
            if changed_testdata:
                if __debug__:
                    debug("SG__",
                          "Re-creating testing kernel of %s giving "
                          "arguments %s" %
                          (repr(self._kernel_type), self.__kernel_args))
                kernel_test = self._kernel_type(self.__traindata, testdata,
                                                *self.__kernel_args)
                _setdebug(kernel_test, 'Kernels')

                # from sg 0.6.4 on CustomKernel is constructed w/o arguments
                if externals.exists('sg ge 0.6.4'):
                    custk_args = []
                else:
                    custk_args = [self.__traindata, testdata]
                if __debug__:
                    debug("SG__",
                          "Re-creating custom testing kernel giving "
                          "arguments %s" % (str(custk_args)))
                kernel_test_custom = shogun.Kernel.CustomKernel(*custk_args)

                _setdebug(kernel_test_custom, 'Kernels')
                # keep the computed kernel matrix so that it can be reused
                # while the testing data stays the same
                self.__kernel_test = kernel_test_custom
                self.__kernel_test.set_full_kernel_matrix_from_full(
                    kernel_test.get_kernel_matrix())
            elif __debug__:
                debug("SG__", "Re-using testing kernel")

            assert(self.__kernel_test is not None)
            self.__svm.set_kernel(self.__kernel_test)

        if __debug__:
            debug("SG_", "Classifying testing data")

        values_ = self.__svm.classify()
        if values_ is None:
            raise RuntimeError("We got empty list of values from %s" % self)

        values = values_.get_labels()

        if retrainable:
            self.states.repredicted = repredicted = not changed_testdata
            if __debug__:
                debug("SG__", "Re-assigning learing kernel. Repredicted is %s"
                      % repredicted)
            # give the training kernel back to the SVM
            self.__svm.set_kernel(self.__kernel)

        if __debug__:
            debug("SG__", "Got values %s" % values)

        if ('regression' in self._clf_internals):
            predictions = values
        else:
            _labels_dict = self._labels_dict
            _labels_dict_rev = self._labels_dict_rev

            if len(_labels_dict) == 2:
                # binary: the sign of the value decides between -1 and +1
                predictions = 1.0 - 2*N.signbit(values)
            else:
                predictions = values

            # cast to the type of the mapped labels so that the lookup in
            # the reverse dictionary works
            label_type = type(list(_labels_dict.values())[0])

            predictions = [_labels_dict_rev[label_type(x)]
                           for x in predictions]

            if __debug__:
                debug("SG__", "Tuned predictions %s" % predictions)

        self.values = values

        if not retrainable:
            # best effort to release shogun-side memory of the test data
            try:
                testdata.free_features()
            except Exception:
                pass

        return predictions

    def untrain(self):
        """Untrain the classifier and release shogun objects.

        For a retrainable classifier only the base class `untrain` is
        called; the shogun objects are kept for reuse.
        """
        super(SVM, self).untrain()
        if not self.params.retrainable:
            if __debug__:
                debug("SG__", "Untraining %(clf)s and destroying sg's SVM",
                      msgargs={'clf': self})

            # drop references explicitly and free the features
            if self.__kernel is not None:
                del self.__kernel
                self.__kernel = None

            if self.__kernel_test is not None:
                del self.__kernel_test
                self.__kernel_test = None

            if self.__svm is not None:
                del self.__svm
                self.__svm = None

            if self.__traindata is not None:
                self.__traindata.free_features()
                del self.__traindata
                self.__traindata = None

            self.__traindataset = None

            if __debug__:
                debug("SG__",
                      "Done untraining %(self)s and destroying sg's SVM",
                      msgargs={'self': self})
        elif __debug__:
            debug("SG__", "Not untraining %(self)s since it is retrainable",
                  msgargs={'self': self})

    def __get_implementation(self, ul):
        """Return the shogun class to instantiate for the current problem.

        :Parameters:
          ul
            Unique labels of the training data; not consulted for
            regressions (where None is passed).

        :Raises RuntimeError: if the data is multiclass and the chosen
          implementation is neither 'libsvm' nor 'gmnp'.
        """
        if 'regression' in self._clf_internals or len(ul) == 2:
            svm_impl_class = SVM._KNOWN_IMPLEMENTATIONS[self._svm_impl][0]
        else:
            if self._svm_impl == 'libsvm':
                svm_impl_class = shogun.Classifier.LibSVMMultiClass
            elif self._svm_impl == 'gmnp':
                svm_impl_class = shogun.Classifier.GMNPSVM
            else:
                raise RuntimeError(
                      "Shogun: Implementation %s doesn't handle multiclass "
                      "data. Got labels %s. Use some other classifier" %
                      (self._svm_impl, self.__traindataset.uniquelabels))
            if __debug__:
                debug("SG_", "Using %s for multiclass data of %s" %
                      (svm_impl_class, self._svm_impl))

        return svm_impl_class

    svm = property(fget=lambda self: self.__svm,
                   doc="Access to the SVM model.")

    # TODO -- might better become state variable I guess
    traindataset = property(fget=lambda self: self.__traindataset,
                            doc="Dataset which was used for training")
650
651
652
653
654
# Register implementations which are available only in some shogun builds.
# The shogun class is looked up by name and only after externals confirms
# its presence, so a build lacking it never touches the missing attribute.
for name, modname, clsname, params, kinds, descr in [
        ('mpd', 'Classifier', 'MPDSVM', ('C',), ('binary',),
         "MPD classifier from shogun"),
        ('lightsvm', 'Classifier', 'SVMLight', ('C',), ('binary',),
         "SVMLight classification http://svmlight.joachims.org/"),
        ('svrlight', 'Regression', 'SVRLight', ('C', 'tube_epsilon',),
         ('regression',),
         "SVMLight regression http://svmlight.joachims.org/")]:
    if externals.exists('shogun.%s' % name):
        SVM._KNOWN_IMPLEMENTATIONS[name] = (
            getattr(getattr(shogun, modname), clsname),
            params, kinds, descr)


# let LinearSVMWeights know that it can be used with this SVM class
LinearSVMWeights._LEGAL_CLFS = [SVM]
667