SVMSGD.cpp

/*
   SVM with stochastic gradient
   Copyright (C) 2007- Leon Bottou

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA
   $Id: svmsgd.cpp,v 1.13 2007/10/02 20:40:06 cvs Exp $

   Shogun adjustments (w) 2008-2009 Soeren Sonnenburg
*/

#include "classifier/svm/SVMSGD.h"
#include "lib/Signal.h"

using namespace shogun;

// Available losses
#define HINGELOSS 1
#define SMOOTHHINGELOSS 2
#define SQUAREDHINGELOSS 3
#define LOGLOSS 10
#define LOGLOSSMARGIN 11

// Select loss
#define LOSS HINGELOSS

// One when bias is regularized
#define REGULARIZEBIAS 0

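// Added note (not in the original source): with the default HINGELOSS
// selection, train() below performs SGD on the usual SVM primal objective,
//
//   min_{w,b}  lambda/2 * |w|^2  +  1/n * sum_i loss(z_i),
//   z_i = y_i * (w . x_i + bias),   loss(z) = max(0, 1 - z),
//
// with lambda = 1/(C*n) as set in train(). loss() returns the loss value and
// dloss() the negated derivative -loss'(z) used in the update step.
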
inline
float64_t loss(float64_t z)
{
#if LOSS == LOGLOSS
    if (z >= 0)
        return log(1+exp(-z));
    else
        return -z + log(1+exp(z));
#elif LOSS == LOGLOSSMARGIN
    if (z >= 1)
        return log(1+exp(1-z));
    else
        return 1-z + log(1+exp(z-1));
#elif LOSS == SMOOTHHINGELOSS
    if (z < 0)
        return 0.5 - z;
    if (z < 1)
        return 0.5 * (1-z) * (1-z);
    return 0;
#elif LOSS == SQUAREDHINGELOSS
    if (z < 1)
        return 0.5 * (1 - z) * (1 - z);
    return 0;
#elif LOSS == HINGELOSS
    if (z < 1)
        return 1 - z;
    return 0;
#else
# error "Undefined loss"
#endif
}

inline
float64_t dloss(float64_t z)
{
#if LOSS == LOGLOSS
    if (z < 0)
        return 1 / (exp(z) + 1);
    float64_t ez = exp(-z);
    return ez / (ez + 1);
#elif LOSS == LOGLOSSMARGIN
    if (z < 1)
        return 1 / (exp(z-1) + 1);
    float64_t ez = exp(1-z);
    return ez / (ez + 1);
#elif LOSS == SMOOTHHINGELOSS
    if (z < 0)
        return 1;
    if (z < 1)
        return 1-z;
    return 0;
#elif LOSS == SQUAREDHINGELOSS
    if (z < 1)
        return (1 - z);
    return 0;
#else
    if (z < 1)
        return 1;
    return 0;
#endif
}
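
// Added note (not in the original source): dloss(z) is -d/dz loss(z). For
// example, for LOGLOSS, loss(z) = log(1 + exp(-z)), so
//   -loss'(z) = exp(-z) / (1 + exp(-z)) = 1 / (exp(z) + 1),
// which matches both branches of the LOGLOSS case above; for HINGELOSS the
// value is simply 1 for z < 1 and 0 otherwise.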

CSVMSGD::CSVMSGD(float64_t C)
: CLinearClassifier(), t(1), C1(C), C2(C),
    wscale(1), bscale(1), epochs(5), skip(1000), count(1000), use_bias(true),
    use_regularized_bias(false)
{
}

CSVMSGD::CSVMSGD(float64_t C, CDotFeatures* traindat, CLabels* trainlab)
: CLinearClassifier(), t(1), C1(C), C2(C), wscale(1), bscale(1),
    epochs(5), skip(1000), count(1000), use_bias(true),
    use_regularized_bias(false)
{
    w=NULL;
    set_features(traindat);
    set_labels(trainlab);
}

CSVMSGD::~CSVMSGD()
{
    delete[] w;
    w=NULL;
}

bool CSVMSGD::train(CFeatures* data)
{
    // allocate memory for w and initialize w and bias with 0
    ASSERT(labels);

    if (data)
    {
        if (!data->has_property(FP_DOT))
            SG_ERROR("Specified features are not of type CDotFeatures\n");
        set_features((CDotFeatures*) data);
    }

    ASSERT(features);
    ASSERT(labels->is_two_class_labeling());

    int32_t num_train_labels=labels->get_num_labels();
    w_dim=features->get_dim_feature_space();
    int32_t num_vec=features->get_num_vectors();

    ASSERT(num_vec==num_train_labels);
    ASSERT(num_vec>0);

    delete[] w;
    w=new float64_t[w_dim];
    memset(w, 0, w_dim*sizeof(float64_t));
    bias=0;

    float64_t lambda= 1.0/(C1*num_vec);

    // Shift t in order to have a
    // reasonable initial learning rate.
    // This assumes |x| \approx 1.
    float64_t maxw = 1.0 / sqrt(lambda);
    float64_t typw = sqrt(maxw);
    float64_t eta0 = typw / CMath::max(1.0,dloss(-typw));
    t = 1 / (eta0 * lambda);
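
    // Added note (not in the original source): the learning rate schedule in
    // the loop below is eta_t = 1/(lambda*t). maxw = 1/sqrt(lambda) is, up to
    // a constant, an upper bound on the norm of the optimal w for this
    // objective, typw is a typical scale for w, and eta0 is chosen so that the
    // first update has roughly that magnitude. Initializing t = 1/(eta0*lambda)
    // then makes the first eta equal to eta0.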

    SG_INFO("lambda=%f, epochs=%d, eta0=%f\n", lambda, epochs, eta0);


    //do the sgd
    calibrate();

    SG_INFO("Training on %d vectors\n", num_vec);
    CSignal::clear_cancel();

    for (int32_t e=0; e<epochs && (!CSignal::cancel_computations()); e++)
    {
        count = skip;
        for (int32_t i=0; i<num_vec; i++)
        {
            float64_t eta = 1.0 / (lambda * t);
            float64_t y = labels->get_label(i);
            float64_t z = y * (features->dense_dot(i, w, w_dim) + bias);

#if LOSS < LOGLOSS
            if (z < 1)
#endif
            {
                float64_t etd = eta * dloss(z);
                features->add_to_dense_vec(etd * y / wscale, i, w, w_dim);

                if (use_bias)
                {
                    if (use_regularized_bias)
                        bias *= 1 - eta * lambda * bscale;
                    bias += etd * y * bscale;
                }
            }

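            // Added note (not in the original source): the L2 regularization
            // is applied lazily. Instead of shrinking w after every example,
            // w is rescaled once every `skip` examples by
            //   r = 1 - eta*lambda*skip
            // (or by (1 - eta*lambda)^skip when that first-order approximation
            // becomes inaccurate), so the per-example cost stays proportional
            // to the number of non-zero features rather than to w_dim.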
            if (--count <= 0)
            {
                float64_t r = 1 - eta * lambda * skip;
                if (r < 0.8)
                    r = pow(1 - eta * lambda, skip);
                CMath::scale_vector(r, w, w_dim);
                count = skip;
            }
            t++;
        }
    }

    float64_t wnorm = CMath::dot(w,w, w_dim); // squared L2 norm of w
    SG_INFO("Norm: %.6f, Bias: %.6f\n", wnorm, bias);

    return true;
}

void CSVMSGD::calibrate()
{
    ASSERT(features);
    int32_t num_vec=features->get_num_vectors();
    int32_t c_dim=features->get_dim_feature_space();

    ASSERT(num_vec>0);
    ASSERT(c_dim>0);

    float64_t* c=new float64_t[c_dim];
    memset(c, 0, c_dim*sizeof(float64_t));

    SG_INFO("Estimating sparsity and bscale num_vec=%d num_feat=%d.\n", num_vec, c_dim);

    // compute average gradient size
    int32_t n = 0;
    float64_t m = 0;
    float64_t r = 0;

    for (int32_t j=0; j<num_vec && m<=1000; j++, n++)
    {
        r += features->get_nnz_features_for_vector(j);
        features->add_to_dense_vec(1, j, c, c_dim, true);

        //waste cpu cycles for readability
        //(only changed dims need checking)
        m=CMath::max(c, c_dim);
    }

    // bias update scaling
    bscale = m/n;
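
    // Added note (not in the original source): after the loop, c appears to
    // hold the element-wise sum of absolute feature values over the n
    // examples seen and m its largest entry, so bscale = m/n is roughly the
    // magnitude of the largest average feature; scaling the bias updates by
    // it keeps the bias step comparable to the feature-weight steps.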

    // compute weight decay skip
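    // Added note (not in the original source): r/n is the average number of
    // non-zero features per example, so skip = 16*n*c_dim/r means one dense
    // rescaling of w (about c_dim operations) happens per roughly 16*c_dim
    // sparse update operations, i.e. the lazy regularization adds only a few
    // percent of overhead.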
    skip = (int32_t) ((16 * n * c_dim) / r);
    SG_INFO("using %d examples. skip=%d  bscale=%.6f\n", n, skip, bscale);

    delete[] c;
}
