SparseFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014 
00015 #include <string.h>
00016 #include <stdlib.h>
00017 
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023 
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/DotFeatures.h"
00027 #include "features/SimpleFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029 
00030 namespace shogun
00031 {
00032 
00033 class CLabels;
00034 class CFeatures;
00035 class CDotFeatures;
00036 template <class ST> class CSimpleFeatures;
00037 template <class ST> class CSparsePreProc;
00038 
00039 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00040 
/** one non-zero entry of a sparse feature vector */
template <class ST> struct TSparseEntry
{
    /** index of the feature (dimension) this entry belongs to */
    int32_t feat_index;
    /** value stored at that dimension */
    ST entry;
};
00048 
/** one sparse feature vector: a length-tagged array of entries */
template <class ST> struct TSparse
{
    public:
        /** index of this vector within the feature matrix */
        int32_t vec_index;
        /** number of entries in the features array */
        int32_t num_feat_entries;
        /** entry array; presumably kept in ascending feat_index order
         * (sparse_dot's merge-scan relies on it) -- confirm at fill sites */
        TSparseEntry<ST>* features;
};
00060 #endif // DOXYGEN_SHOULD_SKIP_THIS
00061 
00074 template <class ST> class CSparseFeatures : public CDotFeatures
00075 {
00076     public:
00081         CSparseFeatures(int32_t size=0)
00082         : CDotFeatures(size), num_vectors(0), num_features(0),
00083             sparse_feature_matrix(NULL), feature_cache(NULL)
00084         {}
00085 
00094         CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
00095         : CDotFeatures(0), num_vectors(0), num_features(0),
00096             sparse_feature_matrix(NULL), feature_cache(NULL)
00097         {
00098             if (!copy)
00099                 set_sparse_feature_matrix(src, num_feat, num_vec);
00100             else
00101             {
00102                 sparse_feature_matrix = new TSparse<ST>[num_vec];
00103                 memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec);
00104                 for (int32_t i=0; i< num_vec; i++)
00105                 {
00106                     sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00107                     memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00108 
00109                 }
00110             }
00111         }
00112 
00120         CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
00121         : CDotFeatures(0), num_vectors(0), num_features(0),
00122             sparse_feature_matrix(NULL), feature_cache(NULL)
00123         {
00124             set_full_feature_matrix(src, num_feat, num_vec);
00125         }
00126 
00128         CSparseFeatures(const CSparseFeatures & orig)
00129         : CDotFeatures(orig), num_vectors(orig.num_vectors),
00130             num_features(orig.num_features),
00131             sparse_feature_matrix(orig.sparse_feature_matrix),
00132             feature_cache(orig.feature_cache)
00133         {
00134             if (orig.sparse_feature_matrix)
00135             {
00136                 free_sparse_feature_matrix();
00137                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00138                 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00139                 for (int32_t i=0; i< num_vectors; i++)
00140                 {
00141                     sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00142                     memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00143 
00144                 }
00145             }
00146         }
00147 
        /** constructor taking a file name
         *
         * @param fname file to load features from
         *
         * NOTE(review): fname is only forwarded to CDotFeatures; no sparse
         * data is loaded here -- presumably callers invoke a load method
         * afterwards. Confirm against call sites.
         */
        CSparseFeatures(char* fname)
        : CDotFeatures(fname), num_vectors(0), num_features(0),
            sparse_feature_matrix(NULL), feature_cache(NULL)
        {}
00156 
        /** destructor - releases the sparse matrix and the feature cache */
        virtual ~CSparseFeatures()
        {
            free_sparse_features();
        }
00161 
00165         void free_sparse_feature_matrix()
00166         {
00167             clean_tsparse(sparse_feature_matrix, num_vectors);
00168             sparse_feature_matrix = NULL;
00169             num_vectors=0;
00170             num_features=0;
00171         }
00172 
00176         void free_sparse_features()
00177         {
00178             free_sparse_feature_matrix();
00179             delete feature_cache;
00180             feature_cache = NULL;
00181         }
00182 
        /** duplicate this feature object
         *
         * @return a deep copy created via the copy constructor
         */
        virtual CFeatures* duplicate() const
        {
            return new CSparseFeatures<ST>(*this);
        }
00191 
00199         ST get_feature(int32_t num, int32_t index)
00200         {
00201             ASSERT(index>=0 && index<num_features) ;
00202             ASSERT(num>=0 && num<num_vectors) ;
00203 
00204             bool vfree;
00205             int32_t num_feat;
00206             int32_t i;
00207             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00208             ST ret = 0 ;
00209             
00210             if (sv)
00211             {
00212                 for (i=0; i<num_feat; i++)
00213                     if (sv[i].feat_index==index)
00214                         ret += sv[i].entry ;
00215             }
00216             
00217             free_sparse_feature_vector(sv, num, vfree);
00218             
00219             return ret ;
00220         }
00221         
00222 
00231         ST* get_full_feature_vector(int32_t num, int32_t& len)
00232         {
00233             bool vfree;
00234             int32_t num_feat;
00235             int32_t i;
00236             len=0;
00237             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00238             ST* fv=NULL;
00239 
00240             if (sv)
00241             {
00242                 len=num_features;
00243                 fv=new ST[num_features];
00244 
00245                 for (i=0; i<num_features; i++)
00246                     fv[i]=0;
00247 
00248                 for (i=0; i<num_feat; i++)
00249                     fv[sv[i].feat_index]= sv[i].entry;
00250             }
00251 
00252             free_sparse_feature_vector(sv, num, vfree);
00253 
00254             return fv;
00255         }
00256 
00263         void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
00264         {
00265             if (num>=num_vectors)
00266             {
00267                 SG_ERROR("Index out of bounds (number of vectors %d, you "
00268                         "requested %d)\n", num_vectors, num);
00269             }
00270 
00271             bool vfree;
00272             int32_t num_feat=0;
00273             *len=0;
00274             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00275 
00276             if (sv)
00277             {
00278                 *len=num_features;
00279                 *dst= (ST*) malloc(sizeof(ST)*num_features);
00280                 memset(*dst, 0, sizeof(ST)*num_features);
00281 
00282                 for (int32_t i=0; i<num_feat; i++)
00283                     (*dst)[sv[i].feat_index]= sv[i].entry;
00284             }
00285 
00286             free_sparse_feature_vector(sv, num, vfree);
00287         }
00288 
00289 
00295         virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00296         {
00297             bool vfree;
00298             int32_t len;
00299             TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00300             free_sparse_feature_vector(sv, num, vfree);
00301             return len;
00302         }
00303 
        /** obtain sparse feature vector num
         *
         * If a sparse matrix is held in memory, a pointer into it is returned
         * directly (vfree=false). Otherwise the vector is looked up in /
         * inserted into the feature cache or computed on the fly, in which
         * case vfree=true tells the caller to release it via
         * free_sparse_feature_vector().
         *
         * @param num vector index
         * @param len returns the number of entries
         * @param vfree returns whether the caller owns the returned array
         * @return the entry array, or NULL
         */
        TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
        {
            ASSERT(num<num_vectors);

            if (sparse_feature_matrix)
            {
                // in-memory matrix: hand out an aliasing pointer, never freed
                len= sparse_feature_matrix[num].num_feat_entries;
                vfree=false ;
                return sparse_feature_matrix[num].features;
            } 
            else
            {
                TSparseEntry<ST>* feat=NULL;
                vfree=false;

                if (feature_cache)
                {
                    feat=feature_cache->lock_entry(num);

                    if (feat)
                        return feat;
                    else
                    {
                        // not cached yet: reserve a cache slot to fill below
                        feat=feature_cache->set_entry(num);
                    }
                }

                // no cache slot available: compute into a fresh allocation
                // that the caller must free
                if (!feat)
                    vfree=true;

                feat=compute_sparse_feature_vector(num, len, feat);


                if (get_num_preproc())
                {
                    int32_t tmp_len=len;
                    TSparseEntry<ST>* tmp_feat_before = feat;
                    TSparseEntry<ST>* tmp_feat_after = NULL;

                    // NOTE(review): the actual apply call below is commented
                    // out, so tmp_feat_after stays NULL and the memcpy after
                    // this loop would dereference NULL whenever a
                    // preprocessor is registered -- confirm on-the-fly
                    // preprocessing is intentionally disabled here.
                    for (int32_t i=0; i<get_num_preproc(); i++)
                    {
                        //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len);

                        if (i!=0)   // delete intermediate vectors, except the first one, i.e., feat
                            delete[] tmp_feat_before;
                        tmp_feat_before=tmp_feat_after;
                    }

                    memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
                    delete[] tmp_feat_after;
                    len=tmp_len ;
                    SG_DEBUG( "len: %d len2: %d\n", len, num_features);
                }
                return feat ;
            }
        }
00370 
00371 
00382         ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00383         {
00384             ST result=0;
00385 
00386             //result remains zero when one of the vectors is non existent
00387             if (avec && bvec)
00388             {
00389                 if (alen<=blen)
00390                 {
00391                     int32_t j=0;
00392                     for (int32_t i=0; i<alen; i++)
00393                     {
00394                         int32_t a_feat_idx=avec[i].feat_index;
00395 
00396                         while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00397                             j++;
00398 
00399                         if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00400                         {
00401                             result+= avec[i].entry * bvec[j].entry;
00402                             j++;
00403                         }
00404                     }
00405                 }
00406                 else
00407                 {
00408                     int32_t j=0;
00409                     for (int32_t i=0; i<blen; i++)
00410                     {
00411                         int32_t b_feat_idx=bvec[i].feat_index;
00412 
00413                         while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00414                             j++;
00415 
00416                         if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00417                         {
00418                             result+= bvec[i].entry * avec[j].entry;
00419                             j++;
00420                         }
00421                     }
00422                 }
00423 
00424                 result*=alpha;
00425             }
00426 
00427             return result;
00428         }
00429 
00440         ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00441         {
00442             ASSERT(vec);
00443             ASSERT(dim==num_features);
00444             ST result=b;
00445 
00446             bool vfree;
00447             int32_t num_feat;
00448             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00449 
00450             if (sv)
00451             {
00452                 for (int32_t i=0; i<num_feat; i++)
00453                     result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00454             }
00455 
00456             free_sparse_feature_vector(sv, num, vfree);
00457             return result;
00458         }
00459 
00469         void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
00470         {
00471             ASSERT(vec);
00472             ASSERT(dim==num_features);
00473 
00474             bool vfree;
00475             int32_t num_feat;
00476             TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00477 
00478             if (sv)
00479             {
00480                 if (abs_val)
00481                 {
00482                     for (int32_t i=0; i<num_feat; i++)
00483                         vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
00484                 }
00485                 else
00486                 {
00487                     for (int32_t i=0; i<num_feat; i++)
00488                         vec[sv[i].feat_index]+= alpha*sv[i].entry;
00489                 }
00490             }
00491 
00492             free_sparse_feature_vector(sv, num, vfree);
00493         }
00494 
        /** release a vector obtained from get_sparse_feature_vector()
         *
         * @param feat_vec the vector to release
         * @param num its index (used to unlock the corresponding cache entry)
         * @param free true when the vector was heap-allocated for the caller
         */
        void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
        {
            if (feature_cache)
                feature_cache->unlock_entry(num);

            if (free)
                delete[] feat_vec ;
        } 
00509 
00517         TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00518         {
00519             num_feat=num_features;
00520             num_vec=num_vectors;
00521 
00522             return sparse_feature_matrix;
00523         }
00524 
00533         void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
00534                 int32_t* num_vec, int64_t* nnz)
00535         {
00536             *nnz=get_num_nonzero_entries();
00537             *num_feat=num_features;
00538             *num_vec=num_vectors;
00539             *dst=sparse_feature_matrix;
00540         }
00541 
00547         void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00548         {
00549             if (sfm)
00550             {
00551                 for (int32_t i=0; i<num_vec; i++)
00552                     delete[] sfm[i].features;
00553 
00554                 delete[] sfm;
00555             }
00556         }
00557 
        /** compute and return the transposed sparse matrix
         *
         * Two passes over the data: the first builds a histogram of entry
         * counts per feature (the lengths of the transposed vectors), the
         * second scatters every entry into its transposed position, reusing
         * the histogram as per-row write cursors.
         *
         * @param num_feat returns number of features of the transpose
         *                 (== num_vectors of this object)
         * @param num_vec returns number of vectors of the transpose
         *                (== num_features of this object)
         * @return newly allocated transposed matrix; caller frees it, e.g.
         *         via clean_tsparse()
         */
        TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
        {
            num_feat=num_vectors;
            num_vec=num_features;

            int32_t* hist=new int32_t[num_features];
            memset(hist, 0, sizeof(int32_t)*num_features);

            // pass 1: count the lengths of the future feature vectors
            for (int32_t v=0; v<num_vectors; v++)
            {
                int32_t vlen;
                bool vfree;
                TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

                for (int32_t i=0; i<vlen; i++)
                    hist[sv[i].feat_index]++;

                free_sparse_feature_vector(sv, v, vfree);
            }

            // allocate room for the future feature vectors
            TSparse<ST>* sfm=new TSparse<ST>[num_vec];
            for (int32_t v=0; v<num_vec; v++)
            {
                sfm[v].features= new TSparseEntry<ST>[hist[v]];
                sfm[v].num_feat_entries=hist[v];
                sfm[v].vec_index=v;
            }

            // pass 2: fill the future feature vectors with content;
            // hist is reset and reused as a per-row insertion cursor
            memset(hist,0,sizeof(int32_t)*num_features);
            for (int32_t v=0; v<num_vectors; v++)
            {
                int32_t vlen;
                bool vfree;
                TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

                for (int32_t i=0; i<vlen; i++)
                {
                    int32_t vidx=sv[i].feat_index;
                    int32_t fidx=v;
                    sfm[vidx].features[hist[vidx]].feat_index=fidx;
                    sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
                    hist[vidx]++;
                }

                free_sparse_feature_vector(sv, v, vfree);
            }

            delete[] hist;
            return sfm;
        }
00620 
00630         virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
00631         {
00632             free_sparse_feature_matrix();
00633 
00634             sparse_feature_matrix=src;
00635             num_features=num_feat;
00636             num_vectors=num_vec;
00637         }
00638 
00646         ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00647         {
00648             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00649             num_feat=num_features;
00650             num_vec=num_vectors;
00651 
00652             ST* fm=new ST[num_feat*num_vec];
00653 
00654             if (fm)
00655             {
00656                 for (int64_t i=0; i<num_feat*num_vec; i++)
00657                     fm[i]=0;
00658 
00659                 for (int32_t v=0; v<num_vec; v++)
00660                 {
00661                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00662                     {
00663                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00664                         fm[offs]= sparse_feature_matrix[v].features[f].entry;
00665                     }
00666                 }
00667             }
00668             else
00669                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00670 
00671             return fm;
00672         }
00673 
00681         void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00682         {
00683             SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00684             *num_feat=num_features;
00685             *num_vec=num_vectors;
00686 
00687             *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00688 
00689             if (*dst)
00690             {
00691                 for (int64_t i=0; i<num_features*num_vectors; i++)
00692                     (*dst)[i]=0;
00693 
00694                 for (int32_t v=0; v<num_vectors; v++)
00695                 {
00696                     for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00697                     {
00698                         int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00699                         (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00700                     }
00701                 }
00702             }
00703             else
00704                 SG_ERROR( "error allocating memory for dense feature matrix\n");
00705         }
00706 
00716         virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
00717         {
00718             free_sparse_feature_matrix();
00719             bool result=true;
00720             num_features=num_feat;
00721             num_vectors=num_vec;
00722 
00723             SG_INFO("converting dense feature matrix to sparse one\n");
00724             int32_t* num_feat_entries=new int[num_vectors];
00725 
00726             if (num_feat_entries)
00727             {
00728                 int32_t num_total_entries=0;
00729 
00730                 // count nr of non sparse features
00731                 for (int32_t i=0; i< num_vec; i++)
00732                 {
00733                     num_feat_entries[i]=0;
00734                     for (int32_t j=0; j< num_feat; j++)
00735                     {
00736                         if (src[i*((int64_t) num_feat) + j] != 0)
00737                             num_feat_entries[i]++;
00738                     }
00739                 }
00740 
00741                 if (num_vec>0)
00742                 {
00743                     sparse_feature_matrix=new TSparse<ST>[num_vec];
00744 
00745                     if (sparse_feature_matrix)
00746                     {
00747                         for (int32_t i=0; i< num_vec; i++)
00748                         {
00749                             sparse_feature_matrix[i].vec_index=i;
00750                             sparse_feature_matrix[i].num_feat_entries=0;
00751                             sparse_feature_matrix[i].features= NULL;
00752 
00753                             if (num_feat_entries[i]>0)
00754                             {
00755                                 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00756 
00757                                 if (!sparse_feature_matrix[i].features)
00758                                 {
00759                                     SG_INFO( "allocation of features failed\n");
00760                                     return false;
00761                                 }
00762 
00763                                 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00764                                 int32_t sparse_feat_idx=0;
00765 
00766                                 for (int32_t j=0; j< num_feat; j++)
00767                                 {
00768                                     int64_t pos= i*num_feat + j;
00769 
00770                                     if (src[pos] != 0)
00771                                     {
00772                                         sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00773                                         sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00774                                         sparse_feat_idx++;
00775                                         num_total_entries++;
00776                                     }
00777                                 }
00778                             }
00779                         }
00780                     }
00781                     else
00782                     {
00783                         SG_ERROR( "allocation of sparse feature matrix failed\n");
00784                         result=false;
00785                     }
00786 
00787                     SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00788                             num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00789                 }
00790                 else
00791                 {
00792                     SG_ERROR( "huh ? zero size matrix given ?\n");
00793                     result=false;
00794                 }
00795             }
00796             delete[] num_feat_entries;
00797             return result;
00798         }
00799 
00805         virtual bool apply_preproc(bool force_preprocessing=false)
00806         {
00807             SG_INFO( "force: %d\n", force_preprocessing);
00808 
00809             if ( sparse_feature_matrix && get_num_preproc() )
00810             {
00811                 for (int32_t i=0; i<get_num_preproc(); i++)
00812                 {
00813                     if ( (!is_preprocessed(i) || force_preprocessing) )
00814                     {
00815                         set_preprocessed(i);
00816                         SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00817                         if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00818                             return false;
00819                     }
00820                     return true;
00821                 }
00822                 return true;
00823             }
00824             else
00825             {
00826                 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00827                 return false;
00828             }
00829         }
00830 
00835         virtual int32_t get_size() { return sizeof(ST); }
00836 
00842         bool obtain_from_simple(CSimpleFeatures<ST>* sf)
00843         {
00844             int32_t num_feat=0;
00845             int32_t num_vec=0;
00846             ST* fm=sf->get_feature_matrix(num_feat, num_vec);
00847             ASSERT(fm && num_feat>0 && num_vec>0);
00848 
00849             return set_full_feature_matrix(fm, num_feat, num_vec);
00850         }
00851 
00856         virtual inline int32_t  get_num_vectors() { return num_vectors; }
00857 
00862         inline int32_t  get_num_features() { return num_features; }
00863 
        /** grow the number of features
         *
         * Only growing is allowed: num must be >= the current value (the
         * ASSERT enforces this). Existing sparse entries are unaffected.
         *
         * @param num new number of features
         * @return the new number of features
         */
        inline int32_t set_num_features(int32_t num)
        {
            int32_t n=num_features;
            ASSERT(n<=num);
            num_features=num;
            return num_features;
        }
00882 
00887         inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }
00888 
00893         inline virtual EFeatureType get_feature_type();
00894 
00901         void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
00902         {
00903             if (feature_cache)
00904                 feature_cache->unlock_entry(num);
00905 
00906             if (free)
00907                 delete[] feat_vec ;
00908         }
00909 
00914         int64_t get_num_nonzero_entries()
00915         {
00916             int64_t num=0;
00917             for (int32_t i=0; i<num_vectors; i++)
00918                 num+=sparse_feature_matrix[i].num_feat_entries;
00919 
00920             return num;
00921         }
00922 
        /** fill sq with the squared L2 norm of every feature vector
         *
         * NOTE(review): the casts to CSparseFeatures<float64_t> suggest this
         * is only meant to be called on float64_t-typed features -- confirm
         * with callers before using on other ST.
         *
         * @param sq caller-allocated array of length get_num_vectors()
         * @return sq (for convenience)
         */
        float64_t* compute_squared(float64_t* sq)
        {
            ASSERT(sq);

            int32_t len=0;
            bool do_free=false;

            for (int32_t i=0; i<this->get_num_vectors(); i++)
            {
                sq[i]=0;
                TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);

                // sum of squared entries; absent dimensions contribute 0
                for (int32_t j=0; j<len; j++)
                    sq[i] += vec[j].entry * vec[j].entry;

                ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
            }

            return sq;
        }
00948 
        /** squared euclidean distance between vector idx_a of lhs and vector
         * idx_b of rhs, given precomputed squared norms
         *
         * Uses ||a-b||^2 = ||a||^2 + ||b||^2 - 2 a.b; the dot product is
         * evaluated by merge-scanning over the shorter vector (entries
         * presumably sorted by feat_index, as in sparse_dot). The final
         * CMath::abs guards against small negative values from floating
         * point rounding.
         *
         * @param lhs left-hand-side features
         * @param sq_lhs precomputed squared norms of lhs (see compute_squared)
         * @param idx_a vector index in lhs
         * @param rhs right-hand-side features
         * @param sq_rhs precomputed squared norms of rhs
         * @param idx_b vector index in rhs
         * @return squared distance
         */
        float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
        {
            int32_t i,j;
            int32_t alen, blen;
            bool afree, bfree;
            ASSERT(lhs);
            ASSERT(rhs);

            TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
            TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
            ASSERT(avec);
            ASSERT(bvec);

            // start from the sum of the two squared norms, then subtract
            // twice the dot product
            float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

            if (alen<=blen)
            {
                j=0;
                for (i=0; i<alen; i++)
                {
                    int32_t a_feat_idx=avec[i].feat_index;

                    while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
                        j++;

                    if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
                    {
                        result-=2*(avec[i].entry*bvec[j].entry);
                        j++;
                    }
                }
            }
            else
            {
                // symmetric case: iterate over the shorter vector (bvec)
                j=0;
                for (i=0; i<blen; i++)
                {
                    int32_t b_feat_idx=bvec[i].feat_index;

                    while ((j<alen) && (avec[j].feat_index<b_feat_idx))
                        j++;

                    if ((j<alen) && (avec[j].feat_index == b_feat_idx))
                    {
                        result-=2*(bvec[i].entry*avec[j].entry);
                        j++;
                    }
                }
            }

            ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
            ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

            return CMath::abs(result);
        }
01016 
01024         CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01025         {
01026             CLabels* lab=NULL;
01027 
01028             size_t blocksize=1024*1024;
01029             size_t required_blocksize=blocksize;
01030             uint8_t* dummy=new uint8_t[blocksize];
01031             FILE* f=fopen(fname, "ro");
01032 
01033             if (f)
01034             {
01035                 free_sparse_feature_matrix();
01036                 num_vectors=0;
01037                 num_features=0;
01038 
01039                 SG_INFO("counting line numbers in file %s\n", fname);
01040                 size_t sz=blocksize;
01041                 size_t block_offs=0;
01042                 size_t old_block_offs=0;
01043                 fseek(f, 0, SEEK_END);
01044                 size_t fsize=ftell(f);
01045                 rewind(f);
01046 
01047                 while (sz == blocksize)
01048                 {
01049                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01050                     bool contains_cr=false;
01051                     for (size_t i=0; i<sz; i++)
01052                     {
01053                         block_offs++;
01054                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01055                         {
01056                             num_vectors++;
01057                             contains_cr=true;
01058                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01059                             old_block_offs=block_offs;
01060                         }
01061                     }
01062                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01063                 }
01064 
01065                 SG_INFO("found %d feature vectors\n", num_vectors);
01066                 delete[] dummy;
01067                 blocksize=required_blocksize;
01068                 dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL
01069 
01070                 lab=new CLabels(num_vectors);
01071                 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01072 
01073                 rewind(f);
01074                 sz=blocksize;
01075                 int32_t lines=0;
01076                 while (sz == blocksize)
01077                 {
01078                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01079 
01080                     size_t old_sz=0;
01081                     for (size_t i=0; i<sz; i++)
01082                     {
01083                         if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01084                         {
01085                             size_t len=i-old_sz+1;
01086                             uint8_t* data=&dummy[old_sz];
01087 
01088                             for (int32_t j=0; j<len; j++)
01089                                 dummy[j]=data[j];
01090 
01091                             sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01092                             i=0;
01093                             old_sz=0;
01094                             sz+=len;
01095                         }
01096 
01097                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01098                         {
01099 
01100                             size_t len=i-old_sz;
01101                             uint8_t* data=&dummy[old_sz];
01102 
01103                             int32_t dims=0;
01104                             for (int32_t j=0; j<len; j++)
01105                             {
01106                                 if (data[j]==':')
01107                                     dims++;
01108                             }
01109 
01110                             if (dims<=0)
01111                             {
01112                                 SG_ERROR("Error in line %d - number of"
01113                                         " dimensions is %d line is %d characters"
01114                                         " long\n line_content:'%.*s'\n", lines,
01115                                         dims, len, len, (const char*) data);
01116                             }
01117 
01118                             TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01119                             int32_t j=0;
01120                             for (; j<len; j++)
01121                             {
01122                                 if (data[j]==' ')
01123                                 {
01124                                     data[j]='\0';
01125 
01126                                     lab->set_label(lines, atof((const char*) data));
01127                                     break;
01128                                 }
01129                             }
01130 
01131                             int32_t d=0;
01132                             j++;
01133                             uint8_t* start=&data[j];
01134                             for (; j<len; j++)
01135                             {
01136                                 if (data[j]==':')
01137                                 {
01138                                     data[j]='\0';
01139 
01140                                     feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01141                                     num_features=CMath::max(num_features, feat[d].feat_index+1);
01142 
01143                                     j++;
01144                                     start=&data[j];
01145                                     for (; j<len; j++)
01146                                     {
01147                                         if (data[j]==' ' || data[j]=='\n')
01148                                         {
01149                                             data[j]='\0';
01150                                             feat[d].entry=(ST) atof((const char*) start);
01151                                             d++;
01152                                             break;
01153                                         }
01154                                     }
01155 
01156                                     if (j==len)
01157                                     {
01158                                         data[j]='\0';
01159                                         feat[dims-1].entry=(ST) atof((const char*) start);
01160                                     }
01161 
01162                                     j++;
01163                                     start=&data[j];
01164                                 }
01165                             }
01166 
01167                             sparse_feature_matrix[lines].vec_index=lines;
01168                             sparse_feature_matrix[lines].num_feat_entries=dims;
01169                             sparse_feature_matrix[lines].features=feat;
01170 
01171                             old_sz=i+1;
01172                             lines++;
01173                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01174                         }
01175                     }
01176                 }
01177                 SG_INFO("file successfully read\n");
01178                 fclose(f);
01179             }
01180 
01181             delete[] dummy;
01182 
01183             if (do_sort_features)
01184                 sort_features();
01185 
01186             return lab;
01187         }
01188 
01191         void sort_features()
01192         {
01193             ASSERT(get_num_preproc()==0);
01194 
01195             if (!sparse_feature_matrix)
01196                 SG_ERROR("Requires sparse feature matrix to be available in-memory\n");
01197 
01198             for (int32_t i=0; i<num_vectors; i++)
01199             {
01200                 int32_t len=sparse_feature_matrix[i].num_feat_entries;
01201 
01202                 if (!len)
01203                     continue;
01204 
01205                 TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
01206                 int32_t* feat_idx=new int32_t[len];
01207                 int32_t* orig_idx=new int32_t[len];
01208 
01209                 for (int j=0; j<len; j++)
01210                 {
01211                     feat_idx[j]=sf_orig[j].feat_index;
01212                     orig_idx[j]=j;
01213                 }
01214 
01215                 CMath::qsort_index(feat_idx, orig_idx, len);
01216 
01217                 TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len];
01218                 for (int j=0; j<len; j++)
01219                     sf_new[j]=sf_orig[orig_idx[j]];
01220 
01221                 sparse_feature_matrix[i].features=sf_new;
01222 
01223                 // sanity check
01224                 for (int j=0; j<len-1; j++)
01225                     ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);
01226 
01227                 delete[] orig_idx;
01228                 delete[] feat_idx;
01229                 delete[] sf_orig;
01230             }
01231         }
01232 
01239         bool write_svmlight_file(char* fname, CLabels* label)
01240         {
01241             ASSERT(label);
01242             int32_t num=label->get_num_labels();
01243             ASSERT(num>0);
01244             ASSERT(num==num_vectors);
01245 
01246             FILE* f=fopen(fname, "wb");
01247 
01248             if (f)
01249             {
01250                 for (int32_t i=0; i<num; i++)
01251                 {
01252                     fprintf(f, "%d ", (int32_t) label->get_int_label(i));
01253 
01254                     TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
01255                     int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;
01256 
01257                     for (int32_t j=0; j<num_feat; j++)
01258                     {
01259                         if (j<num_feat-1)
01260                             fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01261                         else
01262                             fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
01263                     }
01264                 }
01265 
01266                 fclose(f);
01267                 return true;
01268             }
01269             return false;
01270         }
01271 
        /** obtain the dimensionality of the feature space
         *
         * @return number of features, i.e. highest feature index + 1
         */
        virtual int32_t get_dim_feature_space()
        {
            return num_features;
        }
01283 
01290         virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2)
01291         {
01292             bool afree, bfree;
01293             int32_t alen, blen;
01294             TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
01295             TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree);
01296 
01297             float64_t result=sparse_dot(1, avec, alen, bvec, blen);
01298 
01299             free_sparse_feature_vector(avec, vec_idx1, afree);
01300             free_sparse_feature_vector(bvec, vec_idx2, bfree);
01301 
01302             return result;
01303         }
01304 
01311         virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01312         {
01313             ASSERT(vec2);
01314             ASSERT(vec2_len==num_features);
01315             float64_t result=0;
01316 
01317             bool vfree;
01318             int32_t vlen;
01319             TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01320 
01321             if (sv)
01322             {
01323                 for (int32_t i=0; i<vlen; i++)
01324                     result+=vec2[sv[i].feat_index]*sv[i].entry;
01325             }
01326 
01327             free_sparse_feature_vector(sv, vec_idx1, vfree);
01328 
01329             return result;
01330         }
01331 
01333         inline virtual const char* get_name() const { return "SparseFeatures"; }
01334 
    protected:
        /** compute feature vector for sample num
         *
         * Base-class stub: always sets len to 0 and returns NULL. Subclasses
         * that compute vectors on the fly are expected to override this.
         *
         * @param num index of the feature vector to compute
         * @param len output: number of sparse entries (0 in this stub)
         * @param target optional preallocated target buffer (unused here)
         * @return computed entries, NULL in this stub
         */
        virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
        {
            len=0;
            return NULL;
        }
01351 
    protected:

        /// total number of feature vectors
        int32_t num_vectors;

        /// total number of features, i.e. dimensionality of the feature space
        int32_t num_features;

        /// array of sparse vectors of size num_vectors
        TSparse<ST>* sparse_feature_matrix;

        /// feature cache for vectors computed on demand
        CCache< TSparseEntry<ST> >* feature_cache;
01365 };
01366 
/** get feature type the BOOL sparse features can deal with
 * @return feature type BOOL
 */
template<> inline EFeatureType CSparseFeatures<bool>::get_feature_type()
{
    return F_BOOL;
}

/** get feature type the CHAR sparse features can deal with
 * @return feature type CHAR
 */
template<> inline EFeatureType CSparseFeatures<char>::get_feature_type()
{
    return F_CHAR;
}

/** get feature type the BYTE sparse features can deal with
 * @return feature type BYTE
 */
template<> inline EFeatureType CSparseFeatures<uint8_t>::get_feature_type()
{
    return F_BYTE;
}

/** get feature type the SHORT sparse features can deal with
 * @return feature type SHORT
 */
template<> inline EFeatureType CSparseFeatures<int16_t>::get_feature_type()
{
    return F_SHORT;
}

/** get feature type the WORD sparse features can deal with
 * @return feature type WORD
 */
template<> inline EFeatureType CSparseFeatures<uint16_t>::get_feature_type()
{
    return F_WORD;
}

/** get feature type the INT sparse features can deal with
 * @return feature type INT
 */
template<> inline EFeatureType CSparseFeatures<int32_t>::get_feature_type()
{
    return F_INT;
}

/** get feature type the UINT sparse features can deal with
 * @return feature type UINT
 */
template<> inline EFeatureType CSparseFeatures<uint32_t>::get_feature_type()
{
    return F_UINT;
}

/** get feature type the LONG sparse features can deal with
 * @return feature type LONG
 */
template<> inline EFeatureType CSparseFeatures<int64_t>::get_feature_type()
{
    return F_LONG;
}

/** get feature type the ULONG sparse features can deal with
 * @return feature type ULONG
 */
template<> inline EFeatureType CSparseFeatures<uint64_t>::get_feature_type()
{
    return F_ULONG;
}

/** get feature type the SHORTREAL sparse features can deal with
 * @return feature type SHORTREAL
 */
template<> inline EFeatureType CSparseFeatures<float32_t>::get_feature_type()
{
    return F_SHORTREAL;
}

/** get feature type the DREAL sparse features can deal with
 * @return feature type DREAL
 */
template<> inline EFeatureType CSparseFeatures<float64_t>::get_feature_type()
{
    return F_DREAL;
}

/** get feature type the LONGREAL sparse features can deal with
 * @return feature type LONGREAL
 */
template<> inline EFeatureType CSparseFeatures<floatmax_t>::get_feature_type()
{
    return F_LONGREAL;
}
01474 }
01475 #endif /* _SPARSEFEATURES__H__ */

SHOGUN Machine Learning Toolbox - Documentation