00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _SPARSEFEATURES__H__
00013 #define _SPARSEFEATURES__H__
00014
00015 #include <string.h>
00016 #include <stdlib.h>
00017
00018 #include "lib/common.h"
00019 #include "lib/Mathematics.h"
00020 #include "lib/Cache.h"
00021 #include "lib/io.h"
00022 #include "lib/Cache.h"
00023
00024 #include "features/Labels.h"
00025 #include "features/Features.h"
00026 #include "features/DotFeatures.h"
00027 #include "features/SimpleFeatures.h"
00028 #include "preproc/SparsePreProc.h"
00029
00030 namespace shogun
00031 {
00032
00033 class CLabels;
00034 class CFeatures;
00035 class CDotFeatures;
00036 template <class ST> class CSimpleFeatures;
00037 template <class ST> class CSparsePreProc;
00038
00039 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00040
/** One nonzero element of a sparse vector: (dimension, value) pair.
 * Plain POD — copied around with memcpy throughout this header. */
template <class ST> struct TSparseEntry
{
	/** index of the feature (dimension) this entry belongs to */
	int32_t feat_index;
	/** value stored at that dimension */
	ST entry;
};
00048
/** One sparse vector: a length-prefixed array of TSparseEntry.
 * Plain POD — copied around with memcpy throughout this header. */
template <class ST> struct TSparse
{
public:
	/** index of this vector inside the feature matrix */
	int32_t vec_index;
	/** number of nonzero entries in features[] */
	int32_t num_feat_entries;
	/** pointer to the entries (owned by the containing matrix) */
	TSparseEntry<ST>* features;
};
00060 #endif // DOXYGEN_SHOULD_SKIP_THIS
00061
00074 template <class ST> class CSparseFeatures : public CDotFeatures
00075 {
00076 public:
		/** default constructor
		 *
		 * @param size cache size in MB handed to CDotFeatures; no matrix
		 *             is allocated until features are set or loaded
		 */
		CSparseFeatures(int32_t size=0)
		: CDotFeatures(size), num_vectors(0), num_features(0),
			sparse_feature_matrix(NULL), feature_cache(NULL)
		{}
00085
00094 CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false)
00095 : CDotFeatures(0), num_vectors(0), num_features(0),
00096 sparse_feature_matrix(NULL), feature_cache(NULL)
00097 {
00098 if (!copy)
00099 set_sparse_feature_matrix(src, num_feat, num_vec);
00100 else
00101 {
00102 sparse_feature_matrix = new TSparse<ST>[num_vec];
00103 memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec);
00104 for (int32_t i=0; i< num_vec; i++)
00105 {
00106 sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00107 memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00108
00109 }
00110 }
00111 }
00112
		/** construct from a dense matrix, sparsifying it
		 *
		 * @param src dense matrix, vector-major, num_feat entries per vector
		 * @param num_feat number of features per vector
		 * @param num_vec number of vectors
		 */
		CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec)
		: CDotFeatures(0), num_vectors(0), num_features(0),
			sparse_feature_matrix(NULL), feature_cache(NULL)
		{
			set_full_feature_matrix(src, num_feat, num_vec);
		}
00126
00128 CSparseFeatures(const CSparseFeatures & orig)
00129 : CDotFeatures(orig), num_vectors(orig.num_vectors),
00130 num_features(orig.num_features),
00131 sparse_feature_matrix(orig.sparse_feature_matrix),
00132 feature_cache(orig.feature_cache)
00133 {
00134 if (orig.sparse_feature_matrix)
00135 {
00136 free_sparse_feature_matrix();
00137 sparse_feature_matrix=new TSparse<ST>[num_vectors];
00138 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors);
00139 for (int32_t i=0; i< num_vectors; i++)
00140 {
00141 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries];
00142 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries);
00143
00144 }
00145 }
00146 }
00147
		/** constructor loading features from a file
		 *
		 * @param fname filename passed through to the CDotFeatures base
		 */
		CSparseFeatures(char* fname)
		: CDotFeatures(fname), num_vectors(0), num_features(0),
			sparse_feature_matrix(NULL), feature_cache(NULL)
		{}
00156
		/** destructor — releases the sparse matrix and the feature cache */
		virtual ~CSparseFeatures()
		{
			free_sparse_features();
		}
00161
		/** free the in-memory sparse matrix and reset the dimensions to 0 */
		void free_sparse_feature_matrix()
		{
			clean_tsparse(sparse_feature_matrix, num_vectors);
			sparse_feature_matrix = NULL;
			num_vectors=0;
			num_features=0;
		}
00172
		/** free the sparse matrix and additionally destroy the feature cache */
		void free_sparse_features()
		{
			free_sparse_feature_matrix();
			delete feature_cache;
			feature_cache = NULL;
		}
00182
		/** duplicate this feature object via the copy constructor
		 *
		 * @return newly allocated copy (caller owns it)
		 */
		virtual CFeatures* duplicate() const
		{
			return new CSparseFeatures<ST>(*this);
		}
00191
00199 ST get_feature(int32_t num, int32_t index)
00200 {
00201 ASSERT(index>=0 && index<num_features) ;
00202 ASSERT(num>=0 && num<num_vectors) ;
00203
00204 bool vfree;
00205 int32_t num_feat;
00206 int32_t i;
00207 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00208 ST ret = 0 ;
00209
00210 if (sv)
00211 {
00212 for (i=0; i<num_feat; i++)
00213 if (sv[i].feat_index==index)
00214 ret += sv[i].entry ;
00215 }
00216
00217 free_sparse_feature_vector(sv, num, vfree);
00218
00219 return ret ;
00220 }
00221
00222
		/** densify one sparse vector
		 *
		 * @param num vector index
		 * @param len receives num_features on success, 0 otherwise
		 * @return newly allocated dense vector (caller must delete[]),
		 *         or NULL if the sparse vector is unavailable
		 */
		ST* get_full_feature_vector(int32_t num, int32_t& len)
		{
			bool vfree;
			int32_t num_feat;
			int32_t i;
			len=0;
			TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
			ST* fv=NULL;

			if (sv)
			{
				len=num_features;
				fv=new ST[num_features];

				// zero-fill, then scatter the nonzero entries
				for (i=0; i<num_features; i++)
					fv[i]=0;

				for (i=0; i<num_feat; i++)
					fv[sv[i].feat_index]= sv[i].entry;
			}

			free_sparse_feature_vector(sv, num, vfree);

			return fv;
		}
00256
		/** densify one sparse vector into a malloc'd buffer (SWIG-style
		 * output parameters)
		 *
		 * @param dst receives a malloc'd dense vector (caller must free())
		 * @param len receives num_features on success, 0 otherwise
		 * @param num vector index; errors out when >= num_vectors
		 *            (NOTE(review): negative num is not checked here —
		 *            only the ASSERT inside get_sparse_feature_vector
		 *            guards it indirectly; confirm callers)
		 */
		void get_full_feature_vector(ST** dst, int32_t* len, int32_t num)
		{
			if (num>=num_vectors)
			{
				SG_ERROR("Index out of bounds (number of vectors %d, you "
						"requested %d)\n", num_vectors, num);
			}

			bool vfree;
			int32_t num_feat=0;
			*len=0;
			TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

			if (sv)
			{
				*len=num_features;
				*dst= (ST*) malloc(sizeof(ST)*num_features);
				memset(*dst, 0, sizeof(ST)*num_features);

				for (int32_t i=0; i<num_feat; i++)
					(*dst)[sv[i].feat_index]= sv[i].entry;
			}

			free_sparse_feature_vector(sv, num, vfree);
		}
00288
00289
00295 virtual inline int32_t get_nnz_features_for_vector(int32_t num)
00296 {
00297 bool vfree;
00298 int32_t len;
00299 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree);
00300 free_sparse_feature_vector(sv, num, vfree);
00301 return len;
00302 }
00303
		/** Obtain sparse vector num. Served directly from the in-memory
		 * matrix when present (vfree=false); otherwise from the cache or
		 * computed on the fly via compute_sparse_feature_vector(), in
		 * which case vfree=true means the caller must release it through
		 * free_sparse_feature_vector().
		 *
		 * @param num vector index
		 * @param len receives the number of entries
		 * @param vfree receives whether the returned array must be freed
		 * @return pointer to the entries (possibly NULL)
		 */
		TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree)
		{
			ASSERT(num<num_vectors);

			if (sparse_feature_matrix)
			{
				len= sparse_feature_matrix[num].num_feat_entries;
				vfree=false ;
				return sparse_feature_matrix[num].features;
			}
			else
			{
				TSparseEntry<ST>* feat=NULL;
				vfree=false;

				if (feature_cache)
				{
					feat=feature_cache->lock_entry(num);

					if (feat)
						return feat;
					else
					{
						feat=feature_cache->set_entry(num);
					}
				}

				// not cached: caller owns the computed vector
				if (!feat)
					vfree=true;

				feat=compute_sparse_feature_vector(num, len, feat);


				if (get_num_preproc())
				{
					int32_t tmp_len=len;
					TSparseEntry<ST>* tmp_feat_before = feat;
					TSparseEntry<ST>* tmp_feat_after = NULL;

					// NOTE(review): the loop body below contains no call that
					// assigns tmp_feat_after — the preprocessor application
					// appears to have been lost (extraction artifact?); as
					// written the memcpy below would read from NULL. Confirm
					// against the original source before relying on the
					// preprocessing path.
					for (int32_t i=0; i<get_num_preproc(); i++)
					{


						if (i!=0)
							delete[] tmp_feat_before;
						tmp_feat_before=tmp_feat_after;
					}

					memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len);
					delete[] tmp_feat_after;
					len=tmp_len ;
					SG_DEBUG( "len: %d len2: %d\n", len, num_features);
				}
				return feat ;
			}
		}
00370
00371
00382 ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen)
00383 {
00384 ST result=0;
00385
00386
00387 if (avec && bvec)
00388 {
00389 if (alen<=blen)
00390 {
00391 int32_t j=0;
00392 for (int32_t i=0; i<alen; i++)
00393 {
00394 int32_t a_feat_idx=avec[i].feat_index;
00395
00396 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) )
00397 j++;
00398
00399 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) )
00400 {
00401 result+= avec[i].entry * bvec[j].entry;
00402 j++;
00403 }
00404 }
00405 }
00406 else
00407 {
00408 int32_t j=0;
00409 for (int32_t i=0; i<blen; i++)
00410 {
00411 int32_t b_feat_idx=bvec[i].feat_index;
00412
00413 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) )
00414 j++;
00415
00416 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) )
00417 {
00418 result+= bvec[i].entry * avec[j].entry;
00419 j++;
00420 }
00421 }
00422 }
00423
00424 result*=alpha;
00425 }
00426
00427 return result;
00428 }
00429
00440 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b)
00441 {
00442 ASSERT(vec);
00443 ASSERT(dim==num_features);
00444 ST result=b;
00445
00446 bool vfree;
00447 int32_t num_feat;
00448 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);
00449
00450 if (sv)
00451 {
00452 for (int32_t i=0; i<num_feat; i++)
00453 result+=alpha*vec[sv[i].feat_index]*sv[i].entry;
00454 }
00455
00456 free_sparse_feature_vector(sv, num, vfree);
00457 return result;
00458 }
00459
		/** add alpha * sparse_vector[num] (or alpha * |entries| when
		 * abs_val) onto a dense accumulator vector
		 *
		 * @param alpha scalar multiplier
		 * @param num index of the sparse vector
		 * @param vec dense vector updated in place
		 * @param dim length of vec; must equal num_features
		 * @param abs_val when true, add absolute values of the entries
		 */
		void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false)
		{
			ASSERT(vec);
			ASSERT(dim==num_features);

			bool vfree;
			int32_t num_feat;
			TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree);

			if (sv)
			{
				if (abs_val)
				{
					for (int32_t i=0; i<num_feat; i++)
						vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry);
				}
				else
				{
					for (int32_t i=0; i<num_feat; i++)
						vec[sv[i].feat_index]+= alpha*sv[i].entry;
				}
			}

			free_sparse_feature_vector(sv, num, vfree);
		}
00494
		/** release a vector obtained from get_sparse_feature_vector():
		 * unlocks the cache entry and delete[]s the array when the caller
		 * was flagged as owner (free==true)
		 *
		 * @param feat_vec vector to release
		 * @param num vector index (for cache unlock)
		 * @param free whether the array is caller-owned
		 */
		void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
		{
			if (feature_cache)
				feature_cache->unlock_entry(num);

			if (free)
				delete[] feat_vec ;
		}
00509
		/** expose the internal sparse matrix (no copy, still owned here)
		 *
		 * @param num_feat receives num_features
		 * @param num_vec receives num_vectors
		 * @return the internal matrix pointer
		 */
		TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec)
		{
			num_feat=num_features;
			num_vec=num_vectors;

			return sparse_feature_matrix;
		}
00524
		/** expose the internal sparse matrix via output parameters
		 * (no copy, still owned here)
		 *
		 * @param dst receives the matrix pointer
		 * @param num_feat receives num_features
		 * @param num_vec receives num_vectors
		 * @param nnz receives the total number of nonzero entries
		 */
		void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat,
				int32_t* num_vec, int64_t* nnz)
		{
			*nnz=get_num_nonzero_entries();
			*num_feat=num_features;
			*num_vec=num_vectors;
			*dst=sparse_feature_matrix;
		}
00541
00547 void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec)
00548 {
00549 if (sfm)
00550 {
00551 for (int32_t i=0; i<num_vec; i++)
00552 delete[] sfm[i].features;
00553
00554 delete[] sfm;
00555 }
00556 }
00557
		/** build the transposed matrix (features become vectors and vice
		 * versa) in two passes: first a histogram of entries per feature,
		 * then a scatter pass filling the pre-sized rows
		 *
		 * @param num_feat receives the transposed feature count (=num_vectors)
		 * @param num_vec receives the transposed vector count (=num_features)
		 * @return newly allocated transposed matrix (caller owns it)
		 */
		TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec)
		{
			num_feat=num_vectors;
			num_vec=num_features;

			// pass 1: count how many entries land in each transposed row
			int32_t* hist=new int32_t[num_features];
			memset(hist, 0, sizeof(int32_t)*num_features);


			for (int32_t v=0; v<num_vectors; v++)
			{
				int32_t vlen;
				bool vfree;
				TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

				for (int32_t i=0; i<vlen; i++)
					hist[sv[i].feat_index]++;

				free_sparse_feature_vector(sv, v, vfree);
			}


			// allocate rows sized from the histogram
			TSparse<ST>* sfm=new TSparse<ST>[num_vec];
			for (int32_t v=0; v<num_vec; v++)
			{
				sfm[v].features= new TSparseEntry<ST>[hist[v]];
				sfm[v].num_feat_entries=hist[v];
				sfm[v].vec_index=v;
			}


			// pass 2: reuse hist as per-row write cursor and scatter entries
			memset(hist,0,sizeof(int32_t)*num_features);
			for (int32_t v=0; v<num_vectors; v++)
			{
				int32_t vlen;
				bool vfree;
				TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree);

				for (int32_t i=0; i<vlen; i++)
				{
					int32_t vidx=sv[i].feat_index;
					int32_t fidx=v;
					sfm[vidx].features[hist[vidx]].feat_index=fidx;
					sfm[vidx].features[hist[vidx]].entry=sv[i].entry;
					hist[vidx]++;
				}

				free_sparse_feature_vector(sv, v, vfree);
			}

			delete[] hist;
			return sfm;
		}
00620
		/** install a sparse matrix, taking ownership of src; any previous
		 * matrix is freed first
		 *
		 * @param src sparse matrix (ownership transferred)
		 * @param num_feat number of features
		 * @param num_vec number of vectors
		 */
		virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec)
		{
			free_sparse_feature_matrix();

			sparse_feature_matrix=src;
			num_features=num_feat;
			num_vectors=num_vec;
		}
00638
00646 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec)
00647 {
00648 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00649 num_feat=num_features;
00650 num_vec=num_vectors;
00651
00652 ST* fm=new ST[num_feat*num_vec];
00653
00654 if (fm)
00655 {
00656 for (int64_t i=0; i<num_feat*num_vec; i++)
00657 fm[i]=0;
00658
00659 for (int32_t v=0; v<num_vec; v++)
00660 {
00661 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00662 {
00663 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index;
00664 fm[offs]= sparse_feature_matrix[v].features[f].entry;
00665 }
00666 }
00667 }
00668 else
00669 SG_ERROR( "error allocating memory for dense feature matrix\n");
00670
00671 return fm;
00672 }
00673
00681 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec)
00682 {
00683 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features);
00684 *num_feat=num_features;
00685 *num_vec=num_vectors;
00686
00687 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors);
00688
00689 if (*dst)
00690 {
00691 for (int64_t i=0; i<num_features*num_vectors; i++)
00692 (*dst)[i]=0;
00693
00694 for (int32_t v=0; v<num_vectors; v++)
00695 {
00696 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++)
00697 {
00698 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index;
00699 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry;
00700 }
00701 }
00702 }
00703 else
00704 SG_ERROR( "error allocating memory for dense feature matrix\n");
00705 }
00706
00716 virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec)
00717 {
00718 free_sparse_feature_matrix();
00719 bool result=true;
00720 num_features=num_feat;
00721 num_vectors=num_vec;
00722
00723 SG_INFO("converting dense feature matrix to sparse one\n");
00724 int32_t* num_feat_entries=new int[num_vectors];
00725
00726 if (num_feat_entries)
00727 {
00728 int32_t num_total_entries=0;
00729
00730
00731 for (int32_t i=0; i< num_vec; i++)
00732 {
00733 num_feat_entries[i]=0;
00734 for (int32_t j=0; j< num_feat; j++)
00735 {
00736 if (src[i*((int64_t) num_feat) + j] != 0)
00737 num_feat_entries[i]++;
00738 }
00739 }
00740
00741 if (num_vec>0)
00742 {
00743 sparse_feature_matrix=new TSparse<ST>[num_vec];
00744
00745 if (sparse_feature_matrix)
00746 {
00747 for (int32_t i=0; i< num_vec; i++)
00748 {
00749 sparse_feature_matrix[i].vec_index=i;
00750 sparse_feature_matrix[i].num_feat_entries=0;
00751 sparse_feature_matrix[i].features= NULL;
00752
00753 if (num_feat_entries[i]>0)
00754 {
00755 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]];
00756
00757 if (!sparse_feature_matrix[i].features)
00758 {
00759 SG_INFO( "allocation of features failed\n");
00760 return false;
00761 }
00762
00763 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i];
00764 int32_t sparse_feat_idx=0;
00765
00766 for (int32_t j=0; j< num_feat; j++)
00767 {
00768 int64_t pos= i*num_feat + j;
00769
00770 if (src[pos] != 0)
00771 {
00772 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos];
00773 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j;
00774 sparse_feat_idx++;
00775 num_total_entries++;
00776 }
00777 }
00778 }
00779 }
00780 }
00781 else
00782 {
00783 SG_ERROR( "allocation of sparse feature matrix failed\n");
00784 result=false;
00785 }
00786
00787 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n",
00788 num_total_entries, num_feat*num_vec, (100.0*num_total_entries)/(num_feat*num_vec));
00789 }
00790 else
00791 {
00792 SG_ERROR( "huh ? zero size matrix given ?\n");
00793 result=false;
00794 }
00795 }
00796 delete[] num_feat_entries;
00797 return result;
00798 }
00799
00805 virtual bool apply_preproc(bool force_preprocessing=false)
00806 {
00807 SG_INFO( "force: %d\n", force_preprocessing);
00808
00809 if ( sparse_feature_matrix && get_num_preproc() )
00810 {
00811 for (int32_t i=0; i<get_num_preproc(); i++)
00812 {
00813 if ( (!is_preprocessed(i) || force_preprocessing) )
00814 {
00815 set_preprocessed(i);
00816 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name());
00817 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL)
00818 return false;
00819 }
00820 return true;
00821 }
00822 return true;
00823 }
00824 else
00825 {
00826 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n");
00827 return false;
00828 }
00829 }
00830
00835 virtual int32_t get_size() { return sizeof(ST); }
00836
		/** build this sparse object from a dense CSimpleFeatures object
		 *
		 * @param sf dense features to convert; must be non-empty
		 * @return result of set_full_feature_matrix()
		 */
		bool obtain_from_simple(CSimpleFeatures<ST>* sf)
		{
			int32_t num_feat=0;
			int32_t num_vec=0;
			ST* fm=sf->get_feature_matrix(num_feat, num_vec);
			ASSERT(fm && num_feat>0 && num_vec>0);

			return set_full_feature_matrix(fm, num_feat, num_vec);
		}
00851
00856 virtual inline int32_t get_num_vectors() { return num_vectors; }
00857
00862 inline int32_t get_num_features() { return num_features; }
00863
		/** grow the nominal feature-space dimensionality; shrinking is
		 * rejected (entries could become out of range)
		 *
		 * @param num new dimensionality, must be >= current num_features
		 * @return the new num_features
		 */
		inline int32_t set_num_features(int32_t num)
		{
			int32_t n=num_features;
			ASSERT(n<=num);
			num_features=num;
			return num_features;
		}
00882
		/** @return feature class C_SPARSE */
		inline virtual EFeatureClass get_feature_class() { return C_SPARSE; }

		/** @return feature type matching ST; implemented by the explicit
		 * specializations at the bottom of this header */
		inline virtual EFeatureType get_feature_type();
00894
		/** release a feature vector: unlock its cache entry and delete[]
		 * the array when caller-owned.
		 * NOTE(review): byte-for-byte duplicate of
		 * free_sparse_feature_vector() — candidate for consolidation.
		 *
		 * @param feat_vec vector to release
		 * @param num vector index (for cache unlock)
		 * @param free whether the array is caller-owned
		 */
		void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free)
		{
			if (feature_cache)
				feature_cache->unlock_entry(num);

			if (free)
				delete[] feat_vec ;
		}
00909
00914 int64_t get_num_nonzero_entries()
00915 {
00916 int64_t num=0;
00917 for (int32_t i=0; i<num_vectors; i++)
00918 num+=sparse_feature_matrix[i].num_feat_entries;
00919
00920 return num;
00921 }
00922
		/** fill sq[i] with the squared L2 norm of vector i
		 *
		 * NOTE(review): `this` is cast to CSparseFeatures<float64_t>*, so
		 * this helper is presumably only meaningful when ST==float64_t —
		 * confirm callers before using with other ST.
		 *
		 * @param sq caller-provided array of length get_num_vectors()
		 * @return sq, for chaining
		 */
		float64_t* compute_squared(float64_t* sq)
		{
			ASSERT(sq);

			int32_t len=0;
			bool do_free=false;

			for (int32_t i=0; i<this->get_num_vectors(); i++)
			{
				sq[i]=0;
				TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free);

				for (int32_t j=0; j<len; j++)
					sq[i] += vec[j].entry * vec[j].entry;

				((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free);
			}

			return sq;
		}
00948
		/** squared distance ||a-b||^2 between vector idx_a of lhs and
		 * idx_b of rhs, using precomputed squared norms:
		 * sq_lhs[idx_a] + sq_rhs[idx_b] - 2*a.b, with the cross term
		 * found via a merge-join over the sorted entry lists
		 *
		 * @param lhs left-hand features
		 * @param sq_lhs precomputed squared norms of lhs (see compute_squared)
		 * @param idx_a vector index into lhs
		 * @param rhs right-hand features
		 * @param sq_rhs precomputed squared norms of rhs
		 * @param idx_b vector index into rhs
		 * @return absolute value of the squared distance
		 */
		float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b)
		{
			int32_t i,j;
			int32_t alen, blen;
			bool afree, bfree;
			ASSERT(lhs);
			ASSERT(rhs);

			TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree);
			TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree);
			ASSERT(avec);
			ASSERT(bvec);

			float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b];

			// subtract 2*a.b, walking the shorter vector outer, the longer inner
			if (alen<=blen)
			{
				j=0;
				for (i=0; i<alen; i++)
				{
					int32_t a_feat_idx=avec[i].feat_index;

					while ((j<blen) && (bvec[j].feat_index < a_feat_idx))
						j++;

					if ((j<blen) && (bvec[j].feat_index == a_feat_idx))
					{
						result-=2*(avec[i].entry*bvec[j].entry);
						j++;
					}
				}
			}
			else
			{
				j=0;
				for (i=0; i<blen; i++)
				{
					int32_t b_feat_idx=bvec[i].feat_index;

					while ((j<alen) && (avec[j].feat_index<b_feat_idx))
						j++;

					if ((j<alen) && (avec[j].feat_index == b_feat_idx))
					{
						result-=2*(bvec[i].entry*avec[j].entry);
						j++;
					}
				}
			}

			((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree);
			((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree);

			return CMath::abs(result);
		}
01016
01024 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true)
01025 {
01026 CLabels* lab=NULL;
01027
01028 size_t blocksize=1024*1024;
01029 size_t required_blocksize=blocksize;
01030 uint8_t* dummy=new uint8_t[blocksize];
01031 FILE* f=fopen(fname, "ro");
01032
01033 if (f)
01034 {
01035 free_sparse_feature_matrix();
01036 num_vectors=0;
01037 num_features=0;
01038
01039 SG_INFO("counting line numbers in file %s\n", fname);
01040 size_t sz=blocksize;
01041 size_t block_offs=0;
01042 size_t old_block_offs=0;
01043 fseek(f, 0, SEEK_END);
01044 size_t fsize=ftell(f);
01045 rewind(f);
01046
01047 while (sz == blocksize)
01048 {
01049 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01050 bool contains_cr=false;
01051 for (size_t i=0; i<sz; i++)
01052 {
01053 block_offs++;
01054 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01055 {
01056 num_vectors++;
01057 contains_cr=true;
01058 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
01059 old_block_offs=block_offs;
01060 }
01061 }
01062 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
01063 }
01064
01065 SG_INFO("found %d feature vectors\n", num_vectors);
01066 delete[] dummy;
01067 blocksize=required_blocksize;
01068 dummy = new uint8_t[blocksize+1];
01069
01070 lab=new CLabels(num_vectors);
01071 sparse_feature_matrix=new TSparse<ST>[num_vectors];
01072
01073 rewind(f);
01074 sz=blocksize;
01075 int32_t lines=0;
01076 while (sz == blocksize)
01077 {
01078 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
01079
01080 size_t old_sz=0;
01081 for (size_t i=0; i<sz; i++)
01082 {
01083 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
01084 {
01085 size_t len=i-old_sz+1;
01086 uint8_t* data=&dummy[old_sz];
01087
01088 for (int32_t j=0; j<len; j++)
01089 dummy[j]=data[j];
01090
01091 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f);
01092 i=0;
01093 old_sz=0;
01094 sz+=len;
01095 }
01096
01097 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
01098 {
01099
01100 size_t len=i-old_sz;
01101 uint8_t* data=&dummy[old_sz];
01102
01103 int32_t dims=0;
01104 for (int32_t j=0; j<len; j++)
01105 {
01106 if (data[j]==':')
01107 dims++;
01108 }
01109
01110 if (dims<=0)
01111 {
01112 SG_ERROR("Error in line %d - number of"
01113 " dimensions is %d line is %d characters"
01114 " long\n line_content:'%.*s'\n", lines,
01115 dims, len, len, (const char*) data);
01116 }
01117
01118 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims];
01119 int32_t j=0;
01120 for (; j<len; j++)
01121 {
01122 if (data[j]==' ')
01123 {
01124 data[j]='\0';
01125
01126 lab->set_label(lines, atof((const char*) data));
01127 break;
01128 }
01129 }
01130
01131 int32_t d=0;
01132 j++;
01133 uint8_t* start=&data[j];
01134 for (; j<len; j++)
01135 {
01136 if (data[j]==':')
01137 {
01138 data[j]='\0';
01139
01140 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
01141 num_features=CMath::max(num_features, feat[d].feat_index+1);
01142
01143 j++;
01144 start=&data[j];
01145 for (; j<len; j++)
01146 {
01147 if (data[j]==' ' || data[j]=='\n')
01148 {
01149 data[j]='\0';
01150 feat[d].entry=(ST) atof((const char*) start);
01151 d++;
01152 break;
01153 }
01154 }
01155
01156 if (j==len)
01157 {
01158 data[j]='\0';
01159 feat[dims-1].entry=(ST) atof((const char*) start);
01160 }
01161
01162 j++;
01163 start=&data[j];
01164 }
01165 }
01166
01167 sparse_feature_matrix[lines].vec_index=lines;
01168 sparse_feature_matrix[lines].num_feat_entries=dims;
01169 sparse_feature_matrix[lines].features=feat;
01170
01171 old_sz=i+1;
01172 lines++;
01173 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
01174 }
01175 }
01176 }
01177 SG_INFO("file successfully read\n");
01178 fclose(f);
01179 }
01180
01181 delete[] dummy;
01182
01183 if (do_sort_features)
01184 sort_features();
01185
01186 return lab;
01187 }
01188
		/** sort each vector's entries by feat_index (required by the
		 * merge-join dot products); builds a permutation via qsort_index,
		 * copies into a fresh array and frees the old one
		 *
		 * NOTE(review): the post-sort ASSERT requires strictly increasing
		 * indices, so vectors with duplicate feat_index will abort here.
		 */
		void sort_features()
		{
			ASSERT(get_num_preproc()==0);

			if (!sparse_feature_matrix)
				SG_ERROR("Requires sparse feature matrix to be available in-memory\n");

			for (int32_t i=0; i<num_vectors; i++)
			{
				int32_t len=sparse_feature_matrix[i].num_feat_entries;

				if (!len)
					continue;

				TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features;
				int32_t* feat_idx=new int32_t[len];
				int32_t* orig_idx=new int32_t[len];

				for (int j=0; j<len; j++)
				{
					feat_idx[j]=sf_orig[j].feat_index;
					orig_idx[j]=j;
				}

				// sort indices, carrying the original positions along
				CMath::qsort_index(feat_idx, orig_idx, len);

				TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len];
				for (int j=0; j<len; j++)
					sf_new[j]=sf_orig[orig_idx[j]];

				sparse_feature_matrix[i].features=sf_new;


				// verify strict ordering (no duplicate indices allowed)
				for (int j=0; j<len-1; j++)
					ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index);

				delete[] orig_idx;
				delete[] feat_idx;
				delete[] sf_orig;
			}
		}
01232
		/** write features and labels to a file in SVM-light format
		 * (indices written 1-based)
		 *
		 * @param fname output filename
		 * @param label labels, one per vector (count must match num_vectors)
		 * @return true on success, false if the file could not be opened
		 */
		bool write_svmlight_file(char* fname, CLabels* label)
		{
			ASSERT(label);
			int32_t num=label->get_num_labels();
			ASSERT(num>0);
			ASSERT(num==num_vectors);

			FILE* f=fopen(fname, "wb");

			if (f)
			{
				for (int32_t i=0; i<num; i++)
				{
					fprintf(f, "%d ", (int32_t) label->get_int_label(i));

					TSparseEntry<ST>* vec = sparse_feature_matrix[i].features;
					int32_t num_feat = sparse_feature_matrix[i].num_feat_entries;

					for (int32_t j=0; j<num_feat; j++)
					{
						// last entry ends the line instead of a trailing space
						if (j<num_feat-1)
							fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
						else
							fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
					}
				}

				fclose(f);
				return true;
			}
			return false;
		}
01271
		/** @return dimensionality of the dot-product feature space
		 * (= num_features) */
		virtual int32_t get_dim_feature_space()
		{
			return num_features;
		}
01283
01290 virtual float64_t dot(int32_t vec_idx1, int32_t vec_idx2)
01291 {
01292 bool afree, bfree;
01293 int32_t alen, blen;
01294 TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree);
01295 TSparseEntry<ST>* bvec=get_sparse_feature_vector(vec_idx2, blen, bfree);
01296
01297 float64_t result=sparse_dot(1, avec, alen, bvec, blen);
01298
01299 free_sparse_feature_vector(avec, vec_idx1, afree);
01300 free_sparse_feature_vector(bvec, vec_idx2, bfree);
01301
01302 return result;
01303 }
01304
01311 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len)
01312 {
01313 ASSERT(vec2);
01314 ASSERT(vec2_len==num_features);
01315 float64_t result=0;
01316
01317 bool vfree;
01318 int32_t vlen;
01319 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree);
01320
01321 if (sv)
01322 {
01323 for (int32_t i=0; i<vlen; i++)
01324 result+=vec2[sv[i].feat_index]*sv[i].entry;
01325 }
01326
01327 free_sparse_feature_vector(sv, vec_idx1, vfree);
01328
01329 return result;
01330 }
01331
01333 inline virtual const char* get_name() const { return "SparseFeatures"; }
01334
	protected:
		/** compute sparse vector num on the fly; this base implementation
		 * returns an empty vector — subclasses without an in-memory matrix
		 * override it
		 *
		 * @param num vector index
		 * @param len receives the number of entries (0 here)
		 * @param target optional preallocated target (e.g. a cache slot)
		 * @return the computed vector (NULL here)
		 */
		virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL)
		{
			len=0;
			return NULL;
		}
01351
	protected:

		/// number of sparse vectors in the matrix
		int32_t num_vectors;

		/// dimensionality of the feature space
		int32_t num_features;

		/// array of num_vectors sparse vectors (may be NULL when cached/computed)
		TSparse<ST>* sparse_feature_matrix;

		/// optional cache for computed feature vectors
		CCache< TSparseEntry<ST> >* feature_cache;
01365 };
01366
/** sparse bool features report type F_BOOL */
template<> inline EFeatureType CSparseFeatures<bool>::get_feature_type()
{
	return F_BOOL;
}
01375
/** sparse char features report type F_CHAR */
template<> inline EFeatureType CSparseFeatures<char>::get_feature_type()
{
	return F_CHAR;
}
01384
/** sparse uint8_t features report type F_BYTE */
template<> inline EFeatureType CSparseFeatures<uint8_t>::get_feature_type()
{
	return F_BYTE;
}
01393
/** sparse int16_t features report type F_SHORT */
template<> inline EFeatureType CSparseFeatures<int16_t>::get_feature_type()
{
	return F_SHORT;
}
01402
/** sparse uint16_t features report type F_WORD */
template<> inline EFeatureType CSparseFeatures<uint16_t>::get_feature_type()
{
	return F_WORD;
}
01411
/** sparse int32_t features report type F_INT */
template<> inline EFeatureType CSparseFeatures<int32_t>::get_feature_type()
{
	return F_INT;
}
01420
/** sparse uint32_t features report type F_UINT */
template<> inline EFeatureType CSparseFeatures<uint32_t>::get_feature_type()
{
	return F_UINT;
}
01429
/** sparse int64_t features report type F_LONG */
template<> inline EFeatureType CSparseFeatures<int64_t>::get_feature_type()
{
	return F_LONG;
}
01438
/** sparse uint64_t features report type F_ULONG */
template<> inline EFeatureType CSparseFeatures<uint64_t>::get_feature_type()
{
	return F_ULONG;
}
01447
/** sparse float32_t features report type F_SHORTREAL */
template<> inline EFeatureType CSparseFeatures<float32_t>::get_feature_type()
{
	return F_SHORTREAL;
}
01456
/** sparse float64_t features report type F_DREAL */
template<> inline EFeatureType CSparseFeatures<float64_t>::get_feature_type()
{
	return F_DREAL;
}
01465
/** sparse floatmax_t features report type F_LONGREAL */
template<> inline EFeatureType CSparseFeatures<floatmax_t>::get_feature_type()
{
	return F_LONGREAL;
}
01474 }
01475 #endif