SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2010 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 * Copyright (C) 2010 Berlin Institute of Technology 00011 */ 00012 00013 #ifndef _SPARSEFEATURES__H__ 00014 #define _SPARSEFEATURES__H__ 00015 00016 #include <string.h> 00017 #include <stdlib.h> 00018 00019 #include "lib/common.h" 00020 #include "lib/Mathematics.h" 00021 #include "lib/Cache.h" 00022 #include "lib/io.h" 00023 #include "lib/Cache.h" 00024 #include "lib/File.h" 00025 #include "lib/DataType.h" 00026 00027 #include "features/Labels.h" 00028 #include "features/Features.h" 00029 #include "features/DotFeatures.h" 00030 #include "features/SimpleFeatures.h" 00031 #include "preproc/SparsePreProc.h" 00032 00033 namespace shogun 00034 { 00035 00036 class CFile; 00037 class CLabels; 00038 class CFeatures; 00039 class CDotFeatures; 00040 template <class ST> class CSimpleFeatures; 00041 template <class ST> class CSparsePreProc; 00042 00055 template <class ST> class CSparseFeatures : public CDotFeatures 00056 { 00057 void init(void) { 00058 set_generic<ST>(); 00059 00060 m_parameters->add_vector(&sparse_feature_matrix, &num_vectors, 00061 "sparse_feature_matrix", 00062 "Array of sparse vectors."); 00063 m_parameters->add(&num_features, "num_features", 00064 "Total number of features."); 00065 } 00066 00067 public: 00072 CSparseFeatures(int32_t size=0) 00073 : CDotFeatures(size), num_vectors(0), num_features(0), 00074 sparse_feature_matrix(NULL), feature_cache(NULL) 00075 { init(); } 00076 00085 CSparseFeatures(TSparse<ST>* src, int32_t num_feat, int32_t num_vec, bool copy=false) 00086 : CDotFeatures(0), num_vectors(0), num_features(0), 00087 sparse_feature_matrix(NULL), feature_cache(NULL) 00088 { 00089 init(); 00090 00091 if (!copy) 00092 set_sparse_feature_matrix(src, num_feat, num_vec); 00093 else 00094 { 00095 sparse_feature_matrix = new TSparse<ST>[num_vec]; 00096 memcpy(sparse_feature_matrix, src, sizeof(TSparse<ST>)*num_vec); 00097 for (int32_t i=0; i< num_vec; i++) 00098 { 00099 sparse_feature_matrix[i].features = new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries]; 00100 memcpy(sparse_feature_matrix[i].features, src[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00101 00102 } 00103 } 00104 } 00105 00113 CSparseFeatures(ST* src, int32_t num_feat, int32_t num_vec) 00114 : CDotFeatures(0), num_vectors(0), num_features(0), 00115 sparse_feature_matrix(NULL), feature_cache(NULL) 00116 { 00117 init(); 00118 00119 set_full_feature_matrix(src, num_feat, num_vec); 00120 } 00121 00123 CSparseFeatures(const CSparseFeatures & orig) 00124 : CDotFeatures(orig), num_vectors(orig.num_vectors), 00125 num_features(orig.num_features), 00126 sparse_feature_matrix(orig.sparse_feature_matrix), 00127 feature_cache(orig.feature_cache) 00128 { 00129 init(); 00130 00131 if (orig.sparse_feature_matrix) 00132 { 00133 free_sparse_feature_matrix(); 00134 sparse_feature_matrix=new TSparse<ST>[num_vectors]; 00135 memcpy(sparse_feature_matrix, orig.sparse_feature_matrix, sizeof(TSparse<ST>)*num_vectors); 00136 for (int32_t i=0; i< num_vectors; i++) 00137 { 00138 sparse_feature_matrix[i].features=new TSparseEntry<ST>[sparse_feature_matrix[i].num_feat_entries]; 00139 memcpy(sparse_feature_matrix[i].features, orig.sparse_feature_matrix[i].features, sizeof(TSparseEntry<ST>)*sparse_feature_matrix[i].num_feat_entries); 00140 00141 } 00142 } 00143 } 00144 00149 CSparseFeatures(CFile* loader) 00150 : CDotFeatures(loader), num_vectors(0), num_features(0), 00151 sparse_feature_matrix(NULL), feature_cache(NULL) 00152 { 00153 init(); 00154 00155 load(loader); 00156 } 00157 00159 virtual ~CSparseFeatures() 00160 { 00161 free_sparse_features(); 00162 } 00163 00167 void free_sparse_feature_matrix() 00168 { 00169 clean_tsparse(sparse_feature_matrix, num_vectors); 00170 sparse_feature_matrix = NULL; 00171 num_vectors=0; 00172 num_features=0; 00173 } 00174 00178 void free_sparse_features() 00179 { 00180 free_sparse_feature_matrix(); 00181 delete feature_cache; 00182 feature_cache = NULL; 00183 } 00184 00189 virtual CFeatures* duplicate() const 00190 { 00191 return new CSparseFeatures<ST>(*this); 00192 } 00193 00201 ST get_feature(int32_t num, int32_t index) 00202 { 00203 ASSERT(index>=0 && index<num_features) ; 00204 ASSERT(num>=0 && num<num_vectors) ; 00205 00206 bool vfree; 00207 int32_t num_feat; 00208 int32_t i; 00209 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00210 ST ret = 0 ; 00211 00212 if (sv) 00213 { 00214 for (i=0; i<num_feat; i++) 00215 if (sv[i].feat_index==index) 00216 ret += sv[i].entry ; 00217 } 00218 00219 free_sparse_feature_vector(sv, num, vfree); 00220 00221 return ret ; 00222 } 00223 00224 00233 ST* get_full_feature_vector(int32_t num, int32_t& len) 00234 { 00235 bool vfree; 00236 int32_t num_feat; 00237 int32_t i; 00238 len=0; 00239 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00240 ST* fv=NULL; 00241 00242 if (sv) 00243 { 00244 len=num_features; 00245 fv=new ST[num_features]; 00246 00247 for (i=0; i<num_features; i++) 00248 fv[i]=0; 00249 00250 for (i=0; i<num_feat; i++) 00251 fv[sv[i].feat_index]= sv[i].entry; 00252 } 00253 00254 free_sparse_feature_vector(sv, num, vfree); 00255 00256 return fv; 00257 } 00258 00265 void get_full_feature_vector(ST** dst, int32_t* len, int32_t num) 00266 { 00267 if (num>=num_vectors) 00268 { 00269 SG_ERROR("Index out of bounds (number of vectors %d, you " 00270 "requested %d)\n", num_vectors, num); 00271 } 00272 00273 bool vfree; 00274 int32_t num_feat=0; 00275 *len=0; 00276 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00277 00278 if (sv) 00279 { 00280 *len=num_features; 00281 *dst= (ST*) malloc(sizeof(ST)*num_features); 00282 memset(*dst, 0, sizeof(ST)*num_features); 00283 00284 for (int32_t i=0; i<num_feat; i++) 00285 (*dst)[sv[i].feat_index]= sv[i].entry; 00286 } 00287 00288 free_sparse_feature_vector(sv, num, vfree); 00289 } 00290 00291 00297 virtual inline int32_t get_nnz_features_for_vector(int32_t num) 00298 { 00299 bool vfree; 00300 int32_t len; 00301 TSparseEntry<ST>* sv = get_sparse_feature_vector(num, len, vfree); 00302 free_sparse_feature_vector(sv, num, vfree); 00303 return len; 00304 } 00305 00316 TSparseEntry<ST>* get_sparse_feature_vector(int32_t num, int32_t& len, bool& vfree) 00317 { 00318 ASSERT(num<num_vectors); 00319 00320 if (sparse_feature_matrix) 00321 { 00322 len= sparse_feature_matrix[num].num_feat_entries; 00323 vfree=false ; 00324 return sparse_feature_matrix[num].features; 00325 } 00326 else 00327 { 00328 TSparseEntry<ST>* feat=NULL; 00329 vfree=false; 00330 00331 if (feature_cache) 00332 { 00333 feat=feature_cache->lock_entry(num); 00334 00335 if (feat) 00336 return feat; 00337 else 00338 { 00339 feat=feature_cache->set_entry(num); 00340 } 00341 } 00342 00343 if (!feat) 00344 vfree=true; 00345 00346 feat=compute_sparse_feature_vector(num, len, feat); 00347 00348 00349 if (get_num_preproc()) 00350 { 00351 int32_t tmp_len=len; 00352 TSparseEntry<ST>* tmp_feat_before = feat; 00353 TSparseEntry<ST>* tmp_feat_after = NULL; 00354 00355 for (int32_t i=0; i<get_num_preproc(); i++) 00356 { 00357 //tmp_feat_after=((CSparsePreProc<ST>*) get_preproc(i))->apply_to_feature_vector(tmp_feat_before, tmp_len); 00358 00359 if (i!=0) // delete feature vector, except for the the first one, i.e., feat 00360 delete[] tmp_feat_before; 00361 tmp_feat_before=tmp_feat_after; 00362 } 00363 00364 memcpy(feat, tmp_feat_after, sizeof(TSparseEntry<ST>)*tmp_len); 00365 delete[] tmp_feat_after; 00366 len=tmp_len ; 00367 SG_DEBUG( "len: %d len2: %d\n", len, num_features); 00368 } 00369 return feat ; 00370 } 00371 } 00372 00373 00384 static ST sparse_dot(ST alpha, TSparseEntry<ST>* avec, int32_t alen, TSparseEntry<ST>* bvec, int32_t blen) 00385 { 00386 ST result=0; 00387 00388 //result remains zero when one of the vectors is non existent 00389 if (avec && bvec) 00390 { 00391 if (alen<=blen) 00392 { 00393 int32_t j=0; 00394 for (int32_t i=0; i<alen; i++) 00395 { 00396 int32_t a_feat_idx=avec[i].feat_index; 00397 00398 while ( (j<blen) && (bvec[j].feat_index < a_feat_idx) ) 00399 j++; 00400 00401 if ( (j<blen) && (bvec[j].feat_index == a_feat_idx) ) 00402 { 00403 result+= avec[i].entry * bvec[j].entry; 00404 j++; 00405 } 00406 } 00407 } 00408 else 00409 { 00410 int32_t j=0; 00411 for (int32_t i=0; i<blen; i++) 00412 { 00413 int32_t b_feat_idx=bvec[i].feat_index; 00414 00415 while ( (j<alen) && (avec[j].feat_index < b_feat_idx) ) 00416 j++; 00417 00418 if ( (j<alen) && (avec[j].feat_index == b_feat_idx) ) 00419 { 00420 result+= bvec[i].entry * avec[j].entry; 00421 j++; 00422 } 00423 } 00424 } 00425 00426 result*=alpha; 00427 } 00428 00429 return result; 00430 } 00431 00442 ST dense_dot(ST alpha, int32_t num, ST* vec, int32_t dim, ST b) 00443 { 00444 ASSERT(vec); 00445 ASSERT(dim==num_features); 00446 ST result=b; 00447 00448 bool vfree; 00449 int32_t num_feat; 00450 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00451 00452 if (sv) 00453 { 00454 for (int32_t i=0; i<num_feat; i++) 00455 result+=alpha*vec[sv[i].feat_index]*sv[i].entry; 00456 } 00457 00458 free_sparse_feature_vector(sv, num, vfree); 00459 return result; 00460 } 00461 00471 void add_to_dense_vec(float64_t alpha, int32_t num, float64_t* vec, int32_t dim, bool abs_val=false) 00472 { 00473 ASSERT(vec); 00474 if (dim!=num_features) 00475 { 00476 SG_ERROR("dimension of vec (=%d) does not match number of features (=%d)\n", 00477 dim, num_features); 00478 } 00479 00480 bool vfree; 00481 int32_t num_feat; 00482 TSparseEntry<ST>* sv=get_sparse_feature_vector(num, num_feat, vfree); 00483 00484 if (sv) 00485 { 00486 if (abs_val) 00487 { 00488 for (int32_t i=0; i<num_feat; i++) 00489 vec[sv[i].feat_index]+= alpha*CMath::abs(sv[i].entry); 00490 } 00491 else 00492 { 00493 for (int32_t i=0; i<num_feat; i++) 00494 vec[sv[i].feat_index]+= alpha*sv[i].entry; 00495 } 00496 } 00497 00498 free_sparse_feature_vector(sv, num, vfree); 00499 } 00500 00507 void free_sparse_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free) 00508 { 00509 if (feature_cache) 00510 feature_cache->unlock_entry(num); 00511 00512 if (free) 00513 delete[] feat_vec ; 00514 } 00515 00523 TSparse<ST>* get_sparse_feature_matrix(int32_t &num_feat, int32_t &num_vec) 00524 { 00525 num_feat=num_features; 00526 num_vec=num_vectors; 00527 00528 return sparse_feature_matrix; 00529 } 00530 00539 void get_sparse_feature_matrix(TSparse<ST>** dst, int32_t* num_feat, 00540 int32_t* num_vec, int64_t* nnz) 00541 { 00542 *nnz=get_num_nonzero_entries(); 00543 *num_feat=num_features; 00544 *num_vec=num_vectors; 00545 *dst=sparse_feature_matrix; 00546 } 00547 00553 void clean_tsparse(TSparse<ST>* sfm, int32_t num_vec) 00554 { 00555 if (sfm) 00556 { 00557 for (int32_t i=0; i<num_vec; i++) 00558 delete[] sfm[i].features; 00559 00560 delete[] sfm; 00561 } 00562 } 00563 00568 CSparseFeatures<ST>* get_transposed() 00569 { 00570 int32_t num_feat; 00571 int32_t num_vec; 00572 TSparse<ST>* s=get_transposed(num_feat, num_vec); 00573 return new CSparseFeatures<ST>(s, num_feat, num_vec); 00574 } 00575 00585 TSparse<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec) 00586 { 00587 num_feat=num_vectors; 00588 num_vec=num_features; 00589 00590 int32_t* hist=new int32_t[num_features]; 00591 memset(hist, 0, sizeof(int32_t)*num_features); 00592 00593 // count how lengths of future feature vectors 00594 for (int32_t v=0; v<num_vectors; v++) 00595 { 00596 int32_t vlen; 00597 bool vfree; 00598 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree); 00599 00600 for (int32_t i=0; i<vlen; i++) 00601 hist[sv[i].feat_index]++; 00602 00603 free_sparse_feature_vector(sv, v, vfree); 00604 } 00605 00606 // allocate room for future feature vectors 00607 TSparse<ST>* sfm=new TSparse<ST>[num_vec]; 00608 for (int32_t v=0; v<num_vec; v++) 00609 { 00610 sfm[v].features= new TSparseEntry<ST>[hist[v]]; 00611 sfm[v].num_feat_entries=hist[v]; 00612 sfm[v].vec_index=v; 00613 } 00614 00615 // fill future feature vectors with content 00616 memset(hist,0,sizeof(int32_t)*num_features); 00617 for (int32_t v=0; v<num_vectors; v++) 00618 { 00619 int32_t vlen; 00620 bool vfree; 00621 TSparseEntry<ST>* sv=get_sparse_feature_vector(v, vlen, vfree); 00622 00623 for (int32_t i=0; i<vlen; i++) 00624 { 00625 int32_t vidx=sv[i].feat_index; 00626 int32_t fidx=v; 00627 sfm[vidx].features[hist[vidx]].feat_index=fidx; 00628 sfm[vidx].features[hist[vidx]].entry=sv[i].entry; 00629 hist[vidx]++; 00630 } 00631 00632 free_sparse_feature_vector(sv, v, vfree); 00633 } 00634 00635 delete[] hist; 00636 return sfm; 00637 } 00638 00648 virtual void set_sparse_feature_matrix(TSparse<ST>* src, int32_t num_feat, int32_t num_vec) 00649 { 00650 free_sparse_feature_matrix(); 00651 00652 sparse_feature_matrix=src; 00653 num_features=num_feat; 00654 num_vectors=num_vec; 00655 } 00656 00664 ST* get_full_feature_matrix(int32_t &num_feat, int32_t &num_vec) 00665 { 00666 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features); 00667 num_feat=num_features; 00668 num_vec=num_vectors; 00669 00670 ST* fm=new ST[num_feat*num_vec]; 00671 00672 if (fm) 00673 { 00674 for (int64_t i=0; i<num_feat*num_vec; i++) 00675 fm[i]=0; 00676 00677 for (int32_t v=0; v<num_vec; v++) 00678 { 00679 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++) 00680 { 00681 int64_t offs= (sparse_feature_matrix[v].vec_index * num_feat) + sparse_feature_matrix[v].features[f].feat_index; 00682 fm[offs]= sparse_feature_matrix[v].features[f].entry; 00683 } 00684 } 00685 } 00686 else 00687 SG_ERROR( "error allocating memory for dense feature matrix\n"); 00688 00689 return fm; 00690 } 00691 00699 void get_full_feature_matrix(ST** dst, int32_t* num_feat, int32_t* num_vec) 00700 { 00701 SG_INFO( "converting sparse features to full feature matrix of %ld x %ld entries\n", num_vectors, num_features); 00702 *num_feat=num_features; 00703 *num_vec=num_vectors; 00704 00705 *dst= (ST*) malloc(sizeof(ST)*num_features*num_vectors); 00706 00707 if (*dst) 00708 { 00709 for (int64_t i=0; i<num_features*num_vectors; i++) 00710 (*dst)[i]=0; 00711 00712 for (int32_t v=0; v<num_vectors; v++) 00713 { 00714 for (int32_t f=0; f<sparse_feature_matrix[v].num_feat_entries; f++) 00715 { 00716 int64_t offs= (sparse_feature_matrix[v].vec_index * num_features) + sparse_feature_matrix[v].features[f].feat_index; 00717 (*dst)[offs]= sparse_feature_matrix[v].features[f].entry; 00718 } 00719 } 00720 } 00721 else 00722 SG_ERROR( "error allocating memory for dense feature matrix\n"); 00723 } 00724 00734 virtual bool set_full_feature_matrix(ST* src, int32_t num_feat, int32_t num_vec) 00735 { 00736 free_sparse_feature_matrix(); 00737 bool result=true; 00738 num_features=num_feat; 00739 num_vectors=num_vec; 00740 00741 SG_INFO("converting dense feature matrix to sparse one\n"); 00742 int32_t* num_feat_entries=new int[num_vectors]; 00743 00744 if (num_feat_entries) 00745 { 00746 int64_t num_total_entries=0; 00747 00748 // count nr of non sparse features 00749 for (int32_t i=0; i< num_vec; i++) 00750 { 00751 num_feat_entries[i]=0; 00752 for (int32_t j=0; j< num_feat; j++) 00753 { 00754 if (src[i*((int64_t) num_feat) + j] != 0) 00755 num_feat_entries[i]++; 00756 } 00757 } 00758 00759 if (num_vec>0) 00760 { 00761 sparse_feature_matrix=new TSparse<ST>[num_vec]; 00762 00763 if (sparse_feature_matrix) 00764 { 00765 for (int32_t i=0; i< num_vec; i++) 00766 { 00767 sparse_feature_matrix[i].vec_index=i; 00768 sparse_feature_matrix[i].num_feat_entries=0; 00769 sparse_feature_matrix[i].features= NULL; 00770 00771 if (num_feat_entries[i]>0) 00772 { 00773 sparse_feature_matrix[i].features= new TSparseEntry<ST>[num_feat_entries[i]]; 00774 00775 if (!sparse_feature_matrix[i].features) 00776 { 00777 SG_INFO( "allocation of features failed\n"); 00778 return false; 00779 } 00780 00781 sparse_feature_matrix[i].num_feat_entries=num_feat_entries[i]; 00782 int32_t sparse_feat_idx=0; 00783 00784 for (int32_t j=0; j< num_feat; j++) 00785 { 00786 int64_t pos= i*num_feat + j; 00787 00788 if (src[pos] != 0) 00789 { 00790 sparse_feature_matrix[i].features[sparse_feat_idx].entry=src[pos]; 00791 sparse_feature_matrix[i].features[sparse_feat_idx].feat_index=j; 00792 sparse_feat_idx++; 00793 num_total_entries++; 00794 } 00795 } 00796 } 00797 } 00798 } 00799 else 00800 { 00801 SG_ERROR( "allocation of sparse feature matrix failed\n"); 00802 result=false; 00803 } 00804 00805 SG_INFO( "sparse feature matrix has %ld entries (full matrix had %ld, sparsity %2.2f%%)\n", 00806 num_total_entries, int64_t(num_feat)*num_vec, (100.0*num_total_entries)/(int64_t(num_feat)*num_vec)); 00807 } 00808 else 00809 { 00810 SG_ERROR( "huh ? zero size matrix given ?\n"); 00811 result=false; 00812 } 00813 } 00814 delete[] num_feat_entries; 00815 return result; 00816 } 00817 00823 virtual bool apply_preproc(bool force_preprocessing=false) 00824 { 00825 SG_INFO( "force: %d\n", force_preprocessing); 00826 00827 if ( sparse_feature_matrix && get_num_preproc() ) 00828 { 00829 for (int32_t i=0; i<get_num_preproc(); i++) 00830 { 00831 if ( (!is_preprocessed(i) || force_preprocessing) ) 00832 { 00833 set_preprocessed(i); 00834 SG_INFO( "preprocessing using preproc %s\n", get_preproc(i)->get_name()); 00835 if (((CSparsePreProc<ST>*) get_preproc(i))->apply_to_sparse_feature_matrix(this) == NULL) 00836 return false; 00837 } 00838 return true; 00839 } 00840 return true; 00841 } 00842 else 00843 { 00844 SG_WARNING( "no sparse feature matrix available or features already preprocessed - skipping.\n"); 00845 return false; 00846 } 00847 } 00848 00853 virtual int32_t get_size() { return sizeof(ST); } 00854 00860 bool obtain_from_simple(CSimpleFeatures<ST>* sf) 00861 { 00862 int32_t num_feat=0; 00863 int32_t num_vec=0; 00864 ST* fm=sf->get_feature_matrix(num_feat, num_vec); 00865 ASSERT(fm && num_feat>0 && num_vec>0); 00866 00867 return set_full_feature_matrix(fm, num_feat, num_vec); 00868 } 00869 00874 virtual inline int32_t get_num_vectors() { return num_vectors; } 00875 00880 inline int32_t get_num_features() { return num_features; } 00881 00893 inline int32_t set_num_features(int32_t num) 00894 { 00895 int32_t n=num_features; 00896 ASSERT(n<=num); 00897 num_features=num; 00898 return num_features; 00899 } 00900 00905 inline virtual EFeatureClass get_feature_class() { return C_SPARSE; } 00906 00911 inline virtual EFeatureType get_feature_type(); 00912 00919 void free_feature_vector(TSparseEntry<ST>* feat_vec, int32_t num, bool free) 00920 { 00921 if (feature_cache) 00922 feature_cache->unlock_entry(num); 00923 00924 if (free) 00925 delete[] feat_vec ; 00926 } 00927 00932 int64_t get_num_nonzero_entries() 00933 { 00934 int64_t num=0; 00935 for (int32_t i=0; i<num_vectors; i++) 00936 num+=sparse_feature_matrix[i].num_feat_entries; 00937 00938 return num; 00939 } 00940 00946 float64_t* compute_squared(float64_t* sq) 00947 { 00948 ASSERT(sq); 00949 00950 int32_t len=0; 00951 bool do_free=false; 00952 00953 for (int32_t i=0; i<this->get_num_vectors(); i++) 00954 { 00955 sq[i]=0; 00956 TSparseEntry<float64_t>* vec = ((CSparseFeatures<float64_t>*) this)->get_sparse_feature_vector(i, len, do_free); 00957 00958 for (int32_t j=0; j<len; j++) 00959 sq[i] += vec[j].entry * vec[j].entry; 00960 00961 ((CSparseFeatures<float64_t>*) this)->free_feature_vector(vec, i, do_free); 00962 } 00963 00964 return sq; 00965 } 00966 00979 float64_t compute_squared_norm(CSparseFeatures<float64_t>* lhs, float64_t* sq_lhs, int32_t idx_a, CSparseFeatures<float64_t>* rhs, float64_t* sq_rhs, int32_t idx_b) 00980 { 00981 int32_t i,j; 00982 int32_t alen, blen; 00983 bool afree, bfree; 00984 ASSERT(lhs); 00985 ASSERT(rhs); 00986 00987 TSparseEntry<float64_t>* avec=lhs->get_sparse_feature_vector(idx_a, alen, afree); 00988 TSparseEntry<float64_t>* bvec=rhs->get_sparse_feature_vector(idx_b, blen, bfree); 00989 ASSERT(avec); 00990 ASSERT(bvec); 00991 00992 float64_t result=sq_lhs[idx_a]+sq_rhs[idx_b]; 00993 00994 if (alen<=blen) 00995 { 00996 j=0; 00997 for (i=0; i<alen; i++) 00998 { 00999 int32_t a_feat_idx=avec[i].feat_index; 01000 01001 while ((j<blen) && (bvec[j].feat_index < a_feat_idx)) 01002 j++; 01003 01004 if ((j<blen) && (bvec[j].feat_index == a_feat_idx)) 01005 { 01006 result-=2*(avec[i].entry*bvec[j].entry); 01007 j++; 01008 } 01009 } 01010 } 01011 else 01012 { 01013 j=0; 01014 for (i=0; i<blen; i++) 01015 { 01016 int32_t b_feat_idx=bvec[i].feat_index; 01017 01018 while ((j<alen) && (avec[j].feat_index<b_feat_idx)) 01019 j++; 01020 01021 if ((j<alen) && (avec[j].feat_index == b_feat_idx)) 01022 { 01023 result-=2*(bvec[i].entry*avec[j].entry); 01024 j++; 01025 } 01026 } 01027 } 01028 01029 ((CSparseFeatures<float64_t>*) lhs)->free_feature_vector(avec, idx_a, afree); 01030 ((CSparseFeatures<float64_t>*) rhs)->free_feature_vector(bvec, idx_b, bfree); 01031 01032 return CMath::abs(result); 01033 } 01034 01039 void load(CFile* loader); 01040 01045 void save(CFile* writer); 01046 01054 CLabels* load_svmlight_file(char* fname, bool do_sort_features=true) 01055 { 01056 CLabels* lab=NULL; 01057 01058 size_t blocksize=1024*1024; 01059 size_t required_blocksize=blocksize; 01060 uint8_t* dummy=new uint8_t[blocksize]; 01061 FILE* f=fopen(fname, "ro"); 01062 01063 if (f) 01064 { 01065 free_sparse_feature_matrix(); 01066 num_vectors=0; 01067 num_features=0; 01068 01069 SG_INFO("counting line numbers in file %s\n", fname); 01070 size_t sz=blocksize; 01071 size_t block_offs=0; 01072 size_t old_block_offs=0; 01073 fseek(f, 0, SEEK_END); 01074 size_t fsize=ftell(f); 01075 rewind(f); 01076 01077 while (sz == blocksize) 01078 { 01079 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 01080 bool contains_cr=false; 01081 for (size_t i=0; i<sz; i++) 01082 { 01083 block_offs++; 01084 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 01085 { 01086 num_vectors++; 01087 contains_cr=true; 01088 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1); 01089 old_block_offs=block_offs; 01090 } 01091 } 01092 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 01093 } 01094 01095 SG_INFO("found %d feature vectors\n", num_vectors); 01096 delete[] dummy; 01097 blocksize=required_blocksize; 01098 dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL 01099 01100 lab=new CLabels(num_vectors); 01101 sparse_feature_matrix=new TSparse<ST>[num_vectors]; 01102 01103 rewind(f); 01104 sz=blocksize; 01105 int32_t lines=0; 01106 while (sz == blocksize) 01107 { 01108 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 01109 01110 size_t old_sz=0; 01111 for (size_t i=0; i<sz; i++) 01112 { 01113 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize) 01114 { 01115 size_t len=i-old_sz+1; 01116 uint8_t* data=&dummy[old_sz]; 01117 01118 for (int32_t j=0; j<len; j++) 01119 dummy[j]=data[j]; 01120 01121 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, f); 01122 i=0; 01123 old_sz=0; 01124 sz+=len; 01125 } 01126 01127 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 01128 { 01129 01130 size_t len=i-old_sz; 01131 uint8_t* data=&dummy[old_sz]; 01132 01133 int32_t dims=0; 01134 for (int32_t j=0; j<len; j++) 01135 { 01136 if (data[j]==':') 01137 dims++; 01138 } 01139 01140 if (dims<=0) 01141 { 01142 SG_ERROR("Error in line %d - number of" 01143 " dimensions is %d line is %d characters" 01144 " long\n line_content:'%.*s'\n", lines, 01145 dims, len, len, (const char*) data); 01146 } 01147 01148 TSparseEntry<ST>* feat=new TSparseEntry<ST>[dims]; 01149 int32_t j=0; 01150 for (; j<len; j++) 01151 { 01152 if (data[j]==' ') 01153 { 01154 data[j]='\0'; 01155 01156 lab->set_label(lines, atof((const char*) data)); 01157 break; 01158 } 01159 } 01160 01161 int32_t d=0; 01162 j++; 01163 uint8_t* start=&data[j]; 01164 for (; j<len; j++) 01165 { 01166 if (data[j]==':') 01167 { 01168 data[j]='\0'; 01169 01170 feat[d].feat_index=(int32_t) atoi((const char*) start)-1; 01171 num_features=CMath::max(num_features, feat[d].feat_index+1); 01172 01173 j++; 01174 start=&data[j]; 01175 for (; j<len; j++) 01176 { 01177 if (data[j]==' ' || data[j]=='\n') 01178 { 01179 data[j]='\0'; 01180 feat[d].entry=(ST) atof((const char*) start); 01181 d++; 01182 break; 01183 } 01184 } 01185 01186 if (j==len) 01187 { 01188 data[j]='\0'; 01189 feat[dims-1].entry=(ST) atof((const char*) start); 01190 } 01191 01192 j++; 01193 start=&data[j]; 01194 } 01195 } 01196 01197 sparse_feature_matrix[lines].vec_index=lines; 01198 sparse_feature_matrix[lines].num_feat_entries=dims; 01199 sparse_feature_matrix[lines].features=feat; 01200 01201 old_sz=i+1; 01202 lines++; 01203 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 01204 } 01205 } 01206 } 01207 SG_INFO("file successfully read\n"); 01208 fclose(f); 01209 } 01210 01211 delete[] dummy; 01212 01213 if (do_sort_features) 01214 sort_features(); 01215 01216 return lab; 01217 } 01218 01221 void sort_features() 01222 { 01223 ASSERT(get_num_preproc()==0); 01224 01225 if (!sparse_feature_matrix) 01226 SG_ERROR("Requires sparse feature matrix to be available in-memory\n"); 01227 01228 for (int32_t i=0; i<num_vectors; i++) 01229 { 01230 int32_t len=sparse_feature_matrix[i].num_feat_entries; 01231 01232 if (!len) 01233 continue; 01234 01235 TSparseEntry<ST>* sf_orig=sparse_feature_matrix[i].features; 01236 int32_t* feat_idx=new int32_t[len]; 01237 int32_t* orig_idx=new int32_t[len]; 01238 01239 for (int j=0; j<len; j++) 01240 { 01241 feat_idx[j]=sf_orig[j].feat_index; 01242 orig_idx[j]=j; 01243 } 01244 01245 CMath::qsort_index(feat_idx, orig_idx, len); 01246 01247 TSparseEntry<ST>* sf_new= new TSparseEntry<ST>[len]; 01248 for (int j=0; j<len; j++) 01249 sf_new[j]=sf_orig[orig_idx[j]]; 01250 01251 sparse_feature_matrix[i].features=sf_new; 01252 01253 // sanity check 01254 for (int j=0; j<len-1; j++) 01255 ASSERT(sf_new[j].feat_index<sf_new[j+1].feat_index); 01256 01257 delete[] orig_idx; 01258 delete[] feat_idx; 01259 delete[] sf_orig; 01260 } 01261 } 01262 01269 bool write_svmlight_file(char* fname, CLabels* label) 01270 { 01271 ASSERT(label); 01272 int32_t num=label->get_num_labels(); 01273 ASSERT(num>0); 01274 ASSERT(num==num_vectors); 01275 01276 FILE* f=fopen(fname, "wb"); 01277 01278 if (f) 01279 { 01280 for (int32_t i=0; i<num; i++) 01281 { 01282 fprintf(f, "%d ", (int32_t) label->get_int_label(i)); 01283 01284 TSparseEntry<ST>* vec = sparse_feature_matrix[i].features; 01285 int32_t num_feat = sparse_feature_matrix[i].num_feat_entries; 01286 01287 for (int32_t j=0; j<num_feat; j++) 01288 { 01289 if (j<num_feat-1) 01290 fprintf(f, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 01291 else 01292 fprintf(f, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry); 01293 } 01294 } 01295 01296 fclose(f); 01297 return true; 01298 } 01299 return false; 01300 } 01301 01309 virtual int32_t get_dim_feature_space() 01310 { 01311 return num_features; 01312 } 01313 01321 virtual float64_t dot(int32_t vec_idx1, CDotFeatures* df, int32_t vec_idx2) 01322 { 01323 ASSERT(df); 01324 ASSERT(df->get_feature_type() == get_feature_type()); 01325 ASSERT(df->get_feature_class() == get_feature_class()); 01326 CSparseFeatures<ST>* sf = (CSparseFeatures<ST>*) df; 01327 01328 bool afree, bfree; 01329 int32_t alen, blen; 01330 TSparseEntry<ST>* avec=get_sparse_feature_vector(vec_idx1, alen, afree); 01331 TSparseEntry<ST>* bvec=sf->get_sparse_feature_vector(vec_idx2, blen, bfree); 01332 01333 float64_t result=sparse_dot(1, avec, alen, bvec, blen); 01334 01335 free_sparse_feature_vector(avec, vec_idx1, afree); 01336 sf->free_sparse_feature_vector(bvec, vec_idx2, bfree); 01337 01338 return result; 01339 } 01340 01347 virtual float64_t dense_dot(int32_t vec_idx1, const float64_t* vec2, int32_t vec2_len) 01348 { 01349 ASSERT(vec2); 01350 if (vec2_len!=num_features) 01351 { 01352 SG_ERROR("dimension of vec2 (=%d) does not match number of features (=%d)\n", 01353 vec2_len, num_features); 01354 } 01355 float64_t result=0; 01356 01357 bool vfree; 01358 int32_t vlen; 01359 TSparseEntry<ST>* sv=get_sparse_feature_vector(vec_idx1, vlen, vfree); 01360 01361 if (sv) 01362 { 01363 for (int32_t i=0; i<vlen; i++) 01364 result+=vec2[sv[i].feat_index]*sv[i].entry; 01365 } 01366 01367 free_sparse_feature_vector(sv, vec_idx1, vfree); 01368 01369 return result; 01370 } 01371 01373 struct sparse_feature_iterator 01374 { 01376 TSparseEntry<ST>* sv; 01378 int32_t vidx; 01380 int32_t num_feat_entries; 01382 bool vfree; 01383 01385 int32_t index; 01386 01388 void print_info() 01389 { 01390 SG_SPRINT("sv=%p, vidx=%d, num_feat_entries=%d, index=%d\n", 01391 sv, vidx, num_feat_entries, index); 01392 } 01393 }; 01394 01404 virtual void* get_feature_iterator(int32_t vector_index) 01405 { 01406 if (vector_index>=num_vectors) 01407 { 01408 SG_ERROR("Index out of bounds (number of vectors %d, you " 01409 "requested %d)\n", num_vectors, vector_index); 01410 } 01411 01412 if (!sparse_feature_matrix) 01413 SG_ERROR("Requires a in-memory feature matrix\n"); 01414 01415 sparse_feature_iterator* it=new sparse_feature_iterator[1]; 01416 it->sv=get_sparse_feature_vector(vector_index, it->num_feat_entries, it->vfree); 01417 it->vidx=vector_index; 01418 it->index=0; 01419 01420 return it; 01421 } 01422 01433 virtual bool get_next_feature(int32_t& index, float64_t& value, void* iterator) 01434 { 01435 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01436 if (!it || it->index >= it->num_feat_entries) 01437 return false; 01438 01439 int32_t i=it->index++; 01440 01441 index = it->sv[i].feat_index; 01442 value = (float64_t) it->sv[i].entry; 01443 01444 return true; 01445 } 01446 01452 virtual void free_feature_iterator(void* iterator) 01453 { 01454 if (!iterator) 01455 return; 01456 01457 sparse_feature_iterator* it=(sparse_feature_iterator*) iterator; 01458 free_sparse_feature_vector(it->sv, it->vidx, it->vfree); 01459 delete[] it; 01460 } 01461 01463 inline virtual const char* get_name() const { return "SparseFeatures"; } 01464 01465 protected: 01476 virtual TSparseEntry<ST>* compute_sparse_feature_vector(int32_t num, int32_t& len, TSparseEntry<ST>* target=NULL) 01477 { 01478 len=0; 01479 return NULL; 01480 } 01481 01482 protected: 01483 01485 int32_t num_vectors; 01486 01488 int32_t num_features; 01489 01491 TSparse<ST>* sparse_feature_matrix; 01492 01494 CCache< TSparseEntry<ST> >* feature_cache; 01495 }; 01496 01497 #ifndef DOXYGEN_SHOULD_SKIP_THIS 01498 #define GET_FEATURE_TYPE(sg_type, f_type) \ 01499 template<> inline EFeatureType CSparseFeatures<sg_type>::get_feature_type() \ 01500 { \ 01501 return f_type; \ 01502 } 01503 GET_FEATURE_TYPE(bool, F_BOOL) 01504 GET_FEATURE_TYPE(char, F_CHAR) 01505 GET_FEATURE_TYPE(uint8_t, F_BYTE) 01506 GET_FEATURE_TYPE(int8_t, F_BYTE) 01507 GET_FEATURE_TYPE(int16_t, F_SHORT) 01508 GET_FEATURE_TYPE(uint16_t, F_WORD) 01509 GET_FEATURE_TYPE(int32_t, F_INT) 01510 GET_FEATURE_TYPE(uint32_t, F_UINT) 01511 GET_FEATURE_TYPE(int64_t, F_LONG) 01512 GET_FEATURE_TYPE(uint64_t, F_ULONG) 01513 GET_FEATURE_TYPE(float32_t, F_SHORTREAL) 01514 GET_FEATURE_TYPE(float64_t, F_DREAL) 01515 GET_FEATURE_TYPE(floatmax_t, F_LONGREAL) 01516 #undef GET_FEATURE_TYPE 01517 01518 #define LOAD(fname, sg_type) \ 01519 template<> inline void CSparseFeatures<sg_type>::load(CFile* loader) \ 01520 { \ 01521 SG_SET_LOCALE_C; \ 01522 ASSERT(loader); \ 01523 TSparse<sg_type>* matrix=NULL; \ 01524 int32_t num_feat=0; \ 01525 int32_t num_vec=0; \ 01526 loader->fname(matrix, num_feat, num_vec); \ 01527 set_sparse_feature_matrix(matrix, num_feat, num_vec); \ 01528 SG_RESET_LOCALE; \ 01529 } 01530 LOAD(get_bool_sparsematrix, bool) 01531 LOAD(get_char_sparsematrix, char) 01532 LOAD(get_byte_sparsematrix, uint8_t) 01533 LOAD(get_int8_sparsematrix, int8_t) 01534 LOAD(get_short_sparsematrix, int16_t) 01535 LOAD(get_word_sparsematrix, uint16_t) 01536 LOAD(get_int_sparsematrix, int32_t) 01537 LOAD(get_uint_sparsematrix, uint32_t) 01538 LOAD(get_long_sparsematrix, int64_t) 01539 LOAD(get_ulong_sparsematrix, uint64_t) 01540 LOAD(get_shortreal_sparsematrix, float32_t) 01541 LOAD(get_real_sparsematrix, float64_t) 01542 LOAD(get_longreal_sparsematrix, floatmax_t) 01543 #undef LOAD 01544 01545 #define WRITE(fname, sg_type) \ 01546 template<> inline void CSparseFeatures<sg_type>::save(CFile* writer) \ 01547 { \ 01548 SG_SET_LOCALE_C; \ 01549 ASSERT(writer); \ 01550 writer->fname(sparse_feature_matrix, num_features, num_vectors); \ 01551 SG_RESET_LOCALE; \ 01552 } 01553 WRITE(set_bool_sparsematrix, bool) 01554 WRITE(set_char_sparsematrix, char) 01555 WRITE(set_byte_sparsematrix, uint8_t) 01556 WRITE(set_int8_sparsematrix, int8_t) 01557 WRITE(set_short_sparsematrix, int16_t) 01558 WRITE(set_word_sparsematrix, uint16_t) 01559 WRITE(set_int_sparsematrix, int32_t) 01560 WRITE(set_uint_sparsematrix, uint32_t) 01561 WRITE(set_long_sparsematrix, int64_t) 01562 WRITE(set_ulong_sparsematrix, uint64_t) 01563 WRITE(set_shortreal_sparsematrix, float32_t) 01564 WRITE(set_real_sparsematrix, float64_t) 01565 WRITE(set_longreal_sparsematrix, floatmax_t) 01566 #undef WRITE 01567 #endif // DOXYGEN_SHOULD_SKIP_THIS 01568 } 01569 #endif /* _SPARSEFEATURES__H__ */