SHOGUN v0.9.0
|
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 1999-2009 Soeren Sonnenburg 00008 * Written (W) 1999-2008 Gunnar Raetsch 00009 * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society 00010 */ 00011 00012 #ifndef _CSTRINGFEATURES__H__ 00013 #define _CSTRINGFEATURES__H__ 00014 00015 #include "lib/common.h" 00016 #include "lib/io.h" 00017 #include "lib/Cache.h" 00018 #include "lib/DynamicArray.h" 00019 #include "lib/File.h" 00020 #include "lib/MemoryMappedFile.h" 00021 #include "lib/Mathematics.h" 00022 #include "lib/Compressor.h" 00023 #include "base/Parameter.h" 00024 00025 #include "preproc/PreProc.h" 00026 #include "preproc/StringPreProc.h" 00027 #include "features/Features.h" 00028 #include "features/Alphabet.h" 00029 00030 #include <sys/types.h> 00031 #include <sys/stat.h> 00032 #include <dirent.h> 00033 #include <stdio.h> 00034 #include <stdlib.h> 00035 #include <unistd.h> 00036 00037 namespace shogun 00038 { 00039 class CCompressor; 00040 enum E_COMPRESSION_TYPE; 00041 class CAlphabet; 00042 enum EAlphabet; 00043 template <class T> class CDynamicArray; 00044 class CFile; 00045 template <class T> class CMemoryMappedFile; 00046 class CMath; 00047 template <class ST> class CStringPreProc; 00048 template <class T> class TString; 00049 00050 struct SSKDoubleFeature 00051 { 00052 int feature1; 00053 int feature2; 00054 int group; 00055 }; 00056 00057 struct SSKTripleFeature 00058 { 00059 int feature1; 00060 int feature2; 00061 int feature3; 00062 int group; 00063 }; 00064 00083 template <class ST> class CStringFeatures : public CFeatures 00084 { 00085 public: 00089 CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0), 00090 features(NULL), 
single_string(NULL),length_of_single_string(0), 00091 max_string_length(0), order(0), symbol_mask_table(NULL), 00092 preprocess_on_get(false), feature_cache(NULL) 00093 { 00094 init(); 00095 alphabet=new CAlphabet(); 00096 } 00097 00102 CStringFeatures(EAlphabet alpha) 00103 : CFeatures(0), num_vectors(0), features(NULL), 00104 single_string(NULL),length_of_single_string(0), 00105 max_string_length(0), order(0), symbol_mask_table(NULL), 00106 preprocess_on_get(false), feature_cache(NULL) 00107 { 00108 init(); 00109 00110 alphabet=new CAlphabet(alpha); 00111 SG_REF(alphabet); 00112 num_symbols=alphabet->get_num_symbols(); 00113 original_num_symbols=num_symbols; 00114 } 00115 00123 CStringFeatures(TString<ST>* p_features, int32_t p_num_vectors, 00124 int32_t p_max_string_length, EAlphabet alpha) 00125 : CFeatures(0), num_vectors(0), features(NULL), 00126 single_string(NULL),length_of_single_string(0), 00127 max_string_length(0), order(0), symbol_mask_table(NULL), 00128 preprocess_on_get(false), feature_cache(NULL) 00129 { 00130 init(); 00131 00132 alphabet=new CAlphabet(alpha); 00133 SG_REF(alphabet); 00134 num_symbols=alphabet->get_num_symbols(); 00135 original_num_symbols=num_symbols; 00136 set_features(p_features, p_num_vectors, p_max_string_length); 00137 } 00138 00146 CStringFeatures(TString<ST>* p_features, int32_t p_num_vectors, 00147 int32_t p_max_string_length, CAlphabet* alpha) 00148 : CFeatures(0), num_vectors(0), features(NULL), 00149 single_string(NULL),length_of_single_string(0), 00150 max_string_length(0), order(0), symbol_mask_table(NULL), 00151 preprocess_on_get(false), feature_cache(NULL) 00152 { 00153 init(); 00154 00155 alphabet=new CAlphabet(alpha); 00156 SG_REF(alphabet); 00157 num_symbols=alphabet->get_num_symbols(); 00158 original_num_symbols=num_symbols; 00159 set_features(p_features, p_num_vectors, p_max_string_length); 00160 } 00161 00166 CStringFeatures(CAlphabet* alpha) 00167 : CFeatures(0), num_vectors(0), features(NULL), 00168 
single_string(NULL),length_of_single_string(0), 00169 max_string_length(0), order(0), symbol_mask_table(NULL), 00170 preprocess_on_get(false), feature_cache(NULL) 00171 { 00172 init(); 00173 00174 ASSERT(alpha); 00175 SG_REF(alpha); 00176 alphabet=alpha; 00177 num_symbols=alphabet->get_num_symbols(); 00178 original_num_symbols=num_symbols; 00179 } 00180 00182 CStringFeatures(const CStringFeatures & orig) 00183 : CFeatures(orig), num_vectors(orig.num_vectors), 00184 single_string(orig.single_string), 00185 length_of_single_string(orig.length_of_single_string), 00186 max_string_length(orig.max_string_length), 00187 num_symbols(orig.num_symbols), 00188 original_num_symbols(orig.original_num_symbols), 00189 order(orig.order), preprocess_on_get(false), 00190 feature_cache(NULL) 00191 { 00192 init(); 00193 00194 ASSERT(orig.single_string == NULL); //not implemented 00195 00196 alphabet=orig.alphabet; 00197 SG_REF(alphabet); 00198 00199 if (orig.features) 00200 { 00201 features=new TString<ST>[orig.num_vectors]; 00202 00203 for (int32_t i=0; i<num_vectors; i++) 00204 { 00205 features[i].string=new ST[orig.features[i].length]; 00206 features[i].length=orig.features[i].length; 00207 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length); 00208 } 00209 } 00210 00211 if (orig.symbol_mask_table) 00212 { 00213 symbol_mask_table=new ST[256]; 00214 for (int32_t i=0; i<256; i++) 00215 symbol_mask_table[i]=orig.symbol_mask_table[i]; 00216 } 00217 } 00218 00224 CStringFeatures(CFile* loader, EAlphabet alpha=DNA) 00225 : CFeatures(loader), num_vectors(0), features(NULL), single_string(NULL), 00226 length_of_single_string(0), max_string_length(0), order(0), 00227 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL) 00228 { 00229 init(); 00230 00231 alphabet=new CAlphabet(alpha); 00232 SG_REF(alphabet); 00233 num_symbols=alphabet->get_num_symbols(); 00234 original_num_symbols=num_symbols; 00235 load(loader); 00236 } 00237 00238 virtual 
~CStringFeatures() 00239 { 00240 cleanup(); 00241 00242 SG_UNREF(alphabet); 00243 } 00244 00246 virtual void cleanup() 00247 { 00248 if (single_string) 00249 { 00250 delete[] single_string; 00251 single_string=NULL; 00252 } 00253 else 00254 { 00255 for (int32_t i=0; i<num_vectors; i++) 00256 cleanup_feature_vector(i); 00257 } 00258 00259 num_vectors=0; 00260 delete[] features; 00261 delete[] symbol_mask_table; 00262 features=NULL; 00263 symbol_mask_table=NULL; 00264 00265 /* start with a fresh alphabet, but instead of emptying the histogram 00266 * create a new object (to leave the alphabet object alone if it is used 00267 * by others) 00268 */ 00269 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 00270 SG_UNREF(alphabet); 00271 alphabet=alpha; 00272 SG_REF(alphabet); 00273 } 00274 00276 virtual void cleanup_feature_vector(int32_t num) 00277 { 00278 ASSERT(num<num_vectors); 00279 if (features) 00280 { 00281 delete[] features[num].string; 00282 features[num].string=NULL; 00283 features[num].length=0; 00284 } 00285 } 00286 00291 inline virtual EFeatureClass get_feature_class() { return C_STRING; } 00292 00297 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; } 00298 00303 inline CAlphabet* get_alphabet() 00304 { 00305 SG_REF(alphabet); 00306 return alphabet; 00307 } 00308 00313 virtual CFeatures* duplicate() const 00314 { 00315 return new CStringFeatures<ST>(*this); 00316 } 00317 00324 void get_feature_vector(ST** dst, int32_t* len, int32_t num) 00325 { 00326 ASSERT(features); 00327 if (num>=num_vectors) 00328 { 00329 SG_ERROR("Index out of bounds (number of strings %d, you " 00330 "requested %d)\n", num_vectors, num); 00331 } 00332 00333 int32_t l; 00334 bool free_vec; 00335 ST* vec=get_feature_vector(num, l, free_vec); 00336 *len=l; 00337 *dst=(ST*) malloc(*len * sizeof(ST)); 00338 ASSERT(*dst); 00339 memcpy(*dst, vec, *len * sizeof(ST)); 00340 free_feature_vector(vec, num, free_vec); 00341 } 00342 00349 void set_feature_vector(ST* src, 
int32_t len, int32_t num) 00350 { 00351 ASSERT(features); 00352 if (num>=num_vectors) 00353 { 00354 SG_ERROR("Index out of bounds (number of strings %d, you " 00355 "requested %d)\n", num_vectors, num); 00356 } 00357 00358 if (len<=0) 00359 SG_ERROR("String has zero or negative length\n"); 00360 00361 00362 cleanup_feature_vector(num); 00363 features[num].length=len; 00364 features[num].string=new ST[len]; 00365 memcpy(features[num].string, src, len*sizeof(ST)); 00366 00367 determine_maximum_string_length(); 00368 } 00369 00372 void enable_on_the_fly_preprocessing() 00373 { 00374 preprocess_on_get=true; 00375 } 00376 00380 void disable_on_the_fly_preprocessing() 00381 { 00382 preprocess_on_get=false; 00383 } 00384 00393 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree) 00394 { 00395 ASSERT(features); 00396 ASSERT(num<num_vectors); 00397 00398 if (!preprocess_on_get) 00399 { 00400 dofree=false; 00401 len=features[num].length; 00402 return features[num].string; 00403 } 00404 else 00405 { 00406 SG_DEBUG( "computing feature vector!\n") ; 00407 ST* feat=compute_feature_vector(num, len); 00408 dofree=true; 00409 00410 if (get_num_preproc()) 00411 { 00412 ST* tmp_feat_before = feat; 00413 00414 for (int32_t i=0; i<get_num_preproc(); i++) 00415 { 00416 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i); 00417 feat=p->apply_to_string(tmp_feat_before, len); 00418 SG_UNREF(p); 00419 delete[] tmp_feat_before; 00420 tmp_feat_before=feat; 00421 } 00422 } 00423 // TODO: implement caching 00424 return feat; 00425 } 00426 } 00427 00432 CStringFeatures<ST>* get_transposed() 00433 { 00434 int32_t num_feat; 00435 int32_t num_vec; 00436 TString<ST>* s=get_transposed(num_feat, num_vec); 00437 00438 return new CStringFeatures<ST>(s, num_vec, num_feat, alphabet); 00439 } 00440 00452 TString<ST>* get_transposed(int32_t &num_feat, int32_t &num_vec) 00453 { 00454 num_feat=num_vectors; 00455 num_vec=get_max_vector_length(); 00456 ASSERT(have_same_length()); 00457 00458 
SG_DEBUG("Allocating memory for transposed string features of size %ld\n", 00459 int64_t(num_feat)*num_vec); 00460 00461 TString<ST>* sf=new TString<ST>[num_vec]; 00462 00463 for (int32_t i=0; i<num_vec; i++) 00464 { 00465 sf[i].string=new ST[num_feat]; 00466 sf[i].length=num_feat; 00467 } 00468 00469 for (int32_t i=0; i<num_feat; i++) 00470 { 00471 int32_t len=0; 00472 bool free_vec=false; 00473 ST* vec=get_feature_vector(i, len, free_vec); 00474 00475 for (int32_t j=0; j<num_vec; j++) 00476 sf[j].string[i]=vec[j]; 00477 00478 free_feature_vector(vec, i, free_vec); 00479 } 00480 return sf; 00481 } 00482 00489 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree) 00490 { 00491 if (feature_cache) 00492 feature_cache->unlock_entry(num); 00493 00494 if (dofree) 00495 delete[] feat_vec ; 00496 } 00497 00504 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num) 00505 { 00506 int32_t len; 00507 bool free_vec; 00508 ST* vec=get_feature_vector(vec_num, len, free_vec); 00509 ASSERT(feat_num<len); 00510 ST result=vec[feat_num]; 00511 free_feature_vector(vec, vec_num, free_vec); 00512 00513 return result; 00514 } 00515 00521 virtual inline int32_t get_vector_length(int32_t vec_num) 00522 { 00523 int32_t len; 00524 bool free_vec; 00525 ST* vec=get_feature_vector(vec_num, len, free_vec); 00526 free_feature_vector(vec, vec_num, free_vec); 00527 return len; 00528 } 00529 00534 virtual inline int32_t get_max_vector_length() 00535 { 00536 return max_string_length; 00537 } 00538 00543 virtual inline int32_t get_num_vectors() { return num_vectors; } 00544 00551 inline floatmax_t get_num_symbols() { return num_symbols; } 00552 00560 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); } 00561 00562 // these functions are necessary to find out about a former conversion process 00563 00568 inline floatmax_t get_original_num_symbols() { return original_num_symbols; } 00569 00574 inline int32_t get_order() { return order; } 00575 00583 inline 
ST get_masked_symbols(ST symbol, uint8_t mask) 00584 { 00585 ASSERT(symbol_mask_table); 00586 return symbol_mask_table[mask] & symbol; 00587 } 00588 00595 inline ST shift_offset(ST offset, int32_t amount) 00596 { 00597 ASSERT(alphabet); 00598 return (offset << (amount*alphabet->get_num_bits())); 00599 } 00600 00607 inline ST shift_symbol(ST symbol, int32_t amount) 00608 { 00609 ASSERT(alphabet); 00610 return (symbol >> (amount*alphabet->get_num_bits())); 00611 } 00612 00617 virtual inline void load(CFile* loader); 00618 00627 void load_ascii_file(char* fname, bool remap_to_bin=true, 00628 EAlphabet ascii_alphabet=DNA, EAlphabet binary_alphabet=RAWDNA) 00629 { 00630 size_t blocksize=1024*1024; 00631 size_t required_blocksize=0; 00632 uint8_t* dummy=new uint8_t[blocksize]; 00633 uint8_t* overflow=NULL; 00634 int32_t overflow_len=0; 00635 00636 cleanup(); 00637 00638 CAlphabet* alpha=new CAlphabet(ascii_alphabet); 00639 CAlphabet* alpha_bin=new CAlphabet(binary_alphabet); 00640 00641 FILE* f=fopen(fname, "ro"); 00642 00643 if (f) 00644 { 00645 num_vectors=0; 00646 max_string_length=0; 00647 00648 SG_INFO("counting line numbers in file %s\n", fname); 00649 size_t block_offs=0; 00650 size_t old_block_offs=0; 00651 fseek(f, 0, SEEK_END); 00652 size_t fsize=ftell(f); 00653 rewind(f); 00654 00655 if (blocksize>fsize) 00656 blocksize=fsize; 00657 00658 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize); 00659 00660 size_t sz=blocksize; 00661 while (sz == blocksize) 00662 { 00663 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00664 bool contains_cr=false; 00665 for (size_t i=0; i<sz; i++) 00666 { 00667 block_offs++; 00668 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00669 { 00670 num_vectors++; 00671 contains_cr=true; 00672 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs); 00673 old_block_offs=block_offs; 00674 } 00675 } 00676 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t"); 00677 } 00678 00679 SG_INFO("found %d strings\n", 
num_vectors); 00680 delete[] dummy; 00681 blocksize=required_blocksize; 00682 dummy = new uint8_t[blocksize]; 00683 overflow = new uint8_t[blocksize]; 00684 features=new TString<ST>[num_vectors]; 00685 00686 rewind(f); 00687 sz=blocksize; 00688 int32_t lines=0; 00689 while (sz == blocksize) 00690 { 00691 sz=fread(dummy, sizeof(uint8_t), blocksize, f); 00692 00693 size_t old_sz=0; 00694 for (size_t i=0; i<sz; i++) 00695 { 00696 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize)) 00697 { 00698 int32_t len=i-old_sz; 00699 //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz); 00700 max_string_length=CMath::max(max_string_length, len+overflow_len); 00701 00702 features[lines].length=len; 00703 features[lines].string=new ST[len]; 00704 00705 if (remap_to_bin) 00706 { 00707 for (int32_t j=0; j<overflow_len; j++) 00708 features[lines].string[j]=alpha->remap_to_bin(overflow[j]); 00709 for (int32_t j=0; j<len; j++) 00710 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]); 00711 alpha->add_string_to_histogram(&dummy[old_sz], len); 00712 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length); 00713 } 00714 else 00715 { 00716 for (int32_t j=0; j<overflow_len; j++) 00717 features[lines].string[j]=overflow[j]; 00718 for (int32_t j=0; j<len; j++) 00719 features[lines].string[j+overflow_len]=dummy[old_sz+j]; 00720 alpha->add_string_to_histogram(&dummy[old_sz], len); 00721 alpha->add_string_to_histogram(features[lines].string, features[lines].length); 00722 } 00723 00724 // clear overflow 00725 overflow_len=0; 00726 00727 //CMath::display_vector(features[lines].string, len); 00728 old_sz=i+1; 00729 lines++; 00730 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t"); 00731 } 00732 } 00733 for (size_t i=old_sz; i<sz; i++) 00734 overflow[i-old_sz]=dummy[i]; 00735 00736 overflow_len=sz-old_sz; 00737 } 00738 00739 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 00740 { 00741 SG_INFO("file successfully read\n"); 00742 
SG_INFO("max_string_length=%d\n", max_string_length); 00743 SG_INFO("num_strings=%d\n", num_vectors); 00744 } 00745 fclose(f); 00746 } 00747 00748 delete[] dummy; 00749 00750 SG_UNREF(alphabet); 00751 00752 if (remap_to_bin) 00753 alphabet = alpha_bin; 00754 else 00755 alphabet = alpha; 00756 SG_REF(alphabet); 00757 num_symbols=alphabet->get_num_symbols(); 00758 } 00759 00766 bool load_fasta_file(const char* fname, bool ignore_invalid=false) 00767 { 00768 int32_t i=0; 00769 uint64_t len=0; 00770 uint64_t offs=0; 00771 int32_t num=0; 00772 int32_t max_len=0; 00773 00774 CMemoryMappedFile<char> f(fname); 00775 00776 while (true) 00777 { 00778 char* s=f.get_line(len, offs); 00779 if (!s) 00780 break; 00781 00782 if (len>0 && s[0]=='>') 00783 num++; 00784 } 00785 00786 if (num==0) 00787 SG_ERROR("No fasta hunks (lines starting with '>') found\n"); 00788 00789 cleanup(); 00790 SG_UNREF(alphabet); 00791 alphabet=new CAlphabet(DNA); 00792 num_symbols=alphabet->get_num_symbols(); 00793 00794 TString<ST>* strings=new TString<ST>[num]; 00795 offs=0; 00796 00797 for (i=0;i<num; i++) 00798 { 00799 uint64_t id_len=0; 00800 char* id=f.get_line(id_len, offs); 00801 00802 char* fasta=f.get_line(len, offs); 00803 char* s=fasta; 00804 int32_t fasta_len=0; 00805 int32_t spanned_lines=0; 00806 00807 while (true) 00808 { 00809 if (!s || len==0) 00810 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len); 00811 00812 if (s[0]=='>' || offs==f.get_size()) 00813 { 00814 offs-=len+1; // seek to beginning 00815 if (offs==f.get_size()) 00816 { 00817 SG_DEBUG("at EOF\n"); 00818 fasta_len+=len; 00819 } 00820 00821 len = fasta_len-spanned_lines; 00822 strings[i].string=new ST[len]; 00823 strings[i].length=len; 00824 00825 ST* str=strings[i].string; 00826 int32_t idx=0; 00827 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines); 00828 00829 for (int32_t j=0; j<fasta_len; j++) 00830 { 00831 if (fasta[j]=='\n') 00832 continue; 
00833 00834 ST c = (ST) fasta[j]; 00835 00836 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j])) 00837 c = (ST) 'A'; 00838 00839 if (idx>=len) 00840 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str); 00841 str[idx++]=c; 00842 } 00843 max_len=CMath::max(max_len, strings[i].length); 00844 00845 00846 break; 00847 } 00848 00849 spanned_lines++; 00850 fasta_len+=len+1; // including '\n' 00851 s=f.get_line(len, offs); 00852 } 00853 } 00854 00855 return set_features(strings, num, max_len); 00856 } 00857 00865 bool load_fastq_file(const char* fname, 00866 bool ignore_invalid=false, bool bitremap_in_single_string=false) 00867 { 00868 CMemoryMappedFile<char> f(fname); 00869 00870 int32_t i=0; 00871 uint64_t len=0; 00872 uint64_t offs=0; 00873 00874 int32_t num=f.get_num_lines(); 00875 int32_t max_len=0; 00876 00877 if (num%4) 00878 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n"); 00879 num/=4; 00880 00881 cleanup(); 00882 SG_UNREF(alphabet); 00883 alphabet=new CAlphabet(DNA); 00884 00885 TString<ST>* strings; 00886 00887 ST* str; 00888 if (bitremap_in_single_string) 00889 { 00890 strings=new TString<ST>[1]; 00891 strings[0].string=new ST[num]; 00892 strings[0].length=num; 00893 f.get_line(len, offs); 00894 f.get_line(len, offs); 00895 order=len; 00896 max_len=num; 00897 offs=0; 00898 original_num_symbols=alphabet->get_num_symbols(); 00899 int32_t max_val=alphabet->get_num_bits(); 00900 str=new ST[len]; 00901 } 00902 else 00903 strings=new TString<ST>[num]; 00904 00905 for (i=0;i<num; i++) 00906 { 00907 if (!f.get_line(len, offs)) 00908 SG_ERROR("Error reading 'read' identifier in line %d", 4*i); 00909 00910 char* s=f.get_line(len, offs); 00911 if (!s || len==0) 00912 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len); 00913 00914 if (bitremap_in_single_string) 00915 { 00916 if (len!=order) 00917 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len); 
00918 for (int32_t j=0; j<order; j++) 00919 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]); 00920 00921 strings[0].string[i]=embed_word(str, order); 00922 } 00923 else 00924 { 00925 strings[i].string=new ST[len]; 00926 strings[i].length=len; 00927 str=strings[i].string; 00928 00929 if (ignore_invalid) 00930 { 00931 for (int32_t j=0; j<len; j++) 00932 { 00933 if (alphabet->is_valid((uint8_t) s[j])) 00934 str[j]= (ST) s[j]; 00935 else 00936 str[j]= (ST) 'A'; 00937 } 00938 } 00939 else 00940 { 00941 for (int32_t j=0; j<len; j++) 00942 str[j]= (ST) s[j]; 00943 } 00944 max_len=CMath::max(max_len, (int32_t) len); 00945 } 00946 00947 00948 if (!f.get_line(len, offs)) 00949 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2); 00950 00951 if (!f.get_line(len, offs)) 00952 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3); 00953 } 00954 00955 if (bitremap_in_single_string) 00956 num=1; 00957 00958 num_vectors=num; 00959 max_string_length=max_len; 00960 features=strings; 00961 00962 return true; 00963 } 00964 00970 bool load_from_directory(char* dirname) 00971 { 00972 struct dirent **namelist; 00973 int32_t n; 00974 00975 IO::set_dirname(dirname); 00976 00977 SG_DEBUG("dirname '%s'\n", dirname); 00978 00979 n = scandir(dirname, &namelist, &IO::filter, alphasort); 00980 if (n <= 0) 00981 { 00982 SG_ERROR("error calling scandir - no files found\n"); 00983 return false; 00984 } 00985 else 00986 { 00987 TString<ST>* strings=NULL; 00988 00989 int32_t num=0; 00990 int32_t max_len=-1; 00991 00992 //usually n==num_vec, but it might not in race conditions 00993 //(file perms modified, file erased) 00994 strings=new TString<ST>[n]; 00995 00996 for (int32_t i=0; i<n; i++) 00997 { 00998 char* fname=IO::concat_filename(namelist[i]->d_name); 00999 01000 struct stat s; 01001 off_t filesize=0; 01002 01003 if (!stat(fname, &s) && s.st_size>0) 01004 { 01005 filesize=s.st_size/sizeof(ST); 01006 01007 FILE* f=fopen(fname, "ro"); 01008 if (f) 01009 { 01010 ST* 
str=new ST[filesize]; 01011 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize); 01012 fread(str, sizeof(ST), filesize, f); 01013 strings[num].string=str; 01014 strings[num].length=filesize; 01015 max_len=CMath::max(max_len, strings[num].length); 01016 01017 num++; 01018 fclose(f); 01019 } 01020 } 01021 else 01022 SG_ERROR("empty or non readable file \'%s\'\n", fname); 01023 01024 free(namelist[i]); 01025 } 01026 free(namelist); 01027 01028 if (num>0 && strings) 01029 { 01030 set_features(strings, num, max_len); 01031 return true; 01032 } 01033 } 01034 return false; 01035 } 01036 01044 bool set_features(TString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 01045 { 01046 if (p_features) 01047 { 01048 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 01049 01050 //compute histogram for char/byte 01051 for (int32_t i=0; i<p_num_vectors; i++) 01052 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length); 01053 01054 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 01055 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 01056 01057 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 01058 { 01059 cleanup(); 01060 SG_UNREF(alphabet); 01061 01062 alphabet=alpha; 01063 SG_REF(alphabet); 01064 01065 this->features=p_features; 01066 this->num_vectors=p_num_vectors; 01067 this->max_string_length=p_max_string_length; 01068 01069 return true; 01070 } 01071 else 01072 SG_UNREF(alpha); 01073 } 01074 01075 return false; 01076 } 01077 01083 bool append_features(CStringFeatures<ST>* sf) 01084 { 01085 ASSERT(sf); 01086 TString<ST>* new_features = new TString<ST>[sf->num_vectors]; 01087 01088 for (int32_t i=0; i<sf->num_vectors; i++) 01089 { 01090 int32_t length=sf->features[i].length; 01091 new_features[i].string=new ST[length]; 01092 memcpy(new_features[i].string, sf->features[i].string, length); 01093 new_features[i].length=length; 01094 } 01095 return 
append_features(new_features, sf->num_vectors, 01096 sf->max_string_length); 01097 } 01098 01109 bool append_features(TString<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length) 01110 { 01111 if (!features) 01112 return set_features(p_features, p_num_vectors, p_max_string_length); 01113 01114 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet()); 01115 01116 //compute histogram for char/byte 01117 for (int32_t i=0; i<p_num_vectors; i++) 01118 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length); 01119 01120 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram()); 01121 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram()); 01122 01123 if (alpha->check_alphabet_size() && alpha->check_alphabet()) 01124 { 01125 SG_UNREF(alpha); 01126 for (int32_t i=0; i<p_num_vectors; i++) 01127 alphabet->add_string_to_histogram( p_features[i].string, p_features[i].length); 01128 01129 int32_t old_num_vectors=num_vectors; 01130 num_vectors=old_num_vectors+p_num_vectors; 01131 TString<ST>* new_features = new TString<ST>[num_vectors]; 01132 01133 for (int32_t i=0; i<num_vectors; i++) 01134 { 01135 if (i<old_num_vectors) 01136 { 01137 new_features[i].string=features[i].string; 01138 new_features[i].length=features[i].length; 01139 } 01140 else 01141 { 01142 new_features[i].string=p_features[i-old_num_vectors].string; 01143 new_features[i].length=p_features[i-old_num_vectors].length; 01144 } 01145 } 01146 delete[] features; 01147 delete[] p_features; // free now obsolete features 01148 01149 this->features=new_features; 01150 this->max_string_length=CMath::max(max_string_length, p_max_string_length); 01151 01152 return true; 01153 } 01154 SG_UNREF(alpha); 01155 01156 return false; 01157 } 01158 01165 virtual TString<ST>* get_features(int32_t& num_str, int32_t& max_str_len) 01166 { 01167 num_str=num_vectors; 01168 max_str_len=max_string_length; 01169 return features; 01170 } 01171 01178 virtual 
TString<ST>* copy_features(int32_t& num_str, int32_t& max_str_len) 01179 { 01180 ASSERT(num_vectors>0); 01181 01182 num_str=num_vectors; 01183 max_str_len=max_string_length; 01184 TString<ST>* new_feat=new TString<ST>[num_str]; 01185 01186 for (int32_t i=0; i<num_str; i++) 01187 { 01188 int32_t len; 01189 bool free_vec; 01190 ST* vec=get_feature_vector(i, len, free_vec); 01191 new_feat[i].string=new ST[len]; 01192 new_feat[i].length=len; 01193 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST)); 01194 free_feature_vector(vec, i, free_vec); 01195 } 01196 01197 return new_feat; 01198 } 01199 01205 virtual void get_features(TString<ST>** dst, int32_t* num_str) 01206 { 01207 int32_t num_vec; 01208 int32_t max_str_len; 01209 *dst=copy_features(num_vec, max_str_len); 01210 *num_str=num_vec; 01211 } 01212 01217 virtual inline void save(CFile* writer); 01218 01225 virtual bool load_compressed(char* src, bool decompress) 01226 { 01227 FILE* file=NULL; 01228 01229 if (!(file=fopen(src, "r"))) 01230 return false; 01231 cleanup(); 01232 01233 // header shogun v0 01234 char id[4]; 01235 fread(&id[0], sizeof(char), 1, file); 01236 ASSERT(id[0]=='S'); 01237 fread(&id[1], sizeof(char), 1, file); 01238 ASSERT(id[1]=='G'); 01239 fread(&id[2], sizeof(char), 1, file); 01240 ASSERT(id[2]=='V'); 01241 fread(&id[3], sizeof(char), 1, file); 01242 ASSERT(id[3]=='0'); 01243 01244 //compression type 01245 uint8_t c; 01246 fread(&c, sizeof(uint8_t), 1, file); 01247 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c); 01248 //alphabet 01249 uint8_t a; 01250 delete alphabet; 01251 fread(&a, sizeof(uint8_t), 1, file); 01252 alphabet=new CAlphabet((EAlphabet) a); 01253 // number of vectors 01254 fread(&num_vectors, sizeof(int32_t), 1, file); 01255 ASSERT(num_vectors>0); 01256 // maximum string length 01257 fread(&max_string_length, sizeof(int32_t), 1, file); 01258 ASSERT(max_string_length>0); 01259 01260 features=new TString<ST>[num_vectors]; 01261 01262 // vectors 01263 for 
(int32_t i=0; i<num_vectors; i++) 01264 { 01265 // vector len compressed 01266 int32_t len_compressed; 01267 fread(&len_compressed, sizeof(int32_t), 1, file); 01268 // vector len uncompressed 01269 int32_t len_uncompressed; 01270 fread(&len_uncompressed, sizeof(int32_t), 1, file); 01271 01272 // vector raw data 01273 if (decompress) 01274 { 01275 features[i].string=new ST[len_uncompressed]; 01276 features[i].length=len_uncompressed; 01277 uint8_t* compressed=new uint8_t[len_compressed]; 01278 fread(compressed, len_compressed, 1, file); 01279 uint64_t uncompressed_size=len_uncompressed; 01280 uncompressed_size*=sizeof(ST); 01281 compressor->decompress(compressed, len_compressed, 01282 (uint8_t*) features[i].string, uncompressed_size); 01283 delete[] compressed; 01284 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST)); 01285 } 01286 else 01287 { 01288 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST)); 01289 features[i].string=new ST[len_compressed+offs]; 01290 features[i].length=len_compressed+offs; 01291 int32_t* feat32ptr=((int32_t*) (features[i].string)); 01292 memset(features[i].string, 0, offs*sizeof(ST)); 01293 feat32ptr[0]=(int32_t) len_compressed; 01294 feat32ptr[1]=(int32_t) len_uncompressed; 01295 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]); 01296 fread(compressed, len_compressed, 1, file); 01297 } 01298 } 01299 01300 delete compressor; 01301 fclose(file); 01302 return false; 01303 } 01304 01312 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level) 01313 { 01314 FILE* file=NULL; 01315 01316 if (!(file=fopen(dest, "wb"))) 01317 return false; 01318 01319 CCompressor* compressor= new CCompressor(compression); 01320 01321 // header shogun v0 01322 const char* id="SGV0"; 01323 fwrite(&id[0], sizeof(char), 1, file); 01324 fwrite(&id[1], sizeof(char), 1, file); 01325 fwrite(&id[2], sizeof(char), 1, file); 01326 fwrite(&id[3], sizeof(char), 1, file); 01327 01328 //compression type 01329 uint8_t 
c=(uint8_t) compression; 01330 fwrite(&c, sizeof(uint8_t), 1, file); 01331 //alphabet 01332 uint8_t a=(uint8_t) alphabet->get_alphabet(); 01333 fwrite(&a, sizeof(uint8_t), 1, file); 01334 // number of vectors 01335 fwrite(&num_vectors, sizeof(int32_t), 1, file); 01336 // maximum string length 01337 fwrite(&max_string_length, sizeof(int32_t), 1, file); 01338 01339 // vectors 01340 for (int32_t i=0; i<num_vectors; i++) 01341 { 01342 int32_t len=-1; 01343 bool vfree; 01344 ST* vec=get_feature_vector(i, len, vfree); 01345 01346 uint8_t* compressed=NULL; 01347 uint64_t compressed_size=0; 01348 01349 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST), 01350 compressed, compressed_size, level); 01351 01352 int32_t len_compressed = (int32_t) compressed_size; 01353 // vector len compressed in bytes 01354 fwrite(&len_compressed, sizeof(int32_t), 1, file); 01355 // vector len uncompressed in number of elements of type ST 01356 fwrite(&len, sizeof(int32_t), 1, file); 01357 // vector raw data 01358 fwrite(compressed, compressed_size, 1, file); 01359 delete[] compressed; 01360 01361 free_feature_vector(vec, i, vfree); 01362 } 01363 01364 delete compressor; 01365 fclose(file); 01366 return true; 01367 } 01368 01369 01374 virtual int32_t get_size() { return sizeof(ST); } 01375 01381 virtual bool apply_preproc(bool force_preprocessing=false) 01382 { 01383 SG_DEBUG( "force: %d\n", force_preprocessing); 01384 01385 for (int32_t i=0; i<get_num_preproc(); i++) 01386 { 01387 if ( (!is_preprocessed(i) || force_preprocessing) ) 01388 { 01389 set_preprocessed(i); 01390 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i); 01391 SG_INFO( "preprocessing using preproc %s\n", p->get_name()); 01392 01393 if (!p->apply_to_string_features(this)) 01394 { 01395 SG_UNREF(p); 01396 return false; 01397 } 01398 else 01399 SG_UNREF(p); 01400 } 01401 } 01402 return true; 01403 } 01404 01414 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0) 
01415 { 01416 ASSERT(step_size>0); 01417 ASSERT(window_size>0); 01418 ASSERT(num_vectors==1 || single_string); 01419 ASSERT(max_string_length>=window_size || 01420 (single_string && length_of_single_string>=window_size)); 01421 01422 //in case we are dealing with a single remapped string 01423 //allow remapping 01424 if (single_string) 01425 num_vectors= (length_of_single_string-window_size)/step_size + 1; 01426 else if (num_vectors==1) 01427 { 01428 num_vectors= (max_string_length-window_size)/step_size + 1; 01429 length_of_single_string=max_string_length; 01430 } 01431 01432 TString<ST>* f=new TString<ST>[num_vectors]; 01433 int32_t offs=0; 01434 for (int32_t i=0; i<num_vectors; i++) 01435 { 01436 f[i].string=&features[0].string[offs+skip]; 01437 f[i].length=window_size-skip; 01438 offs+=step_size; 01439 } 01440 single_string=features[0].string; 01441 delete[] features; 01442 features=f; 01443 max_string_length=window_size-skip; 01444 01445 return num_vectors; 01446 } 01447 01456 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0) 01457 { 01458 ASSERT(positions); 01459 ASSERT(window_size>0); 01460 ASSERT(num_vectors==1 || single_string); 01461 ASSERT(max_string_length>=window_size || 01462 (single_string && length_of_single_string>=window_size)); 01463 01464 num_vectors= positions->get_num_elements(); 01465 ASSERT(num_vectors>0); 01466 01467 int32_t len; 01468 01469 //in case we are dealing with a single remapped string 01470 //allow remapping 01471 if (single_string) 01472 len=length_of_single_string; 01473 else 01474 { 01475 single_string=features[0].string; 01476 len=max_string_length; 01477 length_of_single_string=max_string_length; 01478 } 01479 01480 TString<ST>* f=new TString<ST>[num_vectors]; 01481 for (int32_t i=0; i<num_vectors; i++) 01482 { 01483 int32_t p=positions->get_element(i); 01484 01485 if (p>=0 && p<=len-window_size) 01486 { 01487 f[i].string=&features[0].string[p+skip]; 01488 
f[i].length=window_size-skip; 01489 } 01490 else 01491 { 01492 num_vectors=1; 01493 max_string_length=len; 01494 features[0].length=len; 01495 single_string=NULL; 01496 delete[] f; 01497 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n", 01498 window_size, i, p, len); 01499 return -1; 01500 } 01501 } 01502 01503 delete[] features; 01504 features=f; 01505 max_string_length=window_size-skip; 01506 01507 return num_vectors; 01508 } 01509 01521 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01522 { 01523 return obtain_from_char_features(sf, start, p_order, gap, rev); 01524 } 01525 01535 template <class CT> 01536 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev) 01537 { 01538 ASSERT(sf); 01539 01540 CAlphabet* alpha=sf->get_alphabet(); 01541 ASSERT(alpha->get_num_symbols_in_histogram() > 0); 01542 01543 this->order=p_order; 01544 cleanup(); 01545 01546 num_vectors=sf->get_num_vectors(); 01547 ASSERT(num_vectors>0); 01548 max_string_length=sf->get_max_vector_length()-start; 01549 features=new TString<ST>[num_vectors]; 01550 01551 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(), 01552 alpha->get_num_symbols_in_histogram()); 01553 01554 for (int32_t i=0; i<num_vectors; i++) 01555 { 01556 int32_t len=-1; 01557 bool vfree; 01558 CT* c=sf->get_feature_vector(i, len, vfree); 01559 ASSERT(!vfree); // won't work when preprocessors are attached 01560 01561 features[i].string=new ST[len]; 01562 features[i].length=len; 01563 01564 ST* str=features[i].string; 01565 for (int32_t j=0; j<len; j++) 01566 str[j]=(ST) alpha->remap_to_bin(c[j]); 01567 } 01568 01569 original_num_symbols=alpha->get_num_symbols(); 01570 int32_t max_val=alpha->get_num_bits(); 01571 01572 SG_UNREF(alpha); 01573 01574 if (p_order>1) 01575 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) 
max_val*p_order); 01576 else 01577 num_symbols=original_num_symbols; 01578 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01579 01580 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01581 { 01582 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 01583 return false; 01584 } 01585 01586 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ; 01587 for (int32_t line=0; line<num_vectors; line++) 01588 { 01589 int32_t len=0; 01590 bool vfree; 01591 ST* fv=get_feature_vector(line, len, vfree); 01592 ASSERT(!vfree); // won't work when preprocessors are attached 01593 01594 if (rev) 01595 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap); 01596 else 01597 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap); 01598 01599 /* fix the length of the string -- hacky */ 01600 features[line].length-=start+gap ; 01601 if (features[line].length<0) 01602 features[line].length=0 ; 01603 } 01604 01605 compute_symbol_mask_table(max_val); 01606 01607 return true; 01608 } 01609 01617 bool have_same_length(int32_t len=-1) 01618 { 01619 if (len!=-1) 01620 { 01621 if (len!=get_max_vector_length()) 01622 return false; 01623 } 01624 len = get_max_vector_length(); 01625 01626 for (int32_t i=0; i<num_vectors; i++) 01627 { 01628 if (get_vector_length(i)!=len) 01629 return false; 01630 } 01631 01632 return true; 01633 } 01634 01639 inline void embed_features(int32_t p_order) 01640 { 01641 ASSERT(alphabet->get_num_symbols_in_histogram() > 0); 01642 01643 order=p_order; 01644 original_num_symbols=alphabet->get_num_symbols(); 01645 int32_t max_val=alphabet->get_num_bits(); 01646 01647 if (p_order>1) 01648 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order); 01649 else 01650 num_symbols=original_num_symbols; 01651 01652 
SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols); 01653 01654 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) ) 01655 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val); 01656 01657 ST mask=0; 01658 for (int32_t i=0; i<p_order*max_val; i++) 01659 mask= (mask<<1) | ((ST) 1); 01660 01661 for (int32_t i=0; i<num_vectors; i++) 01662 { 01663 int32_t len=features[i].length; 01664 01665 if (len < p_order) 01666 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order); 01667 01668 ST* str = features[i].string; 01669 01670 // convert first word 01671 for (int32_t j=0; j<p_order; j++) 01672 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01673 str[0]=embed_word(&str[0], p_order); 01674 01675 // convert the rest 01676 int32_t idx=0; 01677 for (int32_t j=p_order; j<len; j++) 01678 { 01679 str[j]=(ST) alphabet->remap_to_bin(str[j]); 01680 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask; 01681 idx++; 01682 } 01683 01684 features[i].length=len-p_order+1; 01685 } 01686 01687 compute_symbol_mask_table(max_val); 01688 } 01689 01694 inline void compute_symbol_mask_table(int64_t max_val) 01695 { 01696 delete[] symbol_mask_table; 01697 symbol_mask_table=new ST[256]; 01698 01699 uint64_t mask=0; 01700 for (int32_t i=0; i< (int64_t) max_val; i++) 01701 mask=(mask<<1) | 1; 01702 01703 for (int32_t i=0; i<256; i++) 01704 { 01705 uint8_t bits=(uint8_t) i; 01706 symbol_mask_table[i]=0; 01707 01708 for (int32_t j=0; j<8; j++) 01709 { 01710 if (bits & 1) 01711 symbol_mask_table[i]|=mask<<(max_val*j); 01712 01713 bits>>=1; 01714 } 01715 } 01716 } 01717 01724 inline void unembed_word(ST word, uint8_t* seq, int32_t len) 01725 { 01726 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01727 01728 ST mask=0; 01729 for (int32_t i=0; i<nbits; i++) 01730 mask=(mask<<1) | (ST) 1; 01731 01732 for (int32_t i=0; i<len; i++) 01733 { 01734 ST 
w=(word & mask); 01735 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w); 01736 word>>=nbits; 01737 } 01738 } 01739 01745 inline ST embed_word(ST* seq, int32_t len) 01746 { 01747 ST value=(ST) 0; 01748 uint32_t nbits= (uint32_t) alphabet->get_num_bits(); 01749 for (int32_t i=0; i<len; i++) 01750 { 01751 value<<=nbits; 01752 value|=seq[i]; 01753 } 01754 01755 return value; 01756 } 01757 01760 void determine_maximum_string_length() 01761 { 01762 max_string_length=0; 01763 01764 for (int32_t i=0; i<num_vectors; i++) 01765 max_string_length=CMath::max(max_string_length, features[i].length); 01766 } 01767 01775 static ST* get_zero_terminated_string_copy(TString<ST> str) 01776 { 01777 int32_t l=str.length; 01778 ST* s=new ST[l+1]; 01779 memcpy(s, str.string, sizeof(ST)*l); 01780 s[l]='\0'; 01781 return s; 01782 } 01783 01790 virtual void set_feature_vector(int32_t num, ST* string, int32_t len) 01791 { 01792 ASSERT(features); 01793 ASSERT(num<num_vectors); 01794 01795 features[num].length=len ; 01796 features[num].string=string ; 01797 01798 max_string_length=CMath::max(len, max_string_length); 01799 } 01800 01801 01804 virtual void get_histogram(float64_t** hist, int32_t* rows, int32_t* cols, bool normalize=true) 01805 { 01806 int32_t nsym=get_num_symbols(); 01807 int32_t slen=get_max_vector_length(); 01808 int64_t sz=int64_t(nsym)*slen*sizeof(float64_t); 01809 float64_t* h= (float64_t*) malloc(sz); 01810 ASSERT(h); 01811 memset(h, 0, sz); 01812 01813 float64_t* h_normalizer=new float64_t[slen]; 01814 memset(h_normalizer, 0, slen*sizeof(float64_t)); 01815 int32_t num_str=get_num_vectors(); 01816 for (int32_t i=0; i<num_str; i++) 01817 { 01818 int32_t len; 01819 bool free_vec; 01820 ST* vec=get_feature_vector(i, len, free_vec); 01821 for (int32_t j=0; j<len; j++) 01822 { 01823 h[int64_t(j)*nsym+alphabet->remap_to_bin(vec[j])]++; 01824 h_normalizer[j]++; 01825 } 01826 free_feature_vector(vec, i, free_vec); 01827 } 01828 01829 if (normalize) 01830 { 01831 for (int32_t i=0; 
i<slen; i++) 01832 { 01833 for (int32_t j=0; j<nsym; j++) 01834 { 01835 if (h_normalizer && h_normalizer[i]) 01836 h[int64_t(i)*nsym+j]/=h_normalizer[i]; 01837 } 01838 } 01839 } 01840 delete[] h_normalizer; 01841 01842 *hist=h; 01843 *rows=nsym; 01844 *cols=slen; 01845 } 01846 01849 virtual void create_random(float64_t* hist, int32_t rows, int32_t cols, int32_t num_vec) 01850 { 01851 ASSERT(rows == get_num_symbols()); 01852 cleanup(); 01853 float64_t* randoms=new float64_t[cols]; 01854 TString<ST>* sf=new TString<ST>[num_vec]; 01855 01856 for (int32_t i=0; i<num_vec; i++) 01857 { 01858 sf[i].string=new ST[cols]; 01859 sf[i].length=cols; 01860 01861 CMath::random_vector(randoms, cols, 0.0, 1.0); 01862 01863 for (int32_t j=0; j<cols; j++) 01864 { 01865 float64_t lik=hist[int64_t(j)*rows+0]; 01866 01867 int32_t c; 01868 for (c=0; c<rows-1; c++) 01869 { 01870 if (randoms[j]<=lik) 01871 break; 01872 lik+=hist[int64_t(j)*rows+c+1]; 01873 } 01874 sf[i].string[j]=alphabet->remap_to_char(c); 01875 } 01876 } 01877 delete[] randoms; 01878 set_features(sf, num_vec, cols); 01879 } 01880 01881 /* 01882 CStringFeatures<SSKTripleFeature>* obtain_sssk_triple_from_cha(int d1, int d2) 01883 { 01884 int *s; 01885 int32_t nStr=get_num_vectors(); 01886 01887 int32_t nfeat = 0; 01888 for (int32_t i = 0; i < nStr; ++i) 01889 nfeat += get_vector_length[i] - d1 -d2; 01890 TString<SSKFeature>* F= new TString<SSKFeature>[nfeat]; 01891 int32_t c = 0; 01892 for (int32_t i = 0; i < nStr; ++i) 01893 { 01894 int32_t len; 01895 bool free_vec; 01896 ST* S=get_feature_vector(vec_num, len, free_vec); 01897 free_feature_vector(vec, vec_num, free_vec); 01898 int32_t n = len - d1 - d2; 01899 s = S[i]; 01900 for (int32_t j = 0; j < n; ++j) 01901 { 01902 F[c].feature1 = s[j]; 01903 F[c].feature2 = s[j+d1]; 01904 F[c].feature3 = s[j+d1+d2]; 01905 F[c].group = i; 01906 c++; 01907 } 01908 } 01909 ASSERT(nfeat==c); 01910 return F; 01911 } 01912 01913 CStringFeatures<SSKFeature>* 
/** shared constructor helper
 *
 * Sets the generic type of this object to ST and registers the data
 * members listed below, together with their names and descriptions, with
 * m_parameters. Only the addresses of the members are handed over here.
 * symbol_mask_table and feature_cache are not registered.
 */
void init(void)
{
    set_generic<ST>();

    // the alphabet object
    m_parameters->add((CSGObject**) &alphabet, "alphabet");
    // array of strings together with its element count
    m_parameters->add_vector(&features, &num_vectors, "features",
            "This contains the array of features.");
    // buffer of the single string together with its length
    m_parameters->add_vector(&single_string,
            &length_of_single_string,
            "single_string",
            "Created by sliding window.");
    // scalar members
    m_parameters->add(&max_string_length, "max_string_length",
            "Length of longest string.");
    m_parameters->add(&num_symbols, "num_symbols",
            "Number of used symbols.");
    m_parameters->add(&original_num_symbols, "original_num_symbols",
            "Original number of used symbols.");
    m_parameters->add(&order, "order",
            "Order used in higher order mapping.");
    m_parameters->add(&preprocess_on_get, "preprocess_on_get",
            "Preprocess on-the-fly?");

    /* TODO: decide whether symbol_mask_table (ST*) should be registered
     * with m_parameters as well; it currently is not.
     */
}
/* Specializations of the bit-level symbol helpers for bool and the
 * floating point types. None of them performs any masking or shifting.
 */

/** get_masked_symbols for bool: mask is ignored, symbol is returned unchanged */
template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
{
    return symbol;
}
/** get_masked_symbols for float32_t: mask is ignored, symbol is returned unchanged */
template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
{
    return symbol;
}
/** get_masked_symbols for float64_t: mask is ignored, symbol is returned unchanged */
template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
{
    return symbol;
}
/** get_masked_symbols for floatmax_t: mask is ignored, symbol is returned unchanged */
template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
{
    return symbol;
}

/** shift_offset for bool: both arguments are ignored, always returns false */
template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
{
    return false;
}
/** shift_offset for float32_t: both arguments are ignored, always returns 0 */
template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
{
    return 0;
}
/** shift_offset for float64_t: both arguments are ignored, always returns 0 */
template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
{
    return 0;
}
/** shift_offset for floatmax_t: both arguments are ignored, always returns 0 */
template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
{
    return 0;
}

/** shift_symbol for bool: amount is ignored, symbol is returned unchanged */
template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
{
    return symbol;
}
/** shift_symbol for float32_t: amount is ignored, symbol is returned unchanged */
template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
{
    return symbol;
}
/** shift_symbol for float64_t: amount is ignored, symbol is returned unchanged */
template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
{
    return symbol;
}
/** shift_symbol for floatmax_t: amount is ignored, symbol is returned unchanged */
template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
{
    return symbol;
}
02259 { 02260 } 02261 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len) 02262 { 02263 } 02264 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len) 02265 { 02266 } 02267 #define LOAD(f_load, sg_type) \ 02268 template<> inline void CStringFeatures<sg_type>::load(CFile* loader) \ 02269 { \ 02270 SG_INFO( "loading...\n"); \ 02271 \ 02272 SG_SET_LOCALE_C; \ 02273 TString<sg_type>* strs; \ 02274 int32_t num_str; \ 02275 int32_t max_len; \ 02276 loader->f_load(strs, num_str, max_len); \ 02277 set_features(strs, num_str, max_len); \ 02278 SG_RESET_LOCALE; \ 02279 } 02280 02281 LOAD(get_bool_string_list, bool) 02282 LOAD(get_char_string_list, char) 02283 LOAD(get_int8_string_list, int8_t) 02284 LOAD(get_byte_string_list, uint8_t) 02285 LOAD(get_short_string_list, int16_t) 02286 LOAD(get_word_string_list, uint16_t) 02287 LOAD(get_int_string_list, int32_t) 02288 LOAD(get_uint_string_list, uint32_t) 02289 LOAD(get_long_string_list, int64_t) 02290 LOAD(get_ulong_string_list, uint64_t) 02291 LOAD(get_shortreal_string_list, float32_t) 02292 LOAD(get_real_string_list, float64_t) 02293 LOAD(get_longreal_string_list, floatmax_t) 02294 #undef LOAD 02295 02296 #define SAVE(f_write, sg_type) \ 02297 template<> inline void CStringFeatures<sg_type>::save(CFile* writer) \ 02298 { \ 02299 SG_SET_LOCALE_C; \ 02300 ASSERT(writer); \ 02301 writer->f_write(features, num_vectors); \ 02302 SG_RESET_LOCALE; \ 02303 } 02304 02305 SAVE(set_bool_string_list, bool) 02306 SAVE(set_char_string_list, char) 02307 SAVE(set_int8_string_list, int8_t) 02308 SAVE(set_byte_string_list, uint8_t) 02309 SAVE(set_short_string_list, int16_t) 02310 SAVE(set_word_string_list, uint16_t) 02311 SAVE(set_int_string_list, int32_t) 02312 SAVE(set_uint_string_list, uint32_t) 02313 SAVE(set_long_string_list, int64_t) 02314 SAVE(set_ulong_string_list, uint64_t) 02315 SAVE(set_shortreal_string_list, float32_t) 02316 
SAVE(set_real_string_list, float64_t) 02317 SAVE(set_longreal_string_list, floatmax_t) 02318 #undef SAVE 02319 #endif // DOXYGEN_SHOULD_SKIP_THIS 02320 } 02321 #endif // _CSTRINGFEATURES__H__