00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Cache.h"
00018 #include "preproc/PreProc.h"
00019 #include "preproc/StringPreProc.h"
00020 #include "features/Features.h"
00021 #include "features/Alphabet.h"
00022 #include "lib/DynamicArray.h"
00023 #include "lib/File.h"
00024 #include "lib/MemoryMappedFile.h"
00025 #include "lib/Mathematics.h"
00026 #include "lib/Compressor.h"
00027
00028 #include <sys/types.h>
00029 #include <sys/stat.h>
00030 #include <dirent.h>
00031 #include <stdio.h>
00032 #include <stdlib.h>
00033 #include <unistd.h>
00034
00035 namespace shogun
00036 {
00037 class CCompressor;
00038 enum E_COMPRESSION_TYPE;
00039 class CAlphabet;
00040 enum EAlphabet;
00041 template <class T> class CDynamicArray;
00042 class CFile;
00043 template <class T> class CMemoryMappedFile;
00044 class CMath;
00045 template <class ST> class CStringPreProc;
00046 template <class T> class T_STRING;
00047
00048 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00049
/**
 * Plain record that pairs a buffer pointer with its element count.
 *
 * The record itself never releases `string`; apart from the optional
 * deserialisation hook below (which allocates with new[]), the code that
 * fills the record is responsible for the buffer.
 */
template <class T> class T_STRING
{
#ifdef HAVE_BOOST_SERIALIZATION
    private:
        friend class ::boost::serialization::access;

        /** Archive `length` followed by the `length` elements of `string`. */
        template<class Archive>
        void save(Archive & ar, const unsigned int archive_version) const
        {
            ar & length;

            int idx=0;
            while (idx < length)
            {
                ar & string[idx];
                ++idx;
            }
        }

        /**
         * Read `length`, allocate a buffer of that many elements and read
         * the elements into it. A buffer previously referenced by `string`
         * is not released here.
         */
        template<class Archive>
        void load(Archive & ar, const unsigned int archive_version)
        {
            ar & length;

            string = new T[length];

            int idx=0;
            while (idx < length)
            {
                ar & string[idx];
                ++idx;
            }
        }

        GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
#endif //HAVE_BOOST_SERIALIZATION

    public:
        /** pointer to the first element of the string */
        T* string;
        /** number of elements in the string */
        int32_t length;
};
00105 #endif // DOXYGEN_SHOULD_SKIP_THIS
00106
00107
00126 template <class ST> class CStringFeatures : public CFeatures
00127 {
00128 public:
00132 CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0),
00133 features(NULL), single_string(NULL),length_of_single_string(0),
00134 max_string_length(0), order(0), symbol_mask_table(NULL),
00135 preprocess_on_get(false), feature_cache(NULL)
00136 {
00137 }
00138
00143 CStringFeatures(EAlphabet alpha)
00144 : CFeatures(0), num_vectors(0), features(NULL),
00145 single_string(NULL),length_of_single_string(0),
00146 max_string_length(0), order(0), symbol_mask_table(NULL),
00147 preprocess_on_get(false), feature_cache(NULL)
00148 {
00149 alphabet=new CAlphabet(alpha);
00150 SG_REF(alphabet);
00151 num_symbols=alphabet->get_num_symbols();
00152 original_num_symbols=num_symbols;
00153 }
00154
00162 CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00163 int32_t p_max_string_length, EAlphabet alpha)
00164 : CFeatures(0), num_vectors(0), features(NULL),
00165 single_string(NULL),length_of_single_string(0),
00166 max_string_length(0), order(0), symbol_mask_table(NULL),
00167 preprocess_on_get(false), feature_cache(NULL)
00168 {
00169 alphabet=new CAlphabet(alpha);
00170 SG_REF(alphabet);
00171 num_symbols=alphabet->get_num_symbols();
00172 original_num_symbols=num_symbols;
00173 set_features(p_features, p_num_vectors, p_max_string_length);
00174 }
00175
00180 CStringFeatures(CAlphabet* alpha)
00181 : CFeatures(0), num_vectors(0), features(NULL),
00182 single_string(NULL),length_of_single_string(0),
00183 max_string_length(0), order(0), symbol_mask_table(NULL),
00184 preprocess_on_get(false), feature_cache(NULL)
00185 {
00186 ASSERT(alpha);
00187 SG_REF(alpha);
00188 alphabet=alpha;
00189 num_symbols=alphabet->get_num_symbols();
00190 original_num_symbols=num_symbols;
00191 }
00192
00194 CStringFeatures(const CStringFeatures & orig)
00195 : CFeatures(orig), num_vectors(orig.num_vectors),
00196 single_string(orig.single_string),
00197 length_of_single_string(orig.length_of_single_string),
00198 max_string_length(orig.max_string_length),
00199 num_symbols(orig.num_symbols),
00200 original_num_symbols(orig.original_num_symbols),
00201 order(orig.order), preprocess_on_get(false),
00202 feature_cache(NULL)
00203 {
00204 ASSERT(orig.single_string == NULL);
00205
00206 alphabet=orig.alphabet;
00207 SG_REF(alphabet);
00208
00209 if (orig.features)
00210 {
00211 features=new T_STRING<ST>[orig.num_vectors];
00212
00213 for (int32_t i=0; i<num_vectors; i++)
00214 {
00215 features[i].string=new ST[orig.features[i].length];
00216 ASSERT(features[i].string);
00217 features[i].length=orig.features[i].length;
00218 memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00219 }
00220 }
00221
00222 if (orig.symbol_mask_table)
00223 {
00224 symbol_mask_table=new ST[256];
00225 for (int32_t i=0; i<256; i++)
00226 symbol_mask_table[i]=orig.symbol_mask_table[i];
00227 }
00228 }
00229
00235 CStringFeatures(char* fname, EAlphabet alpha=DNA)
00236 : CFeatures(fname), num_vectors(0), features(NULL), single_string(NULL),
00237 length_of_single_string(0), max_string_length(0), order(0),
00238 symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00239 {
00240 alphabet=new CAlphabet(alpha);
00241 SG_REF(alphabet);
00242 num_symbols=alphabet->get_num_symbols();
00243 original_num_symbols=num_symbols;
00244 load(fname);
00245 }
00246
00247 virtual ~CStringFeatures()
00248 {
00249 cleanup();
00250
00251 SG_UNREF(alphabet);
00252 }
00253
00255 virtual void cleanup()
00256 {
00257 if (single_string)
00258 {
00259 delete[] single_string;
00260 single_string=NULL;
00261 }
00262 else
00263 {
00264 for (int32_t i=0; i<num_vectors; i++)
00265 cleanup_feature_vector(i);
00266 }
00267
00268 num_vectors=0;
00269 delete[] features;
00270 delete[] symbol_mask_table;
00271 features=NULL;
00272 symbol_mask_table=NULL;
00273
00274
00275
00276
00277
00278 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00279 SG_UNREF(alphabet);
00280 alphabet=alpha;
00281 SG_REF(alphabet);
00282 }
00283
00285 virtual void cleanup_feature_vector(int32_t num)
00286 {
00287 ASSERT(num<num_vectors);
00288 if (features)
00289 {
00290 delete[] features[num].string;
00291 features[num].string=NULL;
00292 features[num].length=0;
00293 }
00294 }
00295
00300 inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00301
00306 inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00307
00312 inline CAlphabet* get_alphabet()
00313 {
00314 SG_REF(alphabet);
00315 return alphabet;
00316 }
00317
00322 virtual CFeatures* duplicate() const
00323 {
00324 return new CStringFeatures<ST>(*this);
00325 }
00326
00333 void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00334 {
00335 ASSERT(features);
00336 if (num>=num_vectors)
00337 {
00338 SG_ERROR("Index out of bounds (number of strings %d, you "
00339 "requested %d)\n", num_vectors, num);
00340 }
00341
00342 int32_t l;
00343 bool free_vec;
00344 ST* vec=get_feature_vector(num, l, free_vec);
00345 *len=l;
00346 *dst=(ST*) malloc(*len * sizeof(ST));
00347 memcpy(*dst, vec, *len * sizeof(ST));
00348 free_feature_vector(vec, num, free_vec);
00349 }
00350
00357 void set_feature_vector(ST* src, int32_t len, int32_t num)
00358 {
00359 ASSERT(features);
00360 if (num>=num_vectors)
00361 {
00362 SG_ERROR("Index out of bounds (number of strings %d, you "
00363 "requested %d)\n", num_vectors, num);
00364 }
00365
00366 if (len<=0)
00367 SG_ERROR("String has zero or negative length\n");
00368
00369
00370 cleanup_feature_vector(num);
00371 features[num].length=len;
00372 features[num].string=new ST[len];
00373 memcpy(features[num].string, src, len*sizeof(ST));
00374
00375 determine_maximum_string_length();
00376 }
00377
00380 void enable_on_the_fly_preprocessing()
00381 {
00382 preprocess_on_get=true;
00383 }
00384
00388 void disable_on_the_fly_preprocessing()
00389 {
00390 preprocess_on_get=false;
00391 }
00392
00401 ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00402 {
00403 ASSERT(features);
00404 ASSERT(num<num_vectors);
00405
00406 if (!preprocess_on_get)
00407 {
00408 dofree=false;
00409 len=features[num].length;
00410 return features[num].string;
00411 }
00412 else
00413 {
00414 SG_DEBUG( "computing feature vector!\n") ;
00415 ST* feat=compute_feature_vector(num, len);
00416 dofree=true;
00417
00418 if (get_num_preproc())
00419 {
00420 ST* tmp_feat_before = feat;
00421
00422 for (int32_t i=0; i<get_num_preproc(); i++)
00423 {
00424 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00425 feat=p->apply_to_string(tmp_feat_before, len);
00426 SG_UNREF(p);
00427 delete[] tmp_feat_before;
00428 tmp_feat_before=feat;
00429 }
00430 }
00431
00432 return feat;
00433 }
00434 }
00435
00442 void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00443 {
00444 if (feature_cache)
00445 feature_cache->unlock_entry(num);
00446
00447 if (dofree)
00448 delete[] feat_vec ;
00449 }
00450
00457 virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00458 {
00459 int32_t len;
00460 bool free_vec;
00461 ST* vec=get_feature_vector(vec_num, len, free_vec);
00462 ASSERT(feat_num<len);
00463 ST result=vec[feat_num];
00464 free_feature_vector(vec, vec_num, free_vec);
00465
00466 return result;
00467 }
00468
00474 virtual inline int32_t get_vector_length(int32_t vec_num)
00475 {
00476 int32_t len;
00477 bool free_vec;
00478 ST* vec=get_feature_vector(vec_num, len, free_vec);
00479 free_feature_vector(vec, vec_num, free_vec);
00480 return len;
00481 }
00482
00487 virtual inline int32_t get_max_vector_length()
00488 {
00489 return max_string_length;
00490 }
00491
00496 virtual inline int32_t get_num_vectors() { return num_vectors; }
00497
00504 inline floatmax_t get_num_symbols() { return num_symbols; }
00505
00513 inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00514
00515
00516
00521 inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00522
00527 inline int32_t get_order() { return order; }
00528
00536 inline ST get_masked_symbols(ST symbol, uint8_t mask)
00537 {
00538 ASSERT(symbol_mask_table);
00539 return symbol_mask_table[mask] & symbol;
00540 }
00541
00548 inline ST shift_offset(ST offset, int32_t amount)
00549 {
00550 ASSERT(alphabet);
00551 return (offset << (amount*alphabet->get_num_bits()));
00552 }
00553
00560 inline ST shift_symbol(ST symbol, int32_t amount)
00561 {
00562 ASSERT(alphabet);
00563 return (symbol >> (amount*alphabet->get_num_bits()));
00564 }
00565
00571 virtual bool load(char* fname)
00572 {
00573 SG_INFO( "loading...\n");
00574 int64_t length=0;
00575 max_string_length=0;
00576
00577 CFile f(fname, 'r', F_CHAR);
00578 char* feature_matrix=f.load_char_data(NULL, length);
00579
00580 SG_DEBUG("char data now at %p of length %ld\n",
00581 feature_matrix, (int64_t) length);
00582
00583 num_vectors=0;
00584
00585 if (f.is_ok())
00586 {
00587 for (int64_t i=0; i<length; i++)
00588 {
00589 if (feature_matrix[i]=='\n')
00590 num_vectors++;
00591 }
00592
00593 SG_INFO( "file contains %ld vectors\n", num_vectors);
00594 features= new T_STRING<ST>[num_vectors];
00595
00596 int64_t index=0;
00597 for (int32_t lines=0; lines<num_vectors; lines++)
00598 {
00599 char* p=&feature_matrix[index];
00600 int32_t columns=0;
00601
00602 for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00603
00604 if (index+columns>=length && p[columns]!='\n') {
00605 SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00606 }
00607
00608 features[lines].length=columns;
00609 features[lines].string=new ST[columns];
00610
00611 max_string_length=CMath::max(max_string_length,columns);
00612
00613 for (int32_t i=0; i<columns; i++)
00614 features[lines].string[i]= ((ST) p[i]);
00615
00616 index+= features[lines].length+1;
00617 }
00618
00619 num_symbols=4;
00620 return true;
00621 }
00622 else
00623 SG_ERROR( "reading file failed\n");
00624
00625 return false;
00626 }
00627
00634 bool load_dna_file(char* fname, bool remap_to_bin=true)
00635 {
00636 bool result=false;
00637
00638 size_t blocksize=1024*1024;
00639 size_t required_blocksize=0;
00640 uint8_t* dummy=new uint8_t[blocksize];
00641 uint8_t* overflow=NULL;
00642 int32_t overflow_len=0;
00643
00644 num_symbols=4;
00645 cleanup();
00646
00647 CAlphabet* alpha=new CAlphabet(DNA);
00648 CAlphabet* alpha_bin=new CAlphabet(RAWDNA);
00649
00650 FILE* f=fopen(fname, "ro");
00651
00652 if (f)
00653 {
00654 num_vectors=0;
00655 max_string_length=0;
00656
00657 SG_INFO("counting line numbers in file %s\n", fname);
00658 size_t block_offs=0;
00659 size_t old_block_offs=0;
00660 fseek(f, 0, SEEK_END);
00661 size_t fsize=ftell(f);
00662 rewind(f);
00663
00664 if (blocksize>fsize)
00665 blocksize=fsize;
00666
00667 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00668
00669 size_t sz=blocksize;
00670 while (sz == blocksize)
00671 {
00672 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00673 bool contains_cr=false;
00674 for (size_t i=0; i<sz; i++)
00675 {
00676 block_offs++;
00677 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00678 {
00679 num_vectors++;
00680 contains_cr=true;
00681 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00682 old_block_offs=block_offs;
00683 }
00684 }
00685 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00686 }
00687
00688 SG_INFO("found %d strings\n", num_vectors);
00689 delete[] dummy;
00690 blocksize=required_blocksize;
00691 dummy = new uint8_t[blocksize];
00692 overflow = new uint8_t[blocksize];
00693 features=new T_STRING<ST>[num_vectors];
00694
00695 rewind(f);
00696 sz=blocksize;
00697 int32_t lines=0;
00698 while (sz == blocksize)
00699 {
00700 sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00701
00702 size_t old_sz=0;
00703 for (size_t i=0; i<sz; i++)
00704 {
00705 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00706 {
00707 int32_t len=i-old_sz;
00708
00709 max_string_length=CMath::max(max_string_length, len+overflow_len);
00710
00711 features[lines].length=len;
00712 features[lines].string=new ST[len];
00713
00714 if (remap_to_bin)
00715 {
00716 for (int32_t j=0; j<overflow_len; j++)
00717 features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00718 for (int32_t j=0; j<len; j++)
00719 features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00720 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00721 }
00722 else
00723 {
00724 for (int32_t j=0; j<overflow_len; j++)
00725 features[lines].string[j]=overflow[j];
00726 for (int32_t j=0; j<len; j++)
00727 features[lines].string[j+overflow_len]=dummy[old_sz+j];
00728 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00729 }
00730
00731
00732 overflow_len=0;
00733
00734
00735 old_sz=i+1;
00736 lines++;
00737 SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00738 }
00739 }
00740 for (size_t i=old_sz; i<sz; i++)
00741 overflow[i-old_sz]=dummy[i];
00742
00743 overflow_len=sz-old_sz;
00744 }
00745 result=true;
00746 SG_INFO("file successfully read\n");
00747 SG_INFO("max_string_length=%d\n", max_string_length);
00748 SG_INFO("num_strings=%d\n", num_vectors);
00749 }
00750
00751 fclose(f);
00752 delete[] dummy;
00753
00754 SG_UNREF(alphabet);
00755
00756 if (remap_to_bin)
00757 alphabet = alpha_bin;
00758 else
00759 alphabet = alpha;
00760 SG_REF(alphabet);
00761
00762 return result;
00763 }
00764
00771 bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00772 {
00773 int32_t i=0;
00774 uint64_t len=0;
00775 uint64_t offs=0;
00776 int32_t num=0;
00777 int32_t max_len=0;
00778
00779 CMemoryMappedFile<char> f(fname);
00780
00781 while (true)
00782 {
00783 char* s=f.get_line(len, offs);
00784 if (!s)
00785 break;
00786
00787 if (len>0 && s[0]=='>')
00788 num++;
00789 }
00790
00791 if (num==0)
00792 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00793
00794 cleanup();
00795 SG_UNREF(alphabet);
00796 alphabet=new CAlphabet(DNA);
00797
00798 T_STRING<ST>* strings=new T_STRING<ST>[num];
00799 offs=0;
00800
00801 for (i=0;i<num; i++)
00802 {
00803 uint64_t id_len=0;
00804 char* id=f.get_line(id_len, offs);
00805
00806 char* fasta=f.get_line(len, offs);
00807 char* s=fasta;
00808 int32_t fasta_len=0;
00809 int32_t spanned_lines=0;
00810
00811 while (true)
00812 {
00813 if (!s || len==0)
00814 SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00815
00816 if (s[0]=='>' || offs==f.get_size())
00817 {
00818 offs-=len+1;
00819 if (offs==f.get_size())
00820 {
00821 SG_DEBUG("at EOF\n");
00822 fasta_len+=len;
00823 }
00824
00825 len = fasta_len-spanned_lines;
00826 strings[i].string=new ST[len];
00827 strings[i].length=len;
00828
00829 ST* str=strings[i].string;
00830 int32_t idx=0;
00831 SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00832
00833 for (int32_t j=0; j<fasta_len; j++)
00834 {
00835 if (fasta[j]=='\n')
00836 continue;
00837
00838 ST c = (ST) fasta[j];
00839
00840 if (ignore_invalid && !alphabet->is_valid((uint8_t) fasta[j]))
00841 c = (ST) 'A';
00842
00843 if (idx>=len)
00844 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00845 str[idx++]=c;
00846 }
00847 max_len=CMath::max(max_len, strings[i].length);
00848
00849
00850 break;
00851 }
00852
00853 spanned_lines++;
00854 fasta_len+=len+1;
00855 s=f.get_line(len, offs);
00856 }
00857 }
00858
00859 return set_features(strings, num, max_len);
00860 }
00861
00869 bool load_fastq_file(const char* fname,
00870 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00871 {
00872 CMemoryMappedFile<char> f(fname);
00873
00874 int32_t i=0;
00875 uint64_t len=0;
00876 uint64_t offs=0;
00877
00878 int32_t num=f.get_num_lines();
00879 int32_t max_len=0;
00880
00881 if (num%4)
00882 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00883 num/=4;
00884
00885 cleanup();
00886 SG_UNREF(alphabet);
00887 alphabet=new CAlphabet(DNA);
00888
00889 T_STRING<ST>* strings;
00890
00891 ST* str;
00892 if (bitremap_in_single_string)
00893 {
00894 strings=new T_STRING<ST>[1];
00895 strings[0].string=new ST[num];
00896 strings[0].length=num;
00897 f.get_line(len, offs);
00898 f.get_line(len, offs);
00899 order=len;
00900 max_len=num;
00901 offs=0;
00902 original_num_symbols=alphabet->get_num_symbols();
00903 int32_t max_val=alphabet->get_num_bits();
00904 str=new ST[len];
00905 }
00906 else
00907 strings=new T_STRING<ST>[num];
00908
00909 for (i=0;i<num; i++)
00910 {
00911 if (!f.get_line(len, offs))
00912 SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00913
00914 char* s=f.get_line(len, offs);
00915 if (!s || len==0)
00916 SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00917
00918 if (bitremap_in_single_string)
00919 {
00920 if (len!=order)
00921 SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00922 for (int32_t j=0; j<order; j++)
00923 str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00924
00925 strings[0].string[i]=embed_word(str, order);
00926 }
00927 else
00928 {
00929 strings[i].string=new ST[len];
00930 strings[i].length=len;
00931 str=strings[i].string;
00932
00933 if (ignore_invalid)
00934 {
00935 for (int32_t j=0; j<len; j++)
00936 {
00937 if (alphabet->is_valid((uint8_t) s[j]))
00938 str[j]= (ST) s[j];
00939 else
00940 str[j]= (ST) 'A';
00941 }
00942 }
00943 else
00944 {
00945 for (int32_t j=0; j<len; j++)
00946 str[j]= (ST) s[j];
00947 }
00948 max_len=CMath::max(max_len, (int32_t) len);
00949 }
00950
00951
00952 if (!f.get_line(len, offs))
00953 SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00954
00955 if (!f.get_line(len, offs))
00956 SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00957 }
00958
00959 if (bitremap_in_single_string)
00960 num=1;
00961
00962 num_vectors=num;
00963 max_string_length=max_len;
00964 features=strings;
00965
00966 return true;
00967 }
00968
00974 bool load_from_directory(char* dirname)
00975 {
00976 struct dirent **namelist;
00977 int32_t n;
00978
00979 CIO::set_dirname(dirname);
00980
00981 SG_DEBUG("dirname '%s'\n", dirname);
00982
00983 n = scandir(dirname, &namelist, &CIO::filter, alphasort);
00984 if (n <= 0)
00985 {
00986 SG_ERROR("error calling scandir - no files found\n");
00987 return false;
00988 }
00989 else
00990 {
00991 T_STRING<ST>* strings=NULL;
00992
00993 int32_t num=0;
00994 int32_t max_len=-1;
00995
00996
00997
00998 strings=new T_STRING<ST>[n];
00999
01000 for (int32_t i=0; i<n; i++)
01001 {
01002 char* fname=CIO::concat_filename(namelist[i]->d_name);
01003
01004 struct stat s;
01005 off_t filesize=0;
01006
01007 if (!stat(fname, &s) && s.st_size>0)
01008 {
01009 filesize=s.st_size/sizeof(ST);
01010
01011 FILE* f=fopen(fname, "ro");
01012 if (f)
01013 {
01014 ST* str=new ST[filesize];
01015 SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01016 fread(str, sizeof(ST), filesize, f);
01017 strings[num].string=str;
01018 strings[num].length=filesize;
01019 max_len=CMath::max(max_len, strings[num].length);
01020
01021 num++;
01022 fclose(f);
01023 }
01024 }
01025 else
01026 SG_ERROR("empty or non readable file \'%s\'\n", fname);
01027
01028 free(namelist[i]);
01029 }
01030 free(namelist);
01031
01032 if (num>0 && strings)
01033 {
01034 set_features(strings, num, max_len);
01035 return true;
01036 }
01037 }
01038 return false;
01039 }
01040
01048 bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01049 {
01050 if (p_features)
01051 {
01052 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01053
01054
01055 for (int32_t i=0; i<p_num_vectors; i++)
01056 alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01057
01058 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01059 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01060
01061 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01062 {
01063 cleanup();
01064 SG_UNREF(alphabet);
01065
01066 alphabet=alpha;
01067 SG_REF(alphabet);
01068
01069 this->features=p_features;
01070 this->num_vectors=p_num_vectors;
01071 this->max_string_length=p_max_string_length;
01072
01073 return true;
01074 }
01075 else
01076 SG_UNREF(alpha);
01077 }
01078
01079 return false;
01080 }
01081
01088 virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01089 {
01090 num_str=num_vectors;
01091 max_str_len=max_string_length;
01092 return features;
01093 }
01094
01101 virtual T_STRING<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01102 {
01103 ASSERT(num_vectors>0);
01104
01105 num_str=num_vectors;
01106 max_str_len=max_string_length;
01107 T_STRING<ST>* new_feat=new T_STRING<ST>[num_str];
01108
01109 for (int i=0; i<num_str; i++)
01110 {
01111 int32_t len;
01112 bool free_vec;
01113 ST* vec=get_feature_vector(i, len, free_vec);
01114 new_feat[i].string=new ST[len];
01115 new_feat[i].length=len;
01116 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01117 free_feature_vector(vec, i, free_vec);
01118 }
01119
01120 return new_feat;
01121 }
01122
01128 virtual void get_features(T_STRING<ST>** dst, int32_t* num_str)
01129 {
01130 int32_t num_vec;
01131 int32_t max_str_len;
01132 *dst=copy_features(num_vec, max_str_len);
01133 *num_str=num_vec;
01134 }
01135
01141 virtual bool save(char* dest)
01142 {
01143 return false;
01144 }
01145
01152 virtual bool load_compressed(char* src, bool decompress)
01153 {
01154 FILE* file=NULL;
01155
01156 if (!(file=fopen(src, "r")))
01157 return false;
01158 cleanup();
01159
01160
01161 char id[4];
01162 fread(&id[0], sizeof(char), 1, file);
01163 ASSERT(id[0]=='S');
01164 fread(&id[1], sizeof(char), 1, file);
01165 ASSERT(id[1]=='G');
01166 fread(&id[2], sizeof(char), 1, file);
01167 ASSERT(id[2]=='V');
01168 fread(&id[3], sizeof(char), 1, file);
01169 ASSERT(id[3]=='0');
01170
01171
01172 uint8_t c;
01173 fread(&c, sizeof(uint8_t), 1, file);
01174 CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01175
01176 uint8_t a;
01177 delete alphabet;
01178 fread(&a, sizeof(uint8_t), 1, file);
01179 alphabet=new CAlphabet((EAlphabet) a);
01180
01181 fread(&num_vectors, sizeof(int32_t), 1, file);
01182 ASSERT(num_vectors>0);
01183
01184 fread(&max_string_length, sizeof(int32_t), 1, file);
01185 ASSERT(max_string_length>0);
01186
01187 features=new T_STRING<ST>[num_vectors];
01188
01189
01190 for (int32_t i=0; i<num_vectors; i++)
01191 {
01192
01193 int32_t len_compressed;
01194 fread(&len_compressed, sizeof(int32_t), 1, file);
01195
01196 int32_t len_uncompressed;
01197 fread(&len_uncompressed, sizeof(int32_t), 1, file);
01198
01199
01200 if (decompress)
01201 {
01202 features[i].string=new ST[len_uncompressed];
01203 features[i].length=len_uncompressed;
01204 uint8_t* compressed=new uint8_t[len_compressed];
01205 fread(compressed, len_compressed, 1, file);
01206 uint64_t uncompressed_size=len_uncompressed;
01207 uncompressed_size*=sizeof(ST);
01208 compressor->decompress(compressed, len_compressed,
01209 (uint8_t*) features[i].string, uncompressed_size);
01210 delete[] compressed;
01211 ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01212 }
01213 else
01214 {
01215 int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01216 features[i].string=new ST[len_compressed+offs];
01217 features[i].length=len_compressed+offs;
01218 int32_t* feat32ptr=((int32_t*) (features[i].string));
01219 memset(features[i].string, 0, offs*sizeof(ST));
01220 feat32ptr[0]=(int32_t) len_compressed;
01221 feat32ptr[1]=(int32_t) len_uncompressed;
01222 uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01223 fread(compressed, len_compressed, 1, file);
01224 }
01225 }
01226
01227 delete compressor;
01228 fclose(file);
01229 return false;
01230 }
01231
01239 virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01240 {
01241 FILE* file=NULL;
01242
01243 if (!(file=fopen(dest, "wb")))
01244 return false;
01245
01246 CCompressor* compressor= new CCompressor(compression);
01247
01248
01249 const char* id="SGV0";
01250 fwrite(&id[0], sizeof(char), 1, file);
01251 fwrite(&id[1], sizeof(char), 1, file);
01252 fwrite(&id[2], sizeof(char), 1, file);
01253 fwrite(&id[3], sizeof(char), 1, file);
01254
01255
01256 uint8_t c=(uint8_t) compression;
01257 fwrite(&c, sizeof(uint8_t), 1, file);
01258
01259 uint8_t a=(uint8_t) alphabet->get_alphabet();
01260 fwrite(&a, sizeof(uint8_t), 1, file);
01261
01262 fwrite(&num_vectors, sizeof(int32_t), 1, file);
01263
01264 fwrite(&max_string_length, sizeof(int32_t), 1, file);
01265
01266
01267 for (int32_t i=0; i<num_vectors; i++)
01268 {
01269 int32_t len=-1;
01270 bool vfree;
01271 ST* vec=get_feature_vector(i, len, vfree);
01272
01273 uint8_t* compressed=NULL;
01274 uint64_t compressed_size=0;
01275
01276 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01277 compressed, compressed_size, level);
01278
01279 int32_t len_compressed = (int32_t) compressed_size;
01280
01281 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01282
01283 fwrite(&len, sizeof(int32_t), 1, file);
01284
01285 fwrite(compressed, compressed_size, 1, file);
01286 delete[] compressed;
01287
01288 free_feature_vector(vec, i, vfree);
01289 }
01290
01291 delete compressor;
01292 fclose(file);
01293 return true;
01294 }
01295
01296
01301 virtual int32_t get_size() { return sizeof(ST); }
01302
01308 virtual bool apply_preproc(bool force_preprocessing=false)
01309 {
01310 SG_DEBUG( "force: %d\n", force_preprocessing);
01311
01312 for (int32_t i=0; i<get_num_preproc(); i++)
01313 {
01314 if ( (!is_preprocessed(i) || force_preprocessing) )
01315 {
01316 set_preprocessed(i);
01317 CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
01318 SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01319
01320 if (!p->apply_to_string_features(this))
01321 {
01322 SG_UNREF(p);
01323 return false;
01324 }
01325 else
01326 SG_UNREF(p);
01327 }
01328 }
01329 return true;
01330 }
01331
01341 int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01342 {
01343 ASSERT(step_size>0);
01344 ASSERT(window_size>0);
01345 ASSERT(num_vectors==1 || single_string);
01346 ASSERT(max_string_length>=window_size ||
01347 (single_string && length_of_single_string>=window_size));
01348
01349
01350
01351 if (single_string)
01352 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01353 else if (num_vectors==1)
01354 {
01355 num_vectors= (max_string_length-window_size)/step_size + 1;
01356 length_of_single_string=max_string_length;
01357 }
01358
01359 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01360 int32_t offs=0;
01361 for (int32_t i=0; i<num_vectors; i++)
01362 {
01363 f[i].string=&features[0].string[offs+skip];
01364 f[i].length=window_size-skip;
01365 offs+=step_size;
01366 }
01367 single_string=features[0].string;
01368 delete[] features;
01369 features=f;
01370 max_string_length=window_size-skip;
01371
01372 return num_vectors;
01373 }
01374
01383 int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01384 {
01385 ASSERT(positions);
01386 ASSERT(window_size>0);
01387 ASSERT(num_vectors==1 || single_string);
01388 ASSERT(max_string_length>=window_size ||
01389 (single_string && length_of_single_string>=window_size));
01390
01391 num_vectors= positions->get_num_elements();
01392 ASSERT(num_vectors>0);
01393
01394 int32_t len;
01395
01396
01397
01398 if (single_string)
01399 len=length_of_single_string;
01400 else
01401 {
01402 single_string=features[0].string;
01403 len=max_string_length;
01404 length_of_single_string=max_string_length;
01405 }
01406
01407 T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01408 for (int32_t i=0; i<num_vectors; i++)
01409 {
01410 int32_t p=positions->get_element(i);
01411
01412 if (p>=0 && p<=len-window_size)
01413 {
01414 f[i].string=&features[0].string[p+skip];
01415 f[i].length=window_size-skip;
01416 }
01417 else
01418 {
01419 num_vectors=1;
01420 max_string_length=len;
01421 features[0].length=len;
01422 single_string=NULL;
01423 delete[] f;
01424 SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01425 window_size, i, p, len);
01426 return -1;
01427 }
01428 }
01429
01430 delete[] features;
01431 features=f;
01432 max_string_length=window_size-skip;
01433
01434 return num_vectors;
01435 }
01436
01448 inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01449 {
01450 return obtain_from_char_features(sf, start, p_order, gap, rev);
01451 }
01452
01462 template <class CT>
01463 bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01464 {
01465 ASSERT(sf);
01466
01467 CAlphabet* alpha=sf->get_alphabet();
01468 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01469
01470 this->order=p_order;
01471 cleanup();
01472
01473 num_vectors=sf->get_num_vectors();
01474 ASSERT(num_vectors>0);
01475 max_string_length=sf->get_max_vector_length()-start;
01476 features=new T_STRING<ST>[num_vectors];
01477
01478 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01479 alpha->get_num_symbols_in_histogram());
01480
01481 for (int32_t i=0; i<num_vectors; i++)
01482 {
01483 int32_t len=-1;
01484 bool vfree;
01485 CT* c=sf->get_feature_vector(i, len, vfree);
01486 ASSERT(!vfree);
01487
01488 features[i].string=new ST[len];
01489 features[i].length=len;
01490
01491 ST* str=features[i].string;
01492 for (int32_t j=0; j<len; j++)
01493 str[j]=(ST) alpha->remap_to_bin(c[j]);
01494 }
01495
01496 original_num_symbols=alpha->get_num_symbols();
01497 int32_t max_val=alpha->get_num_bits();
01498
01499 SG_UNREF(alpha);
01500
01501 if (p_order>1)
01502 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01503 else
01504 num_symbols=original_num_symbols;
01505 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01506
01507 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01508 {
01509 SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01510 return false;
01511 }
01512
01513 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01514 for (int32_t line=0; line<num_vectors; line++)
01515 {
01516 int32_t len=0;
01517 bool vfree;
01518 ST* fv=get_feature_vector(line, len, vfree);
01519 ASSERT(!vfree);
01520
01521 if (rev)
01522 CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01523 else
01524 CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01525
01526
01527 features[line].length-=start+gap ;
01528 if (features[line].length<0)
01529 features[line].length=0 ;
01530 }
01531
01532 compute_symbol_mask_table(max_val);
01533
01534 return true;
01535 }
01536
01544 bool have_same_length(int32_t len=-1)
01545 {
01546 if (len!=-1)
01547 {
01548 if (len!=get_max_vector_length())
01549 return false;
01550 }
01551 len = get_max_vector_length();
01552
01553 for (int32_t i=0; i<num_vectors; i++)
01554 {
01555 if (get_vector_length(i)!=len)
01556 return false;
01557 }
01558
01559 return true;
01560 }
01561
01566 inline void embed_features(int32_t p_order)
01567 {
01568 ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01569
01570 order=p_order;
01571 original_num_symbols=alphabet->get_num_symbols();
01572 int32_t max_val=alphabet->get_num_bits();
01573
01574 if (p_order>1)
01575 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01576 else
01577 num_symbols=original_num_symbols;
01578
01579 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01580
01581 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01582 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01583
01584 ST mask=0;
01585 for (int32_t i=0; i<p_order*max_val; i++)
01586 mask= (mask<<1) | ((ST) 1);
01587
01588 for (int32_t i=0; i<num_vectors; i++)
01589 {
01590 int32_t len=features[i].length;
01591
01592 if (len < p_order)
01593 SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01594
01595 ST* str = features[i].string;
01596
01597
01598 for (int32_t j=0; j<p_order; j++)
01599 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01600 str[0]=embed_word(&str[0], p_order);
01601
01602
01603 int32_t idx=0;
01604 for (int32_t j=p_order; j<len; j++)
01605 {
01606 str[j]=(ST) alphabet->remap_to_bin(str[j]);
01607 str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01608 idx++;
01609 }
01610
01611 features[i].length=len-p_order+1;
01612 }
01613
01614 compute_symbol_mask_table(max_val);
01615 }
01616
01621 inline void compute_symbol_mask_table(int64_t max_val)
01622 {
01623 delete[] symbol_mask_table;
01624 symbol_mask_table=new ST[256];
01625
01626 uint64_t mask=0;
01627 for (int32_t i=0; i< (int64_t) max_val; i++)
01628 mask=(mask<<1) | 1;
01629
01630 for (int32_t i=0; i<256; i++)
01631 {
01632 uint8_t bits=(uint8_t) i;
01633 symbol_mask_table[i]=0;
01634
01635 for (int32_t j=0; j<8; j++)
01636 {
01637 if (bits & 1)
01638 symbol_mask_table[i]|=mask<<(max_val*j);
01639
01640 bits>>=1;
01641 }
01642 }
01643 }
01644
01651 inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01652 {
01653 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01654
01655 ST mask=0;
01656 for (int32_t i=0; i<nbits; i++)
01657 mask=(mask<<1) | (ST) 1;
01658
01659 for (int32_t i=0; i<len; i++)
01660 {
01661 ST w=(word & mask);
01662 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01663 word>>=nbits;
01664 }
01665 }
01666
01672 inline ST embed_word(ST* seq, int32_t len)
01673 {
01674 ST value=(ST) 0;
01675 uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01676 for (int32_t i=0; i<len; i++)
01677 {
01678 value<<=nbits;
01679 value|=seq[i];
01680 }
01681
01682 return value;
01683 }
01684
01687 void determine_maximum_string_length()
01688 {
01689 max_string_length=0;
01690
01691 for (int32_t i=0; i<num_vectors; i++)
01692 max_string_length=CMath::max(max_string_length, features[i].length);
01693 }
01694
01695 static ST* get_zero_terminated_string_copy(T_STRING<ST> str)
01696 {
01697 int32_t l=str.length;
01698 ST* s=new ST[l+1];
01699 memcpy(s, str.string, sizeof(ST)*l);
01700 s[l]='\0';
01701 return s;
01702 }
01703
01710 virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01711 {
01712 ASSERT(features);
01713 ASSERT(num<num_vectors);
01714
01715 features[num].length=len ;
01716 features[num].string=string ;
01717
01718 max_string_length=CMath::max(len, max_string_length);
01719 }
01720
01721
01723 inline virtual const char* get_name() const { return "StringFeatures"; }
01724
01725 protected:
01726
01737 virtual ST* compute_feature_vector(int32_t num, int32_t& len)
01738 {
01739 ASSERT(features && num<num_vectors);
01740
01741 len=features[num].length;
01742 if (len<=0)
01743 return NULL;
01744
01745 ST* target=new ST[len];
01746 memcpy(target, features[num].string, len*sizeof(ST));
01747 return target;
01748 }
01749
01750 #ifdef HAVE_BOOST_SERIALIZATION
01751 private:
01752
01753 friend class ::boost::serialization::access;
01754 template<class Archive>
01755 void save(Archive & ar, const unsigned int archive_version) const
01756 {
01757
01758 SG_DEBUG("archiving StringFeatures\n");
01759
01760 ar & ::boost::serialization::base_object<CFeatures>(*this);
01761
01762 ar & alphabet;
01763
01764 ar & num_vectors;
01765 for (int i=0; i < num_vectors; ++i) {
01766 ar & features[i];
01767 }
01768
01769 ar & length_of_single_string;
01770 for (int i=0; i < length_of_single_string; ++i) {
01771 ar & single_string[i];
01772 }
01773
01774 ar & max_string_length;
01775 ar & num_symbols;
01776 ar & original_num_symbols;
01777 ar & order;
01778
01780
01781
01782
01783 SG_DEBUG("done archiving StringFeatures\n");
01784
01785 }
01786
01787 template<class Archive>
01788 void load(Archive & ar, const unsigned int archive_version)
01789 {
01790
01791 SG_DEBUG("archiving StringFeatures\n");
01792
01793 ar & ::boost::serialization::base_object<CFeatures>(*this);
01794
01795
01796 ar & alphabet;
01797
01798 ar & num_vectors;
01799
01800
01801 features = new T_STRING<ST>[num_vectors];
01802 for (int i=0; i < num_vectors; ++i) {
01803 ar & features[i];
01804 }
01805
01806
01807 ar & length_of_single_string;
01808
01809
01810 single_string = new ST[length_of_single_string];
01811 for (int i=0; i < length_of_single_string; ++i) {
01812 ar & single_string[i];
01813 }
01814
01815 ar & max_string_length;
01816 ar & num_symbols;
01817 ar & original_num_symbols;
01818 ar & order;
01819
01821
01822
01823
01824 SG_DEBUG("done archiving StringFeatures\n");
01825
01826 }
01827
01828 GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
01829
01830
01831 #endif //HAVE_BOOST_SERIALIZATION
01832
01833
01834 protected:
01835
/** alphabet; queried for the number of bits per symbol and used to remap
 * symbols (remap_to_bin / remap_to_char) */
01837 CAlphabet* alphabet;
01838
/** number of string vectors, i.e. number of entries in features */
01840 int32_t num_vectors;
01841
/** array of num_vectors strings, each a pointer/length pair */
01843 T_STRING<ST>* features;
01844
/** buffer of length_of_single_string elements, serialized element-wise;
 * presumably NULL when unused - TODO confirm */
01846 ST* single_string;
01847
/** number of elements in single_string */
01849 int32_t length_of_single_string;
01850
/** length of the longest string as computed by
 * determine_maximum_string_length(); set_feature_vector() only raises it */
01852 int32_t max_string_length;
01853
/** number of symbols: 2^(bits per symbol * order) for order>1, otherwise
 * equal to original_num_symbols */
01855 floatmax_t num_symbols;
01856
/** number of symbols reported by the alphabet (get_num_symbols()) */
01858 floatmax_t original_num_symbols;
01859
/** order last passed to embed_features() / obtain_from_char_features() */
01861 int32_t order;
01862
/** 256-entry table built by compute_symbol_mask_table() */
01864 ST* symbol_mask_table;
01865
/** NOTE(review): not used in the visible part of the file; presumably
 * controls whether preprocessors are applied when fetching a vector - confirm */
01867 bool preprocess_on_get;
01868
/** NOTE(review): not used in the visible part of the file; presumably a
 * cache for computed feature vectors - confirm */
01870 CCache<ST>* feature_cache;
01871 };
01872
01873 #ifndef DOXYGEN_SHOULD_SKIP_THIS
01874
01878 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
01879 {
01880 return F_BOOL;
01881 }
01882
01887 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
01888 {
01889 return F_CHAR;
01890 }
01891
01896 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01897 {
01898 return F_BYTE;
01899 }
01900
01905 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
01906 {
01907 return F_SHORT;
01908 }
01909
01914 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01915 {
01916 return F_WORD;
01917 }
01918
01923 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
01924 {
01925 return F_INT;
01926 }
01927
01932 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01933 {
01934 return F_UINT;
01935 }
01936
01941 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
01942 {
01943 return F_LONG;
01944 }
01945
01950 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01951 {
01952 return F_ULONG;
01953 }
01954
01959 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
01960 {
01961 return F_SHORTREAL;
01962 }
01963
01968 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
01969 {
01970 return F_DREAL;
01971 }
01972
01977 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
01978 {
01979 return F_LONGREAL;
01980 }
01981
01982 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01983 {
01984 return symbol;
01985 }
01986 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01987 {
01988 return symbol;
01989 }
01990 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01991 {
01992 return symbol;
01993 }
01994 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01995 {
01996 return symbol;
01997 }
01998
01999 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02000 {
02001 return false;
02002 }
02003 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02004 {
02005 return 0;
02006 }
02007 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02008 {
02009 return 0;
02010 }
02011 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02012 {
02013 return 0;
02014 }
02015
02016 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02017 {
02018 return symbol;
02019 }
02020 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02021 {
02022 return symbol;
02023 }
02024 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02025 {
02026 return symbol;
02027 }
02028 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02029 {
02030 return symbol;
02031 }
02032
02033 #ifndef SUNOS
02034 template<> template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02035 {
02036 return false;
02037 }
02038 template<> template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02039 {
02040 return false;
02041 }
02042 template<> template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02043 {
02044 return false;
02045 }
02046 #endif
02047
02048 template<> inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02049 {
02050 }
02051 template<> inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02052 {
02053 }
02054 template<> inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02055 {
02056 }
02057
02058 template<> inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02059 {
02060 }
02061 template<> inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02062 {
02063 }
02064 template<> inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02065 {
02066 }
02067
02068 template<> inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02069 {
02070 return 0;
02071 }
02072 template<> inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02073 {
02074 return 0;
02075 }
02076 template<> inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02077 {
02078 return 0;
02079 }
02080
02081 template<> inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02082 {
02083 }
02084 template<> inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02085 {
02086 }
02087 template<> inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02088 {
02089 }
02090 #endif // DOXYGEN_SHOULD_SKIP_THIS
02091 }
02092 #endif // _CSTRINGFEATURES__H__