StringFeatures.h

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Written (W) 1999-2008 Gunnar Raetsch
00009  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00010  */
00011 
00012 #ifndef _CSTRINGFEATURES__H__
00013 #define _CSTRINGFEATURES__H__
00014 
00015 #include "lib/common.h"
00016 #include "lib/io.h"
00017 #include "lib/Cache.h"
00018 #include "preproc/PreProc.h"
00019 #include "preproc/StringPreProc.h"
00020 #include "features/Features.h"
00021 #include "features/Alphabet.h"
00022 #include "lib/DynamicArray.h"
00023 #include "lib/File.h"
00024 #include "lib/MemoryMappedFile.h"
00025 #include "lib/Mathematics.h"
00026 #include "lib/Compressor.h"
00027 
00028 #include <sys/types.h>
00029 #include <sys/stat.h>
00030 #include <dirent.h>
00031 #include <stdio.h>
00032 #include <stdlib.h>
00033 #include <unistd.h>
00034 
00035 namespace shogun
00036 {
// Forward declarations of the types referenced by the string feature classes
// defined below.
// NOTE(review): the corresponding headers are already included at the top of
// this file, so these are presumably redundant - confirm before removing.
class CCompressor;
enum E_COMPRESSION_TYPE;
class CAlphabet;
enum EAlphabet;
template <class T> class CDynamicArray;
class CFile;
template <class T> class CMemoryMappedFile;
class CMath;
template <class ST> class CStringPreProc;
template <class T> class T_STRING;
00047 
00048 #ifndef DOXYGEN_SHOULD_SKIP_THIS
00049 
/** Plain holder for one string: a pointer to its elements and their count.
 *
 * The class has no constructor or destructor; it neither initialises nor
 * releases the buffer it points to.
 */
template <class T> class T_STRING
{
#ifdef HAVE_BOOST_SERIALIZATION
    private:
        friend class ::boost::serialization::access;

        /** Write the length followed by every element to the archive. */
        template<class Archive>
        void save(Archive & ar, const unsigned int archive_version) const
        {
            ar & length;

            int pos=0;
            while (pos < length)
            {
                ar & string[pos];
                ++pos;
            }
        }

        /** Read the length, allocate a buffer of that size and then read
         * every element from the archive into it.
         */
        template<class Archive>
        void load(Archive & ar, const unsigned int archive_version)
        {
            ar & length;

            string = new T[length];

            int pos=0;
            while (pos < length)
            {
                ar & string[pos];
                ++pos;
            }
        }

        GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
#endif //HAVE_BOOST_SERIALIZATION

    public:
        /** pointer to the string elements */
        T* string;
        /** number of elements in string */
        int32_t length;
};
00105 #endif // DOXYGEN_SHOULD_SKIP_THIS
00106 
00107 
00126 template <class ST> class CStringFeatures : public CFeatures
00127 {
00128     public:
00132         CStringFeatures() : CFeatures(0), alphabet(NULL), num_vectors(0),
00133         features(NULL), single_string(NULL),length_of_single_string(0),
00134         max_string_length(0), order(0), symbol_mask_table(NULL),
00135         preprocess_on_get(false), feature_cache(NULL)
00136         {
00137         }
00138 
00143         CStringFeatures(EAlphabet alpha)
00144         : CFeatures(0), num_vectors(0), features(NULL),
00145             single_string(NULL),length_of_single_string(0),
00146             max_string_length(0), order(0), symbol_mask_table(NULL),
00147             preprocess_on_get(false), feature_cache(NULL)
00148         {
00149             alphabet=new CAlphabet(alpha);
00150             SG_REF(alphabet);
00151             num_symbols=alphabet->get_num_symbols();
00152             original_num_symbols=num_symbols;
00153         }
00154 
00162         CStringFeatures(T_STRING<ST>* p_features, int32_t p_num_vectors,
00163                 int32_t p_max_string_length, EAlphabet alpha)
00164         : CFeatures(0), num_vectors(0), features(NULL),
00165             single_string(NULL),length_of_single_string(0),
00166             max_string_length(0), order(0), symbol_mask_table(NULL),
00167             preprocess_on_get(false), feature_cache(NULL)
00168         {
00169             alphabet=new CAlphabet(alpha);
00170             SG_REF(alphabet);
00171             num_symbols=alphabet->get_num_symbols();
00172             original_num_symbols=num_symbols;
00173             set_features(p_features, p_num_vectors, p_max_string_length);
00174         }
00175 
00180         CStringFeatures(CAlphabet* alpha)
00181         : CFeatures(0), num_vectors(0), features(NULL),
00182             single_string(NULL),length_of_single_string(0),
00183             max_string_length(0), order(0), symbol_mask_table(NULL),
00184             preprocess_on_get(false), feature_cache(NULL)
00185         {
00186             ASSERT(alpha);
00187             SG_REF(alpha);
00188             alphabet=alpha;
00189             num_symbols=alphabet->get_num_symbols();
00190             original_num_symbols=num_symbols;
00191         }
00192 
00194         CStringFeatures(const CStringFeatures & orig)
00195         : CFeatures(orig), num_vectors(orig.num_vectors),
00196             single_string(orig.single_string),
00197             length_of_single_string(orig.length_of_single_string),
00198             max_string_length(orig.max_string_length),
00199             num_symbols(orig.num_symbols),
00200             original_num_symbols(orig.original_num_symbols),
00201             order(orig.order), preprocess_on_get(false),
00202             feature_cache(NULL)
00203         {
00204             ASSERT(orig.single_string == NULL); //not implemented
00205 
00206             alphabet=orig.alphabet;
00207             SG_REF(alphabet);
00208 
00209             if (orig.features)
00210             {
00211                 features=new T_STRING<ST>[orig.num_vectors];
00212 
00213                 for (int32_t i=0; i<num_vectors; i++)
00214                 {
00215                     features[i].string=new ST[orig.features[i].length];
00216                     ASSERT(features[i].string);
00217                     features[i].length=orig.features[i].length;
00218                     memcpy(features[i].string, orig.features[i].string, sizeof(ST)*orig.features[i].length);
00219                 }
00220             }
00221 
00222             if (orig.symbol_mask_table)
00223             {
00224                 symbol_mask_table=new ST[256];
00225                 for (int32_t i=0; i<256; i++)
00226                     symbol_mask_table[i]=orig.symbol_mask_table[i];
00227             }
00228         }
00229 
00235         CStringFeatures(char* fname, EAlphabet alpha=DNA)
00236         : CFeatures(fname), num_vectors(0), features(NULL), single_string(NULL),
00237             length_of_single_string(0), max_string_length(0), order(0),
00238             symbol_mask_table(NULL), preprocess_on_get(false), feature_cache(NULL)
00239         {
00240             alphabet=new CAlphabet(alpha);
00241             SG_REF(alphabet);
00242             num_symbols=alphabet->get_num_symbols();
00243             original_num_symbols=num_symbols;
00244             load(fname);
00245         }
00246 
00247         virtual ~CStringFeatures()
00248         {
00249             cleanup();
00250 
00251             SG_UNREF(alphabet);
00252         }
00253 
00255         virtual void cleanup()
00256         {
00257             if (single_string)
00258             {
00259                 delete[] single_string;
00260                 single_string=NULL;
00261             }
00262             else
00263             {
00264                 for (int32_t i=0; i<num_vectors; i++)
00265                     cleanup_feature_vector(i);
00266             }
00267 
00268             num_vectors=0;
00269             delete[] features;
00270             delete[] symbol_mask_table;
00271             features=NULL;
00272             symbol_mask_table=NULL;
00273 
00274             /* start with a fresh alphabet, but instead of emptying the histogram
00275              * create a new object (to leave the alphabet object alone if it is used
00276              * by others)
00277              */
00278             CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
00279             SG_UNREF(alphabet);
00280             alphabet=alpha;
00281             SG_REF(alphabet);
00282         }
00283 
00285         virtual void cleanup_feature_vector(int32_t num)
00286         {
00287             ASSERT(num<num_vectors);
00288             if (features)
00289             {
00290                 delete[] features[num].string;
00291                 features[num].string=NULL;
00292                 features[num].length=0;
00293             }
00294         }
00295 
00300         inline virtual EFeatureClass get_feature_class() { return C_STRING; }
00301 
00306         inline virtual EFeatureType get_feature_type() { return F_UNKNOWN; }
00307 
00312         inline CAlphabet* get_alphabet()
00313         {
00314             SG_REF(alphabet);
00315             return alphabet;
00316         }
00317 
00322         virtual CFeatures* duplicate() const
00323         {
00324             return new CStringFeatures<ST>(*this);
00325         }
00326 
00333         void get_feature_vector(ST** dst, int32_t* len, int32_t num)
00334         {
00335             ASSERT(features);
00336             if (num>=num_vectors)
00337             {
00338                 SG_ERROR("Index out of bounds (number of strings %d, you "
00339                         "requested %d)\n", num_vectors, num);
00340             }
00341 
00342             int32_t l;
00343             bool free_vec;
00344             ST* vec=get_feature_vector(num, l, free_vec);
00345             *len=l;
00346             *dst=(ST*) malloc(*len * sizeof(ST));
00347             memcpy(*dst, vec, *len * sizeof(ST));
00348             free_feature_vector(vec, num, free_vec);
00349         }
00350 
00357         void set_feature_vector(ST* src, int32_t len, int32_t num)
00358         {
00359             ASSERT(features);
00360             if (num>=num_vectors)
00361             {
00362                 SG_ERROR("Index out of bounds (number of strings %d, you "
00363                         "requested %d)\n", num_vectors, num);
00364             }
00365 
00366             if (len<=0)
00367                 SG_ERROR("String has zero or negative length\n");
00368 
00369 
00370             cleanup_feature_vector(num);
00371             features[num].length=len;
00372             features[num].string=new ST[len];
00373             memcpy(features[num].string, src, len*sizeof(ST));
00374 
00375             determine_maximum_string_length();
00376         }
00377 
00380         void enable_on_the_fly_preprocessing()
00381         {
00382             preprocess_on_get=true;
00383         }
00384 
00388         void disable_on_the_fly_preprocessing()
00389         {
00390             preprocess_on_get=false;
00391         }
00392 
00401         ST* get_feature_vector(int32_t num, int32_t& len, bool& dofree)
00402         {
00403             ASSERT(features);
00404             ASSERT(num<num_vectors);
00405 
00406             if (!preprocess_on_get)
00407             {
00408                 dofree=false;
00409                 len=features[num].length;
00410                 return features[num].string;
00411             }
00412             else
00413             {
00414                 SG_DEBUG( "computing feature vector!\n") ;
00415                 ST* feat=compute_feature_vector(num, len);
00416                 dofree=true;
00417 
00418                 if (get_num_preproc())
00419                 {
00420                     ST* tmp_feat_before = feat;
00421 
00422                     for (int32_t i=0; i<get_num_preproc(); i++)
00423                     {
00424                         CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
00425                         feat=p->apply_to_string(tmp_feat_before, len);
00426                         SG_UNREF(p);
00427                         delete[] tmp_feat_before;
00428                         tmp_feat_before=feat;
00429                     }
00430                 }
00431                 // TODO: implement caching
00432                 return feat;
00433             }
00434         }
00435 
00442         void free_feature_vector(ST* feat_vec, int32_t num, bool dofree)
00443         {
00444             if (feature_cache)
00445                 feature_cache->unlock_entry(num);
00446 
00447             if (dofree)
00448                 delete[] feat_vec ;
00449         }
00450 
00457         virtual ST inline get_feature(int32_t vec_num, int32_t feat_num)
00458         {
00459             int32_t len;
00460             bool free_vec;
00461             ST* vec=get_feature_vector(vec_num, len, free_vec);
00462             ASSERT(feat_num<len);
00463             ST result=vec[feat_num];
00464             free_feature_vector(vec, vec_num, free_vec);
00465 
00466             return result;
00467         }
00468 
00474         virtual inline int32_t get_vector_length(int32_t vec_num)
00475         {
00476             int32_t len;
00477             bool free_vec;
00478             ST* vec=get_feature_vector(vec_num, len, free_vec);
00479             free_feature_vector(vec, vec_num, free_vec);
00480             return len;
00481         }
00482 
00487         virtual inline int32_t get_max_vector_length()
00488         {
00489             return max_string_length;
00490         }
00491 
00496         virtual inline int32_t get_num_vectors() { return num_vectors; }
00497 
00504         inline floatmax_t get_num_symbols() { return num_symbols; }
00505 
00513         inline floatmax_t get_max_num_symbols() { return CMath::powl(2,sizeof(ST)*8); }
00514 
00515         // these functions are necessary to find out about a former conversion process
00516 
00521         inline floatmax_t get_original_num_symbols() { return original_num_symbols; }
00522 
00527         inline int32_t get_order() { return order; }
00528 
00536         inline ST get_masked_symbols(ST symbol, uint8_t mask)
00537         {
00538             ASSERT(symbol_mask_table);
00539             return symbol_mask_table[mask] & symbol;
00540         }
00541 
00548         inline ST shift_offset(ST offset, int32_t amount)
00549         {
00550             ASSERT(alphabet);
00551             return (offset << (amount*alphabet->get_num_bits()));
00552         }
00553 
00560         inline ST shift_symbol(ST symbol, int32_t amount)
00561         {
00562             ASSERT(alphabet);
00563             return (symbol >> (amount*alphabet->get_num_bits()));
00564         }
00565 
00571         virtual bool load(char* fname)
00572         {
00573             SG_INFO( "loading...\n");
00574             int64_t length=0;
00575             max_string_length=0;
00576 
00577             CFile f(fname, 'r', F_CHAR);
00578             char* feature_matrix=f.load_char_data(NULL, length);
00579 
00580             SG_DEBUG("char data now at %p of length %ld\n", 
00581                     feature_matrix, (int64_t) length);
00582 
00583             num_vectors=0;
00584 
00585             if (f.is_ok())
00586             {
00587                 for (int64_t i=0; i<length; i++)
00588                 {
00589                     if (feature_matrix[i]=='\n')
00590                         num_vectors++;
00591                 }
00592 
00593                 SG_INFO( "file contains %ld vectors\n", num_vectors);
00594                 features= new T_STRING<ST>[num_vectors];
00595 
00596                 int64_t index=0;
00597                 for (int32_t lines=0; lines<num_vectors; lines++)
00598                 {
00599                     char* p=&feature_matrix[index];
00600                     int32_t columns=0;
00601 
00602                     for (columns=0; index+columns<length && p[columns]!='\n'; columns++);
00603 
00604                     if (index+columns>=length && p[columns]!='\n') {
00605                         SG_ERROR( "error in \"%s\":%d\n", fname, lines);
00606                     }
00607 
00608                     features[lines].length=columns;
00609                     features[lines].string=new ST[columns];
00610 
00611                     max_string_length=CMath::max(max_string_length,columns);
00612 
00613                     for (int32_t i=0; i<columns; i++)
00614                         features[lines].string[i]= ((ST) p[i]);
00615 
00616                     index+= features[lines].length+1;
00617                 }
00618 
00619                 num_symbols=4; //FIXME
00620                 return true;
00621             }
00622             else
00623                 SG_ERROR( "reading file failed\n");
00624 
00625             return false;
00626         }
00627 
00634         bool load_dna_file(char* fname, bool remap_to_bin=true)
00635         {
00636             bool result=false;
00637 
00638             size_t blocksize=1024*1024;
00639             size_t required_blocksize=0;
00640             uint8_t* dummy=new uint8_t[blocksize];
00641             uint8_t* overflow=NULL;
00642             int32_t overflow_len=0;
00643 
00644             num_symbols=4;
00645             cleanup();
00646 
00647             CAlphabet* alpha=new CAlphabet(DNA);
00648             CAlphabet* alpha_bin=new CAlphabet(RAWDNA);
00649 
00650             FILE* f=fopen(fname, "ro");
00651 
00652             if (f)
00653             {
00654                 num_vectors=0;
00655                 max_string_length=0;
00656 
00657                 SG_INFO("counting line numbers in file %s\n", fname);
00658                 size_t block_offs=0;
00659                 size_t old_block_offs=0;
00660                 fseek(f, 0, SEEK_END);
00661                 size_t fsize=ftell(f);
00662                 rewind(f);
00663 
00664                 if (blocksize>fsize)
00665                     blocksize=fsize;
00666 
00667                 SG_DEBUG("block_size=%ld file_size=%ld\n", blocksize, fsize);
00668 
00669                 size_t sz=blocksize;
00670                 while (sz == blocksize)
00671                 {
00672                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00673                     bool contains_cr=false;
00674                     for (size_t i=0; i<sz; i++)
00675                     {
00676                         block_offs++;
00677                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00678                         {
00679                             num_vectors++;
00680                             contains_cr=true;
00681                             required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00682                             old_block_offs=block_offs;
00683                         }
00684                     }
00685                     SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00686                 }
00687 
00688                 SG_INFO("found %d strings\n", num_vectors);
00689                 delete[] dummy;
00690                 blocksize=required_blocksize;
00691                 dummy = new uint8_t[blocksize];
00692                 overflow = new uint8_t[blocksize];
00693                 features=new T_STRING<ST>[num_vectors];
00694 
00695                 rewind(f);
00696                 sz=blocksize;
00697                 int32_t lines=0;
00698                 while (sz == blocksize)
00699                 {
00700                     sz=fread(dummy, sizeof(uint8_t), blocksize, f);
00701 
00702                     size_t old_sz=0;
00703                     for (size_t i=0; i<sz; i++)
00704                     {
00705                         if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00706                         {
00707                             int32_t len=i-old_sz;
00708                             //SG_PRINT("i:%d len:%d old_sz:%d\n", i, len, old_sz);
00709                             max_string_length=CMath::max(max_string_length, len+overflow_len);
00710 
00711                             features[lines].length=len;
00712                             features[lines].string=new ST[len];
00713 
00714                             if (remap_to_bin)
00715                             {
00716                                 for (int32_t j=0; j<overflow_len; j++)
00717                                     features[lines].string[j]=alpha->remap_to_bin(overflow[j]);
00718                                 for (int32_t j=0; j<len; j++)
00719                                     features[lines].string[j+overflow_len]=alpha->remap_to_bin(dummy[old_sz+j]);
00720                                 alpha_bin->add_string_to_histogram(features[lines].string, features[lines].length);
00721                             }
00722                             else
00723                             {
00724                                 for (int32_t j=0; j<overflow_len; j++)
00725                                     features[lines].string[j]=overflow[j];
00726                                 for (int32_t j=0; j<len; j++)
00727                                     features[lines].string[j+overflow_len]=dummy[old_sz+j];
00728                                 alpha->add_string_to_histogram(features[lines].string, features[lines].length);
00729                             }
00730 
00731                             // clear overflow
00732                             overflow_len=0;
00733 
00734                             //CMath::display_vector(features[lines].string, len);
00735                             old_sz=i+1;
00736                             lines++;
00737                             SG_PROGRESS(lines, 0, num_vectors, 1, "LOADING:\t");
00738                         }
00739                     }
00740                     for (size_t i=old_sz; i<sz; i++)
00741                         overflow[i-old_sz]=dummy[i];
00742 
00743                     overflow_len=sz-old_sz;
00744                 }
00745                 result=true;
00746                 SG_INFO("file successfully read\n");
00747                 SG_INFO("max_string_length=%d\n", max_string_length);
00748                 SG_INFO("num_strings=%d\n", num_vectors);
00749             }
00750 
00751             fclose(f);
00752             delete[] dummy;
00753 
00754             SG_UNREF(alphabet);
00755 
00756             if (remap_to_bin)
00757                 alphabet = alpha_bin;
00758             else
00759                 alphabet = alpha;
00760             SG_REF(alphabet);
00761 
00762             return result;
00763         }
00764 
00771         bool load_fasta_file(const char* fname, bool ignore_invalid=false)
00772         {
00773             int32_t i=0;
00774             uint64_t len=0;
00775             uint64_t offs=0;
00776             int32_t num=0;
00777             int32_t max_len=0;
00778 
00779             CMemoryMappedFile<char> f(fname);
00780 
00781             while (true)
00782             {
00783                 char* s=f.get_line(len, offs);
00784                 if (!s)
00785                     break;
00786 
00787                 if (len>0 && s[0]=='>')
00788                     num++;
00789             }
00790 
00791             if (num==0)
00792                 SG_ERROR("No fasta hunks (lines starting with '>') found\n");
00793 
00794             cleanup();
00795             SG_UNREF(alphabet);
00796             alphabet=new CAlphabet(DNA);
00797 
00798             T_STRING<ST>* strings=new T_STRING<ST>[num];
00799             offs=0;
00800 
00801             for (i=0;i<num; i++)
00802             {
00803                 uint64_t id_len=0;
00804                 char* id=f.get_line(id_len, offs);
00805 
00806                 char* fasta=f.get_line(len, offs);
00807                 char* s=fasta;
00808                 int32_t fasta_len=0;
00809                 int32_t spanned_lines=0;
00810 
00811                 while (true)
00812                 {
00813                     if (!s || len==0)
00814                         SG_ERROR("Error reading fasta entry in line %d len=%ld", 4*i+1, len);
00815 
00816                     if (s[0]=='>' || offs==f.get_size())
00817                     {
00818                         offs-=len+1; // seek to beginning
00819                         if (offs==f.get_size())
00820                         {
00821                             SG_DEBUG("at EOF\n");
00822                             fasta_len+=len;
00823                         }
00824 
00825                         len = fasta_len-spanned_lines;
00826                         strings[i].string=new ST[len];
00827                         strings[i].length=len;
00828 
00829                         ST* str=strings[i].string;
00830                         int32_t idx=0;
00831                         SG_DEBUG("'%.*s', len=%d, spanned_lines=%d\n", (int32_t) id_len, id, (int32_t) len, (int32_t) spanned_lines);
00832 
00833                         for (int32_t j=0; j<fasta_len; j++)
00834                         {
00835                             if (fasta[j]=='\n')
00836                                 continue;
00837 
00838                             ST c = (ST) fasta[j];
00839 
00840                             if (ignore_invalid  && !alphabet->is_valid((uint8_t) fasta[j]))
00841                                 c = (ST) 'A';
00842 
00843                             if (idx>=len)
00844                                 SG_ERROR("idx=%d j=%d fasta_len=%d, spanned_lines=%d str='%.*s'\n", idx, j, fasta_len, spanned_lines, idx, str);
00845                             str[idx++]=c;
00846                         }
00847                         max_len=CMath::max(max_len, strings[i].length);
00848 
00849 
00850                         break;
00851                     }
00852 
00853                     spanned_lines++;
00854                     fasta_len+=len+1; // including '\n'
00855                     s=f.get_line(len, offs);
00856                 }
00857             }
00858 
00859             return set_features(strings, num, max_len);
00860         }
00861 
00869         bool load_fastq_file(const char* fname,
00870                 bool ignore_invalid=false, bool bitremap_in_single_string=false)
00871         {
00872             CMemoryMappedFile<char> f(fname);
00873 
00874             int32_t i=0;
00875             uint64_t len=0;
00876             uint64_t offs=0;
00877 
00878             int32_t num=f.get_num_lines();
00879             int32_t max_len=0;
00880 
00881             if (num%4)
00882                 SG_ERROR("Number of lines must be divisible by 4 in fastq files\n");
00883             num/=4;
00884 
00885             cleanup();
00886             SG_UNREF(alphabet);
00887             alphabet=new CAlphabet(DNA);
00888 
00889             T_STRING<ST>* strings;
00890 
00891             ST* str;
00892             if (bitremap_in_single_string)
00893             {
00894                 strings=new T_STRING<ST>[1];
00895                 strings[0].string=new ST[num];
00896                 strings[0].length=num;
00897                 f.get_line(len, offs);
00898                 f.get_line(len, offs);
00899                 order=len;
00900                 max_len=num;
00901                 offs=0;
00902                 original_num_symbols=alphabet->get_num_symbols();
00903                 int32_t max_val=alphabet->get_num_bits();
00904                 str=new ST[len];
00905             }
00906             else
00907                 strings=new T_STRING<ST>[num];
00908 
00909             for (i=0;i<num; i++)
00910             {
00911                 if (!f.get_line(len, offs))
00912                     SG_ERROR("Error reading 'read' identifier in line %d", 4*i);
00913 
00914                 char* s=f.get_line(len, offs);
00915                 if (!s || len==0)
00916                     SG_ERROR("Error reading 'read' in line %d len=%ld", 4*i+1, len);
00917 
00918                 if (bitremap_in_single_string)
00919                 {
00920                     if (len!=order)
00921                         SG_ERROR("read in line %d not of length %d (is %d)\n", 4*i+1, order, len);
00922                     for (int32_t j=0; j<order; j++)
00923                         str[j]=(ST) alphabet->remap_to_bin((uint8_t) s[j]);
00924 
00925                     strings[0].string[i]=embed_word(str, order);
00926                 }
00927                 else
00928                 {
00929                     strings[i].string=new ST[len];
00930                     strings[i].length=len;
00931                     str=strings[i].string;
00932 
00933                     if (ignore_invalid)
00934                     {
00935                         for (int32_t j=0; j<len; j++)
00936                         {
00937                             if (alphabet->is_valid((uint8_t) s[j]))
00938                                 str[j]= (ST) s[j];
00939                             else
00940                                 str[j]= (ST) 'A';
00941                         }
00942                     }
00943                     else
00944                     {
00945                         for (int32_t j=0; j<len; j++)
00946                             str[j]= (ST) s[j];
00947                     }
00948                     max_len=CMath::max(max_len, (int32_t) len);
00949                 }
00950 
00951 
00952                 if (!f.get_line(len, offs))
00953                     SG_ERROR("Error reading 'read' quality identifier in line %d", 4*i+2);
00954 
00955                 if (!f.get_line(len, offs))
00956                     SG_ERROR("Error reading 'read' quality in line %d", 4*i+3);
00957             }
00958 
00959             if (bitremap_in_single_string)
00960                 num=1;
00961 
00962             num_vectors=num;
00963             max_string_length=max_len;
00964             features=strings;
00965 
00966             return true;
00967         }
00968 
00974         bool load_from_directory(char* dirname)
00975         {
00976             struct dirent **namelist;
00977             int32_t n;
00978 
00979             CIO::set_dirname(dirname);
00980 
00981             SG_DEBUG("dirname '%s'\n", dirname);
00982 
00983             n = scandir(dirname, &namelist, &CIO::filter, alphasort);
00984             if (n <= 0)
00985             {
00986                 SG_ERROR("error calling scandir - no files found\n");
00987                 return false;
00988             }
00989             else
00990             {
00991                 T_STRING<ST>* strings=NULL;
00992 
00993                 int32_t num=0;
00994                 int32_t max_len=-1;
00995 
00996                 //usually n==num_vec, but it might not in race conditions
00997                 //(file perms modified, file erased)
00998                 strings=new T_STRING<ST>[n];
00999 
01000                 for (int32_t i=0; i<n; i++)
01001                 {
01002                     char* fname=CIO::concat_filename(namelist[i]->d_name);
01003 
01004                     struct stat s;
01005                     off_t filesize=0;
01006 
01007                     if (!stat(fname, &s) && s.st_size>0)
01008                     {
01009                         filesize=s.st_size/sizeof(ST);
01010 
01011                         FILE* f=fopen(fname, "ro");
01012                         if (f)
01013                         {
01014                             ST* str=new ST[filesize];
01015                             SG_DEBUG("%s:%ld\n", fname, (int64_t) filesize);
01016                             fread(str, sizeof(ST), filesize, f);
01017                             strings[num].string=str;
01018                             strings[num].length=filesize;
01019                             max_len=CMath::max(max_len, strings[num].length);
01020 
01021                             num++;
01022                             fclose(f);
01023                         }
01024                     }
01025                     else
01026                         SG_ERROR("empty or non readable file \'%s\'\n", fname);
01027 
01028                     free(namelist[i]);
01029                 }
01030                 free(namelist);
01031 
01032                 if (num>0 && strings)
01033                 {
01034                     set_features(strings, num, max_len);
01035                     return true;
01036                 }
01037             }
01038             return false;
01039         }
01040 
01048         bool set_features(T_STRING<ST>* p_features, int32_t p_num_vectors, int32_t p_max_string_length)
01049         {
01050             if (p_features)
01051             {
01052                 CAlphabet* alpha=new CAlphabet(alphabet->get_alphabet());
01053 
01054                 //compute histogram for char/byte
01055                 for (int32_t i=0; i<p_num_vectors; i++)
01056                     alpha->add_string_to_histogram( p_features[i].string, p_features[i].length);
01057 
01058                 SG_INFO("max_value_in_histogram:%d\n", alpha->get_max_value_in_histogram());
01059                 SG_INFO("num_symbols_in_histogram:%d\n", alpha->get_num_symbols_in_histogram());
01060 
01061                 if (alpha->check_alphabet_size() && alpha->check_alphabet())
01062                 {
01063                     cleanup();
01064                     SG_UNREF(alphabet);
01065 
01066                     alphabet=alpha;
01067                     SG_REF(alphabet);
01068 
01069                     this->features=p_features;
01070                     this->num_vectors=p_num_vectors;
01071                     this->max_string_length=p_max_string_length;
01072 
01073                     return true;
01074                 }
01075                 else
01076                     SG_UNREF(alpha);
01077             }
01078 
01079             return false;
01080         }
01081 
01088         virtual T_STRING<ST>* get_features(int32_t& num_str, int32_t& max_str_len)
01089         {
01090             num_str=num_vectors;
01091             max_str_len=max_string_length;
01092             return features;
01093         }
01094 
01101         virtual T_STRING<ST>* copy_features(int32_t& num_str, int32_t& max_str_len)
01102         {
01103             ASSERT(num_vectors>0);
01104 
01105             num_str=num_vectors;
01106             max_str_len=max_string_length;
01107             T_STRING<ST>* new_feat=new T_STRING<ST>[num_str];
01108 
01109             for (int i=0; i<num_str; i++)
01110             {
01111                 int32_t len;
01112                 bool free_vec;
01113                 ST* vec=get_feature_vector(i, len, free_vec);
01114                 new_feat[i].string=new ST[len];
01115                 new_feat[i].length=len;
01116                 memcpy(new_feat[i].string, vec, ((size_t) len) * sizeof(ST));
01117                 free_feature_vector(vec, i, free_vec);
01118             }
01119 
01120             return new_feat;
01121         }
01122 
01128         virtual void get_features(T_STRING<ST>** dst, int32_t* num_str)
01129         {
01130             int32_t num_vec;
01131             int32_t max_str_len;
01132             *dst=copy_features(num_vec, max_str_len);
01133             *num_str=num_vec;
01134         }
01135 
01141         virtual bool save(char* dest)
01142         {
01143             return false;
01144         }
01145 
01152         virtual bool load_compressed(char* src, bool decompress)
01153         {
01154             FILE* file=NULL;
01155 
01156             if (!(file=fopen(src, "r")))
01157                 return false;
01158             cleanup();
01159 
01160             // header shogun v0
01161             char id[4];
01162             fread(&id[0], sizeof(char), 1, file);
01163             ASSERT(id[0]=='S');
01164             fread(&id[1], sizeof(char), 1, file);
01165             ASSERT(id[1]=='G');
01166             fread(&id[2], sizeof(char), 1, file);
01167             ASSERT(id[2]=='V');
01168             fread(&id[3], sizeof(char), 1, file);
01169             ASSERT(id[3]=='0');
01170 
01171             //compression type
01172             uint8_t c;
01173             fread(&c, sizeof(uint8_t), 1, file);
01174             CCompressor* compressor= new CCompressor((E_COMPRESSION_TYPE) c);
01175             //alphabet
01176             uint8_t a;
01177             delete alphabet;
01178             fread(&a, sizeof(uint8_t), 1, file);
01179             alphabet=new CAlphabet((EAlphabet) a);
01180             // number of vectors
01181             fread(&num_vectors, sizeof(int32_t), 1, file);
01182             ASSERT(num_vectors>0);
01183             // maximum string length
01184             fread(&max_string_length, sizeof(int32_t), 1, file);
01185             ASSERT(max_string_length>0);
01186 
01187             features=new T_STRING<ST>[num_vectors];
01188 
01189             // vectors
01190             for (int32_t i=0; i<num_vectors; i++)
01191             {
01192                 // vector len compressed
01193                 int32_t len_compressed;
01194                 fread(&len_compressed, sizeof(int32_t), 1, file);
01195                 // vector len uncompressed
01196                 int32_t len_uncompressed;
01197                 fread(&len_uncompressed, sizeof(int32_t), 1, file);
01198 
01199                 // vector raw data
01200                 if (decompress)
01201                 {
01202                     features[i].string=new ST[len_uncompressed];
01203                     features[i].length=len_uncompressed;
01204                     uint8_t* compressed=new uint8_t[len_compressed];
01205                     fread(compressed, len_compressed, 1, file);
01206                     uint64_t uncompressed_size=len_uncompressed;
01207                     uncompressed_size*=sizeof(ST);
01208                     compressor->decompress(compressed, len_compressed,
01209                             (uint8_t*) features[i].string, uncompressed_size);
01210                     delete[] compressed;
01211                     ASSERT(uncompressed_size==((uint64_t) len_uncompressed)*sizeof(ST));
01212                 }
01213                 else
01214                 {
01215                     int32_t offs=CMath::ceil(2.0*sizeof(int32_t)/sizeof(ST));
01216                     features[i].string=new ST[len_compressed+offs];
01217                     features[i].length=len_compressed+offs;
01218                     int32_t* feat32ptr=((int32_t*) (features[i].string));
01219                     memset(features[i].string, 0, offs*sizeof(ST));
01220                     feat32ptr[0]=(int32_t) len_compressed;
01221                     feat32ptr[1]=(int32_t) len_uncompressed;
01222                     uint8_t* compressed=(uint8_t*) (&features[i].string[offs]);
01223                     fread(compressed, len_compressed, 1, file);
01224                 }
01225             }
01226 
01227             delete compressor;
01228             fclose(file);
01229             return false;
01230         }
01231 
01239         virtual bool save_compressed(char* dest, E_COMPRESSION_TYPE compression, int level)
01240         {
01241             FILE* file=NULL;
01242 
01243             if (!(file=fopen(dest, "wb")))
01244                 return false;
01245 
01246             CCompressor* compressor= new CCompressor(compression);
01247 
01248             // header shogun v0
01249             const char* id="SGV0";
01250             fwrite(&id[0], sizeof(char), 1, file);
01251             fwrite(&id[1], sizeof(char), 1, file);
01252             fwrite(&id[2], sizeof(char), 1, file);
01253             fwrite(&id[3], sizeof(char), 1, file);
01254 
01255             //compression type
01256             uint8_t c=(uint8_t) compression;
01257             fwrite(&c, sizeof(uint8_t), 1, file);
01258             //alphabet
01259             uint8_t a=(uint8_t) alphabet->get_alphabet();
01260             fwrite(&a, sizeof(uint8_t), 1, file);
01261             // number of vectors
01262             fwrite(&num_vectors, sizeof(int32_t), 1, file);
01263             // maximum string length
01264             fwrite(&max_string_length, sizeof(int32_t), 1, file);
01265 
01266             // vectors
01267             for (int32_t i=0; i<num_vectors; i++)
01268             {
01269                 int32_t len=-1;
01270                 bool vfree;
01271                 ST* vec=get_feature_vector(i, len, vfree);
01272 
01273                 uint8_t* compressed=NULL;
01274                 uint64_t compressed_size=0;
01275 
01276                 compressor->compress((uint8_t*) vec, ((uint64_t) len)*sizeof(ST),
01277                         compressed, compressed_size, level);
01278 
01279                 int32_t len_compressed = (int32_t) compressed_size;
01280                 // vector len compressed in bytes
01281                 fwrite(&len_compressed, sizeof(int32_t), 1, file);
01282                 // vector len uncompressed in number of elements of type ST
01283                 fwrite(&len, sizeof(int32_t), 1, file);
01284                 // vector raw data
01285                 fwrite(compressed, compressed_size, 1, file);
01286                 delete[] compressed;
01287 
01288                 free_feature_vector(vec, i, vfree);
01289             }
01290 
01291             delete compressor;
01292             fclose(file);
01293             return true;
01294         }
01295 
01296 
01301         virtual int32_t get_size() { return sizeof(ST); }
01302 
01308         virtual bool apply_preproc(bool force_preprocessing=false)
01309         {
01310             SG_DEBUG( "force: %d\n", force_preprocessing);
01311 
01312             for (int32_t i=0; i<get_num_preproc(); i++)
01313             {
01314                 if ( (!is_preprocessed(i) || force_preprocessing) )
01315                 {
01316                     set_preprocessed(i);
01317                     CStringPreProc<ST>* p = (CStringPreProc<ST>*) get_preproc(i);
01318                     SG_INFO( "preprocessing using preproc %s\n", p->get_name());
01319 
01320                     if (!p->apply_to_string_features(this))
01321                     {
01322                         SG_UNREF(p);
01323                         return false;
01324                     }
01325                     else
01326                         SG_UNREF(p);
01327                 }
01328             }
01329             return true;
01330         }
01331 
01341         int32_t obtain_by_sliding_window(int32_t window_size, int32_t step_size, int32_t skip=0)
01342         {
01343             ASSERT(step_size>0);
01344             ASSERT(window_size>0);
01345             ASSERT(num_vectors==1 || single_string);
01346             ASSERT(max_string_length>=window_size ||
01347                     (single_string && length_of_single_string>=window_size));
01348 
01349             //in case we are dealing with a single remapped string
01350             //allow remapping
01351             if (single_string)
01352                 num_vectors= (length_of_single_string-window_size)/step_size + 1;
01353             else if (num_vectors==1)
01354             {
01355                 num_vectors= (max_string_length-window_size)/step_size + 1;
01356                 length_of_single_string=max_string_length;
01357             }
01358 
01359             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01360             int32_t offs=0;
01361             for (int32_t i=0; i<num_vectors; i++)
01362             {
01363                 f[i].string=&features[0].string[offs+skip];
01364                 f[i].length=window_size-skip;
01365                 offs+=step_size;
01366             }
01367             single_string=features[0].string;
01368             delete[] features;
01369             features=f;
01370             max_string_length=window_size-skip;
01371 
01372             return num_vectors;
01373         }
01374 
01383         int32_t obtain_by_position_list(int32_t window_size, CDynamicArray<int32_t>* positions, int32_t skip=0)
01384         {
01385             ASSERT(positions);
01386             ASSERT(window_size>0);
01387             ASSERT(num_vectors==1 || single_string);
01388             ASSERT(max_string_length>=window_size ||
01389                     (single_string && length_of_single_string>=window_size));
01390 
01391             num_vectors= positions->get_num_elements();
01392             ASSERT(num_vectors>0);
01393 
01394             int32_t len;
01395 
01396             //in case we are dealing with a single remapped string
01397             //allow remapping
01398             if (single_string)
01399                 len=length_of_single_string;
01400             else
01401             {
01402                 single_string=features[0].string;
01403                 len=max_string_length;
01404                 length_of_single_string=max_string_length;
01405             }
01406 
01407             T_STRING<ST>* f=new T_STRING<ST>[num_vectors];
01408             for (int32_t i=0; i<num_vectors; i++)
01409             {
01410                 int32_t p=positions->get_element(i);
01411 
01412                 if (p>=0 && p<=len-window_size)
01413                 {
01414                     f[i].string=&features[0].string[p+skip];
01415                     f[i].length=window_size-skip;
01416                 }
01417                 else
01418                 {
01419                     num_vectors=1;
01420                     max_string_length=len;
01421                     features[0].length=len;
01422                     single_string=NULL;
01423                     delete[] f;
01424                     SG_ERROR("window (size:%d) starting at position[%d]=%d does not fit in sequence(len:%d)\n",
01425                             window_size, i, p, len);
01426                     return -1;
01427                 }
01428             }
01429 
01430             delete[] features;
01431             features=f;
01432             max_string_length=window_size-skip;
01433 
01434             return num_vectors;
01435         }
01436 
01448         inline bool obtain_from_char(CStringFeatures<char>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01449         {
01450             return obtain_from_char_features(sf, start, p_order, gap, rev);
01451         }
01452 
01462         template <class CT>
01463             bool obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
01464             {
01465                 ASSERT(sf);
01466 
01467                 CAlphabet* alpha=sf->get_alphabet();
01468                 ASSERT(alpha->get_num_symbols_in_histogram() > 0);
01469 
01470                 this->order=p_order;
01471                 cleanup();
01472 
01473                 num_vectors=sf->get_num_vectors();
01474                 ASSERT(num_vectors>0);
01475                 max_string_length=sf->get_max_vector_length()-start;
01476                 features=new T_STRING<ST>[num_vectors];
01477 
01478                 SG_DEBUG( "%1.0llf symbols in StringFeatures<*> %d symbols in histogram\n", sf->get_num_symbols(),
01479                         alpha->get_num_symbols_in_histogram());
01480 
01481                 for (int32_t i=0; i<num_vectors; i++)
01482                 {
01483                     int32_t len=-1;
01484                     bool vfree;
01485                     CT* c=sf->get_feature_vector(i, len, vfree);
01486                     ASSERT(!vfree); // won't work when preprocessors are attached
01487 
01488                     features[i].string=new ST[len];
01489                     features[i].length=len;
01490 
01491                     ST* str=features[i].string;
01492                     for (int32_t j=0; j<len; j++)
01493                         str[j]=(ST) alpha->remap_to_bin(c[j]);
01494                 }
01495 
01496                 original_num_symbols=alpha->get_num_symbols();
01497                 int32_t max_val=alpha->get_num_bits();
01498 
01499                 SG_UNREF(alpha);
01500 
01501                 if (p_order>1)
01502                     num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01503                 else
01504                     num_symbols=original_num_symbols;
01505                 SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01506 
01507                 if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01508                 {
01509                     SG_ERROR( "symbol does not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01510                     return false;
01511                 }
01512 
01513                 SG_DEBUG( "translate: start=%i order=%i gap=%i(size:%i)\n", start, p_order, gap, sizeof(ST)) ;
01514                 for (int32_t line=0; line<num_vectors; line++)
01515                 {
01516                     int32_t len=0;
01517                     bool vfree;
01518                     ST* fv=get_feature_vector(line, len, vfree);
01519                     ASSERT(!vfree); // won't work when preprocessors are attached
01520 
01521                     if (rev)
01522                         CAlphabet::translate_from_single_order_reversed(fv, len, start+gap, p_order+gap, max_val, gap);
01523                     else
01524                         CAlphabet::translate_from_single_order(fv, len, start+gap, p_order+gap, max_val, gap);
01525 
01526                     /* fix the length of the string -- hacky */
01527                     features[line].length-=start+gap ;
01528                     if (features[line].length<0)
01529                         features[line].length=0 ;
01530                 }
01531 
01532                 compute_symbol_mask_table(max_val);
01533 
01534                 return true;
01535             }
01536 
01544         bool have_same_length(int32_t len=-1)
01545         {
01546             if (len!=-1)
01547             {
01548                 if (len!=get_max_vector_length())
01549                     return false;
01550             }
01551             len = get_max_vector_length();
01552 
01553             for (int32_t i=0; i<num_vectors; i++)
01554             {
01555                 if (get_vector_length(i)!=len)
01556                     return false;
01557             }
01558 
01559             return true;
01560         }
01561 
01566         inline void embed_features(int32_t p_order)
01567         {
01568             ASSERT(alphabet->get_num_symbols_in_histogram() > 0);
01569 
01570             order=p_order;
01571             original_num_symbols=alphabet->get_num_symbols();
01572             int32_t max_val=alphabet->get_num_bits();
01573 
01574             if (p_order>1)
01575                 num_symbols=CMath::powl((floatmax_t) 2, (floatmax_t) max_val*p_order);
01576             else
01577                 num_symbols=original_num_symbols;
01578 
01579             SG_INFO( "max_val (bit): %d order: %d -> results in num_symbols: %.0Lf\n", max_val, p_order, num_symbols);
01580 
01581             if ( ((floatmax_t) num_symbols) > CMath::powl(((floatmax_t) 2),((floatmax_t) sizeof(ST)*8)) )
01582                 SG_WARNING("symbols did not fit into datatype \"%c\" (%d)\n", (char) max_val, (int) max_val);
01583 
01584             ST mask=0;
01585             for (int32_t i=0; i<p_order*max_val; i++)
01586                 mask= (mask<<1) | ((ST) 1);
01587 
01588             for (int32_t i=0; i<num_vectors; i++)
01589             {
01590                 int32_t len=features[i].length;
01591 
01592                 if (len < p_order)
01593                     SG_ERROR("Sequence must be longer than order (%d vs. %d)\n", len, p_order);
01594 
01595                 ST* str = features[i].string;
01596 
01597                 // convert first word
01598                 for (int32_t j=0; j<p_order; j++)
01599                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01600                 str[0]=embed_word(&str[0], p_order);
01601 
01602                 // convert the rest
01603                 int32_t idx=0;
01604                 for (int32_t j=p_order; j<len; j++)
01605                 {
01606                     str[j]=(ST) alphabet->remap_to_bin(str[j]);
01607                     str[idx+1]= ((str[idx]<<max_val) | str[j]) & mask;
01608                     idx++;
01609                 }
01610 
01611                 features[i].length=len-p_order+1;
01612             }
01613 
01614             compute_symbol_mask_table(max_val);
01615         }
01616 
01621         inline void compute_symbol_mask_table(int64_t max_val)
01622         {
01623             delete[] symbol_mask_table;
01624             symbol_mask_table=new ST[256];
01625 
01626             uint64_t mask=0;
01627             for (int32_t i=0; i< (int64_t) max_val; i++)
01628                 mask=(mask<<1) | 1;
01629 
01630             for (int32_t i=0; i<256; i++)
01631             {
01632                 uint8_t bits=(uint8_t) i;
01633                 symbol_mask_table[i]=0;
01634 
01635                 for (int32_t j=0; j<8; j++)
01636                 {
01637                     if (bits & 1)
01638                         symbol_mask_table[i]|=mask<<(max_val*j);
01639 
01640                     bits>>=1;
01641                 }
01642             }
01643         }
01644 
01651         inline void unembed_word(ST word, uint8_t* seq, int32_t len)
01652         {
01653             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01654 
01655             ST mask=0;
01656             for (int32_t i=0; i<nbits; i++)
01657                 mask=(mask<<1) | (ST) 1;
01658 
01659             for (int32_t i=0; i<len; i++)
01660             {
01661                 ST w=(word & mask);
01662                 seq[len-i-1]=alphabet->remap_to_char((uint8_t) w);
01663                 word>>=nbits;
01664             }
01665         }
01666 
01672         inline ST embed_word(ST* seq, int32_t len)
01673         {
01674             ST value=(ST) 0;
01675             uint32_t nbits= (uint32_t) alphabet->get_num_bits();
01676             for (int32_t i=0; i<len; i++)
01677             {
01678                 value<<=nbits;
01679                 value|=seq[i];
01680             }
01681 
01682             return value;
01683         }
01684 
01687         void determine_maximum_string_length()
01688         {
01689             max_string_length=0;
01690 
01691             for (int32_t i=0; i<num_vectors; i++)
01692                 max_string_length=CMath::max(max_string_length, features[i].length);
01693         }
01694 
01695         static ST* get_zero_terminated_string_copy(T_STRING<ST> str)
01696         {
01697             int32_t l=str.length;
01698             ST* s=new ST[l+1];
01699             memcpy(s, str.string, sizeof(ST)*l);
01700             s[l]='\0';
01701             return s;
01702         }
01703 
01710         virtual void set_feature_vector(int32_t num, ST* string, int32_t len)
01711         {
01712             ASSERT(features);
01713             ASSERT(num<num_vectors);
01714 
01715             features[num].length=len ;
01716             features[num].string=string ;
01717 
01718             max_string_length=CMath::max(len, max_string_length);
01719         }
01720 
01721 
01723         inline virtual const char* get_name() const { return "StringFeatures"; }
01724 
01725     protected:
01726 
01737         virtual ST* compute_feature_vector(int32_t num, int32_t& len)
01738         {
01739             ASSERT(features && num<num_vectors);
01740 
01741             len=features[num].length;
01742             if (len<=0)
01743                 return NULL;
01744 
01745             ST* target=new ST[len];
01746             memcpy(target, features[num].string, len*sizeof(ST));
01747             return target;
01748         }
01749 
01750 #ifdef HAVE_BOOST_SERIALIZATION
01751     private:
01752 
01753         friend class ::boost::serialization::access;
01754         template<class Archive>
01755             void save(Archive & ar, const unsigned int archive_version) const
01756             {
01757 
01758                 SG_DEBUG("archiving StringFeatures\n");
01759 
01760                 ar & ::boost::serialization::base_object<CFeatures>(*this);
01761 
01762                 ar & alphabet;
01763 
01764                 ar & num_vectors;
01765                 for (int i=0; i < num_vectors; ++i) {
01766                     ar & features[i];
01767                 }
01768 
01769                 ar & length_of_single_string;
01770                 for (int i=0; i < length_of_single_string; ++i) {
01771                     ar & single_string[i];
01772                 }
01773 
01774                 ar & max_string_length;
01775                 ar & num_symbols;
01776                 ar & original_num_symbols;
01777                 ar & order;
01778 
01780                 //TODO?! how long
01781                 //ST* symbol_mask_table;
01782 
01783                 SG_DEBUG("done archiving StringFeatures\n");
01784 
01785             }
01786 
01787         template<class Archive>
01788             void load(Archive & ar, const unsigned int archive_version)
01789             {
01790 
01791                 SG_DEBUG("archiving StringFeatures\n");
01792 
01793                 ar & ::boost::serialization::base_object<CFeatures>(*this);
01794 
01795 
01796                 ar & alphabet;
01797 
01798                 ar & num_vectors;
01799 
01800                 //T_STRING<ST>* features = new T_STRING<ST>[num_vectors];
01801                 features = new T_STRING<ST>[num_vectors];
01802                 for (int i=0; i < num_vectors; ++i) {
01803                     ar & features[i];
01804                 }
01805 
01806 
01807                 ar & length_of_single_string;
01808 
01809                 //ST* single_string = new ST[length_of_single_string];
01810                 single_string = new ST[length_of_single_string];
01811                 for (int i=0; i < length_of_single_string; ++i) {
01812                     ar & single_string[i];
01813                 }
01814 
01815                 ar & max_string_length;
01816                 ar & num_symbols;
01817                 ar & original_num_symbols;
01818                 ar & order;
01819 
01821                 //TODO?! how long -> num_of_symbols?
01822                 //ST* symbol_mask_table;
01823 
01824                 SG_DEBUG("done archiving StringFeatures\n");
01825 
01826             }
01827 
01828         GLOBAL_BOOST_SERIALIZATION_SPLIT_MEMBER();
01829 
01830 
01831 #endif //HAVE_BOOST_SERIALIZATION
01832 
01833 
01834     protected:
01835 
01837         CAlphabet* alphabet; ///< alphabet of the strings; replaced via SG_UNREF/SG_REF in set_features()
01838 
01840         int32_t num_vectors; ///< number of entries in features
01841 
01843         T_STRING<ST>* features; ///< array of num_vectors strings
01844 
01846         ST* single_string; ///< start of the underlying sequence once obtain_by_sliding_window()/obtain_by_position_list() turned one string into windows
01847 
01849         int32_t length_of_single_string; ///< length of the sequence single_string points to
01850 
01852         int32_t max_string_length; ///< length of the longest string in features
01853 
01855         floatmax_t num_symbols; ///< number of symbols after embedding (2^(bits*order) for order>1, else original_num_symbols)
01856 
01858         floatmax_t original_num_symbols; ///< alphabet's get_num_symbols() at the time of embedding
01859 
01861         int32_t order; ///< order set by embed_features()/obtain_from_char_features()
01862 
01864         ST* symbol_mask_table; ///< 256 entry table built by compute_symbol_mask_table()
01865 
01867         bool preprocess_on_get; ///< NOTE(review): not used in this part of the file; presumably applies preprocessors when vectors are fetched - confirm
01868 
01870         CCache<ST>* feature_cache; ///< NOTE(review): not used in this part of the file; presumably caches computed feature vectors - confirm
01871 };
01872 
01873 #ifndef DOXYGEN_SHOULD_SKIP_THIS
01874 
01878 template<> inline EFeatureType CStringFeatures<bool>::get_feature_type()
01879 {
01880     return F_BOOL;
01881 }
01882 
01887 template<> inline EFeatureType CStringFeatures<char>::get_feature_type()
01888 {
01889     return F_CHAR;
01890 }
01891 
01896 template<> inline EFeatureType CStringFeatures<uint8_t>::get_feature_type()
01897 {
01898     return F_BYTE;
01899 }
01900 
01905 template<> inline EFeatureType CStringFeatures<int16_t>::get_feature_type()
01906 {
01907     return F_SHORT;
01908 }
01909 
01914 template<> inline EFeatureType CStringFeatures<uint16_t>::get_feature_type()
01915 {
01916     return F_WORD;
01917 }
01918 
01923 template<> inline EFeatureType CStringFeatures<int32_t>::get_feature_type()
01924 {
01925     return F_INT;
01926 }
01927 
01932 template<> inline EFeatureType CStringFeatures<uint32_t>::get_feature_type()
01933 {
01934     return F_UINT;
01935 }
01936 
01941 template<> inline EFeatureType CStringFeatures<int64_t>::get_feature_type()
01942 {
01943     return F_LONG;
01944 }
01945 
01950 template<> inline EFeatureType CStringFeatures<uint64_t>::get_feature_type()
01951 {
01952     return F_ULONG;
01953 }
01954 
01959 template<> inline EFeatureType CStringFeatures<float32_t>::get_feature_type()
01960 {
01961     return F_SHORTREAL;
01962 }
01963 
01968 template<> inline EFeatureType CStringFeatures<float64_t>::get_feature_type()
01969 {
01970     return F_DREAL;
01971 }
01972 
01977 template<> inline EFeatureType CStringFeatures<floatmax_t>::get_feature_type()
01978 {
01979     return F_LONGREAL;
01980 }
01981 
01982 template<> inline bool CStringFeatures<bool>::get_masked_symbols(bool symbol, uint8_t mask)
01983 {
01984     return symbol;
01985 }
01986 template<> inline float32_t CStringFeatures<float32_t>::get_masked_symbols(float32_t symbol, uint8_t mask)
01987 {
01988     return symbol;
01989 }
01990 template<> inline float64_t CStringFeatures<float64_t>::get_masked_symbols(float64_t symbol, uint8_t mask)
01991 {
01992     return symbol;
01993 }
01994 template<> inline floatmax_t CStringFeatures<floatmax_t>::get_masked_symbols(floatmax_t symbol, uint8_t mask)
01995 {
01996     return symbol;
01997 }
01998 
01999 template<> inline bool CStringFeatures<bool>::shift_offset(bool symbol, int32_t amount)
02000 {
02001     return false;
02002 }
02003 template<> inline float32_t CStringFeatures<float32_t>::shift_offset(float32_t symbol, int32_t amount)
02004 {
02005     return 0;
02006 }
02007 template<> inline float64_t CStringFeatures<float64_t>::shift_offset(float64_t symbol, int32_t amount)
02008 {
02009     return 0;
02010 }
02011 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_offset(floatmax_t symbol, int32_t amount)
02012 {
02013     return 0;
02014 }
02015 
02016 template<> inline bool CStringFeatures<bool>::shift_symbol(bool symbol, int32_t amount)
02017 {
02018     return symbol;
02019 }
02020 template<> inline float32_t CStringFeatures<float32_t>::shift_symbol(float32_t symbol, int32_t amount)
02021 {
02022     return symbol;
02023 }
02024 template<> inline float64_t CStringFeatures<float64_t>::shift_symbol(float64_t symbol, int32_t amount)
02025 {
02026     return symbol;
02027 }
02028 template<> inline floatmax_t CStringFeatures<floatmax_t>::shift_symbol(floatmax_t symbol, int32_t amount)
02029 {
02030     return symbol;
02031 }
02032 
02033 #ifndef SUNOS
02034 template<>  template <class CT> bool CStringFeatures<float32_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02035 {
02036     return false;
02037 }
02038 template<>  template <class CT> bool CStringFeatures<float64_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02039 {
02040     return false;
02041 }
02042 template<>  template <class CT> bool CStringFeatures<floatmax_t>::obtain_from_char_features(CStringFeatures<CT>* sf, int32_t start, int32_t p_order, int32_t gap, bool rev)
02043 {
02044     return false;
02045 }
02046 #endif
02047 
02048 template<>  inline void CStringFeatures<float32_t>::embed_features(int32_t p_order)
02049 {
02050 }
02051 template<>  inline void CStringFeatures<float64_t>::embed_features(int32_t p_order)
02052 {
02053 }
02054 template<>  inline void CStringFeatures<floatmax_t>::embed_features(int32_t p_order)
02055 {
02056 }
02057 
02058 template<>  inline void CStringFeatures<float32_t>::compute_symbol_mask_table(int64_t max_val)
02059 {
02060 }
02061 template<>  inline void CStringFeatures<float64_t>::compute_symbol_mask_table(int64_t max_val)
02062 {
02063 }
02064 template<>  inline void CStringFeatures<floatmax_t>::compute_symbol_mask_table(int64_t max_val)
02065 {
02066 }
02067 
02068 template<>  inline float32_t CStringFeatures<float32_t>::embed_word(float32_t* seq, int32_t len)
02069 {
02070     return 0;
02071 }
02072 template<>  inline float64_t CStringFeatures<float64_t>::embed_word(float64_t* seq, int32_t len)
02073 {
02074     return 0;
02075 }
02076 template<>  inline floatmax_t CStringFeatures<floatmax_t>::embed_word(floatmax_t* seq, int32_t len)
02077 {
02078     return 0;
02079 }
02080 
02081 template<>  inline void CStringFeatures<float32_t>::unembed_word(float32_t word, uint8_t* seq, int32_t len)
02082 {
02083 }
02084 template<>  inline void CStringFeatures<float64_t>::unembed_word(float64_t word, uint8_t* seq, int32_t len)
02085 {
02086 }
02087 template<>  inline void CStringFeatures<floatmax_t>::unembed_word(floatmax_t word, uint8_t* seq, int32_t len)
02088 {
02089 }
02090 #endif // DOXYGEN_SHOULD_SKIP_THIS
02091 }
02092 #endif // _CSTRINGFEATURES__H__

SHOGUN Machine Learning Toolbox - Documentation