BitString.h
Go to the documentation of this file.00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 #ifndef __BITSTRING_H__
00011 #define __BITSTRING_H__
00012
00013 #include <shogun/features/Alphabet.h>
00014 #include <shogun/lib/common.h>
00015 #include <shogun/lib/io.h>
00016 #include <shogun/lib/MemoryMappedFile.h>
00017 #include <shogun/lib/Mathematics.h>
00018
00019 namespace shogun
00020 {
00028 class CBitString : public CSGObject
00029 {
00030 public:
00038 CBitString(EAlphabet alpha, int32_t width=1) : CSGObject(), string(NULL), length(0)
00039 {
00040 alphabet=new CAlphabet(alpha);
00041 int32_t nbits=alphabet->get_num_bits();
00042 word_len = width*nbits;
00043
00044 mask=0;
00045 for (int32_t j=0; j<word_len; j++)
00046 mask=(mask<<1) | (uint64_t) 1;
00047 mask<<=sizeof(uint64_t)*8-word_len;
00048
00049 single_mask=0;
00050 for (int32_t j=0; j<nbits; j++)
00051 mask=(mask<<1) | (uint64_t) 1;
00052 }
00053
00055 ~CBitString()
00056 {
00057 cleanup();
00058 SG_UNREF(alphabet);
00059 }
00060
00062 void cleanup()
00063 {
00064 delete[] string;
00065 string=NULL;
00066 length=0;
00067 }
00068
00074 void obtain_from_char(char* str, uint64_t len)
00075 {
00076 cleanup();
00077 uint64_t stream_len=len/sizeof(uint64_t)+1;
00078 string=new uint64_t[stream_len];
00079 length=len;
00080
00081 uint64_t w=0;
00082 int32_t nbits=alphabet->get_num_bits();
00083 uint64_t j=0;
00084 int32_t nfit=8*sizeof(w)/nbits;
00085 for (uint64_t i=0; i<len; i++)
00086 {
00087 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) str[j]);
00088
00089 if (i % nfit == nfit-1)
00090 {
00091 string[j]=w;
00092 j++;
00093 }
00094 }
00095
00096 if (j<stream_len)
00097 {
00098 string[j]=w;
00099 j++;
00100 }
00101
00102 ASSERT(j==stream_len);
00103 }
00104
00111 void load_fasta_file(const char* fname, bool ignore_invalid=false)
00112 {
00113 cleanup();
00114
00115 uint64_t len=0;
00116 uint64_t offs=0;
00117
00118 CMemoryMappedFile<char> f(fname);
00119
00120 uint64_t id_len=0;
00121 char* id=f.get_line(id_len, offs);
00122
00123 if (!id_len || id[0]!='>')
00124 SG_SERROR("No fasta hunks (lines starting with '>') found\n");
00125
00126 if (offs==f.get_size())
00127 SG_SERROR("Empty file?\n");
00128
00129 char* fasta=NULL;
00130 char* s=NULL;
00131 int32_t spanned_lines=0;
00132 int32_t fasta_len=0;
00133
00134 while (true)
00135 {
00136 s=f.get_line(len, offs);
00137
00138 if (!fasta)
00139 fasta=s;
00140
00141 if (!s || len==0)
00142 SG_SERROR("Error reading fasta entry in line %d len=%ld", spanned_lines+1, len);
00143
00144 if (s[0]=='>')
00145 SG_SERROR("Multiple fasta hunks (lines starting with '>') are not supported!\n");
00146
00147 if (offs==f.get_size())
00148 {
00149 offs-=len+1;
00150 fasta_len+=len;
00151
00152 uint64_t w=0;
00153 int32_t nbits=alphabet->get_num_bits();
00154 uint64_t nfit=8*sizeof(w)/nbits;
00155
00156 len = fasta_len-spanned_lines;
00157 uint64_t stream_len=len/(nfit)+1;
00158 string=new uint64_t[stream_len];
00159 length=len;
00160
00161 int32_t idx=0;
00162 int32_t k=0;
00163
00164 for (int32_t j=0; j<fasta_len; j++, k++)
00165 {
00166 if (fasta[j]=='\n')
00167 {
00168 k--;
00169 continue;
00170 }
00171
00172 w= (w << nbits) | alphabet->remap_to_bin((uint8_t) fasta[j]);
00173
00174 if (k % nfit == nfit-1)
00175 {
00176 string[idx]=w;
00177 w=0;
00178 idx++;
00179 }
00180 }
00181
00182 if (idx<stream_len)
00183 string[idx]=w<<(nbits*(nfit - k%nfit));
00184
00185 break;
00186 }
00187
00188 spanned_lines++;
00189 fasta_len+=len+1;
00190 }
00191 }
00192
00198 void set_string(uint64_t* str, uint64_t len)
00199 {
00200 cleanup();
00201 string=str;
00202 length=len;
00203 }
00204
00209 void create(uint64_t len)
00210 {
00211 cleanup();
00212 uint64_t stream_len=len/sizeof(uint64_t)+1;
00213 string=new uint64_t[stream_len];
00214 CMath::fill_vector(string, (int32_t) stream_len, (uint64_t) 0);
00215 length=len;
00216 }
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00251 inline uint64_t operator[](uint64_t index) const
00252 {
00253 ASSERT(index<length);
00254
00255 uint64_t bitindex=alphabet->get_num_bits()*index;
00256 int32_t ws=8*sizeof(uint64_t);
00257 uint64_t i=bitindex/ws;
00258 int32_t j=bitindex % ws;
00259 int32_t missing=word_len-(ws-j);
00260
00261
00262 uint64_t res= ((string[i] << j) & mask ) >> (ws-word_len);
00263
00264 if (missing>0)
00265 res|= ( string[i+1] >> (ws-missing) );
00266
00267
00268
00269 return res;
00270 }
00271
00277 inline void set_binary_word(uint16_t word, uint64_t index)
00278 {
00279 ASSERT(index<length);
00280
00281 uint64_t bitindex=alphabet->get_num_bits()*index;
00282 int32_t ws=8*sizeof(uint64_t);
00283 uint64_t i=bitindex/ws;
00284 int32_t j=bitindex % ws;
00285 int32_t missing=word_len-(ws-j);
00286
00287 uint64_t sl = j-word_len;
00288 uint64_t ml;
00289 uint64_t wl;
00290 uint64_t mr;
00291 uint64_t wr;
00292
00293 if (sl>0)
00294 {
00295 ml=mask<<sl;
00296 wl=word<<sl ;
00297 }
00298 else
00299 {
00300 sl=-sl;
00301 ml=mask>>sl;
00302 wl=word>>sl;
00303 }
00304
00305 string[i] = ( string[i] & (~ml) ) | ( wl & ml);
00306
00307 if (missing>0)
00308 {
00309 mr = mask<<missing;
00310 wr = word<<missing;
00311 string[i+1] = ( string[i+1] & (~mr) ) | ( wr & mr);
00312 }
00313
00314 }
00315
00317 inline uint64_t get_length() const { return length-word_len/alphabet->get_num_bits()+1; }
00318
00320 inline virtual const char* get_name() const { return "BitString"; }
00321
00322 private:
00324 CAlphabet* alphabet;
00326 uint64_t* string;
00328 uint64_t length;
00330 int32_t word_len;
00332 uint64_t mask;
00334 uint64_t single_mask;
00335 };
00336 }
00337 #endif //__BITSTRING_H__