Template class StringFeatures implements a list of strings.
As this class is a template, the underlying storage type is quite arbitrary and not limited to character strings, but could also be sequences of floating point numbers etc. Strings differ from matrices (cf. CSimpleFeatures) in that the dimensionality of the feature vectors (i.e. the strings) is not fixed; it may vary between strings.
Most string kernels require StringFeatures, but a number of them actually require the strings to have the same length.
When preprocessors are attached to string features they may shorten the string, but are not allowed to return strings longer than max_string_length, as some algorithms depend on this.
Also note that string features cannot currently be computed on-the-fly.
Definition at line 126 of file StringFeatures.h.
Public Member Functions | |
CStringFeatures () | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, EAlphabet alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (char *fname, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
virtual void | cleanup () |
virtual void | cleanup_feature_vector (int32_t num) |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
void | get_feature_vector (ST **dst, int32_t *len, int32_t num) |
void | set_feature_vector (ST *src, int32_t len, int32_t num) |
void | enable_on_the_fly_preprocessing () |
void | disable_on_the_fly_preprocessing () |
ST * | get_feature_vector (int32_t num, int32_t &len, bool &dofree) |
void | free_feature_vector (ST *feat_vec, int32_t num, bool dofree) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual bool | load (char *fname) |
bool | load_dna_file (char *fname, bool remap_to_bin=true) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
bool | set_features (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
virtual T_STRING< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual T_STRING< ST > * | copy_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (T_STRING< ST > **dst, int32_t *num_str) |
virtual bool | save (char *dest) |
virtual bool | load_compressed (char *src, bool decompress) |
virtual bool | save_compressed (char *dest, E_COMPRESSION_TYPE compression, int level) |
virtual int32_t | get_size () |
virtual bool | apply_preproc (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
virtual const char * | get_name () const |
Static Public Member Functions | |
static ST * | get_zero_terminated_string_copy (T_STRING< ST > str) |
Protected Member Functions | |
virtual ST * | compute_feature_vector (int32_t num, int32_t &len) |
Protected Attributes | |
CAlphabet * | alphabet |
alphabet | |
int32_t | num_vectors |
number of string vectors | |
T_STRING< ST > * | features |
this contains the array of features. | |
ST * | single_string |
true when single string / created by sliding window | |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
length of longest string | |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table (required to access bit-based symbols) | |
bool | preprocess_on_get |
preprocess on-the-fly? | |
CCache< ST > * | feature_cache |
CStringFeatures | ( | ) |
default constructor
Definition at line 132 of file StringFeatures.h.
CStringFeatures | ( | EAlphabet | alpha | ) |
constructor
alpha | alphabet (type) to use for string features |
Definition at line 143 of file StringFeatures.h.
CStringFeatures | ( | T_STRING< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length, | |||
EAlphabet | alpha | |||
) |
constructor
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length | |
alpha | alphabet (type) to use for string features |
Definition at line 162 of file StringFeatures.h.
CStringFeatures | ( | CAlphabet * | alpha | ) |
constructor
alpha | alphabet to use for string features |
Definition at line 180 of file StringFeatures.h.
CStringFeatures | ( | const CStringFeatures< ST > & | orig | ) |
copy constructor
Definition at line 194 of file StringFeatures.h.
CStringFeatures | ( | char * | fname, | |
EAlphabet | alpha = DNA | |||
) |
constructor
fname | filename to load features from | |
alpha | alphabet (type) to use for string features |
Definition at line 235 of file StringFeatures.h.
virtual ~CStringFeatures | ( | ) | [virtual] |
Definition at line 247 of file StringFeatures.h.
virtual bool apply_preproc | ( | bool | force_preprocessing = false |
) | [virtual] |
apply preprocessor
force_preprocessing | if preprocessing shall be forced |
Definition at line 1308 of file StringFeatures.h.
virtual void cleanup | ( | ) | [virtual] |
cleanup string features
Reimplemented in CStringFileFeatures< ST >.
Definition at line 255 of file StringFeatures.h.
virtual void cleanup_feature_vector | ( | int32_t | num | ) | [virtual] |
cleanup a single feature vector
Reimplemented in CStringFileFeatures< ST >.
Definition at line 285 of file StringFeatures.h.
virtual ST* compute_feature_vector | ( | int32_t | num, | |
int32_t & | len | |||
) | [protected, virtual] |
compute feature vector for sample num; if target is set, the vector is written to target; len is returned by reference
default implementation returns
num | which vector | |
len | length of vector |
Definition at line 1737 of file StringFeatures.h.
void compute_symbol_mask_table | ( | int64_t | max_val | ) |
compute symbol mask table
required to access bit-based symbols
Definition at line 1621 of file StringFeatures.h.
virtual T_STRING<ST>* copy_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
copy_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 1101 of file StringFeatures.h.
void determine_maximum_string_length | ( | ) |
determine new maximum string length
Definition at line 1687 of file StringFeatures.h.
void disable_on_the_fly_preprocessing | ( | ) |
call this to disable on the fly feature preprocessing on get_feature_vector. Useful when you manually apply preprocessors.
Definition at line 388 of file StringFeatures.h.
virtual CFeatures* duplicate | ( | ) | const [virtual] |
duplicate feature object
Implements CFeatures.
Definition at line 322 of file StringFeatures.h.
void embed_features | ( | int32_t | p_order | ) |
embed string features in bit representation in-place
Definition at line 1566 of file StringFeatures.h.
ST embed_word | ( | ST * | seq, | |
int32_t | len | |||
) |
embed a single word
seq | sequence of size len in a bitfield | |
len | length of the sequence seq |
Definition at line 1672 of file StringFeatures.h.
void enable_on_the_fly_preprocessing | ( | ) |
call this to preprocess string features upon get_feature_vector
Definition at line 380 of file StringFeatures.h.
void free_feature_vector | ( | ST * | feat_vec, | |
int32_t | num, | |||
bool | dofree | |||
) |
free feature vector
feat_vec | feature vector to free | |
num | index in feature cache | |
dofree | if vector should be really deleted |
Definition at line 442 of file StringFeatures.h.
CAlphabet* get_alphabet | ( | ) |
get alphabet used in string features
Definition at line 312 of file StringFeatures.h.
virtual ST get_feature | ( | int32_t | vec_num, | |
int32_t | feat_num | |||
) | [virtual] |
get feature
vec_num | which vector | |
feat_num | which feature |
Definition at line 457 of file StringFeatures.h.
virtual EFeatureClass get_feature_class | ( | ) | [virtual] |
get feature class
Implements CFeatures.
Definition at line 300 of file StringFeatures.h.
virtual EFeatureType get_feature_type | ( | ) | [virtual] |
get feature type
Implements CFeatures.
Definition at line 306 of file StringFeatures.h.
ST* get_feature_vector | ( | int32_t | num, | |
int32_t & | len, | |||
bool & | dofree | |||
) |
get feature vector for sample num
num | index of feature vector | |
len | length is returned by reference | |
dofree | whether returned vector must be freed by caller via free_feature_vector |
Definition at line 401 of file StringFeatures.h.
void get_feature_vector | ( | ST ** | dst, | |
int32_t * | len, | |||
int32_t | num | |||
) |
get string for selected example num
dst | destination where vector will be stored | |
len | number of features in vector | |
num | index of the string |
Definition at line 333 of file StringFeatures.h.
virtual void get_features | ( | T_STRING< ST > ** | dst, | |
int32_t * | num_str | |||
) | [virtual] |
get_features (swig compatible)
dst | string features (returned) | |
num_str | number of strings (returned) |
Definition at line 1128 of file StringFeatures.h.
virtual T_STRING<ST>* get_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
get_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
Definition at line 1088 of file StringFeatures.h.
ST get_masked_symbols | ( | ST | symbol, | |
uint8_t | mask | |||
) |
a higher order mapped symbol will be shaped such that the symbols specified by bits in the mask will be returned.
symbol | symbol to mask | |
mask | mask to apply |
Definition at line 536 of file StringFeatures.h.
floatmax_t get_max_num_symbols | ( | ) |
get maximum number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough (and there is no int128_t type)
Definition at line 513 of file StringFeatures.h.
virtual int32_t get_max_vector_length | ( | ) | [virtual] |
get maximum vector length
Definition at line 487 of file StringFeatures.h.
virtual const char* get_name | ( | ) | const [virtual] |
floatmax_t get_num_symbols | ( | ) |
get number of symbols
Note: floatmax_t sounds weird, but LONG is not long enough
Definition at line 504 of file StringFeatures.h.
virtual int32_t get_num_vectors | ( | ) | [virtual] |
get number of vectors
Implements CFeatures.
Definition at line 496 of file StringFeatures.h.
int32_t get_order | ( | ) |
floatmax_t get_original_num_symbols | ( | ) |
number of symbols before higher order mapping
Definition at line 521 of file StringFeatures.h.
virtual int32_t get_size | ( | ) | [virtual] |
get memory footprint of one feature
Implements CFeatures.
Definition at line 1301 of file StringFeatures.h.
virtual int32_t get_vector_length | ( | int32_t | vec_num | ) | [virtual] |
get vector length
vec_num | which vector |
Definition at line 474 of file StringFeatures.h.
static ST* get_zero_terminated_string_copy | ( | T_STRING< ST > | str | ) | [static] |
Definition at line 1695 of file StringFeatures.h.
bool have_same_length | ( | int32_t | len = -1 |
) |
check if length of each vector in this feature object equals the given length.
len | vector length to check against |
Definition at line 1544 of file StringFeatures.h.
virtual bool load | ( | char * | fname | ) | [virtual] |
load features from file
fname | filename to load from |
Reimplemented from CFeatures.
Definition at line 571 of file StringFeatures.h.
virtual bool load_compressed | ( | char * | src, | |
bool | decompress | |||
) | [virtual] |
load compressed features from file
src | filename to load from | |
decompress | whether to decompress on loading |
Definition at line 1152 of file StringFeatures.h.
bool load_dna_file | ( | char * | fname, | |
bool | remap_to_bin = true | |||
) |
load DNA features from file
fname | filename to load from | |
remap_to_bin | if remap_to_bin |
Definition at line 634 of file StringFeatures.h.
bool load_fasta_file | ( | const char * | fname, | |
bool | ignore_invalid = false | |||
) |
load fasta file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
Definition at line 771 of file StringFeatures.h.
bool load_fastq_file | ( | const char * | fname, | |
bool | ignore_invalid = false, | |||
bool | bitremap_in_single_string = false | |||
) |
load fastq file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A | |
bitremap_in_single_string | if set to true, do binary embedding of symbols |
Definition at line 869 of file StringFeatures.h.
bool load_from_directory | ( | char * | dirname | ) |
load features from directory
dirname | directory name to load from |
Definition at line 974 of file StringFeatures.h.
int32_t obtain_by_position_list | ( | int32_t | window_size, | |
CDynamicArray< int32_t > * | positions, | |||
int32_t | skip = 0 | |||
) |
extracts windows of size window_size from first string using the positions in list
window_size | window size | |
positions | positions | |
skip | skip |
Definition at line 1383 of file StringFeatures.h.
int32_t obtain_by_sliding_window | ( | int32_t | window_size, | |
int32_t | step_size, | |||
int32_t | skip = 0 | |||
) |
Slides a window of size window_size over the current single string; step_size is the amount by which the window is shifted. Creates (string_len-window_size)/step_size many feature objects. If skip is nonzero, the first 'skip' characters of each string are skipped.
window_size | window size | |
step_size | step size | |
skip | skip |
Definition at line 1341 of file StringFeatures.h.
bool obtain_from_char | ( | CStringFeatures< char > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
obtain string features from char features
wrapper for template method
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1448 of file StringFeatures.h.
bool obtain_from_char_features | ( | CStringFeatures< CT > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
template obtain from char features
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
Definition at line 1463 of file StringFeatures.h.
virtual bool save | ( | char * | dest | ) | [virtual] |
save features to file
dest | filename to save to |
Reimplemented from CFeatures.
Definition at line 1141 of file StringFeatures.h.
virtual bool save_compressed | ( | char * | dest, | |
E_COMPRESSION_TYPE | compression, | |||
int | level | |||
) | [virtual] |
save compressed features to file
dest | filename to save to | |
compression | compressor to use | |
level | compression level to use (1-9) |
Definition at line 1239 of file StringFeatures.h.
virtual void set_feature_vector | ( | int32_t | num, | |
ST * | string, | |||
int32_t | len | |||
) | [virtual] |
set feature vector for sample num
num | index of feature vector | |
string | string with the feature vector's content | |
len | length of the string |
Definition at line 1710 of file StringFeatures.h.
void set_feature_vector | ( | ST * | src, | |
int32_t | len, | |||
int32_t | num | |||
) |
set string for selected example num
src | source vector to be stored as the string |
len | number of features in vector | |
num | index of the string |
Definition at line 357 of file StringFeatures.h.
bool set_features | ( | T_STRING< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
set features
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
Definition at line 1048 of file StringFeatures.h.
ST shift_offset | ( | ST | offset, | |
int32_t | amount | |||
) |
shift offset to the left by amount
offset | offset to shift | |
amount | amount to shift the offset |
Definition at line 548 of file StringFeatures.h.
ST shift_symbol | ( | ST | symbol, | |
int32_t | amount | |||
) |
shift symbol to the right by amount (taking care of custom symbol sizes)
symbol | symbol to shift | |
amount | amount to shift the symbol |
Definition at line 560 of file StringFeatures.h.
void unembed_word | ( | ST | word, | |
uint8_t * | seq, | |||
int32_t | len | |||
) |
remap bit-based word to character sequence
word | word to remap | |
seq | sequence of size len that remapped characters are written to | |
len | length of sequence and word |
Definition at line 1651 of file StringFeatures.h.
alphabet
Definition at line 1837 of file StringFeatures.h.
CCache<ST>* feature_cache [protected] |
feature cache
Definition at line 1870 of file StringFeatures.h.
T_STRING<ST>* features [protected] |
this contains the array of features.
Definition at line 1843 of file StringFeatures.h.
int32_t length_of_single_string [protected] |
length of prior single string
Definition at line 1849 of file StringFeatures.h.
int32_t max_string_length [protected] |
length of longest string
Definition at line 1852 of file StringFeatures.h.
floatmax_t num_symbols [protected] |
number of used symbols
Definition at line 1855 of file StringFeatures.h.
int32_t num_vectors [protected] |
number of string vectors
Definition at line 1840 of file StringFeatures.h.
int32_t order [protected] |
order used in higher order mapping
Definition at line 1861 of file StringFeatures.h.
floatmax_t original_num_symbols [protected] |
original number of used symbols (before higher order mapping)
Definition at line 1858 of file StringFeatures.h.
bool preprocess_on_get [protected] |
preprocess on-the-fly?
Definition at line 1867 of file StringFeatures.h.
ST* single_string [protected] |
true when single string / created by sliding window
Definition at line 1846 of file StringFeatures.h.
ST* symbol_mask_table [protected] |
symbol mask table (required to access bit-based symbols)
Definition at line 1864 of file StringFeatures.h.