Template class StringFeatures implements a list of strings.
As this class is a template the underlying storage type is quite arbitrary and not limited to character strings, but could also be sequences of floating point numbers etc. Strings differ from matrices (cf. CSimpleFeatures) in that the dimensionality of the feature vectors (i.e. the strings) is not fixed; it may vary between strings.
Most string kernels require StringFeatures, but a number of them actually require all strings to have the same length.
When preprocessors are attached to string features they may shorten the string, but are not allowed to return strings longer than max_string_length, as some algorithms depend on this.
Also note that string features cannot currently be computed on-the-fly.
在文件StringFeatures.h第126行定义。
公有成员 | |
CStringFeatures () | |
CStringFeatures (EAlphabet alpha) | |
CStringFeatures (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length, EAlphabet alpha) | |
CStringFeatures (CAlphabet *alpha) | |
CStringFeatures (const CStringFeatures &orig) | |
CStringFeatures (char *fname, EAlphabet alpha=DNA) | |
virtual | ~CStringFeatures () |
virtual void | cleanup () |
virtual void | cleanup_feature_vector (int32_t num) |
virtual EFeatureClass | get_feature_class () |
virtual EFeatureType | get_feature_type () |
CAlphabet * | get_alphabet () |
virtual CFeatures * | duplicate () const |
void | get_feature_vector (ST **dst, int32_t *len, int32_t num) |
void | set_feature_vector (ST *src, int32_t len, int32_t num) |
void | enable_on_the_fly_preprocessing () |
void | disable_on_the_fly_preprocessing () |
ST * | get_feature_vector (int32_t num, int32_t &len, bool &dofree) |
void | free_feature_vector (ST *feat_vec, int32_t num, bool dofree) |
virtual ST | get_feature (int32_t vec_num, int32_t feat_num) |
virtual int32_t | get_vector_length (int32_t vec_num) |
virtual int32_t | get_max_vector_length () |
virtual int32_t | get_num_vectors () |
floatmax_t | get_num_symbols () |
floatmax_t | get_max_num_symbols () |
floatmax_t | get_original_num_symbols () |
int32_t | get_order () |
ST | get_masked_symbols (ST symbol, uint8_t mask) |
ST | shift_offset (ST offset, int32_t amount) |
ST | shift_symbol (ST symbol, int32_t amount) |
virtual bool | load (char *fname) |
bool | load_dna_file (char *fname, bool remap_to_bin=true) |
bool | load_fasta_file (const char *fname, bool ignore_invalid=false) |
bool | load_fastq_file (const char *fname, bool ignore_invalid=false, bool bitremap_in_single_string=false) |
bool | load_from_directory (char *dirname) |
bool | set_features (T_STRING< ST > *p_features, int32_t p_num_vectors, int32_t p_max_string_length) |
virtual T_STRING< ST > * | get_features (int32_t &num_str, int32_t &max_str_len) |
virtual T_STRING< ST > * | copy_features (int32_t &num_str, int32_t &max_str_len) |
virtual void | get_features (T_STRING< ST > **dst, int32_t *num_str) |
virtual bool | save (char *dest) |
virtual bool | load_compressed (char *src, bool decompress) |
virtual bool | save_compressed (char *dest, E_COMPRESSION_TYPE compression, int level) |
virtual int32_t | get_size () |
virtual bool | apply_preproc (bool force_preprocessing=false) |
int32_t | obtain_by_sliding_window (int32_t window_size, int32_t step_size, int32_t skip=0) |
int32_t | obtain_by_position_list (int32_t window_size, CDynamicArray< int32_t > *positions, int32_t skip=0) |
bool | obtain_from_char (CStringFeatures< char > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
template<class CT > | |
bool | obtain_from_char_features (CStringFeatures< CT > *sf, int32_t start, int32_t p_order, int32_t gap, bool rev) |
bool | have_same_length (int32_t len=-1) |
void | embed_features (int32_t p_order) |
void | compute_symbol_mask_table (int64_t max_val) |
void | unembed_word (ST word, uint8_t *seq, int32_t len) |
ST | embed_word (ST *seq, int32_t len) |
void | determine_maximum_string_length () |
virtual void | set_feature_vector (int32_t num, ST *string, int32_t len) |
virtual const char * | get_name () const |
静态公有成员 | |
static ST * | get_zero_terminated_string_copy (T_STRING< ST > str) |
保护成员 | |
virtual ST * | compute_feature_vector (int32_t num, int32_t &len) |
保护属性 | |
CAlphabet * | alphabet |
alphabet | |
int32_t | num_vectors |
number of string vectors | |
T_STRING< ST > * | features |
this contains the array of features. | |
ST * | single_string |
true when single string / created by sliding window | |
int32_t | length_of_single_string |
length of prior single string | |
int32_t | max_string_length |
length of longest string | |
floatmax_t | num_symbols |
number of used symbols | |
floatmax_t | original_num_symbols |
original number of used symbols (before higher order mapping) | |
int32_t | order |
order used in higher order mapping | |
ST * | symbol_mask_table |
symbol mask table (cf. compute_symbol_mask_table) | |
bool | preprocess_on_get |
preprocess on-the-fly? | |
CCache< ST > * | feature_cache |
CStringFeatures | ( | ) |
default constructor
在文件StringFeatures.h第132行定义。
CStringFeatures | ( | EAlphabet | alpha | ) |
CStringFeatures | ( | T_STRING< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length, | |||
EAlphabet | alpha | |||
) |
constructor
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length | |
alpha | alphabet (type) to use for string features |
在文件StringFeatures.h第162行定义。
CStringFeatures | ( | CAlphabet * | alpha | ) |
CStringFeatures | ( | const CStringFeatures< ST > & | orig | ) |
copy constructor
在文件StringFeatures.h第194行定义。
CStringFeatures | ( | char * | fname, | |
EAlphabet | alpha = DNA | |||
) |
constructor
fname | filename to load features from | |
alpha | alphabet (type) to use for string features |
在文件StringFeatures.h第235行定义。
virtual ~CStringFeatures | ( | ) | [virtual] |
在文件StringFeatures.h第247行定义。
virtual bool apply_preproc | ( | bool | force_preprocessing = false |
) | [virtual] |
apply preprocessor
force_preprocessing | if preprocessing shall be forced |
在文件StringFeatures.h第1308行定义。
virtual void cleanup | ( | ) | [virtual] |
virtual void cleanup_feature_vector | ( | int32_t | num | ) | [virtual] |
virtual ST* compute_feature_vector | ( | int32_t | num, | |
int32_t & | len | |||
) | [protected, virtual] |
compute feature vector for sample num. If target is set, the vector is written to target. len is returned by reference.
default implementation returns
num | which vector | |
len | length of vector |
在文件StringFeatures.h第1737行定义。
void compute_symbol_mask_table | ( | int64_t | max_val | ) |
virtual T_STRING<ST>* copy_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
copy_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
在文件StringFeatures.h第1101行定义。
void determine_maximum_string_length | ( | ) |
determine new maximum string length
在文件StringFeatures.h第1687行定义。
void disable_on_the_fly_preprocessing | ( | ) |
call this to disable on the fly feature preprocessing on get_feature_vector. Useful when you manually apply preprocessors.
在文件StringFeatures.h第388行定义。
virtual CFeatures* duplicate | ( | ) | const [virtual] |
void embed_features | ( | int32_t | p_order | ) |
embed string features in bit representation in-place
在文件StringFeatures.h第1566行定义。
ST embed_word | ( | ST * | seq, | |
int32_t | len | |||
) |
void enable_on_the_fly_preprocessing | ( | ) |
call this to preprocess string features upon get_feature_vector
在文件StringFeatures.h第380行定义。
void free_feature_vector | ( | ST * | feat_vec, | |
int32_t | num, | |||
bool | dofree | |||
) |
free feature vector
feat_vec | feature vector to free | |
num | index in feature cache | |
dofree | if vector should be really deleted |
在文件StringFeatures.h第442行定义。
CAlphabet* get_alphabet | ( | ) |
virtual ST get_feature | ( | int32_t | vec_num, | |
int32_t | feat_num | |||
) | [virtual] |
virtual EFeatureClass get_feature_class | ( | ) | [virtual] |
virtual EFeatureType get_feature_type | ( | ) | [virtual] |
ST* get_feature_vector | ( | int32_t | num, | |
int32_t & | len, | |||
bool & | dofree | |||
) |
get feature vector for sample num
num | index of feature vector | |
len | length is returned by reference | |
dofree | whether returned vector must be freed by caller via free_feature_vector |
在文件StringFeatures.h第401行定义。
void get_feature_vector | ( | ST ** | dst, | |
int32_t * | len, | |||
int32_t | num | |||
) |
get string for selected example num
dst | destination where vector will be stored | |
len | number of features in vector | |
num | index of the string |
在文件StringFeatures.h第333行定义。
virtual void get_features | ( | T_STRING< ST > ** | dst, | |
int32_t * | num_str | |||
) | [virtual] |
get_features (swig compatible)
dst | string features (returned) | |
num_str | number of strings (returned) |
在文件StringFeatures.h第1128行定义。
virtual T_STRING<ST>* get_features | ( | int32_t & | num_str, | |
int32_t & | max_str_len | |||
) | [virtual] |
get_features
num_str | number of strings (returned) | |
max_str_len | maximal string length (returned) |
在文件StringFeatures.h第1088行定义。
ST get_masked_symbols | ( | ST | symbol, | |
uint8_t | mask | |||
) |
a higher order mapped symbol will be shaped such that the symbols specified by bits in the mask will be returned.
symbol | symbol to mask | |
mask | mask to apply |
在文件StringFeatures.h第536行定义。
floatmax_t get_max_num_symbols | ( | ) |
get maximum number of symbols
Note: floatmax_t sounds weird, but int64_t is not long enough (and there is no int128_t type)
在文件StringFeatures.h第513行定义。
virtual int32_t get_max_vector_length | ( | ) | [virtual] |
virtual const char* get_name | ( | ) | const [virtual] |
floatmax_t get_num_symbols | ( | ) |
get number of symbols
Note: floatmax_t sounds weird, but LONG is not long enough
在文件StringFeatures.h第504行定义。
virtual int32_t get_num_vectors | ( | ) | [virtual] |
int32_t get_order | ( | ) |
floatmax_t get_original_num_symbols | ( | ) |
number of symbols before higher order mapping
在文件StringFeatures.h第521行定义。
virtual int32_t get_size | ( | ) | [virtual] |
get memory footprint of one feature
实现了CFeatures。
在文件StringFeatures.h第1301行定义。
virtual int32_t get_vector_length | ( | int32_t | vec_num | ) | [virtual] |
static ST* get_zero_terminated_string_copy | ( | T_STRING< ST > | str | ) | [static] |
在文件StringFeatures.h第1695行定义。
bool have_same_length | ( | int32_t | len = -1 |
) |
check if length of each vector in this feature object equals the given length.
len | vector length to check against |
在文件StringFeatures.h第1544行定义。
virtual bool load | ( | char * | fname | ) | [virtual] |
load features from file
fname | filename to load from |
重载CFeatures。
在文件StringFeatures.h第571行定义。
virtual bool load_compressed | ( | char * | src, | |
bool | decompress | |||
) | [virtual] |
load compressed features from file
src | filename to load from | |
decompress | whether to decompress on loading |
在文件StringFeatures.h第1152行定义。
bool load_dna_file | ( | char * | fname, | |
bool | remap_to_bin = true | |||
) |
load DNA features from file
fname | filename to load from | |
remap_to_bin | if remap_to_bin |
在文件StringFeatures.h第634行定义。
bool load_fasta_file | ( | const char * | fname, | |
bool | ignore_invalid = false | |||
) |
load fasta file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A |
在文件StringFeatures.h第771行定义。
bool load_fastq_file | ( | const char * | fname, | |
bool | ignore_invalid = false, | |
bool | bitremap_in_single_string = false | |||
) |
load fastq file as string features
fname | filename to load from | |
ignore_invalid | if set to true, characters other than A,C,G,T are converted to A | |
bitremap_in_single_string | if set to true, do binary embedding of symbols |
在文件StringFeatures.h第869行定义。
bool load_from_directory | ( | char * | dirname | ) |
load features from directory
dirname | directory name to load from |
在文件StringFeatures.h第974行定义。
int32_t obtain_by_position_list | ( | int32_t | window_size, | |
CDynamicArray< int32_t > * | positions, | |||
int32_t | skip = 0 | |||
) |
extracts windows of size window_size from first string using the positions in list
window_size | window size | |
positions | positions | |
skip | skip |
在文件StringFeatures.h第1383行定义。
int32_t obtain_by_sliding_window | ( | int32_t | window_size, | |
int32_t | step_size, | |||
int32_t | skip = 0 | |||
) |
slides a window of size window_size over the current single string. step_size is the amount by which the window is shifted. Creates (string_len-window_size)/step_size many feature objects. If skip is nonzero, skip the first 'skip' characters of each string.
window_size | window size | |
step_size | step size | |
skip | skip |
在文件StringFeatures.h第1341行定义。
bool obtain_from_char | ( | CStringFeatures< char > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
obtain string features from char features
wrapper for template method
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
在文件StringFeatures.h第1448行定义。
bool obtain_from_char_features | ( | CStringFeatures< CT > * | sf, | |
int32_t | start, | |||
int32_t | p_order, | |||
int32_t | gap, | |||
bool | rev | |||
) |
template obtain from char features
sf | string features | |
start | start | |
p_order | order | |
gap | gap | |
rev | reverse |
在文件StringFeatures.h第1463行定义。
virtual bool save | ( | char * | dest | ) | [virtual] |
save features to file
dest | filename to save to |
重载CFeatures。
在文件StringFeatures.h第1141行定义。
virtual bool save_compressed | ( | char * | dest, | |
E_COMPRESSION_TYPE | compression, | |||
int | level | |||
) | [virtual] |
save compressed features to file
dest | filename to save to | |
compression | compressor to use | |
level | compression level to use (1-9) |
在文件StringFeatures.h第1239行定义。
virtual void set_feature_vector | ( | int32_t | num, | |
ST * | string, | |||
int32_t | len | |||
) | [virtual] |
set feature vector for sample num
num | index of feature vector | |
string | string with the feature vector's content | |
len | length of the string |
在文件StringFeatures.h第1710行定义。
void set_feature_vector | ( | ST * | src, | |
int32_t | len, | |||
int32_t | num | |||
) |
set string for selected example num
src | source string to be set for the selected example |
len | number of features in vector | |
num | index of the string |
在文件StringFeatures.h第357行定义。
bool set_features | ( | T_STRING< ST > * | p_features, | |
int32_t | p_num_vectors, | |||
int32_t | p_max_string_length | |||
) |
set features
p_features | new features | |
p_num_vectors | number of vectors | |
p_max_string_length | maximum string length |
在文件StringFeatures.h第1048行定义。
ST shift_offset | ( | ST | offset, | |
int32_t | amount | |||
) |
shift offset to the left by amount
offset | offset to shift | |
amount | amount to shift the offset |
在文件StringFeatures.h第548行定义。
ST shift_symbol | ( | ST | symbol, | |
int32_t | amount | |||
) |
shift symbol to the right by amount (taking care of custom symbol sizes)
symbol | symbol to shift | |
amount | amount to shift the symbol |
在文件StringFeatures.h第560行定义。
void unembed_word | ( | ST | word, | |
uint8_t * | seq, | |||
int32_t | len | |||
) |
remap bit-based word to character sequence
word | word to remap | |
seq | sequence of size len that remapped characters are written to | |
len | length of sequence and word |
在文件StringFeatures.h第1651行定义。
alphabet
在文件StringFeatures.h第1837行定义。
CCache<ST>* feature_cache [protected] |
feature cache
在文件StringFeatures.h第1870行定义。
T_STRING<ST>* features [protected] |
this contains the array of features.
在文件StringFeatures.h第1843行定义。
int32_t length_of_single_string [protected] |
length of prior single string
在文件StringFeatures.h第1849行定义。
int32_t max_string_length [protected] |
length of longest string
在文件StringFeatures.h第1852行定义。
floatmax_t num_symbols [protected] |
number of used symbols
在文件StringFeatures.h第1855行定义。
int32_t num_vectors [protected] |
number of string vectors
在文件StringFeatures.h第1840行定义。
int32_t order [protected] |
order used in higher order mapping
在文件StringFeatures.h第1861行定义。
floatmax_t original_num_symbols [protected] |
original number of used symbols (before higher order mapping)
在文件StringFeatures.h第1858行定义。
bool preprocess_on_get [protected] |
preprocess on-the-fly?
在文件StringFeatures.h第1867行定义。
ST* single_string [protected] |
true when single string / created by sliding window
在文件StringFeatures.h第1846行定义。
ST* symbol_mask_table [protected] |
symbol mask table (cf. compute_symbol_mask_table)
在文件StringFeatures.h第1864行定义。