// SHOGUN v0.9.0
00001 /* 00002 * This program is free software; you can redistribute it and/or modify 00003 * it under the terms of the GNU General Public License as published by 00004 * the Free Software Foundation; either version 3 of the License, or 00005 * (at your option) any later version. 00006 * 00007 * Written (W) 2009 Soeren Sonnenburg 00008 * Copyright (C) 2009 Berlin Institute of Technology 00009 */ 00010 00011 #ifndef _CSTRINGFILEFEATURES__H__ 00012 #define _CSTRINGFILEFEATURES__H__ 00013 00014 #include "features/StringFeatures.h" 00015 #include "features/Alphabet.h" 00016 #include "lib/MemoryMappedFile.h" 00017 #include "lib/Mathematics.h" 00018 #include "lib/io.h" 00019 00020 namespace shogun 00021 { 00022 class CAlphabet; 00023 template <class T> class CMemoryMappedFile; 00024 00034 template <class ST> class CStringFileFeatures : public CStringFeatures<ST> 00035 { 00036 public: 00037 00041 CStringFileFeatures() : CStringFeatures<ST>(), file(NULL) 00042 { 00043 } 00044 00050 CStringFileFeatures(const char* fname, EAlphabet alpha) 00051 : CStringFeatures<ST>(alpha) 00052 { 00053 file = new CMemoryMappedFile<ST>(fname); 00054 fetch_meta_info_from_file(); 00055 } 00056 00060 virtual ~CStringFileFeatures() 00061 { 00062 SG_UNREF(file); 00063 CStringFileFeatures<ST>::cleanup(); 00064 } 00065 00066 protected: 00081 ST* get_line(uint64_t& len, uint64_t& offs, int32_t& line_nr, uint64_t file_length) 00082 { 00083 ST* s = file->get_map(); 00084 for (uint64_t i=offs; i<file_length; i++) 00085 { 00086 ST c=s[i]; 00087 00088 if (c == '\n') 00089 { 00090 ST* line=&s[offs]; 00091 len=i-offs; 00092 offs=i+1; 00093 line_nr++; 00094 return line; 00095 } 00096 else 00097 { 00098 if (!CStringFeatures<ST>::alphabet->is_valid((uint8_t) c)) 00099 { 00100 CStringFileFeatures<ST>::cleanup(); 00101 CStringFeatures<ST>::SG_ERROR("Invalid character (%c) in line %d\n", c, line_nr); 00102 } 00103 } 00104 } 00105 00106 len=0; 00107 offs=file_length; 00108 return NULL; 00109 } 00110 00112 virtual 
void cleanup() 00113 { 00114 CStringFeatures<ST>::num_vectors=0; 00115 delete[] CStringFeatures<ST>::features; 00116 delete[] CStringFeatures<ST>::symbol_mask_table; 00117 CStringFeatures<ST>::features=NULL; 00118 CStringFeatures<ST>::symbol_mask_table=NULL; 00119 00120 /* start with a fresh alphabet, but instead of emptying the histogram 00121 * create a new object (to leave the alphabet object alone if it is used 00122 * by others) 00123 */ 00124 CAlphabet* alpha=new CAlphabet(CStringFeatures<ST>::alphabet->get_alphabet()); 00125 SG_UNREF(CStringFeatures<ST>::alphabet); 00126 CStringFeatures<ST>::alphabet=alpha; 00127 SG_REF(CStringFeatures<ST>::alphabet); 00128 } 00129 00131 virtual void cleanup_feature_vector(int32_t num) 00132 { 00133 CStringFeatures<ST>::SG_ERROR("Cleaning single feature vector not" 00134 "supported by StringFileFeatures\n"); 00135 } 00136 00141 void fetch_meta_info_from_file(int32_t granularity=1048576) 00142 { 00143 CStringFileFeatures<ST>::cleanup(); 00144 uint64_t file_size=file->get_size(); 00145 ASSERT(granularity>=1); 00146 ASSERT(CStringFeatures<ST>::alphabet); 00147 00148 uint64_t buffer_size=granularity; 00149 CStringFeatures<ST>::features=new TString<ST>[buffer_size]; 00150 00151 uint64_t offs=0; 00152 uint64_t len=0; 00153 CStringFeatures<ST>::max_string_length=0; 00154 CStringFeatures<ST>::num_vectors=0; 00155 00156 while (true) 00157 { 00158 ST* line=get_line(len, offs, CStringFeatures<ST>::num_vectors, file_size); 00159 00160 if (line) 00161 { 00162 if (CStringFeatures<ST>::num_vectors>buffer_size) 00163 { 00164 CMath::resize(CStringFeatures<ST>::features, buffer_size, buffer_size+granularity); 00165 buffer_size+=granularity; 00166 } 00167 00168 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].string=line; 00169 CStringFeatures<ST>::features[CStringFeatures<ST>::num_vectors-1].length=len; 00170 CStringFeatures<ST>::max_string_length=CMath::max(CStringFeatures<ST>::max_string_length, (int32_t) len); 00171 } 
00172 else 00173 break; 00174 } 00175 00176 CStringFeatures<ST>::SG_INFO("number of strings:%d\n", CStringFeatures<ST>::num_vectors); 00177 CStringFeatures<ST>::SG_INFO("maximum string length:%d\n", CStringFeatures<ST>::max_string_length); 00178 CStringFeatures<ST>::SG_INFO("max_value_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_max_value_in_histogram()); 00179 CStringFeatures<ST>::SG_INFO("num_symbols_in_histogram:%d\n", CStringFeatures<ST>::alphabet->get_num_symbols_in_histogram()); 00180 00181 if (!CStringFeatures<ST>::alphabet->check_alphabet_size() || !CStringFeatures<ST>::alphabet->check_alphabet()) 00182 CStringFileFeatures<ST>::cleanup(); 00183 00184 CMath::resize(CStringFeatures<ST>::features, buffer_size, CStringFeatures<ST>::num_vectors); 00185 } 00186 00187 00188 protected: 00190 CMemoryMappedFile<ST>* file; 00191 }; 00192 } 00193 #endif // _CSTRINGFILEFEATURES__H__