00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011 #include <string.h>
00012 #include <sys/types.h>
00013 #include <sys/stat.h>
00014 #include <unistd.h>
00015 #include <ctype.h>
00016
00017 #include "lib/File.h"
00018 #include "lib/SimpleFile.h"
00019
00020 #include "features/StringFeatures.h"
00021 #include "features/SparseFeatures.h"
00022
00023 using namespace shogun;
00024
00025 CFile::CFile(FILE* f)
00026 : CSGObject()
00027 {
00028 file=f;
00029 filename=NULL;
00030 expected_type=F_UNKNOWN;
00031 }
00032
00033 CFile::CFile(char* fname, char rw, EFeatureType typ, char file_fourcc[4])
00034 : CSGObject()
00035 {
00036 status=false;
00037 task=rw;
00038 expected_type=typ;
00039 filename=strdup(fname);
00040 char mode[2];
00041 mode[0]=rw;
00042 mode[1]='\0';
00043
00044
00045 if (rw=='r' || rw == 'w')
00046 {
00047 if (filename)
00048 {
00049 if ((file=fopen((const char*) filename, (const char*) mode)))
00050 status=true;
00051 }
00052 }
00053 else
00054 SG_ERROR("unknown mode '%c'\n", mode[0]);
00055
00056 if (file_fourcc)
00057 {
00058 if (rw=='r')
00059 status=read_header();
00060 else if (rw=='w')
00061 status=write_header();
00062
00063 if (!status)
00064 fclose(file);
00065
00066 file=NULL;
00067 }
00068 }
00069
00070 CFile::~CFile()
00071 {
00072 free(filename);
00073 if (file)
00074 fclose(file);
00075 filename=NULL;
00076 file=NULL;
00077 }
00078
00079 int32_t* CFile::load_int_data(int32_t* target, int64_t& num)
00080 {
00081 ASSERT(expected_type==F_INT);
00082 CSimpleFile<int32_t> f(filename, file);
00083 target=f.load(target, num);
00084 status=(target!=NULL);
00085 return target;
00086 }
00087
00088 bool CFile::save_int_data(int32_t* src, int64_t num)
00089 {
00090 ASSERT(expected_type==F_INT);
00091 CSimpleFile<int32_t> f(filename, file);
00092 status=f.save(src, num);
00093 return status;
00094 }
00095
00096 float64_t* CFile::load_real_data(float64_t* target, int64_t& num)
00097 {
00098 ASSERT(expected_type==F_DREAL);
00099 CSimpleFile<float64_t> f(filename, file);
00100 target=f.load(target, num);
00101 status=(target!=NULL);
00102 return target;
00103 }
00104
00105 float32_t* CFile::load_shortreal_data(float32_t* target, int64_t& num)
00106 {
00107 ASSERT(expected_type==F_SHORTREAL);
00108 CSimpleFile<float32_t> f(filename, file);
00109 target=f.load(target, num);
00110 status=(target!=NULL);
00111 return target;
00112 }
00113
00114 bool CFile::save_real_data(float64_t* src, int64_t num)
00115 {
00116 ASSERT(expected_type==F_DREAL);
00117 CSimpleFile<float64_t> f(filename, file);
00118 status=f.save(src, num);
00119 return status;
00120 }
00121
00122 bool CFile::save_shortreal_data(float32_t* src, int64_t num)
00123 {
00124 ASSERT(expected_type==F_SHORTREAL);
00125 CSimpleFile<float32_t> f(filename, file);
00126 status=f.save(src, num);
00127 return status;
00128 }
00129
00130 char* CFile::load_char_data(char* target, int64_t& num)
00131 {
00132 ASSERT(expected_type==F_CHAR);
00133 CSimpleFile<char> f(filename, file);
00134 target=f.load(target, num);
00135 status=(target!=NULL);
00136 return target;
00137 }
00138
00139 bool CFile::save_char_data(char* src, int64_t num)
00140 {
00141 ASSERT(expected_type==F_CHAR);
00142 CSimpleFile<char> f(filename, file);
00143 status=f.save(src, num);
00144 return status;
00145 }
00146
00147 uint8_t* CFile::load_byte_data(uint8_t* target, int64_t& num)
00148 {
00149 ASSERT(expected_type==F_BYTE);
00150 CSimpleFile<uint8_t> f(filename, file);
00151 target=f.load(target, num);
00152 status=(target!=NULL);
00153 return target;
00154 }
00155
00156 bool CFile::save_byte_data(uint8_t* src, int64_t num)
00157 {
00158 ASSERT(expected_type==F_BYTE);
00159 CSimpleFile<uint8_t> f(filename, file);
00160 status=f.save(src, num);
00161 return status;
00162 }
00163
00164 uint16_t* CFile::load_word_data(uint16_t* target, int64_t& num)
00165 {
00166 ASSERT(expected_type==F_WORD);
00167 CSimpleFile<uint16_t> f(filename, file);
00168 target=f.load(target, num);
00169 status=(target!=NULL);
00170 return target;
00171 }
00172
00173 bool CFile::save_word_data(uint16_t* src, int64_t num)
00174 {
00175 ASSERT(expected_type==F_WORD);
00176 CSimpleFile<uint16_t> f(filename, file);
00177 status=f.save(src, num);
00178 return status;
00179 }
00180
00181 int16_t* CFile::load_short_data(int16_t* target, int64_t& num)
00182 {
00183 ASSERT(expected_type==F_SHORT);
00184 CSimpleFile<int16_t> f(filename, file);
00185 target=f.load(target, num);
00186 status=(target!=NULL);
00187 return target;
00188 }
00189
00190 bool CFile::save_short_data(int16_t* src, int64_t num)
00191 {
00192 ASSERT(expected_type==F_SHORT);
00193 CSimpleFile<int16_t> f(filename, file);
00194 status=f.save(src, num);
00195 return status;
00196 }
00197
00198 int32_t CFile::parse_first_header(EFeatureType &type)
00199 {
00200 return -1;
00201 }
00202
00203 int32_t CFile::parse_next_header(EFeatureType &type)
00204 {
00205 return -1;
00206 }
00207
00208
00209 bool CFile::read_header()
00210 {
00211 ASSERT(file);
00212 uint32_t intlen=0;
00213 uint32_t endian=0;
00214 uint32_t file_fourcc=0;
00215 uint32_t doublelen=0;
00216
00217 if ( (fread(&intlen, sizeof(uint8_t), 1, file)==1) &&
00218 (fread(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00219 (fread(&endian, (uint32_t) intlen, 1, file)== 1) &&
00220 (fread(&file_fourcc, (uint32_t) intlen, 1, file)==1))
00221 return true;
00222 else
00223 return false;
00224 }
00225
00226 bool CFile::write_header()
00227 {
00228 uint8_t intlen=sizeof(uint32_t);
00229 uint8_t doublelen=sizeof(double);
00230 uint32_t endian=0x12345678;
00231
00232 if ((fwrite(&intlen, sizeof(uint8_t), 1, file)==1) &&
00233 (fwrite(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00234 (fwrite(&endian, sizeof(uint32_t), 1, file)==1) &&
00235 (fwrite(&fourcc, 4*sizeof(char), 1, file)==1))
00236 return true;
00237 else
00238 return false;
00239 }
00240
00241 template <class T> void CFile::append_item(
00242 CDynamicArray<T>* items, char* ptr_data, char* ptr_item)
00243 {
00244 size_t len=(ptr_data-ptr_item)/sizeof(char);
00245 char* item=new char[len+1];
00246 memset(item, 0, sizeof(char)*(len+1));
00247 item=strncpy(item, ptr_item, len);
00248
00249 SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00250 items->append_element(item);
00251 }
00252
00253 bool CFile::read_real_valued_dense(
00254 float64_t*& matrix, int32_t& num_feat, int32_t& num_vec)
00255 {
00256 ASSERT(expected_type==F_DREAL);
00257
00258 struct stat stats;
00259 if (stat(filename, &stats)!=0)
00260 SG_ERROR("Could not get file statistics.\n");
00261
00262 char* data=new char[stats.st_size+1];
00263 memset(data, 0, sizeof(char)*(stats.st_size+1));
00264 size_t nread=fread(data, sizeof(char), stats.st_size, file);
00265 if (nread<=0)
00266 SG_ERROR("Could not read data from %s.\n");
00267
00268 SG_DEBUG("data read from file:\n%s\n", data);
00269
00270
00271 int32_t nf=0;
00272 num_feat=0;
00273 num_vec=0;
00274 char* ptr_item=NULL;
00275 char* ptr_data=data;
00276 CDynamicArray<char*>* items=new CDynamicArray<char*>();
00277
00278 while (*ptr_data)
00279 {
00280 if (*ptr_data=='\n')
00281 {
00282 if (ptr_item)
00283 nf++;
00284
00285 if (num_feat!=0 && nf!=num_feat)
00286 SG_ERROR("Number of features mismatches (%d != %d) in vector %d in file %s.\n", num_feat, nf, num_vec, filename);
00287
00288 append_item(items, ptr_data, ptr_item);
00289 num_feat=nf;
00290 num_vec++;
00291 nf=0;
00292 ptr_item=NULL;
00293 }
00294 else if (!isblank(*ptr_data) && !ptr_item)
00295 {
00296 ptr_item=ptr_data;
00297 }
00298 else if (isblank(*ptr_data) && ptr_item)
00299 {
00300 append_item(items, ptr_data, ptr_item);
00301 ptr_item=NULL;
00302 nf++;
00303 }
00304
00305 ptr_data++;
00306 }
00307
00308 SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);
00309 delete[] data;
00310
00311
00312 matrix=new float64_t[num_vec*num_feat];
00313 for (int32_t i=0; i<num_vec; i++)
00314 {
00315 for (int32_t j=0; j<num_feat; j++)
00316 {
00317 char* item=items->get_element(i*num_feat+j);
00318 matrix[i*num_feat+j]=atof(item);
00319 delete[] item;
00320 }
00321 }
00322 delete items;
00323
00324
00325 return true;
00326 }
00327
00328 bool CFile::write_real_valued_dense(
00329 const float64_t* matrix, int32_t num_feat, int32_t num_vec)
00330 {
00331 if (!(file && matrix))
00332 SG_ERROR("File or matrix invalid.\n");
00333
00334 for (int32_t i=0; i<num_feat; i++)
00335 {
00336 for (int32_t j=0; j<num_vec; j++)
00337 {
00338 float64_t v=matrix[num_feat*j+i];
00339 if (j==num_vec-1)
00340 fprintf(file, "%f\n", v);
00341 else
00342 fprintf(file, "%f ", v);
00343 }
00344 }
00345
00346 return true;
00347 }
00348
00349 bool CFile::read_real_valued_sparse(
00350 TSparse<float64_t>*& matrix, int32_t& num_feat, int32_t& num_vec)
00351 {
00352 size_t blocksize=1024*1024;
00353 size_t required_blocksize=blocksize;
00354 uint8_t* dummy=new uint8_t[blocksize];
00355
00356 if (file)
00357 {
00358 num_vec=0;
00359 num_feat=0;
00360
00361 SG_INFO("counting line numbers in file %s\n", filename);
00362 size_t sz=blocksize;
00363 size_t block_offs=0;
00364 size_t old_block_offs=0;
00365 fseek(file, 0, SEEK_END);
00366 size_t fsize=ftell(file);
00367 rewind(file);
00368
00369 while (sz == blocksize)
00370 {
00371 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00372 bool contains_cr=false;
00373 for (size_t i=0; i<sz; i++)
00374 {
00375 block_offs++;
00376 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00377 {
00378 num_vec++;
00379 contains_cr=true;
00380 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00381 old_block_offs=block_offs;
00382 }
00383 }
00384 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00385 }
00386
00387 SG_INFO("found %d feature vectors\n", num_vec);
00388 delete[] dummy;
00389 blocksize=required_blocksize;
00390 dummy = new uint8_t[blocksize+1];
00391 matrix=new TSparse<float64_t>[num_vec];
00392
00393 rewind(file);
00394 sz=blocksize;
00395 int32_t lines=0;
00396 while (sz == blocksize)
00397 {
00398 sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00399
00400 size_t old_sz=0;
00401 for (size_t i=0; i<sz; i++)
00402 {
00403 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00404 {
00405 size_t len=i-old_sz+1;
00406 uint8_t* data=&dummy[old_sz];
00407
00408 for (size_t j=0; j<len; j++)
00409 dummy[j]=data[j];
00410
00411 sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file);
00412 i=0;
00413 old_sz=0;
00414 sz+=len;
00415 }
00416
00417 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00418 {
00419
00420 size_t len=i-old_sz;
00421 uint8_t* data=&dummy[old_sz];
00422
00423 int32_t dims=0;
00424 for (size_t j=0; j<len; j++)
00425 {
00426 if (data[j]==':')
00427 dims++;
00428 }
00429
00430 if (dims<=0)
00431 {
00432 SG_ERROR("Error in line %d - number of"
00433 " dimensions is %d line is %d characters"
00434 " long\n line_content:'%.*s'\n", lines,
00435 dims, len, len, (const char*) data);
00436 }
00437
00438 TSparseEntry<float64_t>* feat=new TSparseEntry<float64_t>[dims];
00439
00440
00441 size_t j=0;
00442 for (; j<len; j++)
00443 {
00444 if (data[j]==':')
00445 {
00446 j=-1;
00447 break;
00448 }
00449
00450 if (data[j]==' ')
00451 {
00452 data[j]='\0';
00453
00454
00455 break;
00456 }
00457 }
00458
00459 int32_t d=0;
00460 j++;
00461 uint8_t* start=&data[j];
00462 for (; j<len; j++)
00463 {
00464 if (data[j]==':')
00465 {
00466 data[j]='\0';
00467
00468 feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00469 num_feat=CMath::max(num_feat, feat[d].feat_index+1);
00470
00471 j++;
00472 start=&data[j];
00473 for (; j<len; j++)
00474 {
00475 if (data[j]==' ' || data[j]=='\n')
00476 {
00477 data[j]='\0';
00478 feat[d].entry=(float64_t) atof((const char*) start);
00479 d++;
00480 break;
00481 }
00482 }
00483
00484 if (j==len)
00485 {
00486 data[j]='\0';
00487 feat[dims-1].entry=(float64_t) atof((const char*) start);
00488 }
00489
00490 j++;
00491 start=&data[j];
00492 }
00493 }
00494
00495 matrix[lines].vec_index=lines;
00496 matrix[lines].num_feat_entries=dims;
00497 matrix[lines].features=feat;
00498
00499 old_sz=i+1;
00500 lines++;
00501 SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");
00502 }
00503 }
00504 }
00505
00506 SG_INFO("file successfully read\n");
00507 }
00508
00509 delete[] dummy;
00510 return true;
00511 }
00512
00513 bool CFile::write_real_valued_sparse(
00514 const TSparse<float64_t>* matrix, int32_t num_feat, int32_t num_vec)
00515 {
00516 if (!(file && matrix))
00517 SG_ERROR("File or matrix invalid.\n");
00518
00519 for (int32_t i=0; i<num_vec; i++)
00520 {
00521 TSparseEntry<float64_t>* vec = matrix[i].features;
00522 int32_t len=matrix[i].num_feat_entries;
00523
00524 for (int32_t j=0; j<len; j++)
00525 {
00526 if (j<len-1)
00527 fprintf(file, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00528 else
00529 fprintf(file, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00530 }
00531 }
00532
00533 return true;
00534 }
00535
00536
00537 bool CFile::read_char_valued_strings(
00538 T_STRING<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00539 {
00540 bool result=false;
00541
00542 size_t blocksize=1024*1024;
00543 size_t required_blocksize=0;
00544 char* dummy=new char[blocksize];
00545 char* overflow=NULL;
00546 int32_t overflow_len=0;
00547
00548 if (file)
00549 {
00550 num_str=0;
00551 max_string_len=0;
00552
00553 SG_INFO("counting line numbers in file %s\n", filename);
00554 size_t sz=blocksize;
00555 size_t block_offs=0;
00556 size_t old_block_offs=0;
00557 fseek(file, 0, SEEK_END);
00558 size_t fsize=ftell(file);
00559 rewind(file);
00560
00561 while (sz == blocksize)
00562 {
00563 sz=fread(dummy, sizeof(char), blocksize, file);
00564 bool contains_cr=false;
00565 for (size_t i=0; i<sz; i++)
00566 {
00567 block_offs++;
00568 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00569 {
00570 num_str++;
00571 contains_cr=true;
00572 required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00573 old_block_offs=block_offs;
00574 }
00575 }
00576 SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00577 }
00578
00579 SG_INFO("found %d strings\n", num_str);
00580 SG_DEBUG("block_size=%d\n", required_blocksize);
00581 delete[] dummy;
00582 blocksize=required_blocksize;
00583 dummy=new char[blocksize];
00584 overflow=new char[blocksize];
00585 strings=new T_STRING<char>[num_str];
00586
00587 rewind(file);
00588 sz=blocksize;
00589 int32_t lines=0;
00590 size_t old_sz=0;
00591 while (sz == blocksize)
00592 {
00593 sz=fread(dummy, sizeof(char), blocksize, file);
00594
00595 old_sz=0;
00596 for (size_t i=0; i<sz; i++)
00597 {
00598 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00599 {
00600 int32_t len=i-old_sz;
00601 max_string_len=CMath::max(max_string_len, len+overflow_len);
00602
00603 strings[lines].length=len+overflow_len;
00604 strings[lines].string=new char[len+overflow_len];
00605
00606 for (int32_t j=0; j<overflow_len; j++)
00607 strings[lines].string[j]=overflow[j];
00608 for (int32_t j=0; j<len; j++)
00609 strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00610
00611
00612 overflow_len=0;
00613
00614
00615 old_sz=i+1;
00616 lines++;
00617 SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00618 }
00619 }
00620
00621 for (size_t i=old_sz; i<sz; i++)
00622 overflow[i-old_sz]=dummy[i];
00623
00624 overflow_len=sz-old_sz;
00625 }
00626 result=true;
00627 SG_INFO("file successfully read\n");
00628 SG_INFO("max_string_length=%d\n", max_string_len);
00629 SG_INFO("num_strings=%d\n", num_str);
00630 }
00631
00632 delete[] dummy;
00633 delete[] overflow;
00634
00635 return result;
00636 }
00637
00638 bool CFile::write_char_valued_strings(
00639 const T_STRING<char>* strings, int32_t num_str)
00640 {
00641 if (!(file && strings))
00642 SG_ERROR("File or strings invalid.\n");
00643
00644 for (int32_t i=0; i<num_str; i++)
00645 {
00646 int32_t len = strings[i].length;
00647 fwrite(strings[i].string, sizeof(char), len, file);
00648 fprintf(file, "\n");
00649 }
00650
00651 return true;
00652 }
00653
00654