File.cpp

Go to the documentation of this file.
00001 /*
00002  * This program is free software; you can redistribute it and/or modify
00003  * it under the terms of the GNU General Public License as published by
00004  * the Free Software Foundation; either version 3 of the License, or
00005  * (at your option) any later version.
00006  *
00007  * Written (W) 1999-2009 Soeren Sonnenburg
00008  * Copyright (C) 1999-2009 Fraunhofer Institute FIRST and Max-Planck-Society
00009  */
00010 
00011 #include <string.h>
00012 #include <sys/types.h>
00013 #include <sys/stat.h>
00014 #include <unistd.h>
00015 #include <ctype.h>
00016 
00017 #include "lib/File.h"
00018 #include "lib/SimpleFile.h"
00019 
00020 #include "features/StringFeatures.h"
00021 #include "features/SparseFeatures.h"
00022 
00023 using namespace shogun;
00024 
00025 CFile::CFile(FILE* f)
00026 : CSGObject()
00027 {
00028     file=f;
00029     filename=NULL;
00030     expected_type=F_UNKNOWN;
00031 }
00032 
00033 CFile::CFile(char* fname, char rw, EFeatureType typ, char file_fourcc[4])
00034 : CSGObject()
00035 {
00036     status=false;
00037     task=rw;
00038     expected_type=typ;
00039     filename=strdup(fname);
00040     char mode[2];
00041     mode[0]=rw;
00042     mode[1]='\0';
00043 
00044 
00045     if (rw=='r' || rw == 'w')
00046     {
00047         if (filename)
00048         {
00049             if ((file=fopen((const char*) filename, (const char*) mode)))
00050                 status=true;
00051         }
00052     }
00053     else
00054         SG_ERROR("unknown mode '%c'\n", mode[0]);
00055 
00056     if (file_fourcc)
00057     {
00058         if (rw=='r')
00059             status=read_header();
00060         else if (rw=='w')
00061             status=write_header();
00062 
00063         if (!status)
00064             fclose(file);
00065 
00066         file=NULL;
00067     }
00068 }
00069 
00070 CFile::~CFile()
00071 {
00072     free(filename);
00073     if (file)
00074       fclose(file);
00075     filename=NULL;
00076     file=NULL;
00077 }
00078 
00079 int32_t* CFile::load_int_data(int32_t* target, int64_t& num)
00080 {
00081     ASSERT(expected_type==F_INT);
00082     CSimpleFile<int32_t> f(filename, file);
00083     target=f.load(target, num);
00084     status=(target!=NULL);
00085     return target;
00086 }
00087 
00088 bool CFile::save_int_data(int32_t* src, int64_t num)
00089 {
00090     ASSERT(expected_type==F_INT);
00091     CSimpleFile<int32_t> f(filename, file);
00092     status=f.save(src, num);
00093     return status;
00094 }
00095 
00096 float64_t* CFile::load_real_data(float64_t* target, int64_t& num)
00097 {
00098     ASSERT(expected_type==F_DREAL);
00099     CSimpleFile<float64_t> f(filename, file);
00100     target=f.load(target, num);
00101     status=(target!=NULL);
00102     return target;
00103 }
00104 
00105 float32_t* CFile::load_shortreal_data(float32_t* target, int64_t& num)
00106 {
00107     ASSERT(expected_type==F_SHORTREAL);
00108     CSimpleFile<float32_t> f(filename, file);
00109     target=f.load(target, num);
00110     status=(target!=NULL);
00111     return target;
00112 }
00113 
00114 bool CFile::save_real_data(float64_t* src, int64_t num)
00115 {
00116     ASSERT(expected_type==F_DREAL);
00117     CSimpleFile<float64_t> f(filename, file);
00118     status=f.save(src, num);
00119     return status;
00120 }
00121 
00122 bool CFile::save_shortreal_data(float32_t* src, int64_t num)
00123 {
00124     ASSERT(expected_type==F_SHORTREAL);
00125     CSimpleFile<float32_t> f(filename, file);
00126     status=f.save(src, num);
00127     return status;
00128 }
00129 
00130 char* CFile::load_char_data(char* target, int64_t& num)
00131 {
00132     ASSERT(expected_type==F_CHAR);
00133     CSimpleFile<char> f(filename, file);
00134     target=f.load(target, num);
00135     status=(target!=NULL);
00136     return target;
00137 }
00138 
00139 bool CFile::save_char_data(char* src, int64_t num)
00140 {
00141     ASSERT(expected_type==F_CHAR);
00142     CSimpleFile<char> f(filename, file);
00143     status=f.save(src, num);
00144     return status;
00145 }
00146 
00147 uint8_t* CFile::load_byte_data(uint8_t* target, int64_t& num)
00148 {
00149     ASSERT(expected_type==F_BYTE);
00150     CSimpleFile<uint8_t> f(filename, file);
00151     target=f.load(target, num);
00152     status=(target!=NULL);
00153     return target;
00154 }
00155 
00156 bool CFile::save_byte_data(uint8_t* src, int64_t num)
00157 {
00158     ASSERT(expected_type==F_BYTE);
00159     CSimpleFile<uint8_t> f(filename, file);
00160     status=f.save(src, num);
00161     return status;
00162 }
00163 
00164 uint16_t* CFile::load_word_data(uint16_t* target, int64_t& num)
00165 {
00166     ASSERT(expected_type==F_WORD);
00167     CSimpleFile<uint16_t> f(filename, file);
00168     target=f.load(target, num);
00169     status=(target!=NULL);
00170     return target;
00171 }
00172 
00173 bool CFile::save_word_data(uint16_t* src, int64_t num)
00174 {
00175     ASSERT(expected_type==F_WORD);
00176     CSimpleFile<uint16_t> f(filename, file);
00177     status=f.save(src, num);
00178     return status;
00179 }
00180 
00181 int16_t* CFile::load_short_data(int16_t* target, int64_t& num)
00182 {
00183     ASSERT(expected_type==F_SHORT);
00184     CSimpleFile<int16_t> f(filename, file);
00185     target=f.load(target, num);
00186     status=(target!=NULL);
00187     return target;
00188 }
00189 
00190 bool CFile::save_short_data(int16_t* src, int64_t num)
00191 {
00192     ASSERT(expected_type==F_SHORT);
00193     CSimpleFile<int16_t> f(filename, file);
00194     status=f.save(src, num);
00195     return status;
00196 }
00197 
00198 int32_t CFile::parse_first_header(EFeatureType &type)
00199 {
00200     return -1;
00201 }
00202 
00203 int32_t CFile::parse_next_header(EFeatureType &type)
00204 {
00205     return -1;
00206 }
00207 
00208 
00209 bool CFile::read_header()
00210 {
00211     ASSERT(file);
00212     uint32_t intlen=0;
00213     uint32_t endian=0;
00214     uint32_t file_fourcc=0;
00215     uint32_t doublelen=0;
00216 
00217     if ( (fread(&intlen, sizeof(uint8_t), 1, file)==1) &&
00218             (fread(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00219             (fread(&endian, (uint32_t) intlen, 1, file)== 1) &&
00220             (fread(&file_fourcc, (uint32_t) intlen, 1, file)==1))
00221         return true;
00222     else
00223         return false;
00224 }
00225 
00226 bool CFile::write_header()
00227 {
00228     uint8_t intlen=sizeof(uint32_t);
00229     uint8_t doublelen=sizeof(double);
00230     uint32_t endian=0x12345678;
00231 
00232     if ((fwrite(&intlen, sizeof(uint8_t), 1, file)==1) &&
00233             (fwrite(&doublelen, sizeof(uint8_t), 1, file)==1) &&
00234             (fwrite(&endian, sizeof(uint32_t), 1, file)==1) &&
00235             (fwrite(&fourcc, 4*sizeof(char), 1, file)==1))
00236         return true;
00237     else
00238         return false;
00239 }
00240 
00241 template <class T> void CFile::append_item(
00242     CDynamicArray<T>* items, char* ptr_data, char* ptr_item)
00243 {
00244     size_t len=(ptr_data-ptr_item)/sizeof(char);
00245     char* item=new char[len+1];
00246     memset(item, 0, sizeof(char)*(len+1));
00247     item=strncpy(item, ptr_item, len);
00248 
00249     SG_DEBUG("current %c, len %d, item %s\n", *ptr_data, len, item);
00250     items->append_element(item);
00251 }
00252 
00253 bool CFile::read_real_valued_dense(
00254     float64_t*& matrix, int32_t& num_feat, int32_t& num_vec)
00255 {
00256     ASSERT(expected_type==F_DREAL);
00257 
00258     struct stat stats;
00259     if (stat(filename, &stats)!=0)
00260         SG_ERROR("Could not get file statistics.\n");
00261 
00262     char* data=new char[stats.st_size+1];
00263     memset(data, 0, sizeof(char)*(stats.st_size+1));
00264     size_t nread=fread(data, sizeof(char), stats.st_size, file);
00265     if (nread<=0)
00266         SG_ERROR("Could not read data from %s.\n");
00267 
00268     SG_DEBUG("data read from file:\n%s\n", data);
00269 
00270     // determine num_feat and num_vec, populate dynamic array
00271     int32_t nf=0;
00272     num_feat=0;
00273     num_vec=0;
00274     char* ptr_item=NULL;
00275     char* ptr_data=data;
00276     CDynamicArray<char*>* items=new CDynamicArray<char*>();
00277 
00278     while (*ptr_data)
00279     {
00280         if (*ptr_data=='\n')
00281         {
00282             if (ptr_item)
00283                 nf++;
00284 
00285             if (num_feat!=0 && nf!=num_feat)
00286                 SG_ERROR("Number of features mismatches (%d != %d) in vector %d in file %s.\n", num_feat, nf, num_vec, filename);
00287 
00288             append_item(items, ptr_data, ptr_item);
00289             num_feat=nf;
00290             num_vec++;
00291             nf=0;
00292             ptr_item=NULL;
00293         }
00294         else if (!isblank(*ptr_data) && !ptr_item)
00295         {
00296             ptr_item=ptr_data;
00297         }
00298         else if (isblank(*ptr_data) && ptr_item)
00299         {
00300             append_item(items, ptr_data, ptr_item);
00301             ptr_item=NULL;
00302             nf++;
00303         }
00304 
00305         ptr_data++;
00306     }
00307 
00308     SG_DEBUG("num feat: %d, num_vec %d\n", num_feat, num_vec);
00309     delete[] data;
00310 
00311     // now copy data into matrix
00312     matrix=new float64_t[num_vec*num_feat];
00313     for (int32_t i=0; i<num_vec; i++)
00314     {
00315         for (int32_t j=0; j<num_feat; j++)
00316         {
00317             char* item=items->get_element(i*num_feat+j);
00318             matrix[i*num_feat+j]=atof(item);
00319             delete[] item;
00320         }
00321     }
00322     delete items;
00323 
00324     //CMath::display_matrix(matrix, num_feat, num_vec);
00325     return true;
00326 }
00327 
00328 bool CFile::write_real_valued_dense(
00329     const float64_t* matrix, int32_t num_feat, int32_t num_vec)
00330 {
00331     if (!(file && matrix))
00332         SG_ERROR("File or matrix invalid.\n");
00333 
00334     for (int32_t i=0; i<num_feat; i++)
00335     {
00336         for (int32_t j=0; j<num_vec; j++)
00337         {
00338             float64_t v=matrix[num_feat*j+i];
00339             if (j==num_vec-1)
00340                 fprintf(file, "%f\n", v);
00341             else
00342                 fprintf(file, "%f ", v);
00343         }
00344     }
00345 
00346     return true;
00347 }
00348 
00349 bool CFile::read_real_valued_sparse(
00350     TSparse<float64_t>*& matrix, int32_t& num_feat, int32_t& num_vec)
00351 {
00352     size_t blocksize=1024*1024;
00353     size_t required_blocksize=blocksize;
00354     uint8_t* dummy=new uint8_t[blocksize];
00355 
00356     if (file)
00357     {
00358         num_vec=0;
00359         num_feat=0;
00360 
00361         SG_INFO("counting line numbers in file %s\n", filename);
00362         size_t sz=blocksize;
00363         size_t block_offs=0;
00364         size_t old_block_offs=0;
00365         fseek(file, 0, SEEK_END);
00366         size_t fsize=ftell(file);
00367         rewind(file);
00368 
00369         while (sz == blocksize)
00370         {
00371             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00372             bool contains_cr=false;
00373             for (size_t i=0; i<sz; i++)
00374             {
00375                 block_offs++;
00376                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00377                 {
00378                     num_vec++;
00379                     contains_cr=true;
00380                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs+1);
00381                     old_block_offs=block_offs;
00382                 }
00383             }
00384             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00385         }
00386 
00387         SG_INFO("found %d feature vectors\n", num_vec);
00388         delete[] dummy;
00389         blocksize=required_blocksize;
00390         dummy = new uint8_t[blocksize+1]; //allow setting of '\0' at EOL
00391         matrix=new TSparse<float64_t>[num_vec];
00392 
00393         rewind(file);
00394         sz=blocksize;
00395         int32_t lines=0;
00396         while (sz == blocksize)
00397         {
00398             sz=fread(dummy, sizeof(uint8_t), blocksize, file);
00399 
00400             size_t old_sz=0;
00401             for (size_t i=0; i<sz; i++)
00402             {
00403                 if (i==sz-1 && dummy[i]!='\n' && sz==blocksize)
00404                 {
00405                     size_t len=i-old_sz+1;
00406                     uint8_t* data=&dummy[old_sz];
00407 
00408                     for (size_t j=0; j<len; j++)
00409                         dummy[j]=data[j];
00410 
00411                     sz=fread(dummy+len, sizeof(uint8_t), blocksize-len, file);
00412                     i=0;
00413                     old_sz=0;
00414                     sz+=len;
00415                 }
00416 
00417                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00418                 {
00419 
00420                     size_t len=i-old_sz;
00421                     uint8_t* data=&dummy[old_sz];
00422 
00423                     int32_t dims=0;
00424                     for (size_t j=0; j<len; j++)
00425                     {
00426                         if (data[j]==':')
00427                             dims++;
00428                     }
00429 
00430                     if (dims<=0)
00431                     {
00432                         SG_ERROR("Error in line %d - number of"
00433                                 " dimensions is %d line is %d characters"
00434                                 " long\n line_content:'%.*s'\n", lines,
00435                                 dims, len, len, (const char*) data);
00436                     }
00437 
00438                     TSparseEntry<float64_t>* feat=new TSparseEntry<float64_t>[dims];
00439 
00440                     //skip label part
00441                     size_t j=0;
00442                     for (; j<len; j++)
00443                     {
00444                         if (data[j]==':')
00445                         {
00446                             j=-1; //file without label
00447                             break;
00448                         }
00449 
00450                         if (data[j]==' ')
00451                         {
00452                             data[j]='\0';
00453 
00454                             //skip label part
00455                             break;
00456                         }
00457                     }
00458 
00459                     int32_t d=0;
00460                     j++;
00461                     uint8_t* start=&data[j];
00462                     for (; j<len; j++)
00463                     {
00464                         if (data[j]==':')
00465                         {
00466                             data[j]='\0';
00467 
00468                             feat[d].feat_index=(int32_t) atoi((const char*) start)-1;
00469                             num_feat=CMath::max(num_feat, feat[d].feat_index+1);
00470 
00471                             j++;
00472                             start=&data[j];
00473                             for (; j<len; j++)
00474                             {
00475                                 if (data[j]==' ' || data[j]=='\n')
00476                                 {
00477                                     data[j]='\0';
00478                                     feat[d].entry=(float64_t) atof((const char*) start);
00479                                     d++;
00480                                     break;
00481                                 }
00482                             }
00483 
00484                             if (j==len)
00485                             {
00486                                 data[j]='\0';
00487                                 feat[dims-1].entry=(float64_t) atof((const char*) start);
00488                             }
00489 
00490                             j++;
00491                             start=&data[j];
00492                         }
00493                     }
00494 
00495                     matrix[lines].vec_index=lines;
00496                     matrix[lines].num_feat_entries=dims;
00497                     matrix[lines].features=feat;
00498 
00499                     old_sz=i+1;
00500                     lines++;
00501                     SG_PROGRESS(lines, 0, num_vec, 1, "LOADING:\t");
00502                 }
00503             }
00504         }
00505 
00506         SG_INFO("file successfully read\n");
00507     }
00508 
00509     delete[] dummy;
00510     return true;
00511 }
00512 
00513 bool CFile::write_real_valued_sparse(
00514     const TSparse<float64_t>* matrix, int32_t num_feat, int32_t num_vec)
00515 {
00516     if (!(file && matrix))
00517         SG_ERROR("File or matrix invalid.\n");
00518 
00519     for (int32_t i=0; i<num_vec; i++)
00520     {
00521         TSparseEntry<float64_t>* vec = matrix[i].features;
00522         int32_t len=matrix[i].num_feat_entries;
00523 
00524         for (int32_t j=0; j<len; j++)
00525         {
00526             if (j<len-1)
00527                 fprintf(file, "%d:%f ", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00528             else
00529                 fprintf(file, "%d:%f\n", (int32_t) vec[j].feat_index+1, (double) vec[j].entry);
00530         }
00531     }
00532 
00533     return true;
00534 }
00535 
00536 
00537 bool CFile::read_char_valued_strings(
00538     T_STRING<char>*& strings, int32_t& num_str, int32_t& max_string_len)
00539 {
00540     bool result=false;
00541 
00542     size_t blocksize=1024*1024;
00543     size_t required_blocksize=0;
00544     char* dummy=new char[blocksize];
00545     char* overflow=NULL;
00546     int32_t overflow_len=0;
00547 
00548     if (file)
00549     {
00550         num_str=0;
00551         max_string_len=0;
00552 
00553         SG_INFO("counting line numbers in file %s\n", filename);
00554         size_t sz=blocksize;
00555         size_t block_offs=0;
00556         size_t old_block_offs=0;
00557         fseek(file, 0, SEEK_END);
00558         size_t fsize=ftell(file);
00559         rewind(file);
00560 
00561         while (sz == blocksize)
00562         {
00563             sz=fread(dummy, sizeof(char), blocksize, file);
00564             bool contains_cr=false;
00565             for (size_t i=0; i<sz; i++)
00566             {
00567                 block_offs++;
00568                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00569                 {
00570                     num_str++;
00571                     contains_cr=true;
00572                     required_blocksize=CMath::max(required_blocksize, block_offs-old_block_offs);
00573                     old_block_offs=block_offs;
00574                 }
00575             }
00576             SG_PROGRESS(block_offs, 0, fsize, 1, "COUNTING:\t");
00577         }
00578 
00579         SG_INFO("found %d strings\n", num_str);
00580         SG_DEBUG("block_size=%d\n", required_blocksize);
00581         delete[] dummy;
00582         blocksize=required_blocksize;
00583         dummy=new char[blocksize];
00584         overflow=new char[blocksize];
00585         strings=new T_STRING<char>[num_str];
00586 
00587         rewind(file);
00588         sz=blocksize;
00589         int32_t lines=0;
00590         size_t old_sz=0;
00591         while (sz == blocksize)
00592         {
00593             sz=fread(dummy, sizeof(char), blocksize, file);
00594 
00595             old_sz=0;
00596             for (size_t i=0; i<sz; i++)
00597             {
00598                 if (dummy[i]=='\n' || (i==sz-1 && sz<blocksize))
00599                 {
00600                     int32_t len=i-old_sz;
00601                     max_string_len=CMath::max(max_string_len, len+overflow_len);
00602 
00603                     strings[lines].length=len+overflow_len;
00604                     strings[lines].string=new char[len+overflow_len];
00605 
00606                     for (int32_t j=0; j<overflow_len; j++)
00607                         strings[lines].string[j]=overflow[j];
00608                     for (int32_t j=0; j<len; j++)
00609                         strings[lines].string[j+overflow_len]=dummy[old_sz+j];
00610 
00611                     // clear overflow
00612                     overflow_len=0;
00613 
00614                     //CMath::display_vector(strings[lines].string, len);
00615                     old_sz=i+1;
00616                     lines++;
00617                     SG_PROGRESS(lines, 0, num_str, 1, "LOADING:\t");
00618                 }
00619             }
00620 
00621             for (size_t i=old_sz; i<sz; i++)
00622                 overflow[i-old_sz]=dummy[i];
00623 
00624             overflow_len=sz-old_sz;
00625         }
00626         result=true;
00627         SG_INFO("file successfully read\n");
00628         SG_INFO("max_string_length=%d\n", max_string_len);
00629         SG_INFO("num_strings=%d\n", num_str);
00630     }
00631 
00632     delete[] dummy;
00633     delete[] overflow;
00634 
00635     return result;
00636 }
00637 
00638 bool CFile::write_char_valued_strings(
00639     const T_STRING<char>* strings, int32_t num_str)
00640 {
00641     if (!(file && strings))
00642         SG_ERROR("File or strings invalid.\n");
00643 
00644     for (int32_t i=0; i<num_str; i++)
00645     {
00646         int32_t len = strings[i].length;
00647         fwrite(strings[i].string, sizeof(char), len, file);
00648         fprintf(file, "\n");
00649     }
00650 
00651     return true;
00652 }
00653 
00654 

SHOGUN Machine Learning Toolbox - Documentation