/*
  :call-seq:
    Dvector.fast_fancy_read(stream, options) => Array_of_Dvectors

  Reads data from an IO stream (or anything that supports a +gets+
  method) and separates it into columns of data according to the
  _options_, a hash holding the following elements (compulsory, but
  you can use FANCY_READ_DEFAULTS):

  * 'sep': a regular expression that separates the entries
  * 'comments': any line matching this will be skipped
  * 'skip_first': skips that many lines before reading anything
  * 'index_col': if true, the first column returned contains the
    number of the line read. *This option is currently ignored.*
  * 'remove_space': whether to remove spaces at the beginning of a
    line before matching it against 'comments' and 'sep'.
  * 'comment_out': this should be an array into which the comments
    will be dumped one by one.
  * 'default': what to put when nothing was found but a number must
    be used

  In addition to these options that control the output, here are a
  few others to tune memory allocation; these can strongly improve
  the performance (or make it worse if you wish):

  * 'initial_size': the initial size of the memory buffers: if there
    are not more lines than that, no additional memory
    allocation/copy occurs. Values below 1 are treated as 1.

  Lines containing only whitespace are skipped. Rows that have fewer
  fields than the widest row seen so far are padded with 'default',
  and a column that first appears on a later row is back-filled with
  'default' for all the previous rows.
*/
static VALUE dvector_fast_fancy_read(VALUE self, VALUE stream, VALUE options)
{
  /* First, we read up all the options, before allocating anything, so
     that a conversion error raised here cannot leak a buffer. */
  double def = rb_num2dbl(rb_hash_aref(options, rb_str_new2("default")));
  int remove_space = RTEST(rb_hash_aref(options,
                                        rb_str_new2("remove_space")));
  /* NUM2LONG raises on nil/non-numeric values, whereas FIX2LONG would
     silently decode garbage. */
  long skip_first = NUM2LONG(rb_hash_aref(options,
                                          rb_str_new2("skip_first")));
  VALUE sep = rb_hash_aref(options, rb_str_new2("sep"));
  VALUE comments = rb_hash_aref(options, rb_str_new2("comments"));
  VALUE comment_out = rb_hash_aref(options, rb_str_new2("comment_out"));
  /* The size available in the vectors */
  long allocated_size = NUM2LONG(rb_hash_aref(options,
                                              rb_str_new2("initial_size")));

  /* Then, some various variables: */
  VALUE line;
  ID chomp_id = rb_intern("chomp!");
  ID gets_id = rb_intern("gets");
  long line_number = 0;

  /*
    Now comes the fun part - rudimentary vectors management

    TODO: if the stream provides functionality to get its total size,
    it could be interesting to estimate the total number of lines
    based on some small heuristics

    NOTE(review): the buffers below are only released at the end of
    the function; if a Ruby exception is raised while reading (for
    instance from the stream's gets), they are leaked. Wrapping the
    loop in rb_ensure would fix that.
  */
  int nb_vectors = 0;    /* The number of vectors currently created */
  int current_size = 10; /* The number of slots available */
  double **vectors;
  long index = 0;        /* The current index in the vectors */
  int i;

  /* The return value */
  VALUE ary;

  /* A size of 0 would never grow when doubled, and the first row
     would then be written out of bounds. */
  if (allocated_size < 1)
    allocated_size = 1;

  vectors = ALLOC_N(double *, current_size);

  /* We use a real gets so we can also use StringIO, for instance */
  while (RTEST(line = rb_funcall(stream, gets_id, 0))) {
    VALUE pre, post, match;
    const char *line_ptr;
    int col = 0;

    line_number++;
    /* Whether we should skip the line... */
    if (skip_first >= line_number)
      continue;

    /* We check for a blank line using isspace; the cast is required
       because passing a negative char to isspace is undefined. */
    line_ptr = StringValueCStr(line);
    while (*line_ptr && isspace((unsigned char) *line_ptr))
      line_ptr++;
    if (!*line_ptr)
      continue; /* We found a blank line */

    if (remove_space) /* We replace the contents of the line */
      line = rb_str_new2(line_ptr);

    /* ... or a comment line */
    if (RTEST(comments) && RTEST(rb_reg_match(comments, line))) {
      if (RTEST(comment_out))
        rb_ary_push(comment_out, line);
      continue;
    }

    /* Then, we remove the newline: */
    post = line;
    rb_funcall(post, chomp_id, 0);

    /* We iterate over the different portions between matches */
    while (RTEST(post)) {
      const char *a;
      char *b;

      if (RTEST(rb_reg_match(sep, post))) {
        match = rb_gv_get("$~");
        pre = rb_reg_match_pre(match);
        post = rb_reg_match_post(match);
      }
      else {
        /* No separator left: the remainder is the last field */
        pre = post;
        post = Qnil;
      }

      a = StringValueCStr(pre);
      double c = strtod(a, &b);
      if (b == a) /* Nothing could be parsed as a number */
        c = def;

      if (col >= nb_vectors) {
        /* We need to create a new vector */
        nb_vectors++;
        if (col >= current_size) {
          /* Increase the available size */
          current_size += 5;
          REALLOC_N(vectors, double *, current_size);
        }
        double *vals = vectors[col] = ALLOC_N(double, allocated_size);
        /* Filling the rows already read with the default value */
        for (long k = 0; k < index; k++)
          vals[k] = def;
      }
      vectors[col][index] = c;
      col++;
    }

    /* Now, we finish the line: missing trailing fields get the default */
    for (; col < nb_vectors; col++)
      vectors[col][index] = def;
    index++;

    /* Now, we reallocate memory if necessary */
    if (index >= allocated_size) {
      allocated_size *= 2; /* We double the size */
      for (col = 0; col < nb_vectors; col++)
        REALLOC_N(vectors[col], double, allocated_size);
    }
  }

  /* Now, we make up the array */
  ary = rb_ary_new();
  for (i = 0; i < nb_vectors; i++) {
    /* We create a vector; the buffer is released right after, so
       make_dvector_from_data is presumed to copy the data. */
    rb_ary_store(ary, i, make_dvector_from_data(cDvector, index, vectors[i]));
    /* And free the memory: ALLOC_N memory must go through xfree */
    xfree(vectors[i]);
  }
  xfree(vectors);
  return ary;
}