GenSVM
gensvm_io.c
Go to the documentation of this file.
1 
32 #include "gensvm_io.h"
33 
47 void gensvm_read_data(struct GenData *dataset, char *data_file)
48 {
49  FILE *fid = NULL;
50  long i, j, n, m,
51  nr = 0,
52  K = 0;
53  double value;
54  char buf[GENSVM_MAX_LINE_LENGTH];
55 
56  if ((fid = fopen(data_file, "r")) == NULL) {
57  // LCOV_EXCL_START
58  err("[GenSVM Error]: Datafile %s could not be opened.\n",
59  data_file);
60  exit(EXIT_FAILURE);
61  // LCOV_EXCL_STOP
62  }
63 
64  // Read data dimensions
65  nr += fscanf(fid, "%ld", &n);
66  nr += fscanf(fid, "%ld", &m);
67 
68  // Allocate memory
69  dataset->RAW = Malloc(double, n*(m+1));
70 
71  // Read first line of data
72  for (j=1; j<m+1; j++) {
73  nr += fscanf(fid, "%lf", &value);
74  matrix_set(dataset->RAW, m+1, 0, j, value);
75  }
76 
77  if (fgets(buf, GENSVM_MAX_LINE_LENGTH, fid) == NULL) {
78  // LCOV_EXCL_START
79  err("[GenSVM Error]: No label found on first line.\n");
80  exit(EXIT_FAILURE);
81  // LCOV_EXCL_STOP
82  }
83 
84  // Check if there is a label at the end of the line
85  if (sscanf(buf, "%lf", &value) > 0) {
86  dataset->y = Malloc(long, n);
87  dataset->y[0] = value;
88  K = 1;
89  } else {
90  free(dataset->y);
91  dataset->y = NULL;
92  }
93 
94  // Read the rest of the file
95  for (i=1; i<n; i++) {
96  for (j=1; j<m+1; j++) {
97  nr += fscanf(fid, "%lf", &value);
98  matrix_set(dataset->RAW, m+1, i, j, value);
99  }
100  if (dataset->y != NULL) {
101  nr += fscanf(fid, "%lf", &value);
102  dataset->y[i] = (long) value;
103  K = maximum(K, dataset->y[i]);
104  }
105  }
106  fclose(fid);
107 
108  if (nr < n * m) {
109  // LCOV_EXCL_START
110  err("[GenSVM Error]: not enough data found in %s\n",
111  data_file);
112  exit(EXIT_FAILURE);
113  // LCOV_EXCL_STOP
114  }
115 
116  // Set the column of ones
117  for (i=0; i<n; i++)
118  matrix_set(dataset->RAW, m+1, i, 0, 1.0);
119 
120  dataset->n = n;
121  dataset->m = m;
122  dataset->r = m;
123  dataset->K = K;
124  dataset->Z = dataset->RAW;
125 
126  if (gensvm_could_sparse(dataset->Z, n, m+1)) {
127  note("Converting to sparse ... ");
128  dataset->spZ = gensvm_dense_to_sparse(dataset->Z, n, m+1);
129  note("done.\n");
130  free(dataset->RAW);
131  dataset->RAW = NULL;
132  dataset->Z = NULL;
133  }
134 }
135 
/**
 * @brief Report a malformed input line and terminate the program
 *
 * @details
 * Writes a message containing the given line number through err() and then
 * calls exit() with EXIT_FAILURE. This function does not return.
 *
 * @param[in] 	line_num 	line number to mention in the message
 */
void exit_input_error(int line_num)
{
    err("[GenSVM Error]: Wrong input format on line: %i\n", line_num);
    exit(EXIT_FAILURE);
}
147 
178 void gensvm_read_data_libsvm(struct GenData *data, char *data_file)
179 {
180  bool do_sparse, zero_based = false;
181  long i, j, n, m, K, nnz, cnt, tmp, index, row_cnt, num_labels,
182  min_index = 1;
183  int n_big, n_small, big_start;
184  double value;
185  FILE *fid = NULL;
186  char *label = NULL,
187  *endptr = NULL,
188  **big_parts = NULL,
189  **small_parts = NULL;
190  char buf[GENSVM_MAX_LINE_LENGTH];
191 
192  fid = fopen(data_file, "r");
193  if (fid == NULL) {
194  // LCOV_EXCL_START
195  err("[GenSVM Error]: Datafile %s could not be opened.\n",
196  data_file);
197  exit(EXIT_FAILURE);
198  // LCOV_EXCL_STOP
199  }
200 
201  // first count the number of elements
202  n = 0;
203  m = -1;
204 
205  num_labels = 0;
206  nnz = 0;
207 
208  while (fgets(buf, GENSVM_MAX_LINE_LENGTH, fid) != NULL) {
209  // split the string in labels and/or index:value pairs
210  big_parts = str_split(buf, " \t", &n_big);
211 
212  // record if this line has a label (first part has no colon)
213  num_labels += (!str_contains_char(big_parts[0], ':'));
214 
215  // check for each part if it is a index:value pair
216  for (i=0; i<n_big; i++) {
217  if (!str_contains_char(big_parts[i], ':'))
218  continue;
219 
220  // split the index:value pair
221  small_parts = str_split(big_parts[i], ":", &n_small);
222 
223  // convert the index to a number
224  index = strtol(small_parts[0], &endptr, 10);
225 
226  // catch conversion errors
227  if (endptr == small_parts[0] || errno != 0 ||
228  *endptr != '\0')
229  exit_input_error(n+1);
230 
231  // update the maximum index
232  m = maximum(m, index);
233 
234  // update the minimum index
235  min_index = minimum(min_index, index);
236 
237  // free the small parts
238  for (j=0; j<n_small; j++) free(small_parts[j]);
239  free(small_parts);
240 
241  // increment the nonzero counter
242  nnz++;
243  }
244 
245  // free the big parts
246  for (i=0; i<n_big; i++) {
247  free(big_parts[i]);
248  }
249  free(big_parts);
250 
251  // increment the number of observations
252  n++;
253  }
254 
255  // rewind the file pointer
256  rewind(fid);
257 
258  // check if we have enough labels
259  if (num_labels > 0 && num_labels != n) {
260  err("[GenSVM Error]: There are some lines with missing "
261  "labels. Please fix this before "
262  "continuing.\n");
263  exit(EXIT_FAILURE);
264  }
265 
266  // don't forget the column of ones
267  nnz += n;
268 
269  // deal with 0-based or 1-based indexing in the LibSVM file
270  if (min_index == 0) {
271  m++;
272  zero_based = true;
273  }
274 
275  // check if sparsity is worth it
276  do_sparse = gensvm_nnz_comparison(nnz, n, m+1);
277  if (do_sparse) {
278  data->spZ = gensvm_init_sparse();
279  data->spZ->nnz = nnz;
280  data->spZ->n_row = n;
281  data->spZ->n_col = m+1;
282  data->spZ->values = Calloc(double, nnz);
283  data->spZ->ia = Calloc(long, n+1);
284  data->spZ->ja = Calloc(long, nnz);
285  data->spZ->ia[0] = 0;
286  } else {
287  data->RAW = Calloc(double, n*(m+1));
288  data->Z = data->RAW;
289  }
290  if (num_labels > 0)
291  data->y = Calloc(long, n);
292 
293  K = 0;
294  cnt = 0;
295  for (i=0; i<n; i++) {
296  fgets(buf, GENSVM_MAX_LINE_LENGTH, fid);
297 
298  // split the string in labels and/or index:value pairs
299  big_parts = str_split(buf, " \t", &n_big);
300 
301  big_start = 0;
302  // get the label from the first part if it exists
303  if (!str_contains_char(big_parts[0], ':')) {
304  label = strtok(big_parts[0], " \t\n");
305  if (label == NULL) // empty line
306  exit_input_error(i+1);
307 
308  // convert the label part to a number exit if there
309  // are errors
310  tmp = strtol(label, &endptr, 10);
311  if (endptr == label || *endptr != '\0')
312  exit_input_error(i+1);
313 
314  // assign label to y
315  data->y[i] = tmp;
316 
317  // keep track of maximum K
318  K = maximum(K, data->y[i]);
319 
320  // increment big part index
321  big_start++;
322  }
323 
324  row_cnt = 0;
325  // set the first element in the row to 1
326  if (do_sparse) {
327  data->spZ->values[cnt] = 1.0;
328  data->spZ->ja[cnt] = 0;
329  cnt++;
330  row_cnt++;
331  } else {
332  matrix_set(data->RAW, m+1, i, 0, 1.0);
333  }
334 
335  // read the rest of the line
336  for (j=big_start; j<n_big; j++) {
337  if (!str_contains_char(big_parts[j], ':'))
338  continue;
339 
340  // split the index:value pair
341  small_parts = str_split(big_parts[j], ":", &n_small);
342  if (n_small != 2)
343  exit_input_error(n+1);
344 
345  // convert the index to a long
346  errno = 0;
347  index = strtol(small_parts[0], &endptr, 10);
348 
349  // catch conversion errors
350  if (endptr == small_parts[0] || errno != 0 ||
351  *endptr != '\0')
352  exit_input_error(n+1);
353 
354  // convert the value to a double
355  errno = 0;
356  value = strtod(small_parts[1], &endptr);
357  if (endptr == small_parts[1] || errno != 0 ||
358  (*endptr != '\0' && !isspace(*endptr)))
359  exit_input_error(n+1);
360 
361  if (do_sparse) {
362  data->spZ->values[cnt] = value;
363  data->spZ->ja[cnt] = index + zero_based;
364  cnt++;
365  row_cnt++;
366  } else {
367  matrix_set(data->RAW, m+1, i,
368  index + zero_based, value);
369  }
370 
371  // free the small parts
372  free(small_parts[0]);
373  free(small_parts[1]);
374  free(small_parts);
375  }
376 
377  if (do_sparse) {
378  data->spZ->ia[i+1] = data->spZ->ia[i] + row_cnt;
379  }
380 
381  // free the big parts
382  for (j=0; j<n_big; j++) {
383  free(big_parts[j]);
384  }
385  free(big_parts);
386  }
387 
388  fclose(fid);
389 
390  data->n = n;
391  data->m = m;
392  data->r = m;
393  data->K = K;
394 
395 }
396 
410 void gensvm_read_model(struct GenModel *model, char *model_filename)
411 {
412  long i, j, nr = 0;
413  FILE *fid = NULL;
414  char buffer[GENSVM_MAX_LINE_LENGTH];
415  char data_filename[GENSVM_MAX_LINE_LENGTH];
416  double value = 0;
417 
418  fid = fopen(model_filename, "r");
419  if (fid == NULL) {
420  // LCOV_EXCL_START
421  err("[GenSVM Error]: Couldn't open model file %s\n",
422  model_filename);
423  exit(EXIT_FAILURE);
424  // LCOV_EXCL_STOP
425  }
426  // skip the first four lines
427  for (i=0; i<4; i++)
428  next_line(fid, model_filename);
429 
430  // read all model variables
431  model->p = get_fmt_double(fid, model_filename, "p = %lf");
432  model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf");
433  model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf");
434  model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf");
435  model->weight_idx = (int) get_fmt_long(fid, model_filename,
436  "weight_idx = %li");
437 
438  // skip to data section
439  for (i=0; i<2; i++)
440  next_line(fid, model_filename);
441 
442  // read filename of data file
443  if (fgets(buffer, GENSVM_MAX_LINE_LENGTH, fid) == NULL) {
444  // LCOV_EXCL_START
445  err("[GenSVM Error]: Error reading from model file %s\n",
446  model_filename);
447  exit(EXIT_FAILURE);
448  // LCOV_EXCL_STOP
449  }
450  sscanf(buffer, "filename = %s\n", data_filename);
451  model->data_file = Calloc(char, GENSVM_MAX_LINE_LENGTH);
452  strcpy(model->data_file, data_filename);
453 
454  // read all data variables
455  model->n = get_fmt_long(fid, model_filename, "n = %li\n");
456  model->m = get_fmt_long(fid, model_filename, "m = %li\n");
457  model->K = get_fmt_long(fid, model_filename, "K = %li\n");
458 
459  // skip to output
460  for (i=0; i<2; i++)
461  next_line(fid, model_filename);
462 
463  // read the matrix V and check for consistency
464  model->V = Malloc(double, (model->m+1)*(model->K-1));
465  for (i=0; i<model->m+1; i++) {
466  for (j=0; j<model->K-1; j++) {
467  nr += fscanf(fid, "%lf ", &value);
468  matrix_set(model->V, model->K-1, i, j, value);
469  }
470  }
471  if (nr != (model->m+1)*(model->K-1)) {
472  // LCOV_EXCL_START
473  err("[GenSVM Error] Error reading from model file %s. "
474  "Not enough elements of V found.\n",
475  model_filename);
476  exit(EXIT_FAILURE);
477  // LCOV_EXCL_STOP
478  }
479 }
480 
494 void gensvm_write_model(struct GenModel *model, char *output_filename)
495 {
496  FILE *fid = NULL;
497  long i, j;
498  char timestr[GENSVM_MAX_LINE_LENGTH];
499 
500  // open output file
501  fid = fopen(output_filename, "w");
502  if (fid == NULL) {
503  // LCOV_EXCL_START
504  err("[GenSVM Error]: Error opening output file %s\n",
505  output_filename);
506  exit(EXIT_FAILURE);
507  // LCOV_EXCL_STOP
508  }
509  gensvm_time_string(timestr);
510 
511  // Write output to file
512  fprintf(fid, "Output file for GenSVM (version %s)\n", VERSION_STRING);
513  fprintf(fid, "Generated on: %s\n\n", timestr);
514  fprintf(fid, "Model:\n");
515  fprintf(fid, "p = %15.16f\n", model->p);
516  fprintf(fid, "lambda = %15.16f\n", model->lambda);
517  fprintf(fid, "kappa = %15.16f\n", model->kappa);
518  fprintf(fid, "epsilon = %g\n", model->epsilon);
519  fprintf(fid, "weight_idx = %i\n", model->weight_idx);
520  fprintf(fid, "\n");
521  fprintf(fid, "Data:\n");
522  fprintf(fid, "filename = %s\n", model->data_file);
523  fprintf(fid, "n = %li\n", model->n);
524  fprintf(fid, "m = %li\n", model->m);
525  fprintf(fid, "K = %li\n", model->K);
526  fprintf(fid, "\n");
527  fprintf(fid, "Output:\n");
528  for (i=0; i<model->m+1; i++) {
529  for (j=0; j<model->K-1; j++) {
530  if (j > 0)
531  fprintf(fid, " ");
532  fprintf(fid, "%+15.16f", matrix_get(model->V,
533  model->K-1, i, j));
534  }
535  fprintf(fid, "\n");
536  }
537 
538  fclose(fid);
539 }
540 
556 void gensvm_write_predictions(struct GenData *data, long *predy,
557  char *output_filename)
558 {
559  long i, j;
560  FILE *fid = NULL;
561 
562  fid = fopen(output_filename, "w");
563  if (fid == NULL) {
564  // LCOV_EXCL_START
565  err("[GenSVM Error]: Error opening output file %s\n",
566  output_filename);
567  exit(EXIT_FAILURE);
568  // LCOV_EXCL_STOP
569  }
570 
571  fprintf(fid, "%li\n", data->n);
572  fprintf(fid, "%li\n", data->m);
573 
574  for (i=0; i<data->n; i++) {
575  for (j=0; j<data->m; j++)
576  fprintf(fid, "%.16f ", matrix_get(data->Z, data->m+1, i,
577  j+1));
578  fprintf(fid, "%li\n", predy[i]);
579  }
580 
581  fclose(fid);
582 }
583 
596 void gensvm_time_string(char *buffer)
597 {
598  int diff, hours, minutes;
599  char timestr[GENSVM_MAX_LINE_LENGTH];
600  time_t current_time, lt, gt;
601  struct tm *lclt = NULL;
602 
603  // get current time (in epoch)
604  current_time = time(NULL);
605  if (current_time == ((time_t)-1)) {
606  // LCOV_EXCL_START
607  err("[GenSVM Error]: Failed to compute the current time.\n");
608  return;
609  // LCOV_EXCL_STOP
610  }
611 
612  // convert time to local time and create a string
613  lclt = localtime(&current_time);
614  strftime(timestr, GENSVM_MAX_LINE_LENGTH, "%c", lclt);
615  if (timestr == NULL) {
616  err("[GenSVM Error]: Failed to convert time to string.\n");
617  return;
618  }
619 
620  // calculate the UTC offset including DST
621  lt = mktime(localtime(&current_time));
622  gt = mktime(gmtime(&current_time));
623  diff = -difftime(gt, lt);
624  hours = (diff/3600);
625  minutes = (diff%3600)/60;
626  if (lclt->tm_isdst == 1)
627  hours++;
628 
629  sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes);
630 }
#define Calloc(type, size)
Definition: gensvm_memory.h:40
void gensvm_write_model(struct GenModel *model, char *output_filename)
Write model to file.
Definition: gensvm_io.c:494
long * ja
column indices, should be of length nnz
Definition: gensvm_sparse.h:67
#define VERSION_STRING
double epsilon
stopping criterion for the IM algorithm.
Definition: gensvm_base.h:101
long n_col
number of columns of the original matrix
Definition: gensvm_sparse.h:60
void err(const char *fmt,...)
Parse a formatted string and write it to standard error.
Definition: gensvm_print.c:84
void gensvm_read_data_libsvm(struct GenData *data, char *data_file)
Read data from a file in LibSVM/SVMlight format.
Definition: gensvm_io.c:178
double p
parameter for the L-p norm in the loss function
Definition: gensvm_base.h:103
#define GENSVM_MAX_LINE_LENGTH
long K
number of classes
Definition: gensvm_base.h:58
char ** str_split(char *original, const char *delims, int *len_ret)
Split a string on delimiters and return an array of parts.
bool str_contains_char(const char *str, const char c)
Check if a string contains a char.
void gensvm_read_data(struct GenData *dataset, char *data_file)
Read data from file.
Definition: gensvm_io.c:47
#define matrix_get(M, cols, i, j)
void gensvm_read_model(struct GenModel *model, char *model_filename)
Read model from file.
Definition: gensvm_io.c:410
double * Z
Definition: gensvm_base.h:68
long nnz
number of nonzero elements
Definition: gensvm_sparse.h:56
bool gensvm_nnz_comparison(long nnz, long rows, long cols)
Check whether the number of nonzeros is such that sparsity is worth it.
int weight_idx
which weights to use (1 = unit, 2 = group)
Definition: gensvm_base.h:93
#define Malloc(type, size)
Definition: gensvm_memory.h:48
double * V
augmented weight matrix
Definition: gensvm_base.h:115
long * y
array of class labels, 1..K
Definition: gensvm_base.h:66
void gensvm_time_string(char *buffer)
Get time string with UTC offset.
Definition: gensvm_io.c:596
A structure to represent the data.
Definition: gensvm_base.h:57
double * values
actual nonzero values, should be of length nnz
Definition: gensvm_sparse.h:63
A structure to represent a single GenSVM model.
Definition: gensvm_base.h:92
char * data_file
filename of the data
Definition: gensvm_base.h:134
long n
number of instances in the dataset
Definition: gensvm_base.h:97
#define maximum(a, b)
long r
number of eigenvalues (width of Z)
Definition: gensvm_base.h:64
Header file for gensvm_io.c.
double kappa
parameter for the Huber hinge function
Definition: gensvm_base.h:105
long K
number of classes in the dataset
Definition: gensvm_base.h:95
long m
number of predictors (width of RAW)
Definition: gensvm_base.h:62
void exit_input_error(int line_num)
Print an error to the screen and exit (copied from LibSVM)
Definition: gensvm_io.c:142
struct GenSparse * gensvm_dense_to_sparse(double *A, long rows, long cols)
Convert a dense matrix to a GenSparse structure if advantageous.
bool gensvm_could_sparse(double *A, long rows, long cols)
Check if it is worthwhile to convert to a sparse matrix.
#define matrix_set(M, cols, i, j, val)
#define minimum(a, b)
long get_fmt_long(FILE *fid, char *filename, const char *fmt)
Read a long integer from file following a format.
long n
number of instances
Definition: gensvm_base.h:60
void gensvm_write_predictions(struct GenData *data, long *predy, char *output_filename)
Write predictions to file.
Definition: gensvm_io.c:556
struct GenSparse * gensvm_init_sparse(void)
Initialize a GenSparse structure.
Definition: gensvm_sparse.c:38
long m
number of predictor variables in the dataset
Definition: gensvm_base.h:99
long * ia
cumulative row lengths, should be of length n_row+1
Definition: gensvm_sparse.h:65
double get_fmt_double(FILE *fid, char *filename, const char *fmt)
Read a double from file following a format.
double * RAW
augmented raw data matrix
Definition: gensvm_base.h:73
struct GenSparse * spZ
sparse representation of the augmented data matrix
Definition: gensvm_base.h:71
double lambda
regularization parameter in the loss function
Definition: gensvm_base.h:107
void next_line(FILE *fid, char *filename)
Move to next line in file.
void note(const char *fmt,...)
Parse a formatted string and write to the output stream.
Definition: gensvm_print.c:62
long n_row
number of rows of the original matrix
Definition: gensvm_sparse.h:58