Line data Source code
1 : /**
2 : * @file gensvm_io.c
3 : * @author G.J.J. van den Burg
4 : * @date 2014-01-07
5 : * @brief Functions for input and output of data and model files
6 : *
7 : * @details
8 : * This file contains functions for reading and writing model files, and data
9 : * files. It also contains a function for generating a string of the current
10 : * time, used in writing output files.
11 : *
12 : * @copyright
13 : Copyright 2016, G.J.J. van den Burg.
14 :
15 : This file is part of GenSVM.
16 :
17 : GenSVM is free software: you can redistribute it and/or modify
18 : it under the terms of the GNU General Public License as published by
19 : the Free Software Foundation, either version 3 of the License, or
20 : (at your option) any later version.
21 :
22 : GenSVM is distributed in the hope that it will be useful,
23 : but WITHOUT ANY WARRANTY; without even the implied warranty of
24 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 : GNU General Public License for more details.
26 :
27 : You should have received a copy of the GNU General Public License
28 : along with GenSVM. If not, see <http://www.gnu.org/licenses/>.
29 :
30 : */
31 :
32 : #include "gensvm_io.h"
33 :
34 : /**
35 : * @brief Read data from file
36 : *
37 : * @details
38 : * Read the data from the data_file. The data matrix X is augmented
39 : * with a column of ones, to get the matrix Z. The data is expected
40 : * to follow a specific format, which is specified in the @ref spec_data_file.
41 : * The class labels are assumed to be in the interval [1 .. K], which can be
42 : * checked using the function gensvm_check_outcome_contiguous().
43 : *
44 : * @param[in,out] dataset initialized GenData struct
45 : * @param[in] data_file filename of the data file.
46 : */
47 3 : void gensvm_read_data(struct GenData *dataset, char *data_file)
48 : {
49 3 : FILE *fid = NULL;
50 : long i, j, n, m,
51 3 : nr = 0,
52 3 : K = 0;
53 : double value;
54 : char buf[GENSVM_MAX_LINE_LENGTH];
55 :
56 3 : if ((fid = fopen(data_file, "r")) == NULL) {
57 : // LCOV_EXCL_START
58 : err("[GenSVM Error]: Datafile %s could not be opened.\n",
59 : data_file);
60 : exit(EXIT_FAILURE);
61 : // LCOV_EXCL_STOP
62 : }
63 :
64 : // Read data dimensions
65 3 : nr += fscanf(fid, "%ld", &n);
66 3 : nr += fscanf(fid, "%ld", &m);
67 :
68 : // Allocate memory
69 3 : dataset->RAW = Malloc(double, n*(m+1));
70 :
71 : // Read first line of data
72 12 : for (j=1; j<m+1; j++) {
73 9 : nr += fscanf(fid, "%lf", &value);
74 9 : matrix_set(dataset->RAW, m+1, 0, j, value);
75 : }
76 :
77 3 : if (fgets(buf, GENSVM_MAX_LINE_LENGTH, fid) == NULL) {
78 : // LCOV_EXCL_START
79 : err("[GenSVM Error]: No label found on first line.\n");
80 : exit(EXIT_FAILURE);
81 : // LCOV_EXCL_STOP
82 : }
83 :
84 : // Check if there is a label at the end of the line
85 3 : if (sscanf(buf, "%lf", &value) > 0) {
86 2 : dataset->y = Malloc(long, n);
87 2 : dataset->y[0] = value;
88 2 : K = 1;
89 : } else {
90 1 : free(dataset->y);
91 1 : dataset->y = NULL;
92 : }
93 :
94 : // Read the rest of the file
95 20 : for (i=1; i<n; i++) {
96 68 : for (j=1; j<m+1; j++) {
97 51 : nr += fscanf(fid, "%lf", &value);
98 51 : matrix_set(dataset->RAW, m+1, i, j, value);
99 : }
100 17 : if (dataset->y != NULL) {
101 13 : nr += fscanf(fid, "%lf", &value);
102 13 : dataset->y[i] = (long) value;
103 13 : K = maximum(K, dataset->y[i]);
104 : }
105 : }
106 3 : fclose(fid);
107 :
108 3 : if (nr < n * m) {
109 : // LCOV_EXCL_START
110 : err("[GenSVM Error]: not enough data found in %s\n",
111 : data_file);
112 : exit(EXIT_FAILURE);
113 : // LCOV_EXCL_STOP
114 : }
115 :
116 : // Set the column of ones
117 23 : for (i=0; i<n; i++)
118 20 : matrix_set(dataset->RAW, m+1, i, 0, 1.0);
119 :
120 3 : dataset->n = n;
121 3 : dataset->m = m;
122 3 : dataset->r = m;
123 3 : dataset->K = K;
124 3 : dataset->Z = dataset->RAW;
125 :
126 3 : if (gensvm_could_sparse(dataset->Z, n, m+1)) {
127 1 : note("Converting to sparse ... ");
128 1 : dataset->spZ = gensvm_dense_to_sparse(dataset->Z, n, m+1);
129 1 : note("done.\n");
130 1 : free(dataset->RAW);
131 1 : dataset->RAW = NULL;
132 1 : dataset->Z = NULL;
133 : }
134 3 : }
135 :
/**
 * @brief Report a malformed input line and terminate (copied from LibSVM)
 *
 * @details
 * Writes an error message that names the offending line through err() and
 * then ends the program with EXIT_FAILURE. This function does not return.
 *
 * @param[in] 	line_num 	line number where the error occurred
 *
 */
void exit_input_error(int line_num)
{
	err("[GenSVM Error]: Wrong input format on line: %i\n", line_num);
	exit(EXIT_FAILURE);
}
147 :
148 : /**
149 : * @brief Read data from a file in LibSVM/SVMlight format
150 : *
151 : * @details
152 : * This function reads data from a file where the data is stored in
153 : * LibSVM/SVMlight format. The file format is described in @ref
154 : * spec_libsvm_data_file. This is a sparse data format, which can be
155 : * beneficial for certain applications. The advantage of having this function
156 : * here is twofold: 1) existing datasets where data is stored in
157 : * LibSVM/SVMlight format can be easily used in GenSVM, and 2) sparse datasets
158 : * which are too large for memory when kept in dense format can be loaded
159 : * efficiently into GenSVM.
160 : *
161 : * @note
162 : * This code is based on the read_problem() function in the svm-train.c
163 : * file of LibSVM. It has however been expanded to be able to handle data
164 : * files without labels.
165 : *
166 : * @note
167 : * This file tries to detect whether 1-based or 0-based indexing is used in
168 : * the data file. By default 1-based indexing is used, but if an index is
169 : * found with value 0, 0-based indexing is assumed.
170 : *
171 : * @sa
172 : * gensvm_read_problem()
173 : *
174 : * @param[in] data GenData structure
175 : * @param[in] data_file filename of the datafile
176 : *
177 : */
178 4 : void gensvm_read_data_libsvm(struct GenData *data, char *data_file)
179 : {
180 4 : bool do_sparse, zero_based = false;
181 : long i, j, n, m, K, nnz, cnt, tmp, index, row_cnt, num_labels,
182 4 : min_index = 1;
183 : int n_big, n_small, big_start;
184 : double value;
185 4 : FILE *fid = NULL;
186 4 : char *label = NULL,
187 4 : *endptr = NULL,
188 4 : **big_parts = NULL,
189 4 : **small_parts = NULL;
190 : char buf[GENSVM_MAX_LINE_LENGTH];
191 :
192 4 : fid = fopen(data_file, "r");
193 4 : if (fid == NULL) {
194 : // LCOV_EXCL_START
195 : err("[GenSVM Error]: Datafile %s could not be opened.\n",
196 : data_file);
197 : exit(EXIT_FAILURE);
198 : // LCOV_EXCL_STOP
199 : }
200 :
201 : // first count the number of elements
202 4 : n = 0;
203 4 : m = -1;
204 :
205 4 : num_labels = 0;
206 4 : nnz = 0;
207 :
208 33 : while (fgets(buf, GENSVM_MAX_LINE_LENGTH, fid) != NULL) {
209 : // split the string in labels and/or index:value pairs
210 25 : big_parts = str_split(buf, " \t", &n_big);
211 :
212 : // record if this line has a label (first part has no colon)
213 25 : num_labels += (!str_contains_char(big_parts[0], ':'));
214 :
215 : // check for each part if it is a index:value pair
216 94 : for (i=0; i<n_big; i++) {
217 69 : if (!str_contains_char(big_parts[i], ':'))
218 20 : continue;
219 :
220 : // split the index:value pair
221 49 : small_parts = str_split(big_parts[i], ":", &n_small);
222 :
223 : // convert the index to a number
224 49 : index = strtol(small_parts[0], &endptr, 10);
225 :
226 : // catch conversion errors
227 98 : if (endptr == small_parts[0] || errno != 0 ||
228 49 : *endptr != '\0')
229 0 : exit_input_error(n+1);
230 :
231 : // update the maximum index
232 49 : m = maximum(m, index);
233 :
234 : // update the minimum index
235 49 : min_index = minimum(min_index, index);
236 :
237 : // free the small parts
238 49 : for (j=0; j<n_small; j++) free(small_parts[j]);
239 49 : free(small_parts);
240 :
241 : // increment the nonzero counter
242 49 : nnz++;
243 : }
244 :
245 : // free the big parts
246 94 : for (i=0; i<n_big; i++) {
247 69 : free(big_parts[i]);
248 : }
249 25 : free(big_parts);
250 :
251 : // increment the number of observations
252 25 : n++;
253 : }
254 :
255 : // rewind the file pointer
256 4 : rewind(fid);
257 :
258 : // check if we have enough labels
259 4 : if (num_labels > 0 && num_labels != n) {
260 0 : err("[GenSVM Error]: There are some lines with missing "
261 : "labels. Please fix this before "
262 : "continuing.\n");
263 0 : exit(EXIT_FAILURE);
264 : }
265 :
266 : // don't forget the column of ones
267 4 : nnz += n;
268 :
269 : // deal with 0-based or 1-based indexing in the LibSVM file
270 4 : if (min_index == 0) {
271 1 : m++;
272 1 : zero_based = true;
273 : }
274 :
275 : // check if sparsity is worth it
276 4 : do_sparse = gensvm_nnz_comparison(nnz, n, m+1);
277 4 : if (do_sparse) {
278 1 : data->spZ = gensvm_init_sparse();
279 1 : data->spZ->nnz = nnz;
280 1 : data->spZ->n_row = n;
281 1 : data->spZ->n_col = m+1;
282 1 : data->spZ->values = Calloc(double, nnz);
283 1 : data->spZ->ia = Calloc(long, n+1);
284 1 : data->spZ->ja = Calloc(long, nnz);
285 1 : data->spZ->ia[0] = 0;
286 : } else {
287 3 : data->RAW = Calloc(double, n*(m+1));
288 3 : data->Z = data->RAW;
289 : }
290 4 : if (num_labels > 0)
291 3 : data->y = Calloc(long, n);
292 :
293 4 : K = 0;
294 4 : cnt = 0;
295 29 : for (i=0; i<n; i++) {
296 25 : fgets(buf, GENSVM_MAX_LINE_LENGTH, fid);
297 :
298 : // split the string in labels and/or index:value pairs
299 25 : big_parts = str_split(buf, " \t", &n_big);
300 :
301 25 : big_start = 0;
302 : // get the label from the first part if it exists
303 25 : if (!str_contains_char(big_parts[0], ':')) {
304 20 : label = strtok(big_parts[0], " \t\n");
305 20 : if (label == NULL) // empty line
306 0 : exit_input_error(i+1);
307 :
308 : // convert the label part to a number exit if there
309 : // are errors
310 20 : tmp = strtol(label, &endptr, 10);
311 20 : if (endptr == label || *endptr != '\0')
312 0 : exit_input_error(i+1);
313 :
314 : // assign label to y
315 20 : data->y[i] = tmp;
316 :
317 : // keep track of maximum K
318 20 : K = maximum(K, data->y[i]);
319 :
320 : // increment big part index
321 20 : big_start++;
322 : }
323 :
324 25 : row_cnt = 0;
325 : // set the first element in the row to 1
326 25 : if (do_sparse) {
327 10 : data->spZ->values[cnt] = 1.0;
328 10 : data->spZ->ja[cnt] = 0;
329 10 : cnt++;
330 10 : row_cnt++;
331 : } else {
332 15 : matrix_set(data->RAW, m+1, i, 0, 1.0);
333 : }
334 :
335 : // read the rest of the line
336 74 : for (j=big_start; j<n_big; j++) {
337 49 : if (!str_contains_char(big_parts[j], ':'))
338 0 : continue;
339 :
340 : // split the index:value pair
341 49 : small_parts = str_split(big_parts[j], ":", &n_small);
342 49 : if (n_small != 2)
343 0 : exit_input_error(n+1);
344 :
345 : // convert the index to a long
346 49 : errno = 0;
347 49 : index = strtol(small_parts[0], &endptr, 10);
348 :
349 : // catch conversion errors
350 98 : if (endptr == small_parts[0] || errno != 0 ||
351 49 : *endptr != '\0')
352 0 : exit_input_error(n+1);
353 :
354 : // convert the value to a double
355 49 : errno = 0;
356 49 : value = strtod(small_parts[1], &endptr);
357 98 : if (endptr == small_parts[1] || errno != 0 ||
358 68 : (*endptr != '\0' && !isspace(*endptr)))
359 0 : exit_input_error(n+1);
360 :
361 49 : if (do_sparse) {
362 4 : data->spZ->values[cnt] = value;
363 4 : data->spZ->ja[cnt] = index + zero_based;
364 4 : cnt++;
365 4 : row_cnt++;
366 : } else {
367 45 : matrix_set(data->RAW, m+1, i,
368 : index + zero_based, value);
369 : }
370 :
371 : // free the small parts
372 49 : free(small_parts[0]);
373 49 : free(small_parts[1]);
374 49 : free(small_parts);
375 : }
376 :
377 25 : if (do_sparse) {
378 10 : data->spZ->ia[i+1] = data->spZ->ia[i] + row_cnt;
379 : }
380 :
381 : // free the big parts
382 94 : for (j=0; j<n_big; j++) {
383 69 : free(big_parts[j]);
384 : }
385 25 : free(big_parts);
386 : }
387 :
388 4 : fclose(fid);
389 :
390 4 : data->n = n;
391 4 : data->m = m;
392 4 : data->r = m;
393 4 : data->K = K;
394 :
395 4 : }
396 :
397 : /**
398 : * @brief Read model from file
399 : *
400 : * @details
401 : * Read a GenModel from a model file. The GenModel struct must have been
402 : * initalized elswhere. The model file is expected to follow the @ref
403 : * spec_model_file. The easiest way to generate a model file is through
404 : * gensvm_write_model(), which can for instance be used in trainGenSVM.c.
405 : *
406 : * @param[in,out] model initialized GenModel
407 : * @param[in] model_filename filename of the model file
408 : *
409 : */
410 1 : void gensvm_read_model(struct GenModel *model, char *model_filename)
411 : {
412 1 : long i, j, nr = 0;
413 1 : FILE *fid = NULL;
414 : char buffer[GENSVM_MAX_LINE_LENGTH];
415 : char data_filename[GENSVM_MAX_LINE_LENGTH];
416 1 : double value = 0;
417 :
418 1 : fid = fopen(model_filename, "r");
419 1 : if (fid == NULL) {
420 : // LCOV_EXCL_START
421 : err("[GenSVM Error]: Couldn't open model file %s\n",
422 : model_filename);
423 : exit(EXIT_FAILURE);
424 : // LCOV_EXCL_STOP
425 : }
426 : // skip the first four lines
427 5 : for (i=0; i<4; i++)
428 4 : next_line(fid, model_filename);
429 :
430 : // read all model variables
431 1 : model->p = get_fmt_double(fid, model_filename, "p = %lf");
432 1 : model->lambda = get_fmt_double(fid, model_filename, "lambda = %lf");
433 1 : model->kappa = get_fmt_double(fid, model_filename, "kappa = %lf");
434 1 : model->epsilon = get_fmt_double(fid, model_filename, "epsilon = %lf");
435 1 : model->weight_idx = (int) get_fmt_long(fid, model_filename,
436 : "weight_idx = %li");
437 :
438 : // skip to data section
439 3 : for (i=0; i<2; i++)
440 2 : next_line(fid, model_filename);
441 :
442 : // read filename of data file
443 1 : if (fgets(buffer, GENSVM_MAX_LINE_LENGTH, fid) == NULL) {
444 : // LCOV_EXCL_START
445 : err("[GenSVM Error]: Error reading from model file %s\n",
446 : model_filename);
447 : exit(EXIT_FAILURE);
448 : // LCOV_EXCL_STOP
449 : }
450 1 : sscanf(buffer, "filename = %s\n", data_filename);
451 1 : model->data_file = Calloc(char, GENSVM_MAX_LINE_LENGTH);
452 1 : strcpy(model->data_file, data_filename);
453 :
454 : // read all data variables
455 1 : model->n = get_fmt_long(fid, model_filename, "n = %li\n");
456 1 : model->m = get_fmt_long(fid, model_filename, "m = %li\n");
457 1 : model->K = get_fmt_long(fid, model_filename, "K = %li\n");
458 :
459 : // skip to output
460 3 : for (i=0; i<2; i++)
461 2 : next_line(fid, model_filename);
462 :
463 : // read the matrix V and check for consistency
464 1 : model->V = Malloc(double, (model->m+1)*(model->K-1));
465 4 : for (i=0; i<model->m+1; i++) {
466 9 : for (j=0; j<model->K-1; j++) {
467 6 : nr += fscanf(fid, "%lf ", &value);
468 6 : matrix_set(model->V, model->K-1, i, j, value);
469 : }
470 : }
471 1 : if (nr != (model->m+1)*(model->K-1)) {
472 : // LCOV_EXCL_START
473 : err("[GenSVM Error] Error reading from model file %s. "
474 : "Not enough elements of V found.\n",
475 : model_filename);
476 : exit(EXIT_FAILURE);
477 : // LCOV_EXCL_STOP
478 : }
479 1 : }
480 :
481 : /**
482 : * @brief Write model to file
483 : *
484 : * @details
485 : * Write a GenModel to a file. The current time is specified in the file in
486 : * UTC + offset. The model file further corresponds to the @ref
487 : * spec_model_file.
488 : *
489 : * @param[in] model GenModel which contains an estimate for
490 : * GenModel::V
491 : * @param[in] output_filename the output file to write the model to
492 : *
493 : */
494 1 : void gensvm_write_model(struct GenModel *model, char *output_filename)
495 : {
496 1 : FILE *fid = NULL;
497 : long i, j;
498 : char timestr[GENSVM_MAX_LINE_LENGTH];
499 :
500 : // open output file
501 1 : fid = fopen(output_filename, "w");
502 1 : if (fid == NULL) {
503 : // LCOV_EXCL_START
504 : err("[GenSVM Error]: Error opening output file %s\n",
505 : output_filename);
506 : exit(EXIT_FAILURE);
507 : // LCOV_EXCL_STOP
508 : }
509 1 : gensvm_time_string(timestr);
510 :
511 : // Write output to file
512 1 : fprintf(fid, "Output file for GenSVM (version %s)\n", VERSION_STRING);
513 1 : fprintf(fid, "Generated on: %s\n\n", timestr);
514 1 : fprintf(fid, "Model:\n");
515 1 : fprintf(fid, "p = %15.16f\n", model->p);
516 1 : fprintf(fid, "lambda = %15.16f\n", model->lambda);
517 1 : fprintf(fid, "kappa = %15.16f\n", model->kappa);
518 1 : fprintf(fid, "epsilon = %g\n", model->epsilon);
519 1 : fprintf(fid, "weight_idx = %i\n", model->weight_idx);
520 1 : fprintf(fid, "\n");
521 1 : fprintf(fid, "Data:\n");
522 1 : fprintf(fid, "filename = %s\n", model->data_file);
523 1 : fprintf(fid, "n = %li\n", model->n);
524 1 : fprintf(fid, "m = %li\n", model->m);
525 1 : fprintf(fid, "K = %li\n", model->K);
526 1 : fprintf(fid, "\n");
527 1 : fprintf(fid, "Output:\n");
528 4 : for (i=0; i<model->m+1; i++) {
529 9 : for (j=0; j<model->K-1; j++) {
530 6 : if (j > 0)
531 3 : fprintf(fid, " ");
532 6 : fprintf(fid, "%+15.16f", matrix_get(model->V,
533 : model->K-1, i, j));
534 : }
535 3 : fprintf(fid, "\n");
536 : }
537 :
538 1 : fclose(fid);
539 1 : }
540 :
541 : /**
542 : * @brief Write predictions to file
543 : *
544 : * @details
545 : * Write the given predictions to an output file, such that the resulting file
546 : * corresponds to the @ref spec_data_file.
547 : *
548 : * @param[in] data GenData with the original instances
549 : * @param[in] predy predictions of the class labels of the
550 : * instances in the given GenData. Note that the
551 : * order of the instances is assumed to be the
552 : * same.
553 : * @param[in] output_filename the file to which the predictions are written
554 : *
555 : */
556 1 : void gensvm_write_predictions(struct GenData *data, long *predy,
557 : char *output_filename)
558 : {
559 : long i, j;
560 1 : FILE *fid = NULL;
561 :
562 1 : fid = fopen(output_filename, "w");
563 1 : if (fid == NULL) {
564 : // LCOV_EXCL_START
565 : err("[GenSVM Error]: Error opening output file %s\n",
566 : output_filename);
567 : exit(EXIT_FAILURE);
568 : // LCOV_EXCL_STOP
569 : }
570 :
571 1 : fprintf(fid, "%li\n", data->n);
572 1 : fprintf(fid, "%li\n", data->m);
573 :
574 6 : for (i=0; i<data->n; i++) {
575 20 : for (j=0; j<data->m; j++)
576 15 : fprintf(fid, "%.16f ", matrix_get(data->Z, data->m+1, i,
577 : j+1));
578 5 : fprintf(fid, "%li\n", predy[i]);
579 : }
580 :
581 1 : fclose(fid);
582 1 : }
583 :
584 : /**
585 : * @brief Get time string with UTC offset
586 : *
587 : * @details
588 : * Create a string for the current system time. Include an offset of UTC for
589 : * consistency. The format of the generated string is "DDD MMM D HH:MM:SS
590 : * YYYY (UTC +HH:MM)", e.g. "Fri Aug 9, 12:34:56 2013 (UTC +02:00)".
591 : *
592 : * @param[in,out] buffer allocated string buffer, on exit contains
593 : * formatted string
594 : *
595 : */
596 1 : void gensvm_time_string(char *buffer)
597 : {
598 : int diff, hours, minutes;
599 : char timestr[GENSVM_MAX_LINE_LENGTH];
600 : time_t current_time, lt, gt;
601 1 : struct tm *lclt = NULL;
602 :
603 : // get current time (in epoch)
604 1 : current_time = time(NULL);
605 1 : if (current_time == ((time_t)-1)) {
606 : // LCOV_EXCL_START
607 : err("[GenSVM Error]: Failed to compute the current time.\n");
608 : return;
609 : // LCOV_EXCL_STOP
610 : }
611 :
612 : // convert time to local time and create a string
613 1 : lclt = localtime(¤t_time);
614 1 : strftime(timestr, GENSVM_MAX_LINE_LENGTH, "%c", lclt);
615 : if (timestr == NULL) {
616 : err("[GenSVM Error]: Failed to convert time to string.\n");
617 : return;
618 : }
619 :
620 : // calculate the UTC offset including DST
621 1 : lt = mktime(localtime(¤t_time));
622 1 : gt = mktime(gmtime(¤t_time));
623 1 : diff = -difftime(gt, lt);
624 1 : hours = (diff/3600);
625 1 : minutes = (diff%3600)/60;
626 1 : if (lclt->tm_isdst == 1)
627 0 : hours++;
628 :
629 1 : sprintf(buffer, "%s (UTC %+03i:%02i)", timestr, hours, minutes);
630 : }
|