56 if ((fid = fopen(data_file,
"r")) == NULL) {
58 err(
"[GenSVM Error]: Datafile %s could not be opened.\n",
65 nr += fscanf(fid,
"%ld", &n);
66 nr += fscanf(fid,
"%ld", &m);
72 for (j=1; j<m+1; j++) {
73 nr += fscanf(fid,
"%lf", &value);
79 err(
"[GenSVM Error]: No label found on first line.\n");
85 if (sscanf(buf,
"%lf", &value) > 0) {
87 dataset->
y[0] = value;
96 for (j=1; j<m+1; j++) {
97 nr += fscanf(fid,
"%lf", &value);
100 if (dataset->
y != NULL) {
101 nr += fscanf(fid,
"%lf", &value);
102 dataset->
y[i] = (long) value;
110 err(
"[GenSVM Error]: not enough data found in %s\n",
124 dataset->
Z = dataset->
RAW;
127 note(
"Converting to sparse ... ");
144 err(
"[GenSVM Error]: Wrong input format on line: %i\n", line_num);
180 bool do_sparse, zero_based =
false;
181 long i, j,
n,
m,
K, nnz, cnt, tmp, index, row_cnt, num_labels,
183 int n_big, n_small, big_start;
189 **small_parts = NULL;
192 fid = fopen(data_file,
"r");
195 err(
"[GenSVM Error]: Datafile %s could not be opened.\n",
210 big_parts =
str_split(buf,
" \t", &n_big);
216 for (i=0; i<n_big; i++) {
221 small_parts =
str_split(big_parts[i],
":", &n_small);
224 index = strtol(small_parts[0], &endptr, 10);
227 if (endptr == small_parts[0] || errno != 0 ||
235 min_index =
minimum(min_index, index);
238 for (j=0; j<n_small; j++) free(small_parts[j]);
246 for (i=0; i<n_big; i++) {
259 if (num_labels > 0 && num_labels != n) {
260 err(
"[GenSVM Error]: There are some lines with missing " 261 "labels. Please fix this before " 270 if (min_index == 0) {
285 data->
spZ->
ia[0] = 0;
295 for (i=0; i<
n; i++) {
299 big_parts =
str_split(buf,
" \t", &n_big);
304 label = strtok(big_parts[0],
" \t\n");
310 tmp = strtol(label, &endptr, 10);
311 if (endptr == label || *endptr !=
'\0')
328 data->
spZ->
ja[cnt] = 0;
336 for (j=big_start; j<n_big; j++) {
341 small_parts =
str_split(big_parts[j],
":", &n_small);
347 index = strtol(small_parts[0], &endptr, 10);
350 if (endptr == small_parts[0] || errno != 0 ||
356 value = strtod(small_parts[1], &endptr);
357 if (endptr == small_parts[1] || errno != 0 ||
358 (*endptr !=
'\0' && !isspace(*endptr)))
363 data->
spZ->
ja[cnt] = index + zero_based;
368 index + zero_based, value);
372 free(small_parts[0]);
373 free(small_parts[1]);
378 data->
spZ->
ia[i+1] = data->
spZ->
ia[i] + row_cnt;
382 for (j=0; j<n_big; j++) {
418 fid = fopen(model_filename,
"r");
421 err(
"[GenSVM Error]: Couldn't open model file %s\n",
445 err(
"[GenSVM Error]: Error reading from model file %s\n",
450 sscanf(buffer,
"filename = %s\n", data_filename);
464 model->
V =
Malloc(
double, (model->
m+1)*(model->
K-1));
465 for (i=0; i<model->
m+1; i++) {
466 for (j=0; j<model->
K-1; j++) {
467 nr += fscanf(fid,
"%lf ", &value);
471 if (nr != (model->
m+1)*(model->
K-1)) {
473 err(
"[GenSVM Error] Error reading from model file %s. " 474 "Not enough elements of V found.\n",
501 fid = fopen(output_filename,
"w");
504 err(
"[GenSVM Error]: Error opening output file %s\n",
512 fprintf(fid,
"Output file for GenSVM (version %s)\n",
VERSION_STRING);
513 fprintf(fid,
"Generated on: %s\n\n", timestr);
514 fprintf(fid,
"Model:\n");
515 fprintf(fid,
"p = %15.16f\n", model->
p);
516 fprintf(fid,
"lambda = %15.16f\n", model->
lambda);
517 fprintf(fid,
"kappa = %15.16f\n", model->
kappa);
518 fprintf(fid,
"epsilon = %g\n", model->
epsilon);
519 fprintf(fid,
"weight_idx = %i\n", model->
weight_idx);
521 fprintf(fid,
"Data:\n");
522 fprintf(fid,
"filename = %s\n", model->
data_file);
523 fprintf(fid,
"n = %li\n", model->
n);
524 fprintf(fid,
"m = %li\n", model->
m);
525 fprintf(fid,
"K = %li\n", model->
K);
527 fprintf(fid,
"Output:\n");
528 for (i=0; i<model->
m+1; i++) {
529 for (j=0; j<model->
K-1; j++) {
557 char *output_filename)
562 fid = fopen(output_filename,
"w");
565 err(
"[GenSVM Error]: Error opening output file %s\n",
571 fprintf(fid,
"%li\n", data->
n);
572 fprintf(fid,
"%li\n", data->
m);
574 for (i=0; i<data->
n; i++) {
575 for (j=0; j<data->
m; j++)
576 fprintf(fid,
"%.16f ",
matrix_get(data->
Z, data->
m+1, i,
578 fprintf(fid,
"%li\n", predy[i]);
598 int diff, hours, minutes;
600 time_t current_time, lt, gt;
601 struct tm *lclt = NULL;
604 current_time = time(NULL);
605 if (current_time == ((time_t)-1)) {
607 err(
"[GenSVM Error]: Failed to compute the current time.\n");
613 lclt = localtime(&current_time);
615 if (timestr == NULL) {
616 err(
"[GenSVM Error]: Failed to convert time to string.\n");
621 lt = mktime(localtime(&current_time));
622 gt = mktime(gmtime(&current_time));
623 diff = -difftime(gt, lt);
625 minutes = (diff%3600)/60;
626 if (lclt->tm_isdst == 1)
629 sprintf(buffer,
"%s (UTC %+03i:%02i)", timestr, hours, minutes);
#define Calloc(type, size)
void gensvm_write_model(struct GenModel *model, char *output_filename)
Write model to file.
long * ja
column indices, should be of length nnz
double epsilon
stopping criterion for the IM algorithm.
long n_col
number of columns of the original matrix
void err(const char *fmt,...)
Parse a formatted string and write it to standard error.
void gensvm_read_data_libsvm(struct GenData *data, char *data_file)
Read data from a file in LibSVM/SVMlight format.
double p
parameter for the L-p norm in the loss function
#define GENSVM_MAX_LINE_LENGTH
char ** str_split(char *original, const char *delims, int *len_ret)
Split a string on delimiters and return an array of parts.
bool str_contains_char(const char *str, const char c)
Check if a string contains a char.
void gensvm_read_data(struct GenData *dataset, char *data_file)
Read data from file.
#define matrix_get(M, cols, i, j)
void gensvm_read_model(struct GenModel *model, char *model_filename)
Read model from file.
long nnz
number of nonzero elements
bool gensvm_nnz_comparison(long nnz, long rows, long cols)
Check whether the number of nonzeros is such that sparsity is worth it.
int weight_idx
which weights to use (1 = unit, 2 = group)
#define Malloc(type, size)
double * V
augmented weight matrix
long * y
array of class labels, 1..K
void gensvm_time_string(char *buffer)
Get time string with UTC offset.
A structure to represent the data.
double * values
actual nonzero values, should be of length nnz
A structure to represent a single GenSVM model.
char * data_file
filename of the data
long n
number of instances in the dataset
long r
number of eigenvalues (width of Z)
Header file for gensvm_io.c.
double kappa
parameter for the Huber hinge function
long K
number of classes in the dataset
long m
number of predictors (width of RAW)
void exit_input_error(int line_num)
Print an error to the screen and exit (copied from LibSVM)
struct GenSparse * gensvm_dense_to_sparse(double *A, long rows, long cols)
Convert a dense matrix to a GenSparse structure if advantageous.
bool gensvm_could_sparse(double *A, long rows, long cols)
Check if it is worthwhile to convert to a sparse matrix.
#define matrix_set(M, cols, i, j, val)
long get_fmt_long(FILE *fid, char *filename, const char *fmt)
Read a long integer from file following a format.
long n
number of instances
void gensvm_write_predictions(struct GenData *data, long *predy, char *output_filename)
Write predictions to file.
struct GenSparse * gensvm_init_sparse(void)
Initialize a GenSparse structure.
long m
number of predictor variables in the dataset
long * ia
cumulative row lengths, should be of length n_row+1
double get_fmt_double(FILE *fid, char *filename, const char *fmt)
Read a double from file following a format.
double * RAW
augmented raw data matrix
struct GenSparse * spZ
sparse representation of the augmented data matrix
double lambda
regularization parameter in the loss function
void next_line(FILE *fid, char *filename)
Move to next line in file.
void note(const char *fmt,...)
Parse a formatted string and write to the output stream.
long n_row
number of rows of the original matrix