GenSVM
GenSVMgrid.c
Go to the documentation of this file.
1 
40 #include "gensvm_checks.h"
41 #include "gensvm_cmdarg.h"
42 #include "gensvm_io.h"
43 #include "gensvm_gridsearch.h"
44 #include "gensvm_consistency.h"
45 
// Smallest argc that main() accepts before it prints the help text instead:
// the program name plus the grid file.
49 #define MINARGS 2
50 
// Streams assigned in parse_command_line(): stdout and stderr by default,
// both NULL when the -q option is given. Defined in gensvm_print.c.
51 extern FILE *GENSVM_OUTPUT_FILE;
52 extern FILE *GENSVM_ERROR_FILE;
53 
54 // function declarations
// Prints the usage text and calls exit(EXIT_FAILURE).
55 void exit_with_help(char **argv);
// Handles the option flags and copies the grid file name to input_filename.
56 void parse_command_line(int argc, char **argv, char *input_filename);
// Fills the GenGrid struct from the lines of the grid file.
57 void read_grid_from_file(char *input_filename, struct GenGrid *grid);
58 
69 void exit_with_help(char **argv)
70 {
71  printf("This is GenSVM, version %s.\n", VERSION_STRING);
72  printf("Copyright (C) 2016, G.J.J. van den Burg.\n");
73  printf("This program is free software, see the LICENSE file "
74  "for details.\n\n");
75  printf("Usage: %s [options] grid_file\n", argv[0]);
76  printf("Options:\n");
77  printf("-h | -help : print this help.\n");
78  printf("-q : quiet mode (no output, not even errors!)\n");
79  printf("-x : data files are in LibSVM/SVMlight format\n");
80 
81  exit(EXIT_FAILURE);
82 }
83 
102 int main(int argc, char **argv)
103 {
104  bool libsvm_format = false;
105  char input_filename[GENSVM_MAX_LINE_LENGTH];
106 
107  struct GenGrid *grid = gensvm_init_grid();
108  struct GenData *train_data = gensvm_init_data();
109  struct GenData *test_data = gensvm_init_data();
110  struct GenQueue *q = gensvm_init_queue();
111 
112  if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help")
113  || gensvm_check_argv_eq(argc, argv, "-h") )
114  exit_with_help(argv);
115  parse_command_line(argc, argv, input_filename);
116  libsvm_format = gensvm_check_argv(argc, argv, "-x");
117 
118  note("Reading grid file\n");
119  read_grid_from_file(input_filename, grid);
120 
121  note("Reading data from %s\n", grid->train_data_file);
122  if (libsvm_format)
123  gensvm_read_data_libsvm(train_data, grid->train_data_file);
124  else
125  gensvm_read_data(train_data, grid->train_data_file);
126 
127  // check labels of training data
129  if (!gensvm_check_outcome_contiguous(train_data)) {
130  err("[GenSVM Error]: Class labels should start from 1 and "
131  "have no gaps. Please reformat your data.\n");
132  exit(EXIT_FAILURE);
133  }
134 
135  // check if we are sparse and want nonlinearity
136  if (train_data->Z == NULL && grid->kerneltype != K_LINEAR) {
137  err("[GenSVM Warning]: Sparse matrices with nonlinear kernels "
138  "are not yet supported. Dense matrices will "
139  "be used.\n");
140  train_data->RAW = gensvm_sparse_to_dense(train_data->spZ);
141  train_data->Z = train_data->RAW;
142  gensvm_free_sparse(train_data->spZ);
143  }
144 
145  if (grid->traintype == TT) {
146  err("[GenSVM Warning]: Using test datasets in a grid search "
147  "is not yet supported in GenSVM.\n"
148  " The test dataset will be "
149  "ignored during training.\n");
150  //note("Reading data from %s\n", grid->test_data_file);
151  //gensvm_read_data(test_data, grid->test_data_file);
152  }
153 
154  note("Creating queue\n");
155  gensvm_fill_queue(grid, q, train_data, test_data);
156 
157  srand(time(NULL));
158 
159  note("Starting training\n");
161  note("Training finished\n");
162 
163  if (grid->repeats > 0) {
165  }
166 
168  gensvm_free_grid(grid);
169  gensvm_free_data(train_data);
170  gensvm_free_data(test_data);
171 
172  note("Done.\n");
173  return 0;
174 }
175 
191 void parse_command_line(int argc, char **argv, char *input_filename)
192 {
193  int i;
194 
195  GENSVM_OUTPUT_FILE = stdout;
196  GENSVM_ERROR_FILE = stderr;
197 
198  for (i=1; i<argc; i++) {
199  if (argv[i][0] != '-') break;
200  if (++i>=argc)
201  exit_with_help(argv);
202  switch (argv[i-1][1]) {
203  case 'q':
204  GENSVM_OUTPUT_FILE = NULL;
205  GENSVM_ERROR_FILE = NULL;
206  i--;
207  break;
208  case 'x':
209  i--;
210  break;
211  default:
212  fprintf(stderr, "Unknown option: -%c\n",
213  argv[i-1][1]);
214  exit_with_help(argv);
215  }
216  }
217 
218  if (i >= argc)
219  exit_with_help(argv);
220 
221  strcpy(input_filename, argv[i]);
222 }
223 
236 KernelType parse_kernel_str(char *kernel_line)
237 {
238  if (str_endswith(kernel_line, "LINEAR\n")) {
239  return K_LINEAR;
240  } else if (str_endswith(kernel_line, "POLY\n")) {
241  return K_POLY;
242  } else if (str_endswith(kernel_line, "RBF\n")) {
243  return K_RBF;
244  } else if (str_endswith(kernel_line, "SIGMOID\n")) {
245  return K_SIGMOID;
246  } else {
247  fprintf(stderr, "Unknown kernel specified on line: %s\n",
248  kernel_line);
249  exit(EXIT_FAILURE);
250  }
251 }
252 
268 void read_grid_from_file(char *input_filename, struct GenGrid *grid)
269 {
270  long i, nr = 0;
271  FILE *fid;
272  char buffer[GENSVM_MAX_LINE_LENGTH];
273  char train_filename[GENSVM_MAX_LINE_LENGTH];
274  char test_filename[GENSVM_MAX_LINE_LENGTH];
275  double *params = Calloc(double, GENSVM_MAX_LINE_LENGTH);
276  long *lparams = Calloc(long, GENSVM_MAX_LINE_LENGTH);
277 
278  fid = fopen(input_filename, "r");
279  if (fid == NULL) {
280  fprintf(stderr, "Error opening grid file %s\n",
281  input_filename);
282  exit(EXIT_FAILURE);
283  }
284  grid->traintype = CV;
285  while ( fgets(buffer, GENSVM_MAX_LINE_LENGTH, fid) != NULL ) {
286  Memset(params, double, GENSVM_MAX_LINE_LENGTH);
287  Memset(lparams, long, GENSVM_MAX_LINE_LENGTH);
288  if (str_startswith(buffer, "train:")) {
289  sscanf(buffer, "train: %s\n", train_filename);
290  grid->train_data_file = Calloc(char,
292  strcpy(grid->train_data_file, train_filename);
293  } else if (str_startswith(buffer, "test:")) {
294  sscanf(buffer, "test: %s\n", test_filename);
295  grid->test_data_file = Calloc(char,
297  strcpy(grid->test_data_file, test_filename);
298  grid->traintype = TT;
299  } else if (str_startswith(buffer, "p:")) {
300  nr = all_doubles_str(buffer, 2, params);
301  grid->ps = Calloc(double, nr);
302  for (i=0; i<nr; i++)
303  grid->ps[i] = params[i];
304  grid->Np = nr;
305  } else if (str_startswith(buffer, "lambda:")) {
306  nr = all_doubles_str(buffer, 7, params);
307  grid->lambdas = Calloc(double, nr);
308  for (i=0; i<nr; i++)
309  grid->lambdas[i] = params[i];
310  grid->Nl = nr;
311  } else if (str_startswith(buffer, "kappa:")) {
312  nr = all_doubles_str(buffer, 6, params);
313  grid->kappas = Calloc(double, nr);
314  for (i=0; i<nr; i++)
315  grid->kappas[i] = params[i];
316  grid->Nk = nr;
317  } else if (str_startswith(buffer, "epsilon:")) {
318  nr = all_doubles_str(buffer, 8, params);
319  grid->epsilons = Calloc(double, nr);
320  for (i=0; i<nr; i++)
321  grid->epsilons[i] = params[i];
322  grid->Ne = nr;
323  } else if (str_startswith(buffer, "weight:")) {
324  nr = all_longs_str(buffer, 7, lparams);
325  grid->weight_idxs = Calloc(int, nr);
326  for (i=0; i<nr; i++)
327  grid->weight_idxs[i] = lparams[i];
328  grid->Nw = nr;
329  } else if (str_startswith(buffer, "folds:")) {
330  nr = all_longs_str(buffer, 6, lparams);
331  grid->folds = lparams[0];
332  if (nr > 1)
333  fprintf(stderr, "Field \"folds\" only takes "
334  "one value. Additional "
335  "fields are ignored.\n");
336  } else if (str_startswith(buffer, "repeats:")) {
337  nr = all_longs_str(buffer, 8, lparams);
338  grid->repeats = lparams[0];
339  if (nr > 1)
340  fprintf(stderr, "Field \"repeats\" only "
341  "takes one value. Additional "
342  "fields are ignored.\n");
343  } else if (str_startswith(buffer, "percentile:")) {
344  nr = all_doubles_str(buffer, 11, params);
345  grid->percentile = params[0];
346  if (nr > 1)
347  fprintf(stderr, "Field \"percentile\" only "
348  "takes one value. Additional "
349  "fields are ignored.\n");
350  } else if (str_startswith(buffer, "kernel:")) {
351  grid->kerneltype = parse_kernel_str(buffer);
352  } else if (str_startswith(buffer, "gamma:")) {
353  nr = all_doubles_str(buffer, 6, params);
354  if (grid->kerneltype == K_LINEAR) {
355  fprintf(stderr, "Field \"gamma\" ignored, "
356  "linear kernel is used.\n");
357  grid->Ng = 0;
358  break;
359  }
360  grid->gammas = Calloc(double, nr);
361  for (i=0; i<nr; i++)
362  grid->gammas[i] = params[i];
363  grid->Ng = nr;
364  } else if (str_startswith(buffer, "coef:")) {
365  nr = all_doubles_str(buffer, 5, params);
366  if (grid->kerneltype == K_LINEAR ||
367  grid->kerneltype == K_RBF) {
368  fprintf(stderr, "Field \"coef\" ignored with "
369  "specified kernel.\n");
370  grid->Nc = 0;
371  break;
372  }
373  grid->coefs = Calloc(double, nr);
374  for (i=0; i<nr; i++)
375  grid->coefs[i] = params[i];
376  grid->Nc = nr;
377  } else if (str_startswith(buffer, "degree:")) {
378  nr = all_doubles_str(buffer, 7, params);
379  if (grid->kerneltype != K_POLY) {
380  fprintf(stderr, "Field \"degree\" ignored "
381  "with specified kernel.\n");
382  grid->Nd = 0;
383  break;
384  }
385  grid->degrees = Calloc(double, nr);
386  for (i=0; i<nr; i++)
387  grid->degrees[i] = params[i];
388  grid->Nd = nr;
389  } else {
390  fprintf(stderr, "Cannot find any parameters on line: "
391  "%s\n", buffer);
392  }
393  }
394 
395  free(params);
396  free(lparams);
397  fclose(fid);
398 }
#define Calloc(type, size)
Definition: gensvm_memory.h:40
Structure for describing the entire grid search.
Definition: gensvm_grid.h:67
double * epsilons
array of epsilon values
Definition: gensvm_grid.h:103
void read_grid_from_file(char *input_filename, struct GenGrid *grid)
Read the GenGrid struct from file.
Definition: GenSVMgrid.c:268
struct GenQueue * gensvm_init_queue(void)
Initialize a GenQueue structure.
Definition: gensvm_queue.c:38
#define VERSION_STRING
void err(const char *fmt,...)
Parse a formatted string and write it to standard error.
Definition: gensvm_print.c:84
void gensvm_read_data(struct GenData *dataset, char *data_file)
Read data from file.
Definition: gensvm_io.c:47
#define GENSVM_MAX_LINE_LENGTH
long Np
size of the array of p values
Definition: gensvm_grid.h:79
long Ne
size of the array of epsilon values
Definition: gensvm_grid.h:85
long Nc
size of the array of coef values
Definition: gensvm_grid.h:91
double * degrees
array of degree values
Definition: gensvm_grid.h:109
int gensvm_check_argv_eq(int argc, char **argv, char *str)
Check if a command line argument equals a string.
Definition: gensvm_cmdarg.c:78
long Nk
size of the array of kappa values
Definition: gensvm_grid.h:83
long Nw
size of the array of weight_idx values
Definition: gensvm_grid.h:87
#define Memset(var, type, size)
Definition: gensvm_memory.h:61
void gensvm_free_sparse(struct GenSparse *sp)
Free an allocated GenSparse structure.
Definition: gensvm_sparse.c:62
KernelType kerneltype
type of kernel to use throughout training
Definition: gensvm_grid.h:70
double * Z
Definition: gensvm_base.h:68
long i
index used for keeping track of the queue
Definition: gensvm_queue.h:52
void gensvm_fill_queue(struct GenGrid *grid, struct GenQueue *queue, struct GenData *train_data, struct GenData *test_data)
Initialize a GenQueue from a Training instance.
bool gensvm_check_outcome_contiguous(struct GenData *data)
Check if the labels are contiguous on [1 .. K].
Definition: gensvm_checks.c:43
void parse_command_line(int argc, char **argv, char *input_filename)
Parse command line arguments.
Definition: GenSVMgrid.c:191
Header file for gensvm_checks.c.
long all_longs_str(char *buffer, long offset, long *all_longs)
Read all longs in a given buffer.
double * lambdas
array of lambda values
Definition: gensvm_grid.h:99
Simple task queue.
Definition: gensvm_queue.h:47
A structure to represent the data.
Definition: gensvm_base.h:57
void gensvm_train_queue(struct GenQueue *q)
Run the grid search for a GenQueue.
bool str_endswith(const char *str, const char *suf)
Check if a string ends with a suffix.
void gensvm_consistency_repeats(struct GenQueue *q, long repeats, double percentile)
Run repeats of the GenTask structs in GenQueue to find the best configuration.
char * test_data_file
filename of test data file
Definition: gensvm_grid.h:113
Header file for gensvm_cmdarg.c.
void exit_with_help(char **argv)
Help function.
Definition: GenSVMgrid.c:69
double percentile
percentile to use for the consistency repeats
Definition: gensvm_grid.h:77
long repeats
Definition: gensvm_grid.h:74
Header file for gensvm_gridsearch.c.
long all_doubles_str(char *buffer, long offset, double *all_doubles)
Read all doubles in a given buffer.
bool str_startswith(const char *str, const char *pre)
Check if a string starts with a prefix.
long Nl
size of the array of lambda values
Definition: gensvm_grid.h:81
double * kappas
array of kappa values
Definition: gensvm_grid.h:101
long Ng
size of the array of gamma values
Definition: gensvm_grid.h:89
void gensvm_free_data(struct GenData *data)
Free allocated GenData struct.
Definition: gensvm_base.c:73
void gensvm_free_queue(struct GenQueue *q)
Free the GenQueue struct.
Definition: gensvm_queue.c:59
Header file for gensvm_consistency.c.
Header file for gensvm_io.c.
double * gensvm_sparse_to_dense(struct GenSparse *A)
Convert a GenSparse structure to a dense matrix.
int * weight_idxs
array of weight_idxs
Definition: gensvm_grid.h:95
long folds
number of folds in cross validation
Definition: gensvm_grid.h:72
int gensvm_check_argv(int argc, char **argv, char *str)
Check if any command line arguments contain string.
Definition: gensvm_cmdarg.c:49
char * train_data_file
filename of train data file
Definition: gensvm_grid.h:111
KernelType
type of kernel used in training
double * gammas
array of gamma values
Definition: gensvm_grid.h:105
#define MINARGS
Definition: GenSVMgrid.c:49
int main(int argc, char **argv)
Main interface function for GenSVMgrid.
Definition: GenSVMgrid.c:102
double * coefs
array of coef values
Definition: gensvm_grid.h:107
TrainType traintype
type of training to use
Definition: gensvm_grid.h:68
long Nd
size of the array of degree values
Definition: gensvm_grid.h:93
FILE * GENSVM_OUTPUT_FILE
Definition: gensvm_print.c:33
struct GenData * gensvm_init_data(void)
Initialize a GenData structure.
Definition: gensvm_base.c:45
KernelType parse_kernel_str(char *kernel_line)
Parse the kernel string from the training file.
Definition: GenSVMgrid.c:236
void gensvm_read_data_libsvm(struct GenData *dataset, char *data_file)
Read data from a file in LibSVM/SVMlight format.
Definition: gensvm_io.c:178
double * RAW
augmented raw data matrix
Definition: gensvm_base.h:73
FILE * GENSVM_ERROR_FILE
Definition: gensvm_print.c:43
struct GenSparse * spZ
sparse representation of the augmented data matrix
Definition: gensvm_base.h:71
double * ps
array of p values
Definition: gensvm_grid.h:97
void gensvm_free_grid(struct GenGrid *grid)
Free a GenGrid structure.
Definition: gensvm_grid.c:88
void note(const char *fmt,...)
Parse a formatted string and write to the output stream.
Definition: gensvm_print.c:62
struct GenGrid * gensvm_init_grid(void)
Initialize a GenGrid structure.
Definition: gensvm_grid.c:44