GenSVM
GenSVMtraintest.c
Go to the documentation of this file.
1 
31 #include "gensvm_checks.h"
32 #include "gensvm_cmdarg.h"
33 #include "gensvm_io.h"
34 #include "gensvm_train.h"
35 #include "gensvm_predict.h"
36 
/**
 * Minimum value of argc accepted by main(); with fewer arguments the help
 * text is printed and the program exits.
 */
#define MINARGS 2

/* Output and error streams of the GenSVM library; defined elsewhere. */
extern FILE *GENSVM_OUTPUT_FILE;
extern FILE *GENSVM_ERROR_FILE;

// function declarations
void exit_with_help(char **argv);
void exit_invalid_param(const char *label, char **argv);
void parse_command_line(int argc, char **argv, struct GenModel *model,
		char **model_inputfile, char **training_inputfile,
		char **testing_inputfile, char **model_outputfile,
		char **prediction_outputfile);
51 
62 void exit_with_help(char **argv)
63 {
64  printf("This is GenSVM, version %s.\n", VERSION_STRING);
65  printf("Copyright (C) 2016, G.J.J. van den Burg.\n");
66  printf("This program is free software, see the LICENSE file "
67  "for details.\n\n");
68  printf("Usage: %s [options] training_data [test_data]\n\n", argv[0]);
69  printf("Options:\n");
70  printf("--------\n");
71  printf("-c coef : coefficient for the polynomial and "
72  "sigmoid kernel\n");
73  printf("-d degree : degree for the polynomial kernel\n");
74  printf("-e epsilon : set the value of the stopping "
75  "criterion (epsilon > 0)\n");
76  printf("-g gamma : parameter for the rbf, polynomial or "
77  "sigmoid kernel\n");
78  printf("-h | -help : print this help.\n");
79  printf("-k kappa : set the value of kappa used in the "
80  "Huber hinge (kappa > -1.0)\n");
81  printf("-l lambda : set the value of lambda "
82  "(lambda > 0)\n");
83  printf("-m model_output_file : write model output to file "
84  "(not saved if no file provided)\n");
85  printf("-o prediction_output : write predictions of test data to "
86  "file (uses stdout if not provided)\n");
87  printf("-p p-value : set the value of p in the lp norm "
88  "(1.0 <= p <= 2.0)\n");
89  printf("-q : quiet mode (no output, not even "
90  "errors!)\n");
91  printf("-r rho : choose the weigth specification "
92  "(1 = unit, 2 = group)\n");
93  printf("-s seed_model_file : use previous model as seed for V\n");
94  printf("-t type : kerneltype (0=LINEAR, 1=POLY, 2=RBF, "
95  "3=SIGMOID)\n");
96  printf("-x : data files are in LibSVM/SVMlight "
97  "format\n");
98  printf("\n");
99 
100  exit(EXIT_FAILURE);
101 }
102 
114 int main(int argc, char **argv)
115 {
116  bool libsvm_format = false;
117  long i, *predy = NULL;
118  double performance;
119 
120  char *training_inputfile = NULL,
121  *testing_inputfile = NULL,
122  *model_inputfile = NULL,
123  *model_outputfile = NULL,
124  *prediction_outputfile = NULL;
125 
126  struct GenModel *model = gensvm_init_model();
127  struct GenModel *seed_model = NULL;
128  struct GenData *traindata = gensvm_init_data();
129  struct GenData *testdata = gensvm_init_data();
130 
131  if (argc < MINARGS || gensvm_check_argv(argc, argv, "-help")
132  || gensvm_check_argv_eq(argc, argv, "-h"))
133  exit_with_help(argv);
134 
135  // parse command line arguments
136  parse_command_line(argc, argv, model, &model_inputfile,
137  &training_inputfile, &testing_inputfile,
138  &model_outputfile, &prediction_outputfile);
139  libsvm_format = gensvm_check_argv(argc, argv, "-x");
140 
141  // read data from file
142  if (libsvm_format)
143  gensvm_read_data_libsvm(traindata, training_inputfile);
144  else
145  gensvm_read_data(traindata, training_inputfile);
146 
147  // check labels for consistency
148  if (!gensvm_check_outcome_contiguous(traindata)) {
149  err("[GenSVM Error]: Class labels should start from 1 and "
150  "have no gaps. Please reformat your data.\n");
151  exit(EXIT_FAILURE);
152  }
153 
154  // save data filename to model
155  model->data_file = Calloc(char, GENSVM_MAX_LINE_LENGTH);
156  strcpy(model->data_file, training_inputfile);
157 
158  // check if we are sparse and want nonlinearity
159  if (traindata->Z == NULL && model->kerneltype != K_LINEAR) {
160  err("[GenSVM Warning]: Sparse matrices with nonlinear kernels "
161  "are not yet supported. Dense matrices will "
162  "be used.\n");
163  traindata->RAW = gensvm_sparse_to_dense(traindata->spZ);
164  traindata->Z = traindata->RAW;
165  gensvm_free_sparse(traindata->spZ);
166  }
167 
168  // seed the random number generator
169  srand(time(NULL));
170 
171  // load a seed model from file if it is specified
172  if (gensvm_check_argv_eq(argc, argv, "-s")) {
173  seed_model = gensvm_init_model();
174  gensvm_read_model(seed_model, model_inputfile);
175  }
176 
177  // train the GenSVM model
178  gensvm_train(model, traindata, seed_model);
179 
180  // if we also have a test set, predict labels and write to predictions
181  // to an output file if specified
182  if (testing_inputfile != NULL) {
183  // read the test data
184  if (libsvm_format)
185  gensvm_read_data_libsvm(testdata, testing_inputfile);
186  else
187  gensvm_read_data(testdata, testing_inputfile);
188 
189  // check if we are sparse and want nonlinearity
190  if (testdata->Z == NULL && model->kerneltype != K_LINEAR) {
191  err("[GenSVM Warning]: Sparse matrices with nonlinear "
192  "kernels are not yet supported. Dense "
193  "matrices will be used.\n");
194  testdata->Z = gensvm_sparse_to_dense(testdata->spZ);
195  gensvm_free_sparse(testdata->spZ);
196  }
197 
198  gensvm_kernel_postprocess(model, traindata, testdata);
199 
200  // predict labels
201  predy = Calloc(long, testdata->n);
202  gensvm_predict_labels(testdata, model, predy);
203 
204  if (testdata->y != NULL) {
205  performance = gensvm_prediction_perf(testdata, predy);
206  note("Predictive performance: %3.2f%%\n", performance);
207  }
208 
209  // if output file is specified, write predictions to it
210  if (gensvm_check_argv_eq(argc, argv, "-o")) {
211  gensvm_write_predictions(testdata, predy,
212  prediction_outputfile);
213  note("Prediction written to: %s\n",
214  prediction_outputfile);
215  } else {
216  for (i=0; i<testdata->n; i++)
217  printf("%li ", predy[i]);
218  printf("\n");
219  }
220  }
221 
222  // write model to output file if necessary
223  if (gensvm_check_argv_eq(argc, argv, "-m")) {
224  gensvm_write_model(model, model_outputfile);
225  note("Model written to: %s\n", model_outputfile);
226  }
227 
228  // free everything
229  gensvm_free_model(model);
230  gensvm_free_model(seed_model);
231  gensvm_free_data(traindata);
232  gensvm_free_data(testdata);
233  free(training_inputfile);
234  free(testing_inputfile);
235  free(model_inputfile);
236  free(model_outputfile);
237  free(prediction_outputfile);
238 
239  free(predy);
240 
241  return 0;
242 }
243 
/**
 * @brief Exit with warning about invalid parameter value
 *
 * @details
 * Writes a message naming the offending parameter to stderr and then hands
 * over to exit_with_help(), which prints the usage text and terminates the
 * program. The message always goes to stderr directly, not through the
 * GenSVM error stream.
 *
 * @param[in] 	label 	name of the parameter that received an invalid value
 * @param[in] 	argv 	command line arguments, forwarded to exit_with_help()
 */
void exit_invalid_param(const char *label, char **argv)
{
	fprintf(stderr, "Invalid parameter value for %s.\n\n", label);

	exit_with_help(argv);
}
255 
275 void parse_command_line(int argc, char **argv, struct GenModel *model,
276  char **model_inputfile, char **training_inputfile,
277  char **testing_inputfile, char **model_outputfile,
278  char **prediction_outputfile)
279 {
280  int i;
281 
282  GENSVM_OUTPUT_FILE = stdout;
283  GENSVM_ERROR_FILE = stderr;
284 
285  // parse options
286  // note: flags that don't have an argument should decrement i
287  for (i=1; i<argc; i++) {
288  if (argv[i][0] != '-') break;
289  if (++i>=argc) {
290  exit_with_help(argv);
291  }
292  switch (argv[i-1][1]) {
293  case 'c':
294  model->coef = atof(argv[i]);
295  break;
296  case 'd':
297  model->degree = atof(argv[i]);
298  break;
299  case 'e':
300  model->epsilon = atof(argv[i]);
301  if (model->epsilon <= 0)
302  exit_invalid_param("epsilon", argv);
303  break;
304  case 'g':
305  model->gamma = atof(argv[i]);
306  break;
307  case 'k':
308  model->kappa = atof(argv[i]);
309  if (model->kappa <= -1.0)
310  exit_invalid_param("kappa", argv);
311  break;
312  case 'l':
313  model->lambda = atof(argv[i]);
314  if (model->lambda <= 0)
315  exit_invalid_param("lambda", argv);
316  break;
317  case 's':
318  (*model_inputfile) = Malloc(char,
319  strlen(argv[i])+1);
320  strcpy((*model_inputfile), argv[i]);
321  break;
322  case 'm':
323  (*model_outputfile) = Malloc(char,
324  strlen(argv[i])+1);
325  strcpy((*model_outputfile), argv[i]);
326  break;
327  case 'o':
328  (*prediction_outputfile) = Malloc(char,
329  strlen(argv[i])+1);
330  strcpy((*prediction_outputfile), argv[i]);
331  break;
332  case 'p':
333  model->p = atof(argv[i]);
334  if (model->p < 1.0 || model->p > 2.0)
335  exit_invalid_param("p", argv);
336  break;
337  case 'r':
338  model->weight_idx = atoi(argv[i]);
339  break;
340  case 't':
341  model->kerneltype = atoi(argv[i]);
342  break;
343  case 'q':
344  GENSVM_OUTPUT_FILE = NULL;
345  GENSVM_ERROR_FILE = NULL;
346  i--;
347  break;
348  case 'x':
349  i--;
350  break;
351  default:
352  // this one should always print explicitly to
353  // stderr, even if '-q' is supplied, because
354  // otherwise you can't debug cmdline flags.
355  fprintf(stderr, "Unknown option: -%c\n",
356  argv[i-1][1]);
357  exit_with_help(argv);
358  }
359  }
360  if (i >= argc)
361  exit_with_help(argv);
362 
363  (*training_inputfile) = Malloc(char, strlen(argv[i])+1);
364  strcpy((*training_inputfile), argv[i]);
365  if (i+2 == argc) {
366  (*testing_inputfile) = Malloc(char, strlen(argv[i])+1);
367  strcpy((*testing_inputfile), argv[i+1]);
368  }
369 }
#define Calloc(type, size)
Definition: gensvm_memory.h:40
void gensvm_write_predictions(struct GenData *data, long *predy, char *output_filename)
Write predictions to file.
Definition: gensvm_io.c:556
#define VERSION_STRING
Header file for gensvm_predict.c.
double epsilon
stopping criterion for the IM algorithm.
Definition: gensvm_base.h:101
void err(const char *fmt,...)
Parse a formatted string and write it to standard error.
Definition: gensvm_print.c:84
void gensvm_read_model(struct GenModel *model, char *model_filename)
Read model from file.
Definition: gensvm_io.c:410
double p
parameter for the L-p norm in the loss function
Definition: gensvm_base.h:103
void gensvm_read_data(struct GenData *dataset, char *data_file)
Read data from file.
Definition: gensvm_io.c:47
#define GENSVM_MAX_LINE_LENGTH
int gensvm_check_argv_eq(int argc, char **argv, char *str)
Check if a command line argument equals a string.
Definition: gensvm_cmdarg.c:78
void gensvm_free_sparse(struct GenSparse *sp)
Free an allocated GenSparse structure.
Definition: gensvm_sparse.c:62
double gensvm_prediction_perf(struct GenData *data, long *perdy)
Calculate the predictive performance (percentage correct)
void parse_command_line(int argc, char **argv, struct GenModel *model, char **model_inputfile, char **training_inputfile, char **testing_inputfile, char **model_outputfile, char **prediction_outputfile)
Parse the command line arguments.
double * Z
Definition: gensvm_base.h:68
void gensvm_free_model(struct GenModel *model)
Free allocated GenModel struct.
Definition: gensvm_base.c:211
long i
index used for keeping track of the queue
Definition: gensvm_queue.h:52
int weight_idx
which weights to use (1 = unit, 2 = group)
Definition: gensvm_base.h:93
FILE * GENSVM_OUTPUT_FILE
Definition: gensvm_print.c:33
#define Malloc(type, size)
Definition: gensvm_memory.h:48
bool gensvm_check_outcome_contiguous(struct GenData *data)
Check if the labels are contiguous on [1 .. K].
Definition: gensvm_checks.c:43
Header file for gensvm_checks.c.
long * y
array of class labels, 1..K
Definition: gensvm_base.h:66
struct GenModel * gensvm_init_model(void)
Initialize a GenModel structure.
Definition: gensvm_base.c:102
A structure to represent the data.
Definition: gensvm_base.h:57
void gensvm_train(struct GenModel *model, struct GenData *data, struct GenModel *seed_model)
Utility function for training a GenSVM model.
Definition: gensvm_train.c:44
Header file for gensvm_train.c.
A structure to represent a single GenSVM model.
Definition: gensvm_base.h:92
void gensvm_write_model(struct GenModel *model, char *output_filename)
Write model to file.
Definition: gensvm_io.c:494
void gensvm_predict_labels(struct GenData *testdata, struct GenModel *model, long *predy)
Predict class labels of data given and output in predy.
Header file for gensvm_cmdarg.c.
char * data_file
filename of the data
Definition: gensvm_base.h:134
void exit_with_help(char **argv)
Help function.
int main(int argc, char **argv)
Main interface function for GenSVMtraintest.
void gensvm_free_data(struct GenData *data)
Free allocated GenData struct.
Definition: gensvm_base.c:73
Header file for gensvm_io.c.
double * gensvm_sparse_to_dense(struct GenSparse *A)
Convert a GenSparse structure to a dense matrix.
double kappa
parameter for the Huber hinge function
Definition: gensvm_base.h:105
FILE * GENSVM_ERROR_FILE
Definition: gensvm_print.c:43
double degree
kernel parameter for poly
Definition: gensvm_base.h:113
int gensvm_check_argv(int argc, char **argv, char *str)
Check if any command line arguments contain string.
Definition: gensvm_cmdarg.c:49
double coef
kernel parameter for poly and sigmoid
Definition: gensvm_base.h:111
KernelType kerneltype
type of kernel used in the model
Definition: gensvm_base.h:136
double gamma
kernel parameter for RBF, poly, and sigmoid
Definition: gensvm_base.h:109
void exit_invalid_param(const char *label, char **argv)
Exit with warning about invalid parameter value.
void gensvm_kernel_postprocess(struct GenModel *model, struct GenData *traindata, struct GenData *testdata)
Compute the kernel postprocessing factor.
long n
number of instances
Definition: gensvm_base.h:60
struct GenData * gensvm_init_data(void)
Initialize a GenData structure.
Definition: gensvm_base.c:45
void gensvm_read_data_libsvm(struct GenData *dataset, char *data_file)
Read data from a file in LibSVM/SVMlight format.
Definition: gensvm_io.c:178
double * RAW
augmented raw data matrix
Definition: gensvm_base.h:73
struct GenSparse * spZ
sparse representation of the augmented data matrix
Definition: gensvm_base.h:71
double lambda
regularization parameter in the loss function
Definition: gensvm_base.h:107
void note(const char *fmt,...)
Parse a formatted string and write to the output stream.
Definition: gensvm_print.c:62
#define MINARGS