GenSVM
gensvm_consistency.c
Go to the documentation of this file.
1 
33 #include "gensvm_consistency.h"
34 
59 struct GenQueue *gensvm_top_queue(struct GenQueue *q, double percentile)
60 {
61  long i, k, N = 0;
62  double boundary,
63  *perf = Calloc(double, q->N);
64  struct GenQueue *nq = gensvm_init_queue();
65 
66  // find the desired percentile of performance
67  for (i=0; i<q->N; i++) {
68  perf[i] = q->tasks[i]->performance;
69  }
70  boundary = gensvm_percentile(perf, q->N, percentile);
71  note("Boundary of the %g-th percentile determined at: %f\n",
72  percentile, boundary);
73 
74  // find the number of tasks that perform at or above the boundary
75  for (i=0; i<q->N; i++) {
76  if (q->tasks[i]->performance >= boundary)
77  N++;
78  }
79 
80  // create a new queue with the best tasks
81  nq->tasks = Malloc(struct GenTask *, N);
82  k = 0;
83  for (i=0; i<q->N; i++) {
84  if (q->tasks[i]->performance >= boundary)
85  nq->tasks[k++] = gensvm_copy_task(q->tasks[i]);
86  }
87  nq->N = N;
88  nq->i = 0;
89 
90  free(perf);
91  return nq;
92 }
93 
128 void gensvm_consistency_repeats(struct GenQueue *q, long repeats,
129  double percentile)
130 {
131  bool breakout;
132  long i, f, r, N, *cv_idx = NULL;
133  double p, pi, pr, pt,
134  *time = NULL,
135  *std = NULL,
136  *mean = NULL,
137  *perf = NULL;
138  struct GenQueue *nq = NULL;
139  struct GenData **train_folds = NULL,
140  **test_folds = NULL;
141  struct GenModel *model = gensvm_init_model();
142  struct GenTask *task = NULL;
143  struct timespec loop_s, loop_e;
144 
145  nq = gensvm_top_queue(q, percentile);
146  N = nq->N;
147 
148  note("Number of items to check: %li\n", nq->N);
149  std = Calloc(double, N);
150  mean = Calloc(double, N);
151  time = Calloc(double, N);
152  perf = Calloc(double, N*repeats);
153 
154  task = get_next_task(nq);
155 
156  model->n = 0;
157  model->m = task->train_data->m;
158  model->K = task->train_data->K;
159  gensvm_allocate_model(model);
160  gensvm_init_V(NULL, model, task->train_data);
161 
162  cv_idx = Calloc(long, task->train_data->n);
163 
164  train_folds = Malloc(struct GenData *, task->folds);
165  test_folds = Malloc(struct GenData *, task->folds);
166 
167  i = 0;
168  while (task) {
169  gensvm_task_to_model(task, model);
170 
171  time[i] = 0.0;
172  note("(%02li/%02li:%03li)\t", i+1, N, task->ID);
173  for (r=0; r<repeats; r++) {
174  Memset(cv_idx, long, task->train_data->n);
175  gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx);
176  train_folds = Malloc(struct GenData *, task->folds);
177  test_folds = Malloc(struct GenData *, task->folds);
178  for (f=0; f<task->folds; f++) {
179  train_folds[f] = gensvm_init_data();
180  test_folds[f] = gensvm_init_data();
181  gensvm_get_tt_split(task->train_data, train_folds[f],
182  test_folds[f], cv_idx, f);
183  gensvm_kernel_preprocess(model, train_folds[f]);
184  gensvm_kernel_postprocess(model, train_folds[f],
185  test_folds[f]);
186  }
187 
188  Timer(loop_s);
189  p = gensvm_cross_validation(model, train_folds, test_folds,
190  task->folds, task->train_data->n);
191  Timer(loop_e);
192  time[i] += gensvm_elapsed_time(&loop_s, &loop_e);
193  matrix_set(perf, repeats, i, r, p);
194  mean[i] += p/((double) repeats);
195  note("%3.3f\t", p);
196  // this is done because if we reuse the V it's not a
197  // consistency check
198  gensvm_init_V(NULL, model, task->train_data);
199  for (f=0; f<task->folds; f++) {
200  gensvm_free_data(train_folds[f]);
201  gensvm_free_data(test_folds[f]);
202  }
203  free(train_folds);
204  train_folds = NULL;
205 
206  free(test_folds);
207  test_folds = NULL;
208  }
209  for (r=0; r<repeats; r++) {
210  std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i],
211  2.0);
212  }
213  if (r > 1) {
214  std[i] /= ((double) repeats) - 1.0;
215  std[i] = sqrt(std[i]);
216  } else {
217  std[i] = 0.0;
218  }
219  note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i],
220  time[i]);
221  task = get_next_task(nq);
222  i++;
223  }
224 
225  // find the best overall configurations: those with high average
226  // performance and low deviation in the performance
227  note("\nBest overall configuration(s):\n");
228  note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t"
229  "mean_perf\tstd_perf\ttime_perf\n");
230  p = 0.0;
231  breakout = false;
232  while (breakout == false) {
233  pi = gensvm_percentile(mean, N, (100.0-p));
234  pr = gensvm_percentile(std, N, p);
235  pt = gensvm_percentile(time, N, p);
236  for (i=0; i<N; i++)
237  if ((pi - mean[i] < 0.0001) &&
238  (std[i] - pr < 0.0001) &&
239  (time[i] - pt < 0.0001)) {
240  note("(%li)\tw = %li\te = %f\tp = %f\t"
241  "k = %f\tl = %f\t"
242  "mean: %3.3f\tstd: %3.3f\t"
243  "time: %3.3f\n",
244  nq->tasks[i]->ID,
245  nq->tasks[i]->weight_idx,
246  nq->tasks[i]->epsilon,
247  nq->tasks[i]->p,
248  nq->tasks[i]->kappa,
249  nq->tasks[i]->lambda,
250  mean[i],
251  std[i],
252  time[i]);
253  breakout = true;
254  }
255  p += 1.0;
256  }
257 
258  free(cv_idx);
259  gensvm_free_model(model);
260  gensvm_free_queue(nq);
261 
262  free(perf);
263  free(std);
264  free(mean);
265  free(time);
266 }
267 
/**
 * @brief Comparison function for doubles, for use with qsort()
 *
 * @details
 * Orders doubles ascending. A three-way result is returned as qsort()
 * requires: returning only "greater than" would report a smaller first
 * element as equal to the second, which makes the comparator inconsistent.
 * If either value is NaN both comparisons are false and 0 is returned.
 *
 * @param[in] 	elem1 	pointer to the first double
 * @param[in] 	elem2 	pointer to the second double
 *
 * @return 	negative if *elem1 < *elem2, positive if *elem1 > *elem2,
 * 		0 otherwise
 */
int gensvm_dsort(const void *elem1, const void *elem2)
{
	const double t1 = *(const double *) elem1;
	const double t2 = *(const double *) elem2;
	return (t1 > t2) - (t1 < t2);
}
281 
296 double gensvm_percentile(double *values, long N, double p)
297 {
298  if (N == 1)
299  return values[0];
300 
301  long i;
302  double pi, pr, boundary;
303  double *local = Malloc(double, N);
304  for (i=0; i<N; i++)
305  local[i] = values[i];
306 
307  qsort(local, N, sizeof(double), gensvm_dsort);
308  p /= 100.0;
309  p = p*N + 0.5;
310  pi = maximum(minimum(floor(p), N-1), 1);
311  pr = maximum(minimum(p - pi, 1), 0);
312  boundary = (1 - pr)*local[((long) pi)-1] + pr*local[((long) pi)];
313 
314  free(local);
315 
316  return boundary;
317 }
#define Calloc(type, size)
Definition: gensvm_memory.h:40
long folds
number of folds in cross validation
Definition: gensvm_task.h:60
struct GenQueue * gensvm_init_queue(void)
Initialize a GenQueue structure.
Definition: gensvm_queue.c:38
double gensvm_elapsed_time(struct timespec *start, struct timespec *stop)
Calculate the time between two time recordings.
Definition: gensvm_timer.c:58
long ID
numeric id of the task in the queue
Definition: gensvm_task.h:62
double gensvm_cross_validation(struct GenModel *model, struct GenData **train_folds, struct GenData **test_folds, long folds, long n_total)
Run cross validation with a given set of train/test folds.
long K
number of classes
Definition: gensvm_base.h:58
#define Memset(var, type, size)
Definition: gensvm_memory.h:61
#define matrix_get(M, cols, i, j)
struct GenTask * get_next_task(struct GenQueue *q)
Get new GenTask from GenQueue.
Definition: gensvm_queue.c:82
long N
size of task array
Definition: gensvm_queue.h:50
void gensvm_free_model(struct GenModel *model)
Free allocated GenModel struct.
Definition: gensvm_base.c:211
long i
index used for keeping track of the queue
Definition: gensvm_queue.h:52
double performance
performance after cross validation
Definition: gensvm_task.h:84
#define Malloc(type, size)
Definition: gensvm_memory.h:48
struct GenModel * gensvm_init_model(void)
Initialize a GenModel structure.
Definition: gensvm_base.c:102
void gensvm_consistency_repeats(struct GenQueue *q, long repeats, double percentile)
Run repeats of the GenTask structs in GenQueue to find the best configuration.
Simple task queue.
Definition: gensvm_queue.h:47
A structure to represent the data.
Definition: gensvm_base.h:57
A structure to represent a single GenSVM model.
Definition: gensvm_base.h:92
void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, struct GenData *test_data, long *cv_idx, long fold_idx)
Wrapper around sparse/dense versions of this function.
long n
number of instances in the dataset
Definition: gensvm_base.h:97
void gensvm_make_cv_split(long N, long folds, long *cv_idx)
Create a cross validation split vector.
#define maximum(a, b)
int gensvm_dsort(const void *elem1, const void *elem2)
Comparison function for doubles.
void gensvm_free_data(struct GenData *data)
Free allocated GenData struct.
Definition: gensvm_base.c:73
void gensvm_free_queue(struct GenQueue *q)
Free the GenQueue struct.
Definition: gensvm_queue.c:59
A structure for a single task in the queue.
Definition: gensvm_task.h:55
Header file for gensvm_consistency.c.
void gensvm_allocate_model(struct GenModel *model)
Allocate memory for a GenModel.
Definition: gensvm_base.c:144
void gensvm_init_V(struct GenModel *from_model, struct GenModel *to_model, struct GenData *data)
Seed the matrix V from an existing model or using rand.
Definition: gensvm_init.c:57
double gensvm_percentile(double *values, long N, double p)
Calculate the percentile of an array of doubles.
long K
number of classes in the dataset
Definition: gensvm_base.h:95
struct GenTask ** tasks
array of pointers to Task structs
Definition: gensvm_queue.h:48
long m
number of predictors (width of RAW)
Definition: gensvm_base.h:62
struct GenTask * gensvm_copy_task(struct GenTask *t)
Deepcopy a GenTask struct.
Definition: gensvm_task.c:88
#define matrix_set(M, cols, i, j, val)
#define minimum(a, b)
void gensvm_kernel_postprocess(struct GenModel *model, struct GenData *traindata, struct GenData *testdata)
Compute the kernel postprocessing factor.
long n
number of instances
Definition: gensvm_base.h:60
struct GenData * gensvm_init_data(void)
Initialize a GenData structure.
Definition: gensvm_base.c:45
long m
number of predictor variables in the dataset
Definition: gensvm_base.h:99
void gensvm_kernel_preprocess(struct GenModel *model, struct GenData *data)
Do the preprocessing steps needed to perform kernel GenSVM.
Definition: gensvm_kernel.c:75
struct GenQueue * gensvm_top_queue(struct GenQueue *q, double percentile)
Create GenQueue of tasks with performance above a given percentile.
void note(const char *fmt,...)
Parse a formatted string and write to the output stream.
Definition: gensvm_print.c:62
struct GenData * train_data
pointer to the training data
Definition: gensvm_task.h:80
void gensvm_task_to_model(struct GenTask *task, struct GenModel *model)
Copy parameters from GenTask to GenModel.
Definition: gensvm_task.c:122
#define Timer(spec)
Timer macro for easily recording time.
Definition: gensvm_timer.h:37