Line data Source code
1 : /**
2 : * @file gensvm_consistency.c
3 : * @author G.J.J. van den Burg
4 : * @date 2016-10-24
5 : * @brief Functions for running consistency repeats
6 : *
7 : * @details
8 : * When running the grid search, the user may optionally choose to run
9 : * repetitions of the best performing configurations to check if they perform
10 : * well consistently. These are called consistency repeats, and the code
11 : * needed for them is defined here.
12 : *
13 : * @copyright
14 : Copyright 2016, G.J.J. van den Burg.
15 :
16 : This file is part of GenSVM.
17 :
18 : GenSVM is free software: you can redistribute it and/or modify
19 : it under the terms of the GNU General Public License as published by
20 : the Free Software Foundation, either version 3 of the License, or
21 : (at your option) any later version.
22 :
23 : GenSVM is distributed in the hope that it will be useful,
24 : but WITHOUT ANY WARRANTY; without even the implied warranty of
25 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
26 : GNU General Public License for more details.
27 :
28 : You should have received a copy of the GNU General Public License
29 : along with GenSVM. If not, see <http://www.gnu.org/licenses/>.
30 :
31 : */
32 :
33 : #include "gensvm_consistency.h"
34 :
35 : /**
36 : * @brief Create GenQueue of tasks with performance above a given percentile
37 : *
38 : * @details
39 : * This function constructs a GenQueue of the GenTask instances in the
40 : * provided GenQueue which have a performance at or above the given percentile
41 : * of the performance of all elements in the provided GenQueue. This can be
42 : * used to determine which hyperparameter configurations belong to the top x-%
43 : * of all tasks in terms of performance.
44 : *
45 : * @sa
46 : * gensvm_consistency_repeats(), gensvm_percentile()
47 : *
48 : * @note
49 : * This function assumes that for each task in the given GenQueue, the
50 : * GenTask::perf element has been set.
51 : *
52 : * @param[in] q a complete GenQueue struct
53 : * @param[in] percentile the desired percentile
54 : *
55 : * @return a GenQueue struct with GenTasks which are at
56 : * or above the desired percentile of performance
57 : *
58 : */
59 1 : struct GenQueue *gensvm_top_queue(struct GenQueue *q, double percentile)
60 : {
61 1 : long i, k, N = 0;
62 : double boundary,
63 1 : *perf = Calloc(double, q->N);
64 1 : struct GenQueue *nq = gensvm_init_queue();
65 :
66 : // find the desired percentile of performance
67 11 : for (i=0; i<q->N; i++) {
68 10 : perf[i] = q->tasks[i]->performance;
69 : }
70 1 : boundary = gensvm_percentile(perf, q->N, percentile);
71 1 : note("Boundary of the %g-th percentile determined at: %f\n",
72 : percentile, boundary);
73 :
74 : // find the number of tasks that perform at or above the boundary
75 11 : for (i=0; i<q->N; i++) {
76 10 : if (q->tasks[i]->performance >= boundary)
77 3 : N++;
78 : }
79 :
80 : // create a new queue with the best tasks
81 1 : nq->tasks = Malloc(struct GenTask *, N);
82 1 : k = 0;
83 11 : for (i=0; i<q->N; i++) {
84 10 : if (q->tasks[i]->performance >= boundary)
85 3 : nq->tasks[k++] = gensvm_copy_task(q->tasks[i]);
86 : }
87 1 : nq->N = N;
88 1 : nq->i = 0;
89 :
90 1 : free(perf);
91 1 : return nq;
92 : }
93 :
94 : /**
95 : * @brief Run repeats of the GenTask structs in GenQueue to find the best
96 : * configuration
97 : *
98 : * @details
99 : * The best performing tasks in the supplied GenQueue are found by taking
100 : * those GenTask structs that have a performance greater or equal to the given
101 : * percentile of the performance of all tasks. These tasks are then gathered
102 : * in a new GenQueue. For each of the tasks in this new GenQueue the cross
103 : * validation run is repeated a number of times.
104 : *
105 : * For each of the GenTask configurations that are repeated the mean
106 : * performance, standard deviation of the performance and the mean computation
107 : * time are reported.
108 : *
109 : * Finally, the overall best tasks are written to the specified output. These
110 : * tasks are selected to have both the highest mean performance, as well as
111 : * the smallest standard deviation in their performance. This is done as
112 : * follows. First the 99th percentile of task performance and the 1st
113 : * percentile of standard deviation is calculated. If a task exists for which
114 : * the mean performance of the repeats and the standard deviation equals these
115 : * values respectively, this task is found to be the best and is written to
116 : * the output. If no such task exists, the 98th percentile of performance and
117 : * the 2nd percentile of standard deviation is considered. This is repeated
118 : * until an interval is found which contains tasks. If one or more tasks are
119 : * found, this loop stops.
120 : *
121 : * @param[in] q GenQueue of GenTask structs which have already been
122 : * run and have a GenTask::performance value
123 : * @param[in] repeats Number of times to repeat the best
124 : * configurations for consistency
125 : * @param[in] percentile percentile of performance to determine which
126 : * tasks to repeat
127 : */
128 0 : void gensvm_consistency_repeats(struct GenQueue *q, long repeats,
129 : double percentile)
130 : {
131 : bool breakout;
132 0 : long i, f, r, N, *cv_idx = NULL;
133 : double p, pi, pr, pt,
134 0 : *time = NULL,
135 0 : *std = NULL,
136 0 : *mean = NULL,
137 0 : *perf = NULL;
138 0 : struct GenQueue *nq = NULL;
139 0 : struct GenData **train_folds = NULL,
140 0 : **test_folds = NULL;
141 0 : struct GenModel *model = gensvm_init_model();
142 0 : struct GenTask *task = NULL;
143 : struct timespec loop_s, loop_e;
144 :
145 0 : nq = gensvm_top_queue(q, percentile);
146 0 : N = nq->N;
147 :
148 0 : note("Number of items to check: %li\n", nq->N);
149 0 : std = Calloc(double, N);
150 0 : mean = Calloc(double, N);
151 0 : time = Calloc(double, N);
152 0 : perf = Calloc(double, N*repeats);
153 :
154 0 : task = get_next_task(nq);
155 :
156 0 : model->n = 0;
157 0 : model->m = task->train_data->m;
158 0 : model->K = task->train_data->K;
159 0 : gensvm_allocate_model(model);
160 0 : gensvm_init_V(NULL, model, task->train_data);
161 :
162 0 : cv_idx = Calloc(long, task->train_data->n);
163 :
164 0 : train_folds = Malloc(struct GenData *, task->folds);
165 0 : test_folds = Malloc(struct GenData *, task->folds);
166 :
167 0 : i = 0;
168 0 : while (task) {
169 0 : gensvm_task_to_model(task, model);
170 :
171 0 : time[i] = 0.0;
172 0 : note("(%02li/%02li:%03li)\t", i+1, N, task->ID);
173 0 : for (r=0; r<repeats; r++) {
174 0 : Memset(cv_idx, long, task->train_data->n);
175 0 : gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx);
176 0 : train_folds = Malloc(struct GenData *, task->folds);
177 0 : test_folds = Malloc(struct GenData *, task->folds);
178 0 : for (f=0; f<task->folds; f++) {
179 0 : train_folds[f] = gensvm_init_data();
180 0 : test_folds[f] = gensvm_init_data();
181 0 : gensvm_get_tt_split(task->train_data, train_folds[f],
182 0 : test_folds[f], cv_idx, f);
183 0 : gensvm_kernel_preprocess(model, train_folds[f]);
184 0 : gensvm_kernel_postprocess(model, train_folds[f],
185 0 : test_folds[f]);
186 : }
187 :
188 0 : Timer(loop_s);
189 0 : p = gensvm_cross_validation(model, train_folds, test_folds,
190 0 : task->folds, task->train_data->n);
191 0 : Timer(loop_e);
192 0 : time[i] += gensvm_elapsed_time(&loop_s, &loop_e);
193 0 : matrix_set(perf, repeats, i, r, p);
194 0 : mean[i] += p/((double) repeats);
195 0 : note("%3.3f\t", p);
196 : // this is done because if we reuse the V it's not a
197 : // consistency check
198 0 : gensvm_init_V(NULL, model, task->train_data);
199 0 : for (f=0; f<task->folds; f++) {
200 0 : gensvm_free_data(train_folds[f]);
201 0 : gensvm_free_data(test_folds[f]);
202 : }
203 0 : free(train_folds);
204 0 : train_folds = NULL;
205 :
206 0 : free(test_folds);
207 0 : test_folds = NULL;
208 : }
209 0 : for (r=0; r<repeats; r++) {
210 0 : std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i],
211 : 2.0);
212 : }
213 0 : if (r > 1) {
214 0 : std[i] /= ((double) repeats) - 1.0;
215 0 : std[i] = sqrt(std[i]);
216 : } else {
217 0 : std[i] = 0.0;
218 : }
219 0 : note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i],
220 0 : time[i]);
221 0 : task = get_next_task(nq);
222 0 : i++;
223 : }
224 :
225 : // find the best overall configurations: those with high average
226 : // performance and low deviation in the performance
227 0 : note("\nBest overall configuration(s):\n");
228 0 : note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t"
229 : "mean_perf\tstd_perf\ttime_perf\n");
230 0 : p = 0.0;
231 0 : breakout = false;
232 0 : while (breakout == false) {
233 0 : pi = gensvm_percentile(mean, N, (100.0-p));
234 0 : pr = gensvm_percentile(std, N, p);
235 0 : pt = gensvm_percentile(time, N, p);
236 0 : for (i=0; i<N; i++)
237 0 : if ((pi - mean[i] < 0.0001) &&
238 0 : (std[i] - pr < 0.0001) &&
239 0 : (time[i] - pt < 0.0001)) {
240 0 : note("(%li)\tw = %li\te = %f\tp = %f\t"
241 : "k = %f\tl = %f\t"
242 : "mean: %3.3f\tstd: %3.3f\t"
243 : "time: %3.3f\n",
244 0 : nq->tasks[i]->ID,
245 0 : nq->tasks[i]->weight_idx,
246 0 : nq->tasks[i]->epsilon,
247 0 : nq->tasks[i]->p,
248 0 : nq->tasks[i]->kappa,
249 0 : nq->tasks[i]->lambda,
250 0 : mean[i],
251 0 : std[i],
252 0 : time[i]);
253 0 : breakout = true;
254 : }
255 0 : p += 1.0;
256 : }
257 :
258 0 : free(cv_idx);
259 0 : gensvm_free_model(model);
260 0 : gensvm_free_queue(nq);
261 :
262 0 : free(perf);
263 0 : free(std);
264 0 : free(mean);
265 0 : free(time);
266 0 : }
267 :
268 : /**
269 : * @brief Comparison function for doubl
270 : *
271 : * @param[in] elem1 number 1
272 : * @param[in] elem2 number 2
273 : * @returns comparison of number 1 larger than number 2
274 : */
275 113 : int gensvm_dsort(const void *elem1, const void *elem2)
276 : {
277 113 : const double t1 = (*(double *) elem1);
278 113 : const double t2 = (*(double *) elem2);
279 113 : return t1 > t2;
280 : }
281 :
282 : /**
283 : * @brief Calculate the percentile of an array of doubles
284 : *
285 : * @details
286 : * The percentile of performance is used to find the top performing
287 : * configurations. Since no standard definition of the percentile exists, we
288 : * use the method used in MATLAB and Octave. Since calculating the percentile
289 : * requires a sorted list of the values, a local copy is made first.
290 : *
291 : * @param[in] values array of doubles
292 : * @param[in] N length of the array
293 : * @param[in] p percentile to calculate ( 0 <= p <= 100.0 ).
294 : * @returns the p-th percentile of the values
295 : */
296 6 : double gensvm_percentile(double *values, long N, double p)
297 : {
298 6 : if (N == 1)
299 1 : return values[0];
300 :
301 : long i;
302 : double pi, pr, boundary;
303 5 : double *local = Malloc(double, N);
304 55 : for (i=0; i<N; i++)
305 50 : local[i] = values[i];
306 :
307 5 : qsort(local, N, sizeof(double), gensvm_dsort);
308 5 : p /= 100.0;
309 5 : p = p*N + 0.5;
310 5 : pi = maximum(minimum(floor(p), N-1), 1);
311 5 : pr = maximum(minimum(p - pi, 1), 0);
312 5 : boundary = (1 - pr)*local[((long) pi)-1] + pr*local[((long) pi)];
313 :
314 5 : free(local);
315 :
316 5 : return boundary;
317 : }
|