LCOV - code coverage report
Current view: top level - src - gensvm_consistency.c (source / functions) Hit Total Coverage
Test: coverage.all Lines: 38 141 27.0 %
Date: 2017-02-21 18:44:20 Functions: 3 4 75.0 %

          Line data    Source code
       1             : /**
       2             :  * @file gensvm_consistency.c
       3             :  * @author G.J.J. van den Burg
       4             :  * @date 2016-10-24
       5             :  * @brief Functions for running consistency repeats
       6             :  *
       7             :  * @details
       8             :  * When running the grid search, the user may optionally choose to run 
       9             :  * repetitions of the best performing configurations to check if they perform 
      10             :  * well consistently. These are called consistency repeats, and the code 
      11             :  * needed for them is defined here.
      12             :  *
      13             :  * @copyright
      14             :  Copyright 2016, G.J.J. van den Burg.
      15             : 
      16             :  This file is part of GenSVM.
      17             : 
      18             :  GenSVM is free software: you can redistribute it and/or modify
      19             :  it under the terms of the GNU General Public License as published by
      20             :  the Free Software Foundation, either version 3 of the License, or
      21             :  (at your option) any later version.
      22             : 
      23             :  GenSVM is distributed in the hope that it will be useful,
      24             :  but WITHOUT ANY WARRANTY; without even the implied warranty of
      25             :  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      26             :  GNU General Public License for more details.
      27             : 
      28             :  You should have received a copy of the GNU General Public License
      29             :  along with GenSVM. If not, see <http://www.gnu.org/licenses/>.
      30             : 
      31             :  */
      32             : 
      33             : #include "gensvm_consistency.h"
      34             : 
      35             : /**
      36             :  * @brief Create GenQueue of tasks with performance above a given percentile
      37             :  *
      38             :  * @details
      39             :  * This function constructs a GenQueue of the GenTask instances in the 
      40             :  * provided GenQueue which have a performance at or above the given percentile 
      41             :  * of the performance of all elements in the provided GenQueue. This can be 
      42             :  * used to determine which hyperparameter configurations belong to the top x-% 
      43             :  * of all tasks in terms of performance.
      44             :  *
      45             :  * @sa
      46             :  * gensvm_consistency_repeats(), gensvm_percentile()
      47             :  *
      48             :  * @note
      49             :  * This function assumes that for each task in the given GenQueue, the 
      50             :  * GenTask::perf element has been set.
      51             :  *
      52             :  * @param[in]   q               a complete GenQueue struct
      53             :  * @param[in]   percentile      the desired percentile
      54             :  *
      55             :  * @return                      a GenQueue struct with GenTasks which are at
      56             :  *                              or above the desired percentile of performance
      57             :  *
      58             :  */
      59           1 : struct GenQueue *gensvm_top_queue(struct GenQueue *q, double percentile)
      60             : {
      61           1 :         long i, k, N = 0;
      62             :         double boundary,
      63           1 :                *perf = Calloc(double, q->N);
      64           1 :         struct GenQueue *nq = gensvm_init_queue();
      65             : 
      66             :         // find the desired percentile of performance
      67          11 :         for (i=0; i<q->N; i++) {
      68          10 :                 perf[i] = q->tasks[i]->performance;
      69             :         }
      70           1 :         boundary = gensvm_percentile(perf, q->N, percentile);
      71           1 :         note("Boundary of the %g-th percentile determined at: %f\n",
      72             :                         percentile, boundary);
      73             : 
      74             :         // find the number of tasks that perform at or above the boundary
      75          11 :         for (i=0; i<q->N; i++) {
      76          10 :                 if (q->tasks[i]->performance >= boundary)
      77           3 :                         N++;
      78             :         }
      79             : 
      80             :         // create a new queue with the best tasks
      81           1 :         nq->tasks = Malloc(struct GenTask *, N);
      82           1 :         k = 0;
      83          11 :         for (i=0; i<q->N; i++) {
      84          10 :                 if (q->tasks[i]->performance >= boundary)
      85           3 :                         nq->tasks[k++] = gensvm_copy_task(q->tasks[i]);
      86             :         }
      87           1 :         nq->N = N;
      88           1 :         nq->i = 0;
      89             : 
      90           1 :         free(perf);
      91           1 :         return nq;
      92             : }
      93             : 
      94             : /**
      95             :  * @brief Run repeats of the GenTask structs in GenQueue to find the best
      96             :  * configuration
      97             :  *
      98             :  * @details
      99             :  * The best performing tasks in the supplied GenQueue are found by taking 
     100             :  * those GenTask structs that have a performance greater or equal to the given 
     101             :  * percentile of the performance of all tasks. These tasks are then gathered 
     102             :  * in a new GenQueue. For each of the tasks in this new GenQueue the cross 
     103             :  * validation run is repeated a number of times.
     104             :  *
     105             :  * For each of the GenTask configurations that are repeated the mean 
     106             :  * performance, standard deviation of the performance and the mean computation 
     107             :  * time are reported.
     108             :  *
     109             :  * Finally, the overall best tasks are written to the specified output. These 
     110             :  * tasks are selected to have both the highest mean performance, as well as 
     111             :  * the smallest standard deviation in their performance. This is done as 
     112             :  * follows.  First the 99th percentile of task performance and the 1st 
     113             :  * percentile of standard deviation is calculated. If a task exists for which 
     114             :  * the mean performance of the repeats and the standard deviation equals these 
     115             :  * values respectively, this task is found to be the best and is written to 
     116             :  * the output. If no such task exists, the 98th percentile of performance and 
     117             :  * the 2nd percentile of standard deviation is considered. This is repeated 
     118             :  * until an interval is found which contains tasks. If one or more tasks are 
     119             :  * found, this loop stops.
     120             :  *
     121             :  * @param[in]   q               GenQueue of GenTask structs which have already been
     122             :  *                              run and have a GenTask::performance value
     123             :  * @param[in]   repeats         Number of times to repeat the best
     124             :  *                              configurations for consistency
     125             :  * @param[in]   percentile      percentile of performance to determine which
     126             :  *                              tasks to repeat
     127             :  */
     128           0 : void gensvm_consistency_repeats(struct GenQueue *q, long repeats,
     129             :                 double percentile)
     130             : {
     131             :         bool breakout;
     132           0 :         long i, f, r, N, *cv_idx = NULL;
     133             :         double p, pi, pr, pt,
     134           0 :                *time = NULL,
     135           0 :                *std = NULL,
     136           0 :                *mean = NULL,
     137           0 :                *perf = NULL;
     138           0 :         struct GenQueue *nq = NULL;
     139           0 :         struct GenData **train_folds = NULL,
     140           0 :                        **test_folds = NULL;
     141           0 :         struct GenModel *model = gensvm_init_model();
     142           0 :         struct GenTask *task = NULL;
     143             :         struct timespec loop_s, loop_e;
     144             : 
     145           0 :         nq = gensvm_top_queue(q, percentile);
     146           0 :         N = nq->N;
     147             : 
     148           0 :         note("Number of items to check: %li\n", nq->N);
     149           0 :         std = Calloc(double, N);
     150           0 :         mean = Calloc(double, N);
     151           0 :         time = Calloc(double, N);
     152           0 :         perf = Calloc(double, N*repeats);
     153             : 
     154           0 :         task = get_next_task(nq);
     155             : 
     156           0 :         model->n = 0;
     157           0 :         model->m = task->train_data->m;
     158           0 :         model->K = task->train_data->K;
     159           0 :         gensvm_allocate_model(model);
     160           0 :         gensvm_init_V(NULL, model, task->train_data);
     161             : 
     162           0 :         cv_idx = Calloc(long, task->train_data->n);
     163             : 
     164           0 :         train_folds = Malloc(struct GenData *, task->folds);
     165           0 :         test_folds = Malloc(struct GenData *, task->folds);
     166             : 
     167           0 :         i = 0;
     168           0 :         while (task) {
     169           0 :                 gensvm_task_to_model(task, model);
     170             : 
     171           0 :                 time[i] = 0.0;
     172           0 :                 note("(%02li/%02li:%03li)\t", i+1, N, task->ID);
     173           0 :                 for (r=0; r<repeats; r++) {
     174           0 :                         Memset(cv_idx, long, task->train_data->n);
     175           0 :                         gensvm_make_cv_split(task->train_data->n, task->folds, cv_idx);
     176           0 :                         train_folds = Malloc(struct GenData *, task->folds);
     177           0 :                         test_folds = Malloc(struct GenData *, task->folds);
     178           0 :                         for (f=0; f<task->folds; f++) {
     179           0 :                                 train_folds[f] = gensvm_init_data();
     180           0 :                                 test_folds[f] = gensvm_init_data();
     181           0 :                                 gensvm_get_tt_split(task->train_data, train_folds[f],
     182           0 :                                                 test_folds[f], cv_idx, f);
     183           0 :                                 gensvm_kernel_preprocess(model, train_folds[f]);
     184           0 :                                 gensvm_kernel_postprocess(model, train_folds[f],
     185           0 :                                                 test_folds[f]);
     186             :                         }
     187             : 
     188           0 :                         Timer(loop_s);
     189           0 :                         p = gensvm_cross_validation(model, train_folds, test_folds,
     190           0 :                                         task->folds, task->train_data->n);
     191           0 :                         Timer(loop_e);
     192           0 :                         time[i] += gensvm_elapsed_time(&loop_s, &loop_e);
     193           0 :                         matrix_set(perf, repeats, i, r, p);
     194           0 :                         mean[i] += p/((double) repeats);
     195           0 :                         note("%3.3f\t", p);
     196             :                         // this is done because if we reuse the V it's not a
     197             :                         // consistency check
     198           0 :                         gensvm_init_V(NULL, model, task->train_data);
     199           0 :                         for (f=0; f<task->folds; f++) {
     200           0 :                                 gensvm_free_data(train_folds[f]);
     201           0 :                                 gensvm_free_data(test_folds[f]);
     202             :                         }
     203           0 :                         free(train_folds);
     204           0 :                         train_folds = NULL;
     205             : 
     206           0 :                         free(test_folds);
     207           0 :                         test_folds = NULL;
     208             :                 }
     209           0 :                 for (r=0; r<repeats; r++) {
     210           0 :                         std[i] += pow(matrix_get(perf, repeats, i, r) - mean[i],
     211             :                                         2.0);
     212             :                 }
     213           0 :                 if (r > 1) {
     214           0 :                         std[i] /= ((double) repeats) - 1.0;
     215           0 :                         std[i] = sqrt(std[i]);
     216             :                 } else {
     217           0 :                         std[i] = 0.0;
     218             :                 }
     219           0 :                 note("(m = %3.3f, s = %3.3f, t = %3.3f)\n", mean[i], std[i],
     220           0 :                                 time[i]);
     221           0 :                 task = get_next_task(nq);
     222           0 :                 i++;
     223             :         }
     224             : 
     225             :         // find the best overall configurations: those with high average
     226             :         // performance and low deviation in the performance
     227           0 :         note("\nBest overall configuration(s):\n");
     228           0 :         note("ID\tweights\tepsilon\t\tp\t\tkappa\t\tlambda\t\t"
     229             :                         "mean_perf\tstd_perf\ttime_perf\n");
     230           0 :         p = 0.0;
     231           0 :         breakout = false;
     232           0 :         while (breakout == false) {
     233           0 :                 pi = gensvm_percentile(mean, N, (100.0-p));
     234           0 :                 pr = gensvm_percentile(std, N, p);
     235           0 :                 pt = gensvm_percentile(time, N, p);
     236           0 :                 for (i=0; i<N; i++)
     237           0 :                         if ((pi - mean[i] < 0.0001) &&
     238           0 :                                         (std[i] - pr < 0.0001) &&
     239           0 :                                         (time[i] - pt < 0.0001)) {
     240           0 :                                 note("(%li)\tw = %li\te = %f\tp = %f\t"
     241             :                                                 "k = %f\tl = %f\t"
     242             :                                                 "mean: %3.3f\tstd: %3.3f\t"
     243             :                                                 "time: %3.3f\n",
     244           0 :                                                 nq->tasks[i]->ID,
     245           0 :                                                 nq->tasks[i]->weight_idx,
     246           0 :                                                 nq->tasks[i]->epsilon,
     247           0 :                                                 nq->tasks[i]->p,
     248           0 :                                                 nq->tasks[i]->kappa,
     249           0 :                                                 nq->tasks[i]->lambda,
     250           0 :                                                 mean[i],
     251           0 :                                                 std[i],
     252           0 :                                                 time[i]);
     253           0 :                                 breakout = true;
     254             :                         }
     255           0 :                 p += 1.0;
     256             :         }
     257             : 
     258           0 :         free(cv_idx);
     259           0 :         gensvm_free_model(model);
     260           0 :         gensvm_free_queue(nq);
     261             : 
     262           0 :         free(perf);
     263           0 :         free(std);
     264           0 :         free(mean);
     265           0 :         free(time);
     266           0 : }
     267             : 
     268             : /**
     269             :  * @brief Comparison function for doubl
     270             :  *
     271             :  * @param[in]   elem1   number 1
     272             :  * @param[in]   elem2   number 2
     273             :  * @returns             comparison of number 1 larger than number 2
     274             :  */
     275         113 : int gensvm_dsort(const void *elem1, const void *elem2)
     276             : {
     277         113 :         const double t1 = (*(double *) elem1);
     278         113 :         const double t2 = (*(double *) elem2);
     279         113 :         return t1 > t2;
     280             : }
     281             : 
     282             : /**
     283             :  * @brief Calculate the percentile of an array of doubles
     284             :  *
     285             :  * @details
     286             :  * The percentile of performance is used to find the top performing
     287             :  * configurations. Since no standard definition of the percentile exists, we
     288             :  * use the method used in MATLAB and Octave. Since calculating the percentile
     289             :  * requires a sorted list of the values, a local copy is made first.
     290             :  *
     291             :  * @param[in]   values  array of doubles
     292             :  * @param[in]   N       length of the array
     293             :  * @param[in]   p       percentile to calculate ( 0 <= p <= 100.0 ).
     294             :  * @returns             the p-th percentile of the values
     295             :  */
     296           6 : double gensvm_percentile(double *values, long N, double p)
     297             : {
     298           6 :         if (N == 1)
     299           1 :                 return values[0];
     300             : 
     301             :         long i;
     302             :         double pi, pr, boundary;
     303           5 :         double *local = Malloc(double, N);
     304          55 :         for (i=0; i<N; i++)
     305          50 :                 local[i] = values[i];
     306             : 
     307           5 :         qsort(local, N, sizeof(double), gensvm_dsort);
     308           5 :         p /= 100.0;
     309           5 :         p = p*N + 0.5;
     310           5 :         pi = maximum(minimum(floor(p), N-1), 1);
     311           5 :         pr = maximum(minimum(p - pi, 1), 0);
     312           5 :         boundary = (1 - pr)*local[((long) pi)-1] + pr*local[((long) pi)];
     313             : 
     314           5 :         free(local);
     315             : 
     316           5 :         return boundary;
     317             : }

Generated by: LCOV version 1.12