LCOV - code coverage report
Current view: top level - src - gensvm_cv_util.c (source / functions)
Test: coverage.all
Date: 2017-02-21 18:44:20

                   Hit   Total   Coverage
Lines:             127     127    100.0 %
Functions:           4       4    100.0 %

          Line data    Source code
       1             : /**
       2             :  * @file gensvm_cv_util.c
       3             :  * @author G.J.J. van den Burg
       4             :  * @date 2014-01-07
       5             :  * @brief Functions for cross validation
       6             :  *
       7             :  * @details
       8             :  * This file contains functions for performing cross validation. The function
       9             :  * gensvm_make_cv_split() creates a cross validation vector for non-stratified
      10             :  * cross validation. The function gensvm_get_tt_split() creates a train and
      11             :  * test dataset from a given dataset and a pre-determined CV partition vector.
      12             :  * See individual function documentation for details.
      13             :  *
      14             :  * @copyright
      15             :  Copyright 2016, G.J.J. van den Burg.
      16             : 
      17             :  This file is part of GenSVM.
      18             : 
      19             :  GenSVM is free software: you can redistribute it and/or modify
      20             :  it under the terms of the GNU General Public License as published by
      21             :  the Free Software Foundation, either version 3 of the License, or
      22             :  (at your option) any later version.
      23             : 
      24             :  GenSVM is distributed in the hope that it will be useful,
      25             :  but WITHOUT ANY WARRANTY; without even the implied warranty of
      26             :  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
      27             :  GNU General Public License for more details.
      28             : 
      29             :  You should have received a copy of the GNU General Public License
      30             :  along with GenSVM. If not, see <http://www.gnu.org/licenses/>.
      31             : 
      32             :  */
      33             : 
      34             : #include "gensvm_cv_util.h"
      35             : 
      36             : /**
      37             :  * @brief Create a cross validation split vector
      38             :  *
      39             :  * @details
      40             :  * A pre-allocated vector of length N is created which can be used to define
      41             :  * cross validation splits. Each fold contains between
      42             :  * @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$
      43             :  * instances. Fold indices are assigned to randomly chosen instances,
      44             :  * cycling through the folds, until every fold contains
      45             :  * @f$ \lfloor N / folds \rfloor @f$ instances. The remaining @f$ N \bmod folds @f$
      46             :  * instances are then distributed over the first @f$ N \bmod folds @f$ folds.
      47             :  *
      48             :  * @param[in]           N       number of instances
      49             :  * @param[in]           folds   number of folds
      50             :  * @param[in,out]       cv_idx  array of size N which contains the fold index
      51             :  *                              for each observation on exit
      52             :  *
      53             :  */
      54           2 : void gensvm_make_cv_split(long N, long folds, long *cv_idx)
      55             : {
      56             :         long i, j, idx;
      57             : 
      58         113 :         for (i=0; i<N; i++)
      59         111 :                 cv_idx[i] = 0;
      60             : 
      61           2 :         long big_folds = N%folds;
      62           2 :         long small_fold_size = N/folds;
      63             : 
      64           2 :         j = 0;
      65         108 :         for (i=0; i<small_fold_size*folds; i++)
      66             :                 while (1) {
      67         406 :                         idx = rand()%N;
      68         256 :                         if (cv_idx[idx] == 0) {
      69         106 :                                 cv_idx[idx] = j;
      70         106 :                                 j++;
      71         106 :                                 j%=folds;
      72         106 :                                 break;
      73             :                         }
      74             :                 }
      75           2 :         j = 0;
      76           2 :         i = 0;
      77          18 :         while (i < big_folds) {
      78          14 :                 if (cv_idx[j] == 0) {
      79           5 :                         cv_idx[j] = i++;
      80             :                 }
      81          14 :                 j++;
      82             :         }
      83           2 : }
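
The split vector produced above is typically filled in place and then inspected or passed on to gensvm_get_tt_split(). A minimal sketch of such a call follows; the seed, the number of instances and the number of folds are illustrative, and only the function shown above plus the C standard library is assumed.

        #include <stdio.h>
        #include <stdlib.h>

        #include "gensvm_cv_util.h"

        int main(void)
        {
                long N = 10, folds = 4;
                long *cv_idx = calloc(N, sizeof(long));
                long counts[4] = {0, 0, 0, 0};

                srand(123);                     /* illustrative seed */
                gensvm_make_cv_split(N, folds, cv_idx);

                /* count how many instances ended up in each fold */
                for (long i = 0; i < N; i++)
                        counts[cv_idx[i]]++;
                for (long j = 0; j < folds; j++)
                        printf("fold %ld: %ld instances\n", j, counts[j]);

                free(cv_idx);
                return 0;
        }
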
      84             : 
      85             : /**
      86             :  * @brief Wrapper around sparse/dense versions of this function
      87             :  *
      88             :  * @details
      89             :  * This function tests if the data in the full_data structure is stored in a
      90             :  * dense matrix format or not, and calls gensvm_get_tt_split_dense() or
      91             :  * gensvm_get_tt_split_sparse() accordingly.
      92             :  *
      93             :  * @sa
      94             :  * gensvm_get_tt_split_dense(), gensvm_get_tt_split_sparse()
      95             :  *
      96             :  * @param[in]           full_data       a GenData structure for the entire
      97             :  *                                      dataset
      98             :  * @param[in,out]       train_data      an initialized GenData structure which
      99             :  *                                      on exit contains the training dataset
     100             :  * @param[in,out]       test_data       an initialized GenData structure which
     101             :  *                                      on exit contains the test dataset
     102             :  * @param[in]           cv_idx          a vector of cv partitions created by
     103             :  *                                      gensvm_make_cv_split()
     104             :  * @param[in]           fold_idx        index of the fold which becomes the
     105             :  *                                      test dataset
     106             :  */
     107           2 : void gensvm_get_tt_split(struct GenData *full_data,
     108             :                 struct GenData *train_data, struct GenData *test_data,
     109             :                 long *cv_idx, long fold_idx)
     110             : {
     111           2 :         if (full_data->Z == NULL)
     112           1 :                 gensvm_get_tt_split_sparse(full_data, train_data, test_data,
     113             :                                 cv_idx, fold_idx);
     114             :         else
     115           1 :                 gensvm_get_tt_split_dense(full_data, train_data, test_data,
     116             :                                 cv_idx, fold_idx);
     117           2 : }
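
In a full cross validation run, the wrapper above is called once per fold after a single call to gensvm_make_cv_split(). The sketch below outlines such a loop in a hypothetical helper run_cv_splits(); it assumes a populated GenData structure and the gensvm_init_data() / gensvm_free_data() helpers declared elsewhere in GenSVM, none of which are part of this file.

        #include <stdlib.h>

        #include "gensvm_cv_util.h"

        /* Sketch: run a train/test split for every fold of a populated dataset.
         * gensvm_init_data() and gensvm_free_data() are assumed to be the usual
         * GenSVM helpers; they are not defined in this file. */
        void run_cv_splits(struct GenData *full, long folds)
        {
                long *cv_idx = Calloc(long, full->n);

                gensvm_make_cv_split(full->n, folds, cv_idx);

                for (long f = 0; f < folds; f++) {
                        struct GenData *train = gensvm_init_data();
                        struct GenData *test = gensvm_init_data();

                        /* fold f becomes the test set, the rest is training data */
                        gensvm_get_tt_split(full, train, test, cv_idx, f);

                        /* ... train on `train`, evaluate on `test` ... */

                        gensvm_free_data(train);
                        gensvm_free_data(test);
                }
                free(cv_idx);
        }
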
     118             : 
     119             : /**
     120             :  * @brief Create train and test datasets for a CV split with dense data
     121             :  *
     122             :  * @details
     123             :  * Given a GenData structure for the full dataset, a previously created
     124             :  * cross validation split vector and a fold index, a training and test dataset
     125             :  * are created. It is assumed here that the data is stored as a dense matrix,
     126             :  * and that the train and test data should also be stored as a dense matrix.
     127             :  *
     128             :  * @sa
     129             :  * gensvm_get_tt_split_sparse(), gensvm_get_tt_split()
     130             :  *
     131             :  * @param[in]           full_data       a GenData structure for the entire
     132             :  *                                      dataset
     133             :  * @param[in,out]       train_data      an initialized GenData structure which
     134             :  *                                      on exit contains the training dataset
     135             :  * @param[in,out]       test_data       an initialized GenData structure which
     136             :  *                                      on exit contains the test dataset
     137             :  * @param[in]           cv_idx          a vector of cv partitions created by
     138             :  *                                      gensvm_make_cv_split()
     139             :  * @param[in]           fold_idx        index of the fold which becomes the
     140             :  *                                      test dataset
     141             :  */
     142           1 : void gensvm_get_tt_split_dense(struct GenData *full_data,
     143             :                 struct GenData *train_data, struct GenData *test_data,
     144             :                 long *cv_idx, long fold_idx)
     145             : {
     146             :         long i, j, k, l, test_n, train_n;
     147             : 
     148           1 :         long n = full_data->n;
     149           1 :         long m = full_data->m;
     150           1 :         long K = full_data->K;
     151             : 
     152             :         double value;
     153             : 
     154           1 :         test_n = 0;
     155          11 :         for (i=0; i<n; i++)
     156          10 :                 if (cv_idx[i] == fold_idx)
     157           2 :                         test_n++;
     158           1 :         train_n = n - test_n;
     159             : 
     160           1 :         test_data->n = test_n;
     161           1 :         train_data->n = train_n;
     162             : 
     163           1 :         train_data->K = K;
     164           1 :         test_data->K = K;
     165             : 
     166           1 :         train_data->m = m;
     167           1 :         test_data->m = m;
     168             : 
     169           1 :         train_data->y = Calloc(long, train_n);
     170           1 :         test_data->y = Calloc(long, test_n);
     171             : 
     172           1 :         train_data->RAW = Calloc(double, train_n*(m+1));
     173           1 :         test_data->RAW = Calloc(double, test_n*(m+1));
     174             : 
     175           1 :         k = 0;
     176           1 :         l = 0;
     177          11 :         for (i=0; i<n; i++) {
     178          10 :                 if (cv_idx[i] == fold_idx) {
     179           2 :                         test_data->y[k] = full_data->y[i];
     180           8 :                         for (j=0; j<m+1; j++) {
     181           6 :                                 value = matrix_get(full_data->RAW, m+1, i, j);
     182           6 :                                 matrix_set(test_data->RAW, m+1, k, j, value);
     183             :                         }
     184           2 :                         k++;
     185             :                 } else {
     186           8 :                         train_data->y[l] = full_data->y[i];
     187          32 :                         for (j=0; j<m+1; j++) {
     188          24 :                                 value = matrix_get(full_data->RAW, m+1, i, j);
     189          24 :                                 matrix_set(train_data->RAW, m+1, l, j, value);
     190             :                         }
     191           8 :                         l++;
     192             :                 }
     193             :         }
     194             : 
     195           1 :         train_data->Z = train_data->RAW;
     196           1 :         test_data->Z = test_data->RAW;
     197           1 : }
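
The copy loops above address RAW through GenSVM's row-major matrix macros. Assuming the usual definitions matrix_get(M, cols, i, j) == M[(i)*(cols)+(j)] and the matching matrix_set(), each per-row copy is equivalent to a single memcpy of m+1 doubles, as in this hypothetical helper:

        #include <string.h>

        /* Hypothetical helper: copy row i of a row-major matrix with m+1 columns
         * into row k of dst. Under the assumed layout this matches the
         * matrix_get()/matrix_set() loop above. */
        static void copy_row(double *dst, const double *src, long m, long k, long i)
        {
                memcpy(&dst[k * (m + 1)], &src[i * (m + 1)],
                                (m + 1) * sizeof(double));
        }
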
     198             : 
     199             : 
     200             : /**
     201             :  * @brief Create train and test datasets for a CV split with sparse data
     202             :  *
     203             :  * @details
     204             :  * Given a GenData structure for the full dataset, a previously created
     205             :  * cross validation split vector and a fold index, a training and test dataset
     206             :  * are created. It is assumed here that the data is stored as a sparse matrix,
     207             :  * and that the train and test data should also be stored as a sparse matrix.
     208             :  *
     209             :  * @sa
     210             :  * gensvm_get_tt_split_dense(), gensvm_get_tt_split()
     211             :  *
     212             :  * @param[in]           full_data       a GenData structure for the entire
     213             :  *                                      dataset
     214             :  * @param[in,out]       train_data      an initialized GenData structure which
     215             :  *                                      on exit contains the training dataset
     216             :  * @param[in,out]       test_data       an initialized GenData structure which
     217             :  *                                      on exit contains the test dataset
     218             :  * @param[in]           cv_idx          a vector of cv partitions created by
     219             :  *                                      gensvm_make_cv_split()
     220             :  * @param[in]           fold_idx        index of the fold which becomes the
     221             :  *                                      test dataset
     222             :  */
     223           1 : void gensvm_get_tt_split_sparse(struct GenData *full_data,
     224             :                 struct GenData *train_data, struct GenData *test_data,
     225             :                 long *cv_idx, long fold_idx)
     226             : {
     227             :         long i, j, test_n, train_n, train_nnz, test_nnz, row_nnz, jj,
     228             :              jj_start, jj_end,
     229           1 :              tr_nnz_idx = 0,
     230           1 :              tr_row_idx = 0,
     231           1 :              te_nnz_idx = 0,
     232           1 :              te_row_idx = 0;
     233             : 
     234             :         double value;
     235             : 
     236             :         // determine number of instances in test and train
     237           1 :         test_n = 0;
     238          11 :         for (i=0; i<full_data->n; i++)
     239          10 :                 if (cv_idx[i] == fold_idx)
     240           2 :                         test_n++;
     241           1 :         train_n = full_data->n - test_n;
     242             : 
     243             :         // set n, m, K variables
     244           1 :         train_data->n = train_n;
     245           1 :         train_data->m = full_data->m;
     246           1 :         train_data->K = full_data->K;
     247           1 :         test_data->n = test_n;
     248           1 :         test_data->m = full_data->m;
     249           1 :         test_data->K = full_data->K;
     250             : 
     251             :         // allocate outcome
     252           1 :         train_data->y = Calloc(long, train_n);
     253           1 :         test_data->y = Calloc(long, test_n);
     254             : 
     255             :         // compute train nnz and test nnz
     256           1 :         train_nnz = 0;
     257           1 :         test_nnz = 0;
     258          11 :         for (i=0; i<full_data->n; i++) {
     259          10 :                 row_nnz = full_data->spZ->ia[i+1] - full_data->spZ->ia[i];
     260          10 :                 if (cv_idx[i] == fold_idx) {
     261           2 :                         test_nnz += row_nnz;
     262             :                 } else {
     263           8 :                         train_nnz += row_nnz;
     264             :                 }
     265             :         }
     266             : 
     267             :         // allocate the train GenSparse
     268           1 :         train_data->spZ = gensvm_init_sparse();
     269           1 :         test_data->spZ = gensvm_init_sparse();
     270             : 
     271             :         // set GenSparse variables for train
     272           1 :         train_data->spZ->nnz = train_nnz;
     273           1 :         train_data->spZ->n_row = train_n;
     274           1 :         train_data->spZ->n_col = full_data->m+1;
     275           1 :         train_data->spZ->values = Calloc(double, train_nnz);
     276           1 :         train_data->spZ->ia = Calloc(long, train_n+1);
     277           1 :         train_data->spZ->ja = Calloc(long, train_nnz);
     278             : 
     279             :         // set GenSparse variables for test
     280           1 :         test_data->spZ->nnz = test_nnz;
     281           1 :         test_data->spZ->n_row = test_n;
     282           1 :         test_data->spZ->n_col = full_data->m+1;
     283           1 :         test_data->spZ->values = Calloc(double, test_nnz);
     284           1 :         test_data->spZ->ia = Calloc(long, test_n+1);
     285           1 :         test_data->spZ->ja = Calloc(long, test_nnz);
     286             : 
     287           1 :         tr_nnz_idx = 0;
     288           1 :         tr_row_idx = 0;
     289           1 :         te_nnz_idx = 0;
     290           1 :         te_row_idx = 0;
     291             : 
     292           1 :         test_data->spZ->ia[0] = 0;
     293           1 :         train_data->spZ->ia[0] = 0;
     294          11 :         for (i=0; i<full_data->n; i++) {
     295          10 :                 jj_start = full_data->spZ->ia[i];
     296          10 :                 jj_end = full_data->spZ->ia[i+1];
     297             : 
     298          30 :                 for (jj=jj_start; jj<jj_end; jj++) {
     299          20 :                         j = full_data->spZ->ja[jj];
     300          20 :                         value = full_data->spZ->values[jj];
     301             : 
     302          20 :                         if (cv_idx[i] == fold_idx) {
     303           4 :                                 test_data->spZ->values[te_nnz_idx] = value;
     304           4 :                                 test_data->spZ->ja[te_nnz_idx] = j;
     305           4 :                                 te_nnz_idx++;
     306             :                         } else {
     307          16 :                                 train_data->spZ->values[tr_nnz_idx] = value;
     308          16 :                                 train_data->spZ->ja[tr_nnz_idx] = j;
     309          16 :                                 tr_nnz_idx++;
     310             :                         }
     311             :                 }
     312             : 
     313          10 :                 if (cv_idx[i] == fold_idx) {
     314           2 :                         test_data->y[te_row_idx] = full_data->y[i];
     315           2 :                         test_data->spZ->ia[te_row_idx+1] = te_nnz_idx;
     316           2 :                         te_row_idx++;
     317             :                 } else {
     318           8 :                         train_data->y[tr_row_idx] = full_data->y[i];
     319           8 :                         train_data->spZ->ia[tr_row_idx+1] = tr_nnz_idx;
     320           8 :                         tr_row_idx++;
     321             :                 }
     322             :         }
     323           1 : }
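
The sparse variant above slices the CSR arrays (ia, ja, values) row by row. The standalone sketch below applies the same slicing to plain arrays, without the GenSparse structure; the matrix contents and the fold assignment are made up for illustration.

        #include <stdio.h>

        int main(void)
        {
                /* 4 x 3 CSR matrix (made-up values); rows 1 and 3 form the test fold */
                double values[] = {1.0, 2.0, 3.0, 4.0, 5.0};
                long ja[] = {0, 2, 1, 0, 2};
                long ia[] = {0, 2, 3, 4, 5};    /* row pointers, length n+1 */
                long cv_idx[] = {0, 1, 0, 1};
                long fold_idx = 1, n = 4;

                /* slice out the test rows, exactly as the function above does */
                double te_values[5];
                long te_ja[5], te_ia[5] = {0};
                long te_nnz = 0, te_row = 0;

                for (long i = 0; i < n; i++) {
                        if (cv_idx[i] != fold_idx)
                                continue;
                        for (long jj = ia[i]; jj < ia[i + 1]; jj++) {
                                te_values[te_nnz] = values[jj];
                                te_ja[te_nnz] = ja[jj];
                                te_nnz++;
                        }
                        te_ia[++te_row] = te_nnz;
                }
                printf("test rows: %ld, test nonzeros: %ld\n", te_row, te_nnz);
                return 0;
        }
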

Generated by: LCOV version 1.12