Line data Source code
1 : /**
2 : * @file gensvm_cv_util.c
3 : * @author G.J.J. van den Burg
4 : * @date 2014-01-07
5 : * @brief Functions for cross validation
6 : *
7 : * @details
 * This file contains functions for performing cross validation. The function
9 : * gensvm_make_cv_split() creates a cross validation vector for non-stratified
10 : * cross validation. The function gensvm_get_tt_split() creates a train and
11 : * test dataset from a given dataset and a pre-determined CV partition vector.
12 : * See individual function documentation for details.
13 : *
14 : * @copyright
15 : Copyright 2016, G.J.J. van den Burg.
16 :
17 : This file is part of GenSVM.
18 :
19 : GenSVM is free software: you can redistribute it and/or modify
20 : it under the terms of the GNU General Public License as published by
21 : the Free Software Foundation, either version 3 of the License, or
22 : (at your option) any later version.
23 :
24 : GenSVM is distributed in the hope that it will be useful,
25 : but WITHOUT ANY WARRANTY; without even the implied warranty of
26 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 : GNU General Public License for more details.
28 :
29 : You should have received a copy of the GNU General Public License
30 : along with GenSVM. If not, see <http://www.gnu.org/licenses/>.
31 :
32 : */
33 :
34 : #include "gensvm_cv_util.h"
35 :
36 : /**
37 : * @brief Create a cross validation split vector
38 : *
39 : * @details
 * A pre-allocated vector of length N is created, which can be used to define
 * cross validation splits. The folds contain between
 * @f$ \lfloor N / folds \rfloor @f$ and @f$ \lceil N / folds \rceil @f$
 * instances. Instances are assigned to folds randomly until every fold
 * contains @f$ \lfloor N / folds \rfloor @f$ instances. The @f$ N \% folds @f$
 * instances that remain are then distributed over the first
 * @f$ N \% folds @f$ folds.
47 : *
48 : * @param[in] N number of instances
49 : * @param[in] folds number of folds
50 : * @param[in,out] cv_idx array of size N which contains the fold index
51 : * for each observation on exit
52 : *
53 : */
void gensvm_make_cv_split(long N, long folds, long *cv_idx)
{
	long i, j, idx;

	// Mark every instance as unassigned. Note that 0 is also the label of
	// fold 0; this is exploited below: a cell written with j == 0 stays 0
	// and remains eligible for reselection, while writes of j >= 1 are
	// permanent. The bookkeeping still works out -- see the note after
	// the random phase.
	for (i=0; i<N; i++)
		cv_idx[i] = 0;

	// N % folds folds end up with one extra instance
	long big_folds = N%folds;
	// minimum number of instances per fold
	long small_fold_size = N/folds;

	// Random phase: perform small_fold_size*folds assignments, cycling
	// the fold index j so each fold value is written exactly
	// small_fold_size times. Each write targets a random cell that is
	// currently 0. Since writes of j >= 1 are never overwritten, folds
	// 1..folds-1 each hold exactly small_fold_size distinct instances
	// when this loop ends; the remaining small_fold_size + big_folds
	// cells are still 0 and implicitly belong to fold 0.
	// NOTE(review): rand() is used unseeded here; call srand() before
	// this function for a reproducible split.
	j = 0;
	for (i=0; i<small_fold_size*folds; i++)
		while (1) {
			idx = rand()%N;
			if (cv_idx[idx] == 0) {
				cv_idx[idx] = j;
				j++;
				j%=folds;
				break;
			}
		}
	// Distribute the big_folds "extra" instances: walk the array from
	// the front and hand the first big_folds zero-valued cells to folds
	// 0, 1, ..., big_folds-1 (the i == 0 write keeps the cell in fold 0).
	// The untouched zero cells remain in fold 0, so folds
	// 0..big_folds-1 end with small_fold_size+1 instances and the rest
	// with small_fold_size.
	j = 0;
	i = 0;
	while (i < big_folds) {
		if (cv_idx[j] == 0) {
			cv_idx[j] = i++;
		}
		j++;
	}
}
84 :
85 : /**
86 : * @brief Wrapper around sparse/dense versions of this function
87 : *
88 : * @details
89 : * This function tests if the data in the full_data structure is stored in a
90 : * dense matrix format or not, and calls gensvm_get_tt_split_dense() or
91 : * gensvm_get_tt_split_sparse() accordingly.
92 : *
93 : * @sa
94 : * gensvm_get_tt_split_dense(), gensvm_get_tt_split_sparse()
95 : *
96 : * @param[in] full_data a GenData structure for the entire
97 : * dataset
98 : * @param[in,out] train_data an initialized GenData structure which
99 : * on exit contains the training dataset
100 : * @param[in,out] test_data an initialized GenData structure which
101 : * on exit contains the test dataset
102 : * @param[in] cv_idx a vector of cv partitions created by
103 : * gensvm_make_cv_split()
104 : * @param[in] fold_idx index of the fold which becomes the
105 : * test dataset
106 : */
107 2 : void gensvm_get_tt_split(struct GenData *full_data,
108 : struct GenData *train_data, struct GenData *test_data,
109 : long *cv_idx, long fold_idx)
110 : {
111 2 : if (full_data->Z == NULL)
112 1 : gensvm_get_tt_split_sparse(full_data, train_data, test_data,
113 : cv_idx, fold_idx);
114 : else
115 1 : gensvm_get_tt_split_dense(full_data, train_data, test_data,
116 : cv_idx, fold_idx);
117 2 : }
118 :
119 : /**
120 : * @brief Create train and test datasets for a CV split with dense data
121 : *
122 : * @details
123 : * Given a GenData structure for the full dataset, a previously created
124 : * cross validation split vector and a fold index, a training and test dataset
125 : * are created. It is assumed here that the data is stored as a dense matrix,
126 : * and that the train and test data should also be stored as a dense matrix.
127 : *
128 : * @sa
129 : * gensvm_get_tt_split_sparse(), gensvm_get_tt_split()
130 : *
131 : * @param[in] full_data a GenData structure for the entire
132 : * dataset
133 : * @param[in,out] train_data an initialized GenData structure which
134 : * on exit contains the training dataset
135 : * @param[in,out] test_data an initialized GenData structure which
136 : * on exit contains the test dataset
137 : * @param[in] cv_idx a vector of cv partitions created by
138 : * gensvm_make_cv_split()
139 : * @param[in] fold_idx index of the fold which becomes the
140 : * test dataset
141 : */
142 1 : void gensvm_get_tt_split_dense(struct GenData *full_data,
143 : struct GenData *train_data, struct GenData *test_data,
144 : long *cv_idx, long fold_idx)
145 : {
146 : long i, j, k, l, test_n, train_n;
147 :
148 1 : long n = full_data->n;
149 1 : long m = full_data->m;
150 1 : long K = full_data->K;
151 :
152 : double value;
153 :
154 1 : test_n = 0;
155 11 : for (i=0; i<n; i++)
156 10 : if (cv_idx[i] == fold_idx)
157 2 : test_n++;
158 1 : train_n = n - test_n;
159 :
160 1 : test_data->n = test_n;
161 1 : train_data->n = train_n;
162 :
163 1 : train_data->K = K;
164 1 : test_data->K = K;
165 :
166 1 : train_data->m = m;
167 1 : test_data->m = m;
168 :
169 1 : train_data->y = Calloc(long, train_n);
170 1 : test_data->y = Calloc(long, test_n);
171 :
172 1 : train_data->RAW = Calloc(double, train_n*(m+1));
173 1 : test_data->RAW = Calloc(double, test_n*(m+1));
174 :
175 1 : k = 0;
176 1 : l = 0;
177 11 : for (i=0; i<n; i++) {
178 10 : if (cv_idx[i] == fold_idx) {
179 2 : test_data->y[k] = full_data->y[i];
180 8 : for (j=0; j<m+1; j++) {
181 6 : value = matrix_get(full_data->RAW, m+1, i, j);
182 6 : matrix_set(test_data->RAW, m+1, k, j, value);
183 : }
184 2 : k++;
185 : } else {
186 8 : train_data->y[l] = full_data->y[i];
187 32 : for (j=0; j<m+1; j++) {
188 24 : value = matrix_get(full_data->RAW, m+1, i, j);
189 24 : matrix_set(train_data->RAW, m+1, l, j, value);
190 : }
191 8 : l++;
192 : }
193 : }
194 :
195 1 : train_data->Z = train_data->RAW;
196 1 : test_data->Z = test_data->RAW;
197 1 : }
198 :
199 :
200 : /**
201 : * @brief Create train and test dataset for a CV split with sparse data
202 : *
203 : * @details
204 : * Given a GenData structure for the full dataset, a previously created
205 : * cross validation split vector and a fold index, a training and test dataset
206 : * are created. It is assumed here that the data is stored as a sparse matrix,
207 : * and that the train and test data should also be stored as a sparse matrix.
208 : *
209 : * @sa
210 : * gensvm_get_tt_split_dense(), gensvm_get_tt_split()
211 : *
212 : * @param[in] full_data a GenData structure for the entire
213 : * dataset
214 : * @param[in,out] train_data an initialized GenData structure which
215 : * on exit contains the training dataset
216 : * @param[in,out] test_data an initialized GenData structure which
217 : * on exit contains the test dataset
218 : * @param[in] cv_idx a vector of cv partitions created by
219 : * gensvm_make_cv_split()
220 : * @param[in] fold_idx index of the fold which becomes the
221 : * test dataset
222 : */
223 1 : void gensvm_get_tt_split_sparse(struct GenData *full_data,
224 : struct GenData *train_data, struct GenData *test_data,
225 : long *cv_idx, long fold_idx)
226 : {
227 : long i, j, test_n, train_n, train_nnz, test_nnz, row_nnz, jj,
228 : jj_start, jj_end,
229 1 : tr_nnz_idx = 0,
230 1 : tr_row_idx = 0,
231 1 : te_nnz_idx = 0,
232 1 : te_row_idx = 0;
233 :
234 : double value;
235 :
236 : // determine number of instances in test and train
237 1 : test_n = 0;
238 11 : for (i=0; i<full_data->n; i++)
239 10 : if (cv_idx[i] == fold_idx)
240 2 : test_n++;
241 1 : train_n = full_data->n - test_n;
242 :
243 : // set n, m, K variables
244 1 : train_data->n = train_n;
245 1 : train_data->m = full_data->m;
246 1 : train_data->K = full_data->K;
247 1 : test_data->n = test_n;
248 1 : test_data->m = full_data->m;
249 1 : test_data->K = full_data->K;
250 :
251 : // allocate outcome
252 1 : train_data->y = Calloc(long, train_n);
253 1 : test_data->y = Calloc(long, test_n);
254 :
255 : // compute train nnz and test nnz
256 1 : train_nnz = 0;
257 1 : test_nnz = 0;
258 11 : for (i=0; i<full_data->n; i++) {
259 10 : row_nnz = full_data->spZ->ia[i+1] - full_data->spZ->ia[i];
260 10 : if (cv_idx[i] == fold_idx) {
261 2 : test_nnz += row_nnz;
262 : } else {
263 8 : train_nnz += row_nnz;
264 : }
265 : }
266 :
267 : // allocate the train GenSparse
268 1 : train_data->spZ = gensvm_init_sparse();
269 1 : test_data->spZ = gensvm_init_sparse();
270 :
271 : // set GenSparse variables for train
272 1 : train_data->spZ->nnz = train_nnz;
273 1 : train_data->spZ->n_row = train_n;
274 1 : train_data->spZ->n_col = full_data->m+1;
275 1 : train_data->spZ->values = Calloc(double, train_nnz);
276 1 : train_data->spZ->ia = Calloc(long, train_n+1);
277 1 : train_data->spZ->ja = Calloc(long, train_nnz);
278 :
279 : // set GenSparse variables for test
280 1 : test_data->spZ->nnz = test_nnz;
281 1 : test_data->spZ->n_row = test_n;
282 1 : test_data->spZ->n_col = full_data->m+1;
283 1 : test_data->spZ->values = Calloc(double, test_nnz);
284 1 : test_data->spZ->ia = Calloc(long, test_n+1);
285 1 : test_data->spZ->ja = Calloc(long, test_nnz);
286 :
287 1 : tr_nnz_idx = 0;
288 1 : tr_row_idx = 0;
289 1 : te_nnz_idx = 0;
290 1 : te_row_idx = 0;
291 :
292 1 : test_data->spZ->ia[0] = 0;
293 1 : train_data->spZ->ia[0] = 0;
294 11 : for (i=0; i<full_data->n; i++) {
295 10 : jj_start = full_data->spZ->ia[i];
296 10 : jj_end = full_data->spZ->ia[i+1];
297 :
298 30 : for (jj=jj_start; jj<jj_end; jj++) {
299 20 : j = full_data->spZ->ja[jj];
300 20 : value = full_data->spZ->values[jj];
301 :
302 20 : if (cv_idx[i] == fold_idx) {
303 4 : test_data->spZ->values[te_nnz_idx] = value;
304 4 : test_data->spZ->ja[te_nnz_idx] = j;
305 4 : te_nnz_idx++;
306 : } else {
307 16 : train_data->spZ->values[tr_nnz_idx] = value;
308 16 : train_data->spZ->ja[tr_nnz_idx] = j;
309 16 : tr_nnz_idx++;
310 : }
311 : }
312 :
313 10 : if (cv_idx[i] == fold_idx) {
314 2 : test_data->y[te_row_idx] = full_data->y[i];
315 2 : test_data->spZ->ia[te_row_idx+1] = te_nnz_idx;
316 2 : te_row_idx++;
317 : } else {
318 8 : train_data->y[tr_row_idx] = full_data->y[i];
319 8 : train_data->spZ->ia[tr_row_idx+1] = tr_nnz_idx;
320 8 : tr_row_idx++;
321 : }
322 : }
323 1 : }
|