GenSVM
gensvm_cv_util.c
Go to the documentation of this file.
1 
34 #include "gensvm_cv_util.h"
35 
/**
 * @brief Create a cross validation split vector
 *
 * @details
 * Fills @p cv_idx with one fold index per instance, each in the range
 * 0 .. folds-1. With N = q*folds + r (q = N/folds, r = N%folds), the first
 * r folds end up with q+1 instances and the remaining folds with q.
 *
 * The value 0 is used both as the "not yet assigned" marker and as the index
 * of fold 0. This is intentional: during the random phase an entry that is
 * given fold 0 simply stays available, so only folds 1 .. folds-1 actually
 * claim entries (q each). Everything that is still 0 afterwards belongs to
 * fold 0, and the final pass hands the first r-1 of those surplus entries
 * (in index order, skipping the first one) to folds 1 .. r-1.
 *
 * Random indices are drawn with rand(); seeding is left to the caller.
 *
 * If @p folds is not positive no split can be formed; in that case every
 * entry of @p cv_idx is set to 0 and the function returns.
 *
 * @param[in] 	N 	number of instances
 * @param[in] 	folds 	number of folds
 * @param[out] 	cv_idx 	array of length N, receives the fold index of
 * 			each instance
 */
void gensvm_make_cv_split(long N, long folds, long *cv_idx)
{
	long i, j, idx;

	for (i=0; i<N; i++)
		cv_idx[i] = 0;

	// N%folds and N/folds are undefined for folds == 0, and a negative
	// number of folds has no meaning: leave everything in fold 0.
	if (folds <= 0)
		return;

	long big_folds = N%folds;
	long small_fold_size = N/folds;

	// random phase: cycle through the fold indices and give each one to a
	// randomly drawn entry that is still 0
	j = 0;
	for (i=0; i<small_fold_size*folds; i++) {
		while (1) {
			idx = rand()%N;
			if (cv_idx[idx] == 0) {
				cv_idx[idx] = j;
				j++;
				j%=folds;
				break;
			}
		}
	}

	// remainder phase: walk the array in order and give the entries that
	// are still 0 to folds 0 .. big_folds-1, one entry per fold
	j = 0;
	i = 0;
	while (i < big_folds) {
		if (cv_idx[j] == 0) {
			cv_idx[j] = i++;
		}
		j++;
	}
}
84 
107 void gensvm_get_tt_split(struct GenData *full_data,
108  struct GenData *train_data, struct GenData *test_data,
109  long *cv_idx, long fold_idx)
110 {
111  if (full_data->Z == NULL)
112  gensvm_get_tt_split_sparse(full_data, train_data, test_data,
113  cv_idx, fold_idx);
114  else
115  gensvm_get_tt_split_dense(full_data, train_data, test_data,
116  cv_idx, fold_idx);
117 }
118 
142 void gensvm_get_tt_split_dense(struct GenData *full_data,
143  struct GenData *train_data, struct GenData *test_data,
144  long *cv_idx, long fold_idx)
145 {
146  long i, j, k, l, test_n, train_n;
147 
148  long n = full_data->n;
149  long m = full_data->m;
150  long K = full_data->K;
151 
152  double value;
153 
154  test_n = 0;
155  for (i=0; i<n; i++)
156  if (cv_idx[i] == fold_idx)
157  test_n++;
158  train_n = n - test_n;
159 
160  test_data->n = test_n;
161  train_data->n = train_n;
162 
163  train_data->K = K;
164  test_data->K = K;
165 
166  train_data->m = m;
167  test_data->m = m;
168 
169  train_data->y = Calloc(long, train_n);
170  test_data->y = Calloc(long, test_n);
171 
172  train_data->RAW = Calloc(double, train_n*(m+1));
173  test_data->RAW = Calloc(double, test_n*(m+1));
174 
175  k = 0;
176  l = 0;
177  for (i=0; i<n; i++) {
178  if (cv_idx[i] == fold_idx) {
179  test_data->y[k] = full_data->y[i];
180  for (j=0; j<m+1; j++) {
181  value = matrix_get(full_data->RAW, m+1, i, j);
182  matrix_set(test_data->RAW, m+1, k, j, value);
183  }
184  k++;
185  } else {
186  train_data->y[l] = full_data->y[i];
187  for (j=0; j<m+1; j++) {
188  value = matrix_get(full_data->RAW, m+1, i, j);
189  matrix_set(train_data->RAW, m+1, l, j, value);
190  }
191  l++;
192  }
193  }
194 
195  train_data->Z = train_data->RAW;
196  test_data->Z = test_data->RAW;
197 }
198 
199 
223 void gensvm_get_tt_split_sparse(struct GenData *full_data,
224  struct GenData *train_data, struct GenData *test_data,
225  long *cv_idx, long fold_idx)
226 {
227  long i, j, test_n, train_n, train_nnz, test_nnz, row_nnz, jj,
228  jj_start, jj_end,
229  tr_nnz_idx = 0,
230  tr_row_idx = 0,
231  te_nnz_idx = 0,
232  te_row_idx = 0;
233 
234  double value;
235 
236  // determine number of instances in test and train
237  test_n = 0;
238  for (i=0; i<full_data->n; i++)
239  if (cv_idx[i] == fold_idx)
240  test_n++;
241  train_n = full_data->n - test_n;
242 
243  // set n, m, K variables
244  train_data->n = train_n;
245  train_data->m = full_data->m;
246  train_data->K = full_data->K;
247  test_data->n = test_n;
248  test_data->m = full_data->m;
249  test_data->K = full_data->K;
250 
251  // allocate outcome
252  train_data->y = Calloc(long, train_n);
253  test_data->y = Calloc(long, test_n);
254 
255  // compute train nnz and test nnz
256  train_nnz = 0;
257  test_nnz = 0;
258  for (i=0; i<full_data->n; i++) {
259  row_nnz = full_data->spZ->ia[i+1] - full_data->spZ->ia[i];
260  if (cv_idx[i] == fold_idx) {
261  test_nnz += row_nnz;
262  } else {
263  train_nnz += row_nnz;
264  }
265  }
266 
267  // allocate the train GenSparse
268  train_data->spZ = gensvm_init_sparse();
269  test_data->spZ = gensvm_init_sparse();
270 
271  // set GenSparse variables for train
272  train_data->spZ->nnz = train_nnz;
273  train_data->spZ->n_row = train_n;
274  train_data->spZ->n_col = full_data->m+1;
275  train_data->spZ->values = Calloc(double, train_nnz);
276  train_data->spZ->ia = Calloc(long, train_n+1);
277  train_data->spZ->ja = Calloc(long, train_nnz);
278 
279  // set GenSparse variables for test
280  test_data->spZ->nnz = test_nnz;
281  test_data->spZ->n_row = test_n;
282  test_data->spZ->n_col = full_data->m+1;
283  test_data->spZ->values = Calloc(double, test_nnz);
284  test_data->spZ->ia = Calloc(long, test_n+1);
285  test_data->spZ->ja = Calloc(long, test_nnz);
286 
287  tr_nnz_idx = 0;
288  tr_row_idx = 0;
289  te_nnz_idx = 0;
290  te_row_idx = 0;
291 
292  test_data->spZ->ia[0] = 0;
293  train_data->spZ->ia[0] = 0;
294  for (i=0; i<full_data->n; i++) {
295  jj_start = full_data->spZ->ia[i];
296  jj_end = full_data->spZ->ia[i+1];
297 
298  for (jj=jj_start; jj<jj_end; jj++) {
299  j = full_data->spZ->ja[jj];
300  value = full_data->spZ->values[jj];
301 
302  if (cv_idx[i] == fold_idx) {
303  test_data->spZ->values[te_nnz_idx] = value;
304  test_data->spZ->ja[te_nnz_idx] = j;
305  te_nnz_idx++;
306  } else {
307  train_data->spZ->values[tr_nnz_idx] = value;
308  train_data->spZ->ja[tr_nnz_idx] = j;
309  tr_nnz_idx++;
310  }
311  }
312 
313  if (cv_idx[i] == fold_idx) {
314  test_data->y[te_row_idx] = full_data->y[i];
315  test_data->spZ->ia[te_row_idx+1] = te_nnz_idx;
316  te_row_idx++;
317  } else {
318  train_data->y[tr_row_idx] = full_data->y[i];
319  train_data->spZ->ia[tr_row_idx+1] = tr_nnz_idx;
320  tr_row_idx++;
321  }
322  }
323 }
#define Calloc(type, size)
Definition: gensvm_memory.h:40
long * ja
column indices, should be of length nnz
Definition: gensvm_sparse.h:67
long n_col
number of columns of the original matrix
Definition: gensvm_sparse.h:60
long K
number of classes
Definition: gensvm_base.h:58
#define matrix_get(M, cols, i, j)
void gensvm_get_tt_split(struct GenData *full_data, struct GenData *train_data, struct GenData *test_data, long *cv_idx, long fold_idx)
Wrapper around sparse/dense versions of this function.
void gensvm_get_tt_split_sparse(struct GenData *full_data, struct GenData *train_data, struct GenData *test_data, long *cv_idx, long fold_idx)
Create train and test dataset for a CV split with sparse data.
double * Z
Definition: gensvm_base.h:68
long nnz
number of nonzero elements
Definition: gensvm_sparse.h:56
long * y
array of class labels, 1..K
Definition: gensvm_base.h:66
A structure to represent the data.
Definition: gensvm_base.h:57
double * values
actual nonzero values, should be of length nnz
Definition: gensvm_sparse.h:63
void gensvm_get_tt_split_dense(struct GenData *full_data, struct GenData *train_data, struct GenData *test_data, long *cv_idx, long fold_idx)
Create train and test datasets for a CV split with dense data.
long m
number of predictors (width of RAW)
Definition: gensvm_base.h:62
#define matrix_set(M, cols, i, j, val)
long n
number of instances
Definition: gensvm_base.h:60
struct GenSparse * gensvm_init_sparse(void)
Initialize a GenSparse structure.
Definition: gensvm_sparse.c:38
long * ia
cumulative row lengths, should be of length n_row+1
Definition: gensvm_sparse.h:65
double * RAW
augmented raw data matrix
Definition: gensvm_base.h:73
struct GenSparse * spZ
sparse representation of the augmented data matrix
Definition: gensvm_base.h:71
Header file for gensvm_cv_util.c.
void gensvm_make_cv_split(long N, long folds, long *cv_idx)
Create a cross validation split vector.
long n_row
number of rows of the original matrix
Definition: gensvm_sparse.h:58