# coding=utf-8
"""
Dataset splitting functions.
"""
import numpy as np
import scipy.sparse as sp
def _shuffle(uids, iids, data, random_state):
shuffle_indices = np.arange(len(uids))
random_state.shuffle(shuffle_indices)
return (uids[shuffle_indices],
iids[shuffle_indices],
data[shuffle_indices])
def random_train_test_split(interactions,
                            test_percentage=0.2,
                            random_state=None):
    """
    Randomly split interactions between training and testing.

    This function takes an interaction set and splits it into
    two disjoint sets, a training set and a test set. Note that
    no effort is made to make sure that all items and users with
    interactions in the test set also have interactions in the
    training set; this may lead to a partial cold-start problem
    in the test set.

    Parameters
    ----------

    interactions: a scipy sparse matrix containing interactions
        The interactions to split.
    test_percentage: float, optional
        The fraction of interactions to place in the test set.
        Must lie in the closed interval [0, 1].
    random_state: np.random.RandomState, optional
        The random state used for the shuffle. A fresh
        ``np.random.RandomState()`` is created when omitted.

    Returns
    -------

    (train, test): (scipy.sparse.coo_matrix,
                    scipy.sparse.coo_matrix)
         A tuple of (train data, test data). Both matrices have the
         same shape and dtype as ``interactions``.

    Raises
    ------

    ValueError
        If ``interactions`` is not a scipy sparse matrix, or if
        ``test_percentage`` is outside [0, 1].
    """

    if not sp.issparse(interactions):
        raise ValueError('Interactions must be a scipy.sparse matrix.')

    # Out-of-range fractions would produce a negative or oversized
    # cutoff and silently yield a meaningless split; the negated chained
    # comparison also rejects NaN.
    if not 0.0 <= test_percentage <= 1.0:
        raise ValueError('test_percentage must be between 0 and 1.')

    if random_state is None:
        random_state = np.random.RandomState()

    interactions = interactions.tocoo()

    shape = interactions.shape
    uids, iids, data = (interactions.row,
                        interactions.col,
                        interactions.data)

    uids, iids, data = _shuffle(uids, iids, data, random_state)

    # int() truncates, so any fractional remainder goes to the test set.
    cutoff = int((1.0 - test_percentage) * len(uids))

    train_idx = slice(None, cutoff)
    test_idx = slice(cutoff, None)

    train = sp.coo_matrix((data[train_idx],
                           (uids[train_idx],
                            iids[train_idx])),
                          shape=shape,
                          dtype=interactions.dtype)
    test = sp.coo_matrix((data[test_idx],
                          (uids[test_idx],
                           iids[test_idx])),
                         shape=shape,
                         dtype=interactions.dtype)

    return train, test