"""Dataset splitting functions."""
import numpy as np
import scipy.sparse as sp
def _shuffle(uids, iids, data, random_state):
    """
    Apply one shared random permutation to three aligned arrays.

    The same index permutation is used for all three arrays, so the
    element at position ``k`` of each output still belongs to the same
    (user, item, value) triple.

    Parameters
    ----------

    uids: np.ndarray
        Row (user) indices.
    iids: np.ndarray
        Column (item) indices, same length as ``uids``.
    data: np.ndarray
        Interaction values, same length as ``uids``.
    random_state: np.random.RandomState
        The random state used to draw the permutation.

    Returns
    -------

    (uids, iids, data): tuple of np.ndarray
        Permuted copies of the inputs; the inputs are not modified.
    """

    shuffle_indices = np.arange(len(uids))
    # Shuffle the index array in place rather than the data arrays, so
    # that a single permutation can be applied to all three of them.
    random_state.shuffle(shuffle_indices)

    # Fancy indexing returns new arrays, leaving the caller's untouched.
    return (uids[shuffle_indices],
            iids[shuffle_indices],
            data[shuffle_indices])
# NOTE(review): the ``def`` line for this function is absent from the file as
# received. The parameter names come from the docstring and the body, and
# ``random_state=None`` from the ``is None`` check below; the function name and
# the 0.2 default are reconstructed -- confirm both against the callers.
def random_train_test_split(interactions,
                            test_percentage=0.2,
                            random_state=None):
    """
    Randomly split interactions between training and testing.

    This function takes an interaction set and splits it into
    two disjoint sets, a training set and a test set. Note that
    no effort is made to make sure that all items and users with
    interactions in the test set also have interactions in the
    training set; this may lead to a partial cold-start problem
    in the test set.

    Parameters
    ----------

    interactions: a scipy sparse matrix containing interactions
        The interactions to split.
    test_percentage: float, optional
        The fraction of interactions to place in the test set.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle. When ``None``, a new
        unseeded ``np.random.RandomState`` is created.

    Returns
    -------

    (train, test): (scipy.sparse.coo_matrix, scipy.sparse.coo_matrix)
        A tuple of (train data, test data). Both matrices have the
        same shape as ``interactions``.

    Raises
    ------

    ValueError
        If ``interactions`` is not a scipy sparse matrix.
    """

    if not sp.issparse(interactions):
        raise ValueError('Interactions must be a scipy.sparse matrix.')

    if random_state is None:
        random_state = np.random.RandomState()

    # COO exposes the stored entries as three parallel arrays
    # (row, col, data), which is what gets shuffled and sliced below.
    interactions = interactions.tocoo()

    shape = interactions.shape
    uids, iids, data = (interactions.row,
                        interactions.col,
                        interactions.data)

    uids, iids, data = _shuffle(uids, iids, data, random_state)

    # int() truncates, so when the split is not exact the extra
    # interaction ends up in the test set.
    cutoff = int((1.0 - test_percentage) * len(uids))

    train_idx = slice(None, cutoff)
    test_idx = slice(cutoff, None)

    # Both halves keep the full original shape so that user and item
    # indices stay comparable between train and test.
    train = sp.coo_matrix((data[train_idx],
                           (uids[train_idx],
                            iids[train_idx])),
                          shape=shape,
                          dtype=interactions.dtype)
    test = sp.coo_matrix((data[test_idx],
                          (uids[test_idx],
                           iids[test_idx])),
                         shape=shape,
                         dtype=interactions.dtype)

    return train, test