# Source code for lightfm.cross_validation

# coding=utf-8
"""
Dataset splitting functions.
"""

import numpy as np
import scipy.sparse as sp


def _shuffle(uids, iids, data, random_state):
    """
    Reorder three parallel arrays with one shared random permutation.

    A single index permutation is drawn and applied to every input, so
    the element at position ``k`` of each returned array still comes from
    the same original position.

    Parameters
    ----------

    uids, iids, data: arrays of equal length
        Parallel arrays to reorder. They must support indexing with an
        integer index array (the visible caller passes the ``row``,
        ``col`` and ``data`` arrays of a COO matrix).
    random_state: np.random.RandomState
        Random state whose ``shuffle`` method draws the permutation.

    Returns
    -------

    (uids, iids, data): tuple
        The three inputs reordered by the same permutation. The inputs
        themselves are not modified.
    """

    order = np.arange(len(uids))
    # shuffle() permutes the index array in place.
    random_state.shuffle(order)

    return tuple(column[order] for column in (uids, iids, data))


def random_train_test_split(interactions, test_percentage=0.2, random_state=None):
    """
    Randomly split interactions between training and testing.

    This function takes an interaction set and splits it into
    two disjoint sets, a training set and a test set. Note that
    no effort is made to make sure that all items and users with
    interactions in the test set also have interactions in the
    training set; this may lead to a partial cold-start problem
    in the test set.

    Parameters
    ----------

    interactions: a scipy sparse matrix containing interactions
        The interactions to split.
    test_percentage: float, optional
        The fraction of interactions to place in the test set.
        Must lie between 0.0 and 1.0 inclusive.
    random_state: np.random.RandomState, optional
        The random state used for the shuffle. A freshly seeded
        ``np.random.RandomState`` is created when omitted.

    Returns
    -------

    (train, test): (scipy.sparse.coo_matrix, scipy.sparse.coo_matrix)
        A tuple of (train data, test data). Both matrices have the
        same shape and dtype as ``interactions``.

    Raises
    ------

    ValueError
        If ``interactions`` is not a scipy sparse matrix, or if
        ``test_percentage`` is outside the range [0.0, 1.0].
    """

    if not sp.issparse(interactions):
        raise ValueError('Interactions must be a scipy.sparse matrix.')

    # Out-of-range fractions would otherwise yield a negative or oversized
    # cutoff and silently produce a nonsensical split.
    if not 0.0 <= test_percentage <= 1.0:
        raise ValueError('test_percentage must be between 0.0 and 1.0.')

    if random_state is None:
        random_state = np.random.RandomState()

    interactions = interactions.tocoo()
    shape = interactions.shape

    # Shuffle one index array and apply it to rows, columns and values so
    # that every (row, col, value) triple stays aligned.
    order = np.arange(len(interactions.row))
    random_state.shuffle(order)

    uids = interactions.row[order]
    iids = interactions.col[order]
    data = interactions.data[order]

    # The first `cutoff` shuffled interactions go to train, the rest to test.
    cutoff = int((1.0 - test_percentage) * len(uids))

    train_idx = slice(None, cutoff)
    test_idx = slice(cutoff, None)

    train = sp.coo_matrix((data[train_idx],
                           (uids[train_idx],
                            iids[train_idx])),
                          shape=shape,
                          dtype=interactions.dtype)
    test = sp.coo_matrix((data[test_idx],
                          (uids[test_idx],
                           iids[test_idx])),
                         shape=shape,
                         dtype=interactions.dtype)

    return train, test