import array
import numpy as np
import scipy.sparse as sp
import sklearn.preprocessing
class _IncrementalCOOMatrix(object):
    """
    Accumulate (row, column, value) triplets one at a time and convert
    them to a SciPy COO matrix on demand.

    The triplets are kept in compact ``array.array`` buffers rather than
    Python lists, so that entries can be appended cheaply.

    Parameters
    ----------
    shape: tuple of (int, int)
        (number of rows, number of columns) of the final matrix.
    dtype: NumPy dtype specification
        Element type of the stored values. Signed 32/64-bit integers and
        32/64-bit floats are supported, given either as a NumPy scalar
        type (``np.float32``), a ``np.dtype`` instance or a dtype string.

    Raises
    ------
    Exception
        If the dtype is not one of the supported types.
    """

    # (NumPy dtype kind, item size in bytes) -> ``array`` module type code.
    # "q" (signed long long) is used for 64-bit integers because "l"
    # (signed long) is only 4 bytes wide on some platforms, which would
    # make the buffer disagree with an int64 view of it.
    _TYPE_FLAGS = {
        ("i", 4): "i",
        ("i", 8): "q",
        ("f", 4): "f",
        ("f", 8): "d",
    }

    def __init__(self, shape, dtype):
        type_flag = self._type_flag(dtype)

        self.shape = shape
        # Stored exactly as supplied so callers see the value they passed.
        self.dtype = dtype

        # Row/column indices are read back as int32 in ``tocoo``; this
        # relies on the "i" type code being 4 bytes wide.
        self.rows = array.array("i")
        self.cols = array.array("i")
        self.data = array.array(type_flag)

    @classmethod
    def _type_flag(cls, dtype):
        """Return the ``array`` type code matching `dtype`, or raise."""
        # np.dtype(None) silently means float64; the original API never
        # accepted None, so keep rejecting it.
        if dtype is None:
            raise Exception("Dtype not supported.")

        try:
            np_dtype = np.dtype(dtype)
        except (TypeError, ValueError):
            raise Exception("Dtype not supported.")

        type_flag = cls._TYPE_FLAGS.get((np_dtype.kind, np_dtype.itemsize))

        if (
            type_flag is None
            # Values are stored in native byte order by ``array``.
            or not np_dtype.isnative
            # Guard against platforms where the C type behind the type
            # code does not have the expected width.
            or array.array(type_flag).itemsize != np_dtype.itemsize
        ):
            raise Exception("Dtype not supported.")

        return type_flag

    def append(self, i, j, v):
        """
        Record value `v` at row `i`, column `j`.

        Raises
        ------
        Exception
            If (i, j) lies outside the matrix shape (including negative
            indices).
        """
        m, n = self.shape

        if i < 0 or j < 0 or i >= m or j >= n:
            raise Exception("Index out of bounds")

        self.rows.append(i)
        self.cols.append(j)
        self.data.append(v)

    def tocoo(self):
        """
        Return the accumulated entries as a ``scipy.sparse.coo_matrix``.

        The NumPy arrays are created with ``np.frombuffer``, which wraps
        the existing buffers instead of copying them, so the returned
        matrix may share memory with this object.
        """
        rows = np.frombuffer(self.rows, dtype=np.int32)
        cols = np.frombuffer(self.cols, dtype=np.int32)
        data = np.frombuffer(self.data, dtype=self.dtype)

        return sp.coo_matrix((data, (rows, cols)), shape=self.shape)

    def __len__(self):
        """Return the number of entries appended so far."""
        return len(self.data)
class _FeatureBuilder(object):
    """
    Turn (entity id, features) records into a sparse feature matrix.

    Parameters
    ----------
    id_mapping: dict
        Maps external entity ids to row indices.
    feature_mapping: dict
        Maps feature names to column indices.
    identity_features: bool
        If true, every entity in `id_mapping` also gets a weight-1.0
        entry in the column that `feature_mapping` assigns to its own id.
    normalize: bool
        If true, the rows of the built matrix are L1-normalized.
    entity_type: str
        Label ("user", "item", ...) used only in error messages.
    """

    def __init__(
        self, id_mapping, feature_mapping, identity_features, normalize, entity_type
    ):
        self._entity_type = entity_type
        self._normalize = normalize
        self._identity_features = identity_features
        self._feature_mapping = feature_mapping
        self._id_mapping = id_mapping

    def features_shape(self):
        """Return (number of entities, number of features)."""
        n_rows = len(self._id_mapping)
        n_cols = len(self._feature_mapping)
        return n_rows, n_cols

    def _iter_features(self, features):
        """
        Yield (feature name, weight) pairs.

        A dict supplies its own weights; any other iterable is treated
        as a collection of feature names, each with weight 1.0.
        """
        if not isinstance(features, dict):
            for name in features:
                yield (name, 1.0)
            return

        for name, weight in features.items():
            yield (name, weight)

    def _process_features(self, datum):
        """
        Yield (row index, column index, weight) for one input record.

        Raises
        ------
        ValueError
            If the record is not a pair, or the entity id or one of the
            feature names is missing from the mappings.
        """
        if len(datum) != 2:
            message = "Expected tuples of ({}_id, features), got {}.".format(
                self._entity_type, datum
            )
            raise ValueError(message)

        entity_id, features = datum

        if entity_id not in self._id_mapping:
            template = "{entity_type} id {entity_id} not in {entity_type} id mappings."
            raise ValueError(
                template.format(entity_type=self._entity_type, entity_id=entity_id)
            )

        row = self._id_mapping[entity_id]

        for name, weight in self._iter_features(features):
            if name not in self._feature_mapping:
                message = "Feature {} not in feature mapping. Call fit first.".format(
                    name
                )
                raise ValueError(message)

            column = self._feature_mapping[name]
            yield (row, column, weight)

    def build(self, data):
        """
        Build the CSR feature matrix for `data`.

        Parameters
        ----------
        data: iterable of (entity id, features)
            `features` is either an iterable of feature names or a dict
            of {feature name: weight}.

        Returns
        -------
        CSR matrix of shape `features_shape()`.

        Raises
        ------
        ValueError
            If normalization is requested and some row has no stored
            entries, or if a record cannot be processed.
        """
        matrix = _IncrementalCOOMatrix(self.features_shape(), np.float32)

        if self._identity_features:
            for entity_id, row in self._id_mapping.items():
                column = self._feature_mapping[entity_id]
                matrix.append(row, column, 1.0)

        for record in data:
            for triplet in self._process_features(record):
                matrix.append(*triplet)

        csr = matrix.tocoo().tocsr()

        if not self._normalize:
            return csr

        # Rows without any stored entry cannot be scaled to sum to one.
        empty_rows = csr.getnnz(1) == 0
        if empty_rows.any():
            raise ValueError(
                "Cannot normalize feature matrix: some rows have zero norm. "
                "Ensure that features were provided for all entries."
            )

        # copy=False: scale the matrix in place.
        sklearn.preprocessing.normalize(csr, norm="l1", copy=False)

        return csr
class Dataset(object):
    """
    Tool for building interaction and feature matrices, taking care of the
    mapping between user/item ids and feature names and internal feature indices.

    To create a dataset:

    - Create an instance of the `Dataset` class.
    - Call `fit` (or `fit_partial`), supplying user/item ids and feature names
      that you want to use in your model. This will create internal mappings that
      translate the ids and feature names to internal indices used by the LightFM
      model.
    - Call `build_interactions` with an iterable of (user id, item id) or (user id,
      item id, weight) to build an interactions and weights matrix.
    - Call `build_user/item_features` with iterables of (user/item id, [features])
      or (user/item id, {feature: feature weight}) to build feature matrices.
    - To add new user/item ids or features, call `fit_partial` again. You will need
      to resize your LightFM model to be able to use the new features.

    Parameters
    ----------
    user_identity_features: bool, optional
        Create a unique feature for every user in addition to other features.
        If true (default), a latent vector will be allocated for every user. This
        is a reasonable default for most applications, but should be set to false
        if there is very little data for every user. For more details see the Notes
        in :doc:`LightFM<lightfm>`.
    item_identity_features: bool, optional
        Create a unique feature for every item in addition to other features.
        If true (default), a latent vector will be allocated for every item. This
        is a reasonable default for most applications, but should be set to false
        if there is very little data for every item. For more details see the Notes
        in :doc:`LightFM<lightfm>`.
    """

    def __init__(self, user_identity_features=True, item_identity_features=True):
        self._user_identity_features = user_identity_features
        self._item_identity_features = item_identity_features

        # External id / feature name -> internal index. Indices are handed
        # out in insertion order by `fit_partial`.
        self._user_id_mapping = {}
        self._item_id_mapping = {}
        self._user_feature_mapping = {}
        self._item_feature_mapping = {}

    def _check_fitted(self):
        """Raise ValueError unless both user and item id mappings are non-empty."""
        if not self._user_id_mapping or not self._item_id_mapping:
            raise ValueError(
                "You must call fit first to build the item and user " "id mappings."
            )

    def fit(self, users, items, user_features=None, item_features=None):
        """
        Fit the user/item id and feature name mappings.

        Calling fit the second time will reset existing mappings.

        Parameters
        ----------
        users: iterable of user ids
        items: iterable of item ids
        user_features: iterable of user features, optional
        item_features: iterable of item features, optional
        """
        # Start from scratch, then reuse the incremental path.
        self._user_id_mapping = {}
        self._item_id_mapping = {}
        self._user_feature_mapping = {}
        self._item_feature_mapping = {}

        return self.fit_partial(users, items, user_features, item_features)

    def fit_partial(
        self, users=None, items=None, user_features=None, item_features=None
    ):
        """
        Fit the user/item id and feature name mappings.

        Calling fit_partial a second time will add new entries to the
        existing mappings; entries that are already present keep their
        indices.

        Parameters
        ----------
        users: iterable of user ids, optional
        items: iterable of item ids, optional
        user_features: iterable of user features, optional
        item_features: iterable of item features, optional
        """
        # setdefault only inserts unseen keys, so len(mapping) is always
        # the next free index and existing indices are never changed.
        if users is not None:
            for user_id in users:
                self._user_id_mapping.setdefault(user_id, len(self._user_id_mapping))

                # With identity features, each user id doubles as a
                # feature name in the user feature mapping.
                if self._user_identity_features:
                    self._user_feature_mapping.setdefault(
                        user_id, len(self._user_feature_mapping)
                    )

        if items is not None:
            for item_id in items:
                self._item_id_mapping.setdefault(item_id, len(self._item_id_mapping))

                if self._item_identity_features:
                    self._item_feature_mapping.setdefault(
                        item_id, len(self._item_feature_mapping)
                    )

        if user_features is not None:
            for user_feature in user_features:
                self._user_feature_mapping.setdefault(
                    user_feature, len(self._user_feature_mapping)
                )

        if item_features is not None:
            for item_feature in item_features:
                self._item_feature_mapping.setdefault(
                    item_feature, len(self._item_feature_mapping)
                )

    def _unpack_datum(self, datum):
        """
        Translate one interaction record into internal indices.

        Parameters
        ----------
        datum: (user_id, item_id) or (user_id, item_id, weight)
            The weight defaults to 1.0 when omitted.

        Returns
        -------
        (user index, item index, weight): tuple

        Raises
        ------
        ValueError
            If the record has the wrong length or refers to an id that
            is not in the fitted mappings.
        """
        if len(datum) == 3:
            (user_id, item_id, weight) = datum
        elif len(datum) == 2:
            (user_id, item_id) = datum
            weight = 1.0
        else:
            raise ValueError(
                "Expecting tuples of (user_id, item_id, weight) "
                "or (user_id, item_id). Got {}".format(datum)
            )

        user_idx = self._user_id_mapping.get(user_id)
        item_idx = self._item_id_mapping.get(item_id)

        if user_idx is None:
            raise ValueError(
                "User id {} not in user id mapping. Make sure "
                "you call the fit method.".format(user_id)
            )

        if item_idx is None:
            raise ValueError(
                "Item id {} not in item id mapping. Make sure "
                "you call the fit method.".format(item_id)
            )

        return (user_idx, item_idx, weight)

    def interactions_shape(self):
        """
        Return a tuple of (num users, num items).
        """
        return (len(self._user_id_mapping), len(self._item_id_mapping))

    def build_interactions(self, data):
        """
        Build an interaction matrix.

        Two matrices will be returned: a (num_users, num_items)
        COO matrix with interactions, and a (num_users, num_items)
        matrix with the corresponding interaction weights.

        Parameters
        ----------
        data: iterable of (user_id, item_id) or (user_id, item_id, weight)
            An iterable of interactions. The user and item ids will be
            translated to internal model indices using the mappings
            constructed during the fit call. If weights are not provided
            they will be assumed to be 1.0.

        Returns
        -------
        (interactions, weights): COO matrix, COO matrix
            Two COO matrices: the interactions matrix
            and the corresponding weights matrix.
        """
        interactions = _IncrementalCOOMatrix(self.interactions_shape(), np.int32)
        weights = _IncrementalCOOMatrix(self.interactions_shape(), np.float32)

        for datum in data:
            user_idx, item_idx, weight = self._unpack_datum(datum)

            # The interactions matrix only marks that an interaction
            # happened; its strength goes into the weights matrix.
            interactions.append(user_idx, item_idx, 1)
            weights.append(user_idx, item_idx, weight)

        return (interactions.tocoo(), weights.tocoo())

    def user_features_shape(self):
        """
        Return the shape of the user features matrix.

        Returns
        -------
        (num user ids, num user features): tuple of ints
            The shape.
        """
        return (len(self._user_id_mapping), len(self._user_feature_mapping))

    def build_user_features(self, data, normalize=True):
        """
        Build a user features matrix out of an iterable of the form
        (user id, [list of feature names]) or (user id, {feature name: feature weight}).

        Parameters
        ----------
        data: iterable of the form
            (user id, [list of feature names]) or (user id,
            {feature name: feature weight}).
            User and feature ids will be translated to internal indices
            constructed during the fit call.
        normalize: bool, optional
            If true, will ensure that feature weights sum to 1 in every row.

        Returns
        -------
        feature matrix: CSR matrix (num users, num features)
            Matrix of user features.
        """
        builder = _FeatureBuilder(
            self._user_id_mapping,
            self._user_feature_mapping,
            self._user_identity_features,
            normalize,
            "user",
        )

        return builder.build(data)

    def item_features_shape(self):
        """
        Return the shape of the item features matrix.

        Returns
        -------
        (num item ids, num item features): tuple of ints
            The shape.
        """
        return (len(self._item_id_mapping), len(self._item_feature_mapping))

    def build_item_features(self, data, normalize=True):
        """
        Build an item features matrix out of an iterable of the form
        (item id, [list of feature names]) or (item id, {feature name: feature weight}).

        Parameters
        ----------
        data: iterable of the form
            (item id, [list of feature names]) or (item id,
            {feature name: feature weight}).
            Item and feature ids will be translated to internal indices
            constructed during the fit call.
        normalize: bool, optional
            If true, will ensure that feature weights sum to 1 in every row.

        Returns
        -------
        feature matrix: CSR matrix (num items, num features)
            Matrix of item features.
        """
        builder = _FeatureBuilder(
            self._item_id_mapping,
            self._item_feature_mapping,
            self._item_identity_features,
            normalize,
            "item",
        )

        return builder.build(data)

    def model_dimensions(self):
        """
        Returns a tuple that characterizes the number of user/item feature
        embeddings in a LightFM model for this dataset.
        """
        return (len(self._user_feature_mapping), len(self._item_feature_mapping))

    def mapping(self):
        """
        Return the constructed mappings.

        Invert these to map internal indices to external ids. The
        returned dictionaries are the dataset's own mapping objects,
        not copies.

        Returns
        -------
        (user id map, user feature map, item id map, item feature map): tuple of dictionaries
        """
        return (
            self._user_id_mapping,
            self._user_feature_mapping,
            self._item_id_mapping,
            self._item_feature_mapping,
        )