neighbors.models.KNN
The K-Nearest Neighbors algorithm makes predictions using a weighted mean of a subset of similar users. Similarity can be controlled via the metric
argument to the .fit
method, and the number of other users can be controlled with the k
argument to the .predict
method. NOTE: If user similarity cannot be computed or no observed ratings have been made by the top k similar users, this algorithm will fall back to the global mean on observed data for prediction (i.e. like the Mean
model).
Source code in neighbors/models.py
class KNN(Base):
    """
    K-Nearest Neighbors collaborative filtering.

    Predictions are a similarity-weighted mean of the ratings of a subset of
    similar users. Similarity is controlled via the `metric` argument to
    `.fit`, and the number of other users via the `k` argument to `.fit`.

    NOTE: If user similarity cannot be computed, or no observed ratings have
    been made by the top k similar users, this algorithm falls back to the mean
    of the observed data stored by `.fit` (i.e. like the `Mean` model).
    """

    def __init__(
        self, data, mask=None, n_mask_items=None, verbose=True, random_state=None
    ):
        """
        Args:
            data (pd.DataFrame): users x items dataframe
            mask (pd.DataFrame, optional): A boolean dataframe used to split the data into 'observed' and 'missing' datasets. Defaults to None.
            n_mask_items (int/float, optional): number of items to mask out, while the rest are treated as observed; Defaults to None.
            random_state (None, int, RandomState): a seed or random state used for all internal random operations (e.g. randomly mask half the data given n_mask_items = 0.5). Passing None will generate a new random seed. Default None.
            verbose (bool; optional): print any initialization warnings; Default True
        """
        super().__init__(
            data, mask, n_mask_items, random_state=random_state, verbose=verbose
        )
        # Both are populated by .fit()
        self.user_similarity = None
        self.metric = None

    def __repr__(self):
        # Re-open the parent's repr (drop its closing parenthesis) to append the metric
        return f"{super().__repr__()[:-1]}, similarity_metric={self.metric})"

    def fit(
        self,
        k=10,
        metric="correlation",
        axis=0,
        dilate_by_nsamples=None,
        skip_refit=False,
        **kwargs,
    ):
        """Fit collaborative model to train data. Calculate similarity between subjects across items. Repeated calls to fit with a different k can re-use the previously computed user x user similarity matrix via `skip_refit`.

        Args:
            k (int): maximum number of other users to use when making a prediction for a single user. If set to None will use all users. Default 10. Note: it's possible for predictions to come from fewer than k other users if a particular user has fewer similar neighbors with positive similarity scores.
            metric (str; optional): type of similarity. One of 'correlation', 'spearman', 'kendall', 'cosine', or 'pearson'. 'pearson' is just an alias for 'correlation'. Default 'correlation'.
            axis (int): dimension along which to compute mean, 0 = mean across users separately by item, 1 = mean across items separately by user; Default 0
            dilate_by_nsamples (int, optional): forwarded to `.dilate_mask(n_samples=...)` before similarities are computed; None applies no dilation. Default None.
            skip_refit (bool; optional): skip re-estimation of user x user similarity matrix. Faster if only exploring different k and no other model parameters or masks are changing. Ignored when no similarity matrix has been computed yet. Default False.
            **kwargs: accepted but not used by this method.

        Raises:
            ValueError: if `metric` is not one of the supported names.
        """
        metrics = ["pearson", "spearman", "kendall", "cosine", "correlation"]
        if metric not in metrics:
            raise ValueError(f"metric must be one of {metrics}")

        # Remember the name the caller used; pandas only knows 'pearson'
        self.metric = metric
        if metric == "correlation":
            metric = "pearson"

        # Call parent fit which acts as a guard for non-masked data
        super().fit()

        # Re-use the last similarity matrix only if there is one; otherwise
        # skip_refit=True on a fresh model would hand None to _predict.
        if not skip_refit or self.user_similarity is None:
            self.dilate_mask(n_samples=dilate_by_nsamples)
            # Store the mean because we'll use it in cases we can't make a prediction
            # NOTE(review): _predict indexes this by item, which matches axis=0;
            # confirm axis=1 is actually supported end-to-end.
            self.mean = self.masked_data.mean(skipna=True, axis=axis)

            if metric in ["pearson", "kendall", "spearman"]:
                # Fall back to pandas
                sim = self.masked_data.T.corr(method=metric)
            else:
                # Convert distance metrics to similarity (currently only cosine)
                sim = pd.DataFrame(
                    1 - nanpdist(self.masked_data.to_numpy(), metric=metric),
                    index=self.masked_data.index,
                    columns=self.masked_data.index,
                )
            self.user_similarity = sim

        self._predict(k=k)
        self.is_fit = True

    def _predict(self, k):
        """Make predictions using computed user similarities.

        Stores a users x items dataframe in `.predictions`. Users for whom no
        neighbor-based prediction is possible receive the stored mean.

        Args:
            k (int): number of closest neighbors to use; None uses all of them
        """
        predictions = self.masked_data.copy()
        for row_idx in self.masked_data.index:
            preds = self._predict_user(row_idx, k)
            if preds is None:
                # Handle cases where we were unable to make any predictions for this user
                warnings.warn(
                    f"Not enough similar users with data to make any predictions for user {row_idx}. Falling back to global mean for all predictions"
                )
                predictions.loc[row_idx, :] = self.mean.to_numpy()
            else:
                predictions.loc[row_idx] = preds.to_numpy()
        self.predictions = predictions

    def _predict_user(self, row_idx, k):
        """Return per-item predictions for one user, or None if none can be made."""
        # Similarity of this user to all other users, ignoring self-similarity
        top_user_sims = self.user_similarity.loc[row_idx].drop(row_idx)
        if top_user_sims.isnull().all():
            warnings.warn(
                f"User {row_idx} has no variance in their ratings. Impossible to compute similarity with other users. Falling back to global mean for all predictions"
            )
            return None

        # Remove nan users and sort, most similar first
        top_user_sims = top_user_sims.dropna().sort_values(ascending=False)
        if top_user_sims.empty:
            return None

        # Self was already dropped above, so the k nearest neighbors are
        # exactly the first k entries (not k + 1).
        if k is not None:
            top_user_sims = top_user_sims.iloc[:k]

        # Rescale similarity scores to the range 0 - 1, which has the effect of
        # zeroing out negative similarities for currently supported metrics.
        # NOTE: revisit for non-normalized similarity metrics e.g. euclidean distance
        top_user_sims = top_user_sims.clip(lower=0, upper=1)
        if not top_user_sims.to_numpy().any():
            # No top users with positive similarity
            return None

        # Observed ratings from the top users (neighbors x items)
        top_user_ratings = self.masked_data.loc[top_user_sims.index, :]
        # Scale each neighbor's ratings by their similarity; unobserved stay NaN
        weighted = top_user_ratings.mul(top_user_sims, axis=0)
        # Numerator: sum of weighted ratings per item, ignoring NaNs
        rating_sums = weighted.sum()
        # Denominator: per item, sum only the weights of neighbors who actually
        # rated that item, since availability varies between 0 and k per item
        sim_sums = weighted.notnull().mul(top_user_sims, axis=0).sum()
        preds = rating_sums / sim_sums

        # Items that none of the usable neighbors rated get the stored mean
        unpredicted = preds.isnull()
        if unpredicted.any():
            preds[unpredicted] = self.mean[unpredicted]
        return preds

    def plot_user_similarity(
        self, figsize=(8, 8), label_fontsize=16, hide_title=False, heatmap_kwargs=None
    ):
        """
        Plot a heatmap of user x user similarities learned on the observed data

        Args:
            figsize (tuple, optional): matplotlib figure size. Defaults to (8, 8).
            label_fontsize (int; optional): fontsize for title text; Default 16
            hide_title (bool; optional): hide title containing metric information; Default False
            heatmap_kwargs (dict, optional): additional arguments to seaborn.heatmap; these override the defaults chosen here (vmin, vmax, cmap, square).

        Returns:
            ax: matplotlib axis handle

        Raises:
            ValueError: if the model has not been fit.
        """
        if not self.is_fit:
            raise ValueError("Model has not been fit")

        # All correlation-type metrics, kendall included, range over [-1, 1]
        if self.metric in ["correlation", "pearson", "spearman", "kendall"]:
            vmin, vmax, cmap = -1, 1, "RdBu_r"
        else:
            vmin, vmax, cmap = 0, 1, None

        # Merge rather than pass both, so user-supplied vmin/vmax/cmap/square
        # override the defaults instead of raising a duplicate-keyword TypeError
        plot_kwargs = {"vmin": vmin, "vmax": vmax, "cmap": cmap, "square": True}
        plot_kwargs.update(heatmap_kwargs or {})

        _, ax = plt.subplots(1, 1, figsize=figsize)
        ax.set(xlabel=None, ylabel=None)
        ax = sns.heatmap(self.user_similarity, ax=ax, **plot_kwargs)
        if not hide_title:
            ax.set_title(f"Metric: {self.metric}", fontsize=label_fontsize)
        return ax
__init__(self, data, mask=None, n_mask_items=None, verbose=True, random_state=None)
special
Parameters:
Name | Type | Description | Default |
---|---|---|---|
data |
pd.DataFrame |
users x items dataframe |
required |
mask |
pd.DataFrame |
A boolean dataframe used to split the data into 'observed' and 'missing' datasets. Defaults to None. |
None |
n_mask_items |
int/float |
number of items to mask out, while the rest are treated as observed; Defaults to None. |
None |
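data_range |
int/float |
max - min of the data; Default computed from the input data. This is useful to set manually in case the input data do not span the full range of possible values. Note: this entry comes from the docstring; `data_range` is not a parameter of the `__init__` signature shown above. |
not in signature |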
random_state |
None, int, RandomState |
a seed or random state used for all internal random operations (e.g. randomly mask half the data given n_mask_items = 0.5). Passing None will generate a new random seed. Default None. |
None |
verbose |
bool; optional |
print any initialization warnings; Default True |
True |
Source code in neighbors/models.py
def __init__(
    self, data, mask=None, n_mask_items=None, verbose=True, random_state=None
):
    """
    Set up the model through the parent constructor and reset KNN-specific state.

    Args:
        data (pd.DataFrame): users x items dataframe
        mask (pd.DataFrame, optional): boolean dataframe used to split the data into 'observed' and 'missing' datasets. Defaults to None.
        n_mask_items (int/float, optional): number of items to mask out, while the rest are treated as observed; Defaults to None.
        random_state (None, int, RandomState): a seed or random state used for all internal random operations. Passing None will generate a new random seed. Default None.
        verbose (bool; optional): print any initialization warnings; Default True
    """
    super().__init__(
        data,
        mask,
        n_mask_items,
        random_state=random_state,
        verbose=verbose,
    )
    # Nothing has been fit yet: no metric chosen, no similarity matrix computed
    self.metric = None
    self.user_similarity = None
create_masked_data(self, n_mask_items=0.2)
inherited
Create a mask and apply it to data using number of items or % of items
Parameters:
Name | Type | Description | Default |
---|---|---|---|
n_mask_items |
int/float |
if an integer is passed, its raw value is used. Otherwise, if a float is passed, it is taken to be a (rounded) percentage of the total items; Default 0.2 (20% of the items) |
0.2 |
Source code in neighbors/models.py
def create_masked_data(self, n_mask_items=0.2):
    """
    Create a mask and apply it to data using number of items or % of items

    Stores the mask in `.mask`, the masked data in `.masked_data`, and records
    `.is_masked` and `.n_mask_items`.

    Args:
        n_mask_items (int/float, optional): if an integer is passed, its raw value is used. Otherwise, if a float is passed, it is taken to be a (rounded) percentage of the total items; Default 0.2

    Raises:
        TypeError: if a float is not strictly between 0 and 1, or an integer is not strictly between 0 and the number of items.
    """
    # Check builtin *and* numpy scalars: a plain Python float is not an
    # np.floating instance, so testing np.floating alone lets values like 1.5
    # or 0.0 through unvalidated.
    is_fraction = isinstance(n_mask_items, (float, np.floating))
    is_count = isinstance(n_mask_items, (int, np.integer))
    n_items = self.data.shape[1]
    if (is_fraction and not 0.0 < n_mask_items < 1.0) or (
        is_count and not 0 < n_mask_items < n_items
    ):
        # TypeError (rather than ValueError) is kept for backward compatibility
        raise TypeError(
            "n_mask_items should be a float between 0-1 or an integer < the number of items"
        )

    self.mask = create_sparse_mask(
        self.data, n_mask_items, random_state=self.random_state
    )
    # Boolean-frame indexing keeps masked-in values and NaNs out the rest
    self.masked_data = self.data[self.mask]
    self.is_masked = True
    self.n_mask_items = n_mask_items
dilate_mask(self, n_samples=None)
inherited
Dilate sparse time-series data by n_samples.
Overlapping data will be averaged. This method computes and stores the dilated mask in .dilated_mask
and internally updates the .masked_data
. Repeated calls to this method on the same model instance do not stack, but rather perform a new dilation on the original masked data. Calling this method with None
will undo any dilation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
n_samples |
int |
Number of samples to dilate data by; None undoes any dilation |
None |
Source code in neighbors/models.py
def dilate_mask(self, n_samples=None):
    """Dilate sparse time-series data by n_samples.

    Overlapping data will be averaged. This method computes and stores the dilated mask in `.dilated_mask` and internally updates the `.masked_data`. Repeated calls to this method on the same model instance **do not** stack, but rather perform a new dilation on the original masked data. Calling this method with `None` will undo any dilation.

    Args:
        n_samples (int, optional): Number of samples to dilate data by. Default None (no dilation).

    Raises:
        ValueError: if the model has no mask, or has not been masked while a dilation is requested.
        TypeError: if `n_samples` is a float or is >= the number of items.
    """
    if self.mask is None:
        raise ValueError("Model has no mask and requires one to perform dilation")
    if not self.is_masked and n_samples is not None:
        raise ValueError("Make sure model instance has been masked.")
    # Reject builtin floats as well as numpy floats; a plain Python float is
    # not an np.floating instance and would otherwise slip through.
    if n_samples is not None and (
        isinstance(n_samples, (float, np.floating))
        or n_samples >= self.data.shape[1]
    ):
        raise TypeError("nsamples should be an integer < the number of items")

    # Always reset to the undilated mask first so dilations never stack
    self.masked_data = self.data[self.mask]

    if n_samples is None:
        self.dilated_mask = None
        self.is_mask_dilated = False
        self.dilated_by_nsamples = None
        return

    # After masking, perform dilation row by row and save as the new masked data
    self.masked_data = self.masked_data.apply(
        lambda x: self._conv_ts_mean_overlap(x, n_samples=n_samples),
        axis=1,
        result_type="broadcast",
    )
    # The dilated mask marks every cell that now holds a value
    self.dilated_mask = ~self.masked_data.isnull()
    self.is_mask_dilated = True
    self.dilated_by_nsamples = n_samples
downsample(self, n_samples, sampling_freq=None, target_type='samples')
inherited
Downsample a model's rating matrix to a new target frequency or number of samples using averaging. Also downsamples a model's mask and dilated mask if they exist as well as a model's predictions if it's already been fit.
If target_type = 'samples' and sampling_freq is None, the new user x item matrix will have shape users x items * (1 / n_samples).
If target_type = 'seconds', the new user x item matrix will have shape users x items * (1 / n_samples * sampling_freq).
If target_type = 'hz', the new user x item matrix will have shape users x items * (1 / sampling_freq / n_samples).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
n_samples |
int |
number of samples |
required |
sampling_freq |
int/float |
Sampling frequency of data; Default None |
None |
target_type |
str |
how to downsample; must be one of "samples", "seconds" or "hz". Defaults to "samples". |
'samples' |
Source code in neighbors/models.py
def downsample(self, n_samples, sampling_freq=None, target_type="samples"):
    """
    Downsample a model's rating matrix to a new target frequency or number of samples using averaging. Also downsamples a model's mask and dilated mask if they exist as well as a model's predictions if it's already been fit.

    If target_type = 'samples' and sampling_freq is None, the new user x item matrix will have shape users x items * (1 / n_samples).
    If target_type = 'seconds', the new user x item matrix will have shape users x items * (1 / n_samples * sampling_freq).
    If target_type = 'hz', the new user x item matrix will have shape users x items * (1 / sampling_freq / n_samples).

    Args:
        n_samples (int): number of samples
        sampling_freq (int/float): Sampling frequency of data; Default None
        target_type (str, optional): how to downsample; must be one of "samples", "seconds" or "hz". Defaults to "samples".
    """

    def _resample(frame):
        # Every stored frame is resampled with the same settings
        return downsample_dataframe(
            frame,
            sampling_freq=sampling_freq,
            n_samples=n_samples,
            target_type=target_type,
        )

    self.data = _resample(self.data)

    if self.is_masked:
        # Rebind to the comparison result so the mask has a real boolean dtype.
        # Writing booleans back into the resampled frame via `.loc[:, :] = ...`
        # leaves the resulting dtype to pandas' in-place casting rules.
        self.mask = _resample(self.mask) > 0
        self.masked_data = _resample(self.masked_data)
        if self.is_mask_dilated:
            self.dilated_mask = _resample(self.dilated_mask) > 0

    if self.is_fit:
        self.predictions = _resample(self.predictions)
fit(self, k=10, metric='correlation', axis=0, dilate_by_nsamples=None, skip_refit=False, **kwargs)
Fit collaborative model to train data. Calculate similarity between subjects across items. Repeated calls to fit with a different k, but otherwise the same arguments, can re-use the previously computed user x user similarity matrix (see skip_refit).
Parameters:
Name | Type | Description | Default |
---|---|---|---|
k |
int |
maximum number of other users to use when making a prediction for a single user. If set to None will use all users. Default 10. Note: it's possible for predictions to come from fewer than k other users if a particular user has fewer similar neighbors with positive similarity scores. |
10 |
metric |
str; optional |
type of similarity. One of 'correlation', 'spearman', 'kendall', 'cosine', or 'pearson'. 'pearson' is just an alias for 'correlation'. Default 'correlation'. |
'correlation' |
axis |
int |
dimension along which to compute mean, 0 = mean across users separately by item, 1 = mean across items separately by user; Default 0 |
0 |
skip_refit |
bool; optional |
skip re-estimation of user x user similarity matrix. Faster if only exploring different k and no other model parameters or masks are changing. Default False. |
False |
Source code in neighbors/models.py
def fit(
    self,
    k=10,
    metric="correlation",
    axis=0,
    dilate_by_nsamples=None,
    skip_refit=False,
    **kwargs,
):
    """Fit collaborative model to train data. Calculate similarity between subjects across items. Repeated calls to fit with a different k can re-use the previously computed user x user similarity matrix via `skip_refit`.

    Args:
        k (int): maximum number of other users to use when making a prediction for a single user. If set to None will use all users. Default 10. Note: it's possible for predictions to come from fewer than k other users if a particular user has fewer similar neighbors with positive similarity scores.
        metric (str; optional): type of similarity. One of 'correlation', 'spearman', 'kendall', 'cosine', or 'pearson'. 'pearson' is just an alias for 'correlation'. Default 'correlation'.
        axis (int): dimension along which to compute mean, 0 = mean across users separately by item, 1 = mean across items separately by user; Default 0
        dilate_by_nsamples (int, optional): forwarded to `.dilate_mask(n_samples=...)` before similarities are computed; None applies no dilation. Default None.
        skip_refit (bool; optional): skip re-estimation of user x user similarity matrix. Faster if only exploring different k and no other model parameters or masks are changing. Ignored when no similarity matrix has been computed yet. Default False.
        **kwargs: accepted but not used by this method.

    Raises:
        ValueError: if `metric` is not one of the supported names.
    """
    metrics = ["pearson", "spearman", "kendall", "cosine", "correlation"]
    if metric not in metrics:
        raise ValueError(f"metric must be one of {metrics}")

    # Remember the name the caller used; pandas only knows 'pearson'
    self.metric = metric
    if metric == "correlation":
        metric = "pearson"

    # Call parent fit which acts as a guard for non-masked data
    super().fit()

    # Re-use the last similarity matrix only if there is one; otherwise
    # skip_refit=True on a fresh model would hand None to _predict.
    if not skip_refit or getattr(self, "user_similarity", None) is None:
        self.dilate_mask(n_samples=dilate_by_nsamples)
        # Store the mean because we'll use it in cases we can't make a prediction
        self.mean = self.masked_data.mean(skipna=True, axis=axis)

        if metric in ["pearson", "kendall", "spearman"]:
            # Fall back to pandas
            sim = self.masked_data.T.corr(method=metric)
        else:
            # Convert distance metrics to similarity (currently only cosine)
            sim = pd.DataFrame(
                1 - nanpdist(self.masked_data.to_numpy(), metric=metric),
                index=self.masked_data.index,
                columns=self.masked_data.index,
            )
        self.user_similarity = sim

    self._predict(k=k)
    self.is_fit = True
plot_predictions(self, dataset='missing', figsize=(16, 8), label_fontsize=16, hide_title=False, heatmap_kwargs={})
inherited
Create plot of actual vs predicted values.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dataset |
str; optional |
one of 'full', 'observed', or 'missing'. Default 'missing'. |
'missing' |
figsize |
tuple; optional |
matplotlib figure size; Default (16,8) |
(16, 8) |
label_fontsize |
int; optional |
fontsize for all axis labels and titles; Default 16 |
16 |
hide_title |
bool; optional |
hide title containing RMSE and correlation performance if available; Default False |
False |
heatmap_kwargs |
dict |
additional keyword arguments passed to seaborn.heatmap. |
{} |
Returns:
Type | Description |
---|---|
tuple |
(figure handle, axis handle) |
Source code in neighbors/models.py
def plot_predictions(
    self,
    dataset="missing",
    figsize=(16, 8),
    label_fontsize=16,
    hide_title=False,
    heatmap_kwargs=None,
):
    """Create plot of actual vs predicted values.

    Draws the (masked) ratings and the predictions as heatmaps and, when true
    values are available for `dataset`, a scatter plot of actual vs predicted.

    Args:
        dataset (str; optional): one of 'full', 'observed', or 'missing'. Default 'missing'.
        figsize (tuple; optional): matplotlib figure size; Default (16,8)
        label_fontsize (int; optional): fontsize for all axis labels and titles; Default 16
        hide_title (bool; optional): hide title containing RMSE and correlation performance if available; Default False
        heatmap_kwargs (dict, optional): additional arguments to seaborn.heatmap. The dict is copied, never modified.

    Returns:
        tuple: (figure handle, axis handle)

    Raises:
        ValueError: if the model has not been fit.
    """
    if not self.is_fit:
        raise ValueError("Model has not been fit")

    # Work on a private copy: the defaults filled in below (notably vmin/vmax,
    # which depend on this model's data) must not leak into later calls or
    # into the caller's dictionary.
    heatmap_kwargs = dict(heatmap_kwargs) if heatmap_kwargs else {}

    actual, pred = self._retrieve_predictions(dataset)
    if actual is None:
        ncols = 2
        warnings.warn(
            "Cannot score predictions on missing data because true values were never observed!"
        )
    else:
        ncols = 3

    # Both heatmaps share one colour scale taken from the full input data
    # NOTE(review): predictions outside the data range will saturate the
    # colour map; confirm whether the scale should also cover predictions.
    heatmap_kwargs.setdefault("square", False)
    heatmap_kwargs.setdefault("xticklabels", False)
    heatmap_kwargs.setdefault("yticklabels", False)
    heatmap_kwargs.setdefault("vmax", self.data.max().max())
    heatmap_kwargs.setdefault("vmin", self.data.min().min())

    f, ax = plt.subplots(nrows=1, ncols=ncols, figsize=figsize)

    # The original data matrix (potentially masked)
    sns.heatmap(self.masked_data, ax=ax[0], **heatmap_kwargs)
    ax[0].set_title("Actual User/Item Ratings", fontsize=label_fontsize)
    ax[0].set_xlabel("Items", fontsize=label_fontsize)
    ax[0].set_ylabel("Users", fontsize=label_fontsize)

    # The predicted data matrix
    sns.heatmap(self.predictions, ax=ax[1], **heatmap_kwargs)
    ax[1].set_title("Predicted User/Item Ratings", fontsize=label_fontsize)
    ax[1].set_xlabel("Items", fontsize=label_fontsize)
    ax[1].set_ylabel("Users", fontsize=label_fontsize)
    f.tight_layout()

    # Scatter plot if we can calculate it
    if actual is not None:
        nans = np.logical_or(np.isnan(actual), np.isnan(pred))
        ax[2].scatter(
            actual[~nans],
            pred[~nans],
        )
        ax[2].set_xlabel("Actual", fontsize=label_fontsize)
        ax[2].set_ylabel("Predicted", fontsize=label_fontsize)
        ax[2].set_title("Ratings", fontsize=label_fontsize)
        sns.despine()

        if not hide_title:
            # Only score when the numbers are actually going to be shown
            r = self.score(dataset=dataset, by_user=True, metric="correlation")
            rmse = self.score(dataset=dataset, by_user=True, metric="rmse")
            plt.suptitle(
                f"Mean RMSE: {np.round(rmse.mean(),3)} +/- {np.round(rmse.std(), 3)}\nMean Correlation: {np.round(r.mean(), 3)} +/- {np.round(r.std(), 3)}",
                y=1.07,
                fontsize=label_fontsize + 2,
            )

    plt.subplots_adjust(wspace=0.2)
    return f, ax
plot_user_similarity(self, figsize=(8, 8), label_fontsize=16, hide_title=False, heatmap_kwargs={})
Plot a heatmap of user x user similarities learned on the observed data
Parameters:
Name | Type | Description | Default |
---|---|---|---|
figsize |
tuple |
matplotlib figure size. Defaults to (8, 8). |
(8, 8) |
label_fontsize |
int; optional |
fontsize for title text; Default 16 |
16 |
hide_title |
bool; optional |
hide title containing metric information; Default False |
False |
heatmap_kwargs |
dict |
additional keyword arguments passed to seaborn.heatmap. |
{} |
Returns:
Type | Description |
---|---|
ax |
matplotlib axis handle |
Source code in neighbors/models.py
def plot_user_similarity(
    self, figsize=(8, 8), label_fontsize=16, hide_title=False, heatmap_kwargs=None
):
    """
    Plot a heatmap of user x user similarities learned on the observed data

    Args:
        figsize (tuple, optional): matplotlib figure size. Defaults to (8, 8).
        label_fontsize (int; optional): fontsize for title text; Default 16
        hide_title (bool; optional): hide title containing metric information; Default False
        heatmap_kwargs (dict, optional): additional arguments to seaborn.heatmap; these override the defaults chosen here (vmin, vmax, cmap, square).

    Returns:
        ax: matplotlib axis handle

    Raises:
        ValueError: if the model has not been fit.
    """
    if not self.is_fit:
        raise ValueError("Model has not been fit")

    # All correlation-type metrics, kendall included, range over [-1, 1]
    if self.metric in ["correlation", "pearson", "spearman", "kendall"]:
        vmin, vmax, cmap = -1, 1, "RdBu_r"
    else:
        vmin, vmax, cmap = 0, 1, None

    # Merge rather than pass both, so user-supplied vmin/vmax/cmap/square
    # override the defaults instead of raising a duplicate-keyword TypeError
    plot_kwargs = {"vmin": vmin, "vmax": vmax, "cmap": cmap, "square": True}
    plot_kwargs.update(heatmap_kwargs or {})

    _, ax = plt.subplots(1, 1, figsize=figsize)
    ax.set(xlabel=None, ylabel=None)
    ax = sns.heatmap(self.user_similarity, ax=ax, **plot_kwargs)
    if not hide_title:
        ax.set_title(f"Metric: {self.metric}", fontsize=label_fontsize)
    return ax
score(self, metric='rmse', dataset='missing', by_user=True, actual=None)
inherited
Get the performance of a fitted model by comparing observed and predicted data. This method is primarily useful if you want to calculate a single metric. Otherwise you should prefer the .summary()
method instead, which scores all metrics.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
metric |
str; optional |
what metric to compute, one of 'rmse', 'mse', 'mae' or 'correlation'; Default 'rmse'. |
'rmse' |
dataset |
str; optional |
how to compute scoring, either using 'observed', 'missing' or 'full'. Default 'missing'. |
'missing' |
by_user |
bool; optional |
whether to return a single score over all data points or a pandas Series of scores per user. Default True. |
True |
actual |
pd.DataFrame, None; optional |
a dataframe to score against; Default is None which uses the data provided when the model was initialized |
None |
Returns:
Type | Description |
---|---|
float/pd.Series |
score |
Source code in neighbors/models.py
def score(
    self,
    metric="rmse",
    dataset="missing",
    by_user=True,
    actual=None,
):
    """Compare predicted against true values for a fitted model using one metric.

    Use this when a single metric is needed; `.summary()` scores all metrics at once.

    Args:
        metric (str; optional): what metric to compute, one of 'rmse', 'mse', 'mae' or 'correlation'; Default 'rmse'.
        dataset (str; optional): how to compute scoring, either using 'observed', 'missing' or 'full'. Default 'missing'.
        by_user (bool; optional): whether to return a single score over all data points or a pandas Series of scores per user. Default True.
        actual (pd.DataFrame, None; optional): a dataframe to score against; Default is None which uses the data provided when the model was initialized

    Returns:
        float/pd.Series: score, or None (with a warning) when no true values are available

    Raises:
        ValueError: if the model is not fit, `metric` is unknown, or `actual` does not have the shape of the original data.
    """
    if not self.is_fit:
        raise ValueError("You must fit() model first before using this method.")
    if metric not in ("rmse", "mse", "mae", "correlation"):
        raise ValueError(
            "metric must be one of 'rmse', 'mse', 'mae', or 'correlation'"
        )

    # Frames of true and predicted values, shaped like the input data
    model_actual, pred = self._retrieve_predictions(dataset)
    if actual is None:
        actual = model_actual
    elif actual.shape != self.data.shape:
        raise ValueError(
            "actual values dataframe supplied but shape does not match original data"
        )

    if actual is None:
        warnings.warn(
            "Cannot score predictions on missing data because true values were never observed!"
        )
        return None

    def _compare(truth, guess):
        # Score one pair of equally shaped 1d arrays, ignoring NaN entries
        if metric == "correlation":
            usable = ~np.logical_or(np.isnan(truth), np.isnan(guess))
            if len(truth[usable]) < 2 or len(guess[usable]) < 2:
                # A correlation needs at least two paired points
                return np.nan
            return pearsonr(truth[usable], guess[usable])[0]
        residual = guess - truth
        if metric == "mae":
            return np.nanmean(np.abs(residual))
        mean_sq = np.nanmean(residual ** 2)
        return np.sqrt(mean_sq) if metric == "rmse" else mean_sq

    with warnings.catch_warnings():
        # Silence 'Mean of empty slice' warnings from np.nanmean
        warnings.simplefilter("ignore", category=RuntimeWarning)
        if not by_user:
            return _compare(actual.to_numpy().flatten(), pred.to_numpy().flatten())
        per_user = [
            _compare(actual.iloc[pos, :].values, pred.iloc[pos, :].values)
            for pos in range(actual.shape[0])
        ]
        return pd.Series(per_user, index=actual.index, name=f"{metric}_{dataset}")
summary(self, verbose=False, actual=None, dataset=None)
inherited
Calculate the performance of a model and return a dataframe of results. Computes performance across all, observed, and missing datasets. Scores using rmse, mse, mae, and correlation. Computes scores across all subjects (i.e. ignoring the fact that ratings are clustered by subject) and the mean performance for each metric after calculating per-subject performance.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
verbose |
bool |
Print warning messages during scoring. Defaults to False. |
False |
actual |
pd.DataFrame, None; optional |
a dataframe to score against; Default is None which uses the data provided when the model was initialized |
None |
dataset |
str/None |
dataset to score. Must be one of 'full', 'observed','missing' or None to score both 'observed' and 'missing'; Default None |
None |
Returns:
Type | Description |
---|---|
pd.DataFrame |
long-form dataframe of model performance |
Source code in neighbors/models.py
def summary(self, verbose=False, actual=None, dataset=None):
    """
    Score a fitted model with every supported metric and tabulate the results.

    Each requested dataset is scored with rmse, mse, mae, and correlation, once pooled across all users (`by_user=False`) and once separately per user (`by_user=True`). The per-user scores are stored on `.user_results`. The pooled scores, together with the mean of the per-user scores, are stored on `.overall_results` and returned.

    Args:
        verbose (bool, optional): Print the last warning recorded during scoring (if any) plus a reminder of where results are stored. Defaults to False.
        actual (pd.DataFrame, None; optional): a dataframe to score against; passed through to `.score`. Default is None.
        dataset (str/None): dataset to score, e.g. 'full', 'observed', or 'missing'. None scores 'missing' and 'observed', unless no `actual` is given and the data are not dense, in which case only 'observed' is scored; Default None

    Returns:
        pd.DataFrame: long-form dataframe with columns 'algorithm', 'dataset', 'group', 'metric', 'score'. 'group' is 'all' for pooled scores and 'user' for the mean of per-user scores.

    Raises:
        ValueError: if the model has not been fit, or if 'full'/'missing' is requested without `actual` on data that are not dense.
    """
    if not self.is_fit:
        raise ValueError("Model has not been fit!")

    metric_names = ["rmse", "mse", "mae", "correlation"]

    # Resolve which datasets to score
    if dataset is None:
        # self.is_dense only matters when no external ground truth is supplied
        if actual is not None or self.is_dense:
            dataset = ["missing", "observed"]
        else:
            dataset = ["observed"]
    elif isinstance(dataset, str):
        lacks_ground_truth = actual is None and not self.is_dense
        if lacks_ground_truth and dataset in ["full", "missing"]:
            raise ValueError(
                "Cannot score predictions on missing values because no ground truth was observed"
            )
        dataset = [dataset]

    # Pooled scores keyed by metric; per-user score frames collected per metric
    pooled_scores = {"algorithm": self.__class__.__name__}
    per_user_frames = []

    # Record warnings raised while scoring instead of emitting them
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        for metric in metric_names:
            pooled_by_dataset = {}
            per_user_columns = []
            for name in dataset:
                pooled_by_dataset[name] = self.score(
                    metric=metric, dataset=name, actual=actual, by_user=False
                )
                per_user_columns.append(
                    self.score(
                        metric=metric, dataset=name, by_user=True, actual=actual
                    )
                )
            # One column of per-user scores per dataset for this metric
            per_user = pd.concat(per_user_columns, axis=1)
            per_user_frames.append(per_user)
            pooled_scores[metric] = pooled_by_dataset
            # Average of the per-user scores, keyed by dataset
            pooled_scores[f"{metric}_user"] = {
                name: avg for name, avg in zip(dataset, per_user.mean().values)
            }

    self.user_results = pd.concat(per_user_frames, axis=1)

    # Reshape: one row per (dataset, metric column) pair
    wide = pd.DataFrame(pooled_scores)
    long = wide.reset_index().melt(
        id_vars=["index", "algorithm"],
        var_name="metric",
        value_name="score",
    )
    long = long.rename(columns={"index": "dataset"})
    long = long.sort_values(by=["dataset", "metric"]).reset_index(drop=True)

    # Split the "<metric>_user" labels into a group flag and a plain metric name
    raw_metric = long["metric"]
    long = long.assign(
        group=raw_metric.apply(lambda label: "user" if "user" in label else "all"),
        metric=raw_metric.replace({f"{m}_user": m for m in metric_names}),
    )
    long = long.sort_values(by=["dataset", "group", "metric"]).reset_index(drop=True)
    group_results = long[["algorithm", "dataset", "group", "metric", "score"]]

    self.overall_results = group_results

    if verbose:
        if caught:
            print(caught[-1].message)
        for note in (
            "User performance results (not returned) are accessible using .user_results",
            "Overall performance results (returned) are accesible using .overall_results",
        ):
            print(note)
    return group_results
to_long_df(self)
inherited
Create a long format pandas dataframe with observed, predicted, and mask.
Source code in neighbors/models.py
def to_long_df(self):
    """
    Create a long format pandas dataframe with observed, predicted, and mask.

    Every user x item cell of `self.data` becomes one row with Condition 'Observed'. If the model has been fit, every cell of `self.predictions` is stacked underneath with Condition 'Predicted'. When the model is masked, a 'Mask' column is added. The index of the result holds the item labels (repeated per user), as produced by row-wise stacking.

    Returns:
        pd.DataFrame: long-form dataframe with columns 'User', 'Item', 'Rating', 'Condition' and, when the model is masked, 'Mask'
    """
    base_columns = ["User", "Item", "Rating", "Condition"]

    def _rows_to_long(wide, condition, mask):
        """Convert each row of `wide` into its own long-format frame."""
        frames = []
        for user, ratings in wide.iterrows():
            # Start from the ratings so the frame is indexed by item label
            long_rows = ratings.to_frame(name="Rating")
            long_rows.insert(0, "Item", wide.columns)
            long_rows.insert(0, "User", user)
            long_rows["Condition"] = condition
            if mask is not None:
                # Series assignment aligns the mask row on item label
                long_rows["Mask"] = mask.loc[user]
            frames.append(long_rows)
        return frames

    observed_mask = None
    if self.is_masked:
        observed_mask = self.dilated_mask if self.is_mask_dilated else self.mask
    frames = _rows_to_long(self.data, "Observed", observed_mask)

    if self.is_fit:
        # NOTE(review): predicted rows always use self.mask, never the dilated
        # mask; kept as in the original implementation -- confirm this is intended
        predicted_mask = self.mask if self.is_masked else None
        frames.extend(_rows_to_long(self.predictions, "Predicted", predicted_mask))

    if not frames:
        # No users at all: pd.concat refuses an empty list
        return pd.DataFrame(columns=base_columns)

    # DataFrame.append was removed in pandas 2.0; a single concat also avoids
    # re-copying the accumulated frame on every iteration
    return pd.concat(frames)
transform(self, return_only_predictions=False)
inherited
Return a user x item matrix of predictions after a model has been fit
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| return_only_predictions | bool | Returns both training and testing predictions rather than simply filling in missing values with predictions. Defaults to False. | False |
Returns:
| Type | Description |
|---|---|
| pd.DataFrame | user x item ratings |
Source code in neighbors/models.py
def transform(self, return_only_predictions=False):
    """
    Produce the user x item matrix of ratings for a fitted model.

    Args:
        return_only_predictions (bool, optional): if True, return the model's predictions for every cell as-is. If False, keep the input data wherever the mask is True and use predictions everywhere else. Defaults to False.

    Returns:
        pd.DataFrame: user x item ratings

    Raises:
        ValueError: if the model has not been fit
    """
    if not self.is_fit:
        raise ValueError("Model has not been fit!")

    if return_only_predictions:
        return self.predictions

    observed_cells = self.mask
    missing_cells = ~observed_cells

    # Start from the observed ratings; unobserved cells come back as NaN
    filled = self.data[observed_cells]
    # Overwrite the unobserved cells with the model's predictions
    filled[missing_cells] = self.predictions[missing_cells]
    return filled