# Source code for mne_rsa.rsa

# encoding: utf-8
"""Methods to compute representational similarity analysis (RSA)."""

import numpy as np
from joblib import Parallel, delayed
from scipy import stats

from .folds import create_folds
from .rdm import _ensure_condensed, compute_rdm, compute_rdm_cv
from .searchlight import searchlight

try:
    # Version 1.8.0 and up
    from scipy.stats._stats_py import _kendall_dis
except ImportError:
    from scipy.stats.stats import _kendall_dis


def _kendall_tau_a(x, y):
    """Compute the tau-a variant of Kendall's rank correlation.

    Adapted from ``scipy.stats.kendalltau``. Tau-a divides the difference
    between concordant and discordant pairs by the total number of pairs,
    without any correction for ties.

    Parameters
    ----------
    x, y : array-like
        The two sets of observations. Both are flattened before use and must
        contain the same number of elements.

    Returns
    -------
    tau : float
        The tau-a statistic, clipped to the range [-1, 1]. NaN is returned when
        the inputs are empty or when all pairs in ``x`` or all pairs in ``y``
        are tied.

    Raises
    ------
    ValueError
        When ``x`` and ``y`` differ in size.
    """
    x = np.asarray(x).ravel()
    y = np.asarray(y).ravel()

    if x.size != y.size:
        raise ValueError(
            "All inputs to `kendalltau` must be of the same size,"
            " found x-size %s and y-size %s" % (x.size, y.size)
        )
    if x.size == 0 or y.size == 0:
        return np.nan  # Nothing to correlate

    def n_tied_pairs(dense_ranks):
        # Number of pairs that share the same rank value
        counts = np.bincount(dense_ranks).astype("int64", copy=False)
        counts = counts[counts > 1]
        return (counts * (counts - 1) // 2).sum()

    n_obs = x.size

    # Order by y, then replace y by its dense ranks
    order = np.argsort(y)
    x, y = x[order], y[order]
    y = np.r_[True, y[1:] != y[:-1]].cumsum(dtype="intp")

    # Stable re-ordering by x, then replace x by its dense ranks
    order = np.argsort(x, kind="mergesort")
    x, y = x[order], y[order]
    x = np.r_[True, x[1:] != x[:-1]].cumsum(dtype="intp")

    n_discordant = _kendall_dis(x, y)

    # Lengths of runs in which both x and y stay constant give the joint ties
    run_starts = np.r_[True, (x[1:] != x[:-1]) | (y[1:] != y[:-1]), True]
    run_lengths = np.diff(np.nonzero(run_starts)[0]).astype("int64", copy=False)
    joint_ties = (run_lengths * (run_lengths - 1) // 2).sum()

    x_ties = n_tied_pairs(x)
    y_ties = n_tied_pairs(y)
    n_pairs = (n_obs * (n_obs - 1)) // 2

    if x_ties == n_pairs or y_ties == n_pairs:
        return np.nan

    # n_pairs = concordant + discordant + x_ties + y_ties - joint_ties, hence:
    concordant_minus_discordant = (
        n_pairs - x_ties - y_ties + joint_ties - 2 * n_discordant
    )
    tau = concordant_minus_discordant / n_pairs
    # Guard against values marginally outside [-1, 1]
    return min(1.0, max(-1.0, tau))


def _consolidate_masks(masks):
    """Merge a list of masks into a single mask shared by all of them.

    Parameters
    ----------
    masks : list of slice | list of ndarray of bool
        One mask per model RDM. The type of the first entry decides how the
        list is treated: if it is a slice, all entries are taken to be
        "select everything" slices.

    Returns
    -------
    mask : slice | ndarray of bool
        ``slice(None)`` when the masks are slices, otherwise the element-wise
        AND of all masks.

    Notes
    -----
    The input masks are never modified. The combination is done out-of-place,
    because an in-place ``&=`` would overwrite the caller's first mask.
    """
    if isinstance(masks[0], slice):
        return slice(None)

    mask = masks[0]
    for m in masks[1:]:
        mask = mask & m  # out-of-place on purpose, see Notes
    return mask


def _partial_correlation(rdm_data, rdm_model, masks=None, type="pearson"):
    """Compute partial Pearson/Spearman correlation.

    Correlates the data RDM with each model RDM while controlling for the
    other model RDMs. The values are read from the (pseudo-)inverse of the
    scatter matrix of the mean-centered variables.

    Parameters
    ----------
    rdm_data : ndarray
        The data RDM as a 1D vector.
    rdm_model : list of ndarray
        The model RDMs as 1D vectors. More than one is required.
    masks : list | None
        One mask per model RDM. When given, the masks are merged into a single
        mask that is applied to the data RDM and to every model RDM.
    type : 'pearson' | 'spearman'
        Which correlation to use. For 'spearman', every variable is converted
        to ranks first.

    Returns
    -------
    r_partial : ndarray, shape (len(rdm_model),)
        The partial correlation between the data RDM and each model RDM.

    Raises
    ------
    ValueError
        When only a single model RDM is given or ``type`` is not supported.
    """
    if len(rdm_model) == 1:
        raise ValueError(
            "Need more than one model RDM to use partial correlation as metric."
        )
    if type not in ("pearson", "spearman"):
        raise ValueError("Correlation type must be either 'pearson' or 'spearman'")

    if masks is not None:
        shared_mask = _consolidate_masks(masks)
        rdm_data = rdm_data[shared_mask]
        rdm_model = [model[shared_mask] for model in rdm_model]

    # One column per variable: data RDM first, then the model RDMs
    variables = np.vstack([rdm_data] + rdm_model).T
    if type == "spearman":
        variables = np.apply_along_axis(stats.rankdata, 0, variables)
    centered = variables - variables.mean(axis=0)

    precision = np.linalg.pinv(centered.T @ centered)
    diagonal = np.diag(precision)
    scaled = precision / np.sqrt(np.outer(diagonal, diagonal))

    # The first row relates the data RDM to each of the model RDMs
    return -scaled[0, 1:]



# [docs]
def rsa_gen(rdm_data_gen, rdm_model, metric="spearman", ignore_nan=False):
    """Generate RSA values between data and model RDMs.

    Will yield RSA scores for each data RDM.

    Parameters
    ----------
    rdm_data_gen : iterable of ndarray, shape (n_items, n_items)
        The data RDMs, for example a generator or a list. They are consumed one
        at a time.
    rdm_model : ndarray, shape (n_items, n_items) | list of ndarray
        The model RDM, or list of model RDMs.
    metric : str
        The RSA metric to use to compare the RDMs. Valid options are:

        * 'spearman' for Spearman's correlation (the default)
        * 'pearson' for Pearson's correlation
        * 'kendall-tau-a' for Kendall's Tau (alpha variant)
        * 'partial' for partial Pearson correlations
        * 'partial-spearman' for partial Spearman correlations
        * 'regression' for linear regression weights

        Defaults to 'spearman'.
    ignore_nan : bool
        Whether to treat NaN's as missing values and ignore them when computing the
        distance metric. Defaults to ``False``.

        .. versionadded:: 0.8

    Yields
    ------
    rsa_val : float | ndarray, shape (len(rdm_model),)
        For each data RDM, the representational similarity with the model RDM. When
        a list of model RDMs is given, this will be a 1D array of similarities,
        comparing the data RDM with each model RDM.

    See Also
    --------
    rsa

    """
    if isinstance(rdm_model, list):
        return_array = True
        rdm_model = [_ensure_condensed(rdm, "rdm_model") for rdm in rdm_model]
    else:
        return_array = False
        rdm_model = [_ensure_condensed(rdm_model, "rdm_model")]

    # Masks that depend on the model RDMs only; computed once and never
    # modified afterwards.
    if ignore_nan:
        model_masks = [~np.isnan(rdm) for rdm in rdm_model]
    else:
        model_masks = [slice(None)] * len(rdm_model)

    for rdm_data in rdm_data_gen:
        rdm_data = _ensure_condensed(rdm_data, "rdm_data")
        if ignore_nan:
            data_mask = ~np.isnan(rdm_data)
            # Combine with the NaN pattern of *this* data RDM only. The masks
            # are rebuilt from the model masks every iteration, so NaNs in one
            # data RDM do not discard entries of the data RDMs that follow.
            masks = [m & data_mask for m in model_masks]
        else:
            masks = model_masks
        rsa_vals = _rsa_single_rdm(rdm_data, rdm_model, metric, masks)
        if return_array:
            yield np.asarray(rsa_vals)
        else:
            yield rsa_vals[0]



def _rsa_single_rdm(rdm_data, rdm_model, metric, masks):
    """Compute RSA between a single data RDM and one or more model RDMs.

    Parameters
    ----------
    rdm_data : ndarray
        The data RDM as a 1D vector.
    rdm_model : list of ndarray
        The model RDMs as 1D vectors.
    metric : str
        One of 'spearman', 'pearson', 'kendall-tau-a', 'partial',
        'partial-spearman' or 'regression'.
    masks : list
        For each model RDM, the index (slice or boolean array) selecting the
        entries to use.

    Returns
    -------
    rsa_vals : list of float | ndarray
        One value per model RDM. The pairwise metrics produce a list, the
        partial correlation and regression metrics produce an array.

    Raises
    ------
    ValueError
        When ``metric`` is not one of the supported options.
    """
    if metric in ("spearman", "pearson", "kendall-tau-a"):
        # Pairwise metrics: every model RDM is compared using its own mask

        def compare(data, model):
            if metric == "spearman":
                return stats.spearmanr(data, model)[0]
            if metric == "pearson":
                return stats.pearsonr(data, model)[0]
            return _kendall_tau_a(data, model)

        return [
            compare(rdm_data[mask], model[mask])
            for model, mask in zip(rdm_model, masks)
        ]

    if metric == "partial":
        return _partial_correlation(rdm_data, rdm_model, masks)

    if metric == "partial-spearman":
        return _partial_correlation(rdm_data, rdm_model, masks, type="spearman")

    if metric == "regression":
        # All models enter a single fit, so they must share one mask
        shared_mask = _consolidate_masks(masks)
        models = [model[shared_mask] for model in rdm_model]
        target = rdm_data[shared_mask]
        design = np.atleast_2d(np.array(models)).T
        design = design - design.mean(axis=0)
        target = target - target.mean()
        return np.linalg.lstsq(design, target, rcond=None)[0]

    raise ValueError(
        "Invalid RSA metric, must be one of: 'spearman', "
        "'pearson', 'partial', 'partial-spearman', "
        "'regression' or 'kendall-tau-a'."
    )



# [docs]
def rsa(
    rdm_data,
    rdm_model,
    metric="spearman",
    ignore_nan=False,
    n_data_rdms=None,
    n_jobs=1,
    verbose=False,
):
    """Perform RSA between data and model RDMs.

    Parameters
    ----------
    rdm_data : ndarray, shape (n_items, n_items) | list | generator
        The data RDM (or list/generator of data RDMs). Anything that is not a
        list or an iterator is treated as one single data RDM.
    rdm_model : ndarray, shape (n_items, n_items) | list of ndarray
        The model RDM (or list of model RDMs).
    metric : str
        The RSA metric to use to compare the RDMs. Valid options are:

        * 'spearman' for Spearman's correlation (the default)
        * 'pearson' for Pearson's correlation
        * 'kendall-tau-a' for Kendall's Tau (alpha variant)
        * 'partial' for partial Pearson correlations
        * 'partial-spearman' for partial Spearman correlations
        * 'regression' for linear regression weights

        Defaults to 'spearman'.
    ignore_nan : bool
        Whether to treat NaN's as missing values and ignore them when computing
        the distance metric. Defaults to ``False``.

        .. versionadded:: 0.8
    n_data_rdms : int | None
        The number of data RDMs, used only as the total of the progress bar.
        When ``None``, ``len(rdm_data)`` is used if it is available; for
        generators it is not, so you may want to set this explicitly.
    n_jobs : int
        The number of processes (=number of CPU cores) to use. Specify -1 to use all
        available cores. Defaults to 1.
    verbose : bool
        Whether to display a progress bar. In order for this to work, you need the tqdm
        python module installed. Defaults to False.

    Returns
    -------
    rsa_val : float | ndarray, shape (len(rdm_data), len(rdm_model))
        Depending on whether one or more data and model RDMs were specified, a single
        similarity value or an array of similarity values for each data RDM versus
        each model RDM.

    See Also
    --------
    rsa_gen

    """
    # Lists and iterators hold several data RDMs; wrap a lone RDM in a list so
    # that everything below can iterate uniformly.
    many_data_rdms = isinstance(rdm_data, list) or hasattr(rdm_data, "__next__")
    if not many_data_rdms:
        rdm_data = [rdm_data]

    if verbose:
        from tqdm import tqdm

        if n_data_rdms is not None:
            n_total = n_data_rdms
        else:
            n_total = len(rdm_data) if hasattr(rdm_data, "__len__") else None
        rdm_data = tqdm(rdm_data, total=n_total, unit="RDM")

    if n_jobs == 1:
        scores = list(rsa_gen(rdm_data, rdm_model, metric, ignore_nan))
    else:

        def score_one(single_rdm):
            # Each job runs the generator on a one-element list
            return next(rsa_gen([single_rdm], rdm_model, metric, ignore_nan))

        scores = Parallel(n_jobs)(
            delayed(score_one)(single_rdm) for single_rdm in rdm_data
        )

    return np.asarray(scores) if many_data_rdms else scores[0]




# [docs]
def rsa_array(
    X,
    rdm_model,
    patches=None,
    data_rdm_metric="correlation",
    data_rdm_params=None,
    rsa_metric="spearman",
    ignore_nan=False,
    y=None,
    n_folds=1,
    n_jobs=1,
    verbose=False,
):
    """Perform RSA on an array of data, possibly in a searchlight pattern.

    Parameters
    ----------
    X : ndarray, shape (n_items, n_series, n_times)
        An array containing the data.
    rdm_model : ndarray, shape (n, n) | (n * (n - 1) // 2,) | list of ndarray
        The model RDM, see :func:`compute_rdm`. For efficiency, you can give it in
        condensed form, meaning only the upper triangle of the matrix as a vector. See
        :func:`scipy.spatial.distance.squareform`. To perform RSA against multiple
        models at the same time, supply a list of model RDMs.

        Use :func:`compute_rdm` to compute RDMs.
    patches : generator of tuples | None
        Searchlight patches as generated by :class:`searchlight`. If ``None``, no
        searchlight is used. Defaults to ``None``.
    data_rdm_metric : str
        The metric to use to compute the data RDMs. This can be any metric supported by
        the :func:`scipy.spatial.distance.pdist` function. Defaults to 'correlation'.
    data_rdm_params : dict | None
        Extra arguments for the distance metric used to compute the RDMs. Refer to
        :mod:`scipy.spatial.distance` for a list of all other metrics and their
        arguments. Defaults to ``None``, which means no extra arguments.
    rsa_metric : str
        The RSA metric to use to compare the RDMs. Valid options are:

        * 'spearman' for Spearman's correlation (the default)
        * 'pearson' for Pearson's correlation
        * 'kendall-tau-a' for Kendall's Tau (alpha variant)
        * 'partial' for partial Pearson correlations
        * 'partial-spearman' for partial Spearman correlations
        * 'regression' for linear regression weights

        Defaults to 'spearman'.
    ignore_nan : bool
        Whether to treat NaN's as missing values and ignore them when computing the
        distance metric. Defaults to ``False``.

        .. versionadded:: 0.8
    y : ndarray of int, shape (n_items,) | None
        (Deprecated) For each item, a number indicating the class to which the item
        belongs.  When ``None``, each item is assumed to belong to a different class.
        Defaults to ``None``.
    n_folds : int | sklearn.model_selection.BaseCrossValidator | None
        Number of cross-validation folds to use when computing the distance metric.
        Folds are created based on the ``y`` parameter. Specify ``None`` to use the
        maximum number of folds possible, given the data. Alternatively, you can pass a
        Scikit-Learn cross validator object (e.g. ``sklearn.model_selection.KFold``) to
        exert fine-grained control over how folds are created.
        Defaults to 1 (no cross-validation).
    n_jobs : int
        The number of processes (=number of CPU cores) to use. Specify -1 to use all
        available cores. Defaults to 1.
    verbose : bool
        Whether to display a progress bar. In order for this to work, you need the tqdm
        python module installed. Defaults to False.

    Returns
    -------
    rsa_vals : ndarray, shape ([n_series,] [n_times,] [n_model_rdms])
        The RSA value for each searchlight patch. The leading dimensions are taken
        from the ``shape`` attribute of ``patches``; when ``patches`` has no such
        attribute, the result is flattened to one dimension. When more than one model
        RDM has been supplied, the last dimension will contain the RSA results for
        each model.

    See Also
    --------
    searchlight
    compute_rdm
    rdm_array

    """
    # A fresh dict on every call instead of a mutable default argument
    if data_rdm_params is None:
        data_rdm_params = {}

    if patches is None:
        patches = searchlight(X.shape)  # One big searchlight patch

    # Create folds for cross-validated RDM metrics
    X = create_folds(X, y, n_folds)
    # The data is now folds x items x n_series x n_times

    if isinstance(rdm_model, list):
        rdm_model = [_ensure_condensed(rdm, "rdm_model") for rdm in rdm_model]
    else:
        rdm_model = [_ensure_condensed(rdm_model, "rdm_model")]

    # Masks based on the model RDMs only; the NaN pattern of each data RDM is
    # added per patch below.
    if ignore_nan:
        masks = [~np.isnan(rdm) for rdm in rdm_model]
    else:
        masks = [slice(None)] * len(rdm_model)

    if verbose:
        from tqdm import tqdm

        # Wrapping the patches in tqdm hides their ``shape`` attribute, which
        # is needed further down to shape the result, so carry it over.
        shape = getattr(patches, "shape", (-1,))
        patches = tqdm(patches, unit="patch")
        try:
            setattr(patches, "shape", shape)
        except AttributeError:
            pass

    def rsa_single_patch(patch):
        """Compute RSA for a single searchlight patch."""
        if len(X) == 1:  # Check number of folds
            # No cross-validation
            rdm_data = compute_rdm(X[0][patch], data_rdm_metric, **data_rdm_params)
        else:
            # Use cross-validation
            rdm_data = compute_rdm_cv(
                X[(slice(None),) + patch], data_rdm_metric, **data_rdm_params
            )
        if ignore_nan:
            data_mask = ~np.isnan(rdm_data)
            patch_masks = [m & data_mask for m in masks]
        else:
            patch_masks = masks
        return _rsa_single_rdm(rdm_data, rdm_model, rsa_metric, patch_masks)

    # Call RSA multiple times in parallel for each searchlight patch
    data = Parallel(n_jobs=n_jobs)(
        delayed(rsa_single_patch)(patch) for patch in patches
    )

    # Figure out the desired dimensions of the resulting array
    dims = getattr(patches, "shape", (-1,))
    if len(rdm_model) > 1:
        dims = dims + (len(rdm_model),)

    return np.array(data).reshape(dims)