Source code for crema.dataset

"""The :py:class:`PsmDataset` class is used to define a collection of
peptide-spectrum matches.
"""
import logging

from .confidence import TdcConfidence
from .confidence import MixmaxConfidence
from .qvalues import tdc
from .utils import listify

LOGGER = logging.getLogger(__name__)


[docs]class PsmDataset:
    """Store a collection of peptide-spectrum matches (PSMs).

    Parameters
    ----------
    psms : pandas.DataFrame
        A :py:class:`pandas.DataFrame` of PSMs.
    target_column : str
        The column that indicates whether a PSM is a target or a decoy. This
        column should be boolean, where :code:`True` indicates a target and
        :code:`False` indicates a decoy.
    spectrum_columns : str or tuple of str
        One or more columns that together define a unique mass spectrum.
    score_columns : str or tuple of str, optional
        One or more columns that indicate scores by which crema can rank PSMs.
    peptide_column : str
        The column that defines a unique peptide. Modifications should be
        indicated either in square brackets :code:`[]` or parentheses
        :code:`()`. The exact modification format within these entities does
        not matter, so long as it is consistent.
    protein_columns : str
        The column that defines a unique protein.
    protein_delim : str
        The string delimiter that is needed to separate multiple proteins found
        in the protein column.
    peptide_pairing: dict
        A map of target and decoy peptide sequence pairings to be used for TDC.
        This should be in the form {key=target_sequence:value=decoy_sequence}
        where decoy sequences are shuffled versions of target sequences.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory, but
        is safer because it prevents accidental modification of the underlying
        data. This argument only has an effect when `pin_files` is a
        :py:class:`pandas.DataFrame`

    Attributes
    ----------
    data : pandas.DataFrame
    spectrum_columns : list of str
    score_columns : list of str
    target_column : str
    peptide_column : str
    protein_column : str
    protein_delim : str
    methods : dict
    peptide_pairing : dict
    """

    methods = {"tdc": TdcConfidence, "mixmax": MixmaxConfidence}

    def __init__(
        self,
        psms,
        target_column,
        spectrum_columns,
        score_columns,
        peptide_column,
        protein_column,
        protein_delim,
        peptide_pairing=None,
        copy_data=True,
    ):
        """Initialize a PsmDataset object."""
        self.score_columns = listify(score_columns)
        self._spectrum_columns = listify(spectrum_columns)
        self._target_column = target_column
        self._peptide_column = peptide_column
        self._protein_column = protein_column
        self._protein_delim = protein_delim
        self._peptide_pairing = peptide_pairing

        fields = sum(
            [
                self._spectrum_columns,
                self.score_columns,
                [self._target_column],
                [self._peptide_column],
                [self._protein_column],
            ],
            [],
        )
        self._data = psms.copy(deep=copy_data).loc[:, fields]
        self._data[target_column] = self._data[target_column].astype(bool)
        self._num_targets = self.targets.sum()
        self._num_decoys = (~self.targets).sum()

        if self.data.empty:
            raise ValueError("No PSMs were detected.")

        if not self._num_decoys:
            raise ValueError("No decoy PSMs were detected.")

        if not self._num_targets:
            raise ValueError("No target PSMs were detected.")

    @property
    def columns(self):
        """The columns of the PSM :py:class:`pandas.DataFrame`"""
        return self._data.columns

    @property
    def data(self):
        """The collection of PSMs as a :py:class:`pandas.DataFrame`."""
        return self._data.copy()

    @property
    def spectra(self):
        """The mass spectrum identifiers as a :py:class:`pandas.DataFrame`."""
        return self[self._spectrum_columns]

    @property
    def peptides(self):
        """The peptides as a :py:class:`pandas.Series`."""
        return self[self._peptide_column]

    @property
    def proteins(self):
        """The proteins as a :py:class:`pandas.Series`."""
        return self[self._protein_column]

    @property
    def protein_delim(self):
        """The delimiter to split protein IDs as a string."""
        return self[self._protein_delim]

    @property
    def scores(self):
        """The scores for each PSM as a :py:class:`pandas.DataFrame`."""
        return self[self.score_columns]

    @property
    def targets(self):
        """An array indicating whether each PSM is a target"""
        return self[self._target_column].values

    @property
    def peptide_pairing(self):
        """A dictionary containing target/decoy peptide pairs"""
        return self._peptide_pairing

    def __getitem__(self, column):
        """Return the specified column"""
        return self._data.loc[:, column]

[docs]    def assign_confidence(
        self,
        score_column=None,
        desc=None,
        eval_fdr=0.01,
        method="tdc",
        pep_fdr_type="psm-peptide",
        prot_fdr_type="best",
        threshold=0.01,
    ):
        """Assign confidence estimates to this collection of peptide-spectrum matches.

        Parameters
        ----------
        score_column : str, optional
            The score by which to rank the PSMs for confidence estimation. If
            :code:`None`, the score that yields the most PSMs at the specified
            false discovery rate threshold (`eval_fdr`), will be used.
        desc : bool, optional
            True if higher scores better, False if lower scores are better.
            If None, crema will try both and use the
            choice that yields the most PSMs at the specified false discovery
            rate threshold (`eval_fdr`). If `score_column` is :code:`None`,
            this parameter is ignored.
        eval_fdr : float, optional
            The false discovery rate threshold used to evaluate the best
            `score_column` and `desc` to choose. This should range from 0 to 1.
        method : {"tdc"}, optional
            The method for crema to use when calculating the confidence estimates.
        pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional
            The method for crema to use when calculating peptide level confidence
            estimates. Default is "psm-peptide".
        prot_fdr_type : {"best", "combine"}, optional
            The method for crema to use when calculating protein level confidence
            estimates. Default is "best".
        threshold : float or "q-value", optional
            The FDR threshold for accepting discoveries. Default is 0.01. If
            "q-value" is chosen, then "accept" column is replaced with
            "crema q-value".

        Returns
        -------
        Confidence object
            The confidence estimates for this PsmDataset.
        """
        if score_column is None:
            score_column, _, desc = self.find_best_score(eval_fdr)

        conf = self.methods[method](
            psms=self,
            score_column=score_column,
            desc=desc,
            eval_fdr=eval_fdr,
            pep_fdr_type=pep_fdr_type,
            prot_fdr_type=prot_fdr_type,
            threshold=threshold,
        )

        return conf

[docs]    def find_best_score(self, eval_fdr=0.01):
        """Find the best score for this collection of PSMs

        Try each of the available score columns, determining how many PSMs
        are detected below the provided false discovery rate threshold. The
        best score is the one that returns the most.

        Parameters
        ----------
        eval fdr : float
            The false discovery rate threshold used to find the best score.

        Returns
        -------
        best_score : str
            The best score.
        n_passing : int
            The number of PSMs that meet the specified FDR threshold.
        desc : bool
            True if higher scores better, False if lower scores are better.
        """
        best_score = None
        best_passing = 0
        for desc in (True, False):
            qvals = self.scores.apply(tdc, target=self.targets, desc=desc)
            num_passing = (qvals <= eval_fdr).sum()
            feat_idx = num_passing.idxmax()
            num_passing = num_passing[feat_idx]
            if num_passing > best_passing:
                best_passing = num_passing
                best_score = feat_idx
                best_desc = desc

        if best_score is None:
            raise RuntimeError("No PSMs were found below the 'eval_fdr'.")

        return best_score, best_passing, best_desc

[docs]    def set_protein_column(self, new_protein_column):
        """Replaces current protein column with input protein column

        Parameters
        ----------
        new_protein_column : pandas.Series

        Returns
        -------

        """
        self._data[self._protein_column] = new_protein_column
        return

[docs]    def set_peptide_column(self, new_peptide_column):
        """Replaces current peptide column with input peptide column

        Parameters
        ----------
        new_peptide_column : pandas.Series

        Returns
        -------

        """
        self._data[self._peptide_column] = new_peptide_column
        return