Source code for crema.dataset

"""The :py:class:`PsmDataset` class is used to define a collection of
peptide-spectrum matches.
"""
import logging

from .confidence import TdcConfidence
from .confidence import MixmaxConfidence
from .qvalues import tdc
from .utils import listify

# Module-level logger, named after this module's import path.
LOGGER = logging.getLogger(__name__)


class PsmDataset:
    """Store a collection of peptide-spectrum matches (PSMs).

    Parameters
    ----------
    psms : pandas.DataFrame
        A :py:class:`pandas.DataFrame` of PSMs.
    target_column : str
        The column that indicates whether a PSM is a target or a decoy.
        This column should be boolean, where :code:`True` indicates a
        target and :code:`False` indicates a decoy.
    spectrum_columns : str or tuple of str
        One or more columns that together define a unique mass spectrum.
    score_columns : str or tuple of str
        One or more columns that indicate scores by which crema can rank
        PSMs.
    peptide_column : str
        The column that defines a unique peptide. Modifications should be
        indicated either in square brackets :code:`[]` or parentheses
        :code:`()`. The exact modification format within these entities
        does not matter, so long as it is consistent.
    protein_column : str
        The column that defines a unique protein.
    protein_delim : str
        The string delimiter that is needed to separate multiple proteins
        found in the protein column.
    peptide_pairing : dict, optional
        A map of target and decoy peptide sequence pairings to be used for
        TDC. This should be in the form
        {key=target_sequence:value=decoy_sequence} where decoy sequences
        are shuffled versions of target sequences.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more
        memory, but is safer because it prevents accidental modification
        of the underlying data. It is forwarded as the `deep` argument of
        :py:meth:`pandas.DataFrame.copy` on `psms`.

    Attributes
    ----------
    columns : pandas.Index
    data : pandas.DataFrame
    spectra : pandas.DataFrame
    peptides : pandas.Series
    proteins : pandas.Series
    protein_delim : str
    scores : pandas.DataFrame
    targets : numpy.ndarray
    score_columns : list of str
    methods : dict
    peptide_pairing : dict

    Raises
    ------
    ValueError
        If `psms` contains no PSMs, no decoy PSMs, or no target PSMs.
    """

    # Maps the `method` argument of `assign_confidence` to the class that
    # is instantiated for it.
    methods = {"tdc": TdcConfidence, "mixmax": MixmaxConfidence}

    def __init__(
        self,
        psms,
        target_column,
        spectrum_columns,
        score_columns,
        peptide_column,
        protein_column,
        protein_delim,
        peptide_pairing=None,
        copy_data=True,
    ):
        """Initialize a PsmDataset object."""
        self.score_columns = listify(score_columns)
        self._spectrum_columns = listify(spectrum_columns)
        self._target_column = target_column
        self._peptide_column = peptide_column
        self._protein_column = protein_column
        self._protein_delim = protein_delim
        self._peptide_pairing = peptide_pairing

        # Only the columns named by the caller are retained, in this order.
        fields = sum(
            [
                self._spectrum_columns,
                self.score_columns,
                [self._target_column],
                [self._peptide_column],
                [self._protein_column],
            ],
            [],
        )

        self._data = psms.copy(deep=copy_data).loc[:, fields]
        self._data[target_column] = self._data[target_column].astype(bool)
        self._num_targets = self.targets.sum()
        self._num_decoys = (~self.targets).sum()

        # Check the private frame directly; the `data` property would copy
        # the whole table just to test for emptiness.
        if self._data.empty:
            raise ValueError("No PSMs were detected.")

        if not self._num_decoys:
            raise ValueError("No decoy PSMs were detected.")

        if not self._num_targets:
            raise ValueError("No target PSMs were detected.")

    @property
    def columns(self):
        """The columns of the PSM :py:class:`pandas.DataFrame`"""
        return self._data.columns

    @property
    def data(self):
        """The collection of PSMs as a :py:class:`pandas.DataFrame`.

        A copy is returned, so modifying it does not alter the dataset.
        """
        return self._data.copy()

    @property
    def spectra(self):
        """The mass spectrum identifiers as a :py:class:`pandas.DataFrame`."""
        return self[self._spectrum_columns]

    @property
    def peptides(self):
        """The peptides as a :py:class:`pandas.Series`."""
        return self[self._peptide_column]

    @property
    def proteins(self):
        """The proteins as a :py:class:`pandas.Series`."""
        return self[self._protein_column]

    @property
    def protein_delim(self):
        """The delimiter to split protein IDs as a string."""
        # The delimiter is a plain string, not a column name, so it must
        # not be looked up in the underlying DataFrame.
        return self._protein_delim

    @property
    def scores(self):
        """The scores for each PSM as a :py:class:`pandas.DataFrame`."""
        return self[self.score_columns]

    @property
    def targets(self):
        """An array indicating whether each PSM is a target"""
        return self[self._target_column].values

    @property
    def peptide_pairing(self):
        """A dictionary containing target/decoy peptide pairs"""
        return self._peptide_pairing

    def __getitem__(self, column):
        """Return the specified column(s) of the underlying DataFrame.

        Parameters
        ----------
        column : str or list of str
            The column label, or a list of column labels, to select.

        Returns
        -------
        pandas.Series or pandas.DataFrame
            The result of ``.loc[:, column]`` on the stored PSMs.
        """
        return self._data.loc[:, column]

    def assign_confidence(
        self,
        score_column=None,
        desc=None,
        eval_fdr=0.01,
        method="tdc",
        pep_fdr_type="psm-peptide",
        prot_fdr_type="best",
        threshold=0.01,
    ):
        """Assign confidence estimates to this collection of
        peptide-spectrum matches.

        Parameters
        ----------
        score_column : str, optional
            The score by which to rank the PSMs for confidence estimation.
            If :code:`None`, the score that yields the most PSMs at the
            specified false discovery rate threshold (`eval_fdr`), will be
            used.
        desc : bool, optional
            True if higher scores better, False if lower scores are
            better. If `score_column` is :code:`None`, this parameter is
            ignored and replaced by the direction found by
            :py:meth:`find_best_score`. Otherwise it is passed unchanged
            to the confidence class selected by `method`.
        eval_fdr : float, optional
            The false discovery rate threshold used to evaluate the best
            `score_column` and `desc` to choose. This should range from 0
            to 1.
        method : {"tdc", "mixmax"}, optional
            The method for crema to use when calculating the confidence
            estimates. Must be a key of :py:attr:`methods`.
        pep_fdr_type : {"psm-only", "peptide-only", "psm-peptide"}, optional
            The method for crema to use when calculating peptide level
            confidence estimates. Default is "psm-peptide".
        prot_fdr_type : {"best", "combine"}, optional
            The method for crema to use when calculating protein level
            confidence estimates. Default is "best".
        threshold : float or "q-value", optional
            The FDR threshold for accepting discoveries. Default is 0.01.
            If "q-value" is chosen, then "accept" column is replaced with
            "crema q-value".

        Returns
        -------
        Confidence object
            The confidence estimates for this PsmDataset.

        Raises
        ------
        KeyError
            If `method` is not a key of :py:attr:`methods`.
        RuntimeError
            If `score_column` is :code:`None` and no score yields any PSMs
            at `eval_fdr`.
        """
        if score_column is None:
            score_column, _, desc = self.find_best_score(eval_fdr)

        conf = self.methods[method](
            psms=self,
            score_column=score_column,
            desc=desc,
            eval_fdr=eval_fdr,
            pep_fdr_type=pep_fdr_type,
            prot_fdr_type=prot_fdr_type,
            threshold=threshold,
        )

        return conf

    def find_best_score(self, eval_fdr=0.01):
        """Find the best score for this collection of PSMs

        Try each of the available score columns, determining how many PSMs
        are detected below the provided false discovery rate threshold.
        The best score is the one that returns the most.

        Parameters
        ----------
        eval_fdr : float
            The false discovery rate threshold used to find the best
            score.

        Returns
        -------
        best_score : str
            The best score.
        n_passing : int
            The number of PSMs that meet the specified FDR threshold.
        desc : bool
            True if higher scores better, False if lower scores are
            better.

        Raises
        ------
        RuntimeError
            If no score column, in either direction, yields any PSMs at
            or below `eval_fdr`.
        """
        best_score = None
        best_desc = None
        best_passing = 0

        # Both ranking directions are tried. The comparison below is
        # strict, so on a tie the earlier candidate (desc=True) is kept.
        for desc in (True, False):
            qvals = self.scores.apply(tdc, target=self.targets, desc=desc)
            num_passing = (qvals <= eval_fdr).sum()
            feat_idx = num_passing.idxmax()
            num_passing = num_passing[feat_idx]

            if num_passing > best_passing:
                best_passing = num_passing
                best_score = feat_idx
                best_desc = desc

        if best_score is None:
            raise RuntimeError("No PSMs were found below the 'eval_fdr'.")

        return best_score, best_passing, best_desc

    def set_protein_column(self, new_protein_column):
        """Replaces current protein column with input protein column

        The stored data is modified in place.

        Parameters
        ----------
        new_protein_column : pandas.Series
            The values to store in the protein column.

        Returns
        -------
        None
        """
        self._data[self._protein_column] = new_protein_column

    def set_peptide_column(self, new_peptide_column):
        """Replaces current peptide column with input peptide column

        The stored data is modified in place.

        Parameters
        ----------
        new_peptide_column : pandas.Series
            The values to store in the peptide column.

        Returns
        -------
        None
        """
        self._data[self._peptide_column] = new_peptide_column