Source code for crema.confidence

"""The :py:class:`Confidence` class is used to define a collection of
peptide-spectrum matches with calculated false discovery rates (FDR) and q-values.
"""
import logging
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod

from . import qvalues
from . import utils
from .writers.txt import to_txt

np.random.seed(0)

LOGGER = logging.getLogger(__name__)


[docs]def assign_confidence( psms, score_column=None, desc=None, eval_fdr=0.01, method="tdc", pep_fdr_type="psm-peptide", prot_fdr_type="best", threshold=0.01, ): """Assign confidence estimates to a collection of peptide-spectrum matches. Parameters ---------- psms : PsmDataset or list of PsmDataset objects The collections of PSMs score_column : str, optional The score by which to rank the PSMs for confidence estimation. If :code:`None`, the score that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`), will be used. desc : bool, optional True if higher scores better, False if lower scores are better. If None, crema will try both and use the choice that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`). If `score_column` is :code:`None`, this parameter is ignored. eval_fdr : float, optional The false discovery rate threshold used to evaluate the best `score_column` and `desc` to choose. This should range from 0 to 1. method : {"tdc"}, optional The method for crema to use when calculating the confidence estimates. pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional The method for Crema to use when calculating peptide level confidence estimates. prot_fdr_type : {"best", "combine"}, optional The method for crema to use when calculating protein level confidence estimates. Default is "best". threshold : float or "q-value", optional The FDR threshold for accepting discoveries. Default is 0.01. If "q-value" is chosen, then "accept" column is replaced with "crema q-value". Returns ------- Confidence object or List of Confidence objects The confidence estimates for each PsmDataset. """ if isinstance(psms, str): raise ValueError("'psms' should be a PsmDataset object, not a string.") # TODO unable to check whether psms is type PsmDataset w/o circular import try: assert isinstance(psms, list) except (AssertionError, TypeError): psms = [psms] confs = [] for dset in psms: conf = dset.assign_confidence( score_column=score_column, desc=desc, eval_fdr=eval_fdr, method=method, pep_fdr_type=pep_fdr_type, prot_fdr_type=prot_fdr_type, threshold=threshold, ) confs.append(conf) if len(confs) == 1: return confs[0] return confs
class Confidence(ABC): """Estimate statistical confidence estimates for a collection of PSMs. This should be a good base class for most of the methods, but not one directly called by a user. Parameters ---------- psms : a PsmDataset object A collection of PSMs score_column : str, optional The score by which to rank the PSMs for confidence estimation. If :code:`None`, the score that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`), will be used. desc : bool, optional True if higher scores better, False if lower scores are better. If None, crema will try both and use the choice that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`). If `score_column` is :code:`None`, this parameter is ignored. eval_fdr : float, optional The false discovery rate threshold used to evaluate the best `score_column` and `desc` to choose. This should range from 0 to 1. pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional The method for Crema to use when calculating peptide level confidence estimates. threshold : float or "q-value", optional The FDR threshold for accepting discoveries. Default is 0.01. If "q-value" is chosen, then "accept" column is replaced with "crema q-value". Attributes ---------- data : pandas.DataFrame dataset : crema.PsmDataset levels : list of str confidence_estimates : Dict A dictionary containing the confidence estimates at each level, each as a :py:class:`pandas.DataFrame`. decoy_confidence_estimates : Dict A dictionary containing the confidence estimates for the decoy hits at each level, each as a :py:class:`pandas.DataFrame` :meta private: """ _level_labs = { "psms": "PSMs", "peptides": "Peptides", "proteins": "Proteins", } @abstractmethod def _assign_confidence(self): """How should confidence estimates be assigned? Each method will have its own way of doing this. The results should be saved in the `confidence_estimates` and `decoy_confidence_estimates` attributes. """ pass def __init__( self, psms, score_column, desc=None, eval_fdr=0.01, pep_fdr_type="psm-peptide", prot_fdr_type="best", threshold=0.01, ): """Initialize a Confidence object.""" if eval_fdr < 0 or eval_fdr > 1: raise ValueError("'eval_fdr' should be between 0 and 1.") if ( pep_fdr_type != "psm-only" and pep_fdr_type != "peptide-only" and pep_fdr_type != "psm-peptide" ): raise ValueError( "'pep_fdr_type' should be 'psm-only','peptide-only', or 'psm-peptide'" ) pep_fdr_type_option = ["psm-only", "peptide-only", "psm-peptide"] if pep_fdr_type not in pep_fdr_type_option: raise ValueError("%s not valid pep_fdr_type" % (pep_fdr_type)) prot_fdr_type_option = ["best", "combine"] if prot_fdr_type not in prot_fdr_type_option: raise ValueError("%s not valid prot_fdr_type" % (prot_fdr_type)) if desc is None: scores, targ = psms[score_column], psms.targets t_pass = (qvalues.tdc(scores, targ, desc=True) <= eval_fdr).sum() f_pass = (qvalues.tdc(scores, targ, desc=False) <= eval_fdr).sum() desc = t_pass > f_pass self._dataset = psms self._data = psms.data self._score_column = score_column self._desc = desc self._eval_fdr = eval_fdr self._levels = ("psms", "peptides", "proteins") self._level_columns = ( self.dataset._spectrum_columns, self.dataset._peptide_column, self.dataset._protein_column, ) self._pep_fdr_type = pep_fdr_type self._prot_fdr_type = prot_fdr_type self.confidence_estimates = {} self.decoy_confidence_estimates = {} # Assign confidence estimates self._assign_confidence() # Clean up tables self._prettify_tables(threshold) @property def data(self): """The collection of PSMs as a :py:class:`pandas.DataFrame`.""" return self._data @property def dataset(self): """The underlying :py:class:`~crema.dataset.PsmDataset`""" return self._dataset @property def levels(self): """The available levels of confidence estimates""" return self._levels def _prettify_tables(self, threshold): """Reorder the columns of the result tables for consistency Parameters ---------- threshold : float or "q-value", optional The FDR threshold for accepting discoveries. Default is 0.01. If "q-value" is chosen, then "accept" column is replaced with "crema q-value". """ if threshold != "q-value": last_col = "accept" else: last_col = "crema q-value" cols = [ *self.dataset._spectrum_columns, self.dataset._peptide_column, self.dataset._protein_column, self._score_column, ] prot_cols = [ self.dataset._protein_column, self._score_column, ] cols.append(last_col) prot_cols.append(last_col) for level, df in self.confidence_estimates.items(): # use 'accept' column if threshold != 'q-value' if threshold != "q-value": df[last_col] = df["crema q-value"] <= threshold # reverse order so best score is begining of df df = df.iloc[::-1] if level != "proteins": self.confidence_estimates[level] = df.loc[:, cols] elif level == "proteins": self.confidence_estimates[level] = df.loc[:, prot_cols] # Remove q-value column for decoy files cols.pop() prot_cols.pop() for level, df in self.decoy_confidence_estimates.items(): if level != "proteins": self.decoy_confidence_estimates[level] = df.loc[:, cols] elif level == "proteins": self.decoy_confidence_estimates[level] = df.loc[:, prot_cols] def _compete(self, df, group_columns): """Perform target-decoy competition For each group defined by `group_columns`, keep only the element with the highest score. Parameters ---------- df : panda.DataFrame The DataFrame on which to perform the competition. group_columns: str or list of str The columns that define a group. The best score is retained within the group. Returns ------- pandas.DataFrame A :py:class:`pandas.DataFrame` containing only rows that won the competition. """ if self._desc: keep = "last" else: keep = "first" group_columns = utils.listify(group_columns) # Shuffle dataframe so ties are broken randomly. out_df = ( df.sample(frac=1) .sort_values([self._score_column] + group_columns) .drop_duplicates(group_columns, keep=keep) ) # This ensures that best score is at top of dataframe if self._desc == False: out_df = out_df[::-1] return out_df def __getitem__(self, column): """Return the specified column""" return self._data.loc[:, column] def __iter__(self): """...but we don't want this class to be an iterable (see PEP234)""" raise TypeError def to_txt(self, output_dir=None, file_root=None, sep="\t", decoys=False): """Save confidence estimates to delimited text files. Parameters ---------- output_dir : str or None, optional The directory in which to save the files. `None` will use the current working directory. file_root : str or None, optional An optional prefix for the confidence estimate files. The suffix will always be "crema.{level}.txt", where "{level}" indicates the level at which confidence estimation was performed (i.e. PSMs, peptides, proteins). sep : str, optional The delimiter to use. decoys : bool, optional Save decoys confidence estimates as well? Returns ------- list of str The paths to the saved files. """ return to_txt( self, output_dir=output_dir, file_root=file_root, sep=sep, decoys=decoys, )
[docs]class TdcConfidence(Confidence): """Assign confidence estimates using target decoy competition. Estimates q-values using the target decoy competition method. For set of target and decoy PSMs meeting a specified score threshold, the false discovery rate (FDR) is estimated as: .. math:: FDR = \\frac{Decoys + 1}{Targets} More formally, let the scores of target and decoy PSMs be indicated as :math:`f_1, f_2, ..., f_{m_f}` and :math:`d_1, d_2, ..., d_{m_d}`, respectively. For a score threshold :math:`t`, the false discovery rate is estimated as: .. math:: E\\{FDR(t)\\} = \\frac{|\\{d_i > t; i=1, ..., m_d\\}| + 1} {\\{|f_i > t; i=1, ..., m_f|\\}} The reported q-value for each PSM is the minimum FDR at which that PSM would be accepted. Parameters ---------- psms : a PsmDataset object A collection of PSMs score_column : str, optional The score by which to rank the PSMs for confidence estimation. If :code:`None`, the score that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`), will be used. desc : bool, optional True if higher scores better, False if lower scores are better. If None, crema will try both and use the choice that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`). If `score_column` is :code:`None`, this parameter is ignored. eval_fdr : float, optional The false discovery rate threshold used to evaluate the best `score_column` and `desc` to choose. This should range from 0 to 1. pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional The method for crema to use when calculating peptide level confidence estimates. prot_fdr_type : {"best", "combine"}, optional The method for crema to use when calculating protein level confidence estimates. Default is "best". threshold : float or "q-value", optional The FDR threshold for accepting discoveries. Default is 0.01. If "q-value" is chosen, then "accept" column is replaced with "crema q-value". Attributes ---------- data : pandas.DataFrame dataset : crema.PsmDataset levels : list of str confidence_estimates : Dict A dictionary containing the confidence estimates at each level, each as a :py:class:`pandas.DataFrame`. decoy_confidence_estimates : Dict A dictionary containing the confidence estimates for the decoy hits at each level, each as a :py:class:`pandas.DataFrame` """ def __init__( self, psms, score_column=None, desc=None, eval_fdr=0.01, pep_fdr_type="psm-peptide", prot_fdr_type="best", threshold=0.01, ): """Initialize a TdcConfidence object.""" LOGGER.info( "Assigning confidence estimates using target-decoy competition..." ) super().__init__( psms=psms, score_column=score_column, desc=desc, eval_fdr=eval_fdr, pep_fdr_type=pep_fdr_type, prot_fdr_type=prot_fdr_type, threshold=threshold, ) def _assign_confidence(self): """Assign confidence estimates using target-decoy competition""" pairing = self.dataset.peptide_pairing if pairing == None and self._pep_fdr_type != "psm-only": raise ValueError( "Must provide paired target decoy peptide infomation." ) for level, group_cols in zip(self.levels, self._level_columns): # NOTE line below can removed if psm-only and peptide-only methods are removed df = self.data pair_col = utils.new_column("pairing", df) if level == "peptides": if self._pep_fdr_type == "psm-only": group_cols = utils.listify(group_cols) elif ( self._pep_fdr_type == "peptide-only" or self._pep_fdr_type == "psm-peptide" ): if self._pep_fdr_type == "psm-peptide": df = self._compete(df, self.dataset._spectrum_columns) group_cols = utils.listify(group_cols) # replace sequence with pairing pair_col = utils.new_column("pairing", df) df[pair_col] = df[self.dataset._peptide_column].map( lambda x: pairing.get(x, x) ) group_cols = utils.listify(group_cols) + [pair_col] group_cols.remove(self.dataset._peptide_column) else: raise ValueError( f"'{self._pep_fdr_type}' is not a valid value for " "pep_fdr_type " ) elif level == "proteins": # Perform PSM level FDR df = self._compete(df, self.dataset._spectrum_columns) # Remove peptides found in multiple proteins df = df[ ~df[self.dataset._protein_column].str.contains( self.dataset._protein_delim ) ] # Determines how to aggregate protein score if self._prot_fdr_type == "best" and self._desc == True: # larger score is better agg_val = "max" elif self._prot_fdr_type == "best" and self._desc == False: # smaller score is better agg_val = "min" elif self._prot_fdr_type == "combine" and self._desc == True: agg_val = "sum" elif self._prot_fdr_type == "combine" and self._desc == False: agg_val = "prod" df2 = df.groupby( [ self.dataset._protein_column, self.dataset._target_column, ] ).agg({self._score_column: [agg_val]}) df2 = df2.reset_index() df2.columns = [ self.dataset._protein_column, self.dataset._target_column, self._score_column, ] df = df2 df = self._compete(df, group_cols) targets = df[self.dataset._target_column] # Now calculate q-values: df["crema q-value"] = qvalues.tdc( scores=df[self._score_column], target=targets, desc=self._desc, ) LOGGER.info( " - Found %i %s at q<=%g.", (df[targets]["crema q-value"] <= self._eval_fdr).sum(), self._level_labs[level], self._eval_fdr, ) self.confidence_estimates[level] = df.loc[targets, :] self.decoy_confidence_estimates[level] = df.loc[~targets, :]
[docs]class MixmaxConfidence(Confidence): """Assign confidence estimates using mix-max competition. Estimates qvalues using the mix-max competition method. To use this method a separate target and decoy database search using a calibrated score function must be used. # TODO Describe how mixmax works here Additional details can be found in this manuscript. U. Keich, A. Kertesz-Farkas, and W. S. Noble. Improved false discovery rate estimation procedure for shotgun proteomics. Journal of Proteome Research, 14(8):3148-3161, 2015. Parameters ---------- psms : a PsmDataset object A collection of PSMs score_column : str, optional The score by which to rank the PSMs for confidence estimation. If :code:`None`, the score that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`), will be used. desc : bool, optional True if higher scores better, False if lower scores are better. If None, crema will try both and use the choice that yields the most PSMs at the specified false discovery rate threshold (`eval_fdr`). If `score_column` is :code:`None`, this parameter is ignored. eval_fdr : float, optional The false discovery rate threshold used to evaluate the best `score_column` and `desc` to choose. This should range from 0 to 1. pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional The method for crema to use when calculating peptide level confidence estimates. Default is "psm-peptide". prot_fdr_type : {"best", "combine"}, optional The method for crema to use when calculating protein level confidence estimates. Default is "best". estimates. threshold : float or "q-value", optional The FDR threshold for accepting discoveries. Default is 0.01. If "q-value" is chosen, then "accept" column is replaced with "crema q-value". Attributes ---------- data : pandas.DataFrame dataset : crema.PsmDataset levels : list of str confidence_estimates : Dict A dictionary containing the confidence estimates at each level, each as a :py:class:`pandas.DataFrame`. decoy_confidence_estimates : Dict A dictionary containing the confidence estimates for the decoy hits at each level, each as a :py:class:`pandas.DataFrame` """ def __init__( self, psms, score_column=None, desc=None, eval_fdr=0.01, pep_fdr_type="psm-peptide", prot_fdr_type="best", threshold=0.01, ): """Initialize a TdcConfidence object.""" LOGGER.info( "Assigning confidence estimates using mix-max competition..." ) super().__init__( psms=psms, score_column=score_column, desc=desc, eval_fdr=eval_fdr, pep_fdr_type=pep_fdr_type, prot_fdr_type=prot_fdr_type, ) def _assign_confidence(self): """Assign confidence estimates using target-decoy competition""" # TODO maybe better way to do this # can not infer desc value as the wrong value will # result in a divide by zero error if self._desc == None: raise ValueError("'desc' has to be set for mix-max.") # TODO check if separate target-decoy search is done for level, group_cols in zip(self.levels, self._level_columns): if level == "peptides" or level == "proteins": continue df = self.data group_cols = utils.listify(group_cols) targets = df[df[self.dataset._target_column]] decoys = df[~df[self.dataset._target_column]] # TODO add following warning ""The mix-max procedure is not well behaved when # targets (%d) != # of decoys (%d)."," if self._desc: keep = "last" else: keep = "first" # sort targets by score column and keep top rank targets_sorted = ( targets.sample(frac=1) .sort_values([self._score_column] + group_cols) .drop_duplicates(group_cols, keep=keep, ignore_index=True) ) # sort decoys by score column and keep top rank decoys_sorted = ( decoys.sample(frac=1) .sort_values([self._score_column] + group_cols) .drop_duplicates(group_cols, keep=keep, ignore_index=True) ) # combine top ranked target and decoy into one dataframe combined = pd.concat([targets_sorted, decoys_sorted]) combined_sorted = combined.sample(frac=1).sort_values( [self._score_column], ascending=~self._desc, ignore_index=True ) # qvalues.py::calculate_mixmax_qval expects target scores # and decoy scores to be sorted from worst to best # qvalues.py::mixmax expected combined_sorted scores # to be sorted from best to worst if self._desc: # larger score is better combined_sorted = combined_sorted[::-1] else: # smaller score is better targets_sorted[self._score_column] = ( targets_sorted[self._score_column] * -1.0 ) decoys_sorted[self._score_column] = ( decoys_sorted[self._score_column] * -1.0 ) targets_sorted = targets_sorted[::-1] decoys_sorted = decoys_sorted[::-1] # Now calculate q-values: pi0, targets_sorted["crema q-value"] = qvalues.mixmax( target_scores=targets_sorted[self._score_column], decoy_scores=decoys_sorted[self._score_column], combined_score=combined_sorted[self._score_column], combined_score_target=combined_sorted[ self.dataset._target_column ], ) LOGGER.info(" - Estimated pi_zero = %f.", pi0) LOGGER.info( " - Found %i %s at q<=%g.", (targets_sorted["crema q-value"] <= self._eval_fdr).sum(), self._level_labs[level], self._eval_fdr, ) # reverse rows so that best score is at top targets_sorted = targets_sorted[::-1] # undo previous multipliation by -1.0 if self._desc == False: targets_sorted[self._score_column] = ( targets_sorted[self._score_column] * -1.0 ) self.confidence_estimates[level] = targets_sorted