"""The :py:class:`Confidence` class is used to define a collection of
peptide-spectrum matches with calculated false discovery rates (FDR) and q-values.
"""
import logging
import numpy as np
import pandas as pd
from abc import ABC, abstractmethod
from . import qvalues
from . import utils
from .writers.txt import to_txt
np.random.seed(0)
LOGGER = logging.getLogger(__name__)
[docs]def assign_confidence(
psms,
score_column=None,
desc=None,
eval_fdr=0.01,
method="tdc",
pep_fdr_type="psm-peptide",
prot_fdr_type="best",
threshold=0.01,
):
"""Assign confidence estimates to a collection of peptide-spectrum matches.
Parameters
----------
psms : PsmDataset or list of PsmDataset objects
The collections of PSMs
score_column : str, optional
The score by which to rank the PSMs for confidence estimation. If
:code:`None`, the score that yields the most PSMs at the specified
false discovery rate threshold (`eval_fdr`), will be used.
desc : bool, optional
True if higher scores better, False if lower scores are better.
If None, crema will try both and use the
choice that yields the most PSMs at the specified false discovery
rate threshold (`eval_fdr`). If `score_column` is :code:`None`,
this parameter is ignored.
eval_fdr : float, optional
The false discovery rate threshold used to evaluate the best
`score_column` and `desc` to choose. This should range from 0 to 1.
method : {"tdc"}, optional
The method for crema to use when calculating the confidence estimates.
pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional
The method for Crema to use when calculating peptide level confidence
estimates.
prot_fdr_type : {"best", "combine"}, optional
The method for crema to use when calculating protein level confidence
estimates. Default is "best".
threshold : float or "q-value", optional
The FDR threshold for accepting discoveries. Default is 0.01. If
"q-value" is chosen, then "accept" column is replaced with
"crema q-value".
Returns
-------
Confidence object or List of Confidence objects
The confidence estimates for each PsmDataset.
"""
if isinstance(psms, str):
raise ValueError("'psms' should be a PsmDataset object, not a string.")
# TODO unable to check whether psms is type PsmDataset w/o circular import
try:
assert isinstance(psms, list)
except (AssertionError, TypeError):
psms = [psms]
confs = []
for dset in psms:
conf = dset.assign_confidence(
score_column=score_column,
desc=desc,
eval_fdr=eval_fdr,
method=method,
pep_fdr_type=pep_fdr_type,
prot_fdr_type=prot_fdr_type,
threshold=threshold,
)
confs.append(conf)
if len(confs) == 1:
return confs[0]
return confs
class Confidence(ABC):
"""Estimate statistical confidence estimates for a collection of PSMs.
This should be a good base class for most of the methods, but not one
directly called by a user.
Parameters
----------
psms : a PsmDataset object
A collection of PSMs
score_column : str, optional
The score by which to rank the PSMs for confidence estimation. If
:code:`None`, the score that yields the most PSMs at the specified
false discovery rate threshold (`eval_fdr`), will be used.
desc : bool, optional
True if higher scores better, False if lower scores are better.
If None, crema will try both and use the
choice that yields the most PSMs at the specified false discovery
rate threshold (`eval_fdr`). If `score_column` is :code:`None`,
this parameter is ignored.
eval_fdr : float, optional
The false discovery rate threshold used to evaluate the best
`score_column` and `desc` to choose. This should range from 0 to 1.
pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional
The method for Crema to use when calculating peptide level confidence
estimates.
threshold : float or "q-value", optional
The FDR threshold for accepting discoveries. Default is 0.01. If
"q-value" is chosen, then "accept" column is replaced with
"crema q-value".
Attributes
----------
data : pandas.DataFrame
dataset : crema.PsmDataset
levels : list of str
confidence_estimates : Dict
A dictionary containing the confidence estimates at each level, each
as a :py:class:`pandas.DataFrame`.
decoy_confidence_estimates : Dict
A dictionary containing the confidence estimates for the decoy hits at
each level, each as a :py:class:`pandas.DataFrame`
:meta private:
"""
_level_labs = {
"psms": "PSMs",
"peptides": "Peptides",
"proteins": "Proteins",
}
@abstractmethod
def _assign_confidence(self):
"""How should confidence estimates be assigned?
Each method will have its own way of doing this. The results
should be saved in the `confidence_estimates` and
`decoy_confidence_estimates` attributes.
"""
pass
def __init__(
self,
psms,
score_column,
desc=None,
eval_fdr=0.01,
pep_fdr_type="psm-peptide",
prot_fdr_type="best",
threshold=0.01,
):
"""Initialize a Confidence object."""
if eval_fdr < 0 or eval_fdr > 1:
raise ValueError("'eval_fdr' should be between 0 and 1.")
if (
pep_fdr_type != "psm-only"
and pep_fdr_type != "peptide-only"
and pep_fdr_type != "psm-peptide"
):
raise ValueError(
"'pep_fdr_type' should be 'psm-only','peptide-only', or 'psm-peptide'"
)
pep_fdr_type_option = ["psm-only", "peptide-only", "psm-peptide"]
if pep_fdr_type not in pep_fdr_type_option:
raise ValueError("%s not valid pep_fdr_type" % (pep_fdr_type))
prot_fdr_type_option = ["best", "combine"]
if prot_fdr_type not in prot_fdr_type_option:
raise ValueError("%s not valid prot_fdr_type" % (prot_fdr_type))
if desc is None:
scores, targ = psms[score_column], psms.targets
t_pass = (qvalues.tdc(scores, targ, desc=True) <= eval_fdr).sum()
f_pass = (qvalues.tdc(scores, targ, desc=False) <= eval_fdr).sum()
desc = t_pass > f_pass
self._dataset = psms
self._data = psms.data
self._score_column = score_column
self._desc = desc
self._eval_fdr = eval_fdr
self._levels = ("psms", "peptides", "proteins")
self._level_columns = (
self.dataset._spectrum_columns,
self.dataset._peptide_column,
self.dataset._protein_column,
)
self._pep_fdr_type = pep_fdr_type
self._prot_fdr_type = prot_fdr_type
self.confidence_estimates = {}
self.decoy_confidence_estimates = {}
# Assign confidence estimates
self._assign_confidence()
# Clean up tables
self._prettify_tables(threshold)
@property
def data(self):
"""The collection of PSMs as a :py:class:`pandas.DataFrame`."""
return self._data
@property
def dataset(self):
"""The underlying :py:class:`~crema.dataset.PsmDataset`"""
return self._dataset
@property
def levels(self):
"""The available levels of confidence estimates"""
return self._levels
def _prettify_tables(self, threshold):
"""Reorder the columns of the result tables for consistency
Parameters
----------
threshold : float or "q-value", optional
The FDR threshold for accepting discoveries. Default is 0.01. If
"q-value" is chosen, then "accept" column is replaced with
"crema q-value".
"""
if threshold != "q-value":
last_col = "accept"
else:
last_col = "crema q-value"
cols = [
*self.dataset._spectrum_columns,
self.dataset._peptide_column,
self.dataset._protein_column,
self._score_column,
]
prot_cols = [
self.dataset._protein_column,
self._score_column,
]
cols.append(last_col)
prot_cols.append(last_col)
for level, df in self.confidence_estimates.items():
# use 'accept' column if threshold != 'q-value'
if threshold != "q-value":
df[last_col] = df["crema q-value"] <= threshold
# reverse order so best score is begining of df
df = df.iloc[::-1]
if level != "proteins":
self.confidence_estimates[level] = df.loc[:, cols]
elif level == "proteins":
self.confidence_estimates[level] = df.loc[:, prot_cols]
# Remove q-value column for decoy files
cols.pop()
prot_cols.pop()
for level, df in self.decoy_confidence_estimates.items():
if level != "proteins":
self.decoy_confidence_estimates[level] = df.loc[:, cols]
elif level == "proteins":
self.decoy_confidence_estimates[level] = df.loc[:, prot_cols]
def _compete(self, df, group_columns):
"""Perform target-decoy competition
For each group defined by `group_columns`, keep only the element
with the highest score.
Parameters
----------
df : panda.DataFrame
The DataFrame on which to perform the competition.
group_columns: str or list of str
The columns that define a group. The best score is retained
within the group.
Returns
-------
pandas.DataFrame
A :py:class:`pandas.DataFrame` containing only rows that won the
competition.
"""
if self._desc:
keep = "last"
else:
keep = "first"
group_columns = utils.listify(group_columns)
# Shuffle dataframe so ties are broken randomly.
out_df = (
df.sample(frac=1)
.sort_values([self._score_column] + group_columns)
.drop_duplicates(group_columns, keep=keep)
)
# This ensures that best score is at top of dataframe
if self._desc == False:
out_df = out_df[::-1]
return out_df
def __getitem__(self, column):
"""Return the specified column"""
return self._data.loc[:, column]
def __iter__(self):
"""...but we don't want this class to be an iterable (see PEP234)"""
raise TypeError
def to_txt(self, output_dir=None, file_root=None, sep="\t", decoys=False):
"""Save confidence estimates to delimited text files.
Parameters
----------
output_dir : str or None, optional
The directory in which to save the files. `None` will use the
current working directory.
file_root : str or None, optional
An optional prefix for the confidence estimate files. The suffix
will always be "crema.{level}.txt", where "{level}" indicates the
level at which confidence estimation was performed (i.e. PSMs,
peptides, proteins).
sep : str, optional
The delimiter to use.
decoys : bool, optional
Save decoys confidence estimates as well?
Returns
-------
list of str
The paths to the saved files.
"""
return to_txt(
self,
output_dir=output_dir,
file_root=file_root,
sep=sep,
decoys=decoys,
)
[docs]class TdcConfidence(Confidence):
"""Assign confidence estimates using target decoy competition.
Estimates q-values using the target decoy competition method. For set of
target and decoy PSMs meeting a specified score threshold, the false
discovery rate (FDR) is estimated as:
.. math::
FDR = \\frac{Decoys + 1}{Targets}
More formally, let the scores of target and decoy PSMs be indicated as
:math:`f_1, f_2, ..., f_{m_f}` and :math:`d_1, d_2, ..., d_{m_d}`,
respectively. For a score threshold :math:`t`, the false discovery
rate is estimated as:
.. math::
E\\{FDR(t)\\} = \\frac{|\\{d_i > t; i=1, ..., m_d\\}| + 1}
{\\{|f_i > t; i=1, ..., m_f|\\}}
The reported q-value for each PSM is the minimum FDR at which that
PSM would be accepted.
Parameters
----------
psms : a PsmDataset object
A collection of PSMs
score_column : str, optional
The score by which to rank the PSMs for confidence estimation. If
:code:`None`, the score that yields the most PSMs at the specified
false discovery rate threshold (`eval_fdr`), will be used.
desc : bool, optional
True if higher scores better, False if lower scores are better.
If None, crema will try both and use the
choice that yields the most PSMs at the specified false discovery
rate threshold (`eval_fdr`). If `score_column` is :code:`None`,
this parameter is ignored.
eval_fdr : float, optional
The false discovery rate threshold used to evaluate the best
`score_column` and `desc` to choose. This should range from 0 to 1.
pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional
The method for crema to use when calculating peptide level confidence
estimates.
prot_fdr_type : {"best", "combine"}, optional
The method for crema to use when calculating protein level confidence
estimates. Default is "best".
threshold : float or "q-value", optional
The FDR threshold for accepting discoveries. Default is 0.01. If
"q-value" is chosen, then "accept" column is replaced with
"crema q-value".
Attributes
----------
data : pandas.DataFrame
dataset : crema.PsmDataset
levels : list of str
confidence_estimates : Dict
A dictionary containing the confidence estimates at each level, each
as a :py:class:`pandas.DataFrame`.
decoy_confidence_estimates : Dict
A dictionary containing the confidence estimates for the decoy hits at
each level, each as a :py:class:`pandas.DataFrame`
"""
def __init__(
self,
psms,
score_column=None,
desc=None,
eval_fdr=0.01,
pep_fdr_type="psm-peptide",
prot_fdr_type="best",
threshold=0.01,
):
"""Initialize a TdcConfidence object."""
LOGGER.info(
"Assigning confidence estimates using target-decoy competition..."
)
super().__init__(
psms=psms,
score_column=score_column,
desc=desc,
eval_fdr=eval_fdr,
pep_fdr_type=pep_fdr_type,
prot_fdr_type=prot_fdr_type,
threshold=threshold,
)
def _assign_confidence(self):
"""Assign confidence estimates using target-decoy competition"""
pairing = self.dataset.peptide_pairing
if pairing == None and self._pep_fdr_type != "psm-only":
raise ValueError(
"Must provide paired target decoy peptide infomation."
)
for level, group_cols in zip(self.levels, self._level_columns):
# NOTE line below can removed if psm-only and peptide-only methods are removed
df = self.data
pair_col = utils.new_column("pairing", df)
if level == "peptides":
if self._pep_fdr_type == "psm-only":
group_cols = utils.listify(group_cols)
elif (
self._pep_fdr_type == "peptide-only"
or self._pep_fdr_type == "psm-peptide"
):
if self._pep_fdr_type == "psm-peptide":
df = self._compete(df, self.dataset._spectrum_columns)
group_cols = utils.listify(group_cols)
# replace sequence with pairing
pair_col = utils.new_column("pairing", df)
df[pair_col] = df[self.dataset._peptide_column].map(
lambda x: pairing.get(x, x)
)
group_cols = utils.listify(group_cols) + [pair_col]
group_cols.remove(self.dataset._peptide_column)
else:
raise ValueError(
f"'{self._pep_fdr_type}' is not a valid value for "
"pep_fdr_type "
)
elif level == "proteins":
# Perform PSM level FDR
df = self._compete(df, self.dataset._spectrum_columns)
# Remove peptides found in multiple proteins
df = df[
~df[self.dataset._protein_column].str.contains(
self.dataset._protein_delim
)
]
# Determines how to aggregate protein score
if self._prot_fdr_type == "best" and self._desc == True:
# larger score is better
agg_val = "max"
elif self._prot_fdr_type == "best" and self._desc == False:
# smaller score is better
agg_val = "min"
elif self._prot_fdr_type == "combine" and self._desc == True:
agg_val = "sum"
elif self._prot_fdr_type == "combine" and self._desc == False:
agg_val = "prod"
df2 = df.groupby(
[
self.dataset._protein_column,
self.dataset._target_column,
]
).agg({self._score_column: [agg_val]})
df2 = df2.reset_index()
df2.columns = [
self.dataset._protein_column,
self.dataset._target_column,
self._score_column,
]
df = df2
df = self._compete(df, group_cols)
targets = df[self.dataset._target_column]
# Now calculate q-values:
df["crema q-value"] = qvalues.tdc(
scores=df[self._score_column],
target=targets,
desc=self._desc,
)
LOGGER.info(
" - Found %i %s at q<=%g.",
(df[targets]["crema q-value"] <= self._eval_fdr).sum(),
self._level_labs[level],
self._eval_fdr,
)
self.confidence_estimates[level] = df.loc[targets, :]
self.decoy_confidence_estimates[level] = df.loc[~targets, :]
[docs]class MixmaxConfidence(Confidence):
"""Assign confidence estimates using mix-max competition.
Estimates qvalues using the mix-max competition method. To use this
method a separate target and decoy database search using a calibrated
score function must be used.
# TODO Describe how mixmax works here
Additional details can be found in this manuscript.
U. Keich, A. Kertesz-Farkas, and W. S. Noble. Improved false
discovery rate estimation procedure for shotgun proteomics.
Journal of Proteome Research, 14(8):3148-3161, 2015.
Parameters
----------
psms : a PsmDataset object
A collection of PSMs
score_column : str, optional
The score by which to rank the PSMs for confidence estimation. If
:code:`None`, the score that yields the most PSMs at the specified
false discovery rate threshold (`eval_fdr`), will be used.
desc : bool, optional
True if higher scores better, False if lower scores are better.
If None, crema will try both and use the
choice that yields the most PSMs at the specified false discovery
rate threshold (`eval_fdr`). If `score_column` is :code:`None`,
this parameter is ignored.
eval_fdr : float, optional
The false discovery rate threshold used to evaluate the best
`score_column` and `desc` to choose. This should range from 0 to 1.
pep_fdr_type : {"psm-only","peptide-only",psm-peptide"}, optional
The method for crema to use when calculating peptide level confidence
estimates. Default is "psm-peptide".
prot_fdr_type : {"best", "combine"}, optional
The method for crema to use when calculating protein level confidence
estimates. Default is "best".
estimates.
threshold : float or "q-value", optional
The FDR threshold for accepting discoveries. Default is 0.01. If
"q-value" is chosen, then "accept" column is replaced with
"crema q-value".
Attributes
----------
data : pandas.DataFrame
dataset : crema.PsmDataset
levels : list of str
confidence_estimates : Dict
A dictionary containing the confidence estimates at each level, each
as a :py:class:`pandas.DataFrame`.
decoy_confidence_estimates : Dict
A dictionary containing the confidence estimates for the decoy hits at
each level, each as a :py:class:`pandas.DataFrame`
"""
def __init__(
self,
psms,
score_column=None,
desc=None,
eval_fdr=0.01,
pep_fdr_type="psm-peptide",
prot_fdr_type="best",
threshold=0.01,
):
"""Initialize a TdcConfidence object."""
LOGGER.info(
"Assigning confidence estimates using mix-max competition..."
)
super().__init__(
psms=psms,
score_column=score_column,
desc=desc,
eval_fdr=eval_fdr,
pep_fdr_type=pep_fdr_type,
prot_fdr_type=prot_fdr_type,
)
def _assign_confidence(self):
"""Assign confidence estimates using target-decoy competition"""
# TODO maybe better way to do this
# can not infer desc value as the wrong value will
# result in a divide by zero error
if self._desc == None:
raise ValueError("'desc' has to be set for mix-max.")
# TODO check if separate target-decoy search is done
for level, group_cols in zip(self.levels, self._level_columns):
if level == "peptides" or level == "proteins":
continue
df = self.data
group_cols = utils.listify(group_cols)
targets = df[df[self.dataset._target_column]]
decoys = df[~df[self.dataset._target_column]]
# TODO add following warning ""The mix-max procedure is not well behaved when # targets (%d) != # of decoys (%d).","
if self._desc:
keep = "last"
else:
keep = "first"
# sort targets by score column and keep top rank
targets_sorted = (
targets.sample(frac=1)
.sort_values([self._score_column] + group_cols)
.drop_duplicates(group_cols, keep=keep, ignore_index=True)
)
# sort decoys by score column and keep top rank
decoys_sorted = (
decoys.sample(frac=1)
.sort_values([self._score_column] + group_cols)
.drop_duplicates(group_cols, keep=keep, ignore_index=True)
)
# combine top ranked target and decoy into one dataframe
combined = pd.concat([targets_sorted, decoys_sorted])
combined_sorted = combined.sample(frac=1).sort_values(
[self._score_column], ascending=~self._desc, ignore_index=True
)
# qvalues.py::calculate_mixmax_qval expects target scores
# and decoy scores to be sorted from worst to best
# qvalues.py::mixmax expected combined_sorted scores
# to be sorted from best to worst
if self._desc: # larger score is better
combined_sorted = combined_sorted[::-1]
else: # smaller score is better
targets_sorted[self._score_column] = (
targets_sorted[self._score_column] * -1.0
)
decoys_sorted[self._score_column] = (
decoys_sorted[self._score_column] * -1.0
)
targets_sorted = targets_sorted[::-1]
decoys_sorted = decoys_sorted[::-1]
# Now calculate q-values:
pi0, targets_sorted["crema q-value"] = qvalues.mixmax(
target_scores=targets_sorted[self._score_column],
decoy_scores=decoys_sorted[self._score_column],
combined_score=combined_sorted[self._score_column],
combined_score_target=combined_sorted[
self.dataset._target_column
],
)
LOGGER.info(" - Estimated pi_zero = %f.", pi0)
LOGGER.info(
" - Found %i %s at q<=%g.",
(targets_sorted["crema q-value"] <= self._eval_fdr).sum(),
self._level_labs[level],
self._eval_fdr,
)
# reverse rows so that best score is at top
targets_sorted = targets_sorted[::-1]
# undo previous multipliation by -1.0
if self._desc == False:
targets_sorted[self._score_column] = (
targets_sorted[self._score_column] * -1.0
)
self.confidence_estimates[level] = targets_sorted