"""A parser for the Tide tab-delimited format"""
import re
import logging
import pandas as pd
from .txt import read_txt
from .. import utils
LOGGER = logging.getLogger(__name__)
def read_tide(
    txt_files, pairing_file_name=None, decoy_prefix="decoy_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from Tide tab-delimited files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more collections of PSMs in the Tide tab-delimited format.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences. This file can be generated by setting --peptide-list=T
        in tide-index. When omitted, targets and decoys are paired
        implicitly from the PSM columns themselves.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        It is matched as literal text, not as a regular expression.
        Default value is 'decoy_'.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory, but
        is safer because it prevents accidental modification of the underlying
        data. This argument only has an effect when `txt_files` is a
        :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.

    Raises
    ------
    ValueError
        If none of the known Tide score columns is present in every input.
    """
    target = "target/decoy"
    peptide = "sequence"
    spectrum = ["file", "scan"]
    pairing = "original target sequence"
    protein = "protein id"
    protein_delim = ","

    # Possible score columns output by Tide. Kept as an ordered tuple so the
    # selected score columns (and the error message below) come out in the
    # same order on every run instead of depending on set iteration order.
    scores_all = (
        "sp score",
        "delta_cn",
        "delta_lcn",
        "xcorr score",
        "exact p-value",
        "refactored xcorr",
        "res-ev p-value",
        "combined p-value",
        "tailor score",
    )

    # Keep only Tide scores that exist in all of the files.
    if isinstance(txt_files, pd.DataFrame):
        shared = set(txt_files.columns)
    else:
        txt_files = utils.listify(txt_files)
        shared = set(scores_all)
        for txt_file in txt_files:
            # Only the header line is needed to learn the column names.
            with open(txt_file) as txt_ref:
                cols = txt_ref.readline().rstrip().split("\t")

            shared.intersection_update(cols)

    scores = [score for score in scores_all if score in shared]
    if not scores:
        raise ValueError(
            "Could not find any of the Tide score columns in all of the files."
            f" The columns Crema looks for are {', '.join(scores_all)}"
        )

    # Read in the files:
    fields = [*spectrum, peptide, target, *scores, pairing, protein]
    if isinstance(txt_files, pd.DataFrame):
        data = txt_files.copy(deep=copy_data).loc[:, fields]
    else:
        data = pd.concat(
            [utils.parse_psms_txt(f, fields, False) for f in txt_files]
        )

    # `data` is already a private frame at this point, so read_txt does not
    # need to copy it again.
    psms = read_txt(
        data,
        target_column=target,
        spectrum_columns=spectrum,
        score_columns=scores,
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        sep="\t",
        pairing_file_name=pairing_file_name,
        copy_data=False,
    )

    # always pair target and decoys for Tide
    # explicit pairing done in read_txt
    if pairing_file_name is None:  # implicit pairing
        psms._peptide_pairing = _create_pairing(data)

    # Remove the start position of peptide in protein if present
    # This looks like "protName(XX)"
    protein_column = psms.proteins
    new_protein_column = protein_column.str.replace(
        r"\([^()]*\)", "", regex=True
    )

    # Remove decoy prefix from protein ID. The prefix is user-supplied text,
    # so it is replaced literally; treating it as a regex would misbehave for
    # prefixes containing characters such as "." or "|".
    new_protein_column = new_protein_column.str.replace(
        decoy_prefix, "", regex=False
    )
    psms.set_protein_column(new_protein_column)
    return psms
def _create_pairing(pairing_data):
"""Parse a single Tide dataframe to implicity pair target and
decoy sequences.
Parameters
----------
pairing_data : pandas.DataFrame
A collection of PSMs with the necessary columns to create a
target/decoy peptide pairing. Required columns are "peptide mass",
"sequence", "target/decoy", "original target sequence"
Returns
-------
pairing : dict
A map of target and decoy peptide sequence pairings. Targets with
missing decoys will not be included among the keys.
"""
# ensure pairing_data dataframe contains all necessary columns
seq = "original target sequence"
req_fields = [
"sequence",
"target/decoy",
"original target sequence",
]
if not set(req_fields).issubset(pairing_data.columns):
miss = ", ".join(set(req_fields) - set(pairing_data.columns))
raise ValueError(
f"Required columns for peptide pairing were not detected: {miss}"
)
pairing_data = pairing_data.loc[:, req_fields]
pairing_data = (
pairing_data.sample(frac=1)
.drop_duplicates(["sequence"])
.reset_index(drop=False)
)
# Add a column of the sorted peptide:
pairing_data["mods"] = (
pairing_data["sequence"]
.str.split("(?=[A-Z])")
.apply(lambda x: "".join(sorted(x)))
)
# Split targets and decoys:
is_decoy = pairing_data["target/decoy"] == "decoy"
pairing_data = pairing_data.drop("target/decoy", axis=1)
targets = pairing_data.loc[~is_decoy, :].copy()
decoys = pairing_data.loc[is_decoy, :].copy()
# Strip the target sequence modifications:
targets[seq] = targets["sequence"].str.replace(r"\[.*?\]", "", regex=True)
# Add an 'ord' column to disambiguate multiple matches per peptide:
targets["ord"] = targets.groupby([seq, "mods"])["sequence"].rank("first")
decoys["ord"] = decoys.groupby([seq, "mods"])["sequence"].rank("first")
# Inner join the DataFrames to induce a pairing.
# Targets with a missing decoy will be dropped.
# Decoys with a missing target will be dropped.
merged = pd.merge(
targets,
decoys,
how="inner",
on=[seq, "mods", "ord"],
suffixes=["_t", "_d"],
)
return merged.set_index("sequence_t").loc[:, "sequence_d"].to_dict()