# Source code for crema.parsers.tide

"""A parser for the Tide tab-delimited format"""
import re
import logging

import pandas as pd

from .txt import read_txt
from .. import utils

LOGGER = logging.getLogger(__name__)


def read_tide(
    txt_files, pairing_file_name=None, decoy_prefix="decoy_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from Tide tab-delimited files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more collection of PSMs in the Tide tab-delimited format.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences. This file can be generated by setting --peptide-list=T
        in tide-index. When omitted, targets and decoys are paired
        implicitly from the PSMs themselves.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        It is matched as literal text (not as a regular expression) and
        removed from the protein identifiers. Default value is 'decoy_'.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory, but
        is safer because it prevents accidental modification of the underlying
        data. This argument only has an effect when `txt_files` is a
        :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.

    Raises
    ------
    ValueError
        If none of the known Tide score columns is present in every input.
    """
    target = "target/decoy"
    peptide = "sequence"
    spectrum = ["file", "scan"]
    pairing = "original target sequence"
    protein = "protein id"
    protein_delim = ","

    # Possible score columns output by Tide. Kept as an ordered tuple so the
    # score columns (and the error message below) come out in the same order
    # on every run instead of following set iteration order.
    known_scores = (
        "sp score",
        "delta_cn",
        "delta_lcn",
        "xcorr score",
        "exact p-value",
        "refactored xcorr",
        "res-ev p-value",
        "combined p-value",
        "tailor score",
    )

    # Keep only Tide scores that exist in all of the files.
    if isinstance(txt_files, pd.DataFrame):
        available = set(txt_files.columns)
    else:
        txt_files = utils.listify(txt_files)
        available = set(known_scores)
        for txt_file in txt_files:
            # Only the header line is needed to learn the column names.
            with open(txt_file) as txt_ref:
                cols = txt_ref.readline().rstrip().split("\t")

            available.intersection_update(cols)

    scores = [score for score in known_scores if score in available]
    if not scores:
        raise ValueError(
            "Could not find any of the Tide score columns in all of the files."
            f" The columns Crema looks for are {', '.join(known_scores)}"
        )

    # Read in the files:
    fields = [*spectrum, peptide, target, *scores, pairing, protein]
    if isinstance(txt_files, pd.DataFrame):
        data = txt_files.copy(deep=copy_data).loc[:, fields]
    else:
        data = pd.concat(
            [utils.parse_psms_txt(f, fields, False) for f in txt_files]
        )

    psms = read_txt(
        data,
        target_column=target,
        spectrum_columns=spectrum,
        score_columns=scores,
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        sep="\t",
        pairing_file_name=pairing_file_name,
        copy_data=False,
    )

    # always pair target and decoys for Tide
    # explicit pairing done in read_txt
    if pairing_file_name is None:  # implicit pairing
        psms._peptide_pairing = _create_pairing(data)

    # Remove the start position of peptide in protein if present
    # This looks like "protName(XX)"
    proteins = psms.proteins.str.replace(r"\([^()]*\)", "", regex=True)

    # Remove decoy prefix from protein ID. The prefix is user-supplied text,
    # so it is matched literally; characters such as "." or "|" must not be
    # interpreted as regular expression syntax.
    proteins = proteins.str.replace(decoy_prefix, "", regex=False)
    psms.set_protein_column(proteins)

    return psms


def _create_pairing(pairing_data):
    """Parse a single Tide dataframe to implicitly pair target and
    decoy sequences.

    A target and a decoy are paired when the decoy's
    "original target sequence" equals the target's sequence with its
    bracketed modifications removed, and both peptides consist of the same
    collection of (possibly modified) residues. When several peptides share
    such a key they are matched up in sorted order of their sequences.

    Parameters
    ----------
    pairing_data : pandas.DataFrame
        A collection of PSMs with the necessary columns to create a
        target/decoy peptide pairing. Required columns are "sequence",
        "target/decoy", and "original target sequence". The dataframe
        itself is not modified.

    Returns
    -------
    pairing : dict
        A map of target and decoy peptide sequence pairings. Targets with
        missing decoys will not be included among the keys.

    Raises
    ------
    ValueError
        If any of the required columns is missing from `pairing_data`.
    """
    # ensure pairing_data dataframe contains all necessary columns
    seq = "original target sequence"
    req_fields = ["sequence", "target/decoy", seq]

    # Report missing columns in a fixed order so the message is reproducible.
    missing = [col for col in req_fields if col not in pairing_data.columns]
    if missing:
        miss = ", ".join(missing)
        raise ValueError(
            f"Required columns for peptide pairing were not detected: {miss}"
        )

    # Keep one row per peptide sequence. The rows are shuffled first (no seed
    # is passed), so which duplicate row survives is chosen at random. The old
    # index is discarded: it is not needed for the pairing.
    pairing_data = (
        pairing_data.loc[:, req_fields]
        .sample(frac=1)
        .drop_duplicates(["sequence"])
        .reset_index(drop=True)
    )

    # Add a column of the sorted peptide: split before every uppercase letter
    # so each residue keeps its trailing modification (e.g. "M[15.99]"), then
    # sort the pieces to get an order-independent composition key.
    residue_start = re.compile(r"(?=[A-Z])")
    pairing_data["mods"] = pairing_data["sequence"].apply(
        lambda x: "".join(sorted(residue_start.split(x)))
    )

    # Split targets and decoys:
    is_decoy = pairing_data["target/decoy"] == "decoy"
    pairing_data = pairing_data.drop("target/decoy", axis=1)
    targets = pairing_data.loc[~is_decoy, :].copy()
    decoys = pairing_data.loc[is_decoy, :].copy()

    # Strip the target sequence modifications, so the value is comparable to
    # the "original target sequence" column of the decoys:
    targets[seq] = targets["sequence"].str.replace(r"\[.*?\]", "", regex=True)

    # Add an 'ord' column to disambiguate multiple matches per peptide:
    group_keys = [seq, "mods"]
    targets["ord"] = targets.groupby(group_keys)["sequence"].rank(
        method="first"
    )
    decoys["ord"] = decoys.groupby(group_keys)["sequence"].rank(
        method="first"
    )

    # Inner join the DataFrames to induce a pairing.
    # Targets with a missing decoy will be dropped.
    # Decoys with a missing target will be dropped.
    merged = pd.merge(
        targets,
        decoys,
        how="inner",
        on=[seq, "mods", "ord"],
        suffixes=("_t", "_d"),
    )

    return merged.set_index("sequence_t").loc[:, "sequence_d"].to_dict()