Source code for crema.parsers.msamanda

"""A parser for the MSAmanda tab-delimited format"""
import re
import logging

import pandas as pd

from .txt import read_txt
from .. import utils

LOGGER = logging.getLogger(__name__)


def read_msamanda(
    txt_files, pairing_file_name=None, decoy_prefix="REV_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from MSAmanda tab-delimited files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more collections of PSMs in the MSAmanda tab-delimited format.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        Default value is 'REV_'. The value is applied as a regular-expression
        pattern and is matched anywhere in the protein column, so escape any
        regex metacharacters it contains.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory, but
        is safer because it prevents accidental modification of the underlying
        data. This argument only has an effect when `txt_files` is a
        :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.

    Raises
    ------
    ValueError
        If none of the MSAmanda score columns is present in every input.
    """
    target = "target/decoy"
    peptide = "Sequence"
    spectrum = ["Filename", "Scan Number"]
    protein = "Protein Accessions"
    protein_delim = ";"

    # Possible score columns output by MSAmanda.
    scores_all = {"Amanda Score", "Weighted Probability"}
    scores = set(scores_all)

    # Keep only MSAmanda scores that exist in all of the files.
    is_frame = isinstance(txt_files, pd.DataFrame)
    # One flag per file: whether its first line is the "#version" line that
    # precedes the header. Tracking this per file keeps a file without the
    # version line from losing its header when another file has one.
    skip_flags = []
    if is_frame:
        scores &= set(txt_files.columns)
    else:
        txt_files = utils.listify(txt_files)
        for txt_file in txt_files:
            with open(txt_file) as txt_ref:
                line = txt_ref.readline().rstrip()
                has_version = line.startswith("#version")
                if has_version:
                    # The real header is on the second line.
                    line = txt_ref.readline().rstrip()
            skip_flags.append(has_version)
            scores &= set(line.split("\t"))

    if not scores:
        raise ValueError(
            "Could not find any of the MSAmanda score columns in all of the "
            "files. The columns Crema looks for are "
            f"{', '.join(sorted(scores_all))}"
        )

    # Sorted so the score column order does not depend on set iteration order.
    scores = sorted(scores)

    # Read in the files:
    fields = [*spectrum, peptide, target, *scores, protein]
    if is_frame:
        # The target column is computed below, so it is optional in the input;
        # every other field must exist (a missing one raises KeyError here).
        columns = [
            col
            for col in fields
            if col != target or col in txt_files.columns
        ]
        data = txt_files.copy(deep=copy_data).loc[:, columns]
    else:
        data = pd.concat(
            [
                utils.parse_psms_txt(txt_file, fields, skip_row)
                for txt_file, skip_row in zip(txt_files, skip_flags)
            ]
        )

    # A PSM is a target when its protein column does not contain the decoy
    # prefix. Any pre-existing target column is overwritten.
    data[target] = ~data[protein].str.contains(decoy_prefix)

    psms = read_txt(
        data,
        target_column=target,
        spectrum_columns=spectrum,
        score_columns=scores,
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        sep="\t",
        pairing_file_name=pairing_file_name,
        copy_data=False,
    )

    # Remove decoy prefix from protein ID
    protein_column = psms.proteins
    new_protein_column = protein_column.str.replace(
        decoy_prefix, "", regex=True
    )
    psms.set_protein_column(new_protein_column)

    return psms