Source code for crema.parsers.msamanda

"""A parser for the MSAmanda tab-delimited format"""
import re
import logging

import pandas as pd

from .txt import read_txt
from .. import utils

# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)


def read_msamanda(
    txt_files, pairing_file_name=None, decoy_prefix="REV_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from MSAmanda tab-delimited files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more collections of PSMs in the MSAmanda tab-delimited format.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        Default value is 'REV_'. The value is matched as a regular
        expression pattern, both when labeling decoys and when stripping the
        prefix from the protein identifiers.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory,
        but is safer because it prevents accidental modification of the
        underlying data. This argument only has an effect when `txt_files`
        is a :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.

    Raises
    ------
    ValueError
        If none of the MSAmanda score columns is present in every input.
    """
    target = "target/decoy"
    peptide = "Sequence"
    spectrum = ["Filename", "Scan Number"]
    protein = "Protein Accessions"
    protein_delim = ";"

    # Possible score columns output by MSAmanda, in a fixed order so that the
    # resulting column order and error message are deterministic.
    score_candidates = ("Amanda Score", "Weighted Probability")

    # Keep only MSAmanda scores that exist in all of the inputs.
    shared_scores = set(score_candidates)
    # One flag per file: True when that file starts with a "#version" line
    # that must be skipped before the header row.
    skip_flags = []
    if isinstance(txt_files, pd.DataFrame):
        shared_scores &= set(txt_files.columns)
    else:
        txt_files = utils.listify(txt_files)
        for txt_file in txt_files:
            with open(txt_file) as txt_ref:
                # The first line of MSAmanda output is normally a version
                # line; tolerate files where it has been removed.
                line = txt_ref.readline().rstrip()
                has_version_line = line.startswith("#version")
                if has_version_line:
                    line = txt_ref.readline().rstrip()

            skip_flags.append(has_version_line)
            shared_scores &= set(line.split("\t"))

    if not shared_scores:
        raise ValueError(
            "Could not find any of the MSAmanda score columns in all of the "
            "files. The columns Crema looks for are "
            f"{', '.join(score_candidates)}"
        )

    scores = [col for col in score_candidates if col in shared_scores]

    # Read in the files. Only request columns that exist in the input: the
    # target/decoy column is derived below, so asking for it here would fail
    # when selecting columns from a DataFrame.
    fields = [*spectrum, peptide, *scores, protein]
    if isinstance(txt_files, pd.DataFrame):
        data = txt_files.copy(deep=copy_data).loc[:, fields]
    else:
        data = pd.concat(
            [
                utils.parse_psms_txt(txt_file, fields, skip_first_row)
                for txt_file, skip_first_row in zip(txt_files, skip_flags)
            ]
        )

    # A PSM is a target when its protein entry does not contain the prefix.
    data[target] = ~data[protein].str.contains(decoy_prefix)

    psms = read_txt(
        data,
        target_column=target,
        spectrum_columns=spectrum,
        score_columns=scores,
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        sep="\t",
        pairing_file_name=pairing_file_name,
        copy_data=False,
    )

    # Remove decoy prefix from protein ID
    protein_column = psms.proteins
    new_protein_column = protein_column.str.replace(
        decoy_prefix, "", regex=True
    )
    psms.set_protein_column(new_protein_column)
    return psms