Source code for crema.parsers.msfragger

"""A parser for the MSFragger tab-delimited format"""
import re
import logging

import pandas as pd

from .txt import read_txt
from .pepxml import read_pepxml
from .pepxml import _parse_pepxml
from ..dataset import PsmDataset
from .. import utils

LOGGER = logging.getLogger(__name__)


def read_msfragger(
    txt_files, pairing_file_name=None, decoy_prefix="rev_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from MSFragger search results.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more MSFragger result files, each of which is parsed as
        pepXML, or a :py:class:`pandas.DataFrame` of already-parsed PSMs.
        A DataFrame is used without further parsing, so it must already
        provide the 'label', 'peptide', 'ms_data_file', 'scan' and
        'proteins' columns, plus at least one score column named without
        the 'search_engine_score:' marker.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        Default value is 'rev_'. It is only forwarded to the pepXML parser,
        so it has no effect when `txt_files` is a
        :py:class:`pandas.DataFrame`.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory,
        but is safer because it prevents accidental modification of the
        underlying data. This argument only has an effect when `txt_files`
        is a :py:class:`pandas.DataFrame`; with ``copy_data=False`` the score
        columns of the caller's DataFrame are converted to float in place.

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.

    Raises
    ------
    ValueError
        If none of the MSFragger score columns is present in every input.
    """
    target = "label"
    peptide = "peptide"
    # TODO: MSFragger's pepXML and TSV outputs name their columns
    # differently. Only the pepXML names are used here; the TSV output may
    # need ["Filename", "start scan"] for the spectrum and "protein" for the
    # protein column instead (unverified).
    spectrum = ["ms_data_file", "scan"]
    protein = "proteins"
    protein_delim = ";"

    # Parsed pepXML columns containing this text are score fields.
    score_id = "search_engine_score:"

    # Possible score columns output by MSFragger.
    known_scores = {"hyperscore", "nextscore", "expect", "expectscore"}

    if isinstance(txt_files, pd.DataFrame):
        # Honor `copy_data` here: the dataset below is always built without
        # copying, so this is the only place a copy can be made.
        data = txt_files.copy(deep=True) if copy_data else txt_files
        scores = known_scores.intersection(data.columns)
        float_cols = sorted(scores)
    else:
        data_list = [
            _parse_pepxml(f, decoy_prefix) for f in utils.listify(txt_files)
        ]

        scores = set(known_scores)
        # Every score field seen in any file, in first-seen order, so the
        # float conversion does not depend on which file happens to be last.
        float_cols = []
        for data_file in data_list:
            file_scores = [
                c.replace(score_id, "")
                for c in data_file.columns
                if score_id in c
            ]
            # Only keep scores that are available in every file.
            scores &= set(file_scores)
            float_cols.extend(c for c in file_scores if c not in float_cols)

        data = pd.concat(data_list)
        # The marker is a literal string, not a pattern.
        data.columns = data.columns.str.replace(score_id, "", regex=False)

    if not scores:
        raise ValueError(
            "Could not find any of the MSFragger score columns in all of the "
            "files. "
            f"The columns crema looks for are {', '.join(sorted(known_scores))}"
        )

    # Scores may have been parsed as text; make them numeric.
    data[float_cols] = data[float_cols].astype(float)

    psms = PsmDataset(
        psms=data,
        target_column=target,
        spectrum_columns=spectrum,
        # Sorted so the column order is reproducible between runs.
        score_columns=sorted(scores),
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        copy_data=False,
    )

    if pairing_file_name is not None:
        psms._peptide_pairing = utils.create_pairing_from_file(
            pairing_file_name
        )

    return psms