Source code for crema.parsers.msfragger

"""A parser for the MSFragger tab-delimited format"""
import re
import logging

import pandas as pd

from .txt import read_txt
from .pepxml import read_pepxml
from .pepxml import _parse_pepxml
from ..dataset import PsmDataset
from .. import utils

LOGGER = logging.getLogger(__name__)


[docs]def read_msfragger(
    txt_files, pairing_file_name=None, decoy_prefix="rev_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from MSFragger pepXML files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more collection of PSMs in the MSFragger tab-delimited format.
    pairing_file_name : str, optional
        A tab-delimited file that explicity pairs target and decoy peptide
        sequences. Requires one column labled 'target' that contains target
        sequences and a second colun labeled 'decoy' that contains decoy
        sequences.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        Default value is 'rev_'.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory, but
        is safer because it prevents accidental modification of the underlying
        data. This argument only has an effect when `txt_files` is a
        :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.
    """
    target = "label"
    peptide = "peptide"
    # TODO well annoying the column names for pepXML and tsv output are different
    # spectrum = ["Filename", "start scan"] # check this - is this for TSV?
    spectrum = ["ms_data_file", "scan"]
    pairing = ""
    # protein = "protein" # check for TSV
    protein = "proteins"
    protein_delim = ";"

    # The text below in any pepXML field identifies the field as a score field
    score_id = "search_engine_score:"

    # Possible score columns output by MSFragger.
    scores = {
        "hyperscore",
        "nextscore",
        "expect",
        "expectscore",
    }
    scores_all = scores

    # Read in the files:
    if isinstance(txt_files, pd.DataFrame):
        scores = scores.intersection(set(txt_files.columns))
    else:
        txt_files = utils.listify(txt_files)
        data_list = [_parse_pepxml(f, decoy_prefix) for f in txt_files]

        for data_file in data_list:
            score_col = [c for c in data_file.columns if score_id in c]
            score_col = [re.sub(score_id, "", c) for c in score_col]
            scores = scores.intersection(set(score_col))

        data = pd.concat(data_list)
        data.columns = data.columns.str.replace(score_id, "")

    if not scores:
        raise ValueError(
            "Could not find any of the MSFragger score columns in all of the "
            "files."
            f"The columns crema looks for are {', '.join(list(scores_all))}"
        )

    scores = list(scores)
    data[score_col] = data[score_col].astype(float)

    psms = PsmDataset(
        psms=data,
        target_column=target,
        spectrum_columns=spectrum,
        score_columns=scores,
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        copy_data=False,
    )

    if pairing_file_name != None:
        psms._peptide_pairing = utils.create_pairing_from_file(
            pairing_file_name
        )

    return psms