# Source code for crema.parsers.msgf

"""A parser for the MSGF+ tab-delimited format"""
import re
import logging

import pandas as pd

from .txt import read_txt
from .. import utils

# Module-level logger, named after this module via ``__name__``.
LOGGER = logging.getLogger(__name__)


def read_msgf(
    txt_files, pairing_file_name=None, decoy_prefix="XXX_", copy_data=True
):
    """Read peptide-spectrum matches (PSMs) from MSGF+ tab-delimited files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame or tuple of str
        One or more collections of PSMs in the MSGF+ tab-delimited format.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences.
    decoy_prefix : str, optional
        The prefix used to indicate a decoy protein in the protein column.
        It is matched literally (not as a regular expression).
        Default value is 'XXX_'.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory, but
        is safer because it prevents accidental modification of the underlying
        data. This argument only has an effect when `txt_files` is a
        :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the parsed
        PSMs.

    Raises
    ------
    ValueError
        If none of the MSGF+ score columns is present in every input.
    """
    target = "target/decoy"
    peptide = "Peptide"
    spectrum = ["#SpecFile", "ScanNum"]
    pairing = ""
    protein = "Protein"
    # TODO need to test case where protein are in diff row
    protein_delim = ";"

    # Possible score columns output by MSGF+.
    known_scores = frozenset(
        {
            "DeNovoScore",
            "MSGFScore",
            "SpecEValue",
            "EValue",
        }
    )
    scores = set(known_scores)

    # Keep only MSGF+ scores that exist in all of the files.
    if isinstance(txt_files, pd.DataFrame):
        scores &= set(txt_files.columns)
    else:
        txt_files = utils.listify(txt_files)
        for txt_file in txt_files:
            # Only the header line is needed to learn the column names.
            with open(txt_file) as txt_ref:
                cols = txt_ref.readline().rstrip().split("\t")
            scores &= set(cols)

    if not scores:
        raise ValueError(
            "Could not find any of the MSGF+ score columns in all of the "
            "files. The columns Crema looks for are "
            f"{', '.join(sorted(known_scores))}"
        )

    # Sort so the score-column order does not depend on set iteration order.
    scores = sorted(scores)

    # Read in the files:
    fields = [*spectrum, peptide, target, *scores, pairing, protein]
    if isinstance(txt_files, pd.DataFrame):
        # The target column is computed below and the pairing name is an
        # empty placeholder, so neither is expected in the input frame.
        # Requesting a missing label through ``.loc`` raises ``KeyError``;
        # select those two only when they are actually present.
        optional = {target, pairing}
        keep = [
            col
            for col in fields
            if col not in optional or col in txt_files.columns
        ]
        data = txt_files.loc[:, keep].copy(deep=copy_data)
    else:
        # The full field list is handed to the file parser unchanged.
        data = pd.concat(
            [utils.parse_psms_txt(f, fields, False) for f in txt_files]
        )

    # A PSM is a target when its protein entry does not contain the decoy
    # prefix. The prefix is matched literally so that regex metacharacters
    # in it (e.g. "." or "|") cannot change the result.
    data[target] = ~data[protein].str.contains(decoy_prefix, regex=False)

    psms = read_txt(
        data,
        target_column=target,
        spectrum_columns=spectrum,
        score_columns=scores,
        peptide_column=peptide,
        protein_column=protein,
        protein_delim=protein_delim,
        sep="\t",
        pairing_file_name=pairing_file_name,
        copy_data=False,
    )

    # Remove pre/post from protein ID
    # This looks like "sp|P0AC43|SDHA_ECO57(pre=R,post=G)"
    # Remove decoy prefix from protein ID
    protein_column = psms.proteins
    new_protein_column = protein_column.str.replace(
        r"\([^()]*\)", "", regex=True
    )
    new_protein_column = new_protein_column.str.replace(
        decoy_prefix, "", regex=False
    )
    psms.set_protein_column(new_protein_column)

    return psms