"""
This module contains the parser for PSMs in pepXML format.
This code is heavily based on Will Fondrie's Mokapot pepxml parser code
"""
import logging
from lxml import etree
from functools import partial
import pandas as pd
import itertools
import re
from ..utils import listify
from ..dataset import PsmDataset
LOGGER = logging.getLogger(__name__)
[docs]def read_pepxml(pepxml_files, decoy_prefix):
"""Read peptide-spectrum matches (PSMs) from pepXML files.
Parameters
----------
pepxml_files : str or tuple of str
One or more collections of PSMs in the pepXML format.
decoy_prefix : str
The prefix used to indicate a decoy protein in the
description lines of the FASTA file.
Returns
-------
PsmDataset
A :py:class:`~crema.dataset.PsmDataset` object
containing the PSMs from the pepxml file.
"""
pepxml_files = listify(pepxml_files)
# Create a dataframe from the PSMs in the pepXML files.
psms = pd.concat([_parse_pepxml(f, decoy_prefix) for f in pepxml_files])
# Initialize column names from pepXML standard specifications
spectrum_col = ["ms_data_file", "scan"]
score_col = [c for c in psms.columns if "search_engine_score" in c]
target_col = "label"
sequence_col = "peptide"
protein_col = "proteins"
protein_delim = ","
# Check that all column headers are valid, otherwise, throw error
if len(set(spectrum_col) & set(psms.columns)) < len(spectrum_col):
raise KeyError(
"The pepXML file does not contain the columns that define a "
f"spectrum. These are: {', '.join(spectrum_col)}."
)
if sequence_col not in psms.columns:
raise KeyError(
"The pepXML file does not contain the columns to specify "
"peptide sequence."
)
if target_col not in psms.columns:
raise KeyError(
"The pepXML file does not contain the column that specifies "
f"whether a PSM is a target or decoy, {target_col}"
)
if not score_col:
raise ValueError(
"No columns containing search engine scores were detected. These "
"start with 'search_engine_score*'."
)
if not protein_col:
raise ValueError(
"The pepXML file does not contain the columns to specify "
"protein sequence."
)
# Remove "search_engine_score:" from column name and score_col
# and convert scores to float
psms.columns = psms.columns.str.replace("search_engine_score:", "")
score_col = [re.sub("search_engine_score:", "", c) for c in score_col]
psms[score_col] = psms[score_col].astype(float)
return PsmDataset(
psms=psms,
target_column=target_col,
spectrum_columns=spectrum_col,
score_columns=score_col,
peptide_column=sequence_col,
protein_column=protein_col,
protein_delim=protein_delim,
copy_data=False,
)
def _parse_pepxml(pepxml_file, decoy_prefix):
"""Parse a single pepXML file using lxml into a DataFrame
Parameters
----------
pepxml_file : str
The pepXML file to read.
decoy_prefix : str
The prefix used to indicate a decoy protein in the
description lines of the FASTA file.
Returns
-------
pandas.DataFrame
A :py:class:`pandas.DataFrame` containing the parsed PSMs.
"""
LOGGER.info("Reading PSMs from %s...", pepxml_file)
parser = etree.iterparse(str(pepxml_file), tag="{*}msms_run_summary")
parse_fun = partial(_parse_msms_run, decoy_prefix=decoy_prefix)
spectra = map(parse_fun, parser)
try:
psms = itertools.chain.from_iterable(spectra)
df = pd.DataFrame.from_records(itertools.chain.from_iterable(psms))
df["ms_data_file"] = df["ms_data_file"].astype("category")
except etree.XMLSyntaxError:
raise ValueError(
f"{pepxml_file} is not a PepXML file or is malformed."
)
return df
def _parse_msms_run(msms_run, decoy_prefix):
"""Parse a single MS/MS run.
Each of these corresponds to a raw MS data file.
Parameters
----------
msms_run: tuple of anything, lxml.etree.Element
The second element of the tuple should be the XML element for a single
msms_run. The first is not used, but is necessary for compatibility
with using :code:`map()`.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Yields
------
dict
A dictionary describing all of the PSMs in a run.
"""
msms_run = msms_run[1]
ms_data_file = msms_run.get("base_name")
run_ext = msms_run.get("raw_data")
if not ms_data_file.endswith(run_ext):
ms_data_file += run_ext
run_info = {"ms_data_file": ms_data_file}
for spectrum in msms_run.iter("{*}spectrum_query"):
yield _parse_spectrum(spectrum, run_info, decoy_prefix)
def _parse_spectrum(spectrum, run_info, decoy_prefix):
"""Parse the PSMs for a single mass spectrum
Parameters
----------
spectrum : lxml.etree.Element
The XML element for a single
run_info : dict
The parsed run data.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Yields
------
dict
A dictionary describing all of the PSMs for a spectrum.
"""
spec_info = run_info.copy()
spec_info["scan"] = int(spectrum.get("end_scan"))
for psms in spectrum.iter("{*}search_result"):
for psm in psms.iter("{*}search_hit"):
yield _parse_psm(psm, spec_info, decoy_prefix=decoy_prefix)
def _parse_psm(psm_info, spec_info, decoy_prefix):
"""Parse a single PSM
Parameters
----------
psm_info : lxml.etree.Element
The XML element containing information about the PSM.
spec_info : dict
The parsed spectrum data.
decoy_prefix : str
The prefix used to indicate a decoy protein in the description lines of
the FASTA file.
Returns
-------
dict
A dictionary containing parsed data about the PSM.
"""
psm = spec_info.copy()
psm["peptide"] = psm_info.get("peptide")
psm["proteins"] = [psm_info.get("protein").split(" ")[0]]
psm["label"] = not psm["proteins"][0].startswith(decoy_prefix)
queries = [
"{*}modification_info",
"{*}search_score",
"{*}alternative_protein",
]
for element in psm_info.iter(*queries):
if "modification_info" in element.tag:
offset = 0
mod_pep = psm["peptide"]
for mod in element.iter("{*}mod_aminoacid_mass"):
idx = offset + int(mod.get("position"))
mass = mod.get("mass")
mod_pep = mod_pep[:idx] + "[" + mass + "]" + mod_pep[idx:]
offset += 2 + len(mass)
psm["peptide"] = mod_pep
elif "alternative_protein" in element.tag:
psm["proteins"].append(element.get("protein").split(" ")[0])
if not psm["label"]:
psm["label"] = not psm["proteins"][-1].startswith(decoy_prefix)
else:
psm["search_engine_score:" + element.get("name")] = element.get(
"value"
)
psm["proteins"] = ",".join(psm["proteins"])
return psm