# Source code for crema.parsers.pepxml

"""
This module contains the parser for PSMs in pepXML format.
This code is heavily based on Will Fondrie's mokapot pepXML parser code.
"""
import logging
from lxml import etree
from functools import partial

import pandas as pd
import itertools
import re

from ..utils import listify
from ..dataset import PsmDataset

LOGGER = logging.getLogger(__name__)


def read_pepxml(pepxml_files, decoy_prefix):
    """Read peptide-spectrum matches (PSMs) from pepXML files.

    Parameters
    ----------
    pepxml_files : str or tuple of str
        One or more collections of PSMs in the pepXML format.
    decoy_prefix : str
        The prefix used to indicate a decoy protein in the description
        lines of the FASTA file.

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the PSMs
        from the pepxml file.

    Raises
    ------
    KeyError
        If the parsed PSMs lack the spectrum, peptide, or target/decoy
        label columns.
    ValueError
        If no search engine score columns are found, or if the protein
        column is missing.
    """
    pepxml_files = listify(pepxml_files)

    # Create a dataframe from the PSMs in the pepXML files.
    psms = pd.concat([_parse_pepxml(f, decoy_prefix) for f in pepxml_files])

    # Initialize column names from pepXML standard specifications
    score_prefix = "search_engine_score:"
    spectrum_col = ["ms_data_file", "scan"]
    score_col = [c for c in psms.columns if "search_engine_score" in c]
    target_col = "label"
    sequence_col = "peptide"
    protein_col = "proteins"
    protein_delim = ","

    # Check that all column headers are valid, otherwise, throw error
    if any(col not in psms.columns for col in spectrum_col):
        raise KeyError(
            "The pepXML file does not contain the columns that define a "
            f"spectrum. These are: {', '.join(spectrum_col)}."
        )

    if sequence_col not in psms.columns:
        raise KeyError(
            "The pepXML file does not contain the columns to specify "
            "peptide sequence."
        )

    if target_col not in psms.columns:
        raise KeyError(
            "The pepXML file does not contain the column that specifies "
            f"whether a PSM is a target or decoy, {target_col}"
        )

    if not score_col:
        raise ValueError(
            "No columns containing search engine scores were detected. These "
            "start with 'search_engine_score*'."
        )

    # The column name itself is a non-empty constant, so its truthiness says
    # nothing about the data; membership in the parsed columns is what matters.
    if protein_col not in psms.columns:
        raise ValueError(
            "The pepXML file does not contain the columns to specify "
            "protein sequence."
        )

    # Remove "search_engine_score:" from column name and score_col
    # and convert scores to float
    psms.columns = psms.columns.str.replace(score_prefix, "", regex=False)
    score_col = [c.replace(score_prefix, "") for c in score_col]
    psms[score_col] = psms[score_col].astype(float)

    return PsmDataset(
        psms=psms,
        target_column=target_col,
        spectrum_columns=spectrum_col,
        score_columns=score_col,
        peptide_column=sequence_col,
        protein_column=protein_col,
        protein_delim=protein_delim,
        copy_data=False,
    )


def _parse_pepxml(pepxml_file, decoy_prefix): """Parse a single pepXML file using lxml into a DataFrame Parameters ---------- pepxml_file : str The pepXML file to read. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Returns ------- pandas.DataFrame A :py:class:`pandas.DataFrame` containing the parsed PSMs. """ LOGGER.info("Reading PSMs from %s...", pepxml_file) parser = etree.iterparse(str(pepxml_file), tag="{*}msms_run_summary") parse_fun = partial(_parse_msms_run, decoy_prefix=decoy_prefix) spectra = map(parse_fun, parser) try: psms = itertools.chain.from_iterable(spectra) df = pd.DataFrame.from_records(itertools.chain.from_iterable(psms)) df["ms_data_file"] = df["ms_data_file"].astype("category") except etree.XMLSyntaxError: raise ValueError( f"{pepxml_file} is not a PepXML file or is malformed." ) return df def _parse_msms_run(msms_run, decoy_prefix): """Parse a single MS/MS run. Each of these corresponds to a raw MS data file. Parameters ---------- msms_run: tuple of anything, lxml.etree.Element The second element of the tuple should be the XML element for a single msms_run. The first is not used, but is necessary for compatibility with using :code:`map()`. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Yields ------ dict A dictionary describing all of the PSMs in a run. """ msms_run = msms_run[1] ms_data_file = msms_run.get("base_name") run_ext = msms_run.get("raw_data") if not ms_data_file.endswith(run_ext): ms_data_file += run_ext run_info = {"ms_data_file": ms_data_file} for spectrum in msms_run.iter("{*}spectrum_query"): yield _parse_spectrum(spectrum, run_info, decoy_prefix) def _parse_spectrum(spectrum, run_info, decoy_prefix): """Parse the PSMs for a single mass spectrum Parameters ---------- spectrum : lxml.etree.Element The XML element for a single run_info : dict The parsed run data. 
decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Yields ------ dict A dictionary describing all of the PSMs for a spectrum. """ spec_info = run_info.copy() spec_info["scan"] = int(spectrum.get("end_scan")) for psms in spectrum.iter("{*}search_result"): for psm in psms.iter("{*}search_hit"): yield _parse_psm(psm, spec_info, decoy_prefix=decoy_prefix) def _parse_psm(psm_info, spec_info, decoy_prefix): """Parse a single PSM Parameters ---------- psm_info : lxml.etree.Element The XML element containing information about the PSM. spec_info : dict The parsed spectrum data. decoy_prefix : str The prefix used to indicate a decoy protein in the description lines of the FASTA file. Returns ------- dict A dictionary containing parsed data about the PSM. """ psm = spec_info.copy() psm["peptide"] = psm_info.get("peptide") psm["proteins"] = [psm_info.get("protein").split(" ")[0]] psm["label"] = not psm["proteins"][0].startswith(decoy_prefix) queries = [ "{*}modification_info", "{*}search_score", "{*}alternative_protein", ] for element in psm_info.iter(*queries): if "modification_info" in element.tag: offset = 0 mod_pep = psm["peptide"] for mod in element.iter("{*}mod_aminoacid_mass"): idx = offset + int(mod.get("position")) mass = mod.get("mass") mod_pep = mod_pep[:idx] + "[" + mass + "]" + mod_pep[idx:] offset += 2 + len(mass) psm["peptide"] = mod_pep elif "alternative_protein" in element.tag: psm["proteins"].append(element.get("protein").split(" ")[0]) if not psm["label"]: psm["label"] = not psm["proteins"][-1].startswith(decoy_prefix) else: psm["search_engine_score:" + element.get("name")] = element.get( "value" ) psm["proteins"] = ",".join(psm["proteins"]) return psm