Source code for crema.parsers.mztab

"""This module contains the parser for PSMs in mzTab format"""
import logging

import pandas as pd
from pyteomics.mztab import MzTab

from ..utils import listify
from ..dataset import PsmDataset

LOGGER = logging.getLogger(__name__)


def read_mztab(mztab_files):
    """Read peptide-spectrum matches (PSMs) from mzTab files.

    Parameters
    ----------
    mztab_files : str or tuple of str
        One or more collections of PSMs in the mzTab format.

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the PSMs
        from the mzTab file.

    Raises
    ------
    KeyError
        If the parsed PSM table lacks the spectrum, sequence, modification,
        target/decoy, or protein column.
    ValueError
        If no column name contains ``search_engine_score``.
    """
    mztab_files = listify(mztab_files)

    # Create a dataframe from the PSMs in the mzTab files.
    psms = pd.concat([_parse_psms(f) for f in mztab_files])

    # Initialize column names from mzTab standard specifications
    spectrum_col = ["spectra_ref"]
    score_col = [c for c in psms.columns if "search_engine_score" in c]
    target_col = "opt_global_cv_MS:1002217_decoy_peptide"
    sequence_col = "sequence"
    protein_col = "accession"  # TODO check if correct
    delim_col = ";"  # TODO check if correct
    mod_col = "modifications"

    # Check that all column headers are valid, otherwise, throw error
    if len(set(spectrum_col) & set(psms.columns)) < len(spectrum_col):
        raise KeyError(
            "The mzTab file does not contain the columns that define a "
            f"spectrum. These are: {', '.join(spectrum_col)}."
        )

    if sequence_col not in psms.columns or mod_col not in psms.columns:
        raise KeyError(
            "The mzTab file does not contain the columns to specify "
            "peptide sequence and modifications."
        )

    if target_col not in psms.columns:
        raise KeyError(
            "The mzTab file does not contain the column that specifies "
            f"whether a PSM is a target or decoy, {target_col}"
        )

    # Validate the protein column up front, like the other required columns,
    # instead of failing later with a bare KeyError from ``.loc``.
    if protein_col not in psms.columns:
        raise KeyError(
            "The mzTab file does not contain the column that specifies "
            f"the protein of a PSM, {protein_col}"
        )

    if not score_col:
        raise ValueError(
            "No columns containing search engine scores were detected. These "
            "start with 'search_engine_score*'."
        )

    # Create the necessary columns.
    # Concatenating a string with a missing value yields NaN in pandas, which
    # would erase the peptide label of every PSM whose modification entry is
    # missing. Substitute an empty string so those PSMs become "SEQUENCE[]".
    # NOTE(review): presumably unmodified peptides arrive here as missing
    # values (mzTab "null") -- confirm against the parser output.
    mods = psms[mod_col].fillna("")
    psms["peptide"] = psms[sequence_col] + "[" + mods + "]"

    # The decoy flag is truthy for decoys, so targets are its negation.
    psms["target"] = ~psms[target_col].astype(bool)

    # Keep only the relevant columns
    columns = spectrum_col + score_col + ["peptide", "target"] + [protein_col]
    psms = psms.loc[:, columns]

    return PsmDataset(
        psms=psms,
        target_column="target",
        spectrum_columns=spectrum_col,
        score_columns=score_col,
        peptide_column="peptide",
        protein_column=protein_col,
        protein_delim=delim_col,
        copy_data=False,
    )
def _parse_psms(mztab_file):
    """Load the PSM table of one mzTab file through Pyteomics.

    Parameters
    ----------
    mztab_file : str
        The mzTab file to read. It is converted with ``str()`` before being
        handed to the Pyteomics reader.

    Returns
    -------
    pandas.DataFrame
        A :py:class:`pandas.DataFrame` containing the parsed PSMs, taken
        from the reader's ``spectrum_match_table`` attribute.
    """
    LOGGER.info("Reading PSMs from %s...", mztab_file)
    reader = MzTab(str(mztab_file))
    return reader.spectrum_match_table