Source code for crema.parsers.txt

"""A parser for generic delmited text files."""
import logging

import pandas as pd
from ..dataset import PsmDataset
from .. import utils

LOGGER = logging.getLogger(__name__)


def read_txt(
    txt_files,
    target_column,
    spectrum_columns,
    score_columns,
    peptide_column,
    protein_column,
    protein_delim,
    sep="\t",
    pairing_file_name=None,
    copy_data=True,
):
    """Read peptide-spectrum matches (PSMs) from delimited text files.

    Parameters
    ----------
    txt_files : str, pandas.DataFrame, or tuple of str
        One or more collections of PSMs in a tabular text format.
    target_column : str
        The column that indicates whether a PSM is a target or a decoy.
    spectrum_columns : str or tuple of str
        One or more columns that together define a unique mass spectrum.
    score_columns : str or tuple of str
        One or more columns that indicate scores by which crema can rank
        PSMs.
    peptide_column : str
        The column that defines a unique peptide. Modifications should be
        indicated either in square brackets :code:`[]` or parentheses
        :code:`()`. The exact modification format within these entities
        does not matter, so long as it is consistent.
    protein_column : str
        The column that defines a unique protein.
    protein_delim : str
        The delimiter to separate protein IDs.
    sep : str, optional
        The delimiter to use.
    pairing_file_name : str, optional
        A tab-delimited file that explicitly pairs target and decoy peptide
        sequences. Requires one column labeled 'target' that contains target
        sequences and a second column labeled 'decoy' that contains decoy
        sequences. This file can be generated by setting --peptide-list=T
        in tide-index.
    copy_data : bool, optional
        If true, a deep copy of the data is created. This uses more memory,
        but is safer because it prevents accidental modification of the
        underlying data. This argument only has an effect when `txt_files`
        is a :py:class:`pandas.DataFrame`

    Returns
    -------
    PsmDataset
        A :py:class:`~crema.dataset.PsmDataset` object containing the
        parsed PSMs.
    """
    # Store column names in a list to be used by read_csv function
    fields = [target_column, peptide_column, protein_column]

    # Verify some arguments are lists:
    spectrum_columns = utils.listify(spectrum_columns)
    score_columns = utils.listify(score_columns)
    fields += spectrum_columns + score_columns

    # Parse the data
    if isinstance(txt_files, pd.DataFrame):
        data = txt_files.copy(deep=copy_data).loc[:, fields]
    else:
        data = pd.concat(
            [_parse_psms(f, sep, fields) for f in utils.listify(txt_files)]
        )

    # Normalize the target/decoy labels to booleans before building the
    # dataset.
    data[target_column] = _convert_target_col(data[target_column])

    psms = PsmDataset(
        psms=data,
        target_column=target_column,
        spectrum_columns=spectrum_columns,
        score_columns=score_columns,
        peptide_column=peptide_column,
        protein_column=protein_column,
        protein_delim=protein_delim,
        copy_data=False,
    )

    # Identity comparison is the correct test for the None sentinel.
    if pairing_file_name is not None:
        psms._peptide_pairing = utils.create_pairing_from_file(
            pairing_file_name
        )

    return psms
def _parse_psms(txt_file, sep, cols): """Parse a single delimited txt file. Parameters ---------- txt_file : str The Crux tab-delimited file to read. sep : str The delimiter to use. cols : list of str The columns to parse. Returns ------- pandas.DataFrame A :py:class:`pandas.DataFrame` containing the parsed PSMs """ LOGGER.info("Reading PSMs from %s...", txt_file) return pd.read_csv(txt_file, sep=sep, usecols=cols) def _convert_target_col(data): """Convert values in target column to boolean True/False. Parameters ---------- data : pandas.Series The target column before the target/decoy column has been converted to boolean. Returns ------- pandas.Series The target column after it has been converted to boolean. """ if data.dtype == bool: return data elif data.dtype == "object": targets = { "target": True, "t": True, "decoy": False, "d": False, "f": False, } return data.map(targets) elif len(data.unique()) > 2: raise ValueError( "The specificed target column appears to contain more than 2 " "values." ) return data > 0