Source code for rhapsody.features.EVmutation

# -*- coding: utf-8 -*-
"""This module defines a function for deriving coevolutionary features
from precomputed EVmutation scores."""

import numpy as np
from glob import glob
from os.path import splitext, join, basename
from prody import SETTINGS, LOGGER

__author__ = "Luca Ponzoni"
__date__ = "December 2019"
__maintainer__ = "Luca Ponzoni"
__email__ = ""
__status__ = "Production"

__all__ = ['EVMUT_FEATS', 'recoverEVmutFeatures']

# Field names of the structured array built by recoverEVmutFeatures. The order
# matters: the values are filled from columns 4..7 (0-based) of each matching
# ';'-separated CSV line, in this same order.
EVMUT_FEATS = ['EVmut-DeltaE_epist', 'EVmut-DeltaE_indep',
               'EVmut-mut_aa_freq', 'EVmut-wt_aa_cons']
"""List of features derived from EVmutation database of precomputed
coevolution-based scores."""

def recoverEVmutFeatures(SAVs):
    """Compute EVmutation features by fetching precomputed scores from the
    downloaded local folder. If multiple values are found for a given
    variant, the average will be taken.

    :arg SAVs: list of SAV coordinates, e.g. ``'P17516 135 G E'``.
    :type SAVs: list or tuple of strings
    :return: an array of EVmutation features for each SAV, with one field per
        entry of ``EVMUT_FEATS``; SAVs for which no score is found are left
        as NaN
    :rtype: NumPy structured array
    :raises RuntimeError: if the ``'EVmutation_local_folder'`` setting is not
        set, or if that folder contains no ``.csv`` files
    """
    LOGGER.timeit('_EVmut')
    LOGGER.info('Recovering EVmutation data...')

    # extracts precomputed EVmutation scores for given mutants
    # NB:
    # negative DeltaE_epist  -->  deleterious effect
    # DeltaE_epist == 0      -->  neutral effect (wild-type)
    # positive DeltaE_epist  -->  neutral/benign effect

    def find_matching_files(file_list, acc, pos):
        """Return the files whose name starts with *acc* and whose residue
        range (last '_'-separated token, formatted as 'first-last') contains
        *pos*."""
        matches = []
        for fname in file_list:
            if not fname.startswith(acc):
                continue
            # e.g. '<...>_<first>-<last>.csv' -> '<first>-<last>'
            stem = splitext(fname)[0]
            bounds = stem.split("_")[-1].split("-")
            res_i = int(bounds[0])
            res_f = int(bounds[1])
            if res_i <= pos <= res_f:
                matches.append(fname)
        return matches

    # one float32 field per feature, initialised to NaN ("no data")
    feat_dtype = np.dtype([(name, 'f') for name in EVMUT_FEATS])
    features = np.zeros(len(SAVs), dtype=feat_dtype)
    features[:] = np.nan

    # recover EVmutation data
    EVmut_dir = SETTINGS.get('EVmutation_local_folder')
    if EVmut_dir is None:
        raise RuntimeError('EVmutation folder not set')
    file_list = [basename(path) for path in glob(join(EVmut_dir, '*.csv'))]
    if not file_list:
        raise RuntimeError('EVmutation folder does not contain any .csv files')

    for i, SAV in enumerate(SAVs):
        acc, pos, wt_aa, mut_aa = SAV.split()
        pos = int(pos)

        # find files containing given SAV coordinates
        match_files = find_matching_files(file_list, acc, pos)

        # recover data and average them if multiple values are found
        mutant = f'{wt_aa}{pos}{mut_aa}'
        data = []
        for fname in match_files:
            with open(join(EVmut_dir, fname), 'r') as csv_file:
                for line in csv_file:
                    if line.startswith(mutant):
                        # columns 4..7 hold the four EVMUT_FEATS values
                        data.append(line.strip().split(';')[4:8])
                        # only the first matching line per file is used
                        break

        if not data:
            # no precomputed score for this SAV: keep the NaN placeholder
            continue
        scores = np.array(data, dtype=float)
        features[i] = tuple(np.mean(scores, axis=0))

    LOGGER.report('EVmutation scores recovered in %.1fs.', '_EVmut')
    return features
def calcEVmutPathClasses(EVmut_score):
    """Label EVmutation scores as ``'deleterious'``, ``'neutral'`` or ``'?'``.

    The threshold is the negated ``'optimal cutoff'`` entry of the
    ``'EVmutation_metrics'`` setting. Scores strictly below it are labelled
    ``'deleterious'``, all others ``'neutral'``; NaN scores are then
    relabelled ``'?'``.

    :arg EVmut_score: EVmutation score(s) to classify
        (presumably a NumPy float array -- confirm against callers)
    :return: class label for each score
    :rtype: NumPy array of strings
    """
    metrics = SETTINGS.get('EVmutation_metrics')
    threshold = -metrics['optimal cutoff']

    is_deleterious = EVmut_score < threshold
    labels = np.where(is_deleterious, 'deleterious', 'neutral')

    # NaN compares False above, so it must be overwritten explicitly
    missing = np.isnan(EVmut_score)
    labels[missing] = '?'
    return labels