Source code for rhapsody.utils.misc

# -*- coding: utf-8 -*-
"""This module defines default configuration parameters and
a function for the initial setup and training of Rhapsody."""

import os
import tarfile
import pickle
import urllib.request
import shutil
import sklearn
import numpy as np
import prody as pd
import rhapsody as rd

__author__ = "Luca Ponzoni"
__date__ = "December 2019"
__maintainer__ = "Luca Ponzoni"
__email__ = "lponzoni@pitt.edu"
__status__ = "Production"

__all__ = ['DEFAULT_FEATSETS', 'initialSetup',
           'getDefaultTrainingDataset', 'getDefaultClassifiers',
           'importDefaultClassifier', 'delSettings', 'getSettings']

USERHOME = os.getenv('USERPROFILE') or os.getenv('HOME') or './'
DEFAULT_WORKING_DIR = os.path.join(USERHOME, 'rhapsody')
DEFAULT_EVMUT_DIR = os.path.join(DEFAULT_WORKING_DIR,
                                 'EVmutation_mutation_effects')
EVMUT_URL = 'https://marks.hms.harvard.edu/evmutation/data/effects.tar.gz'
PACKAGE_DATA = os.path.join(rd.__path__[0], 'data.tar.gz')
TRAINING_DATASET = 'precomputed_features-ID_opt.npy'
DEFAULT_CLSF_DIR = f'default_classifiers-sklearn_v{sklearn.__version__}'
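# NOTE: DEFAULT_CLSF_DIR embeds the installed scikit-learn version, so
# classifiers trained under a different scikit-learn release live in a
# separate folder and initialSetup() will train a fresh set when needed.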
DEFAULT_FEATSETS = {
  'full':    ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain',
              'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
              'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM'],
  'reduced': ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain',
              'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
              'stiffness-chain', 'BLOSUM'],
  'EVmut':   ['wt_PSIC', 'Delta_PSIC', 'SASA', 'ANM_MSF-chain',
              'ANM_effectiveness-chain', 'ANM_sensitivity-chain',
              'stiffness-chain', 'entropy', 'ranked_MI', 'BLOSUM',
              'EVmut-DeltaE_epist'],
}
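
# Illustrative sketch (not executed on import): each feature set above lists
# the columns used to train the corresponding default classifier. A single
# training run could be reproduced interactively with:
#
#   >>> ds = getDefaultTrainingDataset()
#   >>> fields = ['SAV_coords', 'true_label'] + DEFAULT_FEATSETS['reduced']
#   >>> rd.trainRFclassifier(ds[fields])   # same call used by initialSetup()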


def initialSetup(working_dir=None, refresh=False, download_EVmutation=True):
    """Function to be run right after installation to set up the environment
    and main parameters and to train the default classifiers.

    By default, a working directory will be created in the user home
    directory (:file:`~/rhapsody/`). Previous configuration data will be
    recovered. Additional data from the EVmutation website will be
    automatically downloaded (~1.4 GB).

    :arg working_dir: path to a local folder
    :type working_dir: str

    :arg refresh: if **True**, previously trained classifiers, if found,
        will be deleted
    :type refresh: bool

    :arg download_EVmutation: if **True**, precomputed EVmutation scores
        will be downloaded (recommended)
    :type download_EVmutation: bool
    """
    pd.LOGGER.info(f'You are running Rhapsody v{rd.__version__}')

    # set working directory
    if working_dir is None:
        # check pre-existing configuration
        old_dir = pd.SETTINGS.get('rhapsody_local_folder')
        if type(old_dir) is str and os.path.isdir(old_dir):
            working_dir = old_dir
            pd.LOGGER.info('Pre-existing working directory detected: '
                           f'{working_dir}')
        else:
            # use default location and create folder if needed
            working_dir = DEFAULT_WORKING_DIR
            if os.path.isdir(working_dir):
                raise EnvironmentError(
                    f"A folder named '{working_dir}' already exists. "
                    "Please specify another name.")
            else:
                os.mkdir(working_dir)
                pd.LOGGER.info(f'Default working directory set: {working_dir}')
    else:
        working_dir = os.path.abspath(working_dir)
        if os.path.isdir(working_dir):
            pd.LOGGER.info(f'Working directory set: {working_dir}')
        else:
            raise EnvironmentError(f'Invalid working directory: {working_dir}')
    pd.SETTINGS['rhapsody_local_folder'] = working_dir

    # create pickles folder
    folder = os.path.join(working_dir, 'pickles')
    if not os.path.isdir(folder):
        os.mkdir(folder)

    # check for pre-existing folder containing trained classifiers
    folder = os.path.join(working_dir, DEFAULT_CLSF_DIR)
    training_dataset = None
    if os.path.isdir(folder) and not refresh:
        pd.LOGGER.info(f'Pre-existing classifiers found: {folder}')
        # check for missing classifiers
        for featset in DEFAULT_FEATSETS:
            fname = os.path.join(folder, featset, 'trained_classifier.pkl')
            if not os.path.isfile(fname):
                raise IOError(f"Missing classifier: '{featset}'. Please "
                              f'delete folder {folder} and rerun setup.')
    else:
        # delete old classifiers and train new ones
        if os.path.isdir(folder):
            shutil.rmtree(folder)
        os.mkdir(folder)
        pd.LOGGER.info(f'Classifiers folder created: {folder}')
        # also delete EVmutation metrics, which must be updated
        pd.SETTINGS.pop('EVmutation_metrics')
        # import training dataset included with package
        training_dataset = getDefaultTrainingDataset()
        info = {
            'size': len(training_dataset),
            'fields': training_dataset.dtype.names
        }
        pd.SETTINGS['rhapsody_training_dataset'] = info
        # train new default classifiers
        pd.LOGGER.info('')
        for name, featset in DEFAULT_FEATSETS.items():
            clsf_folder = os.path.join(folder, name)
            os.mkdir(clsf_folder)
            logfile = os.path.join(clsf_folder, 'RF_training.log')
            # run training procedure
            pd.LOGGER.info(f'Training {name} classifier...')
            pd.LOGGER.start(logfile)
            fields = ['SAV_coords', 'true_label'] + featset
            rd.trainRFclassifier(training_dataset[fields])
            # move trained classifier and figures into folder
            output_files = ['predictions_distribution.png',
                            'pathogenicity_prob.png',
                            'ROC.png',
                            'feat_importances.png',
                            'trained_classifier.pkl']
            for file in output_files:
                shutil.move(file, clsf_folder)
            pd.LOGGER.close(logfile)
            pd.LOGGER.info('')

    # check EVmutation metrics
    metrics = pd.SETTINGS.get('EVmutation_metrics', default={})
    if 'AUROC' in metrics:
        pd.LOGGER.info('Pre-existing EVmutation metrics found.')
    else:
        # compute EVmutation metrics from included training dataset
        if training_dataset is None:
            training_dataset = getDefaultTrainingDataset()
        if 'EVmut-DeltaE_epist' not in training_dataset.dtype.names:
            pd.SETTINGS['EVmutation_metrics'] = {}
            pd.LOGGER.warn('Unable to compute EVmutation metrics: '
                           'precomputed scores not found.')
        else:
            sel = ~np.isnan(training_dataset['EVmut-DeltaE_epist'])
            # NB: EVmutation score and pathogenicity are anti-correlated
            true_labels = training_dataset['true_label'][sel]
            EVmut_predictor = -training_dataset['EVmut-DeltaE_epist'][sel]
            metrics = rd.calcScoreMetrics(true_labels, EVmut_predictor)
            pd.SETTINGS['EVmutation_metrics'] = metrics
            pd.LOGGER.info('EVmutation metrics computed.')

    # fetch EVmutation precomputed data, if needed
    folder = pd.SETTINGS.get('EVmutation_local_folder')
    if type(folder) is str and os.path.isdir(folder):
        pd.LOGGER.info(f'EVmutation folder found: {folder}')
    else:
        folder = DEFAULT_EVMUT_DIR
        if os.path.isdir(DEFAULT_EVMUT_DIR):
            pd.LOGGER.info(f'EVmutation folder found: {folder}')
        elif download_EVmutation:
            pd.LOGGER.info('Downloading EVmutation data...')
            # download tar.gz file and save it locally
            tgz = os.path.join(working_dir, 'effects.tar.gz')
            with urllib.request.urlopen(EVMUT_URL) as r, open(tgz, 'wb') as f:
                shutil.copyfileobj(r, f)
            # extract archive
            tar = tarfile.open(tgz, "r:gz")
            tar.extractall(path=folder)
            tar.close()
            os.remove(tgz)
            pd.LOGGER.info(f'EVmutation folder set: {folder}')
        else:
            folder = None
            msg = ('For full functionality, please consider downloading '
                   f'EVmutation data from {EVMUT_URL} and then set the '
                   'path in the configuration file.')
            pd.LOGGER.warn(msg)
    pd.SETTINGS['EVmutation_local_folder'] = folder

    # check if DSSP is installed
    which = pd.utilities.which
    if which('dssp') is None and which('mkdssp') is None:
        msg = ('For full functionality, please consider installing DSSP, '
               'for instance by typing in a Linux terminal: '
               "'sudo apt install dssp'")
        pd.LOGGER.warn(msg)
    else:
        pd.LOGGER.info('DSSP is installed on the system.')

    pd.SETTINGS.save()
    pd.LOGGER.info('Setup complete.')
    return


def getDefaultTrainingDataset():
    """Returns the precomputed training dataset included with the package,
    as a NumPy structured array."""
    # import training dataset included with package
    working_dir = pd.SETTINGS.get('rhapsody_local_folder')
    tar = tarfile.open(PACKAGE_DATA, "r:gz")
    tar.extractall(path=working_dir)
    tar.close()
    fname = os.path.join(working_dir, TRAINING_DATASET)
    training_dataset = np.load(fname)
    os.remove(fname)
    return training_dataset
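
# Sketch of the returned object: a structured array whose size and field
# names are the values stored under 'rhapsody_training_dataset' by
# initialSetup(), e.g.
#
#   >>> ds = getDefaultTrainingDataset()
#   >>> len(ds)            # dataset size
#   >>> ds.dtype.names     # includes 'SAV_coords', 'true_label' and features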


def getDefaultClassifiers():
    """Returns a dictionary with the paths to the three default classifiers
    (``'full'``, ``'reduced'`` and ``'EVmut'``)
    """
    working_dir = pd.SETTINGS.get('rhapsody_local_folder')
    clsf_folder = os.path.join(working_dir, DEFAULT_CLSF_DIR)
    def_clsfs = {fs: os.path.join(clsf_folder, fs, 'trained_classifier.pkl')
                 for fs in DEFAULT_FEATSETS}
    if any(not os.path.isfile(c) for c in def_clsfs.values()):
        raise IOError('One or more default classifiers are missing. '
                      'Please rerun setup with initialSetup(refresh=True)')
    else:
        return def_clsfs
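
# Sketch: the returned dictionary maps each feature set name to the pickle of
# its trained classifier inside the working directory (path shown below is
# illustrative only):
#
#   >>> getDefaultClassifiers()['full']
#   '~/rhapsody/default_classifiers-sklearn_v.../full/trained_classifier.pkl'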


def importDefaultClassifier(version):
    """Imports the specified classifier and its summary.

    :arg version: either ``'full'``, ``'reduced'`` or ``'EVmut'``
    :type version: str
    """
    assert version in ['full', 'reduced', 'EVmut']
    with open(getDefaultClassifiers()[version], 'rb') as p:
        clsf = pickle.load(p)
    return clsf
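
# Sketch: loading one of the pickled default classifiers; the exact contents
# of the unpickled object depend on what trainRFclassifier() stored in
# 'trained_classifier.pkl':
#
#   >>> clsf = importDefaultClassifier('reduced')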


def delSettings():
    """Removes the Rhapsody configuration entries stored in the ProDy
    settings."""
    for entry in ['rhapsody_local_folder', 'rhapsody_training_dataset',
                  'EVmutation_local_folder', 'EVmutation_metrics']:
        pd.SETTINGS.pop(entry)


def getSettings(print=True):
    """Returns and prints essential information about the current Rhapsody
    configuration, such as the location of the working directory and of the
    default classifiers.
    """
    config_dict = {}
    for entry in ['rhapsody_local_folder', 'rhapsody_training_dataset',
                  'EVmutation_local_folder', 'EVmutation_metrics']:
        config_dict[entry] = pd.SETTINGS.get(entry)
    def_clsfs = getDefaultClassifiers()
    for fs, path in def_clsfs.items():
        fs += ' classifier'
        config_dict[fs] = path
    if print:
        entries = ['rhapsody_local_folder', 'EVmutation_local_folder'] \
                  + [f'{c} classifier' for c in def_clsfs]
        for entry in entries:
            pd.LOGGER.info(f'{entry:24}: {config_dict[entry]}')
        d = pd.SETTINGS['rhapsody_training_dataset']
        pd.LOGGER.info('training dataset size : {}'.format(d['size']))
        if 'AUROC' in pd.SETTINGS.get('EVmutation_metrics', {}):
            pd.LOGGER.info('EVmutation_metrics : <computed>')
        else:
            pd.LOGGER.info('EVmutation_metrics : <missing>')
    return config_dict
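
# Sketch: retrieving the current configuration as a dictionary without
# logging it:
#
#   >>> cfg = getSettings(print=False)
#   >>> cfg['rhapsody_local_folder']     # path to the working directory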