Source code for rhapsody.train.RFtraining

# -*- coding: utf-8 -*-
"""This module defines functions for training Random Forest classifiers
implementing Rhapsody's classification schemes."""

import pickle
import numpy as np
import numpy.lib.recfunctions as rfn
from collections import Counter
from prody import LOGGER
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support
from sklearn.utils import resample
from ..utils.settings import DEFAULT_FEATSETS, getDefaultTrainingDataset
from .figures import print_pred_distrib_figure, print_path_prob_figure
from .figures import print_ROC_figure, print_feat_imp_figure

__author__ = "Luca Ponzoni"
__date__ = "December 2019"
__maintainer__ = "Luca Ponzoni"
__email__ = "lponzoni@pitt.edu"
__status__ = "Production"

__all__ = ['calcScoreMetrics', 'calcClassMetrics', 'calcPathogenicityProbs',
           'RandomForestCV', 'trainRFclassifier',
           'extendDefaultTrainingDataset']


def calcScoreMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
    '''Compute accuracy metrics of continuous values
    (optionally bootstrapped)
    '''
    def _calcScoreMetrics(y_test, y_pred):
        # compute ROC and AUROC
        fpr, tpr, roc_thr = roc_curve(y_test, y_pred)
        roc = {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr}
        auroc = roc_auc_score(y_test, y_pred)
        # compute optimal cutoff J (argmax of Youden's index)
        diff = np.array([y-x for x, y in zip(fpr, tpr)])
        Jopt = roc_thr[(-diff).argsort()][0]
        # compute Precision-Recall curve and AUPRC
        pre, rec, prc_thr = precision_recall_curve(y_test, y_pred)
        prc = {'precision': pre, 'recall': rec, 'thresholds': prc_thr}
        auprc = average_precision_score(y_test, y_pred)
        return {
            'ROC': roc,
            'AUROC': auroc,
            'optimal cutoff': Jopt,
            'PRC': prc,
            'AUPRC': auprc
        }

    if bootstrap < 2:
        output = _calcScoreMetrics(y_test, y_pred)
    else:
        # apply bootstrap
        outputs = []
        for i in range(bootstrap):
            yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
            outputs.append(_calcScoreMetrics(yy_test, yy_pred))
        # compute mean and standard deviation of metrics
        output = {}
        for metric in ['AUROC', 'optimal cutoff', 'AUPRC']:
            v = [d[metric] for d in outputs]
            output[f'mean {metric}'] = np.mean(v)
            output[f'{metric} std'] = np.std(v)
        # compute average ROC
        mean_fpr = np.linspace(0, 1, len(y_pred))
        mean_tpr = 0.0
        for d in outputs:
            mean_tpr += np.interp(mean_fpr, d['ROC']['FPR'], d['ROC']['TPR'])
        mean_tpr /= bootstrap
        mean_tpr[0] = 0.0
        mean_tpr[-1] = 1.0
        output['mean ROC'] = {'FPR': mean_fpr, 'TPR': mean_tpr}

    return output
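

# Example usage (illustrative sketch): calling calcScoreMetrics() on
# synthetic labels and scores. The labels/scores below are placeholders
# generated only for demonstration; with bootstrap >= 2 the returned
# dictionary contains mean/std entries for AUROC, AUPRC and optimal cutoff.
def _example_calcScoreMetrics():
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=200)                      # binary labels
    y_score = np.clip(0.5*y_true + 0.5*rng.rand(200), 0, 1)   # noisy scores
    metrics = calcScoreMetrics(y_true, y_score, bootstrap=20)
    return metrics['mean AUROC'], metrics['AUROC std']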


def calcClassMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
    '''Compute accuracy metrics of binary labels
    (optionally bootstrapped)
    '''
    def _calcClassMetrics(y_test, y_pred):
        mcc = matthews_corrcoef(y_test, y_pred)
        pre, rec, f1s, sup = precision_recall_fscore_support(
            y_test, y_pred, labels=[0, 1])
        avg_pre, avg_rec, avg_f1s, _ = precision_recall_fscore_support(
            y_test, y_pred, average='weighted')
        return {
            'MCC': mcc,
            'precision (0)': pre[0],
            'recall (0)': rec[0],
            'F1 score (0)': f1s[0],
            'precision (1)': pre[1],
            'recall (1)': rec[1],
            'F1 score (1)': f1s[1],
            'precision': avg_pre,
            'recall': avg_rec,
            'F1 score': avg_f1s
        }

    if bootstrap < 2:
        output = _calcClassMetrics(y_test, y_pred)
    else:
        # apply bootstrap
        outputs = []
        for i in range(bootstrap):
            yy_test, yy_pred = resample(y_test, y_pred, **resample_kwargs)
            outputs.append(_calcClassMetrics(yy_test, yy_pred))
        # compute mean and standard deviation of metrics
        output = {}
        for metric in outputs[0].keys():
            v = [d[metric] for d in outputs]
            output[f'mean {metric}'] = np.mean(v)
            output[f'{metric} std'] = np.std(v)

    return output
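

# Example usage (illustrative sketch): calling calcClassMetrics() on
# synthetic binary labels and binary predictions. The data below are
# placeholders; roughly 20% of the labels are flipped to mimic an
# imperfect classifier.
def _example_calcClassMetrics():
    rng = np.random.RandomState(0)
    y_true = rng.randint(0, 2, size=200)
    flip = rng.rand(200) < 0.2
    y_pred = np.where(flip, 1 - y_true, y_true)
    metrics = calcClassMetrics(y_true, y_pred, bootstrap=20)
    return metrics['mean MCC'], metrics['MCC std']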


def calcPathogenicityProbs(CV_info, num_bins=15,
                           ppred_reliability_cutoff=200,
                           pred_distrib_fig='predictions_distribution.png',
                           path_prob_fig='pathogenicity_prob.png', **kwargs):
    '''Compute pathogenicity probabilities from predictions on CV test sets
    '''
    mean_Jopt = np.mean(CV_info['optimal cutoff'])
    preds = [np.array(CV_info['predictions_0']),
             np.array(CV_info['predictions_1'])]

    # compute (normalized) histograms
    dx = 1./num_bins
    bins = np.arange(0, 1+dx, dx)
    histo = np.empty((2, len(bins)-1))
    norm_histo = np.empty_like(histo)
    for i in [0, 1]:
        h, _ = np.histogram(preds[i], bins, range=(0, 1))
        histo[i] = h
        norm_histo[i] = h/len(preds[i])

    # print predictions distribution figure
    if pred_distrib_fig is not None:
        print_pred_distrib_figure(pred_distrib_fig, bins, norm_histo,
                                  dx, mean_Jopt)

    # compute pathogenicity probability
    s = np.sum(norm_histo, axis=0)
    path_prob = np.divide(norm_histo[1], s, out=np.zeros_like(s),
                          where=(s != 0))

    # smooth path. probability profile and extend it to [0, 1] interval
    _smooth = _running_average(path_prob)
    # _smooth = _calcSmoothCurve(path_prob, 5)
    y_ext = np.concatenate([[0], _smooth, [1]])
    x_ext = np.concatenate([[0], bins[:-1]+dx/2, [1]])
    smooth_path_prob = np.array([x_ext, y_ext])

    # print pathogenicity probability figure
    if path_prob_fig is not None:
        print_path_prob_figure(
            path_prob_fig, bins, histo, dx, path_prob,
            smooth_plot=smooth_path_prob, cutoff=ppred_reliability_cutoff)

    return np.array(smooth_path_prob)
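

# Example usage (illustrative sketch): calcPathogenicityProbs() expects a
# dictionary with the cross-validation information collected by _performCV().
# The synthetic entries below only mimic that structure (one optimal cutoff
# per fold, plus pooled test-set predictions for neutral and deleterious
# SAVs); figure output is disabled.
def _example_calcPathogenicityProbs():
    rng = np.random.RandomState(0)
    cv_info = {
        'optimal cutoff': [0.48, 0.50, 0.52],         # one value per CV fold
        'predictions_0': list(rng.beta(2, 5, 500)),   # scores of neutral SAVs
        'predictions_1': list(rng.beta(5, 2, 500)),   # scores of deleterious SAVs
    }
    return calcPathogenicityProbs(cv_info, pred_distrib_fig=None,
                                  path_prob_fig=None)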


def _running_average(curve):
    ext_pprob = np.concatenate([[0], curve, [1]])
    return np.convolve(ext_pprob, np.ones((3,))/3, mode='valid')


def _calcSmoothCurve(curve, smooth_window):
    # smooth pathogenicity probability profile
    n = len(curve)
    smooth_curve = np.zeros_like(curve)
    for i in range(n):
        p = curve[i]
        sw = 0
        for k in range(1, smooth_window + 1):
            if (i-k < 0) or (i+k >= n):
                break
            else:
                sw = k
                p += curve[i-k] + curve[i+k]
        smooth_curve[i] = p / (1 + sw*2)
    return smooth_curve


def _performCV(X, y, sel_SAVs, n_estimators=1000, max_features='auto',
               n_splits=10, ROC_fig='ROC.png', feature_names=None,
               CVseed=666, stratification=None, **kwargs):
    assert stratification in [None, 'protein', 'residue']

    # set classifier
    classifier = RandomForestClassifier(
        n_estimators=n_estimators, max_features=max_features,
        oob_score=True, n_jobs=-1, class_weight='balanced')

    # define folds
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CVseed)
    CV_folds = []
    for train, test in cv.split(X, y):
        CV_folds.append([train, test])

    # protein-stratification: the same protein should not appear in both
    # the training and test sets
    if stratification is not None:
        # for each fold, count occurrences of each protein/residue
        occurrences = {}
        if stratification == 'protein':
            # e.g. 'P01112'
            accs = np.array([s.split()[0] for s in sel_SAVs['SAV_coords']])
        else:
            # e.g. 'P01112 99'
            accs = np.array([' '.join(s.split()[:2])
                             for s in sel_SAVs['SAV_coords']])
        for k, (train, test) in enumerate(CV_folds):
            counts = Counter(accs[test])
            for acc, count in counts.items():
                occurrences.setdefault(acc, np.zeros(n_splits, dtype=int))
                occurrences[acc][k] = count
        # for each acc. number, find fold with largest occurrences
        best_fold = {a: np.argmax(c) for a, c in occurrences.items()}
        new_folds = np.array([best_fold[a] for a in accs])
        # update folds
        for k in range(n_splits):
            CV_folds[k][0] = np.where(new_folds != k)[0]
            CV_folds[k][1] = np.where(new_folds == k)[0]

    # cross-validation loop
    CV_info = {k: [] for k in [
        'AUROC', 'AUPRC', 'OOB score', 'optimal cutoff', 'MCC',
        'precision (0)', 'recall (0)', 'F1 score (0)',
        'precision (1)', 'recall (1)', 'F1 score (1)',
        'precision', 'recall', 'F1 score', 'feat. importances',
        'predictions_0', 'predictions_1']}
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 20)
    i = 0
    for train, test in CV_folds:
        # create training and test datasets
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # train Random Forest classifier
        classifier.fit(X_train, y_train)
        # calculate probabilities over decision trees
        y_pred = classifier.predict_proba(X_test)[:, 1]
        # compute ROC, AUROC, optimal cutoff (argmax of Youden's index), etc.
        sm = calcScoreMetrics(y_test, y_pred)
        for stat in ['AUROC', 'AUPRC', 'optimal cutoff']:
            CV_info[stat].append(sm[stat])
        # compute Matthews corr. coeff., precision/recall, etc. on classes
        y_pred_binary = np.where(y_pred > sm['optimal cutoff'], 1, 0)
        cm = calcClassMetrics(y_test, y_pred_binary)
        for stat in cm.keys():
            CV_info[stat].append(cm[stat])
        # other info
        mean_tpr += np.interp(mean_fpr, sm['ROC']['FPR'], sm['ROC']['TPR'])
        CV_info['OOB score'].append(classifier.oob_score_)
        CV_info['feat. importances'].append(
            np.array(classifier.feature_importances_))
        CV_info['predictions_0'].extend(y_pred[y_test == 0])
        CV_info['predictions_1'].extend(y_pred[y_test == 1])
        # print log
        i += 1
        LOGGER.info('CV iteration #{:2d}: '.format(i) +
                    'AUROC = {:.3f} '.format(sm['AUROC']) +
                    'AUPRC = {:.3f} '.format(sm['AUPRC']) +
                    'OOB score = {:.3f}'.format(classifier.oob_score_))

    # compute average ROC curves
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0

    # compute average ROC, optimal cutoff and other stats
    stats = {}
    for s in CV_info.keys():
        if s in ['predictions_0', 'predictions_1']:
            continue
        stats[s] = (np.mean(CV_info[s], axis=0), np.std(CV_info[s], axis=0))

    LOGGER.info('-'*60)
    LOGGER.info('Cross-validation summary:')
    LOGGER.info(f'training dataset size: {len(y):<d}')
    LOGGER.info(f'fraction of positives: {sum(y)/len(y):.3f}')
    for s in ['AUROC', 'AUPRC', 'OOB score', 'optimal cutoff']:
        if s == 'optimal cutoff':
            fields = ('optimal cutoff*:', stats[s][0], stats[s][1])
        else:
            fields = (f'mean {s}:', stats[s][0], stats[s][1])
        LOGGER.info('{:24} {:.3f} +/- {:.3f}'.format(*fields))
    LOGGER.info("(* argmax of Youden's index)")
    n_feats = len(stats['feat. importances'][0])
    if feature_names is None:
        feature_names = [f'feature {i}' for i in range(n_feats)]
    LOGGER.info('feature importances:')
    for i, feat_name in enumerate(feature_names):
        LOGGER.info('{:>23s}: {:.3f}'.format(
            feat_name, stats['feat. importances'][0][i]))
    LOGGER.info('-'*60)

    path_prob = calcPathogenicityProbs(CV_info, **kwargs)
    CV_summary = {
        'dataset size': len(y),
        'dataset bias': sum(y)/len(y),
        'mean ROC': list(zip(mean_fpr, mean_tpr)),
        'optimal cutoff': stats['optimal cutoff'],
        'feat. importances': stats['feat. importances'],
        'path. probability': path_prob,
        'training dataset': sel_SAVs,
        'folds': CV_folds
    }
    for s in ['AUROC', 'AUPRC', 'OOB score', 'MCC',
              'precision (0)', 'recall (0)', 'F1 score (0)',
              'precision (1)', 'recall (1)', 'F1 score (1)',
              'precision', 'recall', 'F1 score']:
        CV_summary['mean ' + s] = stats[s]

    # plot average ROC
    if ROC_fig is not None:
        print_ROC_figure(ROC_fig, mean_fpr, mean_tpr, stats['AUROC'])

    return CV_summary


def _importFeatMatrix(fm):
    assert fm.dtype.names is not None, \
        "feat. matrix must be a NumPy structured array."
    assert 'true_label' in fm.dtype.names, \
        "feat. matrix must have a 'true_label' field."
    assert 'SAV_coords' in fm.dtype.names, \
        "feat. matrix must have a 'SAV_coords' field."
    assert set(fm['true_label']) == {0, 1}, \
        'Invalid true labels in feat. matrix.'

    # check for ambiguous cases in training dataset
    del_SAVs = set(fm[fm['true_label'] == 1]['SAV_coords'])
    neu_SAVs = set(fm[fm['true_label'] == 0]['SAV_coords'])
    amb_SAVs = del_SAVs.intersection(neu_SAVs)
    if amb_SAVs:
        raise RuntimeError('Ambiguous cases found in training dataset: {}'
                           .format(amb_SAVs))

    # eliminate rows containing NaN values from feature matrix
    featset = [f for f in fm.dtype.names
               if f not in ['true_label', 'SAV_coords']]
    sel = [~np.isnan(np.sum([x for x in r])) for r in fm[featset]]
    fms = fm[sel]
    n_i = len(fm)
    n_f = len(fms)
    dn = n_i - n_f
    if dn:
        LOGGER.info(f'{dn} out of {n_i} cases ignored with missing features.')

    # split into feature array and true label array
    X = np.array([[np.float32(x) for x in v] for v in fms[featset]])
    y = fms['true_label']
    sel_SAVs = fms[['SAV_coords', 'true_label']]

    return X, y, sel_SAVs, featset


def RandomForestCV(feat_matrix, n_estimators=1500, max_features=2, **kwargs):
    '''Perform cross-validation of a Random Forest classifier on the given
    feature matrix and return a summary of the results.
    '''
    X, y, sel_SAVs, featset = _importFeatMatrix(feat_matrix)
    CV_summary = _performCV(
        X, y, sel_SAVs, n_estimators=n_estimators,
        max_features=max_features, feature_names=featset, **kwargs)
    return CV_summary
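

# Example usage (illustrative sketch): RandomForestCV() expects a NumPy
# structured array with 'SAV_coords' and 'true_label' fields plus one field
# per feature. The tiny synthetic matrix below (field names, SAV coordinates
# and values) is made up purely for demonstration; figures are disabled and
# a small forest/fold number is used to keep the run short.
def _example_RandomForestCV():
    rng = np.random.RandomState(0)
    n = 120
    fm = np.empty(n, dtype=[('SAV_coords', 'U25'), ('true_label', 'i4'),
                            ('feat_A', 'f4'), ('feat_B', 'f4')])
    fm['SAV_coords'] = ['P{:05d} {} A V'.format(i, i % 300 + 1)
                        for i in range(n)]
    fm['true_label'] = rng.randint(0, 2, n)
    fm['feat_A'] = rng.rand(n) + fm['true_label']   # informative feature
    fm['feat_B'] = rng.rand(n)                      # noise feature
    return RandomForestCV(fm, n_estimators=50, n_splits=3,
                          ROC_fig=None, pred_distrib_fig=None,
                          path_prob_fig=None)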


def trainRFclassifier(feat_matrix, n_estimators=1500, max_features=2,
                      pickle_name='trained_classifier.pkl',
                      feat_imp_fig='feat_importances.png', **kwargs):
    '''Train a Random Forest classifier on the whole feature matrix, after
    assessing its performance through cross-validation, and return a
    dictionary with the trained classifier, the feature set and the
    cross-validation summary.
    '''
    X, y, sel_SAVs, featset = _importFeatMatrix(feat_matrix)

    # calculate optimal Youden cutoff through CV
    CV_summary = _performCV(
        X, y, sel_SAVs, n_estimators=n_estimators,
        max_features=max_features, feature_names=featset, **kwargs)

    # train a classifier on the whole dataset
    clsf = RandomForestClassifier(
        n_estimators=n_estimators, max_features=max_features,
        oob_score=True, class_weight='balanced', n_jobs=-1)
    clsf.fit(X, y)

    fimp = clsf.feature_importances_
    LOGGER.info('-'*60)
    LOGGER.info('Classifier training summary:')
    LOGGER.info(f'mean OOB score: {clsf.oob_score_:.3f}')
    LOGGER.info('feature importances:')
    for feat_name, importance in zip(featset, fimp):
        LOGGER.info(f'{feat_name:>23s}: {importance:.3f}')
    LOGGER.info('-'*60)

    if feat_imp_fig is not None:
        print_feat_imp_figure(feat_imp_fig, fimp, featset)

    clsf_dict = {
        'trained RF': clsf,
        'features': featset,
        'CV summary': CV_summary,
    }

    # save pickle with trained classifier and other info
    if pickle_name is not None:
        with open(pickle_name, 'wb') as f:
            pickle.dump(clsf_dict, f)

    return clsf_dict
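

# Example usage (illustrative sketch): reloading the dictionary pickled by
# trainRFclassifier() and inspecting its entries. 'trained_classifier.pkl'
# is the default pickle_name used above; any other path works as well.
def _example_load_trained_classifier(pickle_name='trained_classifier.pkl'):
    with open(pickle_name, 'rb') as f:
        clsf_dict = pickle.load(f)
    clsf = clsf_dict['trained RF']        # fitted RandomForestClassifier
    featset = clsf_dict['features']       # feature names used for training
    cv_summary = clsf_dict['CV summary']  # cross-validation statistics
    return clsf, featset, cv_summary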


def extendDefaultTrainingDataset(names, arrays, base_default_featset='full'):
    """Extend the default training dataset with additional features.

    names : string, sequence
        String or sequence of strings corresponding to the names of the new
        features.
    arrays : array or sequence of arrays
        Array or sequence of arrays storing the new features to be added to
        the default training dataset.
    base_default_featset : string, sequence or None
        Name of a default feature set (e.g. 'full'), an explicit sequence of
        features from the default training dataset to retain, or None to
        keep none of them.
    """
    training_dataset = getDefaultTrainingDataset()

    # select features from integrated dataset
    if base_default_featset is None:
        base_featset = []
    elif base_default_featset in DEFAULT_FEATSETS:
        base_featset = DEFAULT_FEATSETS[base_default_featset]
    else:
        base_featset = list(base_default_featset)
    featset = ['SAV_coords', 'true_label'] + base_featset
    base_dataset = training_dataset[featset]

    # extend base training dataset
    fm = rfn.rec_append_fields(base_dataset, names, arrays, dtypes='float32')

    return fm
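

# Example usage (illustrative sketch): appending a hypothetical extra
# feature column to the default training dataset. The field name
# 'new_feature' and the zero-filled values are placeholders; the new array
# must contain one entry per SAV in getDefaultTrainingDataset(), in the
# same order.
def _example_extendDefaultTrainingDataset():
    n_rows = len(getDefaultTrainingDataset())
    new_values = np.zeros(n_rows, dtype='float32')   # placeholder feature
    return extendDefaultTrainingDataset('new_feature', new_values,
                                        base_default_featset='full')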