# -*- coding: utf-8 -*-
"""This module defines functions for training Random Forest classifiers
implementing Rhapsody's classification schemes."""
import pickle
import numpy as np
import numpy.lib.recfunctions as rfn
from collections import Counter
from prody import LOGGER
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support
from sklearn.utils import resample
from ..utils.settings import DEFAULT_FEATSETS, getDefaultTrainingDataset
from .figures import print_pred_distrib_figure, print_path_prob_figure
from .figures import print_ROC_figure, print_feat_imp_figure
__author__ = "Luca Ponzoni"
__date__ = "December 2019"
__maintainer__ = "Luca Ponzoni"
__email__ = "lponzoni@pitt.edu"
__status__ = "Production"
__all__ = ['calcScoreMetrics', 'calcClassMetrics', 'calcPathogenicityProbs',
'RandomForestCV', 'trainRFclassifier',
'extendDefaultTrainingDataset']
def calcScoreMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
    '''Compute accuracy metrics of continuous values (optionally bootstrapped)
    '''
    def _single_eval(labels, scores):
        # ROC curve and area under it
        fpr, tpr, roc_thr = roc_curve(labels, scores)
        auroc = roc_auc_score(labels, scores)
        # optimal cutoff J: threshold at the maximum of Youden's index
        # (TPR - FPR); argmax picks the first occurrence, as argsort would
        Jopt = roc_thr[np.argmax(tpr - fpr)]
        # Precision-Recall curve and area under it
        pre, rec, prc_thr = precision_recall_curve(labels, scores)
        auprc = average_precision_score(labels, scores)
        return {
            'ROC': {'FPR': fpr, 'TPR': tpr, 'thresholds': roc_thr},
            'AUROC': auroc,
            'optimal cutoff': Jopt,
            'PRC': {'precision': pre, 'recall': rec, 'thresholds': prc_thr},
            'AUPRC': auprc
        }

    if bootstrap < 2:
        return _single_eval(y_test, y_pred)
    # bootstrap: evaluate metrics on resampled copies of the data
    results = [_single_eval(*resample(y_test, y_pred, **resample_kwargs))
               for _ in range(bootstrap)]
    # mean and standard deviation of scalar metrics over resamples
    output = {}
    for metric in ('AUROC', 'optimal cutoff', 'AUPRC'):
        values = [r[metric] for r in results]
        output[f'mean {metric}'] = np.mean(values)
        output[f'{metric} std'] = np.std(values)
    # average the bootstrapped ROC curves on a common FPR grid
    grid_fpr = np.linspace(0, 1, len(y_pred))
    grid_tpr = sum(np.interp(grid_fpr, r['ROC']['FPR'], r['ROC']['TPR'])
                   for r in results) / bootstrap
    grid_tpr[0] = 0.0
    grid_tpr[-1] = 1.0
    output['mean ROC'] = {'FPR': grid_fpr, 'TPR': grid_tpr}
    return output
def calcClassMetrics(y_test, y_pred, bootstrap=0, **resample_kwargs):
    '''Compute accuracy metrics of binary labels (optionally bootstrapped)
    '''
    def _single_eval(labels, preds):
        # per-class precision/recall/F1, computed for class 0 and class 1
        pre, rec, f1s, _ = precision_recall_fscore_support(
            labels, preds, labels=[0, 1])
        # support-weighted averages over the two classes
        avg = precision_recall_fscore_support(
            labels, preds, average='weighted')
        metrics = {'MCC': matthews_corrcoef(labels, preds)}
        for c in (0, 1):
            metrics[f'precision ({c})'] = pre[c]
            metrics[f'recall ({c})'] = rec[c]
            metrics[f'F1 score ({c})'] = f1s[c]
        metrics['precision'] = avg[0]
        metrics['recall'] = avg[1]
        metrics['F1 score'] = avg[2]
        return metrics

    if bootstrap < 2:
        return _single_eval(y_test, y_pred)
    # bootstrap: evaluate on resampled copies, then report mean/std
    results = [_single_eval(*resample(y_test, y_pred, **resample_kwargs))
               for _ in range(bootstrap)]
    summary = {}
    for metric in results[0]:
        values = [r[metric] for r in results]
        summary[f'mean {metric}'] = np.mean(values)
        summary[f'{metric} std'] = np.std(values)
    return summary
def calcPathogenicityProbs(CV_info, num_bins=15,
                           ppred_reliability_cutoff=200,
                           pred_distrib_fig='predictions_distribution.png',
                           path_prob_fig='pathogenicity_prob.png', **kwargs):
    '''Compute pathogenicity probabilities,
    from predictions on CV test sets
    '''
    avg_Jopt = np.mean(CV_info['optimal cutoff'])
    predictions = (np.array(CV_info['predictions_0']),
                   np.array(CV_info['predictions_1']))
    # histograms of predicted scores on [0, 1], one per true class,
    # both raw counts and counts normalized by class size
    dx = 1./num_bins
    bins = np.arange(0, 1+dx, dx)
    histo = np.empty((2, len(bins)-1))
    norm_histo = np.empty_like(histo)
    for c in (0, 1):
        counts, _ = np.histogram(predictions[c], bins, range=(0, 1))
        histo[c] = counts
        norm_histo[c] = counts/len(predictions[c])
    # figure with the two class-conditional score distributions
    if pred_distrib_fig is not None:
        print_pred_distrib_figure(pred_distrib_fig, bins, norm_histo,
                                  dx, avg_Jopt)
    # per-bin pathogenicity probability: fraction of normalized counts
    # contributed by the positive class (0 where both histograms are empty)
    total = norm_histo[0] + norm_histo[1]
    path_prob = np.divide(norm_histo[1], total, out=np.zeros_like(total),
                          where=(total != 0))
    # smooth the profile and extend it to the whole [0, 1] interval,
    # pinning the endpoints at 0 and 1
    smoothed = _running_average(path_prob)
    x_ext = np.concatenate([[0], bins[:-1]+dx/2, [1]])
    y_ext = np.concatenate([[0], smoothed, [1]])
    smooth_path_prob = np.array([x_ext, y_ext])
    # print pathogenicity probability figure
    if path_prob_fig is not None:
        print_path_prob_figure(
            path_prob_fig, bins, histo, dx, path_prob,
            smooth_plot=smooth_path_prob, cutoff=ppred_reliability_cutoff)
    return np.array(smooth_path_prob)
def _running_average(curve):
ext_pprob = np.concatenate([[0], curve, [1]])
return np.convolve(ext_pprob, np.ones((3,))/3, mode='valid')
def _calcSmoothCurve(curve, smooth_window):
# smooth pathogenicity probability profile
n = len(curve)
smooth_curve = np.zeros_like(curve)
for i in range(n):
p = curve[i]
sw = 0
for k in range(1, smooth_window + 1):
if (i-k < 0) or (i+k >= n):
break
else:
sw = k
p += curve[i-k] + curve[i+k]
smooth_curve[i] = p / (1 + sw*2)
return smooth_curve
def _performCV(X, y, sel_SAVs, n_estimators=1000, max_features='auto',
               n_splits=10, ROC_fig='ROC.png', feature_names=None,
               CVseed=666, stratification=None, **kwargs):
    """Run stratified k-fold cross-validation of a Random Forest
    classifier and collect per-fold and averaged accuracy statistics.

    Parameters
    ----------
    X : 2-D feature array, one row per case.
    y : 1-D array of binary true labels (0/1), parallel to X.
    sel_SAVs : structured array with a 'SAV_coords' field, parallel to X/y.
    n_estimators : number of trees in the Random Forest.
    max_features : forwarded to RandomForestClassifier.
    n_splits : number of CV folds.
    ROC_fig : filename of the average-ROC figure, or None to skip plotting.
    feature_names : labels used when logging feature importances; defaults
        to 'feature 0', 'feature 1', ...
    CVseed : random seed of the fold split, for reproducibility.
    stratification : None, 'protein' or 'residue'. If not None, folds are
        reassigned so that all cases sharing a Uniprot accession (or an
        accession+residue pair) land in a single test fold.
    **kwargs : forwarded to calcPathogenicityProbs().

    Returns a CV summary dictionary (dataset size/bias, mean ROC points,
    optimal cutoff, feature importances, pathogenicity probability
    profile, the training SAVs, the folds, and (mean, std) tuples for
    each accuracy metric).
    """
    assert stratification in [None, 'protein', 'residue']
    # set classifier
    classifier = RandomForestClassifier(
        n_estimators=n_estimators, max_features=max_features,
        oob_score=True, n_jobs=-1, class_weight='balanced')
    # define folds
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=CVseed)
    CV_folds = []
    for train, test in cv.split(X, y):
        CV_folds.append([train, test])
    # protein-stratification: a same protein should not be found in
    # both training and test sets
    if stratification is not None:
        # for each fold, count occurrences of each protein/residue
        occurrences = {}
        if stratification == 'protein':
            # e.g. 'P01112'
            accs = np.array([s.split()[0] for s in sel_SAVs['SAV_coords']])
        else:
            # e.g. P01112 99
            accs = np.array([' '.join(s.split()[:2])
                             for s in sel_SAVs['SAV_coords']])
        for k, (train, test) in enumerate(CV_folds):
            counts = Counter(accs[test])
            for acc, count in counts.items():
                occurrences.setdefault(acc, np.zeros(n_splits, dtype=int))
                occurrences[acc][k] = count
        # for each acc. number, find fold with largest occurrences
        best_fold = {a: np.argmax(c) for a, c in occurrences.items()}
        # each accession is assigned to exactly one fold, so a protein
        # (or residue) can no longer straddle training and test sets
        new_folds = np.array([best_fold[a] for a in accs])
        # update folds
        for k in range(n_splits):
            CV_folds[k][0] = np.where(new_folds != k)[0]
            CV_folds[k][1] = np.where(new_folds == k)[0]
    # cross-validation loop
    CV_info = {k: [] for k in [
        'AUROC', 'AUPRC', 'OOB score', 'optimal cutoff', 'MCC',
        'precision (0)', 'recall (0)', 'F1 score (0)',
        'precision (1)', 'recall (1)', 'F1 score (1)',
        'precision', 'recall', 'F1 score',
        'feat. importances', 'predictions_0', 'predictions_1']}
    # average ROC is accumulated on a fixed 20-point FPR grid
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 20)
    i = 0
    for train, test in CV_folds:
        # create training and test datasets
        X_train = X[train]
        X_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # train Random Forest classifier
        classifier.fit(X_train, y_train)
        # calculate probabilities over decision trees
        y_pred = classifier.predict_proba(X_test)[:, 1]
        # compute ROC, AUROC, optimal cutoff (argmax of Youden's index), etc.
        sm = calcScoreMetrics(y_test, y_pred)
        for stat in ['AUROC', 'AUPRC', 'optimal cutoff']:
            CV_info[stat].append(sm[stat])
        # compute Matthews corr. coeff., precision/recall, etc. on classes
        # obtained by thresholding scores at this fold's optimal cutoff
        y_pred_binary = np.where(y_pred > sm['optimal cutoff'], 1, 0)
        cm = calcClassMetrics(y_test, y_pred_binary)
        for stat in cm.keys():
            CV_info[stat].append(cm[stat])
        # other info
        mean_tpr += np.interp(mean_fpr, sm['ROC']['FPR'], sm['ROC']['TPR'])
        CV_info['OOB score'].append(classifier.oob_score_)
        CV_info['feat. importances'].append(
            np.array(classifier.feature_importances_))
        # keep raw test-set scores split by true class, for the
        # pathogenicity-probability histograms computed below
        CV_info['predictions_0'].extend(y_pred[y_test == 0])
        CV_info['predictions_1'].extend(y_pred[y_test == 1])
        # print log
        i += 1
        LOGGER.info('CV iteration #{:2d}: '.format(i) +
                    'AUROC = {:.3f} '.format(sm['AUROC']) +
                    'AUPRC = {:.3f} '.format(sm['AUPRC']) +
                    'OOB score = {:.3f}'.format(classifier.oob_score_))
    # compute average ROC curves
    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[0] = 0.0
    mean_tpr[-1] = 1.0
    # compute average ROC, optimal cutoff and other stats
    # (each entry becomes a (mean, std) tuple over the folds)
    stats = {}
    for s in CV_info.keys():
        if s in ['predictions_0', 'predictions_1']:
            continue
        stats[s] = (np.mean(CV_info[s], axis=0), np.std(CV_info[s], axis=0))
    LOGGER.info('-'*60)
    LOGGER.info('Cross-validation summary:')
    LOGGER.info(f'training dataset size: {len(y):<d}')
    LOGGER.info(f'fraction of positives: {sum(y)/len(y):.3f}')
    for s in ['AUROC', 'AUPRC', 'OOB score', 'optimal cutoff']:
        if s == 'optimal cutoff':
            fields = ('optimal cutoff*:', stats[s][0], stats[s][1])
        else:
            fields = (f'mean {s}:', stats[s][0], stats[s][1])
        LOGGER.info('{:24} {:.3f} +/- {:.3f}'.format(*fields))
    LOGGER.info("(* argmax of Youden's index)")
    n_feats = len(stats['feat. importances'][0])
    if feature_names is None:
        feature_names = [f'feature {i}' for i in range(n_feats)]
    LOGGER.info('feature importances:')
    for i, feat_name in enumerate(feature_names):
        LOGGER.info('{:>23s}: {:.3f}'.format(
            feat_name, stats['feat. importances'][0][i]))
    LOGGER.info('-'*60)
    # map raw scores to pathogenicity probabilities (may also print figures)
    path_prob = calcPathogenicityProbs(CV_info, **kwargs)
    CV_summary = {
        'dataset size': len(y),
        'dataset bias': sum(y)/len(y),
        'mean ROC': list(zip(mean_fpr, mean_tpr)),
        'optimal cutoff': stats['optimal cutoff'],
        'feat. importances': stats['feat. importances'],
        'path. probability': path_prob,
        'training dataset': sel_SAVs,
        'folds': CV_folds
    }
    for s in ['AUROC', 'AUPRC', 'OOB score', 'MCC',
              'precision (0)', 'recall (0)', 'F1 score (0)',
              'precision (1)', 'recall (1)', 'F1 score (1)',
              'precision', 'recall', 'F1 score']:
        CV_summary['mean ' + s] = stats[s]
    # plot average ROC
    if ROC_fig is not None:
        print_ROC_figure(ROC_fig, mean_fpr, mean_tpr, stats['AUROC'])
    return CV_summary
def _importFeatMatrix(fm):
assert fm.dtype.names is not None, \
"feat. matrix must be a NumPy structured array."
assert 'true_label' in fm.dtype.names, \
"feat. matrix must have a 'true_label' field."
assert 'SAV_coords' in fm.dtype.names, \
"feat. matrix must have a 'SAV_coords' field."
assert set(fm['true_label']) == {0, 1}, \
'Invalid true labels in feat. matrix.'
# check for ambiguous cases in training dataset
del_SAVs = set(fm[fm['true_label'] == 1]['SAV_coords'])
neu_SAVs = set(fm[fm['true_label'] == 0]['SAV_coords'])
amb_SAVs = del_SAVs.intersection(neu_SAVs)
if amb_SAVs:
raise RuntimeError('Ambiguous cases found in training dataset: {}'
.format(amb_SAVs))
# eliminate rows containing NaN values from feature matrix
featset = [f for f in fm.dtype.names
if f not in ['true_label', 'SAV_coords']]
sel = [~np.isnan(np.sum([x for x in r])) for r in fm[featset]]
fms = fm[sel]
n_i = len(fm)
n_f = len(fms)
dn = n_i - n_f
if dn:
LOGGER.info(f'{dn} out of {n_i} cases ignored with missing features.')
# split into feature array and true label array
X = np.array([[np.float32(x) for x in v] for v in fms[featset]])
y = fms['true_label']
sel_SAVs = fms[['SAV_coords', 'true_label']]
return X, y, sel_SAVs, featset
def RandomForestCV(feat_matrix, n_estimators=1500, max_features=2, **kwargs):
    """Cross-validate a Random Forest classifier on a structured feature
    matrix and return the cross-validation summary dictionary."""
    X, y, sel_SAVs, featset = _importFeatMatrix(feat_matrix)
    return _performCV(X, y, sel_SAVs, n_estimators=n_estimators,
                      max_features=max_features, feature_names=featset,
                      **kwargs)
def trainRFclassifier(feat_matrix, n_estimators=1500, max_features=2,
                      pickle_name='trained_classifier.pkl',
                      feat_imp_fig='feat_importances.png', **kwargs):
    """Cross-validate and then train a Random Forest classifier on the
    whole dataset.

    Parameters
    ----------
    feat_matrix : NumPy structured array with 'SAV_coords', 'true_label'
        and one field per feature.
    n_estimators : number of trees in the Random Forest.
    max_features : forwarded to RandomForestClassifier.
    pickle_name : filename of the output pickle, or None to skip saving.
    feat_imp_fig : filename of the feature-importance figure, or None to
        skip plotting.
    **kwargs : forwarded to the cross-validation routine.

    Returns a dictionary with the trained classifier ('trained RF'), the
    feature names ('features') and the CV statistics ('CV summary').
    """
    X, y, sel_SAVs, featset = _importFeatMatrix(feat_matrix)
    # calculate optimal Youden cutoff through CV
    CV_summary = _performCV(
        X, y, sel_SAVs, n_estimators=n_estimators,
        max_features=max_features, feature_names=featset, **kwargs)
    # train a classifier on the whole dataset
    clsf = RandomForestClassifier(
        n_estimators=n_estimators, max_features=max_features,
        oob_score=True, class_weight='balanced', n_jobs=-1)
    clsf.fit(X, y)
    fimp = clsf.feature_importances_
    LOGGER.info('-'*60)
    LOGGER.info('Classifier training summary:')
    LOGGER.info(f'mean OOB score: {clsf.oob_score_:.3f}')
    LOGGER.info('feature importances:')
    for feat_name, importance in zip(featset, fimp):
        LOGGER.info(f'{feat_name:>23s}: {importance:.3f}')
    LOGGER.info('-'*60)
    if feat_imp_fig is not None:
        print_feat_imp_figure(feat_imp_fig, fimp, featset)
    clsf_dict = {
        'trained RF': clsf,
        'features': featset,
        'CV summary': CV_summary,
    }
    # save pickle with trained classifier and other info
    if pickle_name is not None:
        # context manager ensures the file handle is closed even on error
        # (the previous `pickle.dump(..., open(...))` leaked the handle)
        with open(pickle_name, 'wb') as handle:
            pickle.dump(clsf_dict, handle)
    return clsf_dict
def extendDefaultTrainingDataset(names, arrays, base_default_featset='full'):
    """Extend the default training dataset with additional feature columns.

    names : string, sequence
        String or sequence of strings corresponding to the names of the
        new fields.
    arrays : array or sequence of arrays
        Array or sequence of arrays storing the fields to add to the
        default training dataset.
    base_default_featset : string, sequence or None
        Name of a default feature set (a key of DEFAULT_FEATSETS, e.g.
        'full'), an explicit sequence of feature names, or None to keep
        no features from the default dataset.

    Returns the extended structured array, with the new fields stored
    as 'float32'.
    """
    training_dataset = getDefaultTrainingDataset()
    # select features from integrated dataset
    # NOTE: the branches must be exclusive (elif); previously a value of
    # None fell through to `list(None)` and raised a TypeError
    if base_default_featset is None:
        base_featset = []
    elif (isinstance(base_default_featset, str)
            and base_default_featset in DEFAULT_FEATSETS):
        base_featset = DEFAULT_FEATSETS[base_default_featset]
    else:
        base_featset = list(base_default_featset)
    featset = ['SAV_coords', 'true_label'] + base_featset
    base_dataset = training_dataset[featset]
    # extend base training dataset
    fm = rfn.rec_append_fields(base_dataset, names, arrays, dtypes='float32')
    return fm