# -*- coding: utf-8 -*-
"""This module defines functions for querying the PolyPhen-2 online tool,
parsing its output and deriving features that will be used by the Rhapsody
classifiers.
"""
import os
import requests
import datetime
import numpy as np
from prody import LOGGER
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from math import log
from .Uniprot import queryUniprot
__author__ = "Luca Ponzoni"
__date__ = "December 2019"
__maintainer__ = "Luca Ponzoni"
__email__ = "lponzoni@pitt.edu"
__status__ = "Production"
__all__ = ['PP2_FEATS', 'queryPolyPhen2', 'parsePolyPhen2output',
'getSAVcoords', 'calcPolyPhen2features']
PP2_FEATS = ['wt_PSIC', 'Delta_PSIC']
"""List of features derived from PolyPhen-2's output."""
# Column names, in order, of the tab-separated table found in PolyPhen-2's
# "full" output file (pph2-full.txt). Used by parsePolyPhen2output() to
# build the dtype of its structured array.
pph2_columns = ['o_acc', 'o_pos', 'o_aa1', 'o_aa2', 'rsid',
                'acc', 'pos', 'aa1', 'aa2', 'nt1', 'nt2',
                'prediction', 'based_on', 'effect', 'pph2_class',
                'pph2_prob', 'pph2_FPR', 'pph2_TPR', 'pph2_FDR',
                'site', 'region', 'PHAT', 'dScore', 'Score1',
                'Score2', 'MSAv', 'Nobs', 'Nstruct', 'Nfilt',
                'PDB_id', 'PDB_pos', 'PDB_ch', 'ident', 'length',
                'NormASA', 'SecStr', 'MapReg', 'dVol', 'dProp',
                'B-fact', 'H-bonds', 'AveNHet', 'MinDHet', 'AveNInt',
                'MinDInt', 'AveNSit', 'MinDSit', 'Transv', 'CodPos',
                'CpG', 'MinDJxn', 'PfamHit', 'IdPmax', 'IdPSNP',
                'IdQmin', 'other']
def _requests_retry_session(retries=10, timeout=1, backoff_factor=0.3,
                            status_forcelist=(404,), session=None):
    """Return a requests Session that automatically retries failed requests.

    Based on:
    https://www.peterbe.com/plog/best-practice-with-retries-with-requests

    NOTE(review): the ``timeout`` argument is accepted but never used in
    this function's body — per-request timeouts would have to be passed to
    the individual ``get``/``post`` calls instead. Confirm before relying
    on it.

    Approximate total waiting times, with the default backoff factor
    (intervals between retries, in minutes, follow
    ``[min((backoff_factor*(2**(i - 1))), 120) / 60 for i in range(30)]``):
      12 retries --> ~6 minutes
      16 retries --> ~14 minutes
      30 retries --> ~42 minutes
      60 retries --> ~102 minutes
     100 retries --> ~182 minutes
     200 retries --> ~6 hours
    """
    if not session:
        session = requests.Session()
    retry_policy = Retry(total=retries, read=retries, connect=retries,
                         backoff_factor=backoff_factor,
                         status_forcelist=status_forcelist)
    retry_adapter = HTTPAdapter(max_retries=retry_policy)
    for scheme in ('http://', 'https://'):
        session.mount(scheme, retry_adapter)
    return session
def _check_log_errors(text):
    """Scan PolyPhen-2's log text for messages indicating wrong SAV
    coordinates and return the set of Uniprot accession numbers that were
    flagged (empty if none were found).
    """
    markers = (
        'ERROR: Neither AA1',
        'ERROR: Invalid variation position',
        'WARNING: Swapped input residues AA1'
    )
    flagged = set()
    for line in text.split('\n'):
        if any(marker in line for marker in markers):
            # flagged lines apparently start with a one-character prefix
            # followed by the accession and a colon: drop the prefix and
            # keep everything before the first colon
            flagged.add(line.split(':')[0][1:])
    if flagged:
        LOGGER.warn('Wrong SAV coordinates detected for '
                    f'the following Uniprot sequences: {flagged}')
    return flagged
def _print_fasta_file(Uniprot_accs, filename='custom_sequences.fasta'):
    """Download fresh Uniprot sequences and write them to a FASTA file.

    Each accession is given a new, unique temporary accession number
    (the original accession with today's date appended), so that the
    uploaded sequences are treated as distinct custom entries.

    Returns the FASTA filename and a dict mapping each original accession
    number to its new temporary one.
    """
    date = datetime.date.today().strftime('%Y%m%d')
    new_accs = {}
    # buffering=1: line-buffered, entries reach disk as soon as written
    with open(filename, 'w', 1) as f:
        for acc in Uniprot_accs:
            new_acc = f"{acc}-{date}"
            record = queryUniprot(acc)
            sequence = record['sequence 0']
            # FIX: the header must end with a newline, and the sequence
            # must start on its own line — otherwise header and sequence
            # could be fused on a single (invalid) FASTA line. Strip the
            # downloaded sequence's surrounding whitespace and terminate
            # it explicitly so the output is valid either way.
            f.write(f">{new_acc}\n")
            f.write(sequence.strip() + '\n')
            # store new temporary accession numbers
            new_accs[acc] = new_acc
    return filename, new_accs
def _replace_strings_in_text(text, dict_substitutions):
    """Return *text* with every occurrence of each key of
    *dict_substitutions* replaced by the corresponding value.
    Substitutions are applied sequentially, in dict iteration order.
    """
    for original, replacement in dict_substitutions.items():
        text = text.replace(original, replacement)
    return text
def _replace_strings_in_file(fname, new_fname, dict_substitutions):
    """Copy file *fname* to *new_fname*, replacing every occurrence of
    each key of *dict_substitutions* with the corresponding value.
    *fname* and *new_fname* may be the same path (in-place rewrite).
    Returns *new_fname*.
    """
    with open(fname, 'r') as in_file:
        contents = in_file.read()
    for original, replacement in dict_substitutions.items():
        contents = contents.replace(original, replacement)
    with open(new_fname, 'w') as out_file:
        out_file.write(contents)
    return new_fname
def queryPolyPhen2(filename, dump=True, prefix='pph2',
                   fasta_file=None, fix_isoforms=False,
                   ignore_errors=False, **kwargs):
    """Submit a batch of SAVs to the PolyPhen-2 web server and fetch the
    resulting output files.

    Parameters
    ----------
    filename : str
        file containing the list of SAVs to be submitted.
    dump : bool
        if **True**, each retrieved output file is also written to disk
        as ``<prefix>-<name>.txt``.
    prefix : str
        prefix for the dumped files.
    fasta_file : str, optional
        FASTA file with custom sequences to upload along with the query.
    fix_isoforms : bool
        if **True** and PolyPhen-2's log reports wrong SAV coordinates,
        the query is resubmitted once with freshly downloaded Uniprot
        sequences.
    ignore_errors : bool
        if **True**, errors found in PolyPhen-2's log are only reported
        as warnings; otherwise they are logged as errors.
    kwargs
        optional ``MODELNAME``, ``UCSCDB`` and ``SNPFUNC`` form fields.

    Returns
    -------
    dict
        maps each output name ('started', 'completed', 'short', 'full',
        'log', 'snps') to the text of the corresponding file.
    """
    # original PolyPhen-2 curl command (see:
    # http://genetics.bwh.harvard.edu/pph2/dokuwiki/faq ):
    #
    # curl -F _ggi_project=PPHWeb2 -F _ggi_origin=query \
    # -F _ggi_target_pipeline=1 -F MODELNAME=HumDiv \
    # -F UCSCDB=hg19 -F SNPFUNC=m -F NOTIFYME=myemail@myisp.com \
    # -F _ggi_batch_file=@example_batch.txt \
    # -D - http://genetics.bwh.harvard.edu/cgi-bin/ggi/ggi2.cgi
    assert type(dump) is bool
    assert type(prefix) is str
    LOGGER.info('Submitting query to PolyPhen-2...')
    # count submitted SAVs. FIX: the file is opened in binary mode, so
    # comment lines must be detected with a *bytes* prefix — the original
    # `line[0] != '#'` compared an int to a str (always True in Python 3).
    # The file handle is also closed now instead of being leaked.
    with open(filename, 'rb') as f:
        num_lines = sum(1 for line in f if not line.startswith(b'#'))
    # submit query
    address = 'http://genetics.bwh.harvard.edu/ggi/cgi-bin/ggi2.cgi'
    input_file = open(filename, 'rb')
    custom_fasta = None
    try:
        files = {
            '_ggi_project': (None, 'PPHWeb2'),
            '_ggi_origin': (None, 'query'),
            '_ggi_target_pipeline': (None, '1'),
            '_ggi_batch_file': ('query.txt', input_file),
            'MODELNAME': (None, kwargs.get('MODELNAME', 'HumDiv')),
            'UCSCDB': (None, kwargs.get('UCSCDB', 'hg19')),
            'SNPFUNC': (None, kwargs.get('SNPFUNC', 'm'))
        }
        if fasta_file is not None:
            # upload custom sequences
            custom_fasta = open(fasta_file, 'rb')
            files['uploaded_sequences_1'] = ('sequences.fa', custom_fasta)
        response = requests.post(address, files=files)
    finally:
        # FIX: close the uploaded file handles even if the POST fails
        input_file.close()
        if custom_fasta is not None:
            custom_fasta.close()
    # parse job ID from response page
    jobID = response.cookies['polyphenweb2']
    # results and semaphore files
    results_dir = f'http://genetics.bwh.harvard.edu/ggi/pph2/{jobID}/1/'
    files = {'started': results_dir + 'started.txt',
             'completed': results_dir + 'completed.txt',
             'short': results_dir + 'pph2-short.txt',
             'full': results_dir + 'pph2-full.txt',
             'log': results_dir + 'pph2-log.txt',
             'snps': results_dir + 'pph2-snps.txt'}
    # keep checking if the job has started/completed and,
    # when done, fetch output files
    output = {}
    exts = ['started', 'completed', 'short', 'full', 'log', 'snps']
    for k in exts:
        # delay = timeout + backoff_factor*[2^(total_retries - 1)]
        if k == 'started':
            LOGGER.timeit('_started')
            r = _requests_retry_session(retries=16).get(files[k])
            LOGGER.report('Query to PolyPhen-2 started in %.1fs.', '_started')
            LOGGER.info('PolyPhen-2 is running...')
        elif k == 'completed':
            LOGGER.timeit('_queryPP2')
            # scale the nominal timeout with the size of the query.
            # FIX: guard against math domain error / log(1)=0 when the
            # SAV list is empty or has a single entry
            r = _requests_retry_session(
                retries=200, timeout=log(max(num_lines, 2))/2).get(files[k])
            LOGGER.report('Query to PolyPhen-2 completed in %.1fs.',
                          '_queryPP2')
        else:
            r = _requests_retry_session(retries=12).get(files[k])
        output[k] = r.text
        # print to file, if requested
        if dump:
            with open(prefix + '-' + k + '.txt', 'w', 1) as f:
                print(r.text, file=f)
    # check for conflicts between Uniprot sequences and isoforms used
    # by PolyPhen-2 (which are sometimes outdated)
    Uniprot_accs = _check_log_errors(output['log'])
    if Uniprot_accs:
        if fix_isoforms:
            LOGGER.info('PolyPhen-2 may have picked the wrong isoforms.')
            LOGGER.info('Resubmitting query with correct isoforms --- '
                        'it may take up to a few hours to complete...')
            # print file with freshly downloaded Uniprot sequences
            fasta_fname, new_accs = _print_fasta_file(Uniprot_accs)
            # replace accession numbers in list of SAVs
            tmp_fname = filename + '.tmp'
            _replace_strings_in_file(filename, tmp_fname, new_accs)
            # resubmit query by manually uploading fasta sequences
            output = queryPolyPhen2(
                tmp_fname, dump=dump, prefix=prefix,
                fasta_file=fasta_fname, fix_isoforms=False, **kwargs)
            os.remove(tmp_fname)
            # restore original accession numbers in output
            orig_accs = dict([[v, k] for k, v in new_accs.items()])
            for k in exts:
                output[k] = _replace_strings_in_text(output[k], orig_accs)
                if dump:
                    # FIX: the recursive call dumped files with the same
                    # `prefix`, so rewrite those (not hard-coded 'pph2-')
                    outfile = f'{prefix}-{k}.txt'
                    _replace_strings_in_file(outfile, outfile, orig_accs)
        elif not ignore_errors:
            # FIX: original branches were inverted (warned when errors
            # were NOT ignored, logged an error when they were)
            LOGGER.error('Please check PolyPhen-2 log file')
        else:
            LOGGER.warn('Please check PolyPhen-2 log file')
    return output
def parsePolyPhen2output(pph2_output):
    '''Import PolyPhen-2 results directly from the output of
    'queryPolyPhen2' or from a file (in 'full' format).

    Parameters
    ----------
    pph2_output : dict or str
        either the dictionary returned by :func:`queryPolyPhen2` (its
        'full' entry is parsed) or the path to a file in PolyPhen-2's
        'full' format.

    Returns
    -------
    numpy structured array
        one row per SAV, one field (dtype 'U25') per column listed in
        `pph2_columns`.

    Raises
    ------
    RuntimeError
        if no data lines are found.
    ValueError
        if a data line has an unexpected number of columns.
    '''
    # FIX: use isinstance instead of exact type comparison (also accepts
    # dict/str subclasses, backward-compatible otherwise)
    assert isinstance(pph2_output, (dict, str))
    if isinstance(pph2_output, dict):
        lines = pph2_output['full'].split('\n')
    else:
        with open(pph2_output, 'r') as file:
            lines = file.readlines()
    # discard empty and comment lines
    lines = [ln for ln in lines if ln.strip() and ln[0] != '#']
    if not lines:
        msg = (
            "PolyPhen-2's output is empty. Please check file 'pph2-log.txt' "
            "in the output folder for error messages from PolyPhen-2. \n"
            "Typical errors include: \n"
            "1) query contains *non-human* variants \n"
            "2) variants' format is incorrect (e.g. "
            '"UniprotID pos wt_aa mut_aa") \n'
            "3) wild-type amino acids are in the wrong position on the "
            "sequence (please refer to Uniprot's canonical isoform) \n"
            "4) Uniprot accession number is not recognized by PolyPhen-2. \n"
        )
        raise RuntimeError(msg)
    # define a structured array
    pl_dtype = np.dtype([(col, 'U25') for col in pph2_columns])
    parsed_lines = np.zeros(len(lines), dtype=pl_dtype)
    # fill structured array
    n_cols = len(pph2_columns)
    for i, line in enumerate(lines):
        # parse line
        words = [w.strip() for w in line.split('\t')]
        # check format
        n_words = len(words)
        if n_words == n_cols - 1:
            # manually insert null 'other' column
            words.append('?')
        elif n_words != n_cols:
            msg = 'Incorrect number of columns: {}'.format(n_words)
            raise ValueError(msg)
        # import to structured array
        parsed_lines[i] = tuple(words)
    LOGGER.info("PolyPhen-2's output parsed.")
    return parsed_lines
def getSAVcoords(parsed_lines):
    """Extracts SAV Uniprot coordinates as provided by the user. If not
    possible, the Uniprot coordinates computed by PolyPhen-2 will be returned.
    A string containing the original submitted SAV is returned as well.

    Parameters
    ----------
    parsed_lines : numpy structured array
        output of :func:`parsePolyPhen2output`.

    Returns
    -------
    numpy structured array
        fields: 'acc', 'pos', 'aa_wt', 'aa_mut' and 'text' (the SAV as
        originally submitted).
    """
    SAV_dtype = np.dtype([('acc', 'U15'), ('pos', 'i'),
                          ('aa_wt', 'U1'), ('aa_mut', 'U1'),
                          ('text', 'U25')])
    SAV_coords = np.empty(len(parsed_lines), dtype=SAV_dtype)
    for i, line in enumerate(parsed_lines):
        o_acc = line['o_acc']
        # dbSNP ids ('rs...') and genomic coordinates ('chr...') carry no
        # usable Uniprot coordinates in the original submission
        if o_acc.startswith(('rs', 'chr')):
            # recover Uniprot accession number from PolyPhen-2 output
            acc = line['acc']
            pos = int(line['pos'])
            aa1 = line['aa1']
            aa2 = line['aa2']
            SAV_str = o_acc
        else:
            # use the coordinates as originally submitted by the user
            acc = line['o_acc']
            pos = int(line['o_pos'])
            aa1 = line['o_aa1']
            aa2 = line['o_aa2']
            SAV_str = '{} {} {} {}'.format(acc, pos, aa1, aa2)
        SAV_coords[i] = (acc, pos, aa1, aa2, SAV_str)
    return SAV_coords
def calcPolyPhen2features(PolyPhen2output):
    """Compute sequence-conservation features from PolyPhen-2's output.

    Parameters
    ----------
    PolyPhen2output : numpy structured array
        output of :func:`parsePolyPhen2output`; the 'Score1' and 'dScore'
        fields are used.

    Returns
    -------
    numpy structured array
        fields 'wt_PSIC' (from 'Score1') and 'Delta_PSIC' (from 'dScore'),
        one row per SAV; missing values ('?') are converted to NaN.
    """
    # define a datatype for sequence-conservation features
    # extracted from the output of PolyPhen-2
    feat_dtype = np.dtype([('wt_PSIC', 'f'),
                           ('Delta_PSIC', 'f')])
    # import selected quantities from PolyPhen-2's output
    # into a structured array, mapping missing entries ('?') to NaN
    selected = PolyPhen2output[['Score1', 'dScore']]
    rows = [tuple(np.nan if x == '?' else x for x in row)
            for row in selected]
    features = np.array(rows, dtype=feat_dtype)
    LOGGER.info("Sequence-conservation features have been "
                "retrieved from PolyPhen-2's output.")
    return features