Commit 3a8a6fb5 authored by Martin Larralde's avatar Martin Larralde
Browse files

Add method to predict the structural class of a peptide

parent 1b0e1aac
......@@ -101,6 +101,13 @@ A non-exhaustive list of available features:
- Isoelectric point using one of 8 pKa scales.
- Molecular weight, taking into account isotope labelling, using one of 3 average weight tables.
- Biological properties:
- Structural class using methods and reference data from either
`Nakashima, Nishikawa & Ooi (1986) <https://doi.org/10.1093/oxfordjournals.jbchem.a135454>`_,
`Chou (1989) <https://doi.org/10.1007/978-1-4613-1571-1>`_,
or `Chou & Zhang (1992) <https://doi.org/10.1111/j.1432-1033.1992.tb17067.x>`_.
Setup
-----
......
......@@ -1259,6 +1259,132 @@ class Peptide(typing.Sequence[str]):
return profile
# --- Structural class ---------------------------------------------------
def structural_class(
    self,
    frequencies: str = "Chou",
    distance: str = "correlation",
) -> str:
    """Predict the structural class of the peptide from its sequence.

    The structural class of a protein, as defined in Levitt and Chothia
    (1976), can be either α, β, α+β, or α/β, with ζ being later defined
    for irregular proteins. It depends on the secondary structure of the
    protein. Several methods have been proposed to elucidate the
    structural class from the amino acid sequence, all based on
    similarity with proteins whose structures have been elucidated.

    Chou and Zhang (1992) proposed a correlation-coefficient method
    to predict the structural class of a protein based on its amino
    acid composition.

    Arguments:
        frequencies (`str`): The frequencies of the amino acids in
            proteins of different structural classes to use as reference
            centroids. Use `"Chou"` to load the frequencies of the 64
            proteins analyzed in Chou (1989), or `"Nakashima"` to use
            the normalized frequencies of the 135 proteins analyzed in
            Nakashima *et al* (1986).
        distance (`str`): The distance metric to use in the 20-D space
            formed by the 20 usual amino acids to find the nearest
            structural class for the peptide. Use `"cityblock"` to use
            the Manhattan distance like in Chou (1989), `"euclidean"`
            to use the Euclidean distance like in Nakashima *et al*
            (1986), or `"correlation"` to use the correlation distance
            like in Chou & Zhang (1992).

    Returns:
        `str`: The predicted protein class, one of `"alpha"`, `"beta"`,
        `"alpha+beta"`, `"alpha_beta"` or `"zeta"`. The `"zeta"` class
        can only be returned with the `"Nakashima"` frequencies, as no
        such centroid is loaded for `"Chou"`.

    Raises:
        `ValueError`: When ``frequencies`` or ``distance`` is not one
            of the supported names listed above.

    Example:
        >>> cytochrome_c = Peptide(
        ...     "MGDVAKGKKTFVQKCAQCHTVENGGKHKVGPNLWGLFGRKTGQAEGYSYT"
        ...     "DANKSKGIVWNENTLMEYLENPKKYIPGTKMIFAGIKKKGERQDLVAYLK"
        ...     "SATS"
        ... )
        >>> cytochrome_c.structural_class()
        'alpha'
        >>> cytochrome_c.structural_class(frequencies="Nakashima")
        'alpha'
        >>> erabutoxin_b = Peptide(
        ...     "MKTLLLTLVVVTIVCLDLGYTRICFNHQSSQPQTTKTCSPGESSCYHKQW"
        ...     "SDFRGTIIERGCGCPTVKPGIKLSCCESEVCNN"
        ... )
        >>> erabutoxin_b.structural_class()
        'beta'
        >>> erabutoxin_b.structural_class(distance="cityblock")
        'beta'
        >>> ferredoxin = Peptide(
        ...     "MATYKVTLINEAEGINETIDCDDDTYILDAAEEAGLDLPYSCRAGACSTC"
        ...     "AGTITSGTIDQSDQSFLDDDQIEAGYVLTCVAYPTSDCTIKTHQEEGLY"
        ... )
        >>> ferredoxin.structural_class("Nakashima", "euclidean")
        'zeta'

    References:
        - Chou, K-C., and C-T. Zhang.
          *A Correlation-Coefficient Method to Predicting
          Protein-Structural Classes from Amino Acid Compositions*.
          European Journal of Biochemistry. 1992;207(2):429–33.
          doi:10.1111/j.1432-1033.1992.tb17067.x. PMID:1633801.
        - Chou, P. Y.
          *Prediction of Protein Structural Classes from Amino Acid
          Compositions*. In Prediction of Protein Structure and the
          Principles of Protein Conformation, edited by G. D. Fasman.
          Springer US. 1989:549–86.
          doi:10.1007/978-1-4613-1571-1. ISBN:978-0-306-43131-9.
        - Nakashima, H., K. Nishikawa, and T. Ooi.
          *The Folding Type of a Protein Is Relevant to the Amino Acid
          Composition*. Journal of Biochemistry. Jan 1986;99(1):153–62.
          doi:10.1093/oxfordjournals.jbchem.a135454. PMID:3957893.

    """
    # validate both arguments before doing any work, so that a typo in
    # either name is reported explicitly instead of surfacing later as
    # an unrelated error (e.g. `min()` called on an empty mapping)
    if frequencies not in ("Chou", "Nakashima"):
        raise ValueError(f"Invalid amino acid frequencies: {frequencies!r}")
    if distance not in ("correlation", "cityblock", "euclidean"):
        raise ValueError(f"Invalid distance metric: {distance!r}")

    # get peptide frequencies
    pep_frequencies = self.frequencies()

    # get reference frequencies (one centroid per structural class)
    if frequencies == "Chou":
        ref_frequencies = {
            "alpha": tables.AA_FREQUENCIES["Chou_alpha"],
            "beta": tables.AA_FREQUENCIES["Chou_beta"],
            "alpha+beta": tables.AA_FREQUENCIES["Chou_alpha+beta"],
            "alpha_beta": tables.AA_FREQUENCIES["Chou_alpha_beta"],
        }
    else:
        ref_frequencies = {
            "alpha": tables.AA_FREQUENCIES["Nakashima_alpha"],
            "beta": tables.AA_FREQUENCIES["Nakashima_beta"],
            "alpha+beta": tables.AA_FREQUENCIES["Nakashima_alpha+beta"],
            "alpha_beta": tables.AA_FREQUENCIES["Nakashima_alpha_beta"],
            "zeta": tables.AA_FREQUENCIES["Nakashima_zeta"],
        }
        # Nakashima frequencies are normalized, so we must normalize
        # the peptide frequencies too in that case
        mean = tables.AA_FREQUENCIES["Nakashima"]
        sd = tables.AA_FREQUENCIES["Nakashima_sd"]
        pep_frequencies = {
            aa: (pep_frequencies[aa] - mean[aa]) / sd[aa] for aa in mean
        }

    # compute the distance between the peptide and each class centroid,
    # only considering the amino acids listed in the reference table
    distances = {}
    for name, table in ref_frequencies.items():
        if distance == "correlation":
            s1 = sum(pep_frequencies[x] * table[x] for x in table)
            s2 = sum(pep_frequencies[x] ** 2 for x in table)
            s3 = sum(table[x] ** 2 for x in table)
            distances[name] = 1 - s1 / math.sqrt(s2 * s3)
        elif distance == "cityblock":
            distances[name] = sum(
                abs(pep_frequencies[x] - table[x]) for x in table
            )
        else:  # "euclidean"
            s = sum((pep_frequencies[x] - table[x]) ** 2 for x in table)
            distances[name] = math.sqrt(s)

    # the predicted class is the one with the nearest centroid
    return min(distances, key=distances.get)
# --- Descriptors --------------------------------------------------------
def blosum_indices(self) -> BLOSUMIndices:
......
A,0.093
G,0.091
S,0.067
V,0.065
N,0.064
T,0.062
K,0.059
D,0.059
L,0.058
Y,0.057
I,0.049
E,0.046
R,0.041
C,0.039
Q,0.039
P,0.038
F,0.028
H,0.017
W,0.016
M,0.013
A,0.116
K,0.120
L,0.090
G,0.081
V,0.068
D,0.067
E,0.055
F,0.050
S,0.050
T,0.049
H,0.045
N,0.040
I,0.037
P,0.034
Q,0.027
Y,0.026
R,0.022
M,0.020
W,0.013
C,0.009
V,0.087
G,0.087
A,0.083
L,0.078
S,0.075
K,0.074
E,0.059
D,0.056
I,0.055
T,0.055
P,0.043
N,0.042
F,0.036
R,0.034
Y,0.030
Q,0.026
H,0.025
M,0.021
W,0.017
C,0.015
S,0.123
G,0.107
T,0.091
V,0.082
A,0.073
L,0.064
N,0.050
P,0.046
Q,0.044
D,0.044
I,0.043
K,0.041
Y,0.040
E,0.031
F,0.031
C,0.027
R,0.024
H,0.018
W,0.016
M,0.006
A,0.0874
C,0.0162
D,0.0572
E,0.0639
F,0.0387
G,0.0782
H,0.0215
I,0.0515
K,0.0678
L,0.0820
M,0.0208
N,0.0439
P,0.0449
Q,0.0391
R,0.0481
S,0.0656
T,0.0584
V,0.0701
W,0.0117
Y,0.0333
A,0.0889
C,0.0294
D,0.0576
E,0.0618
F,0.0360
G,0.0800
H,0.0200
I,0.0474
K,0.0718
L,0.0637
M,0.0140
N,0.0560
P,0.0429
Q,0.0317
R,0.0405
S,0.0705
T,0.0641
V,0.0650
W,0.0128
Y,0.0459
A,0.1163
C,0.0171
D,0.0652
E,0.0652
F,0.0422
G,0.0766
H,0.0279
I,0.0372
K,0.1010
L,0.0889
M,0.0242
N,0.0379
P,0.0381
Q,0.0333
R,0.0279
S,0.0544
T,0.0491
V,0.0602
W,0.0117
Y,0.0187
A,0.0883
C,0.0143
D,0.0612
E,0.0612
F,0.0388
G,0.0871
H,0.0219
I,0.0582
K,0.0655
L,0.0854
M,0.0214
N,0.0413
P,0.0436
Q,0.0344
R,0.0435
S,0.0589
T,0.0550
V,0.0762
W,0.0138
Y,0.0302
A,0.0754
C,0.0348
D,0.0537
E,0.0375
F,0.0375
G,0.0987
H,0.0164
I,0.0476
K,0.0466
L,0.0669
M,0.0124
N,0.0490
P,0.0523
Q,0.0412
R,0.0322
S,0.0950
T,0.0783
V,0.0748
W,0.0148
Y,0.0367
A,0.0367
C,0.0153
D,0.0220
E,0.0288
F,0.0185
G,0.0298
H,0.0132
I,0.0229
K,0.0334
L,0.0316
M,0.0126
N,0.0199
P,0.0204
Q,0.0174
R,0.0253
S,0.0273
T,0.0230
V,0.0248
W,0.0098
Y,0.0187
A,0.0890
C,0.1204
D,0.0885
E,0.0685
F,0.0173
G,0.1049
H,0.0102
I,0.0699
K,0.0327
L,0.0402
M,0.0053
N,0.0416
P,0.0582
Q,0.0403
R,0.0108
S,0.0642
T,0.0435
V,0.0489
W,0.0062
Y,0.0395
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment