Advanced search performed using these criteria:
This resulted in 7046 protein chains in 6215 PDB entries.
Entries were downloaded:
Each PDB ID and chain was used in order to get NMRCore, FindCore, FindCore extended (aka FindCore2) and Cyrange data.
cyrange <- "#FF853F"
nmrcore <- "#7E78FF"
findcore <- "#63CC4C"
number_of_domains <- read.csv("number_of_domains.csv", head=T, sep="\t", na="-")
ratio_of_residues <- read.csv("ratio_of_residues.csv", head=T, sep="\t", na="-")
dis_data <- read.csv("disorder_vs_core_disorder.csv", sep="\t", head=T)
s2_data <- read.csv("s2_data.csv", head=T, sep="\t")
ss_data <- read.csv("sec_str_ratios.csv", sep="\t", head=T)
fin_vs_cyr <- read.csv("unique_residues_fin_v_cyr.csv", head=T, sep="\t", na="-")
nmr_vs_cyr <- read.csv("unique_residues_nmr_v_cyr.csv", head=T, sep="\t", na="-")
fin_vs_nmr <- read.csv("unique_residues_fin_v_nmr.csv", head=T, sep="\t", na="-")
nmr_v_cyr <- read.csv("unique_residues_with_disorder_nmrcore_cyrange.csv", head=T, sep="\t", na="-")
nmr_v_fin <- read.csv("unique_residues_with_disorder_nmrcore_findcore2.csv", head=T, sep="\t", na="-")
fin_v_cyr <- read.csv("unique_residues_with_disorder_cyrange_findcore2.csv", head=T, sep="\t", na="-")
nmr_v_cyr_ss <- read.csv("sec_str_unique_ratios_cyrange_vs_nmrcore.csv", head=T, sep="\t", na="-")
nmr_v_fin_ss <- read.csv("sec_str_unique_ratios_findcore2_vs_nmrcore.csv", head=T, sep="\t", na="-")
fin_v_cyr_ss <- read.csv("sec_str_unique_ratios_cyrange_vs_findcore2.csv", head=T, sep="\t", na="-")
# Plotting
boxplot(number_of_domains$FindCore,
number_of_domains$FindCore2,
number_of_domains$Cyrange,
number_of_domains$NMRCore,
col=c(findcore, findcore, cyrange, nmrcore),
xaxt = 'n')
legend("topleft",
c("FindCore", "FindCore2", "Cyrange", "NMRCore"),
col=c(findcore, findcore, cyrange, nmrcore),
lwd=4)
axis(1, at=c(1,2,3,4), c("FindCore", "FindCore2", "Cyrange", "NMRCore"))
NMRCore and FindCore finds an unrealistic number of regions. FindCore 2 and Cyrange finds 1.8893611 and 1.6038773 regions on average.
# Define histogram function
hist_of_ratios <- function(x){
if(x == "cyrange"){xcol <- cyrange}
else if(x == "nmrcore"){xcol <- nmrcore}
else if(x == "findcore" || x == "findcore2"){xcol <- findcore}
if(x == "s2"){
hist(1-s2_data$s2,
xlim=c(0, 1), main=x, xlab="Ratio of core residues",
breaks=50,
col="white")
}
else{
hist(ratio_of_residues[[x]],
xlim=c(0, 1), main=x, xlab="Ratio of core residues",
breaks=50,
col=xcol)
}
}
# Plotting
par(mfrow=c(2,3))
hist_of_ratios("findcore")
hist_of_ratios("findcore2")
hist_of_ratios("cyrange")
hist_of_ratios("nmrcore")
hist_of_ratios("s2")
par(mfrow=c(1,1))
According to Welch Two Sample t-test the differences in the ratios of core residues are the following:
# Define custom density plot
custom_density <- function(data, xlab, legend_at){
max_y <- max(density(data$cyrange)$y)
if(max_y < max(density(data$findcore2)$y)){
max_y = max(density(data$findcore2)$y)
}
if(max_y < max(density(data$nmrcore)$y)){
max_y <- max(density(data$nmrcore)$y)
}
plot(density(data$cyrange), col=cyrange, lwd=3,
main="",
ylim=c(0, max_y),
xlab="Ratio of disordered core-domain residues")
lines(density(data$findcore), col=findcore, lwd=3)
lines(density(data$nmrcore), col=nmrcore, lwd=3)
axis(1, at=median(data$cyrange, na.rm=T), col=cyrange, lwd=3, labels="")
axis(1, at=median(data$findcore2, na.rm=T), col=findcore, lwd=3, labels="")
axis(1, at=median(data$nmrcore, na.rm=T), col=nmrcore, lwd=3, labels="")
legend(legend_at,
c("FindCore", "Cyrange", "NMRCore"),
col=c(findcore, cyrange, nmrcore),
lwd=4)
}
custom_density(dis_data, "Ratio of disordered core-domain residues", "topright")
On average 0.1095728 of core residues are disordered when identified by Cyrange, and 0.125645 when identified with FindCore2. This difference is significant according to Welch paired t-test with p-value 1.753912610^{-218}. Significantly more of the identified core residues are disordered according to FindCore2 than Cyrange.
custom_density(s2_data, "Ratio of low order core-domain residues", "topright")
custom_density(ss_data, "Ratios of core residues within SSEs", "topleft")
The core-residues identified by FindCore2 are significantly less structured (i.e. found in secondary structural elements) than those identified by Cyrange. Welch paired t-test p-value: 7.429417310^{-205}
plot_uniques <- function(data, x, y){
library(LSD)
DF <- data.frame(data[[x]],data[[y]])
heatscatter(DF[,1],DF[,2], xlab=x, ylab=y, main="Ratios of uniquely identified residues")
}
plot_uniques(fin_vs_cyr, "findcore2", "cyrange")
plot_uniques(nmr_vs_cyr, "nmrcore", "cyrange")
plot_uniques(fin_vs_nmr, "findcore2", "nmrcore")
different_10 <- which(fin_vs_cyr$findcore2 > 0.1 & fin_vs_cyr$cyrange > 0.1)
different_20 <- which(fin_vs_cyr$findcore2 > 0.2 & fin_vs_cyr$cyrange > 0.2)
similar_90 <- which(fin_vs_cyr$findcore2 < 0.1 & fin_vs_cyr$cyrange < 0.1)
There are 84 entries (out of 6692, 0.0125523%) that are more than 10% different, and 44 entries (0.006575%) that are more than 20% different by the two methods.
0.684997% of the entries (4584) are minimum 90% the same.
FindCore2 uniquely identifies significantly more residues as core domain residues than Cyrange (Welch paired test p-value=1.164165310^{-223}).
# Define plotting function (binary)
plot_disorder <- function(data, x, y){
disordered <- data[which(data$dis_ratio >= 0.6),]
folded <- data[which(data$dis_ratio < 0.6),]
plot(folded[[x]], folded[[y]], bg="blue", pch=21,
main="",
xlab=x,
ylab=y)
points(disordered[[x]], disordered[[y]], bg="red", pch=21)
}
# Binary scatter plots
par(mfrow=c(2,2))
plot_disorder(fin_v_cyr, "findcore2", "cyrange")
plot_disorder(nmr_v_cyr, "nmrcore", "cyrange")
plot_disorder(nmr_v_fin, "findcore2", "nmrcore")
par(mfrow=c(1,1))
# Define plotting function (gradient)
plot_disorder_gradient <- function(data, x, y){
rbPal <- colorRampPalette(c('blue', 'red'))
data$Col <- rbPal(20)[as.numeric(cut(data$dis_ratio,breaks = 20))]
sorted <- data[with(data, order(dis_ratio)), ]
plot(sorted[[x]], sorted[[y]],
main="",
pch = 20,
col = sorted$Col,
xlab=x,
ylab=y)
}
# Gradient scatter plots
par(mfrow=c(2,2))
plot_disorder_gradient(fin_v_cyr, "findcore2", "cyrange")
plot_disorder_gradient(nmr_v_cyr, "nmrcore", "cyrange")
plot_disorder_gradient(nmr_v_fin, "findcore2", "nmrcore")
par(mfrow=c(1,1))
# Define plotting function (density)
plot_disorder_density <- function(data, x, y){
if(x == "cyrange"){xcol <- cyrange}
else if(x == "findcore2"){xcol <- findcore}
else if(x == "nmrcore"){xcol <- nmrcore}
if(y == "cyrange"){ycol <- cyrange}
else if(y == "findcore2"){ycol <- findcore}
else if(y == "nmrcore"){ycol <- nmrcore}
disordered <- data[which(data$dis_ratio >= 0.6),]
plot(density(disordered[[x]]), col=xcol, lwd=3,
main="",
xlab="Ratio of unique core residues")
lines(density(disordered[[y]]), col=ycol, lwd=3)
legend("topright",
c(x, y),
col=c(xcol, ycol),
lwd=4)
}
# Density plots
par(mfrow=c(2,2))
plot_disorder_density(fin_v_cyr, "cyrange", "findcore2")
plot_disorder_density(nmr_v_cyr, "cyrange", "nmrcore")
plot_disorder_density(nmr_v_fin, "nmrcore", "findcore2")
par(mfrow=c(1,1))
plot_sse_density <- function(data, x, y){
if(x == "cyrange"){xcol <- cyrange}
else if(x == "findcore2"){xcol <- findcore}
else if(x == "nmrcore"){xcol <- nmrcore}
if(y == "cyrange"){ycol <- cyrange}
else if(y == "findcore2"){ycol <- findcore}
else if(y == "nmrcore"){ycol <- nmrcore}
max_y <- max(density(data[[x]])$y)
if(max_y < max(density(data[[y]])$y)){
max_y = max(density(data[[y]])$y)
}
plot(density(data[[x]]), col=xcol, lwd=3,
main="",
ylim=c(0, max_y),
xlab="Ratio of unique core residues")
lines(density(data[[y]]), col=ycol, lwd=3)
legend("topright",
c(x, y),
col=c(xcol, ycol),
lwd=4)
}
par(mfrow=c(2,2))
plot_sse_density(nmr_v_fin_ss, "nmrcore", "findcore2")
plot_sse_density(nmr_v_cyr_ss, "cyrange", "nmrcore")
plot_sse_density(fin_v_cyr_ss, "cyrange", "findcore2")
par(mfrow=c(1,1))