Evaluating methods to identify well-defined protein residues in NMR ensembles

Explorative analysis and statistics

1) Data assembly

Advanced search performed using these criteria:

Protein molecule
No DNA/RNA
From 2000-01-01 to 2016-10-20
10 <= x <= 100 models
50 <= x residues
Solution/Solid-state NMR

This resulted in 7046 protein chains in 6215 PDB entries.

Entries were downloaded:

FASTA format

Each PDB ID and chain was used in order to get NMRCore, FindCore, FindCore extended (aka FindCore2) and Cyrange data.

cyrange <- "#FF853F"
nmrcore <- "#7E78FF"
findcore <- "#63CC4C"

number_of_domains <- read.csv("number_of_domains.csv", head=T, sep="\t", na="-")
ratio_of_residues <- read.csv("ratio_of_residues.csv", head=T, sep="\t", na="-")
dis_data <- read.csv("disorder_vs_core_disorder.csv", sep="\t", head=T)
s2_data <- read.csv("s2_data.csv", head=T, sep="\t")
ss_data <- read.csv("sec_str_ratios.csv", sep="\t", head=T)
fin_vs_cyr <- read.csv("unique_residues_fin_v_cyr.csv", head=T, sep="\t", na="-")
nmr_vs_cyr <- read.csv("unique_residues_nmr_v_cyr.csv", head=T, sep="\t", na="-")
fin_vs_nmr <- read.csv("unique_residues_fin_v_nmr.csv", head=T, sep="\t", na="-")
nmr_v_cyr <- read.csv("unique_residues_with_disorder_nmrcore_cyrange.csv", head=T, sep="\t", na="-")
nmr_v_fin <- read.csv("unique_residues_with_disorder_nmrcore_findcore2.csv", head=T, sep="\t", na="-")
fin_v_cyr <- read.csv("unique_residues_with_disorder_cyrange_findcore2.csv", head=T, sep="\t", na="-")
nmr_v_cyr_ss <- read.csv("sec_str_unique_ratios_cyrange_vs_nmrcore.csv", head=T, sep="\t", na="-")
nmr_v_fin_ss <- read.csv("sec_str_unique_ratios_findcore2_vs_nmrcore.csv", head=T, sep="\t", na="-")
fin_v_cyr_ss <- read.csv("sec_str_unique_ratios_cyrange_vs_findcore2.csv", head=T, sep="\t", na="-")

2) Number of domains per method

# Plotting
boxplot(number_of_domains$FindCore, 
        number_of_domains$FindCore2, 
        number_of_domains$Cyrange, 
        number_of_domains$NMRCore,
        col=c(findcore, findcore, cyrange, nmrcore),
        xaxt = 'n')
legend("topleft", 
       c("FindCore", "FindCore2", "Cyrange", "NMRCore"),
       col=c(findcore, findcore, cyrange, nmrcore),
       lwd=4)
axis(1, at=c(1,2,3,4), c("FindCore", "FindCore2", "Cyrange", "NMRCore"))

NMRCore and FindCore finds an unrealistic number of regions. FindCore 2 and Cyrange finds 1.8893611 and 1.6038773 regions on average.

3) Ratio of core residues per method

# Define histogram function
hist_of_ratios <- function(x){
  if(x == "cyrange"){xcol <- cyrange}
  else if(x == "nmrcore"){xcol <- nmrcore}
  else if(x == "findcore" || x == "findcore2"){xcol <- findcore}
  
  if(x == "s2"){
    hist(1-s2_data$s2, 
    xlim=c(0, 1), main=x, xlab="Ratio of core residues",
    breaks=50,
    col="white") 
  }
  else{
  hist(ratio_of_residues[[x]], 
    xlim=c(0, 1), main=x, xlab="Ratio of core residues",
    breaks=50,
    col=xcol)     
  }
}

# Plotting
par(mfrow=c(2,3))
hist_of_ratios("findcore")
hist_of_ratios("findcore2")
hist_of_ratios("cyrange")
hist_of_ratios("nmrcore")
hist_of_ratios("s2")
par(mfrow=c(1,1))

According to Welch Two Sample t-test the differences in the ratios of core residues are the following:

FindCore is very significantly different (lower) than all the other methods
Cyrange is significantly different (lower) than NMRCore and FindCore2
NMRCore and FindCore2 are not significantly different

4) Secondary structure of core domain residues

# Define custom density plot
custom_density <- function(data, xlab, legend_at){
  max_y <- max(density(data$cyrange)$y)
  if(max_y < max(density(data$findcore2)$y)){
    max_y = max(density(data$findcore2)$y)
  }
  if(max_y < max(density(data$nmrcore)$y)){
    max_y <- max(density(data$nmrcore)$y)
  }
  plot(density(data$cyrange), col=cyrange, lwd=3, 
    main="",
    ylim=c(0, max_y),
    xlab="Ratio of disordered core-domain residues")
    lines(density(data$findcore), col=findcore, lwd=3)
    lines(density(data$nmrcore), col=nmrcore, lwd=3)
    axis(1, at=median(data$cyrange, na.rm=T), col=cyrange, lwd=3, labels="")
    axis(1, at=median(data$findcore2, na.rm=T), col=findcore, lwd=3, labels="")
    axis(1, at=median(data$nmrcore, na.rm=T), col=nmrcore, lwd=3, labels="")
  legend(legend_at, 
    c("FindCore", "Cyrange", "NMRCore"),
    col=c(findcore, cyrange, nmrcore),
    lwd=4)  
}

4.1) Ratio of core-domain residues with disorder propensities > 0.5

custom_density(dis_data, "Ratio of disordered core-domain residues", "topright")

On average 0.1095728 of core residues are disordered when identified by Cyrange, and 0.125645 when identified with FindCore2. This difference is significant according to Welch paired t-test with p-value 1.753912610^{-218}. Significantly more of the identified core residues are disordered according to FindCore2 than Cyrange.

4.2) Ratio of core-domain residues with S2 < 0.85

custom_density(s2_data, "Ratio of low order core-domain residues", "topright")

4.3) Ratio of core-domain residues in secondary structural elements

custom_density(ss_data, "Ratios of core residues within SSEs", "topleft")

The core-residues identified by FindCore2 are significantly less structured (i.e. found in secondary structural elements) than those identified by Cyrange. Welch paired t-test p-value: 7.429417310^{-205}

5) Ratio of unique residues

plot_uniques <- function(data, x, y){
  library(LSD)
  DF <- data.frame(data[[x]],data[[y]])
  heatscatter(DF[,1],DF[,2], xlab=x, ylab=y, main="Ratios of uniquely identified residues")
}

plot_uniques(fin_vs_cyr, "findcore2", "cyrange")

plot_uniques(nmr_vs_cyr, "nmrcore", "cyrange")

plot_uniques(fin_vs_nmr, "findcore2", "nmrcore")

different_10 <-  which(fin_vs_cyr$findcore2 > 0.1 & fin_vs_cyr$cyrange > 0.1)
different_20 <-  which(fin_vs_cyr$findcore2 > 0.2 & fin_vs_cyr$cyrange > 0.2)
similar_90 <-  which(fin_vs_cyr$findcore2 < 0.1 & fin_vs_cyr$cyrange < 0.1)

There are 84 entries (out of 6692, 0.0125523%) that are more than 10% different, and 44 entries (0.006575%) that are more than 20% different by the two methods.

0.684997% of the entries (4584) are minimum 90% the same.

FindCore2 uniquely identifies significantly more residues as core domain residues than Cyrange (Welch paired test p-value=1.164165310^{-223}).

6) Uniquely identified residues in disordered and well-defined protein chains

6.1) Binary-coloured scatter plots

# Define plotting function (binary)
plot_disorder <- function(data, x, y){
  disordered <- data[which(data$dis_ratio >= 0.6),]
  folded <- data[which(data$dis_ratio < 0.6),]
  plot(folded[[x]], folded[[y]], bg="blue", pch=21,
    main="",
    xlab=x,
    ylab=y)
  points(disordered[[x]], disordered[[y]], bg="red", pch=21)
}

# Binary scatter plots
par(mfrow=c(2,2))
plot_disorder(fin_v_cyr, "findcore2", "cyrange")
plot_disorder(nmr_v_cyr, "nmrcore", "cyrange")
plot_disorder(nmr_v_fin, "findcore2", "nmrcore")
par(mfrow=c(1,1))

6.2) Gradient-coloured scatter plots

# Define plotting function (gradient)
plot_disorder_gradient <- function(data, x, y){
  rbPal <- colorRampPalette(c('blue', 'red'))
  data$Col <- rbPal(20)[as.numeric(cut(data$dis_ratio,breaks = 20))]
  sorted <- data[with(data, order(dis_ratio)), ]
  plot(sorted[[x]], sorted[[y]],
    main="",
    pch = 20,
    col = sorted$Col,
    xlab=x,
    ylab=y)
}

# Gradient scatter plots
par(mfrow=c(2,2))
plot_disorder_gradient(fin_v_cyr, "findcore2", "cyrange")
plot_disorder_gradient(nmr_v_cyr, "nmrcore", "cyrange")
plot_disorder_gradient(nmr_v_fin, "findcore2", "nmrcore")
par(mfrow=c(1,1))

6.3) Density plots

# Define plotting function (density)
plot_disorder_density <- function(data, x, y){
    if(x == "cyrange"){xcol <- cyrange}
    else if(x == "findcore2"){xcol <- findcore}
    else if(x == "nmrcore"){xcol <- nmrcore}
    if(y == "cyrange"){ycol <- cyrange}
    else if(y == "findcore2"){ycol <- findcore}
    else if(y == "nmrcore"){ycol <- nmrcore}
  disordered <- data[which(data$dis_ratio >= 0.6),]
  plot(density(disordered[[x]]), col=xcol, lwd=3,
    main="",
    xlab="Ratio of unique core residues")
  lines(density(disordered[[y]]), col=ycol, lwd=3)
  legend("topright", 
    c(x, y),
    col=c(xcol, ycol),
    lwd=4)
}

# Density plots
par(mfrow=c(2,2))
plot_disorder_density(fin_v_cyr, "cyrange", "findcore2")
plot_disorder_density(nmr_v_cyr, "cyrange", "nmrcore")
plot_disorder_density(nmr_v_fin, "nmrcore", "findcore2")
par(mfrow=c(1,1))

6.4) Density plots of unique core residues outside of SSEs

plot_sse_density <- function(data, x, y){
  if(x == "cyrange"){xcol <- cyrange}
  else if(x == "findcore2"){xcol <- findcore}
  else if(x == "nmrcore"){xcol <- nmrcore}
  
  if(y == "cyrange"){ycol <- cyrange}
  else if(y == "findcore2"){ycol <- findcore}
  else if(y == "nmrcore"){ycol <- nmrcore}
  
  max_y <- max(density(data[[x]])$y)
  if(max_y < max(density(data[[y]])$y)){
    max_y = max(density(data[[y]])$y)
  }
  
  plot(density(data[[x]]), col=xcol, lwd=3,
    main="",
    ylim=c(0, max_y),
    xlab="Ratio of unique core residues")
  lines(density(data[[y]]), col=ycol, lwd=3)
  legend("topright", 
    c(x, y),
    col=c(xcol, ycol),
    lwd=4)
}

par(mfrow=c(2,2))
plot_sse_density(nmr_v_fin_ss, "nmrcore", "findcore2")
plot_sse_density(nmr_v_cyr_ss, "cyrange", "nmrcore")
plot_sse_density(fin_v_cyr_ss, "cyrange", "findcore2")
par(mfrow=c(1,1))