Evaluating methods to identify well-defined protein residues in NMR ensembles

Explorative analysis and statistics

1) Data assembly

Advanced search performed using these criteria:

  • Protein molecule
  • No DNA/RNA
  • From 2000-01-01 to 2016-10-20
  • 10 <= x <= 100 models
  • 50 <= x residues
  • Solution/Solid-state NMR

This resulted in 7046 protein chains in 6215 PDB entries.

Entries were downloaded:

  • FASTA format

Each PDB ID and chain was used in order to get NMRCore, FindCore, FindCore extended (aka FindCore2) and Cyrange data.

cyrange <- "#FF853F"
nmrcore <- "#7E78FF"
findcore <- "#63CC4C"
number_of_domains <- read.csv("number_of_domains.csv", head=T, sep="\t", na="-")
ratio_of_residues <- read.csv("ratio_of_residues.csv", head=T, sep="\t", na="-")
dis_data <- read.csv("disorder_vs_core_disorder.csv", sep="\t", head=T)
s2_data <- read.csv("s2_data.csv", head=T, sep="\t")
ss_data <- read.csv("sec_str_ratios.csv", sep="\t", head=T)
fin_vs_cyr <- read.csv("unique_residues_fin_v_cyr.csv", head=T, sep="\t", na="-")
nmr_vs_cyr <- read.csv("unique_residues_nmr_v_cyr.csv", head=T, sep="\t", na="-")
fin_vs_nmr <- read.csv("unique_residues_fin_v_nmr.csv", head=T, sep="\t", na="-")
nmr_v_cyr <- read.csv("unique_residues_with_disorder_nmrcore_cyrange.csv", head=T, sep="\t", na="-")
nmr_v_fin <- read.csv("unique_residues_with_disorder_nmrcore_findcore2.csv", head=T, sep="\t", na="-")
fin_v_cyr <- read.csv("unique_residues_with_disorder_cyrange_findcore2.csv", head=T, sep="\t", na="-")
nmr_v_cyr_ss <- read.csv("sec_str_unique_ratios_cyrange_vs_nmrcore.csv", head=T, sep="\t", na="-")
nmr_v_fin_ss <- read.csv("sec_str_unique_ratios_findcore2_vs_nmrcore.csv", head=T, sep="\t", na="-")
fin_v_cyr_ss <- read.csv("sec_str_unique_ratios_cyrange_vs_findcore2.csv", head=T, sep="\t", na="-")

2) Number of domains per method

# Plotting
boxplot(number_of_domains$FindCore, 
        number_of_domains$FindCore2, 
        number_of_domains$Cyrange, 
        number_of_domains$NMRCore,
        col=c(findcore, findcore, cyrange, nmrcore),
        xaxt = 'n')
legend("topleft", 
       c("FindCore", "FindCore2", "Cyrange", "NMRCore"),
       col=c(findcore, findcore, cyrange, nmrcore),
       lwd=4)
axis(1, at=c(1,2,3,4), c("FindCore", "FindCore2", "Cyrange", "NMRCore"))

NMRCore and FindCore finds an unrealistic number of regions. FindCore 2 and Cyrange finds 1.8893611 and 1.6038773 regions on average.

3) Ratio of core residues per method

# Define histogram function
hist_of_ratios <- function(x){
  if(x == "cyrange"){xcol <- cyrange}
  else if(x == "nmrcore"){xcol <- nmrcore}
  else if(x == "findcore" || x == "findcore2"){xcol <- findcore}
  
  if(x == "s2"){
    hist(1-s2_data$s2, 
    xlim=c(0, 1), main=x, xlab="Ratio of core residues",
    breaks=50,
    col="white") 
  }
  else{
  hist(ratio_of_residues[[x]], 
    xlim=c(0, 1), main=x, xlab="Ratio of core residues",
    breaks=50,
    col=xcol)     
  }
}

# Plotting
par(mfrow=c(2,3))
hist_of_ratios("findcore")
hist_of_ratios("findcore2")
hist_of_ratios("cyrange")
hist_of_ratios("nmrcore")
hist_of_ratios("s2")
par(mfrow=c(1,1))

According to Welch Two Sample t-test the differences in the ratios of core residues are the following:

  • FindCore is very significantly different (lower) than all the other methods
  • Cyrange is significantly different (lower) than NMRCore and FindCore2
  • NMRCore and FindCore2 are not significantly different

4) Secondary structure of core domain residues

# Define custom density plot
custom_density <- function(data, xlab, legend_at){
  max_y <- max(density(data$cyrange)$y)
  if(max_y < max(density(data$findcore2)$y)){
    max_y = max(density(data$findcore2)$y)
  }
  if(max_y < max(density(data$nmrcore)$y)){
    max_y <- max(density(data$nmrcore)$y)
  }
  plot(density(data$cyrange), col=cyrange, lwd=3, 
    main="",
    ylim=c(0, max_y),
    xlab="Ratio of disordered core-domain residues")
    lines(density(data$findcore), col=findcore, lwd=3)
    lines(density(data$nmrcore), col=nmrcore, lwd=3)
    axis(1, at=median(data$cyrange, na.rm=T), col=cyrange, lwd=3, labels="")
    axis(1, at=median(data$findcore2, na.rm=T), col=findcore, lwd=3, labels="")
    axis(1, at=median(data$nmrcore, na.rm=T), col=nmrcore, lwd=3, labels="")
  legend(legend_at, 
    c("FindCore", "Cyrange", "NMRCore"),
    col=c(findcore, cyrange, nmrcore),
    lwd=4)  
}

4.1) Ratio of core-domain residues with disorder propensities > 0.5

custom_density(dis_data, "Ratio of disordered core-domain residues", "topright")

On average 0.1095728 of core residues are disordered when identified by Cyrange, and 0.125645 when identified with FindCore2. This difference is significant according to Welch paired t-test with p-value 1.753912610^{-218}. Significantly more of the identified core residues are disordered according to FindCore2 than Cyrange.

4.2) Ratio of core-domain residues with S2 < 0.85

custom_density(s2_data, "Ratio of low order core-domain residues", "topright")

4.3) Ratio of core-domain residues in secondary structural elements

custom_density(ss_data, "Ratios of core residues within SSEs", "topleft")

The core-residues identified by FindCore2 are significantly less structured (i.e. found in secondary structural elements) than those identified by Cyrange. Welch paired t-test p-value: 7.429417310^{-205}

5) Ratio of unique residues

plot_uniques <- function(data, x, y){
  library(LSD)
  DF <- data.frame(data[[x]],data[[y]])
  heatscatter(DF[,1],DF[,2], xlab=x, ylab=y, main="Ratios of uniquely identified residues")
}

plot_uniques(fin_vs_cyr, "findcore2", "cyrange")

plot_uniques(nmr_vs_cyr, "nmrcore", "cyrange")

plot_uniques(fin_vs_nmr, "findcore2", "nmrcore")

different_10 <-  which(fin_vs_cyr$findcore2 > 0.1 & fin_vs_cyr$cyrange > 0.1)
different_20 <-  which(fin_vs_cyr$findcore2 > 0.2 & fin_vs_cyr$cyrange > 0.2)
similar_90 <-  which(fin_vs_cyr$findcore2 < 0.1 & fin_vs_cyr$cyrange < 0.1)

There are 84 entries (out of 6692, 0.0125523%) that are more than 10% different, and 44 entries (0.006575%) that are more than 20% different by the two methods.

0.684997% of the entries (4584) are minimum 90% the same.

FindCore2 uniquely identifies significantly more residues as core domain residues than Cyrange (Welch paired test p-value=1.164165310^{-223}).

6) Uniquely identified residues in disordered and well-defined protein chains

6.1) Binary-coloured scatter plots

# Define plotting function (binary)
plot_disorder <- function(data, x, y){
  disordered <- data[which(data$dis_ratio >= 0.6),]
  folded <- data[which(data$dis_ratio < 0.6),]
  plot(folded[[x]], folded[[y]], bg="blue", pch=21,
    main="",
    xlab=x,
    ylab=y)
  points(disordered[[x]], disordered[[y]], bg="red", pch=21)
}

# Binary scatter plots
par(mfrow=c(2,2))
plot_disorder(fin_v_cyr, "findcore2", "cyrange")
plot_disorder(nmr_v_cyr, "nmrcore", "cyrange")
plot_disorder(nmr_v_fin, "findcore2", "nmrcore")
par(mfrow=c(1,1))

6.2) Gradient-coloured scatter plots

# Define plotting function (gradient)
plot_disorder_gradient <- function(data, x, y){
  rbPal <- colorRampPalette(c('blue', 'red'))
  data$Col <- rbPal(20)[as.numeric(cut(data$dis_ratio,breaks = 20))]
  sorted <- data[with(data, order(dis_ratio)), ]
  plot(sorted[[x]], sorted[[y]],
    main="",
    pch = 20,
    col = sorted$Col,
    xlab=x,
    ylab=y)
}

# Gradient scatter plots
par(mfrow=c(2,2))
plot_disorder_gradient(fin_v_cyr, "findcore2", "cyrange")
plot_disorder_gradient(nmr_v_cyr, "nmrcore", "cyrange")
plot_disorder_gradient(nmr_v_fin, "findcore2", "nmrcore")
par(mfrow=c(1,1))

6.3) Density plots

# Define plotting function (density)
plot_disorder_density <- function(data, x, y){
    if(x == "cyrange"){xcol <- cyrange}
    else if(x == "findcore2"){xcol <- findcore}
    else if(x == "nmrcore"){xcol <- nmrcore}
    if(y == "cyrange"){ycol <- cyrange}
    else if(y == "findcore2"){ycol <- findcore}
    else if(y == "nmrcore"){ycol <- nmrcore}
  disordered <- data[which(data$dis_ratio >= 0.6),]
  plot(density(disordered[[x]]), col=xcol, lwd=3,
    main="",
    xlab="Ratio of unique core residues")
  lines(density(disordered[[y]]), col=ycol, lwd=3)
  legend("topright", 
    c(x, y),
    col=c(xcol, ycol),
    lwd=4)
}

# Density plots
par(mfrow=c(2,2))
plot_disorder_density(fin_v_cyr, "cyrange", "findcore2")
plot_disorder_density(nmr_v_cyr, "cyrange", "nmrcore")
plot_disorder_density(nmr_v_fin, "nmrcore", "findcore2")
par(mfrow=c(1,1))

6.4) Density plots of unique core residues outside of SSEs

plot_sse_density <- function(data, x, y){
  if(x == "cyrange"){xcol <- cyrange}
  else if(x == "findcore2"){xcol <- findcore}
  else if(x == "nmrcore"){xcol <- nmrcore}
  
  if(y == "cyrange"){ycol <- cyrange}
  else if(y == "findcore2"){ycol <- findcore}
  else if(y == "nmrcore"){ycol <- nmrcore}
  
  max_y <- max(density(data[[x]])$y)
  if(max_y < max(density(data[[y]])$y)){
    max_y = max(density(data[[y]])$y)
  }
  
  plot(density(data[[x]]), col=xcol, lwd=3,
    main="",
    ylim=c(0, max_y),
    xlab="Ratio of unique core residues")
  lines(density(data[[y]]), col=ycol, lwd=3)
  legend("topright", 
    c(x, y),
    col=c(xcol, ycol),
    lwd=4)
}

par(mfrow=c(2,2))
plot_sse_density(nmr_v_fin_ss, "nmrcore", "findcore2")
plot_sse_density(nmr_v_cyr_ss, "cyrange", "nmrcore")
plot_sse_density(fin_v_cyr_ss, "cyrange", "findcore2")
par(mfrow=c(1,1))