library(bookdown)
library(minpack.lm)
library(readxl)
library(dplyr)
library(purrr)
library(tidyr)
library(ggplot2)
library(ggpubr)
library(tibble)
library(ggsci)
library(latex2exp)
library(AICcmodavg)
library(vtreat)
library(kableExtra)
library(broom)
# automatically create a bib database for R packages
knitr::write_bib(c(
.packages(), 'bookdown', 'knitr', 'rmarkdown'
), 'packages.bib')Figure 0.1: Statistical analysis roadmap.
Model evaluation based on a complete 4-fold CV (see Rpubs1.4) generated CV-score distributions for each binding model. This allowed for addition of a second dimension to this analysis by further investigating the central tendencies of those distributions by means of analysis of variance (ANOVA). ANOVA is generally used to study the effect that one or more factors might have on the variance in a response variable based on the respective factor means.
\[\begin{equation} Y_{ij} = \beta_0 + \beta_1X_{1i} + \beta_2X_{2i} + \beta_3X_{3i} + \beta_4X_{4i} + \epsilon_{ij} \tag{2.1} \end{equation}\]
\[\begin{gather} \beta_{X_k} = \frac{\Psi_k}{\sum\limits_{j=1}^m c_{jk}^2} \quad \text{, where} \\ \Psi_k = \sum\limits_{j=1}^m c_{jk} \bar{Y_j} \notag \tag{2.2} \end{gather}\]
\[\begin{equation} K = \begin{bmatrix} -4/5 & 0 & 0 & 0 \\ 1/5 & 1/4 & -1/3 & -1/2 \\ 1/5 & 1/4 & -1/3 & 1/2 \\ 1/5 & 1/4 & 2/3 & 0 \\ 1/5 & -3/4 & 0 & 0 \\ \end{bmatrix} \tag{2.3} \end{equation}\]
\[\begin{equation} \begin{aligned} H_{01}: \Psi_1 &= 0 \\ \beta_{X_1} &= \left(\frac{\bar{Y}_{bm1to2.deg.add} + \bar{Y}_{bm1to2.deg} + \bar{Y}_{bm1to2.coop.add} + \bar{Y}_{bm1to2.coop}}{4}\right) - \bar{Y}_{bm1to1} = 0 \\ H_{02}: \Psi_2 &= 0 \\ \beta_{X_2} &= \left(\frac{\bar{Y}_{bm1to2.deg.add} + \bar{Y}_{bm1to2.deg} + \bar{Y}_{bm1to2.coop.add}}{3}\right) - \bar{Y}_{bm1to2.coop} = 0 \\ H_{03}: \Psi_3 &= 0 \\ \beta_{X_3} &= \bar{Y}_{bm1to2.coop.add} - \left( \frac{\bar{Y}_{bm1to2.deg.add} + \bar{Y}_{bm1to2.deg}}{2}\right) = 0 \\ H_{04}: \Psi_4 &= 0 \\ \beta_{X_4} &= \frac{\bar{Y}_{bm1to2.deg} - \bar{Y}_{bm1to2.deg.add}}{2} = 0 \end{aligned} \tag{2.4} \end{equation}\]
\[\begin{equation} \begin{aligned} H_{01}: Model C1: Y_i &= \beta_0 + \beta_2X_{2i} + \beta_3X_{3i} + \beta_4X_{4i} + \epsilon_i \\ H_{02}: Model C2: Y_i &= \beta_0 + \beta_1X_{1i} + \beta_3X_{3i} + \beta_4X_{4i} + \epsilon_i \\ H_{03}: Model C3: Y_i &= \beta_0 + \beta_1X_{1i} + \beta_2X_{2i} + \beta_4X_{4i} + \epsilon_i \\ H_{04}: Model C4: Y_i &= \beta_0 + \beta_1X_{1i} + \beta_2X_{2i} + \beta_3X_{3i} + \epsilon_i \\ H_A: Model A: Y_i &= \beta_0 + \beta_1X_{1i} + \beta_2X_{2i} + \beta_3X_{3i} + \beta_4X_{4i} + \epsilon_i \end{aligned} \tag{2.5} \end{equation}\]
\[\begin{equation} F = \frac{SSR \times \frac{1}{p_A - p_C}}{SSE_A \times \frac{1}{n - p_A}} = \frac{SSR}{MSE_A} \tag{2.6} \end{equation}\]
\[\begin{gather} SSR = SSE_{C_i} - SSE_A = \frac{\Psi_k^2}{\sum\limits_{j=1}^m \frac{c_{jk}^2}{n_k}} \quad \text{, and} \\ SSR_A = \sum\limits_{i=1}^{k} SSR_{C_i} \notag \tag{2.7} \end{gather}\]
\[\begin{gather} SSE_A = \sum\limits_{m=1}^k \sum\limits_{i=1}^{n_m} (Y_{mi} - \bar{Y_i}_.)^2 \quad \text{, and} \\ \bar{Y_i}_. = \frac{1}{n_m} \sum\limits_{i=1}^{n_m} Y_{mi} \notag \tag{2.8} \end{gather}\]
\[\begin{equation} CI_{0.95}: \hat{\Psi}_k \pm \sqrt{F^*_{\alpha;df1,df2}} \times \sqrt{MSE_A \times \sum\limits_{j=1}^m \frac{c_{jk}^2}{n_k}} \tag{2.9} \end{equation}\]
contrast.ci <- function(model.lm, level = .05){
ci_margin <- function(x){
omega <- sum(x^2 / n)
err <- sqrt(fcrit * msw * omega)
err
}
rss <- resid(model.lm)
rssDF <- model.lm$df.residual
msw <- t(rss) %*% rss / rssDF
fcrit <- qf(level, 1, rssDF, lower.tail = FALSE)
model.coef <- coef(model.lm)[-1]
#cont_mat <- model.matrix(model.lm)[,-1]
#N <- dim(cont_mat)[1]
n <- 220
csquare <- apply(custom_cont, 2, function(x) sum(x^2))
ci_mat <- matrix(NA, length(model.coef), 2L,
dimnames = list(names(model.coef), c("lower", "upper")))
cont_se <- apply(custom_cont, 2, ci_margin)
ci_mat[,1] <- model.coef * csquare - cont_se
ci_mat[,2] <- model.coef * csquare + cont_se
ci_mat
}compCV.tbl <- readRDS("Output/compCV.rds")
source("Functions/robust_ANOVA.R", local = knitr::knit_global())
source("Functions/ggplot_theme.R", local = knitr::knit_global())
compCV.tbl <- compCV.tbl%>%
mutate(fold_class = ifelse(grepl("1-2", fold_points),
"outlier",
"normal"),
fold_class = factor(fold_class, levels = c("normal",
"outlier")))
custom_cont <- matrix(c(-4/5,1/5,1/5,1/5,1/5,
0,1/4,1/4,1/4,-3/4,
0,-1/3,-1/3,2/3,0,
0,-1/2,1/2,0,0),
nrow = 5)
colnames(custom_cont) <- c("k1", "k2", "k3", "k4")
rownames(custom_cont) <- c("bm1to1", "bm1to2.deg.add",
"bm1to2.deg", "bm1to2.coop.add",
"bm1to2.coop")
####################LinearReg and ANOVA#####################
contrasts(compCV.tbl$model) <- custom_cont
contrasts(compCV.tbl$fold_class) <- c(1,-1)
compModel.lm <- lm(RMSE ~ 1, data = compCV.tbl)
augModel1.lm <- lm(RMSE ~ model, data = compCV.tbl)
augModel2.lm <- lm(RMSE ~ fold_class + model, data = compCV.tbl)
augModel1.aov <- aov(augModel1.lm)
augModel2.aov <- aov(augModel2.lm)
cont_summary1 <- summary(augModel1.aov,
split = list(
model = list(
k1 = 1, k2 = 2, k3 = 3, k4 = 4)
))
####Robust linear Reg and ANOVA #######################
#### Linear Regression
trim <- 0.2
n <- 220
cutoff <- floor(n * trim)
compCV.trim.tbl<- compCV.tbl %>%
group_by(model) %>%
arrange(model,RMSE) %>%
mutate(index = c(1:220)) %>%
filter(index > cutoff & index <= 220 - cutoff) %>%
dplyr::select(-index) %>%
ungroup()
augModel1.trim.lm <- lm(RMSE ~ model, data = compCV.trim.tbl)
#### ANOVA
# bring tidy tibble into list mode
compCV.ls <- compCV.tbl %>%
dplyr::select(c(model, RMSE)) %>%
mutate(index = rep(c(1:220), times = 5)) %>%
pivot_wider(names_from = model, values_from = RMSE) %>%
dplyr::select(-index) %>%
as.list()
robustComp <- lincon(compCV.ls, custom_cont)options(knitr.kable.NA = '')
pre <- function(sum.aov){
nRow <- nrow(sum.aov[[1]][2])
ssr <- sum.aov[[1]][2][-nRow,]
sseA <- sum.aov[[1]][2][nRow,]
pre <- ssr / (ssr + sseA)
return(pre)
}
#####defining ANOVA summary table####
linconts <- diag(apply(custom_cont, 2,
function(x) augModel1.lm$coefficients[-1] * sum(x^2)))
aov.compModel <- tidy(anova(compModel.lm))
aov.df <- data.frame(Source = c("AugModel", "psi1", "psi2", "psi3", "psi4",
"Residuals", "Total"),
psihat = c(NA, linconts, NA, NA),
df = unlist(c(cont_summary1[[1]][1],
aov.compModel[1,"df"])),
SS = unlist(c(cont_summary1[[1]][2],
aov.compModel[1,"sumsq"])),
MS = unlist(c(cont_summary1[[1]][3],
aov.compModel[1,"meansq"])),
Fstat = unlist(c(cont_summary1[[1]][4],NA)),
p = unlist(c(cont_summary1[[1]][5],NA)),
PRE = c(pre(cont_summary1), NA, NA))
aov.df <- aov.df %>%
mutate(across(where(is.numeric), ~ round(., digits = 4)))
rownames(aov.df) <- c("$Y_{ij} = \\beta_0 + \\beta_1X_1 + \\beta_2X_2 + \\beta_3X_3 + \\beta_4X_4 + \\epsilon_{ij}$",
"$Y_{ij} = \\beta_0 + 0 + \\beta_2X_2 + \\beta_3X_3 + \\beta_4X_4 + \\epsilon_{ij}$",
"$Y_{ij} = \\beta_0 + \\beta_1X_1 + 0 + \\beta_3X_3 + \\beta_4X_4 + \\epsilon_{ij}$",
"$Y_{ij} = \\beta_0 + \\beta_1X_1 + \\beta_2X_2 + 0 + \\beta_4X_4 + \\epsilon_{ij}$",
"$Y_{ij} = \\beta_0 + \\beta_1X_1 + \\beta_2X_2 + \\beta_3X_3 + 0 + \\epsilon_{ij}$",
"---",
"$Y_{ij} = \\beta_0 + \\epsilon_{ij}$")
aov.df %>%
kable(caption = "**ANOVA summary table.** Shown is a composite table including summary statistics of the ANOVA but also the estimated contrasts ($\\hat{\\Psi}$) and effect sizes ($\\hat{\\eta}^2$). The $SS$ column represents sums of squares. When the source is the augmented model (\"AugModel\") the $SS$ value represents a reduction in sums of squares ($SSR$) on 4 degrees of freedom ($df$). This is also true for the partitioning of the $SSR$ of the augmented model into $SSR$ components of individual contrasts (\"psi1\"-\"psi4\") based on 1 $df$. When the source is \"Residuals\" the $SS$ value refers to the sum of squared error ($SSE$) of the augmented model and when the source is \" Total\" the $SS$ value refers to the $SSE$ of the grand mean model. In the $MS$ and $F_{stat}$ columns, respectively, corresponding mean squared errors and observed $F$-statistics are listed. The $p$ value represents the conditional probability of observing an $F$-statistic as extreme as computed for this instance or even more extreme. The $\\hat{\\eta}^2$ value represents the proportional reduction in error computed as $\\hat{\\eta}^2 = \\frac{SSR_i}{SSR_i + SSE_A}$, where $SSE_A$ represents the sum of squared error of the augmented model.",
col.names = c("$Source$",
"$\\hat{\\Psi}$",
"$df$",
"$SS$",
"$MS$",
"$F_{stat}$",
"$p(\\geq F_{stat})$",
"$\\hat{\\eta}^2$")) %>%
kable_classic() %>%
row_spec(0, background = "grey")| \(Source\) | \(\hat{\Psi}\) | \(df\) | \(SS\) | \(MS\) | \(F_{stat}\) | \(p(\geq F_{stat})\) | \(\hat{\eta}^2\) | |
|---|---|---|---|---|---|---|---|---|
| \(Y_{ij} = \beta_0 + \beta_1X_1 + \beta_2X_2 + \beta_3X_3 + \beta_4X_4 + \epsilon_{ij}\) | AugModel | 4 | 0.0274 | 0.0068 | 30.9648 | 0.0000 | 0.1016 | |
| \(Y_{ij} = \beta_0 + 0 + \beta_2X_2 + \beta_3X_3 + \beta_4X_4 + \epsilon_{ij}\) | psi1 | -0.0096 | 1 | 0.0252 | 0.0252 | 113.9305 | 0.0000 | 0.0942 |
| \(Y_{ij} = \beta_0 + \beta_1X_1 + 0 + \beta_3X_3 + \beta_4X_4 + \epsilon_{ij}\) | psi2 | 0.0026 | 1 | 0.0020 | 0.0020 | 9.0324 | 0.0027 | 0.0082 |
| \(Y_{ij} = \beta_0 + \beta_1X_1 + \beta_2X_2 + 0 + \beta_4X_4 + \epsilon_{ij}\) | psi3 | -0.0008 | 1 | 0.0002 | 0.0002 | 0.8964 | 0.3439 | 0.0008 |
| \(Y_{ij} = \beta_0 + \beta_1X_1 + \beta_2X_2 + \beta_3X_3 + 0 + \epsilon_{ij}\) | psi4 | 0.0000 | 1 | 0.0000 | 0.0000 | 0.0000 | 0.9996 | 0.0000 |
| — | Residuals | 1095 | 0.2420 | 0.0002 | ||||
| \(Y_{ij} = \beta_0 + \epsilon_{ij}\) | Total | 1099 | 0.2694 | 0.0002 |
bm1to1 model and the pooled mean CV-score of all tow-site binding models (see eq. (2.4))bm1to2.coop model and the average mean CV-score of the remaining two-site binding modelsbm1to2.coop.add model and the degenerative models bm1to2.deg and bm1to2.deg.add, respectivelybm1to2.deg and bm1to2.deg.add models, respectivelyci.df <- data.frame(lower = contrast.ci(augModel1.lm)[,1],
upper = contrast.ci(augModel1.lm)[,2],
contrast = c("1","2","3","4"))
ci.A <- ci.df %>%
pivot_longer(-contrast,names_to = "bound", values_to = "margin") %>%
mutate(coefficient = rep(linconts, each = 2)) %>%
ggplot(aes(x = margin, y = contrast))+
geom_point(size = 2)+
geom_line(size = 1.0)+
geom_point(aes(x = coefficient), color = "blue", size = 3, shape = 18)+
geom_vline(xintercept = 0, lty = 2, size = 1.0,
color = "red")+
theme_bw()+
theme(axis.ticks.length=unit(.07, "cm"))+
theme(axis.ticks = element_line(colour = "black", size = 1))+
theme(axis.text.y = element_text(color="black",size = 12))+
theme(axis.text.x = element_text(color="black",size = 12))+
theme(axis.title.y = element_text(size = 14,face = "bold"))+
theme(axis.title.x = element_text(size = 14,face = "bold"))+
theme(panel.grid.minor.x = element_line(color = "grey80"),
panel.grid.major.x = element_line(color = "grey80"),
panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
panel.border = element_rect(size = 1.2, color = "black"))+
xlab(TeX("$\\Delta$CV-Score", bold = TRUE))
#### Residual plot#####
resid.A <-
augModel1.lm %>%
augment() %>%
ggplot(aes(sample = .resid))+
stat_qq(aes(color = model), alpha = 0.7, size = 1.3)+
stat_qq_line(color = "black", size = 1.3)+
scale_color_uchicago()+
mytheme_axes_box+
theme(legend.title = element_blank(),
legend.text = element_text(size = 10,
face = "bold"),
legend.position = c(0.3,0.72))
ggarrange(resid.A, ci.A,
labels = c("A","B"),
ncol = 2, nrow = 1)Figure 3.1: A: ANOVA residual QQ-plots. The residuals of the augmented model used to fit the CV-scores conditional on the type of binding model were sorted in ascending order (ordinate) and plotted against the quantiles from a standard normal distribution (abscissa). B: Interval estimates at the 95% confidence level. For each contrast (1-4) a 95% confidence interval was computed around the respective least squares point estimate (blue diamonds) based on a common variance term (\(MSE_A\), see eq. 2.26) amongst each level of the binding model factor.
The regression analysis of the augmented model revealed that the type of binding model has an overall statistical effect on the CV-score distribution. However, the results of this omnibus ANOVA and corresponding effect sizes are based on four \(df\) and therefore are hardly interpretable. That is why a general linear model was formalized that is able to perform contrast analysis. The benefit of contrast analysis is that it allows to formulate focused hypotheses testing differences of groups of means that are based on one \(df\) while controlling the \(\alpha\)-level. In accordance with the CV analysis, a test of \(H_{01}\) which involves the key hypothesis of this work revealed that the one-site binding model bm1to1 has a statistically higher mean fold prediction error than the average two-site binding model (cf. tab 3.1 and fig 3.1B) which is determined by the negative direction of the contrast (\(\Psi_1\) = -9.6 \(\times\) 10-3). \(\Psi_1\) is part of the most important contrast-coded regression coefficient (\(\beta_{X_1}\)) of the ANOVA model due to the fact that the largest PRE is associated with it (\(\hat{\eta}^2\) = 9.42%). Further considering the sub-set of the two-site binding models it was found by testing \(H_{02}\) that the global bm1to2.coop model has a statistically lower mean CV-score than the average remaining two-site binding models as determined by the positive direction of the associated contrast (\(\Psi_2\) = \(\times\) 10-3). However, the corresponding contrast-coded regression coefficient (\(\beta_{X_2}\)) has a relatively low impact on the PRE (\(\hat{\eta}^2\) = 0.82%). Additionally computed interval estimates of the contrasts at the 95% confidence level were based on a common variance term amongst the factor levels (\(MSE_A\)) which reflects the assumtpion about homoscedastic distributed residuals of the general linear ANOVA model. A comparison of the \(CI_{95}\) associated with \(\Psi_1\) and \(\Psi_2\) revealed that the range of possible hypotheses outside these intervals including the respective zero hypotheses \(H_{01}\) and \(H_{02}\), that is, the contrasts equal zero, is larger for \(\Psi_1\) than for \(\Psi_2\). From this one might conclude that currently the data are more consistent with a clearer separation between one-site and two-site binding models (\(H_{01}\)) than with a further distinction of the two-site binding models (\(H_{02}\)). However, this is an indication based on a first experimental analysis. Ultimately, more biological replications are needed which will offer the possibility for meta-analysis of the intervals.