# Course: EDF 6403 - Quantitative Foundations of Educational Research
# Stats Analysis 8: Factorial ANOVA main effects analysis
# Author: Nguyet Hoang | hoangt@ufl.edu

# Step 0: Load library
library(psych)
library(haven)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
library(emmeans)
## Welcome to emmeans.
## Caution: You lose important information if you filter this package's results.
## See '? untidy'
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
library(moments)
library(scales)
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
library(nortest)

# Step 1: Read the data
data <- read_sav("SA8_Online_and_studying.sav")
View(data)
colnames(data)
## [1] "ID"          "Course_type" "Stdy_env"    "Score"
attr(data$Course_type, "label")
## In-person    Online    Hybrid 
##         1         2         3
attr(data$Stdy_env, "label")
##   Quiet   Noisy Library 
##       1       2       3
# Labeling the data
data$Course_type <- factor(data$Course_type, levels = c(1, 2, 3), labels = c("In-person", "Online", "Hybrid"))
data$Stdy_env <- factor(data$Stdy_env, levels = c(1, 2, 3), labels = c("Quiet", "Noisy", "Library"))

# Descriptive statistics for the four groups
describeBy(data$Score, list(data$Course_type, data$Stdy_env), mat = TRUE)
##     item    group1  group2 vars  n     mean       sd median  trimmed    mad
## X11    1 In-person   Quiet    1 16 77.87500 5.536244  78.00 77.92857 6.6717
## X12    2    Online   Quiet    1 24 76.33333 7.539269  76.00 76.50000 7.4130
## X13    3    Hybrid   Quiet    1 16 76.12500 6.365270  76.00 75.64286 5.1891
## X14    4 In-person   Noisy    1 32 74.65625 5.615987  75.25 75.01923 5.9304
## X15    5    Online   Noisy    1 48 75.27083 5.869103  75.25 75.27500 5.9304
## X16    6    Hybrid   Noisy    1 32 76.18750 5.645081  75.25 76.09615 5.1891
## X17    7 In-person Library    1 16 76.50000 6.099180  77.50 76.14286 6.6717
## X18    8    Online Library    1 24 77.08333 5.740146  77.00 77.25000 5.1891
## X19    9    Hybrid Library    1 16 79.75000 6.212890  81.00 80.14286 7.4130
##       min   max range        skew    kurtosis        se
## X11 69.00 86.00    17 -0.09192076 -1.41098432 1.3840611
## X12 61.00 89.00    28 -0.15850677 -0.84597835 1.5389469
## X13 67.00 92.00    25  0.77405686  0.17929056 1.5913176
## X14 61.25 83.25    22 -0.52377389 -0.65393990 0.9927757
## X15 60.25 89.25    29 -0.02021683 -0.07810387 0.8471320
## X16 65.25 87.25    22  0.19964028 -0.72550345 0.9979187
## X17 68.00 90.00    22  0.33468934 -0.82263484 1.5247951
## X18 63.00 87.00    24 -0.31227306 -0.34035638 1.1717024
## X19 65.00 89.00    24 -0.69310504 -0.25381418 1.5532225
# Step 2: Running the model
model = lm(Score ~ Course_type * Stdy_env, data = data, contrasts=list(Course_type=contr.sum, Stdy_env=contr.sum))
anova_model <- Anova(model, type = "III")

# Step 3: Check homogeneity assumption
levene_test_results <- leveneTest(Score ~ Course_type * Stdy_env, data = data)
levene_test_results
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value Pr(>F)
## group   8   0.417 0.9101
##       215
# Step 4: Checking normality
data$residuals <- residuals(model)
residuals <- data$residuals
# 4.1. Use skewness and kurtosis statistics
skew_val <- skew(residuals, type = 2)
kurt_val <- kurtosi(residuals, type = 2)
print(paste("Skewness:", number(skew_val, accuracy = 0.01)))
## [1] "Skewness: -0.08"
print(paste("Kurtosis:", number(kurt_val, accuracy = 0.01)))
## [1] "Kurtosis: -0.17"
# 4.2. A histogram with a normal curve overlayed on top of it
# 4.2.1. Calculate the statistics
m   <- mean(residuals, na.rm = TRUE)
md  <- median(residuals, na.rm = TRUE)
std <- sd(residuals, na.rm = TRUE)
# 4.2.2. Set up the histogram
h <- hist(residuals, plot=FALSE)
plot(h, col="white", border="white", main="",
     xlab="", ylab="", axes=FALSE,
     ylim=c(0, max(h$counts) * 1.2))
grid(nx=NA, ny=NULL, col="gray", lty="dotted")
par(new=TRUE)
hist(residuals, freq=TRUE,
     col = "lightblue", border = "black",
     xlab="SA3 residuals",
     main="Histogram of the residuals",
     cex.main = 0.9,
     ylim=c(0, max(h$counts) * 1.2))
# 4.2.3. Add the normal curve
scaling_factor <- length(residuals) * diff(h$breaks)[1]
curve(dnorm(x, mean=m, sd=std) * scaling_factor,
      col="darkred", lwd=2, add=TRUE, yaxt="n")
abline(v = m, col = "darkblue", lwd = 2, lty = 2)
# 4.2.4. Add a Legend
legend("topright", legend = c(
  paste("Mean =", sprintf("%.2f", m)),
  paste("SD =", sprintf("%.2f", std)),
  paste("Median =", sprintf("%.2f", md)),
  "Normal Curve"),
  col = c("darkblue", "transparent", "transparent","darkred"),
  lty = c(2, 0, 0, 1), lwd = c(2, 0, 0, 2),
  bty = "n", cex = 0.8)
box(bty = "l")

# 4.3. A QQplot
# 4.3.1. Draw the points
qqnorm(residuals, main = "Q-Q Plot of data residuals",
       xlab = "Theoretical Quantiles (Normal Distribution)",
       ylab = "Observed data residuals",
       pch = 19, col = "steelblue")
# 4.3.2. Add the reference line
qqline(residuals, col = "darkred", lwd = 2)

# 4.4. The Lilliefors-corrected KS test (n = 224 > 50)
lillie_test_result <- lillie.test(residuals)
lillie_test_result
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuals
## D = 0.031776, p-value = 0.8407
# Step 5: ANOVA table and effect size:
anova_model
## Anova Table (Type III tests)
## 
## Response: Score
##                       Sum Sq  Df    F value  Pr(>F)    
## (Intercept)          1141916   1 31258.5139 < 2e-16 ***
## Course_type               48   2     0.6620 0.51686    
## Stdy_env                 223   2     3.0552 0.04916 *  
## Course_type:Stdy_env     113   4     0.7708 0.54530    
## Residuals               7854 215                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
eta_squared <- eta_squared(anova_model, partial = TRUE)
## Type 3 ANOVAs only give sensible and informative results when covariates
##   are mean-centered and factors are coded with orthogonal contrasts (such
##   as those produced by `contr.sum`, `contr.poly`, or `contr.helmert`, but
##   *not* by the default `contr.treatment`).
eta_squared
## # Effect Size for ANOVA (Type III)
## 
## Parameter            | Eta2 (partial) |       95% CI
## ----------------------------------------------------
## Course_type          |       6.12e-03 | [0.00, 1.00]
## Stdy_env             |           0.03 | [0.00, 1.00]
## Course_type:Stdy_env |           0.01 | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
omega_squared <- omega_squared(anova_model, partial = TRUE)
## Type 3 ANOVAs only give sensible and informative results when covariates
##   are mean-centered and factors are coded with orthogonal contrasts (such
##   as those produced by `contr.sum`, `contr.poly`, or `contr.helmert`, but
##   *not* by the default `contr.treatment`).
omega_squared
## # Effect Size for ANOVA (Type III)
## 
## Parameter            | Omega2 (partial) |       95% CI
## ------------------------------------------------------
## Course_type          |             0.00 | [0.00, 1.00]
## Stdy_env             |             0.02 | [0.00, 1.00]
## Course_type:Stdy_env |             0.00 | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
# Step 6: the following main effects are significant, but the interaction is not.
# Since only study environment is significant, we will run post hoc tests for study environment only.

emmeans_model_SE <- emmeans(model, ~ Stdy_env, weights="cells")
## NOTE: Results may be misleading due to involvement in interactions
bonferroni_result_SE <- contrast(emmeans_model_SE, method = "pairwise", adjust = "Bonferroni")
final_result_SE <- as.data.frame(summary(bonferroni_result_SE, infer = c(TRUE, TRUE)))
final_result_SE
##  contrast          estimate        SE  df  lower.CL upper.CL t.ratio p.value
##  Quiet - Noisy    1.3571429 0.9892008 215 -1.029658 3.743944   1.372  0.5145
##  Quiet - Library -0.9642857 1.1422307 215 -3.720326 1.791754  -0.844  1.0000
##  Noisy - Library -2.3214286 0.9892008 215 -4.708229 0.065372  -2.347  0.0595
## 
## Results are averaged over the levels of: Course_type 
## Confidence level used: 0.95 
## Conf-level adjustment: bonferroni method for 3 estimates 
## P value adjustment: bonferroni method for 3 tests
# Step 7: Graphs
emm <- emmeans(model, ~ Course_type * Stdy_env)
emm_df <- as.data.frame(emm)
emm_df
##  Course_type Stdy_env   emmean        SE  df lower.CL upper.CL
##  In-person   Quiet    77.87500 1.5110292 215 74.89667 80.85333
##  Online      Quiet    76.33333 1.2337502 215 73.90154 78.76513
##  Hybrid      Quiet    76.12500 1.5110292 215 73.14667 79.10333
##  In-person   Noisy    74.65625 1.0684590 215 72.55025 76.76225
##  Online      Noisy    75.27083 0.8723931 215 73.55129 76.99037
##  Hybrid      Noisy    76.18750 1.0684590 215 74.08150 78.29350
##  In-person   Library  76.50000 1.5110292 215 73.52167 79.47833
##  Online      Library  77.08333 1.2337502 215 74.65154 79.51513
##  Hybrid      Library  79.75000 1.5110292 215 76.77167 82.72833
## 
## Confidence level used: 0.95
# Course type on x-axis, emmean on y-axis, lines for each Study environment
ggplot(emm_df, aes(x = Course_type, y = emmean, group = Stdy_env, color = Stdy_env)) +
  geom_point(size = 3) +
  geom_line() +
  labs(title = "Interaction Plot: Estimated Marginal Means",
       x = "Course type",
       y = "Estimated Marginal Means",
       color = "Study environment") +
  theme_minimal()

# Study environment on x-axis, emmean on y-axis, lines for each course type
ggplot(emm_df, aes(x = Stdy_env, y = emmean, group = Course_type, color = Course_type)) +
  geom_point(size = 3) +
  geom_line() +
  labs(title = "Interaction Plot: Estimated Marginal Means",
       x = "Study environment",
       y = "Estimated Marginal Means",
       color = "Course type") +
  theme_minimal()