# Load required libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(lm.beta)
library(e1071)
##
## Attaching package: 'e1071'
## The following object is masked from 'package:ggplot2':
##
## element
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.1 ✔ tidyr 1.3.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ e1071::element() masks ggplot2::element()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Import dataset
AMATUS_data <- read_csv("C:/Users/McdonaldAdero/Downloads/AMATUS_dataset.csv")
## Rows: 1106 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id_unique;id_sample;sample;sex;age;age_range;native_speaker;math_gr...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview first few rows
head(AMATUS_data)
# Column separation
AMATUS_data <- AMATUS_data %>%
separate(col = 1, # the first column
into = str_split(names(AMATUS_data)[1], ";")[[1]], # split using header names
sep = ";",
remove = TRUE)
# Preview the result
head(AMATUS_data)
#Extract only the selected variables
AMATUS_selected <- AMATUS_data %>%
select(
sum_arith_perf, # Outcome
sex, # Dichotomous predictor
teacher_stage, # Multi-categorical predictor
score_TAI_short, # Numerical predictor 1
score_FSMAS_SE, # Numerical predictor 2
score_BFI_N # Numerical predictor 3
)
#Convert numeric columns to numeric type
AMATUS_selected <- AMATUS_selected %>%
mutate(
sum_arith_perf = as.numeric(sum_arith_perf),
score_TAI_short = as.numeric(score_TAI_short),
score_FSMAS_SE = as.numeric(score_FSMAS_SE),
score_BFI_N = as.numeric(score_BFI_N)
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `sum_arith_perf = as.numeric(sum_arith_perf)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#Keep only complete cases (no missing values)
AMATUS_clean <- AMATUS_selected %>% drop_na()
#Preview the new dataset
head(AMATUS_clean)
library(tidyverse)
library(psych) # for skew
# Select numerical variables
num_vars <- AMATUS_clean %>%
select(sum_arith_perf, score_TAI_short, score_FSMAS_SE, score_BFI_N)
# Compute descriptive stats
desc_stats <- data.frame(
Variable = names(num_vars),
Mean = sapply(num_vars, mean, na.rm = TRUE),
SD = sapply(num_vars, sd, na.rm = TRUE),
Min = sapply(num_vars, min, na.rm = TRUE),
Max = sapply(num_vars, max, na.rm = TRUE),
Skew = sapply(num_vars, psych::skew, na.rm = TRUE)
)
# View the table
desc_stats
# Sex
sex_table <- AMATUS_clean %>%
group_by(sex) %>%
summarise(
Count = n(),
Percent = n()/nrow(AMATUS_clean)*100
)
# Teacher stage
teacher_table <- AMATUS_clean %>%
group_by(teacher_stage) %>%
summarise(
Count = n(),
Percent = n()/nrow(AMATUS_clean)*100
)
sex_table
teacher_table
# Boxplot for sum_arith_perf by sex
ggplot(AMATUS_clean, aes(x = sex, y = sum_arith_perf, fill = sex)) +
geom_boxplot() +
labs(title = "Arithmetic Performance by Sex", x = "Sex", y = "Sum Arithmetic Performance") +
theme_minimal()

# Boxplot for sum_arith_perf by teacher_stage
ggplot(AMATUS_clean, aes(x = teacher_stage, y = sum_arith_perf, fill = teacher_stage)) +
geom_boxplot() +
labs(title = "Arithmetic Performance by Teacher Stage", x = "Teacher Stage", y = "Sum Arithmetic Performance") +
theme_minimal()

# Numerical variables: outcome + predictors
num_vars <- AMATUS_clean %>%
select(sum_arith_perf, score_TAI_short, score_FSMAS_SE, score_BFI_N)
# Correlation matrix
cor_matrix <- round(cor(num_vars, use = "pairwise.complete.obs"), 2)
cor_matrix
## sum_arith_perf score_TAI_short score_FSMAS_SE score_BFI_N
## sum_arith_perf 1.00 -0.20 0.25 -0.19
## score_TAI_short -0.20 1.00 -0.13 0.41
## score_FSMAS_SE 0.25 -0.13 1.00 -0.06
## score_BFI_N -0.19 0.41 -0.06 1.00
# Convert categorical predictors to factors
AMATUS_clean <- AMATUS_clean %>%
mutate(
sex = factor(sex),
teacher_stage = factor(teacher_stage)
)
# Standard multiple regression
model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + score_FSMAS_SE + score_BFI_N,
data = AMATUS_clean)
# View model summary
summary(model)
##
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short +
## score_FSMAS_SE + score_BFI_N, data = AMATUS_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9431 -4.1973 -0.3713 4.0237 19.6073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.50014 5.60744 0.268 0.789387
## sexm 3.28348 1.36352 2.408 0.017106 *
## teacher_stageprimary school teacher 2.98254 2.56816 1.161 0.247126
## teacher_stagestudy -1.16630 2.49779 -0.467 0.641148
## score_TAI_short -0.12301 0.13041 -0.943 0.346920
## score_FSMAS_SE 0.37576 0.10679 3.519 0.000556 ***
## score_BFI_N -0.10050 0.09228 -1.089 0.277674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.291 on 170 degrees of freedom
## Multiple R-squared: 0.201, Adjusted R-squared: 0.1728
## F-statistic: 7.127 on 6 and 170 DF, p-value: 8.641e-07
# ANOVA table
anova(model)
# Load package for standardized betas
library(lm.beta)
# Standardized regression
model_std <- lm.beta(model)
summary(model_std)
##
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short +
## score_FSMAS_SE + score_BFI_N, data = AMATUS_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9431 -4.1973 -0.3713 4.0237 19.6073
##
## Coefficients:
## Estimate Standardized Std. Error t value
## (Intercept) 1.50014 NA 5.60744 0.268
## sexm 3.28348 0.17117 1.36352 2.408
## teacher_stageprimary school teacher 2.98254 0.19469 2.56816 1.161
## teacher_stagestudy -1.16630 -0.07901 2.49779 -0.467
## score_TAI_short -0.12301 -0.07298 0.13041 -0.943
## score_FSMAS_SE 0.37576 0.24443 0.10679 3.519
## score_BFI_N -0.10050 -0.08285 0.09228 -1.089
## Pr(>|t|)
## (Intercept) 0.789387
## sexm 0.017106 *
## teacher_stageprimary school teacher 0.247126
## teacher_stagestudy 0.641148
## score_TAI_short 0.346920
## score_FSMAS_SE 0.000556 ***
## score_BFI_N 0.277674
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.291 on 170 degrees of freedom
## Multiple R-squared: 0.201, Adjusted R-squared: 0.1728
## F-statistic: 7.127 on 6 and 170 DF, p-value: 8.641e-07
# Test unique contribution of score_TAI_short
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
# Type II or III ANOVA for partial effect
Anova(model, type = "II") # or type="III" if needed
# Create polynomial term (squared)
AMATUS_clean <- AMATUS_clean %>%
mutate(score_TAI_short2 = score_TAI_short^2)
# Polynomial regression including all predictors
poly_model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + score_TAI_short2 +
score_FSMAS_SE + score_BFI_N,
data = AMATUS_clean)
# Summary of model
summary(poly_model)
##
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short +
## score_TAI_short2 + score_FSMAS_SE + score_BFI_N, data = AMATUS_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.926 -4.217 -0.359 4.016 19.595
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.658837 6.926228 0.240 0.811008
## sexm 3.282550 1.367744 2.400 0.017483 *
## teacher_stageprimary school teacher 2.991303 2.585391 1.157 0.248904
## teacher_stagestudy -1.159673 2.510836 -0.462 0.644770
## score_TAI_short -0.151648 0.741283 -0.205 0.838150
## score_TAI_short2 0.001106 0.028180 0.039 0.968734
## score_FSMAS_SE 0.375635 0.107148 3.506 0.000583 ***
## score_BFI_N -0.100217 0.092843 -1.079 0.281940
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.309 on 169 degrees of freedom
## Multiple R-squared: 0.201, Adjusted R-squared: 0.1679
## F-statistic: 6.073 on 7 and 169 DF, p-value: 2.467e-06
# ANOVA to test unique contribution of polynomial term
anova(poly_model)
# Create interaction term
AMATUS_clean <- AMATUS_clean %>%
mutate(TAI_SE_interaction = score_TAI_short * score_FSMAS_SE)
# Moderation model
mod_model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short +
score_FSMAS_SE + TAI_SE_interaction + score_BFI_N,
data = AMATUS_clean)
summary(mod_model)
##
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short +
## score_FSMAS_SE + TAI_SE_interaction + score_BFI_N, data = AMATUS_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.8882 -4.2409 -0.5512 3.9409 19.9803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.86299 17.82970 0.721 0.4716
## sexm 3.21909 1.36909 2.351 0.0199 *
## teacher_stageprimary school teacher 2.87995 2.57685 1.118 0.2653
## teacher_stagestudy -1.19129 2.50212 -0.476 0.6346
## score_TAI_short -0.93152 1.21113 -0.769 0.4429
## score_FSMAS_SE 0.10845 0.41220 0.263 0.7928
## TAI_SE_interaction 0.01929 0.02872 0.671 0.5028
## score_BFI_N -0.10314 0.09252 -1.115 0.2665
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.301 on 169 degrees of freedom
## Multiple R-squared: 0.2031, Adjusted R-squared: 0.1701
## F-statistic: 6.154 on 7 and 169 DF, p-value: 2.021e-06
# Test unique contribution of interaction
anova(mod_model)
# Plot interaction
library(interactions)
interact_plot(mod_model, pred = score_TAI_short, modx = score_FSMAS_SE,
plot.points = TRUE, interval = TRUE)
## Warning: score_TAI_short and score_FSMAS_SE are not included in an interaction with
## one another in the model.
## Warning: 46.1431688881329 is outside the observed range of score_FSMAS_SE

# --- Step 4: Regression diagnostics on original multiple regression ---
orig_model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + score_FSMAS_SE + score_BFI_N,
data = AMATUS_clean)
# 1. Residual plots
par(mfrow = c(2,2))
plot(orig_model)

# 2. Cook's distance for influential points
cooksd <- cooks.distance(orig_model)
plot(cooksd, type = "h", main = "Cook's Distance")
abline(h = 4/(nrow(AMATUS_clean)-length(orig_model$coefficients)-2), col = "red")
# 3. Check normality of residuals
shapiro.test(resid(orig_model))
##
## Shapiro-Wilk normality test
##
## data: resid(orig_model)
## W = 0.99091, p-value = 0.3253
# 4. Check multicollinearity
vif(orig_model)
## GVIF Df GVIF^(1/(2*Df))
## sex 1.075009 1 1.036827
## teacher_stage 1.051262 2 1.012576
## score_TAI_short 1.273688 1 1.128578
## score_FSMAS_SE 1.026628 1 1.013226
## score_BFI_N 1.231393 1 1.109682

library(mediation)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: mvtnorm
## Loading required package: sandwich
## Registered S3 method overwritten by 'lme4':
## method from
## na.action.merMod car
## mediation: Causal Mediation Analysis
## Version: 4.5.1
##
## Attaching package: 'mediation'
## The following object is masked from 'package:psych':
##
## mediate
library(car)
# --- Step 0: Define variables for the mediation model ---
vars <- c("sum_arith_perf", "score_TAI_short", "score_BFI_N",
"score_FSMAS_SE", "sex", "teacher_stage")
# --- Step 1: Keep only relevant columns and complete cases ---
AMATUS_model <- AMATUS_clean[, vars]
AMATUS_model <- AMATUS_model[complete.cases(AMATUS_model), ]
# --- Step 2: Convert categorical variables to factors with fixed levels ---
AMATUS_model$sex <- factor(AMATUS_model$sex, levels = unique(AMATUS_model$sex))
AMATUS_model$teacher_stage <- factor(AMATUS_model$teacher_stage,
levels = unique(AMATUS_model$teacher_stage))
# --- Step 3: Fit the mediator model ---
med_model <- lm(score_BFI_N ~ score_TAI_short + sex + teacher_stage + score_FSMAS_SE,
data = AMATUS_model)
# --- Step 4: Fit the outcome model including the mediator ---
out_model <- lm(sum_arith_perf ~ score_TAI_short + score_BFI_N + sex + teacher_stage + score_FSMAS_SE,
data = AMATUS_model)
# --- Step 5: Conduct mediation analysis with bootstrapping ---
med_results <- mediate(med_model, out_model,
treat = "score_TAI_short",
mediator = "score_BFI_N",
boot = TRUE, sims = 500) # reduced sims for stability
## Running nonparametric bootstrap
# --- Step 6: Show summary of mediation results ---
summary(med_results)
##
## Causal Mediation Analysis
##
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
##
## Estimate 95% CI Lower 95% CI Upper p-value
## ACME -0.051190 -0.170452 0.047673 0.284
## ADE -0.123006 -0.389220 0.170132 0.476
## Total Effect -0.174196 -0.390728 0.088525 0.212
## Prop. Mediated 0.293862 -3.753955 4.756096 0.496
##
## Sample Size Used: 177
##
##
## Simulations: 500
library(ggplot2)
library(grid)
# --- Extract coefficients from your mediation results ---
a <- med_results$d0 # effect of TAI -> BFI (average causal mediation effect)
b <- med_results$z0 # effect of BFI -> Outcome (approximated)
c_prime <- med_results$z0 + med_results$d0 # direct effect (simplified for labeling)
# --- Define node positions ---
nodes <- data.frame(
name = c("score_TAI_short\n(Treatment)",
"score_BFI_N\n(Mediator)",
"sum_arith_perf\n(Outcome)"),
x = c(1, 2, 3),
y = c(2, 2, 2)
)
# --- Define edges ---
edges <- data.frame(
from = c(1, 2, 1),
to = c(2, 3, 3),
label = c(round(a, 2), round(b, 2), round(c_prime, 2))
)
# --- Create diagram ---
ggplot() +
# Nodes
geom_point(data = nodes, aes(x=x, y=y), size=12, color="skyblue") +
geom_text(data = nodes, aes(x=x, y=y, label=name), color="black") +
# Edges with arrows
geom_curve(data = edges,
aes(x=nodes$x[from], y=nodes$y[from],
xend=nodes$x[to], yend=nodes$y[to]),
curvature = 0.2, arrow = arrow(length = unit(0.3,"cm")), color="darkblue", size=1.2) +
# Edge labels (coefficients)
geom_text(data = edges,
aes(x=(nodes$x[from]+nodes$x[to])/2,
y=(nodes$y[from]+nodes$y[to])/2 + 0.15,
label=label), color="red", size=5) +
theme_void() +
xlim(0.5, 3.5) + ylim(1.5, 2.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
