Regression Analysis-McDonald Owuor

# Load required libraries
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(lm.beta)
library(e1071)

## 
## Attaching package: 'e1071'

## The following object is masked from 'package:ggplot2':
## 
##     element

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.1     ✔ tidyr     1.3.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%()     masks ggplot2::%+%()
## ✖ psych::alpha()   masks ggplot2::alpha()
## ✖ e1071::element() masks ggplot2::element()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Import dataset
AMATUS_data <- read_csv("C:/Users/McdonaldAdero/Downloads/AMATUS_dataset.csv")

## Rows: 1106 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): id_unique;id_sample;sample;sex;age;age_range;native_speaker;math_gr...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview first few rows
head(AMATUS_data)

# Column separation
AMATUS_data <- AMATUS_data %>%
  separate(col = 1,                 # the first column
           into = str_split(names(AMATUS_data)[1], ";")[[1]], # split using header names
           sep = ";", 
           remove = TRUE)

# Preview the result
head(AMATUS_data)

#Extract only the selected variables
AMATUS_selected <- AMATUS_data %>%
  select(
    sum_arith_perf,   # Outcome
    sex,              # Dichotomous predictor
    teacher_stage,    # Multi-categorical predictor
    score_TAI_short,  # Numerical predictor 1
    score_FSMAS_SE,   # Numerical predictor 2
    score_BFI_N       # Numerical predictor 3
  )

#Convert numeric columns to numeric type
AMATUS_selected <- AMATUS_selected %>%
  mutate(
    sum_arith_perf = as.numeric(sum_arith_perf),
    score_TAI_short = as.numeric(score_TAI_short),
    score_FSMAS_SE = as.numeric(score_FSMAS_SE),
    score_BFI_N = as.numeric(score_BFI_N)
  )

## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `sum_arith_perf = as.numeric(sum_arith_perf)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.

#Keep only complete cases (no missing values)
AMATUS_clean <- AMATUS_selected %>% drop_na()

#Preview the new dataset
head(AMATUS_clean)

library(tidyverse)
library(psych)  # for skew

# Select numerical variables
num_vars <- AMATUS_clean %>%
  select(sum_arith_perf, score_TAI_short, score_FSMAS_SE, score_BFI_N)

# Compute descriptive stats
desc_stats <- data.frame(
  Variable = names(num_vars),
  Mean = sapply(num_vars, mean, na.rm = TRUE),
  SD   = sapply(num_vars, sd, na.rm = TRUE),
  Min  = sapply(num_vars, min, na.rm = TRUE),
  Max  = sapply(num_vars, max, na.rm = TRUE),
  Skew = sapply(num_vars, psych::skew, na.rm = TRUE)
)

# View the table
desc_stats

# Sex
sex_table <- AMATUS_clean %>%
  group_by(sex) %>%
  summarise(
    Count = n(),
    Percent = n()/nrow(AMATUS_clean)*100
  )

# Teacher stage
teacher_table <- AMATUS_clean %>%
  group_by(teacher_stage) %>%
  summarise(
    Count = n(),
    Percent = n()/nrow(AMATUS_clean)*100
  )

sex_table

teacher_table

# Boxplot for sum_arith_perf by sex
ggplot(AMATUS_clean, aes(x = sex, y = sum_arith_perf, fill = sex)) +
  geom_boxplot() +
  labs(title = "Arithmetic Performance by Sex", x = "Sex", y = "Sum Arithmetic Performance") +
  theme_minimal()

# Boxplot for sum_arith_perf by teacher_stage
ggplot(AMATUS_clean, aes(x = teacher_stage, y = sum_arith_perf, fill = teacher_stage)) +
  geom_boxplot() +
  labs(title = "Arithmetic Performance by Teacher Stage", x = "Teacher Stage", y = "Sum Arithmetic Performance") +
  theme_minimal()

# Numerical variables: outcome + predictors
num_vars <- AMATUS_clean %>%
  select(sum_arith_perf, score_TAI_short, score_FSMAS_SE, score_BFI_N)
# Correlation matrix
cor_matrix <- round(cor(num_vars, use = "pairwise.complete.obs"), 2)
cor_matrix

##                 sum_arith_perf score_TAI_short score_FSMAS_SE score_BFI_N
## sum_arith_perf            1.00           -0.20           0.25       -0.19
## score_TAI_short          -0.20            1.00          -0.13        0.41
## score_FSMAS_SE            0.25           -0.13           1.00       -0.06
## score_BFI_N              -0.19            0.41          -0.06        1.00

# Convert categorical predictors to factors
AMATUS_clean <- AMATUS_clean %>%
  mutate(
    sex = factor(sex),
    teacher_stage = factor(teacher_stage)
  )

# Standard multiple regression
model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + score_FSMAS_SE + score_BFI_N, 
            data = AMATUS_clean)

# View model summary
summary(model)

## 
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short + 
##     score_FSMAS_SE + score_BFI_N, data = AMATUS_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9431  -4.1973  -0.3713   4.0237  19.6073 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.50014    5.60744   0.268 0.789387    
## sexm                                 3.28348    1.36352   2.408 0.017106 *  
## teacher_stageprimary school teacher  2.98254    2.56816   1.161 0.247126    
## teacher_stagestudy                  -1.16630    2.49779  -0.467 0.641148    
## score_TAI_short                     -0.12301    0.13041  -0.943 0.346920    
## score_FSMAS_SE                       0.37576    0.10679   3.519 0.000556 ***
## score_BFI_N                         -0.10050    0.09228  -1.089 0.277674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.291 on 170 degrees of freedom
## Multiple R-squared:  0.201,  Adjusted R-squared:  0.1728 
## F-statistic: 7.127 on 6 and 170 DF,  p-value: 8.641e-07

# ANOVA table
anova(model)

# Load package for standardized betas
library(lm.beta)

# Standardized regression
model_std <- lm.beta(model)
summary(model_std)

## 
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short + 
##     score_FSMAS_SE + score_BFI_N, data = AMATUS_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9431  -4.1973  -0.3713   4.0237  19.6073 
## 
## Coefficients:
##                                     Estimate Standardized Std. Error t value
## (Intercept)                          1.50014           NA    5.60744   0.268
## sexm                                 3.28348      0.17117    1.36352   2.408
## teacher_stageprimary school teacher  2.98254      0.19469    2.56816   1.161
## teacher_stagestudy                  -1.16630     -0.07901    2.49779  -0.467
## score_TAI_short                     -0.12301     -0.07298    0.13041  -0.943
## score_FSMAS_SE                       0.37576      0.24443    0.10679   3.519
## score_BFI_N                         -0.10050     -0.08285    0.09228  -1.089
##                                     Pr(>|t|)    
## (Intercept)                         0.789387    
## sexm                                0.017106 *  
## teacher_stageprimary school teacher 0.247126    
## teacher_stagestudy                  0.641148    
## score_TAI_short                     0.346920    
## score_FSMAS_SE                      0.000556 ***
## score_BFI_N                         0.277674    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.291 on 170 degrees of freedom
## Multiple R-squared:  0.201,  Adjusted R-squared:  0.1728 
## F-statistic: 7.127 on 6 and 170 DF,  p-value: 8.641e-07

# Test unique contribution of score_TAI_short
library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:purrr':
## 
##     some

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

# Type II or III ANOVA for partial effect
Anova(model, type = "II")  # or type="III" if needed

# Create polynomial term (squared)
AMATUS_clean <- AMATUS_clean %>%
  mutate(score_TAI_short2 = score_TAI_short^2)

# Polynomial regression including all predictors
poly_model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + score_TAI_short2 +
                   score_FSMAS_SE + score_BFI_N,
                 data = AMATUS_clean)

# Summary of model
summary(poly_model)

## 
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short + 
##     score_TAI_short2 + score_FSMAS_SE + score_BFI_N, data = AMATUS_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -13.926  -4.217  -0.359   4.016  19.595 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          1.658837   6.926228   0.240 0.811008    
## sexm                                 3.282550   1.367744   2.400 0.017483 *  
## teacher_stageprimary school teacher  2.991303   2.585391   1.157 0.248904    
## teacher_stagestudy                  -1.159673   2.510836  -0.462 0.644770    
## score_TAI_short                     -0.151648   0.741283  -0.205 0.838150    
## score_TAI_short2                     0.001106   0.028180   0.039 0.968734    
## score_FSMAS_SE                       0.375635   0.107148   3.506 0.000583 ***
## score_BFI_N                         -0.100217   0.092843  -1.079 0.281940    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.309 on 169 degrees of freedom
## Multiple R-squared:  0.201,  Adjusted R-squared:  0.1679 
## F-statistic: 6.073 on 7 and 169 DF,  p-value: 2.467e-06

# ANOVA to test unique contribution of polynomial term
anova(poly_model)

# Create interaction term
AMATUS_clean <- AMATUS_clean %>%
  mutate(TAI_SE_interaction = score_TAI_short * score_FSMAS_SE)

# Moderation model
mod_model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + 
                  score_FSMAS_SE + TAI_SE_interaction + score_BFI_N,
                data = AMATUS_clean)

summary(mod_model)

## 
## Call:
## lm(formula = sum_arith_perf ~ sex + teacher_stage + score_TAI_short + 
##     score_FSMAS_SE + TAI_SE_interaction + score_BFI_N, data = AMATUS_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.8882  -4.2409  -0.5512   3.9409  19.9803 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                         12.86299   17.82970   0.721   0.4716  
## sexm                                 3.21909    1.36909   2.351   0.0199 *
## teacher_stageprimary school teacher  2.87995    2.57685   1.118   0.2653  
## teacher_stagestudy                  -1.19129    2.50212  -0.476   0.6346  
## score_TAI_short                     -0.93152    1.21113  -0.769   0.4429  
## score_FSMAS_SE                       0.10845    0.41220   0.263   0.7928  
## TAI_SE_interaction                   0.01929    0.02872   0.671   0.5028  
## score_BFI_N                         -0.10314    0.09252  -1.115   0.2665  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.301 on 169 degrees of freedom
## Multiple R-squared:  0.2031, Adjusted R-squared:  0.1701 
## F-statistic: 6.154 on 7 and 169 DF,  p-value: 2.021e-06

# Test unique contribution of interaction
anova(mod_model)

# Plot interaction
library(interactions)

interact_plot(mod_model, pred = score_TAI_short, modx = score_FSMAS_SE,
              plot.points = TRUE, interval = TRUE)

## Warning: score_TAI_short and score_FSMAS_SE are not included in an interaction with
## one another in the model.

## Warning: 46.1431688881329 is outside the observed range of score_FSMAS_SE

# --- Step 4: Regression diagnostics on original multiple regression ---
orig_model <- lm(sum_arith_perf ~ sex + teacher_stage + score_TAI_short + score_FSMAS_SE + score_BFI_N, 
                 data = AMATUS_clean)

# 1. Residual plots
par(mfrow = c(2,2))
plot(orig_model)

# 2. Cook's distance for influential points
cooksd <- cooks.distance(orig_model)
plot(cooksd, type = "h", main = "Cook's Distance")
abline(h = 4/(nrow(AMATUS_clean)-length(orig_model$coefficients)-2), col = "red")

# 3. Check normality of residuals
shapiro.test(resid(orig_model))

## 
##  Shapiro-Wilk normality test
## 
## data:  resid(orig_model)
## W = 0.99091, p-value = 0.3253

# 4. Check multicollinearity
vif(orig_model)

##                     GVIF Df GVIF^(1/(2*Df))
## sex             1.075009  1        1.036827
## teacher_stage   1.051262  2        1.012576
## score_TAI_short 1.273688  1        1.128578
## score_FSMAS_SE  1.026628  1        1.013226
## score_BFI_N     1.231393  1        1.109682

library(mediation)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: mvtnorm

## Loading required package: sandwich

## Registered S3 method overwritten by 'lme4':
##   method           from
##   na.action.merMod car

## mediation: Causal Mediation Analysis
## Version: 4.5.1

## 
## Attaching package: 'mediation'

## The following object is masked from 'package:psych':
## 
##     mediate

library(car)

# --- Step 0: Define variables for the mediation model ---
vars <- c("sum_arith_perf", "score_TAI_short", "score_BFI_N", 
          "score_FSMAS_SE", "sex", "teacher_stage")

# --- Step 1: Keep only relevant columns and complete cases ---
AMATUS_model <- AMATUS_clean[, vars]
AMATUS_model <- AMATUS_model[complete.cases(AMATUS_model), ]

# --- Step 2: Convert categorical variables to factors with fixed levels ---
AMATUS_model$sex <- factor(AMATUS_model$sex, levels = unique(AMATUS_model$sex))
AMATUS_model$teacher_stage <- factor(AMATUS_model$teacher_stage, 
                                     levels = unique(AMATUS_model$teacher_stage))

# --- Step 3: Fit the mediator model ---
med_model <- lm(score_BFI_N ~ score_TAI_short + sex + teacher_stage + score_FSMAS_SE, 
                data = AMATUS_model)

# --- Step 4: Fit the outcome model including the mediator ---
out_model <- lm(sum_arith_perf ~ score_TAI_short + score_BFI_N + sex + teacher_stage + score_FSMAS_SE,
                data = AMATUS_model)

# --- Step 5: Conduct mediation analysis with bootstrapping ---
med_results <- mediate(med_model, out_model, 
                       treat = "score_TAI_short", 
                       mediator = "score_BFI_N",
                       boot = TRUE, sims = 500)  # reduced sims for stability

## Running nonparametric bootstrap

# --- Step 6: Show summary of mediation results ---
summary(med_results)

## 
## Causal Mediation Analysis 
## 
## Nonparametric Bootstrap Confidence Intervals with the Percentile Method
## 
##                 Estimate 95% CI Lower 95% CI Upper p-value
## ACME           -0.051190    -0.170452     0.047673   0.284
## ADE            -0.123006    -0.389220     0.170132   0.476
## Total Effect   -0.174196    -0.390728     0.088525   0.212
## Prop. Mediated  0.293862    -3.753955     4.756096   0.496
## 
## Sample Size Used: 177 
## 
## 
## Simulations: 500

library(ggplot2)
library(grid)

# --- Extract coefficients from your mediation results ---
a <- med_results$d0     # effect of TAI -> BFI (average causal mediation effect)
b <- med_results$z0     # effect of BFI -> Outcome (approximated)
c_prime <- med_results$z0 + med_results$d0  # direct effect (simplified for labeling)

# --- Define node positions ---
nodes <- data.frame(
  name = c("score_TAI_short\n(Treatment)", 
           "score_BFI_N\n(Mediator)", 
           "sum_arith_perf\n(Outcome)"),
  x = c(1, 2, 3),
  y = c(2, 2, 2)
)

# --- Define edges ---
edges <- data.frame(
  from = c(1, 2, 1),
  to = c(2, 3, 3),
  label = c(round(a, 2), round(b, 2), round(c_prime, 2))
)

# --- Create diagram ---
ggplot() +
  # Nodes
  geom_point(data = nodes, aes(x=x, y=y), size=12, color="skyblue") +
  geom_text(data = nodes, aes(x=x, y=y, label=name), color="black") +
  
  # Edges with arrows
  geom_curve(data = edges, 
             aes(x=nodes$x[from], y=nodes$y[from],
                 xend=nodes$x[to], yend=nodes$y[to]), 
             curvature = 0.2, arrow = arrow(length = unit(0.3,"cm")), color="darkblue", size=1.2) +
  
  # Edge labels (coefficients)
  geom_text(data = edges, 
            aes(x=(nodes$x[from]+nodes$x[to])/2, 
                y=(nodes$y[from]+nodes$y[to])/2 + 0.15, 
                label=label), color="red", size=5) +
  
  theme_void() +
  xlim(0.5, 3.5) + ylim(1.5, 2.5)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Regression Analysis-McDonald Owuor

2026-04-20