Library

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(MVN)
## Warning: package 'MVN' was built under R version 4.5.3
library(biotools)
## Warning: package 'biotools' was built under R version 4.5.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## ---
## biotools version 4.3
library(heplots)
## Warning: package 'heplots' was built under R version 4.5.3
## Loading required package: broom
## 
## Attaching package: 'heplots'
## 
## The following object is masked from 'package:biotools':
## 
##     boxM
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(psych)
## Warning: package 'psych' was built under R version 4.5.3
## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:MVN':
## 
##     mardia
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Load Data

df <- read.csv("education_intervention_10000.csv")
head(df,10)
##    student_id school_id grade_level intervention ses_index attendance_rate
## 1    93207890       110           9      blended     1.379           0.996
## 2    61581758       128          11      blended     2.037           0.930
## 3    75844861       158           9     tutoring     0.403           0.933
## 4    57564202       153           9      flipped    -0.442           0.924
## 5    57864707       100          12     tutoring    -0.572           0.971
## 6    78728520       100          11      blended    -1.128           0.875
## 7    73540890       144          12     tutoring     0.280           0.982
## 8    83052348       125           9     tutoring    -0.608           0.936
## 9    60636531       143          11      control    -0.016           0.865
## 10   55920596       120          11      blended    -1.269           0.810
##    study_hours_wk teacher_experience_years class_size baseline_math
## 1            5.93                      0.0         28          76.9
## 2            2.52                      6.7         32          78.9
## 3            5.32                      8.3         23          69.4
## 4            5.26                      9.2         30          55.9
## 5            7.47                     10.6         21          55.3
## 6            4.12                     15.7         25          59.4
## 7            3.61                      8.3         34          56.4
## 8            2.78                      4.1         24          67.9
## 9            1.05                      7.6         14          67.5
## 10           2.73                      0.2         37          81.5
##    baseline_reading baseline_motivation_1_10 post_math post_reading
## 1              65.3                      7.7      87.8         75.1
## 2              82.3                      7.1      88.9         89.0
## 3              83.9                      5.8      84.8         95.2
## 4              61.8                      6.7      72.7         68.3
## 5              66.4                      4.6      75.7         83.4
## 6              67.2                      7.4      69.1         78.9
## 7              71.1                      6.0      66.5         87.1
## 8              47.9                      9.7      82.1         58.6
## 9              79.6                      7.1      76.6         87.1
## 10             69.9                      5.9      85.3         76.0
##    test_anxiety_1_10 passed
## 1                2.6      1
## 2                4.3      1
## 3                3.8      1
## 4                5.6      1
## 5                7.6      1
## 6                6.8      0
## 7                5.4      1
## 8                3.2      1
## 9                4.1      1
## 10               4.8      1

Struktur Data

str(df)
## 'data.frame':    10000 obs. of  16 variables:
##  $ student_id              : int  93207890 61581758 75844861 57564202 57864707 78728520 73540890 83052348 60636531 55920596 ...
##  $ school_id               : int  110 128 158 153 100 100 144 125 143 120 ...
##  $ grade_level             : int  9 11 9 9 12 11 12 9 11 11 ...
##  $ intervention            : chr  "blended" "blended" "tutoring" "flipped" ...
##  $ ses_index               : num  1.379 2.037 0.403 -0.442 -0.572 ...
##  $ attendance_rate         : num  0.996 0.93 0.933 0.924 0.971 0.875 0.982 0.936 0.865 0.81 ...
##  $ study_hours_wk          : num  5.93 2.52 5.32 5.26 7.47 4.12 3.61 2.78 1.05 2.73 ...
##  $ teacher_experience_years: num  0 6.7 8.3 9.2 10.6 15.7 8.3 4.1 7.6 0.2 ...
##  $ class_size              : int  28 32 23 30 21 25 34 24 14 37 ...
##  $ baseline_math           : num  76.9 78.9 69.4 55.9 55.3 59.4 56.4 67.9 67.5 81.5 ...
##  $ baseline_reading        : num  65.3 82.3 83.9 61.8 66.4 67.2 71.1 47.9 79.6 69.9 ...
##  $ baseline_motivation_1_10: num  7.7 7.1 5.8 6.7 4.6 7.4 6 9.7 7.1 5.9 ...
##  $ post_math               : num  87.8 88.9 84.8 72.7 75.7 69.1 66.5 82.1 76.6 85.3 ...
##  $ post_reading            : num  75.1 89 95.2 68.3 83.4 78.9 87.1 58.6 87.1 76 ...
##  $ test_anxiety_1_10       : num  2.6 4.3 3.8 5.6 7.6 6.8 5.4 3.2 4.1 4.8 ...
##  $ passed                  : int  1 1 1 1 1 0 1 1 1 1 ...
summary(df)
##    student_id         school_id      grade_level    intervention      
##  Min.   :10001836   Min.   :100.0   Min.   : 9.00   Length:10000      
##  1st Qu.:31792792   1st Qu.:115.0   1st Qu.: 9.00   Class :character  
##  Median :54680352   Median :130.0   Median :10.00   Mode  :character  
##  Mean   :54791854   Mean   :129.6   Mean   :10.42                     
##  3rd Qu.:77195169   3rd Qu.:145.0   3rd Qu.:11.00                     
##  Max.   :99992163   Max.   :159.0   Max.   :12.00                     
##    ses_index         attendance_rate  study_hours_wk   teacher_experience_years
##  Min.   :-2.500000   Min.   :0.6670   Min.   : 0.020   Min.   : 0.000          
##  1st Qu.:-0.678250   1st Qu.:0.8730   1st Qu.: 2.250   1st Qu.: 5.600          
##  Median : 0.001000   Median :0.9160   Median : 3.910   Median : 9.000          
##  Mean   :-0.002791   Mean   :0.9129   Mean   : 4.576   Mean   : 9.057          
##  3rd Qu.: 0.671000   3rd Qu.:0.9570   3rd Qu.: 6.140   3rd Qu.:12.400          
##  Max.   : 2.500000   Max.   :1.0000   Max.   :22.850   Max.   :28.700          
##    class_size    baseline_math    baseline_reading baseline_motivation_1_10
##  Min.   :10.00   Min.   : 20.00   Min.   : 23.30   Min.   : 1.000          
##  1st Qu.:22.00   1st Qu.: 61.00   1st Qu.: 63.60   1st Qu.: 5.300          
##  Median :26.00   Median : 70.00   Median : 71.80   Median : 6.500          
##  Mean   :26.06   Mean   : 70.03   Mean   : 71.77   Mean   : 6.456          
##  3rd Qu.:30.00   3rd Qu.: 79.20   3rd Qu.: 79.92   3rd Qu.: 7.600          
##  Max.   :40.00   Max.   :100.00   Max.   :100.00   Max.   :10.000          
##    post_math      post_reading    test_anxiety_1_10     passed      
##  Min.   : 29.7   Min.   : 35.90   Min.   : 1.000    Min.   :0.0000  
##  1st Qu.: 71.9   1st Qu.: 72.50   1st Qu.: 3.700    1st Qu.:1.0000  
##  Median : 81.4   Median : 81.40   Median : 4.800    Median :1.0000  
##  Mean   : 80.9   Mean   : 80.87   Mean   : 4.764    Mean   :0.9042  
##  3rd Qu.: 91.5   3rd Qu.: 90.20   3rd Qu.: 5.900    3rd Qu.:1.0000  
##  Max.   :100.0   Max.   :100.00   Max.   :10.000    Max.   :1.0000

cek missing values

colSums(is.na(df))
##               student_id                school_id              grade_level 
##                        0                        0                        0 
##             intervention                ses_index          attendance_rate 
##                        0                        0                        0 
##           study_hours_wk teacher_experience_years               class_size 
##                        0                        0                        0 
##            baseline_math         baseline_reading baseline_motivation_1_10 
##                        0                        0                        0 
##                post_math             post_reading        test_anxiety_1_10 
##                        0                        0                        0 
##                   passed 
##                        0

##EDA

# histogram numerik
df %>%
  dplyr::select_if(is.numeric) %>%
  tidyr::pivot_longer(cols = everything()) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20, fill = "skyblue", color = "black") +
  facet_wrap(~name, scales = "free") +
  theme_minimal() +
  labs(title = "Distribusi Variabel Numerik")

# barplot kategorik
df$intervention <- as.factor(df$intervention)

ggplot(df, aes(x = intervention)) +
  geom_bar(fill = "orange") +
  theme_minimal() +
  labs(title = "Distribusi Intervention")

pilih variabel

df_mancova <- dplyr::select(df,
                            intervention,
                            post_math,
                            post_reading,
                            baseline_math,
                            baseline_reading)

head(df_mancova)
##   intervention post_math post_reading baseline_math baseline_reading
## 1      blended      87.8         75.1          76.9             65.3
## 2      blended      88.9         89.0          78.9             82.3
## 3     tutoring      84.8         95.2          69.4             83.9
## 4      flipped      72.7         68.3          55.9             61.8
## 5     tutoring      75.7         83.4          55.3             66.4
## 6      blended      69.1         78.9          59.4             67.2

index_terpilih <- c(
  8747, 3695, 8717, 8634, 8955, 2419, 9575, 3038, 9573, 6551,
  1050, 8139, 6222, 7402, 7763, 9160, 3635, 9036, 7985, 4812,
  9112, 4410, 5178, 3125, 6227, 2701, 6765, 1049, 5174, 3188,
  5027, 6060, 3738, 8708, 8904, 4130, 2220, 7498, 3525, 2953,
  6573, 3576, 7062, 9698, 2214, 1407, 4584, 8532, 3082, 8982,
  7425, 5315, 3645, 3622, 3532, 7177, 7865, 4934, 7860, 5125,
  6243, 5960, 6838, 6017, 2539, 2623, 1530, 1863, 7956, 4028,
  8818, 4055, 6209, 5666, 3445, 8624, 4668, 7284, 3043, 2113,
  5572, 2328, 8170, 8278, 2869, 3538, 6042, 6918, 4358, 8195,
  7281, 4960, 7854, 8472, 4099, 6655, 4010, 7258, 3493, 9247
)

# biar index aman
df <- df %>% mutate(row_id = row_number())

df_mancova <- df %>%
  filter(row_id %in% index_terpilih) %>%
  dplyr::select(
    intervention,
    post_math,
    post_reading,
    baseline_math,
    baseline_reading
  )

dim(df_mancova)
## [1] 100   5

histogram

df_mancova %>%
  pivot_longer(cols = -intervention) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 20, fill = "skyblue", color = "black") +
  facet_wrap(~name, scales = "free") +
  theme_minimal()

cek outlier

boxplot(df_mancova[,2:3],
        main = "Boxplot Dependent Variables",
        col = "lightblue")

faktor

df_mancova$intervention <- as.factor(df_mancova$intervention)

uji normalitas multivariat

mvn_result <- mvn(
  data = df_mancova[,2:3],
  mvn_test = "mardia"
)

mvn_result$multivariate_normality
##              Test Statistic p.value     Method      MVN
## 1 Mardia Skewness     7.160   0.128 asymptotic ✓ Normal
## 2 Mardia Kurtosis    -1.338   0.181 asymptotic ✓ Normal

homogenitas (box m)

biotools::boxM(df_mancova[,2:3], df_mancova$intervention)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  df_mancova[, 2:3]
## Chi-Sq (approx.) = 5.7976, df = 9, p-value = 0.76

dependensi

cor_matrix <- cor(df_mancova[,2:3])
cor_matrix
##              post_math post_reading
## post_math    1.0000000    0.2540452
## post_reading 0.2540452    1.0000000
cortest.bartlett(cor_matrix, n = nrow(df_mancova))
## $chisq
## [1] 6.504791
## 
## $p.value
## [1] 0.01075842
## 
## $df
## [1] 1

linearitas

pairs(~ baseline_math +
        baseline_reading +
        post_math +
        post_reading,
      data = df_mancova,
      col = df_mancova$intervention)

ggplot(df_mancova,
       aes(x = baseline_math,
           y = post_math,
           color = intervention)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

MANOVA

# MANOVA
model_manova <- manova(
  cbind(post_math, post_reading) ~ intervention,
  data = df_mancova
)

summary(model_manova, test = "Pillai")
##              Df   Pillai approx F num Df den Df Pr(>F)
## intervention  3 0.051208  0.84086      6    192 0.5398
## Residuals    96
# follow-up (per variabel)
summary.aov(model_manova)
##  Response post_math :
##              Df  Sum Sq Mean Sq F value Pr(>F)
## intervention  3   519.9  173.29  1.1284 0.3415
## Residuals    96 14742.6  153.57               
## 
##  Response post_reading :
##              Df  Sum Sq Mean Sq F value Pr(>F)
## intervention  3    82.7  27.569  0.2097 0.8895
## Residuals    96 12623.8 131.498
# cek interaksi (opsional tapi bagus)
# sebenarnya di MANOVA jarang dipakai, tapi bisa untuk eksplor

ANCOVA - MATH

# ANCOVA Math
ancova_math <- aov(
  post_math ~ intervention + baseline_math,
  data = df_mancova
)

summary(ancova_math)
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## intervention   3    520     173   12.62 5.17e-07 ***
## baseline_math  1  13438   13438  978.54  < 2e-16 ***
## Residuals     95   1305      14                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# cek interaksi (WAJIB)
ancova_math_interaksi <- aov(
  post_math ~ intervention * baseline_math,
  data = df_mancova
)

summary(ancova_math_interaksi)
##                            Df Sum Sq Mean Sq  F value   Pr(>F)    
## intervention                3    520     173   13.259 2.88e-07 ***
## baseline_math               1  13438   13438 1028.171  < 2e-16 ***
## intervention:baseline_math  3    102      34    2.606   0.0564 .  
## Residuals                  92   1202      13                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ANCOVA -READING

# ANCOVA Reading
ancova_reading <- aov(
  post_reading ~ intervention + baseline_reading,
  data = df_mancova
)

summary(ancova_reading)
##                  Df Sum Sq Mean Sq F value Pr(>F)    
## intervention      3     83      28   1.677  0.177    
## baseline_reading  1  11062   11062 672.675 <2e-16 ***
## Residuals        95   1562      16                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# cek interaksi (WAJIB)
ancova_reading_interaksi <- aov(
  post_reading ~ intervention * baseline_reading,
  data = df_mancova
)

summary(ancova_reading_interaksi)
##                               Df Sum Sq Mean Sq F value Pr(>F)    
## intervention                   3     83      28   1.637  0.186    
## baseline_reading               1  11062   11062 656.885 <2e-16 ***
## intervention:baseline_reading  3     13       4   0.257  0.856    
## Residuals                     92   1549      17                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

MANCOVA

# MANCOVA (utama)
model <- manova(
  cbind(post_math, post_reading) ~
    intervention + baseline_math + baseline_reading,
  data = df_mancova
)

summary(model, test = "Pillai")
##                  Df  Pillai approx F num Df den Df    Pr(>F)    
## intervention      3 0.32591     6.10      6    188 7.383e-06 ***
## baseline_math     1 0.91384   493.17      2     93 < 2.2e-16 ***
## baseline_reading  1 0.86630   301.29      2     93 < 2.2e-16 ***
## Residuals        94                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# follow-up
summary.aov(model)
##  Response post_math :
##                  Df  Sum Sq Mean Sq  F value    Pr(>F)    
## intervention      3   519.9   173.3  12.5033 5.976e-07 ***
## baseline_math     1 13438.0 13438.0 969.5848 < 2.2e-16 ***
## baseline_reading  1     1.8     1.8   0.1303    0.7189    
## Residuals        94  1302.8    13.9                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response post_reading :
##                  Df  Sum Sq Mean Sq F value    Pr(>F)    
## intervention      3    82.7    27.6   1.659    0.1812    
## baseline_math     1  1045.7  1045.7  62.927 4.419e-12 ***
## baseline_reading  1 10016.1 10016.1 602.735 < 2.2e-16 ***
## Residuals        94  1562.1    16.6                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# cek interaksi (biar aman)
model_interaksi <- manova(
  cbind(post_math, post_reading) ~
    intervention * baseline_math +
    intervention * baseline_reading,
  data = df_mancova
)

summary(model_interaksi, test = "Pillai")
##                               Df  Pillai approx F num Df den Df    Pr(>F)    
## intervention                   3 0.35038     6.23      6    176 5.951e-06 ***
## baseline_math                  1 0.92090   506.42      2     87 < 2.2e-16 ***
## baseline_reading               1 0.86958   290.04      2     87 < 2.2e-16 ***
## intervention:baseline_math     3 0.09676     1.49      6    176    0.1836    
## intervention:baseline_reading  3 0.03112     0.46      6    176    0.8346    
## Residuals                     88                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1