#knitr opotions not present warnign and messages
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
message = FALSE
)
— R Workshop for Educational Researchers: PISA 2022 Examples —
This script is adapted to follow the user’s workshop slides, using specific functions and PISA 2022 variables.
— 1.1. Install and Load Required Packages — You can run this block once to install all packages needed for the workshop.
# install.packages(c("dplyr", "tidyr", "car", "lsr", "effectsize", "agricolae", "rosetta", "afex", "rstatix", "MOTE"))
library(dplyr) # For data manipulation (e.g., filter)
library(tidyr) # For data tidying
library(car) # For Levene's test and Anova()
library(lsr) # For cohensD()
library(effectsize) # For eta_squared(), omega_squared(), etc.
library(agricolae) # For HSD.test() (Tukey-Kramer)
library(rosetta) # For posthocTGH() (Games-Howell)
library(afex) # For aov_ez() (Repeated-Measures ANOVA)
library(rstatix) # For gather() and pairwise_t_test()
library(MOTE) # For RM ANOVA effect size (omega.partial.SS.rm)
ds <- rio::import("~/Courses/An Introduction to R and RStudio for Educational Researchers/assets/data/ds.rds", trust=T) #select your path you can also use file.choose()
#example
# ds <- rio::import(file.choose(), trust=T)
vars_to_keep <- c(
# --- Key Identifiers ---
"CNTSTUID", # Student ID (essential for repeated measures)
# --- Demographic / IVs from Examples ---
"ST004D01T", # Gender (for t-test, 2-way ANOVA)
"IMMIG", # Immigration Status (for 1-way)
"HISCED", # Highest Parental Education (2-way ANOVA)
"ESCS", # Economic, Social, and Cultural Status (for regression)
# --- Main Plausible Values (DVs) ---
"PV1MATH", # PV 1 Math (main DV)
"PV1READ", # PV 1 Reading (for paired t-test)
"PV1SCIE", # PV 1 Science (good to have)
# --- Math Subscale PVs (for RM ANOVA) ---
"PV1MPEM", # Math - Employing
"PV1MPFS", # Math - Formulating
"PV1MPIN", # Math - Interpreting
"PV1MPRE", # Math - Reasoning
# --- Key WLE Composite Variables ---
"HOMEPOS", # Home Possessions
"ICTRES", # ICT Resources
"TEACHSUP", # Teacher Support
"BELONG", # Sense of Belonging
"ANXMAT", # Math Anxiety
"MATHEFF", # Math Self-Efficacy
"FEELSAFE", # Feeling Safe at School
"STRESAGR" # Stress Resistance
)
#keep selected variables
ds <- ds %>%
dplyr::select(all_of(vars_to_keep))
# Convert categorical variables to factors with labels
categorical_vars_to_convert <- c("ST004D01T", "IMMIG", "HISCED")
ds <- ds %>%
dplyr::mutate(
# 1. Convert *only* the specified categorical variables to character
across(all_of(categorical_vars_to_convert), sjlabelled::as_character),
# 2. Convert all *other* remaining labelled variables (like ESCS, HOMEPOS)
# to simple numeric, stripping the labels that cause problems.
# We use where(function(x) inherits(x, "haven_labelled")) to correctly
# identify all haven_labelled columns.
across(
where(function(x) inherits(x, "haven_labelled")) & !all_of(categorical_vars_to_convert),
as.numeric
)
)
# Let's inspect our data
head(ds)
## CNTSTUID ST004D01T IMMIG HISCED ESCS PV1MATH
## 1 70200001 Female Native student ISCED level 6 0.1836 639.004
## 2 70200002 Male Native student ISCED level 5 0.8261 697.191
## 3 70200003 Male Native student ISCED level 3.3 -1.0357 693.710
## 4 70200004 Male Native student ISCED level 4 -0.9606 427.317
## 5 70200005 Female Native student ISCED level 5 0.0856 436.462
## 6 70200006 Female First-Generation student ISCED level 7 0.1268 569.982
## PV1READ PV1SCIE PV1MPEM PV1MPFS PV1MPIN PV1MPRE HOMEPOS ICTRES TEACHSUP
## 1 676.298 710.634 604.382 518.732 602.757 537.068 0.7524 0.1940 -0.5635
## 2 625.585 670.646 705.040 763.661 733.566 706.337 0.7842 0.6249 0.1475
## 3 620.116 666.095 676.642 690.547 682.130 630.753 0.0666 -0.3987 -0.5635
## 4 381.495 340.308 401.548 421.798 407.066 378.730 -0.9300 -0.9028 -0.1002
## 5 448.199 456.333 437.563 454.383 414.746 364.784 -0.8949 0.2514 1.5558
## 6 469.441 475.158 528.852 493.759 459.876 523.219 -0.5988 -0.4733 0.1475
## BELONG ANXMAT MATHEFF FEELSAFE STRESAGR
## 1 0.2442 0.3729 0.1429 -0.7560 0.9777
## 2 -0.0437 0.6647 -0.2874 1.1246 -0.7402
## 3 -0.6137 0.0510 0.2226 0.4417 -0.2053
## 4 -1.1147 0.6387 -0.9344 -0.7560 0.2201
## 5 2.1143 2.5026 -0.9322 1.1246 -0.9940
## 6 0.5159 -0.7358 0.2637 0.1413 0.2183
#structure of the data
str(ds)
## 'data.frame': 6606 obs. of 20 variables:
## $ CNTSTUID : num 70200001 70200002 70200003 70200004 70200005 ...
## ..- attr(*, "label")= chr "Intl. Student ID"
## ..- attr(*, "format.spss")= chr "F8.0"
## $ ST004D01T: chr "Female" "Male" "Male" "Male" ...
## ..- attr(*, "label")= chr "Student (Standardized) Gender"
## $ IMMIG : chr "Native student" "Native student" "Native student" "Native student" ...
## ..- attr(*, "label")= chr "Index on immigrant background (OECD definition)"
## $ HISCED : chr "ISCED level 6" "ISCED level 5" "ISCED level 3.3" "ISCED level 4" ...
## ..- attr(*, "label")= chr "Highest level of education of parents (ISCED)"
## $ ESCS : num 0.1836 0.8261 -1.0357 -0.9606 0.0856 ...
## $ PV1MATH : num 639 697 694 427 436 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Mathematics"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ PV1READ : num 676 626 620 381 448 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Reading"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ PV1SCIE : num 711 671 666 340 456 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Science"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ PV1MPEM : num 604 705 677 402 438 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Process Subscale of Mathematics - Employing Mathematical Concepts, Facts, and Procedures"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ PV1MPFS : num 519 764 691 422 454 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Process Subscale of Mathematics - Formulating Situations Mathematically"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ PV1MPIN : num 603 734 682 407 415 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Process Subscale of Mathematics - Interpreting, Applying, and Evaluating Mathematical Outcomes"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ PV1MPRE : num 537 706 631 379 365 ...
## ..- attr(*, "label")= chr "Plausible Value 1 in Process Subscale of Mathematics - Reasoning"
## ..- attr(*, "format.spss")= chr "F8.3"
## $ HOMEPOS : num 0.7524 0.7842 0.0666 -0.93 -0.8949 ...
## $ ICTRES : num 0.194 0.625 -0.399 -0.903 0.251 ...
## $ TEACHSUP : num -0.564 0.147 -0.564 -0.1 1.556 ...
## $ BELONG : num 0.2442 -0.0437 -0.6137 -1.1147 2.1143 ...
## $ ANXMAT : num 0.373 0.665 0.051 0.639 2.503 ...
## $ MATHEFF : num 0.143 -0.287 0.223 -0.934 -0.932 ...
## $ FEELSAFE : num -0.756 1.125 0.442 -0.756 1.125 ...
## $ STRESAGR : num 0.978 -0.74 -0.205 0.22 -0.994 ...
— 2. Descriptive Statistics —
# Get a full summary of the entire dataset
summary(ds)
## CNTSTUID ST004D01T IMMIG HISCED
## Min. :70200001 Length:6606 Length:6606 Length:6606
## 1st Qu.:70201836 Class :character Class :character Class :character
## Median :70203674 Mode :character Mode :character Mode :character
## Mean :70203673
## 3rd Qu.:70205513
## Max. :70207345
##
## ESCS PV1MATH PV1READ PV1SCIE
## Min. :-3.5488 Min. :218.6 Min. :135.9 Min. :187.5
## 1st Qu.:-0.2327 1st Qu.:503.1 1st Qu.:476.9 1st Qu.:495.7
## Median : 0.4817 Median :582.5 Median :552.9 Median :568.7
## Mean : 0.2904 Mean :574.2 Mean :544.4 Mean :560.8
## 3rd Qu.: 0.9036 3rd Qu.:648.2 3rd Qu.:619.6 3rd Qu.:631.1
## Max. : 3.2780 Max. :943.0 Max. :859.5 Max. :873.3
## NA's :47
## PV1MPEM PV1MPFS PV1MPIN PV1MPRE
## Min. :178.6 Min. :117.7 Min. :177.5 Min. :221.5
## 1st Qu.:504.1 1st Qu.:500.4 1st Qu.:499.5 1st Qu.:499.4
## Median :587.4 Median :581.5 Median :583.5 Median :581.7
## Mean :579.2 Mean :574.8 Mean :576.9 Mean :573.5
## 3rd Qu.:658.8 3rd Qu.:653.7 3rd Qu.:656.8 3rd Qu.:651.0
## Max. :917.8 Max. :889.3 Max. :938.8 Max. :891.1
##
## HOMEPOS ICTRES TEACHSUP BELONG
## Min. :-2.67320 Min. :-5.0283 Min. :-2.9095 Min. :-3.2583
## 1st Qu.:-0.45730 1st Qu.:-0.1902 1st Qu.:-0.3322 1st Qu.:-0.7246
## Median : 0.09185 Median : 0.4428 Median : 0.1475 Median :-0.3380
## Mean : 0.12359 Mean : 0.3908 Mean : 0.3820 Mean :-0.2293
## 3rd Qu.: 0.65922 3rd Qu.: 0.9904 3rd Qu.: 1.5558 3rd Qu.: 0.1140
## Max. : 4.40720 Max. : 5.0992 Max. : 1.5558 Max. : 2.7562
## NA's :40 NA's :40 NA's :68 NA's :49
## ANXMAT MATHEFF FEELSAFE STRESAGR
## Min. :-2.3945 Min. :-3.5080 Min. :-2.7886 Min. :-5.2913
## 1st Qu.:-0.3794 1st Qu.:-0.4624 1st Qu.:-0.7560 1st Qu.:-0.5873
## Median : 0.1932 Median : 0.0268 Median : 0.4417 Median :-0.1026
## Mean : 0.1636 Mean : 0.2355 Mean : 0.1710 Mean :-0.1678
## 3rd Qu.: 0.7889 3rd Qu.: 0.9390 3rd Qu.: 1.1246 3rd Qu.: 0.2926
## Max. : 2.6350 Max. : 2.3556 Max. : 1.1246 Max. : 5.5675
## NA's :82 NA's :77 NA's :43 NA's :147
# Get descriptives for a specific variable (e.g., Math Score)
#Descriptives for PV1MATH
cat("Mean:", mean(ds$PV1MATH), "\n")
## Mean: 574.2388
cat("Median:", median(ds$PV1MATH), "\n")
## Median: 582.544
cat("Std Deviation:", sd(ds$PV1MATH), "\n")
## Std Deviation: 102.7442
cat("Min:", min(ds$PV1MATH), "\n")
## Min: 218.571
cat("Max:", max(ds$PV1MATH), "\n")
## Max: 943.041
# Frequency table for a categorical variable (Gender)
table(ds$ST004D01T)
##
## Female Male
## 3248 3358
# Cross-tabulation (Gender vs. Immigration Status)
table(ds$ST004D01T, ds$IMMIG)
##
## First-Generation student Native student Second-Generation student
## Female 515 2291 345
## Male 550 2313 356
# More advanced descriptives (e.g., mean math score by gender)
aggregate(PV1MATH ~ ST004D01T, data = ds, FUN = mean)
## ST004D01T PV1MATH
## 1 Female 568.329
## 2 Male 579.955
Q: Is there a significant difference in Math scores (PV1MATH) between Genders (ST004D01T)?
# Assumption Check: Homogeneity of Variance (Levene's Test)
print(car::leveneTest(PV1MATH ~ ST004D01T, data = ds, center = "mean"))
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 1 26.005 0.0000003501 ***
## 6604
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Assumption Check: Normality (Shapiro-Wilk Test)
# Note: Fails on large samples. Visual inspection (hist/QQ) is often better.
# We use try() because it might fail if the sample is too large
try(print(shapiro.test(ds$PV1MATH[ds$ST004D01T == "Female"])))
##
## Shapiro-Wilk normality test
##
## data: ds$PV1MATH[ds$ST004D01T == "Female"]
## W = 0.99516, p-value = 0.000000007692
try(print(shapiro.test(ds$PV1MATH[ds$ST004D01T == "Male"])))
##
## Shapiro-Wilk normality test
##
## data: ds$PV1MATH[ds$ST004D01T == "Male"]
## W = 0.99252, p-value = 0.000000000003229
# Student's t-test (assuming equal variances)
t.test(PV1MATH ~ ST004D01T, data = ds, var.equal = TRUE)
##
## Two Sample t-test
##
## data: PV1MATH by ST004D01T
## t = -4.6049, df = 6604, p-value = 0.000004205
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -16.575315 -6.676737
## sample estimates:
## mean in group Female mean in group Male
## 568.329 579.955
# Effect Size (Cohen's d, pooled)
lsr::cohensD(PV1MATH ~ ST004D01T, data = ds, method = "pooled")
## [1] 0.113328
# Q: Is there a significant difference in Math scores (PV1MATH) between Genders (ST004D01T)?
# (This test does NOT assume equal variances)
# Welch's t-test (R default)
t.test(PV1MATH ~ ST004D01T, data = ds, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: PV1MATH by ST004D01T
## t = -4.6116, df = 6584.6, p-value = 0.000004072
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -16.568110 -6.683942
## sample estimates:
## mean in group Female mean in group Male
## 568.329 579.955
# Effect Size (Cohen's d, unequal)
lsr::cohensD(PV1MATH ~ ST004D01T, data = ds, method = "unequal")
## [1] 0.1134105
Q: Is there a significant difference between students’ Math (PV1MATH) and Reading (PV1READ) scores?
# Assumption Check: Normality of differences
diffs_t_test <- (ds$PV1MATH - ds$PV1READ) |> as.numeric()
#shapiro.test(diffs_t_test) #too big to test
# Paired t-test
t.test(ds$PV1MATH, ds$PV1READ, paired = TRUE)
##
## Paired t-test
##
## data: ds$PV1MATH and ds$PV1READ
## t = 38.704, df = 6605, p-value < 2.2e-16
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 28.34339 31.36769
## sample estimates:
## mean difference
## 29.85554
# Effect Size (Cohen's d, paired)
lsr::cohensD(ds$PV1MATH, ds$PV1READ, method = "paired")
## [1] 0.4762002
Q: Is there a significant difference in Math scores (PV1MATH) among Immigration groups (IMMIG)?
# Assumption Check: Homogeneity of Variance (Levene's Test)
car::leveneTest(PV1MATH ~ IMMIG, data = ds, center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 2 13.714 0.00000114 ***
## 6367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# One-Way ANOVA
anova_output <- aov(PV1MATH ~ IMMIG, data = ds)
summary(anova_output)
## Df Sum Sq Mean Sq F value Pr(>F)
## IMMIG 2 1381400 690700 66.93 <2e-16 ***
## Residuals 6367 65707105 10320
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 236 observations deleted due to missingness
# Effect Sizes (eta, omega, f)
#ANOVA Effect Sizes (Omega Squared)
effectsize::omega_squared(model = anova_output, partial = FALSE)
## # Effect Size for ANOVA (Type I)
##
## Parameter | Omega2 | 95% CI
## ---------------------------------
## IMMIG | 0.02 | [0.01, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
#ANOVA Effect Sizes (Eta Squared)
effectsize::eta_squared(model = anova_output, partial = FALSE)
## # Effect Size for ANOVA (Type I)
##
## Parameter | Eta2 | 95% CI
## -------------------------------
## IMMIG | 0.02 | [0.02, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
#ANOVA Effect Sizes (Cohen's f)
effectsize::cohens_f(model = anova_output, partial = FALSE)
## # Effect Size for ANOVA (Type I)
##
## Parameter | Cohen's f | 95% CI
## -----------------------------------
## IMMIG | 0.14 | [0.12, Inf]
##
## - One-sided CIs: upper bound fixed at [Inf].
Q: Which immigration groups differ in Math scores?
# Note: `unbalanced = TRUE` is robust for unequal group sizes.
tukey_result <- agricolae::HSD.test(anova_output, "IMMIG", group = FALSE, unbalanced = TRUE)
tukey_result$comparison
## difference pvalue signif.
## First-Generation student - Native student 24.86290 0.0000 ***
## First-Generation student - Second-Generation student -16.64893 0.0002 ***
## Native student - Second-Generation student -41.51183 0.0000 ***
## LCL UCL
## First-Generation student - Native student 14.98114 34.744654
## First-Generation student - Second-Generation student -26.53069 -6.767172
## Native student - Second-Generation student -51.39359 -31.630067
Q: Is there a significant difference in Math scores (PV1MATH) among Immigration groups (IMMIG)?
# (Use this when Levene's test fails / variances are unequal)
welch_anova_output <- oneway.test(PV1MATH ~ IMMIG, data = ds, var.equal = FALSE)
welch_anova_output
##
## One-way analysis of means (not assuming equal variances)
##
## data: PV1MATH and IMMIG
## F = 69.716, num df = 2.0, denom df = 1522.2, p-value < 2.2e-16
# Effect Size (Omega Squared for Welch's)
effectsize::omega_squared(model = welch_anova_output)
## # Effect Size for ANOVA
##
## Omega2 | 95% CI
## ---------------------
## 0.08 | [0.06, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
Q: Which groups differ? (Post-hoc for Welch’s ANOVA)
games_howell_result <- rosetta::posthocTGH(y = ds$PV1MATH, x = ds$IMMIG, method = "games-howell")
games_howell_result
## n means variances
## First-Generation student 1065 592 8768
## Native student 4604 567 10733
## Second-Generation student 701 609 9961
## diff ci.lo ci.hi t df
## Native student-First-Generation student -25 -32.5 -17 7.6 1720
## Second-Generation student-First-Generation student 17 5.5 28 3.5 1430
## Second-Generation student-Native student 42 32.0 51 10.2 945
## p
## Native student-First-Generation student <.01
## Second-Generation student-First-Generation student <.01
## Second-Generation student-Native student <.01
Q: Is there an interaction effect of Gender (ST004D01T) and Immigration (IMMIG) on Math scores (PV1MATH)?
# Assumption Check: Homogeneity of Variance (Levene's Test)
car::leveneTest(PV1MATH ~ ST004D01T * IMMIG, data = ds, center = "mean")
## Levene's Test for Homogeneity of Variance (center = "mean")
## Df F value Pr(>F)
## group 5 10.938 0.0000000001678 ***
## 6364
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Two-Way ANOVA (Type III SS)
# Using contr.sum for Type III SS in a non-balanced design
tw_model <- lm(PV1MATH ~ ST004D01T * IMMIG, data = ds,
contrasts = list(ST004D01T = contr.sum, IMMIG = contr.sum))
anova_output_tw <- car::Anova(tw_model, type = "III")
anova_output_tw
## Anova Table (Type III tests)
##
## Response: PV1MATH
## Sum Sq Df F value Pr(>F)
## (Intercept) 1209070007 1 117587.0710 < 2.2e-16 ***
## ST004D01T 204243 1 19.8635 0.00000846 ***
## IMMIG 1367634 2 66.5040 < 2.2e-16 ***
## ST004D01T:IMMIG 14619 2 0.7109 0.4912
## Residuals 65436799 6364
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Effect Sizes (Partial Omega Squared)
effectsize::omega_squared(model = anova_output_tw, partial = TRUE)
## # Effect Size for ANOVA (Type III)
##
## Parameter | Omega2 (partial) | 95% CI
## -------------------------------------------------
## ST004D01T | 2.95e-03 | [0.00, 1.00]
## IMMIG | 0.02 | [0.01, 1.00]
## ST004D01T:IMMIG | 0.00 | [0.00, 1.00]
##
## - One-sided CIs: upper bound fixed at [1.00].
Q: Which specific group combinations are different?
cat("\n--- 3.9. Post-Hoc (Two-Way, Tukey-Kramer) ---\n")
##
## --- 3.9. Post-Hoc (Two-Way, Tukey-Kramer) ---
# Note: We pass the `lm` object 'tw_model' here.
tukey_twoway_result <- agricolae::HSD.test(tw_model, c("ST004D01T", "IMMIG"), group = FALSE, unbalanced = TRUE)
print(tukey_twoway_result$comparison)
## difference
## Female:First-Generation student - Female:Native student 23.1310487
## Female:First-Generation student - Female:Second-Generation student -13.4626958
## Female:First-Generation student - Male:First-Generation student -14.1616182
## Female:First-Generation student - Male:Native student 12.0208081
## Female:First-Generation student - Male:Second-Generation student -34.1377589
## Female:Native student - Female:Second-Generation student -36.5937444
## Female:Native student - Male:First-Generation student -37.2926669
## Female:Native student - Male:Native student -11.1102405
## Female:Native student - Male:Second-Generation student -57.2688076
## Female:Second-Generation student - Male:First-Generation student -0.6989224
## Female:Second-Generation student - Male:Native student 25.4835039
## Female:Second-Generation student - Male:Second-Generation student -20.6750631
## Male:First-Generation student - Male:Native student 26.1824264
## Male:First-Generation student - Male:Second-Generation student -19.9761407
## Male:Native student - Male:Second-Generation student -46.1585670
## pvalue
## Female:First-Generation student - Female:Native student 0.0014
## Female:First-Generation student - Female:Second-Generation student 0.2100
## Female:First-Generation student - Male:First-Generation student 0.1636
## Female:First-Generation student - Male:Native student 0.3311
## Female:First-Generation student - Male:Second-Generation student 0.0000
## Female:Native student - Female:Second-Generation student 0.0000
## Female:Native student - Male:First-Generation student 0.0000
## Female:Native student - Male:Native student 0.4230
## Female:Native student - Male:Second-Generation student 0.0000
## Female:Second-Generation student - Male:First-Generation student 1.0000
## Female:Second-Generation student - Male:Native student 0.0003
## Female:Second-Generation student - Male:Second-Generation student 0.0069
## Male:First-Generation student - Male:Native student 0.0002
## Male:First-Generation student - Male:Second-Generation student 0.0103
## Male:Native student - Male:Second-Generation student 0.0000
## signif.
## Female:First-Generation student - Female:Native student **
## Female:First-Generation student - Female:Second-Generation student
## Female:First-Generation student - Male:First-Generation student
## Female:First-Generation student - Male:Native student
## Female:First-Generation student - Male:Second-Generation student ***
## Female:Native student - Female:Second-Generation student ***
## Female:Native student - Male:First-Generation student ***
## Female:Native student - Male:Native student
## Female:Native student - Male:Second-Generation student ***
## Female:Second-Generation student - Male:First-Generation student
## Female:Second-Generation student - Male:Native student ***
## Female:Second-Generation student - Male:Second-Generation student **
## Male:First-Generation student - Male:Native student ***
## Male:First-Generation student - Male:Second-Generation student *
## Male:Native student - Male:Second-Generation student ***
## LCL
## Female:First-Generation student - Female:Native student 6.164230
## Female:First-Generation student - Female:Second-Generation student -30.429514
## Female:First-Generation student - Male:First-Generation student -31.128437
## Female:First-Generation student - Male:Native student -4.946011
## Female:First-Generation student - Male:Second-Generation student -51.104578
## Female:Native student - Female:Second-Generation student -53.560563
## Female:Native student - Male:First-Generation student -54.259486
## Female:Native student - Male:Native student -28.077059
## Female:Native student - Male:Second-Generation student -74.235626
## Female:Second-Generation student - Male:First-Generation student -17.665741
## Female:Second-Generation student - Male:Native student 8.516685
## Female:Second-Generation student - Male:Second-Generation student -37.641882
## Male:First-Generation student - Male:Native student 9.215608
## Male:First-Generation student - Male:Second-Generation student -36.942959
## Male:Native student - Male:Second-Generation student -63.125386
## UCL
## Female:First-Generation student - Female:Native student 40.097867
## Female:First-Generation student - Female:Second-Generation student 3.504123
## Female:First-Generation student - Male:First-Generation student 2.805200
## Female:First-Generation student - Male:Native student 28.987627
## Female:First-Generation student - Male:Second-Generation student -17.170940
## Female:Native student - Female:Second-Generation student -19.626926
## Female:Native student - Male:First-Generation student -20.325848
## Female:Native student - Male:Native student 5.856578
## Female:Native student - Male:Second-Generation student -40.301989
## Female:Second-Generation student - Male:First-Generation student 16.267896
## Female:Second-Generation student - Male:Native student 42.450323
## Female:Second-Generation student - Male:Second-Generation student -3.708244
## Male:First-Generation student - Male:Native student 43.149245
## Male:First-Generation student - Male:Second-Generation student -3.009322
## Male:Native student - Male:Second-Generation student -29.191748
Q: Is there a significant difference between the four Math subscales
# (Employing, Formulating, Interpreting, Reasoning)?
# Data Prep: Reshape data from wide to long
# We need an ID var (CNTSTUID) and the measures
ds_long <- rstatix::gather(ds,
PV1MPEM, PV1MPFS, PV1MPIN, PV1MPRE,
key = "subscale", value = "score")
#Reshaped data for RM-ANOVA (first 6 rows)
head(ds_long)
## CNTSTUID ST004D01T IMMIG HISCED ESCS PV1MATH
## 1 70200001 Female Native student ISCED level 6 0.1836 639.004
## 2 70200002 Male Native student ISCED level 5 0.8261 697.191
## 3 70200003 Male Native student ISCED level 3.3 -1.0357 693.710
## 4 70200004 Male Native student ISCED level 4 -0.9606 427.317
## 5 70200005 Female Native student ISCED level 5 0.0856 436.462
## 6 70200006 Female First-Generation student ISCED level 7 0.1268 569.982
## PV1READ PV1SCIE HOMEPOS ICTRES TEACHSUP BELONG ANXMAT MATHEFF FEELSAFE
## 1 676.298 710.634 0.7524 0.1940 -0.5635 0.2442 0.3729 0.1429 -0.7560
## 2 625.585 670.646 0.7842 0.6249 0.1475 -0.0437 0.6647 -0.2874 1.1246
## 3 620.116 666.095 0.0666 -0.3987 -0.5635 -0.6137 0.0510 0.2226 0.4417
## 4 381.495 340.308 -0.9300 -0.9028 -0.1002 -1.1147 0.6387 -0.9344 -0.7560
## 5 448.199 456.333 -0.8949 0.2514 1.5558 2.1143 2.5026 -0.9322 1.1246
## 6 469.441 475.158 -0.5988 -0.4733 0.1475 0.5159 -0.7358 0.2637 0.1413
## STRESAGR subscale score
## 1 0.9777 PV1MPEM 604.382
## 2 -0.7402 PV1MPEM 705.040
## 3 -0.2053 PV1MPEM 676.642
## 4 0.2201 PV1MPEM 401.548
## 5 -0.9940 PV1MPEM 437.563
## 6 0.2183 PV1MPEM 528.852
ds_long_sub <- ds_long %>%
dplyr::filter(CNTSTUID %in% unique(ds_long$CNTSTUID)[1:100]) #just 100
# RM ANOVA Test using afex::aov_ez
rm_aov <- afex::aov_ez(id = "CNTSTUID",
dv = "score",
data = ds_long_sub, #just 100
within = "subscale",
include_aov = TRUE,
type = "III")
#RM-ANOVA Summary (Mauchly's Sphericity Test)
srma <- summary(rm_aov, multivariate = FALSE, univariate = TRUE)
print(srma)
##
## Univariate Type III Repeated-Measures ANOVA Assuming Sphericity
##
## Sum Sq num Df Error SS den Df F value Pr(>F)
## (Intercept) 135588373 1 4986162 99 2692.101 <2e-16 ***
## subscale 3854 3 306733 297 1.244 0.2939
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
##
## Mauchly Tests for Sphericity
##
## Test statistic p-value
## subscale 0.93471 0.25234
##
##
## Greenhouse-Geisser and Huynh-Feldt Corrections
## for Departure from Sphericity
##
## GG eps Pr(>F[GG])
## subscale 0.95726 0.2939
##
## HF eps Pr(>F[HF])
## subscale 0.9888774 0.2939534
# Effect Size (Generalized Eta Squared)
effectsize::eta_squared(model = rm_aov, generalized = "subscale")
## # Effect Size for ANOVA (Type III)
##
## Parameter | Eta2 (generalized) | 95% CI
## ---------------------------------------------
## subscale | 7.28e-04 | [0.00, 1.00]
##
## - Observed variables: subscale
## - One-sided CIs: upper bound fixed at [1.00].
# Effect Size (Generalized Omega Squared using MOTE)
try(print(MOTE::omega.partial.SS.rm(
dfm = srma$univariate.tests["subscale", "num Df"],
dfe = srma$univariate.tests["subscale", "den Df"],
msm = srma$univariate.tests["subscale", "Sum Sq"] / srma$univariate.tests["subscale", "num Df"],
mse = srma$univariate.tests["subscale", "Error SS"] / srma$univariate.tests["subscale", "den Df"],
mss = srma$univariate.tests["(Intercept)", "Error SS"] / srma$univariate.tests["(Intercept)", "den Df"],
ssm = srma$univariate.tests["subscale", "Sum Sq"],
sse = srma$univariate.tests["subscale", "Error SS"],
sss = srma$univariate.tests["(Intercept)", "Error SS"],
a = .05
)))
## $omega
## [1] 0.0001413607
##
## $omegalow
## [1] 0
##
## $omegahigh
## [1] 0.9999952
##
## $dfm
## [1] 3
##
## $dfe
## [1] 297
##
## $F
## [1] 1.243962
##
## $p
## [1] 0.2939455
##
## $estimate
## [1] "$\\omega^2_{p}$ = 0.00, 95\\% CI [0.00, 1.00]"
##
## $statistic
## [1] "$F$(3, 297) = 1.24, $p$ = .294"
Q: Which math subscales are different from each other?
# Note: slide uses p.adjust.method = "none".
# "holm" or "bonferroni" is generally recommended for multiple comparisons.
post_hoc_rm <- rstatix::pairwise_t_test(ds_long, score ~ subscale,
paired = TRUE,
p.adjust.method = "none")
post_hoc_rm
## # A tibble: 6 × 10
## .y. group1 group2 n1 n2 statistic df p p.adj p.adj.signif
## * <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 score PV1MP… PV1MP… 6606 6606 7.22 6605 5.74e-13 5.74e-13 ****
## 2 score PV1MP… PV1MP… 6606 6606 3.71 6605 2.08e- 4 2.08e- 4 ***
## 3 score PV1MP… PV1MP… 6606 6606 9.94 6605 4.11e-23 4.11e-23 ****
## 4 score PV1MP… PV1MP… 6606 6606 -3.42 6605 6.4 e- 4 6.4 e- 4 ***
## 5 score PV1MP… PV1MP… 6606 6606 2.37 6605 1.8 e- 2 1.8 e- 2 *
## 6 score PV1MP… PV1MP… 6606 6606 5.68 6605 1.4 e- 8 1.4 e- 8 ****
Q: Is there a difference in Math score distributions between Genders?
# --- Wilcoxon-Mann-Whitney U Test (Independent Samples) ---
# Non-parametric alternative to the independent t-test.
#
wilcox_result <- wilcox.test(PV1MATH ~ ST004D01T, data = ds)
wilcox_result
##
## Wilcoxon rank sum test with continuity correction
##
## data: PV1MATH by ST004D01T
## W = 5069037, p-value = 0.0000007053
## alternative hypothesis: true location shift is not equal to 0
Q: Is there a difference in paired distributions between Math and Reading scores?
# --- Wilcoxon Signed-Rank Test (Paired Samples) ---
# Non-parametric alternative to the paired t-test.
#
paired_wilcox_result <- wilcox.test(ds$PV1MATH, ds$PV1READ, paired = TRUE)
paired_wilcox_result
##
## Wilcoxon signed rank test with continuity correction
##
## data: ds$PV1MATH and ds$PV1READ
## V = 16329735, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
Q: Is there a difference in Math score distributions among the immigration groups?
# --- Kruskal-Wallis Test ---
# Non-parametric alternative to the One-Way ANOVA.
kruskal_result <- kruskal.test(PV1MATH ~ IMMIG, data = ds)
kruskal_result
##
## Kruskal-Wallis rank sum test
##
## data: PV1MATH by IMMIG
## Kruskal-Wallis chi-squared = 115.01, df = 2, p-value < 2.2e-16
Q: Can we predict a student’s Math score (PV1MATH) using their ESCS?
model_simple <- lm(PV1MATH ~ ESCS, data = ds)
summary(model_simple)
##
## Call:
## lm(formula = PV1MATH ~ ESCS, data = ds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -313.16 -63.84 3.76 64.60 325.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 559.913 1.219 459.39 <2e-16 ***
## ESCS 51.638 1.382 37.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 93.2 on 6557 degrees of freedom
## (47 observations deleted due to missingness)
## Multiple R-squared: 0.1755, Adjusted R-squared: 0.1754
## F-statistic: 1396 on 1 and 6557 DF, p-value: < 2.2e-16
Q: Can we predict a student’s Math score using their ESCS, Gender, and Immigration status?
model_multiple <- lm(PV1MATH ~ ESCS + ST004D01T + IMMIG, data = ds)
summary(model_multiple)
##
## Call:
## lm(formula = PV1MATH ~ ESCS + ST004D01T + IMMIG, data = ds)
##
## Residuals:
## Min 1Q Median 3Q Max
## -316.69 -63.00 4.90 63.96 315.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 557.942 3.173 175.865 < 2e-16 ***
## ESCS 50.755 1.416 35.845 < 2e-16 ***
## ST004D01TMale 13.734 2.319 5.924 0.00000000331 ***
## IMMIGNative student -8.397 3.178 -2.642 0.00826 **
## IMMIGSecond-Generation student 19.956 4.501 4.434 0.00000942220 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 92.47 on 6361 degrees of freedom
## (240 observations deleted due to missingness)
## Multiple R-squared: 0.1884, Adjusted R-squared: 0.1879
## F-statistic: 369.1 on 4 and 6361 DF, p-value: < 2.2e-16