Data import

mydata <- read.table("~/Desktop/study_performance.csv",
                     header = TRUE, sep = ",", dec = ".")
mydata$ID <- 1:nrow(mydata)
head(mydata)
##   gender race_ethnicity parental_level_of_education        lunch
## 1 female        group B           bachelor's degree     standard
## 2 female        group C                some college     standard
## 3 female        group B             master's degree     standard
## 4   male        group A          associate's degree free/reduced
## 5   male        group C                some college     standard
## 6 female        group B          associate's degree     standard
##   test_preparation_course math_score reading_score writing_score ID
## 1                    none         72            72            74  1
## 2               completed         69            90            88  2
## 3                    none         90            95            93  3
## 4                    none         47            57            44  4
## 5                    none         76            78            75  5
## 6                    none         71            83            78  6

Data description

Data manipulation

colnames(mydata) [1] <- ("Gender")
colnames(mydata) [2] <- ("Background")
colnames(mydata) [3] <- ("P.education")
colnames(mydata) [4] <- ("Meal quality")
colnames(mydata) [5] <- ("Preparation")
colnames(mydata) [6] <- ("Math")
colnames(mydata) [7] <- ("Reading")
colnames(mydata) [8] <- ("Writing")
colnames(mydata) [9] <- ("id")
head(mydata)
##   Gender Background        P.education Meal quality Preparation Math Reading
## 1 female    group B  bachelor's degree     standard        none   72      72
## 2 female    group C       some college     standard   completed   69      90
## 3 female    group B    master's degree     standard        none   90      95
## 4   male    group A associate's degree free/reduced        none   47      57
## 5   male    group C       some college     standard        none   76      78
## 6 female    group B associate's degree     standard        none   71      83
##   Writing id
## 1      74  1
## 2      88  2
## 3      93  3
## 4      44  4
## 5      75  5
## 6      78  6
mydata$ID <- seq(1, nrow(mydata))
# For my own convenience, I changed words to abbreviations.
mydata$Gender <- factor(mydata$Gender, 
                             levels = c("female", "male"), 
                             labels = c("F", "M"))
mydata$Background <- factor(mydata$Background, 
                             levels = c("group A", "group B", "group C", "group D"), 
                             labels = c("A", "B", "C", "D"))
mydata$P.education <- factor(mydata$P.education, 
                             levels = c("bachelor's degree", " master's degree", "associates's degree", "some college", "high school"), 
                             labels = c("BD", "MD", "AD", "CD", "HS"))

Descriptive statistics

library(pastecs)
round(stat.desc(mydata[ , c(6,7,8)]), 2)
##                  Math  Reading  Writing
## nbr.val       1000.00  1000.00  1000.00
## nbr.null         1.00     0.00     0.00
## nbr.na           0.00     0.00     0.00
## min              0.00    17.00    10.00
## max            100.00   100.00   100.00
## range          100.00    83.00    90.00
## sum          66089.00 69169.00 68054.00
## median          66.00    70.00    69.00
## mean            66.09    69.17    68.05
## SE.mean          0.48     0.46     0.48
## CI.mean.0.95     0.94     0.91     0.94
## var            229.92   213.17   230.91
## std.dev         15.16    14.60    15.20
## coef.var         0.23     0.21     0.22

Research question 1: Are there differences in how successful students were on math, reading and writing test?

#install.packages("rstatix")
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
#install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter()  masks rstatix::filter(), stats::filter()
## ✖ dplyr::first()   masks pastecs::first()
## ✖ dplyr::lag()     masks stats::lag()
## ✖ dplyr::last()    masks pastecs::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
mydata_long <- mydata %>% pivot_longer(cols = c("Math", "Reading", "Writing"),
                                       names_to = "Tests",
                                       values_to = "Scores") %>% convert_as_factor(Tests)
mydata_long <- as.data.frame(mydata_long[ ,c(6,8,9)])

tail(mydata_long,10)
##        id   Tests Scores
## 2991  997 Writing     55
## 2992  998    Math     59
## 2993  998 Reading     71
## 2994  998 Writing     65
## 2995  999    Math     68
## 2996  999 Reading     78
## 2997  999 Writing     77
## 2998 1000    Math     77
## 2999 1000 Reading     86
## 3000 1000 Writing     86
#install.packages("ggpubr")
library(ggpubr)
ggboxplot(mydata_long,
          x="Tests",
          y="Scores",
           add= "jitter")

library(rstatix)
mydata_long %>%
  group_by(Tests) %>%
  get_summary_stats(Scores, type = "common")
## # A tibble: 3 × 11
##   Tests   variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>   <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Math    Scores    1000     0   100     66  20    66.1  15.2 0.479 0.941
## 2 Reading Scores    1000    17   100     70  20    69.2  14.6 0.462 0.906
## 3 Writing Scores    1000    10   100     69  21.2  68.1  15.2 0.481 0.943
mydata_long %>%
  group_by(Tests) %>%
  shapiro_test(Scores)
## # A tibble: 3 × 4
##   Tests   variable statistic         p
##   <fct>   <chr>        <dbl>     <dbl>
## 1 Math    Scores       0.993 0.000145 
## 2 Reading Scores       0.993 0.000106 
## 3 Writing Scores       0.992 0.0000292
ANOVA_results <- anova_test(dv = Scores,
                            wid = id,
                            within = Tests,
                            data = mydata_long)
ANOVA_results
## ANOVA Table (type III tests)
## 
## $ANOVA
##   Effect DFn  DFd      F        p p<.05   ges
## 1  Tests   2 1998 75.783 1.89e-32     * 0.007
## 
## $`Mauchly's Test for Sphericity`
##   Effect     W         p p<.05
## 1  Tests 0.529 9.61e-139     *
## 
## $`Sphericity Corrections`
##   Effect  GGe        DF[GG]    p[GG] p[GG]<.05  HFe        DF[HF]    p[HF]
## 1  Tests 0.68 1.36, 1358.21 5.89e-23         * 0.68 1.36, 1359.42 5.65e-23
##   p[HF]<.05
## 1         *
get_anova_table(ANOVA_results, correction = "auto")
## ANOVA Table (type III tests)
## 
##   Effect  DFn     DFd      F        p p<.05   ges
## 1  Tests 1.36 1358.21 75.783 5.89e-23     * 0.007
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared
interpret_eta_squared(0.007, rules = "cohen1992")
## [1] "very small"
## (Rules: cohen1992)
#install.packages("rstatix")
library(rstatix)
pwc <- mydata_long %>%
pairwise_t_test(Scores ~ Tests,
paired = TRUE,
p.adjust.method =
"bonferroni")
pwc
## # A tibble: 3 × 10
##   .y.   group1 group2    n1    n2 statistic    df        p    p.adj p.adj.signif
## * <chr> <chr>  <chr>  <int> <int>     <dbl> <dbl>    <dbl>    <dbl> <chr>       
## 1 Scor… Math   Readi…  1000  1000    -10.8    999 7.32e-26 2.20e-25 ****        
## 2 Scor… Math   Writi…  1000  1000     -6.52   999 1.14e-10 3.42e-10 ****        
## 3 Scor… Readi… Writi…  1000  1000      7.79   999 1.71e-14 5.13e-14 ****
library(rstatix)
pwc <- pwc %>%
add_y_position (fun = "median", step.increase
= 0.35)
library (ggpubr)
ggboxplot (mydata_long, x = "Tests", y = "Scores", add = "point", ylim=c(0, 18)) +
  stat_pvalue_manual (pwc, hide.ns = FALSE) +
  stat_summary (fun = mean, geom = "point", shape = 16, size = 4, aes(group = Tests), color = "darkslategray", position = position_dodge(width = 0.8)) +
stat_summary(fun = mean, colour = "darkslategray",
             position = position_dodge(width = 0.8),
geom = "text", just = 0.5, hjust = -2,
aes (label = round (after_stat(y), digits = 2), group = Tests)) +
labs (subtitle = get_test_label(ANOVA_results, detailed = TRUE),
      caption = get_pwc_label(pwc))
## Warning in stat_summary(fun = mean, colour = "darkslategray", position =
## position_dodge(width = 0.8), : Ignoring unknown parameters: `just`

library(rstatix)
paires_nonpar <- wilcox_test(Scores ~ Tests,
                             paired = TRUE,
                             p.adjust.method = "bonferroni",
                             data=mydata_long)
paires_nonpar
## # A tibble: 3 × 9
##   .y.    group1  group2     n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>  <chr>   <chr>   <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 Scores Math    Reading  1000  1000   140794. 2.39e-24 7.17e-24 ****        
## 2 Scores Math    Writing  1000  1000   178754. 3.57e-10 1.07e- 9 ****        
## 3 Scores Reading Writing  1000  1000   269903  3.82e-14 1.15e-13 ****
library(rstatix)
FriedmanANOVA <- friedman_test(Scores~ Tests|id,
                               data=mydata_long)
FriedmanANOVA
## # A tibble: 1 × 6
##   .y.        n statistic    df        p method       
## * <chr>  <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 Scores  1000      103.     2 5.36e-23 Friedman test
library(effectsize)
effectsize:: kendalls_w(Scores ~ Tests | id,
                        data=mydata_long)
## Warning: 157 block(s) contain ties, some containing only 1 unique ranking.
## Kendall's W |       95% CI
## --------------------------
## 0.05        | [0.04, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
interpret_kendalls_w(0.05)
## [1] "slight agreement"
## (Rules: landis1977)
library(rstatix)
comparisons <- paires_nonpar %>%
add_y_position(fun = "median", step.increase = 0.35)

library (ggpubr)
ggboxplot(mydata_long, x = "Tests", y = "Scores", add = "point", ylim= c(0, 18)) +
  stat_pvalue_manual (comparisons, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16,size= 4,
               aes (group= Tests), color = "darkseagreen4", 
               position=position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkseagreen4",
               position=position_dodge(width = 0.8),
               geom = "text", just = 0.5, hjust = -8,
               aes (label = round(after_stat(y), digits = 2), group = Tests)) +
labs (subtitle=get_test_label(FriedmanANOVA, detailed = TRUE),
caption = get_pwc_label (comparisons))
## Warning in stat_summary(fun = median, colour = "darkseagreen4", position =
## position_dodge(width = 0.8), : Ignoring unknown parameters: `just`

Research question 2: Is there correlation between scores in Math and Reading tests?

mydata[1:10 , c(6,7,9)]
##    Math Reading id
## 1    72      72  1
## 2    69      90  2
## 3    90      95  3
## 4    47      57  4
## 5    76      78  5
## 6    71      83  6
## 7    88      95  7
## 8    40      43  8
## 9    64      64  9
## 10   38      60 10
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
scatterplotMatrix(mydata[ , c(6,7)], smooth=FALSE)

H0: ρ(m,r)=o

library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(as.matrix(mydata[ ,c(6,7)]),
      type="pearson")
##         Math Reading
## Math    1.00    0.82
## Reading 0.82    1.00
## 
## n= 1000 
## 
## 
## P
##         Math Reading
## Math          0     
## Reading  0