Matej Suhalj

Research Question:
Are there differences in math, reading and writing exams?

mydata <- read.table("./original_data4.csv", header = TRUE, sep = ",")
head(mydata)

##   ID Gender MathScore ReadingScore WritingScore
## 1  0 female        72           72           74
## 2  1 female        69           90           88
## 3  2 female        90           95           93
## 4  3   male        47           57           44
## 5  4   male        76           78           75
## 6  5 female        71           83           78

Unit of observation: one student

Description of data:

ID: student identificator
Gender: male, female
MathScore: results on a math exam
ReadingScore: results on a reading exam
WritingScore: results on a writing exam

The data set was taken from Kaggle.com (Students exam scores: Extended dataset) and the sample size is 999 students.

library(psych)
mydata$GenderF <- factor(mydata$Gender,
                         levels = c("male", "female"),
                         labels = c(0, 1))
head(mydata)

##   ID Gender MathScore ReadingScore WritingScore GenderF
## 1  0 female        72           72           74       1
## 2  1 female        69           90           88       1
## 3  2 female        90           95           93       1
## 4  3   male        47           57           44       0
## 5  4   male        76           78           75       0
## 6  5 female        71           83           78       1

library(rstatix)

## Warning: package 'rstatix' was built under R version 4.3.2

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks rstatix::filter(), stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

#Wide format of data is changed to long format.
mydata_long <- mydata %>%
                 pivot_longer(
                   cols = c("MathScore", "ReadingScore", "WritingScore"),
                   names_to = "Exams", 
                   values_to = "Points") %>%
                 convert_as_factor(Exams)


mydata_long <- as.data.frame(mydata_long)

tail(mydata_long)

##       ID Gender GenderF        Exams Points
## 2995 998 female       1    MathScore     68
## 2996 998 female       1 ReadingScore     78
## 2997 998 female       1 WritingScore     77
## 2998 999 female       1    MathScore     77
## 2999 999 female       1 ReadingScore     86
## 3000 999 female       1 WritingScore     86

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.3.2

#Boxplot for each variable.
ggboxplot(mydata_long, 
          x = "Exams", 
          y = "Points", 
          add = "jitter")

library(tidyverse)
library(ggpubr)
library(rstatix)

#Finding outliers.
mydata_long %>%
  group_by(Exams) %>%
  identify_outliers(`Points`)

## # A tibble: 19 × 7
##    Exams           ID Gender GenderF Points is.outlier is.extreme
##    <fct>        <int> <chr>  <fct>    <int> <lgl>      <lgl>     
##  1 MathScore       17 female 1           18 TRUE       FALSE     
##  2 MathScore       59 female 1            0 TRUE       FALSE     
##  3 MathScore      145 female 1           22 TRUE       FALSE     
##  4 MathScore      338 female 1           24 TRUE       FALSE     
##  5 MathScore      466 female 1           26 TRUE       FALSE     
##  6 MathScore      787 female 1           19 TRUE       FALSE     
##  7 MathScore      842 female 1           23 TRUE       FALSE     
##  8 MathScore      980 female 1            8 TRUE       FALSE     
##  9 ReadingScore    59 female 1           17 TRUE       FALSE     
## 10 ReadingScore    76 male   0           26 TRUE       FALSE     
## 11 ReadingScore   211 male   0           28 TRUE       FALSE     
## 12 ReadingScore   327 male   0           23 TRUE       FALSE     
## 13 ReadingScore   596 male   0           24 TRUE       FALSE     
## 14 ReadingScore   980 female 1           24 TRUE       FALSE     
## 15 WritingScore    59 female 1           10 TRUE       FALSE     
## 16 WritingScore    76 male   0           22 TRUE       FALSE     
## 17 WritingScore   327 male   0           19 TRUE       FALSE     
## 18 WritingScore   596 male   0           15 TRUE       FALSE     
## 19 WritingScore   980 female 1           23 TRUE       FALSE

#Removing outliers.
mydata_long <- mydata_long %>%
  filter(!ID == (980))
mydata_long <- mydata_long %>%
  filter(!ID == (17))
mydata_long <- mydata_long %>%
  filter(!ID == (59))
mydata_long <- mydata_long %>%
  filter(!ID == (145))
mydata_long <- mydata_long %>%
  filter(!ID == (338))
mydata_long <- mydata_long %>%
  filter(!ID == (466))
mydata_long <- mydata_long %>%
  filter(!ID == (787))
mydata_long <- mydata_long %>%
  filter(!ID == (842))
mydata_long <- mydata_long %>%
  filter(!ID == (76))
mydata_long <- mydata_long %>%
  filter(!ID == (211))
mydata_long <- mydata_long %>%
  filter(!ID == (327))
mydata_long <- mydata_long %>%
  filter(!ID == (596))

#Checking if there are anymore outliers
mydata_long %>%
  group_by(Exams) %>%
  identify_outliers(`Points`)

## # A tibble: 2 × 7
##   Exams           ID Gender GenderF Points is.outlier is.extreme
##   <fct>        <int> <chr>  <fct>    <int> <lgl>      <lgl>     
## 1 ReadingScore   601 female 1           29 TRUE       FALSE     
## 2 ReadingScore   896 male   0           29 TRUE       FALSE

mydata_long <- mydata_long %>%
  filter(!ID == (601))
mydata_long <- mydata_long %>%
  filter(!ID == (896))

#Checking if there are anymore outliers
mydata_long %>%
  group_by(Exams) %>%
  identify_outliers(`Points`)

## [1] Exams      ID         Gender     GenderF    Points     is.outlier is.extreme
## <0 rows> (or 0-length row.names)

tail(mydata_long)

##       ID Gender GenderF        Exams Points
## 2953 998 female       1    MathScore     68
## 2954 998 female       1 ReadingScore     78
## 2955 998 female       1 WritingScore     77
## 2956 999 female       1    MathScore     77
## 2957 999 female       1 ReadingScore     86
## 2958 999 female       1 WritingScore     86

Assumptions:
- All variables are numeric.
- Normality: The variables in the population are normally distributed (Shapiro-Wilk test).
- Sphericity: the variances of the differences between the examines variables are equal (Mauchly test for sphericity).
- No significant outliers.

library(rstatix)

#Checking normality with Shapiro-Wilk test
mydata_long %>%
  group_by(Exams) %>%
  shapiro_test(Points)

## # A tibble: 3 × 4
##   Exams        variable statistic         p
##   <fct>        <chr>        <dbl>     <dbl>
## 1 MathScore    Points       0.996 0.0117   
## 2 ReadingScore Points       0.994 0.000537 
## 3 WritingScore Points       0.992 0.0000500

Variable Points needs to be normally distributed within each of the stated Exams.
Example of MathScore:
H0: Variable Points in normally distributed within MathScore.
H1: Variable Points in NOT normally distributed within MathScore.

H0 of WritingScore is rejected at p<0.05, therefore normality is violated and I will use Friedman ANOVA.

library(rstatix)

#Descriptive statistics for each variable.
mydata_long %>%
  group_by(Exams) %>%
  get_summary_stats(Points, type = "common")

## # A tibble: 3 × 11
##   Exams        variable     n   min   max median   iqr  mean    sd    se    ci
##   <fct>        <fct>    <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MathScore    Points     986    27   100   66.5    20  66.7  14.3 0.457 0.896
## 2 ReadingScore Points     986    31   100   70      20  69.7  13.9 0.443 0.869
## 3 WritingScore Points     986    32   100   69      21  68.6  14.4 0.459 0.902

Because I will use FriedmanANOVA (non-parametric test) I need to compare medians between those 3 exams and not means.

Friedman ANOVA

library(rstatix)

#Friedman ANOVA.
FriedmanANOVA <- friedman_test(Points ~ Exams | ID,
                               data = mydata_long)

#Results
FriedmanANOVA

## # A tibble: 1 × 6
##   .y.        n statistic    df        p method       
## * <chr>  <int>     <dbl> <dbl>    <dbl> <chr>        
## 1 Points   986      99.2     2 2.92e-22 Friedman test

H0: Location distribution of all 3 variables are the same.
H1: At least 1 location distribution is different.

I reject H0 at p<0.001.

library(effectsize)

## 
## Attaching package: 'effectsize'

## The following objects are masked from 'package:rstatix':
## 
##     cohens_d, eta_squared

## The following object is masked from 'package:psych':
## 
##     phi

effectsize::kendalls_w(Points ~ Exams | ID,
                       data = mydata_long)

## Warning: 156 block(s) contain ties, some containing only 1 unique ranking.

## Kendall's W |       95% CI
## --------------------------
## 0.05        | [0.03, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].

interpret_kendalls_w(0.05)

## [1] "slight agreement"
## (Rules: landis1977)

There is slight agrement in effect size, which just confirms my hyposthesis testing where I rejected H0.

library(rstatix)

#Wilcoxon signed rank tests - comparing all possible paires.
paires_nonpar <- wilcox_test(Points ~ Exams, 
                             paired = TRUE, 
                             p.adjust.method = "bonferroni",
                             data = mydata_long)

paires_nonpar

## # A tibble: 3 × 9
##   .y.    group1      group2    n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>  <chr>       <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 Points MathScore   Readi…   986   986   137718. 1.20e-23 3.60e-23 ****        
## 2 Points MathScore   Writi…   986   986   173838. 5.67e-10 1.7 e- 9 ****        
## 3 Points ReadingSco… Writi…   986   986   259886. 3.47e-13 1.04e-12 ****

All the p values are statistically significant. No location distributions in all 3 combinations are the same.

library(rstatix)
comparisons <- paires_nonpar %>% 
                 add_y_position(fun = "median", step.increase = 0.35)

library(ggpubr)
ggboxplot(mydata_long, x = "Exams", y = "Points", add = "point", ylim=c(0, 18)) +
  stat_pvalue_manual(comparisons, hide.ns = FALSE) +
  stat_summary(fun = median, geom = "point", shape = 16, size = 4,
               aes(group = Exams), color = "darkred",
               position = position_dodge(width = 0.8)) +
  stat_summary(fun = median, colour = "darkred", 
               position = position_dodge(width = 0.8),
               geom = "text", vjust = 0.5, hjust = -8,
               aes(label = round(after_stat(y), digits = 2), group = Exams)) +
  labs(subtitle = get_test_label(FriedmanANOVA,  detailed = TRUE),
       caption = get_pwc_label(comparisons))

Conclusion:
Based on the sample data, I found that there are differences between math, reading and writing exams (p<0.001). The effect size is 0,05, which means there is slight agreement in the statement that there is no difference between those 3 exams.

Example 2

2024-01-06

Matej Suhalj

Friedman ANOVA