This comprehensive analysis follows the CRISP-DM (Cross-Industry Standard Process for Data Mining) methodology to investigate factors affecting mathematics performance in Ekiti State schools. I examined six specific research questions using both descriptive and inferential statistical approaches to understand the relationships between continuous assessment frequency, demographic factors, and Mathematics Unified Examination scores in Ekiti State.
The primary objective of this analysis is to understand the factors that influence mathematics performance in schools.
Question: How often do Continuous Assessments occur in mathematics classes? Purpose: To quantify and describe the distribution of continuous assessment frequency across the dataset.
Question: What is the overall distribution and characteristics of Mathematics Unified Examination scores? Purpose: To provide descriptive statistics and understand the central tendencies and variability in mathematics performance.
Question: Does the frequency of Continuous Assessments affect Unified Examination scores? Null Hypothesis (H₀₃): There is no mean difference in UE scores across different CA frequency groups. Alternative Hypothesis (H₁₃): At least one group differs significantly in mean UE scores. Statistical Test: One-way ANOVA (α = 0.05); because only two CA frequency groups (2 and 3) occur in the data, the comparison is carried out below as an independent samples (Welch) t-test.
Question: Are there significant gender differences in Mathematics Unified Examination scores? Null Hypothesis (H₀₄): Mean UE scores are equal between male and female students. Alternative Hypothesis (H₁₄): Mean UE scores differ significantly between genders. Statistical Test: Independent samples t-test (α = 0.05)
Question: Do students from urban and rural locations perform differently on Mathematics Unified Examinations? Null Hypothesis (H₀₅): Mean UE scores are equal between urban and rural students. Alternative Hypothesis (H₁₅): Mean UE scores differ significantly between locations. Statistical Test: Independent samples t-test (α = 0.05)
Question: Is there a significant difference in Mathematics performance between public and private schools? Null Hypothesis (H₀₆): Mean UE scores are equal between public and private schools. Alternative Hypothesis (H₁₆): Mean UE scores differ significantly between school types. Statistical Test: Independent samples t-test (α = 0.05)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(broom)
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
# Location of the merged school workbook. The original absolute path stays the
# default so existing runs are unchanged; set the SCHOOL_DATA_PATH environment
# variable to read a copy stored somewhere else.
data_path <- Sys.getenv(
  "SCHOOL_DATA_PATH",
  unset = "C:/Users/MUSAAB-TECH/OneDrive/Ado/merged_school_data.xlsx"
)
# Fail early with a readable message rather than a low-level reader error.
if (!file.exists(data_path)) {
  stop("Data file not found: ", data_path, call. = FALSE)
}
raw_data <- read_excel(data_path)
# Preview the first rows of the imported sheet.
head(raw_data)
## # A tibble: 6 × 9
## `STUDENT S/N` Schools `School Names` `School Type` `School Location` GENDER
## <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 1 School A Muslim Grammar … Public Urban F
## 2 2 School A Muslim Grammar … Public Urban M
## 3 3 School A Muslim Grammar … Public Urban F
## 4 4 School A Muslim Grammar … Public Urban M
## 5 5 School A Muslim Grammar … Public Urban M
## 6 6 School A Muslim Grammar … Public Urban M
## # ℹ 3 more variables: `MATH SCORE` <dbl>, AGE <dbl>,
## # `FREQUENCY OF CONTINOUS ASSESSMENT` <dbl>
# Show each column's storage type and leading values to check how the sheet
# was parsed.
str(raw_data)
## tibble [2,135 × 9] (S3: tbl_df/tbl/data.frame)
## $ STUDENT S/N : num [1:2135] 1 2 3 4 5 6 7 8 9 10 ...
## $ Schools : chr [1:2135] "School A" "School A" "School A" "School A" ...
## $ School Names : chr [1:2135] "Muslim Grammar School Ado" "Muslim Grammar School Ado" "Muslim Grammar School Ado" "Muslim Grammar School Ado" ...
## $ School Type : chr [1:2135] "Public" "Public" "Public" "Public" ...
## $ School Location : chr [1:2135] "Urban" "Urban" "Urban" "Urban" ...
## $ GENDER : chr [1:2135] "F" "M" "F" "M" ...
## $ MATH SCORE : num [1:2135] 66 67 71 69 67 69 71 72 69 68 ...
## $ AGE : num [1:2135] 15 18 15 15 18 16 15 14 14 16 ...
## $ FREQUENCY OF CONTINOUS ASSESSMENT: num [1:2135] 3 3 3 3 3 3 3 3 3 3 ...
# Per-column summary: quantiles for numeric columns, length/class for
# character columns.
summary(raw_data)
## STUDENT S/N Schools School Names School Type
## Min. : 1.0 Length:2135 Length:2135 Length:2135
## 1st Qu.: 46.0 Class :character Class :character Class :character
## Median : 92.0 Mode :character Mode :character Mode :character
## Mean :100.8
## 3rd Qu.:144.0
## Max. :558.0
## School Location GENDER MATH SCORE AGE
## Length:2135 Length:2135 Min. : 6.00 Min. :14.00
## Class :character Class :character 1st Qu.:58.00 1st Qu.:16.00
## Mode :character Mode :character Median :66.00 Median :16.00
## Mean :64.29 Mean :16.46
## 3rd Qu.:68.00 3rd Qu.:17.00
## Max. :88.00 Max. :20.00
## FREQUENCY OF CONTINOUS ASSESSMENT
## Min. :2.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.628
## 3rd Qu.:3.000
## Max. :3.000
# Number of rows and columns in the imported data.
dim(raw_data)
## [1] 2135 9
# Total number of missing cells over the whole table (cells, not rows).
missing_case <- length(which(is.na(raw_data)))
missing_case
## [1] 2
# Work on an alias of the raw table.
df_dup <- raw_data
# Rows that repeat an earlier row in every column: total rows minus the rows
# that survive de-duplication.
n_dup_rows <- nrow(df_dup) - nrow(unique(df_dup))
cat("Number of fully duplicated rows:", n_dup_rows, "\n")
## Number of fully duplicated rows: 0
# Names of the columns that contain at least one missing value.
names(raw_data)[vapply(raw_data, anyNA, logical(1))]
## [1] "GENDER"
# Columns left out of the distinct-value listing: the serial number, score,
# age and the two school identifier columns.
drop_cols <- c("STUDENT S/N", "MATH SCORE", "AGE", "Schools", "School Names")
# Keep the columns whose names are not in drop_cols, in their original order.
df_filt <- raw_data[, setdiff(names(raw_data), drop_cols)]
# One vector of distinct values per remaining column.
uniq_vals <- purrr::map(df_filt, unique)
# Print each column name as a banner, followed by its distinct values.
for (nm in names(uniq_vals)) {
  cat(sprintf("\n--- %s ---\n", nm))
  print(uniq_vals[[nm]])
}
##
## --- School Type ---
## [1] "Public" "Private"
##
## --- School Location ---
## [1] "Urban" "Rural"
##
## --- GENDER ---
## [1] "F" "M" NA
##
## --- FREQUENCY OF CONTINOUS ASSESSMENT ---
## [1] 3 2
library(janitor) # To clean_names()
##
## Attaching package: 'janitor'
## The following object is masked from 'package:rstatix':
##
## make_clean_names
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# Standardise the column names to snake_case.
df <- clean_names(raw_data)
# Convert the categorical columns to factors. Gender gets explicit levels so
# that "F" is always the first level and "M" the second.
df <- mutate(
  df,
  across(c(school_type, school_location), as.factor),
  gender = factor(gender, levels = c("F", "M"))
)
summary(df)
## student_s_n schools school_names school_type
## Min. : 1.0 Length:2135 Length:2135 Private: 74
## 1st Qu.: 46.0 Class :character Class :character Public :2061
## Median : 92.0 Mode :character Mode :character
## Mean :100.8
## 3rd Qu.:144.0
## Max. :558.0
## school_location gender math_score age
## Rural: 345 F :1082 Min. : 6.00 Min. :14.00
## Urban:1790 M :1051 1st Qu.:58.00 1st Qu.:16.00
## NA's: 2 Median :66.00 Median :16.00
## Mean :64.29 Mean :16.46
## 3rd Qu.:68.00 3rd Qu.:17.00
## Max. :88.00 Max. :20.00
## frequency_of_continous_assessment
## Min. :2.000
## 1st Qu.:2.000
## Median :3.000
## Mean :2.628
## 3rd Qu.:3.000
## Max. :3.000
# Confirm that gender is now stored as a factor.
class(df$gender) # To confirm if conversion worked
## [1] "factor"
# Long-format table of the numeric measures: one row per (variable, value)
# pair, with the serial number and age left out.
num_long <- df %>%
  select(where(is.numeric)) %>%
  select(!c(student_s_n, age)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )
# One horizontal box per variable; outlying points are drawn in red.
ggplot(data = num_long, mapping = aes(x = variable, y = value)) +
  geom_boxplot(fill = "skyblue", alpha = 0.7, outlier.colour = "red") +
  labs(
    title = "Box-plots of all numeric variables (outliers in red)",
    x = NULL,
    y = "Value"
  ) +
  theme_minimal(base_size = 13) +
  # Flip so the variable names are listed along the vertical axis.
  coord_flip()
### Full summary statistics
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Descriptive statistics for the numeric columns other than the serial number
# and age.
summary_table <- psych::describe(
  select(df, where(is.numeric), -student_s_n, -age)
)
print(summary_table)
## vars n mean sd median trimmed mad min
## math_score 1 2135 64.29 7.22 66 64.38 5.93 6
## frequency_of_continous_assessment 2 2135 2.63 0.48 3 2.66 0.00 2
## max range skew kurtosis se
## math_score 88 82 -0.40 2.13 0.16
## frequency_of_continous_assessment 3 1 -0.53 -1.72 0.01
# Plotting copy of the cleaned data with two short aliases (ca_freq, math)
# appended; gender is passed through factor() again.
df_plot <- mutate(
  df,
  ca_freq = frequency_of_continous_assessment,
  math = math_score,
  gender = factor(gender)
)
# Gender distribution: count and share per gender, largest group first. Rows
# with a missing gender are left out of the tally.
gender_counts <- df_plot %>%
  drop_na(gender) %>%
  count(gender, name = "n", sort = TRUE) %>%
  mutate(
    pct = n / sum(n),
    label = scales::percent(pct, accuracy = 1)
  )
# Pie chart of the gender shares: a single stacked bar wrapped onto polar
# coordinates, with each slice labelled by its percentage.
gender_counts %>%
  ggplot(mapping = aes(x = "", y = pct, fill = gender)) +
  geom_col(colour = "white", width = 1) +
  geom_text(
    mapping = aes(label = label),
    position = position_stack(vjust = 0.5), # centre each label in its slice
    size = 4,
    fontface = "bold",
    colour = "white"
  ) +
  coord_polar(theta = "y") +
  scale_fill_brewer(palette = "Dark2") +
  labs(
    title = "Gender Distribution (Pie Chart)",
    fill = "Gender"
  ) +
  theme_void(base_size = 14)
# Maths scores by gender: jittered raw points with a translucent box plot on
# top. Rows with a missing gender or score are left out.
df_plot %>%
  drop_na(gender, math_score) %>%
  mutate(gender = factor(gender)) %>%
  ggplot(mapping = aes(x = gender, y = math_score, colour = gender)) +
  geom_jitter(size = 1, alpha = .4, width = .2, na.rm = TRUE) +
  # Outlier markers are switched off because the jitter layer already shows
  # every point.
  geom_boxplot(outlier.shape = NA, alpha = .2, width = .4, na.rm = TRUE) +
  scale_colour_brewer(palette = "Dark2", guide = "none") +
  labs(
    title = "Maths Scores by Gender",
    x = NULL,
    y = "Maths Score"
  ) +
  theme_minimal(base_size = 14)
library(tidyverse)
# Bar chart of how many students fall in each continuous-assessment frequency,
# with the count printed above every bar.
df_plot %>%
  drop_na(frequency_of_continous_assessment) %>%
  mutate(ca_freq = factor(frequency_of_continous_assessment)) %>%
  ggplot(mapping = aes(x = ca_freq)) +
  geom_bar(colour = "white", fill = "seagreen4") +
  geom_text(
    mapping = aes(label = after_stat(count)),
    stat = "count",
    colour = "black",
    size = 4,
    fontface = "bold",
    vjust = -0.25
  ) +
  # Leave 5 % of head-room above the tallest bar for the labels.
  scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
  labs(
    title = "Frequency of Continuous Assessments",
    x = "Number of CAs",
    y = "Count"
  ) +
  theme_minimal(base_size = 14)
# Overall Maths-score distribution: count histogram with a kernel-density
# curve drawn on the same (count) scale.
score_binwidth <- 5
ggplot(df_plot, aes(math)) +
  geom_histogram(
    binwidth = score_binwidth,
    fill = "orchid",
    colour = "white",
    alpha = .8
  ) +
  # The default density curve integrates to 1 and would lie flat along the
  # bottom of a count histogram. count * binwidth puts it in the same units as
  # the bars. `linewidth` replaces the deprecated `size` for line geoms.
  geom_density(
    aes(y = after_stat(count * score_binwidth)),
    colour = "darkorchid4",
    linewidth = 1
  ) +
  labs(
    title = "Distribution of Maths Unified-Exam scores",
    x = "Score", y = "Frequency"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
library(scales) # for percent_format()
##
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
##
## alpha, rescale
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Assign every non-missing score to one of four 25-point bands.
score_bands <- df_plot %>%
  drop_na(math_score) %>%
  mutate(
    band = cut(
      x = math_score,
      breaks = c(-Inf, 25, 50, 75, 100), # upper edge of each band is inclusive
      labels = c("0–25", "26–50", "51–75", "76–100"),
      include.lowest = TRUE,
      right = TRUE
    )
  )
# Number of students per band, the band's share of all students, and that
# share formatted as a percentage with one decimal.
band_summary <- score_bands %>%
  group_by(band) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(
    pct = n / sum(n),
    label = percent(pct, accuracy = 0.1)
  )
print(band_summary)
## # A tibble: 4 × 4
## band n pct label
## <fct> <int> <dbl> <chr>
## 1 0–25 1 0.000468 0.0%
## 2 26–50 63 0.0295 3.0%
## 3 51–75 1966 0.921 92.1%
## 4 76–100 105 0.0492 4.9%
# Share of students per score band; the lowest band (0–25) is drawn in red and
# the others in blue.
ggplot(
  data = band_summary,
  mapping = aes(x = band, y = pct, fill = band == "0–25")
) +
  geom_col(colour = "white") +
  geom_text(mapping = aes(label = label), fontface = "bold", vjust = -0.4) +
  scale_fill_manual(
    values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
    guide = "none"
  ) +
  scale_y_continuous(
    labels = percent_format(),
    expand = expansion(mult = c(0, 0.08)) # head-room for the bar labels
  ) +
  labs(
    title = "Students Performance in Mathematics Unified Examination",
    x = "Maths Unified-Exam Score",
    y = "Percentage of Students"
  ) +
  theme_minimal(base_size = 14)
# Descriptive statistics for every column of the plotting table. Rows marked
# with * in the output are non-numeric columns summarised on their integer
# codes.
psych::describe(df_plot)
## vars n mean sd median trimmed mad
## student_s_n 1 2135 100.76 67.47 92 95.51 72.65
## schools* 2 2135 6.10 3.63 6 6.05 5.93
## school_names* 3 2135 6.25 3.32 6 6.24 4.45
## school_type* 4 2135 1.97 0.18 2 2.00 0.00
## school_location* 5 2135 1.84 0.37 2 1.92 0.00
## gender* 6 2133 1.49 0.50 1 1.49 0.00
## math_score 7 2135 64.29 7.22 66 64.38 5.93
## age 8 2135 16.46 1.23 16 16.45 1.48
## frequency_of_continous_assessment 9 2135 2.63 0.48 3 2.66 0.00
## ca_freq 10 2135 2.63 0.48 3 2.66 0.00
## math 11 2135 64.29 7.22 66 64.38 5.93
## min max range skew kurtosis se
## student_s_n 1 558 557 0.69 0.45 1.46
## schools* 1 12 11 0.04 -1.36 0.08
## school_names* 1 12 11 -0.02 -1.31 0.07
## school_type* 1 2 1 -5.08 23.86 0.00
## school_location* 1 2 1 -1.84 1.38 0.01
## gender* 1 2 1 0.03 -2.00 0.01
## math_score 6 88 82 -0.40 2.13 0.16
## age 14 20 6 0.06 -0.64 0.03
## frequency_of_continous_assessment 2 3 1 -0.53 -1.72 0.01
## ca_freq 2 3 1 -0.53 -1.72 0.01
## math 6 88 82 -0.40 2.13 0.16
# Check the column types of the plotting table, including the two alias
# columns ca_freq and math.
str(df_plot)
## tibble [2,135 × 11] (S3: tbl_df/tbl/data.frame)
## $ student_s_n : num [1:2135] 1 2 3 4 5 6 7 8 9 10 ...
## $ schools : chr [1:2135] "School A" "School A" "School A" "School A" ...
## $ school_names : chr [1:2135] "Muslim Grammar School Ado" "Muslim Grammar School Ado" "Muslim Grammar School Ado" "Muslim Grammar School Ado" ...
## $ school_type : Factor w/ 2 levels "Private","Public": 2 2 2 2 2 2 2 2 2 2 ...
## $ school_location : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 2 2 1 2 2 ...
## $ math_score : num [1:2135] 66 67 71 69 67 69 71 72 69 68 ...
## $ age : num [1:2135] 15 18 15 15 18 16 15 14 14 16 ...
## $ frequency_of_continous_assessment: num [1:2135] 3 3 3 3 3 3 3 3 3 3 ...
## $ ca_freq : num [1:2135] 3 3 3 3 3 3 3 3 3 3 ...
## $ math : num [1:2135] 66 67 71 69 67 69 71 72 69 68 ...
# Per-column summary of the plotting table (level counts for the factors).
summary(df_plot)
## student_s_n schools school_names school_type
## Min. : 1.0 Length:2135 Length:2135 Private: 74
## 1st Qu.: 46.0 Class :character Class :character Public :2061
## Median : 92.0 Mode :character Mode :character
## Mean :100.8
## 3rd Qu.:144.0
## Max. :558.0
## school_location gender math_score age
## Rural: 345 F :1082 Min. : 6.00 Min. :14.00
## Urban:1790 M :1051 1st Qu.:58.00 1st Qu.:16.00
## NA's: 2 Median :66.00 Median :16.00
## Mean :64.29 Mean :16.46
## 3rd Qu.:68.00 3rd Qu.:17.00
## Max. :88.00 Max. :20.00
## frequency_of_continous_assessment ca_freq math
## Min. :2.000 Min. :2.000 Min. : 6.00
## 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:58.00
## Median :3.000 Median :3.000 Median :66.00
## Mean :2.628 Mean :2.628 Mean :64.29
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:68.00
## Max. :3.000 Max. :3.000 Max. :88.00
# Preview the first rows of the plotting table.
head(df_plot)
## # A tibble: 6 × 11
## student_s_n schools school_names school_type school_location gender math_score
## <dbl> <chr> <chr> <fct> <fct> <fct> <dbl>
## 1 1 School… Muslim Gram… Public Urban F 66
## 2 2 School… Muslim Gram… Public Urban M 67
## 3 3 School… Muslim Gram… Public Urban F 71
## 4 4 School… Muslim Gram… Public Urban M 69
## 5 5 School… Muslim Gram… Public Urban M 67
## 6 6 School… Muslim Gram… Public Urban M 69
## # ℹ 4 more variables: age <dbl>, frequency_of_continous_assessment <dbl>,
## # ca_freq <dbl>, math <dbl>
# Keep only the outcome (math_score) and the four grouping variables used by
# the inferential tests.
stats_data <- select(
  df_plot,
  all_of(c("math_score", "gender", "ca_freq", "school_type", "school_location"))
)
head(stats_data)
## # A tibble: 6 × 5
## math_score gender ca_freq school_type school_location
## <dbl> <fct> <dbl> <fct> <fct>
## 1 66 F 3 Public Urban
## 2 67 M 3 Public Urban
## 3 71 F 3 Public Urban
## 4 69 M 3 Public Urban
## 5 67 M 3 Public Urban
## 6 69 M 3 Public Urban
# Confirm the column types of the analysis table.
str(stats_data)
## tibble [2,135 × 5] (S3: tbl_df/tbl/data.frame)
## $ math_score : num [1:2135] 66 67 71 69 67 69 71 72 69 68 ...
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 2 2 1 2 2 ...
## $ ca_freq : num [1:2135] 3 3 3 3 3 3 3 3 3 3 ...
## $ school_type : Factor w/ 2 levels "Private","Public": 2 2 2 2 2 2 2 2 2 2 ...
## $ school_location: Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
# One vector of distinct values per column of the analysis table.
final_uniq_vals <- purrr::map(stats_data, unique)
# Print each column name as a banner, followed by its distinct values.
for (nm in names(final_uniq_vals)) {
  cat(sprintf("\n--- %s ---\n", nm))
  print(final_uniq_vals[[nm]])
}
##
## --- math_score ---
## [1] 66 67 71 69 72 68 73 65 70 64 63 61 76 74 75 78 59 60 62 57 56 54 53 51 83
## [26] 58 84 82 77 80 86 79 27 55 52 50 45 44 48 46 49 87 88 47 6 85
##
## --- gender ---
## [1] F M <NA>
## Levels: F M
##
## --- ca_freq ---
## [1] 3 2
##
## --- school_type ---
## [1] Public Private
## Levels: Private Public
##
## --- school_location ---
## [1] Urban Rural
## Levels: Rural Urban
# Integer codes for the grouping variables, given by each label's position in
# the lookup vector: F = 1, M = 2; Public = 1, Private = 2; Urban = 1,
# Rural = 2. A value that is missing or not in the lookup becomes NA.
stats_data <- stats_data %>%
  mutate(
    gender_num = match(as.character(gender), c("F", "M")),
    school_type_num = match(as.character(school_type), c("Public", "Private")),
    school_location_num = match(as.character(school_location), c("Urban", "Rural"))
  )
# Fill the missing gender codes with the most frequent observed code.
# The mode is computed from the code counts directly. The median is not a safe
# stand-in: when the two middle values are 1 and 2 it equals 1.5, which is not
# a valid code and would add a third level once the column becomes a factor.
# NOTE(review): the imputed rows take part in the RQ4 gender comparison below;
# confirm that imputing, rather than excluding them, is intended.
stats_data <- stats_data %>%
  mutate(
    gender_num = {
      code_counts <- table(gender_num) # NA is not counted by default
      mode_val <- if (length(code_counts) == 0) {
        NA_integer_ # nothing observed, so nothing to impute with
      } else {
        # On a tie, which.max() returns the first (lowest) code.
        as.integer(names(code_counts)[which.max(code_counts)])
      }
      replace(gender_num, is.na(gender_num), mode_val)
    }
  )
str(stats_data)
## tibble [2,135 × 8] (S3: tbl_df/tbl/data.frame)
## $ math_score : num [1:2135] 66 67 71 69 67 69 71 72 69 68 ...
## $ gender : Factor w/ 2 levels "F","M": 1 2 1 2 2 2 2 1 2 2 ...
## $ ca_freq : num [1:2135] 3 3 3 3 3 3 3 3 3 3 ...
## $ school_type : Factor w/ 2 levels "Private","Public": 2 2 2 2 2 2 2 2 2 2 ...
## $ school_location : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 2 2 ...
## $ gender_num : int [1:2135] 1 2 1 2 2 2 2 1 2 2 ...
## $ school_type_num : int [1:2135] 1 1 1 1 1 1 1 1 1 1 ...
## $ school_location_num: int [1:2135] 1 1 1 1 1 1 1 1 1 1 ...
# Treat ca_freq as a categorical variable (factor) for the test and the plot.
stats_data$ca_freq_factor <- as.factor(stats_data$ca_freq)
# Group size, mean and standard deviation of the maths score per CA-frequency
# group. na.rm = TRUE matches the location and school-type summaries, so a
# single missing score cannot turn a whole group's mean or sd into NA.
freq_descriptives <- stats_data %>%
  group_by(ca_freq_factor) %>%
  summarise(
    n = n(),
    mean = round(mean(math_score, na.rm = TRUE), 2),
    sd = round(sd(math_score, na.rm = TRUE), 2),
    .groups = "drop"
  )
print("Descriptive Statistics by Assessment Frequency:")
print(freq_descriptives)
## # A tibble: 2 × 4
## ca_freq_factor n mean sd
## <fct> <int> <dbl> <dbl>
## 1 2 795 61.1 8.05
## 2 3 1340 66.2 5.92
# Two-sample t-test of maths score across the CA-frequency groups without
# assuming equal variances (Welch), then flattened to a one-row tibble.
t_test_freq <- stats_data %>%
  t.test(math_score ~ ca_freq_factor, data = ., var.equal = FALSE)
t_test_results_freq <- t_test_freq %>% tidy()
print("Independent Samples t-test Results - Assessment Frequency:")
print(t_test_results_freq)
## # A tibble: 1 × 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -5.10 61.1 66.2 -15.6 3.70e-50 1306. -5.75 -4.46
## # ℹ 2 more variables: method <chr>, alternative <chr>
# Standardised mean difference (Cohen's d) between the CA-frequency groups,
# computed without the equal-variance assumption.
cohens_d_freq <- stats_data %>%
  rstatix::cohens_d(math_score ~ ca_freq_factor, var.equal = FALSE)
print("Effect Size (Cohen's d) - Assessment Frequency:")
print(cohens_d_freq)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 math_score 2 3 -0.722 795 1340 moderate
# Box plot of maths score per CA-frequency group with jittered raw points and
# the group mean as a red diamond; the subtitle reports the test results.
p_freq <- ggplot(stats_data, aes(x = ca_freq_factor, y = math_score, fill = ca_freq_factor)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
  labs(
    title = "Mathematics Scores by Assessment Frequency",
    subtitle = paste0(
      "t = ", round(t_test_results_freq$statistic, 3),
      # A very small p-value rounded to 4 decimals prints as "p = 0", so it
      # is reported as an upper bound instead.
      if (t_test_results_freq$p.value < 1e-4) {
        ", p < 0.0001"
      } else {
        paste0(", p = ", round(t_test_results_freq$p.value, 4))
      },
      ", Cohen's d = ", round(cohens_d_freq$effsize, 3)
    ),
    x = "Continuous Assessment Frequency Group",
    y = "Mathematics Score",
    fill = "Frequency Group",
    caption = "RQ3: Testing the impact of assessment frequency"
  )
print(p_freq)
# RQ4: t-test for gender differences
# The numeric gender code (1 = F, 2 = M) is turned into a factor for grouping.
stats_data$gender_num_factor <- as.factor(stats_data$gender_num)
# Group size, mean and standard deviation of the maths score per gender.
# na.rm = TRUE matches the location and school-type summaries, so a single
# missing score cannot turn a whole group's mean or sd into NA.
gender_descriptives <- stats_data %>%
  group_by(gender_num_factor) %>%
  summarise(
    n = n(),
    mean = round(mean(math_score, na.rm = TRUE), 2),
    sd = round(sd(math_score, na.rm = TRUE), 2),
    .groups = "drop"
  )
print("Descriptive Statistics by Gender:")
print(gender_descriptives)
## # A tibble: 2 × 4
## gender_num_factor n mean sd
## <fct> <int> <dbl> <dbl>
## 1 1 1084 64.4 7.23
## 2 2 1051 64.2 7.22
# Two-sample t-test of maths score between the gender groups without assuming
# equal variances (Welch), then flattened to a one-row tibble.
t_test_gender <- stats_data %>%
  t.test(math_score ~ gender_num_factor, data = ., var.equal = FALSE)
t_test_results_gender <- t_test_gender %>% tidy()
print("Independent Samples t-test Results - Gender:")
print(t_test_results_gender)
## # A tibble: 1 × 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.172 64.4 64.2 0.550 0.582 2131. -0.441 0.785
## # ℹ 2 more variables: method <chr>, alternative <chr>
# Standardised mean difference (Cohen's d) between the gender groups, from
# rstatix, computed without the equal-variance assumption.
cohens_d_gender <- stats_data %>%
  rstatix::cohens_d(math_score ~ gender_num_factor, var.equal = FALSE)
print("Effect Size (Cohen's d) - Gender:")
print(cohens_d_gender)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 math_score 1 2 0.0238 1084 1051 negligible
# Box plot of maths score per gender with jittered raw points and the group
# mean as a red diamond; the subtitle reports the test results.
p_gender <- ggplot(stats_data, aes(x = gender_num_factor, y = math_score, fill = gender_num_factor)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
  labs(
    title = "Mathematics Scores by Gender",
    subtitle = paste0(
      "t = ", round(t_test_results_gender$statistic, 3),
      # A very small p-value rounded to 4 decimals would print as "p = 0", so
      # it is reported as an upper bound instead.
      if (t_test_results_gender$p.value < 1e-4) {
        ", p < 0.0001"
      } else {
        paste0(", p = ", round(t_test_results_gender$p.value, 4))
      },
      ", Cohen's d = ", round(cohens_d_gender$effsize, 3)
    ),
    x = "Gender",
    y = "Mathematics Score",
    caption = "RQ4: Testing gender differences in mathematics performance"
  ) +
  # Colours are keyed by the factor levels "1" and "2"; the legend shows the
  # readable labels.
  scale_fill_manual(
    name = "Gender",
    labels = c("Female", "Male"),
    values = c("1" = "pink", "2" = "lightblue")
  ) +
  # Relabel the x-axis ticks from "1" and "2" to "Female" and "Male".
  scale_x_discrete(labels = c("1" = "Female", "2" = "Male"))
print(p_gender)
# Ensure school_location_num (1 = Urban, 2 = Rural) is treated as a factor.
stats_data <- mutate(
  stats_data,
  school_location_num_factor = as.factor(school_location_num)
)
# Group size, mean and standard deviation of the maths score per location,
# ignoring missing scores.
location_descriptives <- summarise(
  group_by(stats_data, school_location_num_factor),
  n = n(),
  mean = round(mean(math_score, na.rm = TRUE), 2),
  sd = round(sd(math_score, na.rm = TRUE), 2),
  .groups = "drop"
)
print("Descriptive Statistics by School Location:")
print(location_descriptives)
## # A tibble: 2 × 4
## school_location_num_factor n mean sd
## <fct> <int> <dbl> <dbl>
## 1 1 1790 65.0 6.85
## 2 2 345 60.7 8.04
# Two-sample t-test of maths score between the location groups without
# assuming equal variances (Welch), then flattened to a one-row tibble.
t_test_location <- stats_data %>%
  t.test(math_score ~ school_location_num_factor, data = ., var.equal = FALSE)
t_test_results_location <- t_test_location %>% tidy()
print("Independent Samples t-test Results - School Location:")
print(t_test_results_location)
## # A tibble: 1 × 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.26 65.0 60.7 9.22 1.18e-18 445. 3.35 5.17
## # ℹ 2 more variables: method <chr>, alternative <chr>
# Standardised mean difference (Cohen's d) between the location groups,
# computed without the equal-variance assumption.
cohens_d_location <- stats_data %>%
  rstatix::cohens_d(math_score ~ school_location_num_factor, var.equal = FALSE)
print("Effect Size (Cohen's d) - School Location:")
print(cohens_d_location)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 math_score 1 2 0.571 1790 345 moderate
# Box plot of maths score per school location with jittered raw points and
# the group mean as a red diamond; the subtitle reports the test results.
p_location <- ggplot(stats_data, aes(x = school_location_num_factor, y = math_score, fill = school_location_num_factor)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
  labs(
    title = "Mathematics Scores by School Location",
    subtitle = paste0(
      "t = ", round(t_test_results_location$statistic, 3),
      # A very small p-value rounded to 4 decimals prints as "p = 0", so it
      # is reported as an upper bound instead.
      if (t_test_results_location$p.value < 1e-4) {
        ", p < 0.0001"
      } else {
        paste0(", p = ", round(t_test_results_location$p.value, 4))
      },
      ", Cohen's d = ", round(cohens_d_location$effsize, 3)
    ),
    x = "School Location",
    y = "Mathematics Score",
    caption = "RQ5: Testing location effects on mathematics performance"
  ) +
  # Colours are keyed by the factor levels "1" and "2"; the legend and axis
  # show the readable labels.
  scale_fill_manual(
    name = "Location",
    labels = c("Urban", "Rural"),
    values = c("1" = "#78c2ad", "2" = "#f3969a")
  ) +
  scale_x_discrete(labels = c("1" = "Urban", "2" = "Rural"))
print(p_location)
# Ensure school_type_num (1 = Public, 2 = Private) is treated as a factor.
stats_data <- mutate(
  stats_data,
  school_type_num_factor = as.factor(school_type_num)
)
# Group size, mean and standard deviation of the maths score per school type,
# ignoring missing scores.
school_type_descriptives <- summarise(
  group_by(stats_data, school_type_num_factor),
  n = n(),
  mean = round(mean(math_score, na.rm = TRUE), 2),
  sd = round(sd(math_score, na.rm = TRUE), 2),
  .groups = "drop"
)
print("Descriptive Statistics by School Type:")
print(school_type_descriptives)
## # A tibble: 2 × 4
## school_type_num_factor n mean sd
## <fct> <int> <dbl> <dbl>
## 1 1 2061 64.2 7.21
## 2 2 74 68.1 6.67
# Two-sample t-test of maths score between the school-type groups without
# assuming equal variances (Welch), then flattened to a one-row tibble.
t_test_school_type <- stats_data %>%
  t.test(math_score ~ school_type_num_factor, data = ., var.equal = FALSE)
t_test_results_school_type <- t_test_school_type %>% tidy()
print("Independent Samples t-test Results - School Type:")
print(t_test_results_school_type)
## # A tibble: 1 × 10
## estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -3.96 64.1 68.1 -5.00 0.00000333 79.2 -5.54 -2.38
## # ℹ 2 more variables: method <chr>, alternative <chr>
# Standardised mean difference (Cohen's d) between the school-type groups,
# computed without the equal-variance assumption.
cohens_d_school_type <- stats_data %>%
  rstatix::cohens_d(math_score ~ school_type_num_factor, var.equal = FALSE)
print("Effect Size (Cohen's d) - School Type:")
print(cohens_d_school_type)
## # A tibble: 1 × 7
## .y. group1 group2 effsize n1 n2 magnitude
## * <chr> <chr> <chr> <dbl> <int> <int> <ord>
## 1 math_score 1 2 -0.570 2061 74 moderate
# Box plot of maths score per school type with jittered raw points and the
# group mean as a red diamond; the subtitle reports the test results.
p_school_type <- ggplot(stats_data, aes(x = school_type_num_factor, y = math_score, fill = school_type_num_factor)) +
  geom_boxplot(alpha = 0.7) +
  geom_jitter(width = 0.2, alpha = 0.3, size = 0.8) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
  labs(
    title = "Mathematics Scores by School Type",
    subtitle = paste0(
      "t = ", round(t_test_results_school_type$statistic, 3),
      # A very small p-value rounded to 4 decimals prints as "p = 0", so it
      # is reported as an upper bound instead.
      if (t_test_results_school_type$p.value < 1e-4) {
        ", p < 0.0001"
      } else {
        paste0(", p = ", round(t_test_results_school_type$p.value, 4))
      },
      ", Cohen's d = ", round(cohens_d_school_type$effsize, 3)
    ),
    x = "School Type",
    y = "Mathematics Score",
    caption = "RQ6: Testing school type impact on mathematics performance"
  ) +
  # Colours are keyed by the factor levels "1" and "2"; the legend and axis
  # show the readable labels.
  scale_fill_manual(
    name = "School Type",
    labels = c("Public", "Private"),
    values = c("1" = "#8c96c6", "2" = "#f9a65a")
  ) +
  scale_x_discrete(labels = c("1" = "Public", "2" = "Private"))
print(p_school_type)