This dataset contains the test scores of 30,641 students in three subjects: Maths, Reading and Writing. It also contains information about the students, such as their gender, parents’ educational level and ethnicity. While the data on students’ gender and test scores are complete, all other variables were noted to have missing values. All missing values were marked “NA” (Not Available) and were dropped from subsequent analysis and descriptive statistics. The dataset is a good candidate for exploring the relationship between test scores and factors such as gender and ethnicity. Descriptive analysis was carried out by visualising the data in RStudio in order to tease out further insights for explanatory analysis. Data visualisation focused on three factors:
Tests were graded on a scale from 1 to 100. For the sake of visualisation, I grouped scores into grade bands as follows: A (70 to 100), B (60 to below 70), C (50 to below 60) and F (below 50).
The following general observation was made:
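After dropping all records with missing values (the code calls `drop_na()` without naming a column, so a record missing any variable is removed, not only those without an Ethnic Group), 19,243 student records across 5 Ethnic Groups (A to E) remained.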
The following general observation was made:
Further analysis can be carried out to determine:
Visualisations I played with but did not include in the main body.
Student test scores - https://www.kaggle.com/datasets/desalegngeb/students-exam-scores
If you’d like to reproduce what I have done, the R code is below:
#Loading useful packages
library(tidyverse)
library(RColorBrewer)
library(scales)
library(extrafont)
#Setting a theme
# Bold "Corbel" text element; every title-like slot of the theme uses it and
# differs only in size and in the extra arguments forwarded through `...`.
corbel_bold <- function(size, ...) {
  element_text(family = "Corbel", size = size, face = "bold", ...)
}

# Make theme_classic() with the customised text elements and a light grey
# major grid the default theme for every plot drawn below.
theme_set(
  theme_classic() +
    theme(
      plot.title = corbel_bold(20, hjust = 0.5),
      plot.subtitle = corbel_bold(15, hjust = 0.5),
      plot.caption = corbel_bold(8),
      axis.title.x = corbel_bold(13, color = "black"),
      axis.title.y = corbel_bold(13, color = "black"),
      axis.text.x = element_text(size = 11),
      axis.text.y = element_text(size = 11),
      panel.grid.major = element_line(color = "grey90")
    )
)
#Loading Dataset
# read.csv() by default only turns blank fields into NA for non-character
# columns; blank text fields would stay as "" and survive the drop_na() calls
# used further down. Listing "" in na.strings makes every blank field NA, so
# missing values are handled uniformly. A literal "NA" is still recognised.
STS <- read.csv("Student test scores.csv", na.strings = c("", "NA"))
# Quick structural overview of the loaded columns
glimpse(STS)
#No. of students
# Bar chart of the number of students per gender, with the count printed
# inside each bar. The axis labels come from the data via str_to_sentence()
# (as in the other gender plots of this script) instead of a positional
# `labels = c("Female", "Male")`, which would silently mislabel the bars if
# the category order ever differed.
STS %>%
  mutate(Gender = str_to_sentence(Gender)) %>%
  ggplot(aes(x = Gender, fill = Gender)) +
  geom_bar(
    colour = "black",
    linewidth = 1,
    width = 0.6,
    alpha = 0.8
  ) +
  # stat = "count" recomputes the bar heights so they can be used as labels
  geom_text(
    aes(label = scales::comma(after_stat(count))),
    stat = "count",
    vjust = 1.5,
    size = 5,
    colour = "white",
    fontface = "bold"
  ) +
  scale_y_continuous(
    name = NULL,
    limits = c(0, 16000),
    breaks = seq(0, 16000, 5000),
    expand = expansion(0)
  ) +
  scale_x_discrete(name = "Gender") +
  scale_fill_manual(values = c("#cf1578", "#039fbe")) +
  labs(title = "Number of Students that took the tests") +
  theme(legend.position = "none")
#Scores using Grades

# Draw a 100%-stacked bar chart of grade shares per gender for one subject.
#
#   data      data frame with a Gender column and the score column
#   score     unquoted name of the score column, e.g. MathScore
#   title     plot title
#   subtitle  optional plot subtitle; NULL (the default) draws none
#
# Returns a ggplot object. Called at top level it is printed like any other
# plot in this script.
plot_grade_shares <- function(data, score, title, subtitle = NULL) {
  grade_shares <- data %>%
    mutate(
      Gender = str_to_sentence(Gender),
      # Bands: A = 70 to 100, B = 60 to <70, C = 50 to <60. Anything not
      # matched above (below 50, but also NA or above 100) falls to "F".
      Grade = case_when(
        {{ score }} >= 70 & {{ score }} <= 100 ~ "A",
        {{ score }} >= 60 & {{ score }} < 70 ~ "B",
        {{ score }} >= 50 & {{ score }} < 60 ~ "C",
        TRUE ~ "F"
      )
    ) %>%
    count(Gender, Grade, name = "grade_n") %>%
    # Share of each grade within its gender, rounded for the labels
    group_by(Gender) %>%
    mutate(grade_share = round(grade_n / sum(grade_n), 3)) %>%
    ungroup()

  ggplot(grade_shares, aes(x = Gender, y = grade_share, fill = Grade)) +
    geom_col(
      position = "fill",
      width = 0.7,
      colour = "black",
      linewidth = 1
    ) +
    # Same "fill" position as the bars so each label sits in its segment
    geom_text(
      aes(label = scales::percent(grade_share)),
      vjust = 1.5,
      size = 4,
      position = "fill",
      colour = "white",
      fontface = "bold"
    ) +
    scale_y_continuous(
      name = NULL,
      labels = scales::percent,
      limits = c(0, 1.1),
      breaks = seq(0, 1, 0.25),
      expand = expansion(0)
    ) +
    scale_x_discrete(name = "Gender") +
    # Colours are assigned in grade order: A, B, C, F
    scale_fill_manual(values = c("#05f721", "#2505f7", "#05f7d3", "#f74e05")) +
    labs(title = title, subtitle = subtitle) +
    # The percentages are printed on the bars, so the y axis is hidden
    theme(
      axis.text.y = element_blank(),
      axis.line.y = element_blank(),
      axis.ticks.y = element_blank(),
      panel.grid.major = element_blank()
    )
}

#Maths Grades
plot_grade_shares(
  STS, MathScore,
  title = "Maths Test Grades",
  subtitle = "Maths is the only subject in which boys scored higher than girls"
)
#Reading Grades
plot_grade_shares(STS, ReadingScore, title = "Reading Test Grades")
#Writing Grades
plot_grade_shares(STS, WritingScore, title = "Writing Test Grades")
#Ethnicity
# Bar chart of the number of test takers per ethnic group, with the count
# printed inside each bar. drop_na() is called without column names, so rows
# with a missing value in any column are removed, not only rows lacking an
# EthnicGroup.
STS %>%
  mutate(EthnicGroup = str_to_sentence(EthnicGroup)) %>%
  drop_na() %>%
  ggplot(mapping = aes(EthnicGroup, fill = EthnicGroup)) +
  geom_bar(alpha = 0.8, linewidth = 1, colour = "black") +
  geom_text(
    mapping = aes(label = scales::comma(after_stat(count))),
    stat = "count",
    colour = "white",
    fontface = "bold",
    size = 5,
    vjust = 1.5
  ) +
  # One colour per ethnic group, assigned in category order
  scale_fill_manual(
    values = c("#05f7c3", "#0576f7", "#6a05f7", "#f70596", "#f70529")
  ) +
  scale_x_discrete(name = "Ethnic Group") +
  scale_y_continuous(
    name = NULL,
    breaks = seq(0, 6000, 2000),
    limits = c(0, 7000),
    expand = expansion(0)
  ) +
  labs(title = "Ethnicity of test takers") +
  theme(legend.position = "bottom")
#Boxplots

# Box plot of one subject's scores per ethnic group, with the fifth group
# highlighted in red and the others in grey.
#
#   data     data frame with an EthnicGroup column and the score column
#   score    unquoted name of the score column, e.g. MathScore
#   subject  subject name used in the axis title and plot title
#
# Returns a ggplot object. drop_na() is called without column names, so rows
# with a missing value in any column are removed before plotting.
plot_scores_by_ethnic_group <- function(data, score, subject) {
  data %>%
    mutate(EthnicGroup = str_to_sentence(EthnicGroup)) %>%
    drop_na() %>%
    ggplot(aes(x = EthnicGroup, y = {{ score }}, fill = EthnicGroup)) +
    geom_boxplot() +
    scale_y_continuous(
      name = paste(subject, "Score"),
      limits = c(0, 110),
      breaks = seq(0, 100, 25)
    ) +
    scale_x_discrete(name = "Ethnic Group") +
    # Colours are positional: four grey groups, then the highlighted one
    scale_fill_manual(
      name = "Ethnic Group",
      values = c(rep("#e5e5e5", 4), "#f70529")
    ) +
    labs(
      title = paste(subject, "test scores"),
      subtitle = "Group e lead the pack across all subjects"
    )
}

#Maths
plot_scores_by_ethnic_group(STS, MathScore, "Maths")
#Reading
plot_scores_by_ethnic_group(STS, ReadingScore, "Reading")
#Writing
plot_scores_by_ethnic_group(STS, WritingScore, "Writing")
#Appendix
#Scores by Gender

# Histogram of one subject's scores in 5-point bins, one panel per gender.
#
#   data   data frame with a Gender column and the score column
#   score  unquoted name of the score column, e.g. MathScore
#   title  plot title
#
# Returns a ggplot object.
plot_score_histogram <- function(data, score, title) {
  data %>%
    mutate(Gender = str_to_sentence(Gender)) %>%
    ggplot(aes(x = {{ score }}, fill = Gender)) +
    geom_histogram(binwidth = 5, colour = "black") +
    scale_y_continuous(
      name = NULL,
      limits = c(0, 2200),
      breaks = seq(0, 2000, 1000),
      expand = expansion(0)
    ) +
    scale_x_continuous(name = "Scores") +
    scale_fill_manual(values = c("#cf1578", "#039fbe")) +
    labs(title = title) +
    # The facet strips already name the gender, so the legend is dropped
    theme(legend.position = "none") +
    facet_wrap(~Gender)
}

#Maths
plot_score_histogram(STS, MathScore, "Maths test")
#Reading
plot_score_histogram(STS, ReadingScore, "Reading test")
#Writing
plot_score_histogram(STS, WritingScore, "Writing test")
#Boxplots

# Horizontal box plot of one subject's scores, one box per gender.
#
#   data     data frame with a Gender column and the score column
#   score    unquoted name of the score column, e.g. MathScore
#   subject  subject name used in the x-axis title and plot title
#
# Returns a ggplot object.
plot_score_boxplot_by_gender <- function(data, score, subject) {
  data %>%
    mutate(Gender = str_to_sentence(Gender)) %>%
    ggplot(aes(x = {{ score }}, fill = Gender)) +
    geom_boxplot() +
    # No y variable is mapped; breaks = NULL removes the y-axis ticks and
    # labels, and the legend identifies the boxes instead.
    scale_y_continuous(
      name = NULL,
      limits = NULL,
      breaks = NULL
    ) +
    scale_x_continuous(name = paste(subject, "Scores")) +
    scale_fill_manual(values = c("#cf1578", "#039fbe")) +
    labs(title = paste(subject, "test scores")) +
    theme(legend.position = "top")
}

#Maths
plot_score_boxplot_by_gender(STS, MathScore, "Maths")
#Reading
plot_score_boxplot_by_gender(STS, ReadingScore, "Reading")
#Writing
plot_score_boxplot_by_gender(STS, WritingScore, "Writing")
#Time spent
# Bar chart of the weekly-study-hours categories, one panel per ethnic group.
# drop_na() has no column arguments, so rows with a missing value in any
# column are removed first.
STS %>%
  mutate(EthnicGroup = str_to_sentence(EthnicGroup)) %>%
  drop_na() %>%
  ggplot(mapping = aes(WklyStudyHours, fill = WklyStudyHours)) +
  facet_wrap(vars(EthnicGroup)) +
  geom_bar()
# Box plot of one subject's scores by gender for ethnic group E only.
#
#   data     data frame with EthnicGroup, Gender and the score column
#   score    unquoted name of the score column, e.g. MathScore
#   subject  subject name used in the y-axis title
#
# Returns a ggplot object. Gender labels are derived from the data with
# str_to_sentence() (as in the other gender plots of this script) rather than
# a positional `labels = c("Female", "Male")`. After filtering, drop_na()
# removes rows with a missing value in any column.
plot_group_e_scores <- function(data, score, subject) {
  data %>%
    mutate(
      EthnicGroup = str_to_sentence(EthnicGroup),
      Gender = str_to_sentence(Gender)
    ) %>%
    # str_to_sentence() lower-cases everything after the first letter, hence
    # the lower-case "e" in the value being matched
    filter(EthnicGroup == "Group e") %>%
    drop_na() %>%
    ggplot(aes(x = Gender, y = {{ score }})) +
    geom_boxplot() +
    scale_y_continuous(
      name = paste(subject, "Score"),
      limits = c(0, 110),
      breaks = seq(0, 100, 25)
    ) +
    scale_x_discrete(name = "Gender")
}

plot_group_e_scores(STS, MathScore, "Maths")
plot_group_e_scores(STS, ReadingScore, "Reading")
plot_group_e_scores(STS, WritingScore, "Writing")