Data was gathered using questionnaires and mark reports. Mark reports were provided by schools and consist of grades and absences. Grades are evaluated in three periods and marked in a 20-point grading scale where 0 is the lowest possible mark and 20 the highest. The former was used to gather the data for the several demographic, social/emotional and school related variables. The final questionnaire containing 37 questions in a single A4 sheet was answered in class by 788 students. Later 111 answers were discarded due to lack of identification details. At the end the data was merged into two samples that related to Mathematics (395) and Portuguese (649) classes (Cortez and Silva 2008).
The merged dataset consist of 13 variables that uniquely identifies each row and 20 that describe their performance at one of the subjects. All the features can be split into categorical and numerical ones, but we are rather interested in analyzing the variables rather than the right survey distribution (Dua and Graff 2017).
supp_plot_m <- make_barplot_with_bg_tiles(master_data,famsup_m,schoolsup_m,paid_m,G3_m,
title = '',
bar_color = "#30475E")
supp_plot_p <- make_barplot_with_bg_tiles(master_data,famsup_p,schoolsup_p,paid_p,G3_p,
title = '',
bar_color = "#30475E")
supp_plot_p <- supp_plot_p + theme(axis.line.y.right = element_blank(),
axis.text.y.right = element_blank(),
axis.ticks.y.right = element_blank(),
axis.title.y.right = element_blank(),
axis.title.y.left = element_text(size=18),
axis.text.y.left = element_text(size=19))
supp_plot_m <- supp_plot_m + theme(axis.line.y.left = element_blank(),
axis.text.y.left = element_blank(),
axis.ticks.y.left = element_blank(),
axis.title.y.left = element_blank(),
axis.title.y.right = element_text(size=18),
axis.text.y.right = element_text(size=19))
comb <- ggarrange(supp_plot_p,supp_plot_m,
align = "h",
labels = c("Portuguese","Maths"),
font.label = list(size = 12, face = 'plain'),
hjust = c(-2,-2.5),
vjust = 3,
common.legend = T ,
legend = "bottom")
annotate_figure(comb , top = text_grob("Does obtaining support improve performance?",
size = 23))
What we observe is that in Portuguese classes most students get only family support or no support at all. Interestingly, at the same time those students have the highest average grade. This would suggest that support is given to students who tend to perform poorly in class rather than students perform well BECAUSE they got support.
In Maths most students get family and paid support or no support. The highest average is among those getting paid support and both paid and family support.
It’s worth mentioning that the small group of students getting school support and paid support at the same time has a close-to-the-lowest average from both subjects.
# data
grade_cuts <- c(-1,7,13,21)
tmp1 <- master_data %>%
select(matches("G._m")) %>%
drop_na()%>%
mutate(across(everything(),
~cut(.x,breaks = grade_cuts,
labels = c("low","medium","high")))) %>%
group_by(G1_m,G2_m,G3_m) %>%
summarise(n = n()) %>%
mutate(G1_m = paste0("First ",G1_m),
G2_m = paste0("Second ",G2_m),
G3_m = paste0("Final ",G3_m)) %>%
arrange(G1_m,G2_m)
# final plot
nodes <- data.frame(ID = unique(c(tmp1$G1_m,tmp1$G2_m,tmp1$G3_m) %>% sort()),
x = c(rep(3,3),rep(1,3),rep(2,3)),
y = rep(c(3,1,2),3))
tmp2 <- master_data %>%
select(matches("G._m")) %>%
drop_na()%>%
mutate(across(everything(),
~cut(.x,breaks =grade_cuts,
labels = c("low","medium","high")))) %>%
group_by(G1_m,G2_m) %>%
summarise(n = n()) %>%
mutate(G1_m = paste0("First ",G1_m),
G2_m = paste0("Second ",G2_m)) %>%
rename(N1 = G1_m, N2 = G2_m, Value = n)
tmp3 <- master_data %>%
select(matches("G._m")) %>%
drop_na()%>%
mutate(across(everything(),
~cut(.x,breaks = grade_cuts,
labels = c("low","medium","high")))) %>%
group_by(G2_m,G3_m) %>%
summarise(n = n()) %>%
mutate(G2_m = paste0("Second ",G2_m),
G3_m = paste0("Final ",G3_m)) %>%
rename(N1 = G2_m, N2 = G3_m, Value = n)
tmp4 <- rbind(tmp2,tmp3)
edges <- data.frame(ID = 1:nrow(tmp4),
N1 = tmp4$N1,
N2 = tmp4$N2,
Value = tmp4$Value)
cols <- c(low="#F05454",
medium="#30475E",
high ="#121212")
style <- sapply(nodes$ID, function(id)
list(col=cols[ gsub("(First|Second|Final) ", "", id) ]), simplify=FALSE)
r <- makeRiver(nodes,edges,styles =style)
d <- list(srt = 0, textcex=1, textcol = "white")
plot(r, plot_area=1, nodewidth=10, default_style=d)
title("Flows of students between performance groups")
We observe that the performace of students is rather persistant across the year. Very little students tend to either improve or worsen their performance.
The biggest observable flows between performance groups happens between first and second period especially between low and medium group. Nonetheless, those flows are modest and seem not to change the overall distribution of performance where the most students are the medium ones.
grade_per_subject_long <-
master_data %>%
mutate(row_num = row_number()) %>%
select(row_num, starts_with("G", ignore.case = FALSE)) %>%
pivot_longer(-row_num) %>%
separate(name, c("grade", "subject"))
subject_pretty_names <- tibble(short_name = c('m', 'p'),
pretty_name = c('Mathematics','Portugese'))
periods_colors <- tibble(period = 1:3,
color_name = color_palette$colors[2:4])
data_to_boxplot <- grade_per_subject_long %>%
inner_join(subject_pretty_names, by = c("subject" = "short_name")) %>% separate(grade, 1, into = c("to_remove", "period")) %>%
mutate(period = as.numeric(period)) %>%
inner_join(periods_colors)
grades_boxplot <- ggplot(data = data_to_boxplot,
aes(y = value, x = factor(pretty_name),
fill = factor(period),
colour=factor(period))) +
geom_boxplot(alpha = 0.6, show.legend = F) +
geom_point(position = position_jitterdodge()) +
ggtitle("What is the distribution of grades across \n subjects and periods in the current school year?") +
labs(x = "Subject", y = "Grade", colour = "Period") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_manual(values = periods_colors$color_name) +
scale_fill_manual(values = periods_colors$color_name) +
guides(fill=FALSE)
grades_boxplot
Grades for Portugese are consistently higher than in Mathematics. We do not observe neither a positive nor negative trend relating to the subsequent periods results. However, there are significantly more students that failed Mathematics.
grade_per_subject_long <-
master_data %>%
mutate(row_num = row_number()) %>%
select(row_num, starts_with("G", ignore.case = FALSE)) %>%
pivot_longer(-row_num) %>%
separate(name, c("grade", "subject"))
absences.agg <- master_data %>%
group_by(freetime_m, goout_m) %>%
summarise(across(absences_m, ~n()))
low.high.map <- tibble(ugly_level = c(1, 2, 3, 4, 5),
pretty_level = c('very low',
'low',
'medium',
'high',
'very high'))
absences.agg.to.plot.m <- absences.agg %>%
ungroup() %>%
inner_join(low.high.map,
by = c("freetime_m" = "ugly_level")) %>%
rename( freetime_pretty = pretty_level) %>%
inner_join(low.high.map,
by = c("goout_m" = "ugly_level")) %>%
rename( goout_pretty = pretty_level) %>% select(freetime_pretty, goout_pretty, absences_m)
absences.agg <- master_data %>%
group_by(freetime_p, goout_p) %>%
summarise(across(.cols = absences_p, .fns = ~sum(.)))
low.high.map <- tibble(ugly_level = c(1, 2, 3, 4, 5),
pretty_level = c('very low',
'low',
'medium',
'high',
'very high'))
absences.agg.to.plot.p <- absences.agg %>%
ungroup() %>%
inner_join(low.high.map,
by = c("freetime_p" = "ugly_level")) %>%
rename(freetime_pretty = pretty_level) %>%
inner_join(low.high.map,
by = c("goout_p" = "ugly_level")) %>%
mutate(goout_p = factor(goout_p)) %>%
rename( goout_pretty = pretty_level) %>%
select(freetime_pretty, goout_pretty, absences_p)
absences.agg.to.plot.p$freetime_pretty <- factor(absences.agg.to.plot.p$freetime_pretty, levels = c("very low", "low", "medium", "high", "very high"))
absences.agg.to.plot.p$goout_pretty <- factor(absences.agg.to.plot.p$goout_pretty, levels = c("very low", "low", "medium", "high", "very high"))
ggplot(absences.agg.to.plot.p, aes(x = freetime_pretty, y = goout_pretty)) +
geom_tile(aes(fill = absences_p), color = 'white', show.legend = F) +
theme_minimal() +
geom_text(aes(label = absences_p), size = 5, fontface = 'bold', color = 'white') +
labs(title ='How the amount of spare time and going out with friends \n influence the number of absences at Portugese classes?',
x = "Free time after school",
y = "Going out with friends") +
theme(panel.grid = element_blank(),
plot.title = element_text(hjust = 0.5, size = 15),
axis.text = element_text(size = 10),
axis.title = element_text(size = 15)) +
scale_fill_gradient(low = color_palette$colors[3],
high = color_palette$colors[2])
Most of the absences is related to the students that have medium amount of free time after school and are likely to go out with friends. It seems that hanging out with colleagues might influence more the number of absences than not having a lot of chores.
We can infer that there is no clear relationship between parents education and students’ performance. Also parents’ cohabitation status seems to be independent from the performance. We can observe that most students have parents with same level of education (red trend on diagonal).
pass_fail_association <- master_data %>%
mutate(current_failure_m = if_else(G3_m < 10, 1, 0),
current_failure_p = if_else(G3_p < 10, 1, 0),
failures_m_before = if_else(failures_m > 0, 1, 0),
failures_p_before = if_else(failures_p > 0, 1, 0)) %>%
group_by(failures_m_before, current_failure_m, failures_p_before, current_failure_p) %>% count()
pass_fail_association$group <- c("Rockstars",
"Accidents \n will happen",
"Accidents \n will happen",
"Accidents \n will happen",
"Accidents \n will happen",
"Accidents \n will happen",
"Accepting trade-off",
"Rockstars",
"Misery loves \n company",
"Accepting trade-off",
"Misery loves \n company",
"Misery loves \n company",
"Misery loves \n company")
pass_fail_association_aggregate <- pass_fail_association %>%
group_by(group) %>%
summarise(student_count = sum(n))
We created five groups that describe the consistency of the students’ performance across periods.
The Rockstars are depicted as the students that either are the gifted ones and pass both subjects consequently or after failing both previously now passed both of them.
Accidents will happen describes a group of people that generally pass both Mathematics and Portugese subjects but have experiences at least one failure.
With the Accepting trade-off cluster we describe the students that sacrifice one subject for the sake of the other.
By the mistery loves company description we try to motivate the folks that have issues with learning at school and consequently experience problems at school.
ggplot(pass_fail_association_aggregate,
aes(area = student_count,
label = group,
fill = student_count)) +
geom_treemap() +
geom_treemap_text(fontface = "italic", colour = "white", place = "centre") +
labs(title = "What groups students form across school years?",
fill = "Students \n count") +
theme(plot.title = element_text(hjust = 0.5, size = 15),
legend.title = element_text(hjust = 0.5, size = 15),
axis.text = element_text(size = 15)) +
scale_fill_gradient(low = color_palette$colors[3],
high = color_palette$colors[2])
There is a huge group of students that do not have problems with passing the subjects. Some people are aware of their capabilities and accepted the trade-off. Luckily, the misery loves company is not very big. Considering the last group it is bigger than it might be expected.
all_numeric_var <- dictionary %>% filter(type == 'numeric') %>% pull(variable)
dims <- paste0(all_numeric_var[4:11], "_m")
dims <- append(dims, "G3_m")
final_dims <- c("Dalc_m", "Walc_m", "studytime_m", "absences_m", "G3_m")
master_data_to_radar_m <- master_data %>%
mutate(across(all_of(final_dims), scales::rescale)) %>%
group_by(higher_m) %>%
summarise(across(all_of(final_dims), ~mean(.)))
names(master_data_to_radar_m) <- c("Wants to take \n higher education",
"Workday alcohol \n consumption",
"Weekend \n alcohol \n consumption",
"Weekly \n study time",
"Number of \n school absences",
"Mathematics \n final grade")
ggradar(master_data_to_radar_m,
plot.title = "What is the impact of students' lifestyle?",
legend.title = "Wants to \n take higher \n education",
axis.label.size = 3,
legend.text.size = 10,
gridline.mid.colour = color_palette$colors[4],
group.colours = color_palette$colors[c(2,3)]) +
theme(legend.title=element_text(size=10), plot.title = element_text(size = 15))
Our assumptions are mostly correct and are confirmed with the graph. Students that want to continue their education at universities have better grades in Mathematics and their lifestyle is better. Interestingly the number of school absences does not play a remarkable role.
# calculate percentages
helper_df <- master_data %>%
summarise(mop = sum(G3_m>G3_p),
pom = sum(G3_m<G3_p),
fmop = sum((G3_m>G3_p) & sex == 'F'),
fpom = sum((G3_m<G3_p) & sex == 'F')) %>%
mutate(fmop_perc = fmop/mop, mmop_perc = 1-fmop_perc,
fpom_perc = fpom/pom, mpom_perc = 1-fpom_perc,
mop_perc = mop/nrow(master_data),
pom_perc = pom/nrow(master_data))
helper_df_2 <- helper_df %>%
pivot_longer(c(mmop_perc,fmop_perc,mpom_perc,fpom_perc)) %>%
select(c(name,value)) %>%
.[1:2,]
helper_df_3 <- helper_df %>%
pivot_longer(c(mmop_perc,fmop_perc,mpom_perc,fpom_perc)) %>%
select(c(name,value)) %>%
.[3:4,]
seg_end_lst <- get_diag_segment_ends(side_length = 22)
seg_end <- seg_end_lst$end
fem_perc_diag <- seg_end_lst$fem_perc
plt <-
ggplot()+
geom_segment(aes(x=0,y=0,xend=seg_end,yend=seg_end), size = 6, color ="#F05454", alpha = 0.5 )+
geom_segment(aes(x=seg_end,y=seg_end,xend=20,yend=20), size = 6, color ="#30475E",alpha = 0.5 )+
geom_point(data = master_data,aes(x=G3_m,y=G3_p,color = sex),
size = 3,
position=position_dodge(width=0.5),
alpha = 1)+
annotate("text",label = paste("Better portuguese \nthan maths:\n",
round(helper_df$pom_perc*100),
"% of all"),
size = 4 ,
fontface = "italic",
x =4.5, y =17)+
annotate("text",
label = paste("Better maths \nthan portuguese:\n",
round(helper_df$mop_perc*100),
"% of all"),
size = 4 ,
fontface = "italic",
x =17, y =3) +
annotate("text",
label = paste0(round(fem_perc_diag*100),"%"),
size = 4,
color ="White",
x =1, y =1,
angle = 45)+
annotate("text",
label = paste0(round((1-fem_perc_diag)*100),"%"),
size = 4,
color ="White",
x =19, y =19,
angle = 45)+
guides(fill="none")+
scale_color_manual(values = c("#F05454","#30475E"),labels = c("female","male"))+
coord_fixed()+
labs(x="Maths grade", y="Portuguese grade", title = "Relationship between grades from two subjects by gender")+
theme(panel.grid = element_blank(),
legend.box.background = element_blank(),
legend.key = element_rect(colour = "transparent",fill = alpha("red", 0)),
plot.margin = margin(0,0,0,-20)
)
plt_b <- ggplot(data = helper_df_2)+
geom_colh(aes(y = 0,x = value,fill = name),
position = "stackv",
width = 0.5)+
scale_fill_manual(values = c("#F05454", "#30475E"),
labels = c("female","male")) +
theme_void()+
theme(legend.position = "none")+
geom_text(aes(x = (1-value)+0.1, y = 0,label = paste0(round(value,2)*100,"%")),
color = "white")
plt_l <- ggplot(data = helper_df_3)+
geom_col(aes(x = 0,y = value,fill = name),
position = "stack",
width = 0.5)+
scale_fill_manual(values = c("#F05454", "#30475E"),
labels = c("female","male")) +
theme_void()+
theme(legend.position = "none")+
geom_text(aes(x =0 , y = value - 0.1,label = paste0(round(value,2)*100,"%")),
color = "white")
lay <- rbind(c(1,2,2,2),
c(NA,NA,NA,NA),
c(NA,NA,3,NA))
plt_fin <- arrangeGrob(plt_l,plt,plt_b,
layout_matrix = lay,
widths = c(3,1,20,5),
heights = c(25,1,2))
grid.arrange(plt_fin)
We can clearly see that in general students perform better in Portuguese class than in math class. In addition, among those who are better in maths there are more males but in among those who are better in Portuguese - female are dominating.
math_pyr <- master_data %>%
group_by(age,sex) %>%
summarise(grade_m = mean(G3_m), grade_p = mean(G3_p), n = n()) %>%
filter(age<=19) %>%
mutate(sex = ifelse(sex == "F","Female","Male")) %>%
pyramid_chart( x = age, y = grade_m, group = sex,
xlab = "Maths average grade",
bar_colors = c("#F05454","#30475E"))
port_pyr <- master_data %>%
group_by(age,sex) %>%
summarise(grade_m = mean(G3_m), grade_p = mean(G3_p), n = n()) %>%
filter(age<=19) %>%
mutate(sex = ifelse(sex == "F","Female","Male")) %>%
pyramid_chart( x = age, y = grade_p, group = sex,
xlab = "Portuguese average grade",
bar_colors = c("#F05454","#30475E"))
comb <- ggarrange(math_pyr,port_pyr)
annotate_figure(comb,top = text_grob("Grade average by age and gender",
size = 20))
The distribution of grades among genders when controlling for age reveals that the difference in performance in math classes is most striking among the youngest students.