Data

I chose a dataset called “Alcohol Effects On Study” from Kaggle. It contains data on students from two Portuguese schools: 649 observations of 33 variables, including students’ alcohol intake, grades (the dataset with grades for Portuguese was chosen), socioeconomic characteristics, study habits, etc. Some preprocessing had to be performed on the variables of interest for the visualizations; the code is hidden but can be viewed on demand.

The visualizations explore the grade distributions, effects of alcohol on grades, attendance and devotion to studies.

library(knitr)
library(ggplot2)
library(dplyr)
library(kableExtra)
library(tidyverse)

# Read the Portuguese-grades CSV and prepend a 1-based row id column named
# "index".
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file location exists.
data <- read.csv("C:/Users/Anna/Downloads/Portuguese.csv") %>%
  tibble::rowid_to_column(var = "index")

Plots

# Per-sex summary of the final grade (G3): mean grade and group size.
stats <- summarise(
  group_by(data, sex),
  mean = mean(G3),
  n = n()
)

# Overlaid, semi-transparent histograms of G3 (one per sex) with a dashed
# vertical line at each sex's mean grade.
p1 <- ggplot(data, aes(x = G3, fill = sex)) +
  geom_histogram(
    aes(color = sex, fill = sex),
    bins = 20,
    alpha = 0.4,
    position = "identity" # overlay the two histograms instead of stacking
  ) +
  geom_vline(
    data = stats,
    aes(xintercept = mean, color = sex),
    linetype = "dashed",
    size = 0.8
  ) +
  labs(
    x = "Final grade for Portuguese",
    y = "Number of students"
  )
p1
Fig. 1. Distribution of final grades for Portuguese by sex

Fig. 1. Distribution of final grades for Portuguese by sex

# Turn the Dalc codes ("1".."5") into a labelled factor.
# dalc1 holds the raw codes as a factor; dalc holds the readable labels and is
# reused by later plots and the summary table.
data[["dalc1"]] <- as.factor(data[["Dalc"]])
data[["dalc"]] <- dplyr::recode(
  data[["dalc1"]],
  "1" = "very low",
  "2" = "low",
  "3" = "moderate",
  "4" = "high",
  "5" = "very high"
)

# Violin plot of the final grade (G3) per alcohol-intake level, with a narrow
# box plot drawn inside each violin. The legend is hidden because the x axis
# already names each level.
p2 <- ggplot(data, aes(x = dalc, y = G3, fill = dalc)) +
  geom_violin(alpha = 0.7, trim = FALSE) +
  geom_boxplot(width = 0.1) +
  scale_fill_brewer(palette = "YlOrRd") +
  labs(
    x = "Alcohol intake on weekdays",
    y = "Final grade on Portuguese"
  ) +
  theme(legend.position = "none")
p2
Fig. 2. Distribution of final grades for Portuguese by alcohol intake on weekdays

Fig. 2. Distribution of final grades for Portuguese by alcohol intake on weekdays

# Violin plot of the number of absences per weekday alcohol-intake level
# (data$dalc), with a narrow box plot drawn inside each violin.
# trim = FALSE leaves the violin tails untrimmed, so they can extend beyond the
# observed range of absences.
# The legend is hidden because the x axis already names each level.
p3 <- ggplot(data, aes(x = dalc, y = absences, fill = dalc)) +
  geom_violin(trim = FALSE, alpha = 0.7) +
  geom_boxplot(width = 0.1) +
  scale_fill_brewer(palette = "RdPu") +
  xlab("Alcohol intake on weekdays") +
  ylab("Absences") + # fixed spelling of the axis label (was "Abscences")
  theme(legend.position = "none")
p3
Fig. 3. Distribution of absences by alcohol intake on weekdays

Fig. 3. Distribution of absences by alcohol intake on weekdays

# Turn the studytime codes ("1".."4") into a labelled factor.
# studyttime1 (sic) holds the raw codes as a factor; the column name is kept
# as-is so the columns added to `data` do not change. studyt holds the labels.
data$studyttime1 <- as.factor(data$studytime)
data$studyt <- dplyr::recode(
  data$studyttime1,
  "1" = "<2 hours",
  "2" = "2 to 5 hours",
  "3" = "5 to 10 hours",
  "4" = ">10 hours"
)

# Proportion of students at each weekday alcohol-intake level (data$dalc)
# within every study-time category.
# geom_bar() with its default count statistic counts one row per student, and
# position = "fill" rescales each bar to 1. The earlier version mapped
# y = index with stat = "identity", which summed row ids and therefore
# weighted every student by their position in the file.
p4 <- ggplot(data, aes(x = studyt, fill = dalc)) +
  geom_bar(position = "fill") +
  scale_fill_brewer(palette = "PiYG") +
  xlab("Study time per week") +
  ylab("Proportion of students") + # the y axis is a share, not an intake level
  labs(fill = "Alcohol intake")
p4
Fig. 4. Distribution of alcohol intake on weekdays by study time per week

Fig. 4. Distribution of alcohol intake on weekdays by study time per week

Table

# Mean final grade (G3) and mean number of absences for every combination of
# weekday alcohol intake (dalc) and weekly study time (studyt).
# ungroup() drops the grouping that summarise() would otherwise leave on dalc.
df <- data %>%
  group_by(dalc, studyt) %>%
  summarise(
    mean_final_grade = mean(G3, na.rm = TRUE),
    mean_absences = mean(absences, na.rm = TRUE)
  ) %>%
  ungroup()

# Render the summary as a styled table.
# Header fixes: dalc is the weekday intake (as labelled in the plots), not a
# weekly one, and "Absences" was misspelled.
kable(
  df,
  col.names = c(
    "Weekday alcohol intake",
    "Weekly study time",
    "Final grade in Portuguese",
    "Absences"
  ),
  align = "llrr",
  caption = "Table 1.2 Filtered data",
  digits = 3
) %>%
  kable_styling()
Table 1.2 Filtered data
Weekly alcohol intake Weekly study time Final grade in Portuguese Abscences
very low <2 hours 11.358 3.813
very low 2 to 5 hours 12.314 3.150
very low 5 to 10 hours 13.382 2.408
very low >10 hours 13.462 2.962
low <2 hours 9.667 4.708
low 2 to 5 hours 12.192 4.615
low 5 to 10 hours 13.111 1.944
low >10 hours 13.667 4.667
moderate <2 hours 11.080 5.240
moderate 2 to 5 hours 11.231 4.846
moderate 5 to 10 hours 8.000 14.000
moderate >10 hours 12.000 2.500
high <2 hours 10.667 3.333
high 2 to 5 hours 7.700 4.900
high 5 to 10 hours 11.000 12.000
very high <2 hours 9.700 7.400
very high 2 to 5 hours 12.000 8.500
very high 5 to 10 hours 11.000 10.000
very high >10 hours 9.000 1.000

New technique

The new visualization technique I learned is visualizing data with the lattice package. I found it on sthda.com, a website I often use for tips and tricks on using R. Tips for using this package were suggested on the Visualization page, linked at the top of the screen, and I got curious to try it out.

I also want to give a bit of feedback on this tool: it seems to be somewhat outdated. It has quite a simple syntax, but it is quite hard to find information on the elements of the charts or on customization (it took me about 40 minutes to find out how to switch colors without creating a separate column in the dataset / a new dataset for colors / changing the settings of the entire package). It was fun trying a new technique, but ggplot2 and plotly seem more usable to me.

library("lattice")

# Lattice panel function: draws the panel's points, then overlays a loess
# curve on top of them.
#   x, y: the coordinates supplied to the panel.
# NOTE(review): this function is not passed as `panel =` anywhere in the
# visible code, so it currently appears to be unused.
panel.smoother <- function(x, y) {
  panel.xyplot(x = x, y = y) # scatter of the observations
  panel.loess(x = x, y = y) # smoothed line, drawn after the points
}
# Scatter plot of G2 against G1, one panel per sex (`| sex`).
# type = c("p", "r") draws the points plus a fitted regression line.
# groups = sex makes lattice use the superpose.* settings, which is how each
# sex gets its own colour here.
xyplot(
  G2 ~ G1 | sex,
  data = data,
  groups = sex,
  type = c("p", "r"),
  pch = 19,
  scales = list(cex = .8),
  par.settings = list(
    superpose.symbol = list(
      col = c("#FE7D6A", "#74B72E"),
      pch = 19
    ),
    superpose.line = list(
      col = c("#FE7D6A", "#74B72E"),
      lwd = 2
    )
  ),
  xlab = "Grade for Portuguese 1st semester",
  ylab = "Grade for Portuguese 2nd semester"
)
Fig. 5. Relationship of grades for the 1st and 2nd semester by sex

Fig. 5. Relationship of grades for the 1st and 2nd semester by sex