Summary

Short exploratory data analysis focusing on the alcohol variables from the Portuguese class dataset. The data come from a survey of secondary-school students enrolled in Portuguese language courses.

Kaggle: Student Alcohol Consumption dataset

Dataset origin:

library(ggplot2)
library(plyr)
library(dplyr)
library(gridExtra)
library(alluvial)
library(waffle)
library(extrafont)

# Load the student survey: a comma-separated file with a header row.
# NOTE(review): the path is absolute and machine-specific; adjust it when
# running this analysis on another machine.
data.source <- read.table(
  "C:/Users/marc/Desktop/Data/161019_student_alcohol/student-por.csv",
  sep = ",",
  header = TRUE
)

# Dalc and Walc hold alcohol consumption codes 1-5 (plotted below as workday
# and weekend consumption respectively). Recode them with factor() and
# explicit levels so all five labels are always present and always in the same
# order, even when a code is absent from the data. as.factor() followed by
# mapvalues() would drop the absent level and emit a "not present" message.
alcohol.labels <- c("Very Low", "Low", "Medium", "High", "Very High")

data.source$Dalc <- factor(data.source$Dalc,
                           levels = 1:5,
                           labels = alcohol.labels)

data.source$Walc <- factor(data.source$Walc,
                           levels = 1:5,
                           labels = alcohol.labels)

Overview

#loadfonts(device="win")
#fa <- fontawesome(c('fa-stack-overflow'))

# Register the FontAwesome family used for the waffle glyphs. windowsFonts()
# is only available on Windows builds of R, so skip the registration on other
# platforms instead of aborting the whole script.
if (.Platform$OS.type == "windows") {
  windowsFonts(FontAwesome = windowsFont("FontAwesome"))
}

# One colour per consumption level, in factor-level order.
waffle.col <- c("#00d27f", "#adff00", "#f9d62e", "#fc913a", "#ff4e50")

# Build a five-row waffle chart of "glass" glyphs from a named count vector.
#   parts: named numeric vector, one entry per consumption level
#   title: chart title
# Returns the plot object produced by waffle().
glass.waffle <- function(parts, title) {
  waffle(parts,
         rows = 5,
         use_glyph = "glass",
         size = 2,
         title = title,
         glyph_size = 8,
         xlab = "1 glass == 10 students",
         colors = waffle.col,
         legend_pos = "top")
}

# Workday consumption: count students per level, then scale so that one glyph
# stands for ten students.
alcohol.d <- as.data.frame(table(data.source$Dalc))
par.d <- as.numeric(alcohol.d$Freq)
names(par.d) <- alcohol.d$Var1
par.d <- round(par.d / 10)

c1 <- glass.waffle(par.d, "Workday alcohol consumption among students")

# Weekend consumption: same treatment as above.
alcohol.w <- as.data.frame(table(data.source$Walc))
par.w <- as.numeric(alcohol.w$Freq)
names(par.w) <- alcohol.w$Var1
par.w <- round(par.w / 10)

c2 <- glass.waffle(par.w, "Weekend alcohol consumption among students")

# Stack the two waffle charts vertically.
grid.arrange(c1, c2, nrow = 2)

School and gender

# Jittered points of workday consumption level against school, coloured by sex.
c3 <- ggplot(data = data.source,
             mapping = aes(x = Dalc, y = school, colour = sex)) +
  geom_jitter(alpha = 0.7) +
  scale_colour_manual(values = c("#ff7f50", "#468499")) +
  theme_bw() +
  labs(
    x = "Workday alcohol consumption",
    y = "School",
    title = "Workday alcohol consumption per school and sex"
  )

# Same view for weekend consumption.
c4 <- ggplot(data = data.source,
             mapping = aes(x = Walc, y = school, colour = sex)) +
  geom_jitter(alpha = 0.7) +
  scale_colour_manual(values = c("#ff7f50", "#468499")) +
  theme_bw() +
  labs(
    x = "Weekend alcohol consumption",
    y = "School",
    title = "Weekend alcohol consumption per school and sex"
  )

# Workday plot on top, weekend plot below.
grid.arrange(c3, c4, nrow = 2)

Alcohol and grades

Workday alcohol consumption and grades

# Workday consumption (Dalc) against each of the three grade columns.
# One boxplot per consumption level, filled with the shared level palette;
# the legend is hidden because the x axis already names the levels.

# G1
c5 <- ggplot(data = data.source,
             mapping = aes(x = Dalc, y = G1, fill = Dalc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Alcohol consumption",
       y = "Grade",
       title = "First period grade")

# G2
c6 <- ggplot(data = data.source,
             mapping = aes(x = Dalc, y = G2, fill = Dalc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Alcohol consumption",
       y = "Grade",
       title = "Second period grade")

# G3
c7 <- ggplot(data = data.source,
             mapping = aes(x = Dalc, y = G3, fill = Dalc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Alcohol consumption",
       y = "Grade",
       title = "Final period grade")

# Show the three periods side by side.
grid.arrange(c5, c6, c7, ncol = 3)

  • A very high level of alcohol consumption during the week does not appear to go along with a high final grade.

Weekend alcohol consumption and grades

# Weekend consumption (Walc) against each of the three grade columns.
# One boxplot per consumption level, filled with the shared level palette;
# the legend is hidden because the x axis already names the levels.

# G1
c8 <- ggplot(data = data.source,
             mapping = aes(x = Walc, y = G1, fill = Walc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Alcohol consumption",
       y = "Grade",
       title = "First period grade")

# G2
c9 <- ggplot(data = data.source,
             mapping = aes(x = Walc, y = G2, fill = Walc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Alcohol consumption",
       y = "Grade",
       title = "Second period grade")

# G3
c10 <- ggplot(data = data.source,
              mapping = aes(x = Walc, y = G3, fill = Walc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Alcohol consumption",
       y = "Grade",
       title = "Final period grade")

# Show the three periods side by side.
grid.arrange(c8, c9, c10, ncol = 3)

Alcohol consumption and school absences

# Distribution of school absences for each workday consumption level.
# The legend is hidden because the x axis already names the levels.
ggplot(data.source, aes(x = Dalc, y = absences, fill = Dalc)) +
  geom_violin() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Absences distribution per Workday alcohol consumption") +
  xlab("Alcohol consumption") +
  ylab("Number of school absences")

# Same view for weekend consumption. The y-axis label is set explicitly so
# both plots are labelled identically; without it this plot falls back to
# the raw column name.
ggplot(data.source, aes(x = Walc, y = absences, fill = Walc)) +
  geom_violin() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Absences distribution per Weekend alcohol consumption") +
  xlab("Alcohol consumption") +
  ylab("Number of school absences")

Alcohol consumption and student’s age

# Age histogram (one-year bins), one facet per workday consumption level.
ggplot(data = data.source, mapping = aes(x = age, fill = Dalc)) +
  geom_histogram(binwidth = 1, colour = "black") +
  facet_grid(~Dalc) +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Student's age",
       title = "Workday alcohol consumption per age")

# Age histogram (one-year bins), one facet per weekend consumption level.
ggplot(data = data.source, mapping = aes(x = age, fill = Walc)) +
  geom_histogram(binwidth = 1, colour = "black") +
  facet_grid(~Walc) +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Student's age",
       title = "Weekend alcohol consumption per age")