Short exploratory data analysis focusing on the alcohol variables from the Portuguese class dataset. The data is a survey of students in Portuguese-language courses in secondary school.
Kaggle: Student Alcohol Consumption dataset
Dataset origin:
Fabio Pagnotta, Hossain Mohammad Amran
Department of Computer Science, University of Camerino
library(ggplot2)
library(plyr)
library(dplyr)
library(gridExtra)
library(alluvial)
library(waffle)
library(extrafont)
# Load the survey as a data frame: comma-separated file with a header row.
data.source <- read.table(
  file = "C:/Users/marc/Desktop/Data/161019_student_alcohol/student-por.csv",
  header = TRUE,
  sep = ","
)
# Recode the 1-5 alcohol consumption scores as labelled factors
# (Dalc is plotted below as workday consumption, Walc as weekend consumption).
#
# The complete 1:5 level set is passed to factor() explicitly, so both columns
# always carry all five levels in the same order, even when a score never
# occurs in the data. Relabelling the levels of as.factor() output instead
# only touches the levels that are present, which would leave fewer than five
# levels and shift the level-to-colour mapping of the five-colour palettes
# used by the plots. Scores outside 1:5 become NA.
consumption.labels <- c("Very Low", "Low", "Medium", "High", "Very High")
data.source$Dalc <- factor(
  data.source$Dalc,
  levels = 1:5,
  labels = consumption.labels
)
data.source$Walc <- factor(
  data.source$Walc,
  levels = 1:5,
  labels = consumption.labels
)
#loadfonts(device="win")
#fa <- fontawesome(c('fa-stack-overflow'))
# Register the "FontAwesome" family with the Windows graphics device.
# NOTE(review): presumably required by the glyph-based waffle charts - confirm.
# windowsFonts()/windowsFont() are only available on Windows builds of R, so
# the call is guarded to keep the rest of the script runnable elsewhere
# (other platforms need their own font setup, e.g. via extrafont).
if (.Platform$OS.type == "windows") {
  windowsFonts(FontAwesome = windowsFont("FontAwesome"))
}
# Workday (Dalc) waffle chart ----
# Count students per consumption level, then scale to one glyph per ten
# students (rounded), keeping the level labels as names.
alcohol.d <- as.data.frame(table(data.source$Dalc))
par.d <- round(setNames(as.numeric(alcohol.d$Freq), alcohol.d$Var1) / 10)

# Five-colour palette running from green to red; reused by the later plots.
waffle.col <- c("#00d27f", "#adff00", "#f9d62e", "#fc913a", "#ff4e50")

c1 <- waffle(
  par.d,
  rows = 5,
  size = 2,
  colors = waffle.col,
  title = "Workday alcohol consumption among students",
  xlab = "1 glass == 10 students",
  use_glyph = "glass",
  glyph_size = 8,
  legend_pos = "top"
)
# Weekend (Walc) waffle chart ----
# Same recipe as the workday chart: per-level counts scaled to one glyph per
# ten students (rounded), named by consumption level.
alcohol.w <- as.data.frame(table(data.source$Walc))
par.w <- round(setNames(as.numeric(alcohol.w$Freq), alcohol.w$Var1) / 10)

c2 <- waffle(
  par.w,
  rows = 5,
  size = 2,
  colors = waffle.col,
  title = "Weekend alcohol consumption among students",
  xlab = "1 glass == 10 students",
  use_glyph = "glass",
  glyph_size = 8,
  legend_pos = "top"
)

# Stack the workday chart above the weekend chart.
grid.arrange(c1, c2, nrow = 2)
# Jittered points of workday consumption level against school, coloured by sex.
c3 <- ggplot(data.source, aes(x = Dalc, y = school, colour = sex)) +
  geom_jitter(alpha = 0.7) +
  scale_color_manual(values = c("#ff7f50", "#468499")) +
  labs(
    x = "Workday alcohol consumption",
    y = "School",
    title = "Workday alcohol consumption per school and sex"
  ) +
  theme_bw()
# Jittered points of weekend consumption level against school, coloured by sex.
c4 <- ggplot(data.source, aes(x = Walc, y = school, colour = sex)) +
  geom_jitter(alpha = 0.7) +
  scale_color_manual(values = c("#ff7f50", "#468499")) +
  labs(
    x = "Weekend alcohol consumption",
    y = "School",
    title = "Weekend alcohol consumption per school and sex"
  ) +
  theme_bw()

# Workday plot on top, weekend plot underneath.
grid.arrange(c3, c4, nrow = 2)
# Workday consumption vs. grades ----
# One boxplot per grade column (G1, G2, G3), grouped and filled by workday
# consumption level; the legend is hidden because the x axis already names
# the levels.
c5 <- ggplot(data.source, aes(x = Dalc, y = G1, fill = Dalc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  labs(x = "Alcohol consumption", y = "Grade", title = "First period grade") +
  theme_bw() +
  theme(legend.position = "none")

c6 <- ggplot(data.source, aes(x = Dalc, y = G2, fill = Dalc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  labs(x = "Alcohol consumption", y = "Grade", title = "Second period grade") +
  theme_bw() +
  theme(legend.position = "none")

c7 <- ggplot(data.source, aes(x = Dalc, y = G3, fill = Dalc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  labs(x = "Alcohol consumption", y = "Grade", title = "Final period grade") +
  theme_bw() +
  theme(legend.position = "none")

# Show the three periods side by side.
grid.arrange(c5, c6, c7, ncol = 3)
# Weekend consumption vs. grades ----
# One boxplot per grade column (G1, G2, G3), grouped and filled by weekend
# consumption level; the legend is hidden because the x axis already names
# the levels.
c8 <- ggplot(data.source, aes(x = Walc, y = G1, fill = Walc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  labs(x = "Alcohol consumption", y = "Grade", title = "First period grade") +
  theme_bw() +
  theme(legend.position = "none")

c9 <- ggplot(data.source, aes(x = Walc, y = G2, fill = Walc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  labs(x = "Alcohol consumption", y = "Grade", title = "Second period grade") +
  theme_bw() +
  theme(legend.position = "none")

c10 <- ggplot(data.source, aes(x = Walc, y = G3, fill = Walc)) +
  geom_boxplot() +
  scale_fill_manual(values = waffle.col) +
  labs(x = "Alcohol consumption", y = "Grade", title = "Final period grade") +
  theme_bw() +
  theme(legend.position = "none")

# Show the three periods side by side.
grid.arrange(c8, c9, c10, ncol = 3)
# Violin plot of the absences column for each workday consumption level.
ggplot(data.source, aes(x = Dalc, y = absences, fill = Dalc)) +
  geom_violin() +
  scale_fill_manual(values = waffle.col) +
  labs(
    title = "Absences distribution per Workday alcohol consumption",
    x = "Alcohol consumption",
    y = "Number of school absences"
  ) +
  theme_bw() +
  theme(legend.position = "none")
# Violin plot of the absences column for each weekend consumption level.
# The y axis is labelled explicitly, matching the workday version of this
# plot; without it the axis shows the raw column name "absences".
ggplot(data.source, aes(x = Walc, y = absences, fill = Walc)) +
  geom_violin() +
  scale_fill_manual(values = waffle.col) +
  theme_bw() +
  theme(legend.position = "none") +
  ggtitle("Absences distribution per Weekend alcohol consumption") +
  xlab("Alcohol consumption") +
  ylab("Number of school absences")
# Age histograms (one-year bins), one facet column per workday consumption
# level; the fill colour repeats the facet, so the legend is dropped.
ggplot(data.source, aes(x = age, fill = Dalc)) +
  geom_histogram(binwidth = 1, colour = "black") +
  facet_grid(. ~ Dalc) +
  scale_fill_manual(values = waffle.col) +
  labs(title = "Workday alcohol consumption per age", x = "Student's age") +
  theme_bw() +
  theme(legend.position = "none")
# Age histograms (one-year bins), one facet column per weekend consumption
# level; the fill colour repeats the facet, so the legend is dropped.
ggplot(data.source, aes(x = age, fill = Walc)) +
  geom_histogram(binwidth = 1, colour = "black") +
  facet_grid(. ~ Walc) +
  scale_fill_manual(values = waffle.col) +
  labs(title = "Weekend alcohol consumption per age", x = "Student's age") +
  theme_bw() +
  theme(legend.position = "none")