The main purpose of this investigation is to analyze the Student Alcohol Consumption dataset and examine how students' alcohol consumption relates to their academic performance, alongside the other demographic, social, and school-related factors recorded for each student.
d2 = read.csv("student-por.csv")
head(d2)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob
## 1 GP F 18 U GT3 A 4 4 at_home teacher
## 2 GP F 17 U GT3 T 1 1 at_home other
## 3 GP F 15 U LE3 T 1 1 at_home other
## 4 GP F 15 U GT3 T 4 2 health services
## 5 GP F 16 U GT3 T 3 3 other other
## 6 GP M 16 U LE3 T 4 3 services other
## reason guardian traveltime studytime failures schoolsup famsup paid
## 1 course mother 2 2 0 yes no no
## 2 course father 1 2 0 no yes no
## 3 other mother 1 2 0 yes no no
## 4 home mother 1 3 0 no yes no
## 5 home father 1 2 0 no yes no
## 6 reputation mother 1 2 0 no yes no
## activities nursery higher internet romantic famrel freetime goout Dalc
## 1 no yes yes no no 4 3 4 1
## 2 no no yes yes no 5 3 3 1
## 3 no yes yes yes no 4 3 2 2
## 4 yes yes yes yes yes 3 2 2 1
## 5 no yes yes no no 4 3 2 1
## 6 yes yes yes yes no 5 4 2 1
## Walc health absences G1 G2 G3
## 1 1 3 4 0 11 11
## 2 1 3 2 9 11 11
## 3 3 3 6 12 13 12
## 4 1 5 0 14 14 14
## 5 2 5 0 11 13 13
## 6 2 5 6 12 12 13
library(plotly)
mom.job <- c(table(d2$Mjob))
dad.job <- c(table(d2$Fjob))
jobs <- c("At home", "Health", "Other", "Services", "Teacher")
jobs
## [1] "At home" "Health" "Other" "Services" "Teacher"
p1 <- plot_ly(d2, x = ~jobs, y= ~mom.job, type = "bar", name = "Mother Profession") %>%
add_trace(y= ~dad.job, name = "Father Profession")%>%
layout(yaxis = list(title = 'Count'), barmode = 'group')
p1
A greater relative proportion of mothers stay at home or work in healthcare and teaching professions. Fathers, by comparison, are more likely to hold service-related positions (such as administration or law enforcement) or jobs in the "other" category.
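As a quick numerical check on this reading, a minimal sketch (using only base R's prop.table on the counts already tabulated above) converts the job counts into proportions so the two distributions can be compared directly:
# Share of each job category among mothers and fathers
round(rbind(Mother = prop.table(table(d2$Mjob)),
Father = prop.table(table(d2$Fjob))), 3)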
mom <- c(table(d2$Medu))
dad <- c(table(d2$Fedu))
z <- data.frame(mom, dad)
z
## mom dad
## 0 6 7
## 1 143 174
## 2 186 209
## 3 139 131
## 4 175 128
edu <- c("None", "Primary Edu", "5th-9th Grade", "Secondary Edu", "Higher Edu")
# Keep the education levels in their natural order on the categorical x-axis
edu <- factor(edu, levels = edu)
p2 <- plot_ly(d2, x = ~edu , y= ~mom, type = "bar", name = "Mother Edu Level") %>%
add_trace(y= ~dad, name = "Father Edu Level")%>%
layout(yaxis = list(title = 'Count'),
xaxis = list(title = 'Education Level'),
barmode = 'group')
p2
A greater proportion of mothers achieved higher education levels in comparison to their male counterparts.
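The same claim can be checked numerically; a one-line sketch comparing the share of mothers and fathers at the highest education level (code 4 = higher education):
# Proportion of mothers vs. fathers with higher education (Medu/Fedu == 4)
c(Mother = mean(d2$Medu == 4), Father = mean(d2$Fedu == 4))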
library(ggplot2)
library(gridExtra)
library(dplyr)   # needed for %>% and select()
df1 <- d2 %>% select(sex,G1)
p1<-ggplot(d2, aes(x=G1)) + geom_histogram(fill="dodgerblue", colour="black",binwidth=1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df1[2], df1[1], median),
mapping=aes(xintercept=G1), color="red")
df2 <- d2 %>% select(sex,G2)
p2<-ggplot(d2, aes(x=G2)) + geom_histogram(fill="green4", colour="black",binwidth = 1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df2[2], df2[1], median),
mapping=aes(xintercept=G2), color="red")
df3 <- d2 %>% select(sex,G3)
p3<-ggplot(d2, aes(x=G3)) + geom_histogram(fill="yellow2", colour="black",binwidth = 1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df3[2], df3[1], median),
mapping=aes(xintercept=G3), color="red")
grid.arrange(p1,p2,p3)
According to the chart above, female students achieved a higher median score than male students across all three exams.
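The medians marked by the red lines can also be read off directly; a short sketch, assuming the same d2 data frame used for the histograms:
# Median grade by sex for each of the three exams
aggregate(cbind(G1, G2, G3) ~ sex, data = d2, FUN = median)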
library(ggplot2)
library(gridExtra)
## Slice the data
names(d2)
## [1] "school" "sex" "age" "address" "famsize"
## [6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
## [11] "reason" "guardian" "traveltime" "studytime" "failures"
## [16] "schoolsup" "famsup" "paid" "activities" "nursery"
## [21] "higher" "internet" "romantic" "famrel" "freetime"
## [26] "goout" "Dalc" "Walc" "health" "absences"
## [31] "G1" "G2" "G3"
p1 <- d2[c(7:8,27:28,31:33)]   # Medu, Fedu, Dalc, Walc, G1, G2, G3
# Stack G1, G2, and G3 into a single column G; the parent and alcohol columns are recycled to match its length
d3 <- data.frame(p1$Medu,p1$Fedu, p1$Dalc, p1$Walc, G = c(p1[ ,"G1"],p1[, "G2"], p1[, "G3"]))
## Remove the Outliers
clean_d3 <- summary(d3$G)
iq <- IQR(d3$G)
lower <- clean_d3[2]- (1.5*iq)
lower
## 1st Qu.
## 4
upper <- clean_d3[5] + (1.5*iq)
upper
## 3rd Qu.
## 20
## Remove the lower bound outliers
newdata <- d3[which(d3$G > lower), ]
table(newdata$G) ## No observations exceed the upper bound, so only the lower-bound outliers are removed
##
## 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
## 9 19 59 117 172 275 298 240 234 188 122 83 65 36 4
## Relationship between Students' grade and mothers' education level
ggplot(newdata, aes(x=G))+geom_histogram(fill = "yellow", colour="black", binwidth =1)+
facet_grid(p1.Medu ~ .)+geom_vline(data = aggregate(newdata[5], newdata[1], median),
mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People")
## Relationship between Students' grade and fathers' education level
ggplot(newdata, aes(x=G))+geom_histogram(fill = "green1", colour="black", binwidth =1)+
facet_grid(p1.Fedu ~ .)+geom_vline(data = aggregate(newdata[5], newdata[2], median),
mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People")
As the histograms show, students' median grades tend to rise with their parents' education levels, suggesting a positive association between parental education and academic performance.
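A compact way to see the same trend is to tabulate the median pooled grade at each education level; a minimal sketch using the newdata frame built above:
# Median of the pooled grades (G) by mother's and by father's education level
tapply(newdata$G, newdata$p1.Medu, median)
tapply(newdata$G, newdata$p1.Fedu, median)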
ggplot(newdata, aes(x=G))+geom_histogram(fill = "slateblue", colour="black", binwidth =1)+
facet_grid(p1.Dalc ~ .)+geom_vline(data = aggregate(newdata[5], newdata[3], median),
mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People")
Students who consume less alcohol on school days tend to achieve higher median grades.
ggplot(newdata, aes(x=G))+geom_histogram(fill = "cyan", colour="black", binwidth =1)+
facet_grid(p1.Walc ~ .)+geom_vline(data = aggregate(newdata[5], newdata[4], median),
mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People")
According to the histograms, students who consume less alcohol over the weekend perform better academically than those who consume more.
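The strength of these patterns can be summarized with a rank correlation; a brief sketch, treating the 1-5 alcohol scales as ordinal and using Spearman's rho:
# Spearman correlation between alcohol consumption and the pooled grades
cor(newdata$G, newdata$p1.Dalc, method = "spearman")
cor(newdata$G, newdata$p1.Walc, method = "spearman")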
## Romantic relationship and grade distribution
## Use the median of G1 and of G2 as the typical score for each group
library(ggplot2)
library(scales)
p_g1 <- d2 %>% group_by(romantic) %>% summarise(G1_Avg_Score=median(G1,na.rm=T)) %>%
ggplot(aes(x=romantic,y=G1_Avg_Score, fill=romantic))+geom_bar(stat="identity")+scale_fill_manual("legend", values = c("yes" = "dodgerblue", "no" = "orange"))
p_g2 <- d2 %>% group_by(romantic) %>% summarise(G2_Avg_Score=median(G2,na.rm=T)) %>%
ggplot(aes(x=romantic,y=G2_Avg_Score, fill=romantic))+geom_bar(stat="identity")+scale_fill_manual("legend", values = c("yes" = "dodgerblue", "no" = "orange"))
## Arrange the two ggplot panels side by side (par(mfrow) has no effect on ggplot objects)
grid.arrange(p_g1, p_g2, ncol = 2)
According to the bar graphs, students not involved in romantic relationships perform better academically.
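Whether this gap is larger than chance variation could be checked with a simple nonparametric test; a sketch, assuming a Wilcoxon rank-sum test on the first two exam scores is an acceptable check:
# Compare G1 and G2 between students in and not in a romantic relationship
wilcox.test(G1 ~ romantic, data = d2)
wilcox.test(G2 ~ romantic, data = d2)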
## Use the G1 data as the population to demonstrate the Central Limit Theorem
g1.b <- c(d2$G1)
mean(g1.b)
## [1] 11.39908
# Draw 10,000 repeated samples of each size and record the sample means
repeated <- 10000
# Sample size = 10
sample.size <- 10
xbar <- numeric(repeated)
for(i in 1:repeated){
xbar[i] <- mean(sample(g1.b, sample.size, replace = FALSE))
}
# Sample size = 20
sample.size1 <- 20
xbar1 <- numeric(repeated)
for(i in 1:repeated){
xbar1[i] <- mean(sample(g1.b, sample.size1, replace = FALSE))
}
# Sample size = 30
sample.size2 <- 30
xbar2 <- numeric(repeated)
for(i in 1:repeated){
xbar2[i] <- mean(sample(g1.b, sample.size2, replace = FALSE))
}
# Sample size = 50
sample.size3 <- 50
xbar3 <- numeric(repeated)
for(i in 1:repeated){
xbar3[i] <- mean(sample(g1.b, sample.size3, replace = FALSE))
}
The means of the sample means for each sample size:
cat(mean(xbar), mean(xbar1), mean(xbar2), mean(xbar3))
## 11.39101 11.40014 11.39192 11.39535
The standard deviations of the sample means for each sample size:
cat(sd(xbar),sd(xbar1),sd(xbar2),sd(xbar3))
## 0.8546925 0.6063982 0.4915786 0.3711083
As the sample size increases, the standard deviation of the sample means decreases, and the distribution of the sample means becomes approximately normal.
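This matches the Central Limit Theorem's prediction that the standard error of the sample mean is roughly sigma/sqrt(n). A short sketch comparing the theoretical values (using sd(g1.b) as an estimate of sigma, and ignoring the finite-population correction) with the simulated ones:
# Theoretical standard errors sigma/sqrt(n) versus the simulated sd of the sample means
sigma <- sd(g1.b)
round(rbind(theoretical = sigma / sqrt(c(10, 20, 30, 50)),
simulated = c(sd(xbar), sd(xbar1), sd(xbar2), sd(xbar3))), 3)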
Let's look at the distribution of the sample means under each sample size.
par(mfrow = c(2,2))
hist(xbar, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
xlab = "Grade",main = "Sample size = 10")
hist(xbar1, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
xlab = "Grade",main = "Sample size = 20")
hist(xbar2, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
xlab = "Grade",main = "Sample size = 30")
hist(xbar3, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
xlab = "Grade",main = "Sample size = 50")
To reward students' participation and effort in the Portuguese class, the teacher is planning to hold a celebration party with free pizza and BBQ. However, the bounty of the feast depends on the students' academic performance, especially the final grade. If students score high on the final grade, the teacher will generously add Popeyes fried chicken to the menu.
# We want to predict whether a student scores high or low on the final grade. As the evaluation standard, we treat G3 >= 12 as "high".
# For repeatability of samples
set.seed(10)
# Create Training Data
d2$high <- ifelse(d2$G3>=12,1,0)
d2$high <- as.factor(d2$high)
d2<- d2[,c(1:32,34)]
d2 <- d2 %>% mutate(schoolsup=as.factor(schoolsup),famsup=as.factor(famsup),paid=as.factor(paid),activities=as.factor(activities),nursery=as.factor(nursery),higher=as.factor(higher),internet=as.factor(internet),romantic=as.factor(romantic))
d2_train <- sample(1:nrow(d2), 0.9*nrow(d2))
training_data <- d2[d2_train, ]
# Create Test Data
test <- -d2_train
testing_data <- d2[test,]
# Build Logit Models and Predict
log_model <- glm(high~.,data=training_data,family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Model Diagnostics
summary(log_model)
##
## Call:
## glm(formula = high ~ ., family = binomial, data = training_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1324 -0.0176 0.0000 0.0245 2.5595
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -63.652209 10.478467 -6.075 1.24e-09 ***
## schoolMS -0.545839 0.746322 -0.731 0.46455
## sexM -0.681313 0.602173 -1.131 0.25788
## age 0.852051 0.284503 2.995 0.00275 **
## addressU -0.255435 0.632087 -0.404 0.68613
## famsizeLE3 -0.549217 0.562229 -0.977 0.32864
## PstatusT -0.771938 0.987802 -0.781 0.43453
## Medu -0.179009 0.355384 -0.504 0.61447
## Fedu 0.177079 0.343096 0.516 0.60577
## Mjobhealth 1.999162 1.597205 1.252 0.21069
## Mjobother 1.797232 0.851429 2.111 0.03479 *
## Mjobservices 2.020078 0.986222 2.048 0.04053 *
## Mjobteacher 3.272536 1.351425 2.422 0.01545 *
## Fjobhealth -1.117100 2.142081 -0.522 0.60202
## Fjobother -1.748823 1.241409 -1.409 0.15891
## Fjobservices -1.136750 1.283551 -0.886 0.37582
## Fjobteacher -0.605353 2.095789 -0.289 0.77270
## reasonhome 0.776047 0.657052 1.181 0.23756
## reasonother 0.793563 0.894768 0.887 0.37514
## reasonreputation 0.061417 0.729778 0.084 0.93293
## guardianmother -1.087434 0.646850 -1.681 0.09274 .
## guardianother 1.248981 1.265718 0.987 0.32375
## traveltime 0.657790 0.374754 1.755 0.07922 .
## studytime 0.397463 0.314744 1.263 0.20666
## failures -3.351393 1.152607 -2.908 0.00364 **
## schoolsupyes -0.311465 0.725205 -0.429 0.66757
## famsupyes -0.018874 0.579119 -0.033 0.97400
## paidyes 0.488711 1.066881 0.458 0.64690
## activitiesyes 1.011579 0.580293 1.743 0.08129 .
## nurseryyes -1.073394 0.718518 -1.494 0.13520
## higheryes 2.246232 1.800826 1.247 0.21227
## internetyes 0.903722 0.660500 1.368 0.17124
## romanticyes 0.337297 0.554257 0.609 0.54282
## famrel 0.393319 0.319298 1.232 0.21801
## freetime -0.130476 0.274078 -0.476 0.63404
## goout 0.270205 0.293591 0.920 0.35739
## Dalc 0.082510 0.379837 0.217 0.82803
## Walc -0.243368 0.328880 -0.740 0.45931
## health -0.006127 0.216565 -0.028 0.97743
## absences -0.087401 0.063477 -1.377 0.16854
## G1 0.741057 0.263694 2.810 0.00495 **
## G2 3.359930 0.532125 6.314 2.72e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 805.97 on 583 degrees of freedom
## Residual deviance: 129.12 on 542 degrees of freedom
## AIC: 213.12
##
## Number of Fisher Scoring iterations: 9
predict_log <- predict(log_model,newdata=testing_data,type="response")
predict_log <- round(predict_log)
mean(predict_log==testing_data$high)
## [1] 0.8769231
We use logistic regression to predict whether each student scores high or low. The model's predicted probabilities are converted to class labels using an unbiased threshold of 0.5; under this rule, approximately 88% of the test-set predictions are correct.
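A confusion matrix makes the accuracy figure more concrete; a quick sketch using the rounded predictions from above:
# Confusion matrix on the test set: rows = predicted class, columns = actual class
table(predicted = predict_log, actual = testing_data$high)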
Based on the model summary, failures, age, G1, and G2 are significant predictors in the model.
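One optional follow-up, not part of the original analysis, would be to refit a reduced model that keeps only these predictors and check whether the test accuracy holds up; a hypothetical sketch:
# Reduced logistic model with only the predictors flagged as significant
log_model_small <- glm(high ~ age + failures + G1 + G2,
data = training_data, family = binomial)
predict_small <- round(predict(log_model_small, newdata = testing_data, type = "response"))
mean(predict_small == testing_data$high)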
library(ROCR)
pred.obj <- prediction(predictions = predict_log,labels=testing_data$high)
roc.perf <- performance(pred.obj,measure = "tpr",x.measure = "fpr")
plot(roc.perf,main="ROC Curve for Logistic Regression",col="red",lwd=3)
abline(a=0,b=1,lwd=2,lty=2)
The ROC curve bends toward the top-left corner of the plot, which indicates that the model's predictions are reasonably accurate.
What is the area under the ROC curve?
perf.auc <- performance(pred.obj,measure = "auc")
unlist(perf.auc@y.values)
## [1] 0.8768939
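One caveat worth noting: the prediction object above was built from the rounded 0/1 predictions, whereas an ROC curve is conventionally computed from the raw predicted probabilities so that every threshold is traced out. A sketch under that convention:
# ROC/AUC from the unrounded predicted probabilities rather than the rounded classes
predict_prob <- predict(log_model, newdata = testing_data, type = "response")
pred.obj2 <- prediction(predictions = predict_prob, labels = testing_data$high)
unlist(performance(pred.obj2, measure = "auc")@y.values)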