Research Objective

The main purpose of this investigation is to analyze the information from the Student Alcohol Consumption dataset, and how it correlates with student academic performance, and 32 additional demographic-related factors.

Dataset

d2 = read.csv("student-por.csv")
head(d2)
##   school sex age address famsize Pstatus Medu Fedu     Mjob     Fjob
## 1     GP   F  18       U     GT3       A    4    4  at_home  teacher
## 2     GP   F  17       U     GT3       T    1    1  at_home    other
## 3     GP   F  15       U     LE3       T    1    1  at_home    other
## 4     GP   F  15       U     GT3       T    4    2   health services
## 5     GP   F  16       U     GT3       T    3    3    other    other
## 6     GP   M  16       U     LE3       T    4    3 services    other
##       reason guardian traveltime studytime failures schoolsup famsup paid
## 1     course   mother          2         2        0       yes     no   no
## 2     course   father          1         2        0        no    yes   no
## 3      other   mother          1         2        0       yes     no   no
## 4       home   mother          1         3        0        no    yes   no
## 5       home   father          1         2        0        no    yes   no
## 6 reputation   mother          1         2        0        no    yes   no
##   activities nursery higher internet romantic famrel freetime goout Dalc
## 1         no     yes    yes       no       no      4        3     4    1
## 2         no      no    yes      yes       no      5        3     3    1
## 3         no     yes    yes      yes       no      4        3     2    2
## 4        yes     yes    yes      yes      yes      3        2     2    1
## 5         no     yes    yes       no       no      4        3     2    1
## 6        yes     yes    yes      yes       no      5        4     2    1
##   Walc health absences G1 G2 G3
## 1    1      3        4  0 11 11
## 2    1      3        2  9 11 11
## 3    3      3        6 12 13 12
## 4    1      5        0 14 14 14
## 5    2      5        0 11 13 13
## 6    2      5        6 12 12 13

Categorical Data

Differences in Profession Between the Mothers and Fathers of Students

library(plotly)
mom.job <- c(table(d2$Mjob))
dad.job <- c(table(d2$Fjob))
jobs <- c("At home", "Health", "Other", "Services", "Teacher")
jobs
## [1] "At home"  "Health"   "Other"    "Services" "Teacher"
p1 <- plot_ly(d2, x = ~jobs, y= ~mom.job, type = "bar", name = "Mother Profession") %>%
  add_trace(y= ~dad.job, name = "Father Profession")%>%
  layout(yaxis = list(title = 'Count'), barmode = 'group')
p1

A greater relative proportion of Mothers work in housekeeping, healthcare, and teaching professions. Fathers relatively, are more likely to work in other professions or service-related positions, such as administration and law enforcement.

Differences in Highest Education Levels Completed Between Mothers and Fathers

mom <- c(table(d2$Medu))
dad <- c(table(d2$Fedu))
z <- data.frame(mom, dad)
z
##   mom dad
## 0   6   7
## 1 143 174
## 2 186 209
## 3 139 131
## 4 175 128
edu <- c("None", "Primary Edu", "5th-9th Grade", "Secondary Edu", "Higher Edu")
p2 <- plot_ly(d2, x = ~edu , y= ~mom, type = "bar", name = "Mother Edu Level") %>%
  add_trace(y= ~dad, name = "Father Edu Level")%>%
  layout(yaxis = list(title = 'Count'),
         xaxis = list(title = 'Education Level'),
         barmode = 'group')
p2

A greater proportion of mothers achieved higher education levels in comparison to their male counterparts.

Numerical Data

Difference in Grades Distribution Between Female and Male Students

library(ggplot2)
library(gridExtra)
df1 <- d2 %>% select(sex,G1)
p1<-ggplot(d2, aes(x=G1)) + geom_histogram(fill="dodgerblue", colour="black",binwidth=1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df1[2], df1[1], median), 
      mapping=aes(xintercept=G1), color="red")

df2 <- d2 %>% select(sex,G2)
p2<-ggplot(d2, aes(x=G2)) + geom_histogram(fill="green4", colour="black",binwidth = 1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df2[2], df2[1], median), 
      mapping=aes(xintercept=G2), color="red") 

df3 <- d2 %>% select(sex,G3)
p3<-ggplot(d2, aes(x=G3)) + geom_histogram(fill="yellow2", colour="black",binwidth = 1) +
facet_grid(sex ~ .)+geom_vline(data=aggregate(df3[2], df3[1], median), 
      mapping=aes(xintercept=G3), color="red") 

grid.arrange(p1,p2,p3)

According to the chart above, female students achieved a higher median score than male students across all three exams.

Further Relationship of Variables

Would Parents’ Education Level Affect Students’ Grades?

library(ggplot2)
library(gridExtra)
## Slice the data
names(d2)
##  [1] "school"     "sex"        "age"        "address"    "famsize"   
##  [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
## [11] "reason"     "guardian"   "traveltime" "studytime"  "failures"  
## [16] "schoolsup"  "famsup"     "paid"       "activities" "nursery"   
## [21] "higher"     "internet"   "romantic"   "famrel"     "freetime"  
## [26] "goout"      "Dalc"       "Walc"       "health"     "absences"  
## [31] "G1"         "G2"         "G3"
p1 <- d2[c(7:8,27:28,31:33)]
d3 <- data.frame(p1$Medu,p1$Fedu, p1$Dalc, p1$Walc, G = c(p1[ ,"G1"],p1[, "G2"], p1[, "G3"]))

## Remove the Outliers
clean_d3 <- summary(d3$G)
iq <- IQR(d3$G)
lower <- clean_d3[2]- (1.5*iq)
lower
## 1st Qu. 
##       4
upper <- clean_d3[5] + (1.5*iq)
upper
## 3rd Qu. 
##      20
## Remove the lower bound outliers
newdata <- d3[which(d3$G > lower), ]
table(newdata$G) ##There were no data greater or equal to upper bound
## 
##   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
##   9  19  59 117 172 275 298 240 234 188 122  83  65  36   4
## Relationship between Students' grade and mothers' education level
ggplot(newdata, aes(x=G))+geom_histogram(fill = "yellow", colour="black", binwidth =1)+
  facet_grid(p1.Medu ~ .)+geom_vline(data = aggregate(newdata[5], newdata[1], median),
   mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People") 

## Relationship between Students' grade and fathers' education level
ggplot(newdata, aes(x=G))+geom_histogram(fill = "green1", colour="black", binwidth =1)+
  facet_grid(p1.Fedu ~ .)+geom_vline(data = aggregate(newdata[5], newdata[2], median),
   mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People") 

As we can infer from those histograms, parents’ education levels demonstrate a positive correlation with the academic performance of students.

Does Alcohol Consumption During Schooldays Affect Students’ Academic Performance?

ggplot(newdata, aes(x=G))+geom_histogram(fill = "slateblue", colour="black", binwidth =1)+
  facet_grid(p1.Dalc ~ .)+geom_vline(data = aggregate(newdata[5], newdata[3], median),
    mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People") 

Students who consume less alcohol during schooldays perform better academically in school.

Does Alcohol Consumption on Weekends Affect Students’ Academic Performance?

ggplot(newdata, aes(x=G))+geom_histogram(fill = "cyan", colour="black", binwidth =1)+
  facet_grid(p1.Walc ~ .)+geom_vline(data = aggregate(newdata[5], newdata[4], median),
   mapping = aes(xintercept = G), color = "red" )+labs(x = "Grades", y = "Numbers of People") 

According to the histograms, students who consume less alcohol over the weekend perform better academically than those who consume more.

Would Romantic Relationship Affect Grades?

## Romantic relationship and grade distribution
## Take G1 and G2 median of grade as average score
par(mfrow=c(1,2))
library(ggplot2)
library(scales)
d2 %>% group_by(romantic) %>% summarise(G1_Avg_Score=median(G1,na.rm=T)) %>% ggplot(aes(x=romantic,y=G1_Avg_Score, fill=romantic))+geom_bar(stat="identity")+scale_fill_manual("legend", values = c("yes" = "dodgerblue", "no" = "orange"))

d2 %>% group_by(romantic) %>% summarise(G2_Avg_Score=median(G2,na.rm=T)) %>%
  ggplot(aes(x=romantic,y=G2_Avg_Score, fill=romantic))+geom_bar(stat="identity")+geom_bar(stat="identity")+scale_fill_manual("legend", values = c("yes" = "dodgerblue", "no" = "orange"))

According to the bar graphs, students not involved in romantic relationships perform better academically.

Proof Central Limit Theorem

## Use G1 data as population to proof Central Limit Theorem
g1.b <- c(d2$G1)
mean(g1.b)
## [1] 11.39908
# Sample size = 10
repeated <- 10000
sample.size <- 10
xbar <- numeric(repeated)
for(i in 1: repeated){
  xbar[i] <- mean(sample(g1.b, 10, replace = FALSE))
}

# Sample size = 20
repeated <- 10000
sample.size1 <- 20
xbar1 <- numeric(repeated)
for(i in 1: repeated){
  xbar1[i] <- mean(sample(g1.b, 20, replace = FALSE))
}

# Sample size = 30
repeated <- 10000
sample.size2 <- 30
xbar2 <- numeric(repeated)
for(i in 1: repeated){
  xbar2[i] <- mean(sample(g1.b, 30, replace = FALSE))
}

# Sample size = 50
repeated <- 10000
sample.size3 <- 50
xbar3 <- numeric(repeated)
for(i in 1: repeated){
  xbar3[i] <- mean(sample(g1.b, 50, replace = FALSE))
}

The mean of different size of sample means

cat(mean(xbar), mean(xbar1), mean(xbar2), mean(xbar3))
## 11.39101 11.40014 11.39192 11.39535

The standard deviation of different size of sample means

cat(sd(xbar),sd(xbar1),sd(xbar2),sd(xbar3))
## 0.8546925 0.6063982 0.4915786 0.3711083

While the sample size increase, the standard deviation of sample menas will decrease. The distribution of sample mean will be approximately normal distributed.

Let’s see the distribution of sample means under different sample size.

par(mfrow = c(2,2))
hist(xbar, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
     xlab = "Grade",main = "Sample size = 10")
hist(xbar1, prob = TRUE, breaks = 15, xlim = c(8,14), ylim = c(0, 1),
     xlab = "Grade",main = "Sample size = 20")
hist(xbar2, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
     xlab = "Grade",main = "Sample size = 30")
hist(xbar3, prob = TRUE, breaks = 15, xlim = c(5,18), ylim = c(0, 1),
     xlab = "Grade",main = "Sample size = 50")

Logistic Regression - Predict Grade 3 Performance

To reward students’ participation and efforts in Portuguese class, the teacher is planning to hold a celebration party has free pizza and BBQ. However, the bountiful feast would depend on students’ academically performance, especially the final grade. If students score high in the final grade, the teacher would generously add Popeyes fried chicken into feast list.

# We want to verify the rules whether the students score high/low in the third grade. To setup the evaluation standarad, we assume G3 >= 12 as "high".

# For repeatability of samples
set.seed(10)
# Create Training Data
d2$high <- ifelse(d2$G3>=12,1,0)
d2$high <- as.factor(d2$high)
d2<- d2[,c(1:32,34)]
d2 <- d2 %>% mutate(schoolsup=as.factor(schoolsup),famsup=as.factor(famsup),paid=as.factor(paid),activities=as.factor(activities),nursery=as.factor(nursery),higher=as.factor(higher),internet=as.factor(internet),romantic=as.factor(romantic)) 
d2_train <- sample(1:nrow(d2), 0.9*nrow(d2))
training_data <- d2[d2_train, ]

# Create Test Data
test <- -d2_train
testing_data <- d2[test,]

# Build Logit Models and Predict
log_model <- glm(high~.,data=training_data,family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Model Diagnostics
summary(log_model)
## 
## Call:
## glm(formula = high ~ ., family = binomial, data = training_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1324  -0.0176   0.0000   0.0245   2.5595  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -63.652209  10.478467  -6.075 1.24e-09 ***
## schoolMS          -0.545839   0.746322  -0.731  0.46455    
## sexM              -0.681313   0.602173  -1.131  0.25788    
## age                0.852051   0.284503   2.995  0.00275 ** 
## addressU          -0.255435   0.632087  -0.404  0.68613    
## famsizeLE3        -0.549217   0.562229  -0.977  0.32864    
## PstatusT          -0.771938   0.987802  -0.781  0.43453    
## Medu              -0.179009   0.355384  -0.504  0.61447    
## Fedu               0.177079   0.343096   0.516  0.60577    
## Mjobhealth         1.999162   1.597205   1.252  0.21069    
## Mjobother          1.797232   0.851429   2.111  0.03479 *  
## Mjobservices       2.020078   0.986222   2.048  0.04053 *  
## Mjobteacher        3.272536   1.351425   2.422  0.01545 *  
## Fjobhealth        -1.117100   2.142081  -0.522  0.60202    
## Fjobother         -1.748823   1.241409  -1.409  0.15891    
## Fjobservices      -1.136750   1.283551  -0.886  0.37582    
## Fjobteacher       -0.605353   2.095789  -0.289  0.77270    
## reasonhome         0.776047   0.657052   1.181  0.23756    
## reasonother        0.793563   0.894768   0.887  0.37514    
## reasonreputation   0.061417   0.729778   0.084  0.93293    
## guardianmother    -1.087434   0.646850  -1.681  0.09274 .  
## guardianother      1.248981   1.265718   0.987  0.32375    
## traveltime         0.657790   0.374754   1.755  0.07922 .  
## studytime          0.397463   0.314744   1.263  0.20666    
## failures          -3.351393   1.152607  -2.908  0.00364 ** 
## schoolsupyes      -0.311465   0.725205  -0.429  0.66757    
## famsupyes         -0.018874   0.579119  -0.033  0.97400    
## paidyes            0.488711   1.066881   0.458  0.64690    
## activitiesyes      1.011579   0.580293   1.743  0.08129 .  
## nurseryyes        -1.073394   0.718518  -1.494  0.13520    
## higheryes          2.246232   1.800826   1.247  0.21227    
## internetyes        0.903722   0.660500   1.368  0.17124    
## romanticyes        0.337297   0.554257   0.609  0.54282    
## famrel             0.393319   0.319298   1.232  0.21801    
## freetime          -0.130476   0.274078  -0.476  0.63404    
## goout              0.270205   0.293591   0.920  0.35739    
## Dalc               0.082510   0.379837   0.217  0.82803    
## Walc              -0.243368   0.328880  -0.740  0.45931    
## health            -0.006127   0.216565  -0.028  0.97743    
## absences          -0.087401   0.063477  -1.377  0.16854    
## G1                 0.741057   0.263694   2.810  0.00495 ** 
## G2                 3.359930   0.532125   6.314 2.72e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 805.97  on 583  degrees of freedom
## Residual deviance: 129.12  on 542  degrees of freedom
## AIC: 213.12
## 
## Number of Fisher Scoring iterations: 9
predict_log <- predict(log_model,newdata=testing_data,type="response")
predict_log <- round(predict_log)
mean(predict_log==testing_data$high)
## [1] 0.8769231

Use logistic regression to predict whether the students score high or low. The output of this model have to be evaluated by using a unbiased probability threshold of 0.5.Therefore, we could see that 87% of results were accurate.

Based on prediction result, failures, age, G1 and G2 are significant predictors for model.

ROC Curves

library(ROCR)

pred.obj <- prediction(predictions = predict_log,labels=testing_data$high)
roc.perf <- performance(pred.obj,measure = "tpr",x.measure = "fpr")

plot(roc.perf,main="ROC Curve for Logistic Regression",col="red",lwd=3)
abline(a=0,b=1,lwd=2,lty=2)

The ROC curve is closer to the left-hand boarder and top boarder which means the test of prediction model is more accurate.

AUC Value

The area under the ROC curve?

perf.auc <- performance(pred.obj,measure = "auc")
unlist(perf.auc@y.values)
## [1] 0.8768939

Summary and Conclusion