Our final project examines whether alcohol consumption has any predictive power over students' average grades. The dataset comes from "https://archive.ics.uci.edu/ml/datasets/STUDENT+ALCOHOL+CONSUMPTION". We are going to clean the data, conduct EDA, and build models to predict student average grades, keeping in mind that other features may also be important predictors.
#First, we are going to combine both tables into one. Some students appear in both tables, so we would like to use their records to examine whether math and Portuguese grades are comparable.
library(ggplot2)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(alluvial)
library(extrafont)
## Registering fonts with R
math=read.table("student-mat.csv",sep=",",header=TRUE)
port=read.table("student-por.csv",sep=",",header=TRUE)

#Here, we are going to merge the two tables into one. The attribute "paid" is course specific rather than student specific, so it is not used as a merge key.
source = merge(math,port,by=c("school","sex","age","address","famsize","Pstatus",
                            "Medu","Fedu","Mjob","Fjob","reason","nursery","internet",
                            "guardian","traveltime","studytime","failures",
                            "schoolsup","famsup","activities","higher","romantic",
                            "famrel","freetime","goout","Dalc","Walc","health","absences"))
print(nrow(source)) 
## [1] 85
#There are 85 students who appear in both tables, so we examine their average math and Portuguese grades.
First, among the 85 students, no one reported high or very high daily alcohol consumption. Second, almost all of those who earned relatively high scores consumed very low levels of alcohol on weekdays. Third, math and Portuguese grades appear to be highly correlated. When we regress Portuguese grades on math grades, the adjusted R-squared is 0.55, which corresponds to a correlation coefficient of about 0.74: roughly 55% of the variation in Portuguese grades can be explained by the variation in math grades. This suggests we can go ahead and combine the two tables, since the average grade in either math or Portuguese reflects general student performance.
 
#Here we calculate the mean of the three period grades (G1, G2, G3) for the math and Portuguese classes.
source$mathgrades=rowMeans(cbind(source$G1.x,source$G2.x,source$G3.x))
source$portgrades=rowMeans(cbind(source$G1.y,source$G2.y,source$G3.y))
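#As a quick check of the correlation claim above (a sketch; exact figures depend on the merge), regress the Portuguese average on the math average:
gradecheck <- lm(portgrades ~ mathgrades, data = source)
summary(gradecheck)$adj.r.squared          #roughly 0.55, as reported above
cor(source$mathgrades, source$portgrades)  #roughly 0.74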
#Convert Dalc to a labelled factor.
source$Dalc <- as.factor(source$Dalc)      
source$Dalc <- mapvalues(source$Dalc, 
                              from = 1:5, 
                              to = c("Very Low", "Low", "Medium", "High", "Very High"))
## The following `from` values were not present in `x`: 4, 5
str=ggplot(source, aes(x=mathgrades, y=portgrades)) +
 geom_point(aes(colour=factor(Dalc))) + scale_colour_hue(l=25,c=150)+
geom_smooth(method = "lm", se = FALSE)

#Convert Walc to a labelled factor.
source$Walc <- as.factor(source$Walc)      
source$Walc <- mapvalues(source$Walc, 
                              from = 1:5, 
                              to = c("Very Low", "Low", "Medium", "High", "Very High"))

str1=ggplot(source, aes(x=mathgrades, y=portgrades))+
geom_point(aes(colour=factor(Walc)))+ scale_colour_hue(l=25,c=150)+
geom_smooth(method = "lm", se = FALSE)

grid.arrange(str,str1,nrow=2)

#Very high alcohol consumption during the week does not seem to be associated with a high final grade.
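#As a rough numerical check of this observation (a quick sketch), tabulate the mean final Portuguese grade at each workday consumption level:
aggregate(G3.y ~ Dalc, data = source, FUN = mean)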
waffle.col <- c("#00d27f","#adff00","#f9d62e","#fc913a","#ff4e50")

c1 <- ggplot(source, aes(x=Dalc, y=source$G1.y, fill=Dalc))+
      geom_boxplot()+
      theme_bw()+
      theme(legend.position="none")+
      scale_fill_manual(values=waffle.col)+
      xlab("Alcohol consumption")+
      ylab("Grade")+
      ggtitle("First period grade")

c2 <- ggplot(source, aes(x=Dalc, y=source$G2.y, fill=Dalc))+
      geom_boxplot()+
      theme_bw()+
      theme(legend.position="none")+
      scale_fill_manual(values=waffle.col)+
      xlab("Alcohol consumption")+
      ylab("Grade")+
      ggtitle("Second period grade")

c3 <- ggplot(source, aes(x=Dalc, y=source$G3.y, fill=Dalc))+
      geom_boxplot()+
      theme_bw()+
      theme(legend.position="none")+
      scale_fill_manual(values=waffle.col)+
      xlab("Alcohol consumption")+
      ylab("Grade")+
      ggtitle("Final period grade")

grid.arrange(c1,c2,c3,ncol=3)

c4 <- ggplot(source, aes(x=Walc, y=source$G1.y, fill=Walc))+
      geom_boxplot()+
      theme_bw()+
      theme(legend.position="none")+
      scale_fill_manual(values=waffle.col)+
      xlab("Alcohol consumption")+
      ylab("Grade")+
      ggtitle("First period grade")

c5 <- ggplot(source, aes(x=Walc, y=source$G2.y, fill=Walc))+
      geom_boxplot()+
      theme_bw()+
      theme(legend.position="none")+
      scale_fill_manual(values=waffle.col)+
      xlab("Alcohol consumption")+
      ylab("Grade")+
      ggtitle("Second period grade")

c6 <- ggplot(source, aes(x=Walc, y=source$G3.y, fill=Walc))+
      geom_boxplot()+
      theme_bw()+
      theme(legend.position="none")+
      scale_fill_manual(values=waffle.col)+
      xlab("Alcohol consumption")+
      ylab("Grade")+
      ggtitle("Third period grade")

grid.arrange(c4,c5,c6, ncol=3)

combine<-rbind(math,port) #combine the two datasets
# and eliminate the repeats:
unique<-combine %>% distinct(school,sex,age,address,famsize,Pstatus,
                Medu,Fedu,Mjob,Fjob,reason,
                guardian,traveltime,studytime,failures,
                schoolsup, famsup,activities,nursery,higher,internet,
                romantic,famrel,freetime,goout,Dalc,Walc,health,absences, .keep_all = TRUE)
#add a column with average grades (math or Portuguese, whichever is available)
unique$avggrades=rowMeans(cbind(unique$G1,unique$G2,unique$G3))
# and drop grades in 3 marking periods.
unique<-unique[,-(31:33)]
Here is a basic boxplot of average subject grades grouped by the level of daily alcohol consumption.
As the boxplot shows, the median average grade is highest among students with very low daily alcohol consumption, while the medians for the medium, high, and very high consumption groups do not look very different from one another. Next, we run a multiple linear regression and build a regression tree of average grades on all other variables.
ggplot(unique, aes(x=Dalc, y=avggrades, group=Dalc))+
  geom_boxplot()+
  theme(legend.position="none")+
  scale_fill_manual(values=waffle.col)+
  xlab("Daily Alcohol consumption")+
  ylab("Average Grades")+
  ggtitle("Average Grade")

#Drop the "failures" column before modelling.
failureind<-which(names(unique)=="failures")
unique<-unique[,-failureind]
The adjusted R-squared of the multiple regression is 0.1685, which is low: only about 16.85% of the variation in average grades is explained by the variation in all other variables. The statistically significant predictors of average grades are famsize, studytime, schoolsup, paid, higher, romantic, goout and health.
#multiple regression 
lm2<-lm(avggrades~., data=unique[,1:30])
summary(lm2)
## 
## Call:
## lm(formula = avggrades ~ ., data = unique[, 1:30])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.8048  -1.5890   0.1455   1.8676   8.4868 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.6017499  1.8139110   5.845 7.04e-09 ***
## schoolMS         -0.3723012  0.2574810  -1.446  0.14854    
## sexM             -0.1215773  0.2255528  -0.539  0.59000    
## age              -0.1198027  0.0906465  -1.322  0.18661    
## addressU          0.2930648  0.2427090   1.207  0.22756    
## famsizeLE3        0.4975437  0.2219491   2.242  0.02522 *  
## PstatusT          0.0640413  0.3168516   0.202  0.83987    
## Medu              0.1848194  0.1396447   1.323  0.18600    
## Fedu              0.1642152  0.1234646   1.330  0.18383    
## Mjobhealth        0.7195927  0.4873500   1.477  0.14014    
## Mjobother         0.0878501  0.2874717   0.306  0.75998    
## Mjobservices      0.3595006  0.3392675   1.060  0.28959    
## Mjobteacher       0.1089511  0.4567083   0.239  0.81150    
## Fjobhealth       -0.2298997  0.6841685  -0.336  0.73693    
## Fjobother         0.0004672  0.4281211   0.001  0.99913    
## Fjobservices     -0.2993809  0.4493361  -0.666  0.50540    
## Fjobteacher       1.1650673  0.6051943   1.925  0.05452 .  
## reasonhome        0.2418873  0.2555691   0.946  0.34416    
## reasonother       0.3335054  0.3412013   0.977  0.32861    
## reasonreputation  0.4954622  0.2654763   1.866  0.06232 .  
## guardianmother   -0.1771667  0.2439794  -0.726  0.46793    
## guardianother    -0.2497580  0.4596099  -0.543  0.58698    
## traveltime       -0.1100901  0.1449817  -0.759  0.44785    
## studytime         0.5080568  0.1272501   3.993 7.06e-05 ***
## schoolsupyes     -1.5979812  0.3190187  -5.009 6.56e-07 ***
## famsupyes        -0.3691163  0.2091140  -1.765  0.07787 .  
## paidyes          -0.6938560  0.2413583  -2.875  0.00414 ** 
## activitiesyes     0.0980887  0.2022346   0.485  0.62777    
## nurseryyes        0.0309093  0.2473158   0.125  0.90057    
## higheryes         1.9236766  0.3618564   5.316 1.33e-07 ***
## internetyes       0.4269133  0.2569866   1.661  0.09701 .  
## romanticyes      -0.4996783  0.2097075  -2.383  0.01739 *  
## famrel            0.1569738  0.1056496   1.486  0.13768    
## freetime         -0.0399315  0.1029791  -0.388  0.69828    
## goout            -0.2127131  0.0995379  -2.137  0.03286 *  
## Dalc             -0.1303141  0.1389503  -0.938  0.34857    
## Walc             -0.0210543  0.1092166  -0.193  0.84718    
## health           -0.1562929  0.0709717  -2.202  0.02790 *  
## absences         -0.0181346  0.0162657  -1.115  0.26519    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.943 on 920 degrees of freedom
## Multiple R-squared:  0.2015, Adjusted R-squared:  0.1685 
## F-statistic:  6.11 on 38 and 920 DF,  p-value: < 2.2e-16
The decision tree diagram below shows that "higher", which indicates whether the student wants to pursue higher education, appears to be important. The majority of surveyed students would like to pursue higher education, and their average grade (11.4/20) is substantially higher than the average grade of those who do not (8.47/20).
The regression tree also shows that mother's education is another important feature: students whose mothers had at least secondary education had a noticeably higher average grade (12.2) than students whose mothers had no secondary education.
library(rpart)
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
## 
## Attaching package: 'DMwR'
## The following object is masked from 'package:plyr':
## 
##     join
fit <- rpart(avggrades~.,  data= unique)
#printcp(fit) # display the results 
#plotcp(fit) # visualize cross-validation results 
#summary(fit) # detailed summary of splits
prettyTree(fit, main = "Regression Tree")
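#The tree's own variable-importance scores back this reading (a quick sketch; larger values mean more influential splits):
fit$variable.importance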

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
set.seed(4543)
rf2<-randomForest(avggrades~., data=unique[,1:30], ntree=500, importance=T)
rf.predictions<-predict(rf2,unique)
nmse.rf<-mean((rf.predictions-unique[,"avggrades"])^2)/mean((mean(unique$avggrades)-unique[,"avggrades"])^2)
print(nmse.rf) #0.2
## [1] 0.2030411

The top 10 most important variables for predicting student average grades are:
1. higher - wants to take higher education
2. Medu - mother's education
3. studytime - weekly study time
4. schoolsup - extra educational support
5. Dalc - workday alcohol consumption
6. goout - going out with friends
7. Walc - weekend alcohol consumption
8. reason - reason to choose this school
9. Fedu - father's education
10. sex - student's sex

What is interesting about these results is that some features conventionally thought of as important, such as Pstatus, famsup, famrel, and absences, did not make the top ten, while variables such as higher and Medu turned out to be very important.
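This ranking can be read directly off the fitted forest (a minimal sketch using the rf2 object above; type = 1 selects the %IncMSE measure):

imp <- importance(rf2, type = 1)
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE], 10)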

#Export the unique table; the cleaned version is saved as unique_clean.
#write.table(unique, "~/Desktop/6101/final/unique.txt", sep="\t")
library(leaps)
library(ISLR)
#This is essentially best subset selection.
unique_clean <- read.csv("unique_clean.csv")
#View(unique_clean)
reg.best1 <- regsubsets(avggrades~school+sex+age+address+famsize+Pstatus+Medu+Fedu+Mjob+Fjob, data = unique_clean, nvmax = 19)
#The plot shows the adjusted R^2 of the best models built from the first ten variables (listed across the bottom).
plot(reg.best1, scale = "adjr2", main = "First 10 Adjusted R^2")

#for the middle ten variables
reg.best2 <- regsubsets(avggrades~reason+guardian+traveltime+studytime+schoolsup+famsup+paid+activities+nursery+higher, data = unique_clean, nvmax = 19)
plot(reg.best2, scale = "adjr2", main = "Mid 10 Adjusted R^2")

#for the remaining variables
reg.best3 <- regsubsets(avggrades~internet+romantic+famrel+freetime+goout+Dalc+Walc+health+absences, data = unique_clean, nvmax = 19)
plot(reg.best3, scale = "adjr2", main = "Last 10 Adjusted R^2")

#for all variables
reg.best <- regsubsets(avggrades~., data = unique_clean, nvmax = 19)
plot(reg.best, scale = "adjr2", main = "All variable Adjusted R^2")
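#To see which subset wins on adjusted R^2 (a sketch using the reg.best object above):
best.summary <- summary(reg.best)
best.size <- which.max(best.summary$adjr2)
names(which(best.summary$which[best.size, ]))  #variables included in the best model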

#Some variables do not appear in the best models selected here, which means they contribute little to adjusted R^2.
varImpPlot(rf2,type=1) # this metric is obtained by measuring the %increase in MSE of the model if the variable is removed

In the chunk below, we look at the rules for deciding whether a student scores high or low on average grades. Here, we define avggrades >= 18 as "high".
We use set.seed() to make the sampling reproducible and fit a logistic regression model to predict whether a student scores high or low. Around 98.3% of observations are classified accurately.
According to the model summary, the statistically significant predictors include studytime, Fjob, activities, famsup, and romantic. Remember that the coefficients are on the log-odds scale, so we exponentiate them below to interpret effects.
The AUC comes to around 0.9786, which is not bad. Keep in mind that we are working with a fixed training and test split here; in a real-world scenario, the model would face data it has never seen.
#Logistic Regression
#In our dataset, only schoolsup, famsup, higher, paid, activities, nursery, internet and romantic are binary (yes/no) factors; the other attributes are numeric or multi-level. We can still build a logistic regression model by defining a binary response (high vs. low average grade) and using all attributes as predictors.
set.seed(2)
unique$high <- ifelse(unique$avggrades>=18,1,0)
unique$high <- as.factor(unique$high)
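#Quick check of the class balance of the new response before fitting:
table(unique$high)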
unique <- unique %>% mutate(schoolsup=as.factor(schoolsup),famsup=as.factor(famsup),paid=as.factor(paid),activities=as.factor(activities),nursery=as.factor(nursery),higher=as.factor(higher),internet=as.factor(internet),romantic=as.factor(romantic))
train <- sample(dim(unique)[1],dim(unique)[1]*0.9)
test <- train
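#Note: the test indices are identical to the training indices here, so the accuracy reported below is measured on data the model was fit on.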
training_data <- unique[train,]
testing_data <- unique[test,]
log_model <- glm(high~higher+Medu+schoolsup+studytime+Dalc+Fedu+reason+goout+sex+Walc+health+age+address+Mjob+school+famrel+Fjob+freetime+activities+guardian+traveltime+famsup+nursery+romantic+internet+Pstatus+famsize+absences+paid,data=training_data,family = binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#summary the log_model
summary(log_model)
## 
## Call:
## glm(formula = high ~ higher + Medu + schoolsup + studytime + 
##     Dalc + Fedu + reason + goout + sex + Walc + health + age + 
##     address + Mjob + school + famrel + Fjob + freetime + activities + 
##     guardian + traveltime + famsup + nursery + romantic + internet + 
##     Pstatus + famsize + absences + paid, family = binomial, data = training_data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.37905  -0.02845  -0.00340  -0.00001   3.00421  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)   
## (Intercept)      -2.997e+01  3.881e+03  -0.008  0.99384   
## higheryes         1.685e+01  3.881e+03   0.004  0.99654   
## Medu              7.580e-01  7.704e-01   0.984  0.32516   
## schoolsupyes     -2.025e+01  3.241e+03  -0.006  0.99501   
## studytime         1.575e+00  5.835e-01   2.699  0.00696 **
## Dalc              1.505e-01  8.266e-01   0.182  0.85550   
## Fedu              3.184e-01  5.984e-01   0.532  0.59462   
## reasonhome       -3.465e+00  2.205e+00  -1.572  0.11603   
## reasonother       4.792e-01  1.581e+00   0.303  0.76175   
## reasonreputation  4.890e-01  1.350e+00   0.362  0.71719   
## goout             8.009e-01  5.724e-01   1.399  0.16173   
## sexM              1.642e+00  1.200e+00   1.368  0.17132   
## Walc             -1.039e+00  7.113e-01  -1.461  0.14414   
## health           -7.657e-03  3.847e-01  -0.020  0.98412   
## age               2.889e-01  4.779e-01   0.605  0.54544   
## addressU         -5.527e-01  1.090e+00  -0.507  0.61221   
## Mjobhealth        4.057e-01  2.445e+00   0.166  0.86822   
## Mjobother        -3.111e+00  2.254e+00  -1.380  0.16747   
## Mjobservices     -1.299e+00  1.615e+00  -0.804  0.42115   
## Mjobteacher      -1.912e-01  2.254e+00  -0.085  0.93238   
## schoolMS         -3.595e-01  1.420e+00  -0.253  0.80012   
## famrel           -6.665e-01  7.066e-01  -0.943  0.34554   
## Fjobhealth       -2.097e+01  7.343e+03  -0.003  0.99772   
## Fjobother        -3.047e+00  1.811e+00  -1.682  0.09251 . 
## Fjobservices     -4.643e+00  2.314e+00  -2.007  0.04478 * 
## Fjobteacher       1.609e+00  2.092e+00   0.769  0.44182   
## freetime         -6.189e-01  4.963e-01  -1.247  0.21236   
## activitiesyes    -3.158e+00  1.305e+00  -2.420  0.01554 * 
## guardianmother   -5.695e-02  1.341e+00  -0.042  0.96613   
## guardianother    -1.670e+01  3.824e+03  -0.004  0.99651   
## traveltime        8.136e-01  8.132e-01   1.000  0.31710   
## famsupyes        -2.315e+00  1.120e+00  -2.067  0.03877 * 
## nurseryyes        1.071e+00  1.780e+00   0.602  0.54723   
## romanticyes      -4.234e+00  1.863e+00  -2.273  0.02302 * 
## internetyes       1.612e+00  1.862e+00   0.866  0.38667   
## PstatusT          2.683e+00  2.104e+00   1.275  0.20214   
## famsizeLE3        1.403e+00  1.102e+00   1.273  0.20292   
## absences          1.563e-01  8.430e-02   1.854  0.06379 . 
## paidyes          -1.502e-01  1.038e+00  -0.145  0.88493   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 143.170  on 862  degrees of freedom
## Residual deviance:  68.232  on 824  degrees of freedom
## AIC: 146.23
## 
## Number of Fisher Scoring iterations: 21
#Because the coefficients are on the log-odds scale, we exponentiate them to obtain odds ratios.
exp(coef(log_model))
##      (Intercept)        higheryes             Medu     schoolsupyes 
##     9.674829e-14     2.083857e+07     2.134004e+00     1.598763e-09 
##        studytime             Dalc             Fedu       reasonhome 
##     4.830317e+00     1.162441e+00     1.374950e+00     3.126551e-02 
##      reasonother reasonreputation            goout             sexM 
##     1.614809e+00     1.630721e+00     2.227614e+00     5.166697e+00 
##             Walc           health              age         addressU 
##     3.538415e-01     9.923725e-01     1.334974e+00     5.753835e-01 
##       Mjobhealth        Mjobother     Mjobservices      Mjobteacher 
##     1.500386e+00     4.456790e-02     2.727153e-01     8.259339e-01 
##         schoolMS           famrel       Fjobhealth        Fjobother 
##     6.979957e-01     5.135016e-01     7.789440e-10     4.749968e-02 
##     Fjobservices      Fjobteacher         freetime    activitiesyes 
##     9.629954e-03     4.998158e+00     5.385198e-01     4.249956e-02 
##   guardianmother    guardianother       traveltime        famsupyes 
##     9.446464e-01     5.578303e-08     2.255995e+00     9.877154e-02 
##       nurseryyes      romanticyes      internetyes         PstatusT 
##     2.919615e+00     1.449046e-02     5.012144e+00     1.463460e+01 
##       famsizeLE3         absences          paidyes 
##     4.068146e+00     1.169128e+00     8.605561e-01
library(ResourceSelection)
## ResourceSelection 0.3-2   2017-02-28
#Test our model's goodness of fit
hoslem.test(training_data$high, fitted(log_model))
## Warning in Ops.factor(1, y): '-' not meaningful for factors
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  training_data$high, fitted(log_model)
## X-squared = 863, df = 8, p-value < 2.2e-16
#We can also use a ROC curve to gauge how well we are classifying the data, in combination with the predict function.
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
#First we use the model to predict, for each student, the probability of scoring high.
prob=predict(log_model, type = c("response"))
#prob
#Then we can plot the hit rate (ROC curve).
h <- roc(high~prob, data=training_data)
plot(h, main='Rate Hit',col="blue",lwd=3)

predict_log <- predict(log_model,newdata=testing_data,type="response")
predict_log <- round(predict_log)
mean(predict_log==testing_data$high)
## [1] 0.9837775
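#A cross-tabulation of predicted vs. observed classes (a quick sketch) gives more detail than the single accuracy figure:
table(predicted = predict_log, observed = testing_data$high)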
#ROC and AUC
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
#In order to use the package we first have to build a prediction object.
newpred <- prediction(prob,training_data$high)
#Next we measure the true positive rate ("tpr") and the false positive rate ("fpr").
newpred.performance <- performance(newpred, measure = "tpr",x.measure = "fpr")
#then we plot these two measures
plot(newpred.performance,main="ROC Curve for Logistic Regression Model", lwd=2,colorize=TRUE)
abline(a=0,b=1,lwd=2,lty=2)

#Looking pretty good; we can also get the AUC, again using the performance function.
AUC <- performance(newpred, measure = "auc")
AUC = unlist(AUC@y.values)
AUC
## [1] 0.9786303

Conclusion: Our project explores the degree of statistical association between alcohol consumption and grades. The purpose of this notebook is to understand whether alcohol consumption, along with other factors, drives attainment in general; thus our target variable is the average grade rather than the grade in any single marking period. The two strongest predictors of student average grade turn out to be the willingness to pursue higher education (higher) and mother's education (Medu). However, fully disentangling the direction of causality between these features and grades would require different methods and perhaps much more extensive data.