library("ggplot2")
library("gplots")
library("glmnet")
library("MASS")
library("tidyverse")
library("dplyr")
library("reshape")
library("ggpubr")
library("ggplot2")
library("glmnet")
library("reshape2")
library("heatmaply")
library("dummies")
library("dplyr")
library("tidyr")
library("caTools")
library("caret")
library("ROCR")
library("ggpubr")
library("glmnetUtils")
library("GGally")
library("glmnet")
library("dplyr")
library("ggplot2")
library("tidyr")
library("lars")
library("leaps")
library("gbm")
library("rpart")
library("corrplot")
library("Metrics")
library("rpart.plot")
library("randomForest")
pacman::p_load(tidyverse)
pacman::p_load(caret)
pacman::p_load(rpart)
pacman::p_load(rpart.plot)
pacman::p_load(corrplot)
pacman::p_load(Metrics)
Reading Data
# Remove every object from the global environment before the analysis starts.
# NOTE(review): this also deletes anything the caller already had loaded.
rm(list=ls())
# Switch to the project folder; the read.csv() calls that follow use bare
# file names and therefore depend on this working directory.
# NOTE(review): absolute, user-specific path -- will fail on other machines.
setwd("/Users/kayhanbabakan/Dropbox/15071 Analytics Edge Team/Team Project")
The working directory was changed to /Users/kayhanbabakan/Dropbox/15071 Analytics Edge Team/Team Project inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
# Load the three raw CSV files from the current working directory.
# The first two files are semicolon-delimited, the third is comma-delimited.
students_math <- read.csv(file = "student-mat.csv", sep = ";")
students_por <- read.csv(file = "student-por.csv", sep = ";")
students_both <- read.csv(file = "studentsinboth.csv", sep = ",")
Pre-Processing
# Convert the categorical columns of the math data set to factors so that the
# modelling functions downstream treat them as categories.
# Each column is listed exactly once; a missing column raises an error here.
categorical_cols <- c(
  "school", "sex", "address", "famsize", "Pstatus", "Mjob", "Fjob",
  "reason", "guardian", "schoolsup", "famsup", "paid", "activities",
  "nursery", "higher", "internet", "romantic"
)
students_math[categorical_cols] <- lapply(
  students_math[categorical_cols],
  as.factor
)
Collinearity testing
# Modelling data set: drop G1 and G2 so that G3 is the only grade column left
# (presumably G1/G2 are earlier-period grades -- confirm against the data
# dictionary). select() is namespace-qualified because this file also attaches
# MASS, which exports a different select().
students_mathonly <- dplyr::select(students_math, -c(G1, G2))
students_mathonly <- na.omit(students_mathonly)  # drop every row containing an NA

# Expand factors into dummy columns so every predictor is numeric, then draw
# the pairwise correlation plot.
students_mathonly2 <- model.matrix(~ ., data = students_mathonly)
corplot <- ggcorr(students_mathonly2, size = 2) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Bar charts relating weekend vs. workday alcohol use (Walc, Dalc) and
# mother's vs. father's education (Medu, Fedu).
WalcxDalc <- ggplot(students_mathonly) +
  geom_bar(aes(Walc, Dalc), stat = "identity")
MeduxFedu <- ggplot(students_mathonly) +
  geom_bar(aes(as.factor(Medu), fill = as.factor(Fedu)), stat = "count")
Initial indications
# Exploratory plots: share of students with G3 <= 15 for selected predictors.
# Every "fill" panel uses the same cutoff because ggarrange() below draws ONE
# common legend for all panels; a different cutoff in any panel would be
# mislabelled by that shared legend.
FailuresxG3 <- ggplot(students_mathonly, aes(x = failures, fill = G3 <= 15)) +
  geom_histogram(binwidth = 1, position = "fill")
FamrelxG3 <- ggplot(students_mathonly, aes(x = famrel, fill = G3 <= 15)) +
  geom_histogram(binwidth = 1, position = "fill")
GooutxG3 <- ggplot(students_mathonly, aes(x = goout, fill = G3 <= 15)) +
  geom_histogram(binwidth = 1, position = "fill")
StudytimexG3 <- ggplot(students_mathonly, aes(x = studytime, fill = G3 <= 15)) +
  geom_histogram(binwidth = 1, position = "fill")
# romantic is a factor, so a bar chart is used instead of a histogram.
RomanticxG3 <- ggplot(students_mathonly, aes(x = romantic, fill = G3 <= 15)) +
  geom_bar(position = "fill")

# Jittered scatter of G3 against absences; the layer inherits the plot data.
AbsencesxG3 <- ggplot(students_mathonly, aes(absences, G3)) +
  geom_jitter(color = "blue") +
  xlab("absences") +
  ylab("3Q Grades") +
  theme(axis.line = element_line(color = "black")) +
  border(color = "black", size = 1, linetype = 1)

# Arrange all six panels in one figure with a single legend on top.
ggarrange(
  FailuresxG3, FamrelxG3, GooutxG3, StudytimexG3, RomanticxG3, AbsencesxG3,
  legend = "top", common.legend = TRUE
)
Linear Modeling
# Seeded 65/35 train/test partition on G3.
set.seed(1)
split <- createDataPartition(y = students_mathonly$G3, p = 0.65, list = FALSE)
math.train <- students_mathonly[split, ]
math.test <- students_mathonly[-split, ]

# Gaussian GLM on every predictor, then variable selection with step() using
# its default settings (the elimination trace is printed to the console).
math.lm <- glm(formula = G3 ~ ., family = "gaussian", data = math.train)
step.math.lm <- step(object = math.lm)
Start: AIC=1518.53
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + Fjob + reason + guardian + traveltime + studytime +
failures + schoolsup + famsup + paid + activities + nursery +
higher + internet + romantic + famrel + freetime + goout +
Dalc + Walc + health + absences
Df Deviance AIC
- Fjob 4 3925.4 1513.1
- reason 3 3910.0 1514.1
- guardian 2 3895.9 1515.1
- nursery 1 3887.0 1516.5
- internet 1 3887.0 1516.5
- Walc 1 3887.6 1516.6
- Dalc 1 3887.9 1516.6
- school 1 3892.1 1516.9
- address 1 3892.7 1516.9
- traveltime 1 3893.0 1516.9
- Fedu 1 3893.8 1517.0
- activities 1 3894.1 1517.0
- freetime 1 3897.1 1517.2
- famrel 1 3897.5 1517.2
- Pstatus 1 3897.9 1517.2
- paid 1 3904.8 1517.7
- goout 1 3911.7 1518.2
<none> 3886.9 1518.5
- famsup 1 3918.9 1518.7
- age 1 3924.3 1519.0
- Medu 1 3925.0 1519.0
- higher 1 3925.3 1519.1
- health 1 3932.5 1519.5
- studytime 1 3944.5 1520.3
- romantic 1 3949.1 1520.6
- schoolsup 1 3952.6 1520.9
- famsize 1 3957.0 1521.2
- absences 1 3964.7 1521.7
- sex 1 3992.6 1523.5
- Mjob 4 4101.6 1524.5
- failures 1 4078.2 1529.0
Step: AIC=1513.08
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + reason + guardian + traveltime + studytime +
failures + schoolsup + famsup + paid + activities + nursery +
higher + internet + romantic + famrel + freetime + goout +
Dalc + Walc + health + absences
Df Deviance AIC
- reason 3 3956.5 1509.1
- guardian 2 3935.2 1509.7
- Dalc 1 3925.4 1511.1
- nursery 1 3925.4 1511.1
- Walc 1 3925.8 1511.1
- internet 1 3926.3 1511.1
- school 1 3929.0 1511.3
- traveltime 1 3929.7 1511.4
- famrel 1 3930.8 1511.4
- address 1 3931.3 1511.5
- Pstatus 1 3935.4 1511.7
- freetime 1 3936.3 1511.8
- activities 1 3938.0 1511.9
- goout 1 3944.3 1512.3
- paid 1 3944.9 1512.4
- Fedu 1 3946.6 1512.5
<none> 3925.4 1513.1
- age 1 3958.2 1513.2
- famsup 1 3959.7 1513.3
- Medu 1 3964.3 1513.6
- health 1 3965.2 1513.7
- higher 1 3966.8 1513.8
- studytime 1 3979.6 1514.6
- romantic 1 3983.9 1514.9
- famsize 1 3990.7 1515.3
- schoolsup 1 3993.0 1515.5
- absences 1 4005.4 1516.3
- Mjob 4 4132.8 1518.4
- sex 1 4039.9 1518.5
- failures 1 4114.0 1523.2
Step: AIC=1509.12
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + guardian + traveltime + studytime + failures +
schoolsup + famsup + paid + activities + nursery + higher +
internet + romantic + famrel + freetime + goout + Dalc +
Walc + health + absences
Df Deviance AIC
- guardian 2 3966.7 1505.8
- Dalc 1 3956.7 1507.1
- nursery 1 3956.8 1507.1
- Walc 1 3956.9 1507.2
- internet 1 3958.0 1507.2
- address 1 3959.5 1507.3
- school 1 3960.0 1507.3
- traveltime 1 3961.6 1507.5
- famrel 1 3962.7 1507.5
- freetime 1 3966.2 1507.8
- Pstatus 1 3966.3 1507.8
- activities 1 3966.6 1507.8
- Fedu 1 3974.7 1508.3
- paid 1 3976.3 1508.4
- goout 1 3980.5 1508.7
<none> 3956.5 1509.1
- famsup 1 3990.8 1509.3
- age 1 3993.3 1509.5
- higher 1 3993.9 1509.5
- Medu 1 4001.6 1510.0
- health 1 4007.3 1510.4
- romantic 1 4015.6 1511.0
- studytime 1 4021.2 1511.3
- famsize 1 4023.7 1511.5
- schoolsup 1 4026.8 1511.7
- absences 1 4041.4 1512.6
- sex 1 4063.7 1514.0
- Mjob 4 4185.2 1515.7
- failures 1 4161.0 1520.2
Step: AIC=1505.79
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + nursery + higher + internet +
romantic + famrel + freetime + goout + Dalc + Walc + health +
absences
Df Deviance AIC
- nursery 1 3966.8 1503.8
- Dalc 1 3967.1 1503.8
- Walc 1 3967.2 1503.8
- internet 1 3967.8 1503.9
- traveltime 1 3969.8 1504.0
- school 1 3970.0 1504.0
- address 1 3972.4 1504.2
- famrel 1 3973.0 1504.2
- Pstatus 1 3977.3 1504.5
- activities 1 3977.4 1504.5
- freetime 1 3978.0 1504.5
- paid 1 3986.6 1505.1
- Fedu 1 3993.7 1505.5
<none> 3966.7 1505.8
- goout 1 3997.7 1505.8
- famsup 1 3999.7 1505.9
- age 1 4002.1 1506.1
- Medu 1 4005.9 1506.3
- higher 1 4007.1 1506.4
- health 1 4016.3 1507.0
- romantic 1 4026.1 1507.6
- famsize 1 4034.9 1508.2
- schoolsup 1 4036.6 1508.3
- studytime 1 4037.6 1508.4
- absences 1 4052.4 1509.3
- sex 1 4079.9 1511.1
- Mjob 4 4199.9 1512.6
- failures 1 4176.3 1517.1
Step: AIC=1503.79
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + higher + internet + romantic +
famrel + freetime + goout + Dalc + Walc + health + absences
Df Deviance AIC
- Dalc 1 3967.1 1501.8
- Walc 1 3967.2 1501.8
- internet 1 3967.9 1501.9
- traveltime 1 3969.9 1502.0
- school 1 3970.0 1502.0
- address 1 3972.4 1502.2
- famrel 1 3973.0 1502.2
- Pstatus 1 3977.3 1502.5
- activities 1 3977.5 1502.5
- freetime 1 3978.1 1502.5
- paid 1 3986.8 1503.1
- Fedu 1 3993.9 1503.5
<none> 3966.8 1503.8
- goout 1 3997.7 1503.8
- famsup 1 3999.7 1503.9
- age 1 4002.2 1504.1
- Medu 1 4005.9 1504.3
- higher 1 4007.1 1504.4
- health 1 4016.3 1505.0
- romantic 1 4026.1 1505.6
- schoolsup 1 4036.6 1506.3
- famsize 1 4036.8 1506.3
- studytime 1 4038.3 1506.4
- absences 1 4052.4 1507.3
- sex 1 4079.9 1509.1
- Mjob 4 4200.0 1510.6
- failures 1 4177.8 1515.2
Step: AIC=1501.81
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + higher + internet + romantic +
famrel + freetime + goout + Walc + health + absences
Df Deviance AIC
- Walc 1 3967.3 1499.8
- internet 1 3968.2 1499.9
- traveltime 1 3970.1 1500.0
- school 1 3970.3 1500.0
- address 1 3972.8 1500.2
- famrel 1 3973.2 1500.2
- Pstatus 1 3977.5 1500.5
- activities 1 3978.0 1500.5
- freetime 1 3979.5 1500.6
- paid 1 3987.5 1501.1
- Fedu 1 3994.2 1501.6
<none> 3967.1 1501.8
- goout 1 3998.2 1501.8
- famsup 1 3999.8 1501.9
- age 1 4002.2 1502.1
- Medu 1 4006.4 1502.4
- higher 1 4007.9 1502.5
- health 1 4016.4 1503.0
- romantic 1 4026.1 1503.6
- schoolsup 1 4036.9 1504.3
- famsize 1 4037.7 1504.4
- studytime 1 4039.0 1504.5
- absences 1 4053.3 1505.4
- sex 1 4083.9 1507.3
- Mjob 4 4200.1 1508.6
- failures 1 4177.8 1513.2
Step: AIC=1499.82
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + higher + internet + romantic +
famrel + freetime + goout + health + absences
Df Deviance AIC
- internet 1 3968.4 1497.9
- traveltime 1 3970.5 1498.0
- school 1 3970.5 1498.0
- address 1 3973.2 1498.2
- famrel 1 3973.7 1498.2
- Pstatus 1 3977.7 1498.5
- activities 1 3978.1 1498.5
- freetime 1 3979.8 1498.6
- paid 1 3987.5 1499.1
- Fedu 1 3994.2 1499.6
<none> 3967.3 1499.8
- famsup 1 3999.8 1499.9
- age 1 4002.2 1500.1
- Medu 1 4007.2 1500.4
- goout 1 4008.0 1500.5
- higher 1 4008.0 1500.5
- health 1 4016.8 1501.0
- romantic 1 4026.2 1501.6
- schoolsup 1 4036.9 1502.3
- famsize 1 4038.0 1502.4
- studytime 1 4042.1 1502.7
- absences 1 4055.1 1503.5
- sex 1 4090.5 1505.8
- Mjob 4 4200.6 1506.6
- failures 1 4179.5 1511.3
Step: AIC=1497.89
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + traveltime + studytime + failures + schoolsup +
famsup + paid + activities + higher + romantic + famrel +
freetime + goout + health + absences
Df Deviance AIC
- traveltime 1 3971.9 1496.1
- school 1 3972.0 1496.1
- address 1 3973.6 1496.2
- famrel 1 3974.6 1496.3
- Pstatus 1 3978.1 1496.5
- activities 1 3978.9 1496.6
- freetime 1 3980.9 1496.7
- paid 1 3987.9 1497.2
- Fedu 1 3994.4 1497.6
<none> 3968.4 1497.9
- famsup 1 4001.8 1498.1
- age 1 4003.1 1498.2
- Medu 1 4008.0 1498.5
- goout 1 4009.6 1498.6
- higher 1 4009.9 1498.6
- health 1 4016.8 1499.0
- romantic 1 4029.8 1499.9
- schoolsup 1 4038.4 1500.4
- famsize 1 4039.2 1500.5
- studytime 1 4042.6 1500.7
- absences 1 4055.1 1501.5
- sex 1 4090.6 1503.8
- Mjob 4 4202.4 1504.7
- failures 1 4179.6 1509.3
Step: AIC=1496.12
G3 ~ school + sex + age + address + famsize + Pstatus + Medu +
Fedu + Mjob + studytime + failures + schoolsup + famsup +
paid + activities + higher + romantic + famrel + freetime +
goout + health + absences
Df Deviance AIC
- school 1 3974.1 1494.3
- famrel 1 3977.9 1494.5
- address 1 3980.6 1494.7
- Pstatus 1 3981.3 1494.7
- activities 1 3984.0 1494.9
- freetime 1 3984.8 1495.0
- paid 1 3992.9 1495.5
- Fedu 1 4002.2 1496.1
<none> 3971.9 1496.1
- age 1 4005.1 1496.3
- Medu 1 4010.6 1496.6
- famsup 1 4011.4 1496.7
- higher 1 4014.6 1496.9
- goout 1 4015.0 1496.9
- health 1 4018.7 1497.2
- romantic 1 4036.0 1498.3
- famsize 1 4040.1 1498.5
- schoolsup 1 4044.4 1498.8
- studytime 1 4049.5 1499.1
- absences 1 4058.2 1499.7
- sex 1 4091.3 1501.8
- Mjob 4 4204.6 1502.9
- failures 1 4182.0 1507.5
Step: AIC=1494.27
G3 ~ sex + age + address + famsize + Pstatus + Medu + Fedu +
Mjob + studytime + failures + schoolsup + famsup + paid +
activities + higher + romantic + famrel + freetime + goout +
health + absences
Df Deviance AIC
- famrel 1 3979.7 1492.6
- address 1 3981.3 1492.7
- Pstatus 1 3983.7 1492.9
- freetime 1 3987.6 1493.1
- activities 1 3987.8 1493.2
- paid 1 3996.2 1493.7
- Fedu 1 4004.4 1494.2
<none> 3974.1 1494.3
- age 1 4006.0 1494.3
- Medu 1 4012.7 1494.8
- famsup 1 4016.8 1495.0
- higher 1 4020.2 1495.2
- goout 1 4020.2 1495.3
- health 1 4020.4 1495.3
- romantic 1 4037.3 1496.3
- famsize 1 4043.3 1496.7
- schoolsup 1 4046.3 1496.9
- studytime 1 4050.1 1497.2
- absences 1 4058.6 1497.7
- sex 1 4092.9 1499.9
- Mjob 4 4206.7 1501.0
- failures 1 4187.0 1505.8
Step: AIC=1492.63
G3 ~ sex + age + address + famsize + Pstatus + Medu + Fedu +
Mjob + studytime + failures + schoolsup + famsup + paid +
activities + higher + romantic + freetime + goout + health +
absences
Df Deviance AIC
- address 1 3987.4 1491.1
- Pstatus 1 3991.1 1491.4
- activities 1 3993.1 1491.5
- freetime 1 3996.2 1491.7
- paid 1 4001.8 1492.1
- age 1 4008.5 1492.5
- Fedu 1 4009.8 1492.6
<none> 3979.7 1492.6
- Medu 1 4018.0 1493.1
- famsup 1 4022.4 1493.4
- health 1 4024.3 1493.5
- goout 1 4025.1 1493.6
- higher 1 4026.6 1493.7
- famsize 1 4046.8 1495.0
- romantic 1 4047.4 1495.0
- schoolsup 1 4050.8 1495.2
- studytime 1 4057.9 1495.7
- absences 1 4062.9 1496.0
- sex 1 4098.2 1498.2
- Mjob 4 4214.3 1499.5
- failures 1 4201.7 1504.7
Step: AIC=1491.14
G3 ~ sex + age + famsize + Pstatus + Medu + Fedu + Mjob + studytime +
failures + schoolsup + famsup + paid + activities + higher +
romantic + freetime + goout + health + absences
Df Deviance AIC
- Pstatus 1 3998.7 1489.9
- activities 1 4003.1 1490.2
- freetime 1 4005.7 1490.3
- paid 1 4009.1 1490.5
- Fedu 1 4017.4 1491.1
<none> 3987.4 1491.1
- age 1 4020.1 1491.2
- Medu 1 4027.1 1491.7
- goout 1 4030.9 1491.9
- health 1 4031.9 1492.0
- famsup 1 4033.4 1492.1
- higher 1 4036.6 1492.3
- romantic 1 4051.9 1493.3
- famsize 1 4052.6 1493.3
- schoolsup 1 4057.5 1493.7
- studytime 1 4065.2 1494.1
- absences 1 4067.4 1494.3
- sex 1 4101.8 1496.5
- Mjob 4 4223.6 1498.0
- failures 1 4209.7 1503.2
Step: AIC=1489.87
G3 ~ sex + age + famsize + Medu + Fedu + Mjob + studytime + failures +
schoolsup + famsup + paid + activities + higher + romantic +
freetime + goout + health + absences
Df Deviance AIC
- activities 1 4011.4 1488.7
- freetime 1 4016.8 1489.0
- paid 1 4023.3 1489.5
- Fedu 1 4027.3 1489.7
- age 1 4029.5 1489.8
<none> 3998.7 1489.9
- Medu 1 4033.4 1490.1
- goout 1 4042.8 1490.7
- famsup 1 4043.7 1490.8
- health 1 4045.1 1490.9
- higher 1 4045.5 1490.9
- famsize 1 4059.8 1491.8
- romantic 1 4063.0 1492.0
- schoolsup 1 4070.6 1492.5
- absences 1 4071.9 1492.6
- studytime 1 4076.6 1492.9
- sex 1 4115.4 1495.3
- Mjob 4 4231.4 1496.5
- failures 1 4230.2 1502.4
Step: AIC=1488.69
G3 ~ sex + age + famsize + Medu + Fedu + Mjob + studytime + failures +
schoolsup + famsup + paid + higher + romantic + freetime +
goout + health + absences
Df Deviance AIC
- freetime 1 4027.6 1487.7
- Fedu 1 4036.4 1488.3
- age 1 4037.5 1488.4
- paid 1 4037.9 1488.4
<none> 4011.4 1488.7
- Medu 1 4046.8 1489.0
- higher 1 4053.2 1489.4
- famsup 1 4055.5 1489.5
- goout 1 4058.3 1489.7
- health 1 4058.9 1489.7
- famsize 1 4073.6 1490.7
- romantic 1 4081.0 1491.1
- absences 1 4082.5 1491.2
- schoolsup 1 4082.9 1491.3
- studytime 1 4083.2 1491.3
- sex 1 4118.0 1493.5
- Mjob 4 4237.3 1494.9
- failures 1 4245.2 1501.4
Step: AIC=1487.73
G3 ~ sex + age + famsize + Medu + Fedu + Mjob + studytime + failures +
schoolsup + famsup + paid + higher + romantic + goout + health +
absences
Df Deviance AIC
- Fedu 1 4050.0 1487.2
- paid 1 4052.0 1487.3
- age 1 4054.0 1487.4
<none> 4027.6 1487.7
- goout 1 4063.2 1488.0
- Medu 1 4065.1 1488.1
- famsup 1 4068.0 1488.3
- higher 1 4068.5 1488.3
- health 1 4074.9 1488.8
- famsize 1 4089.7 1489.7
- absences 1 4092.3 1489.9
- studytime 1 4093.6 1489.9
- romantic 1 4097.1 1490.2
- schoolsup 1 4099.7 1490.3
- sex 1 4150.5 1493.5
- Mjob 4 4257.5 1494.1
- failures 1 4262.5 1500.4
Step: AIC=1487.17
G3 ~ sex + age + famsize + Medu + Mjob + studytime + failures +
schoolsup + famsup + paid + higher + romantic + goout + health +
absences
Df Deviance AIC
- paid 1 4071.2 1486.5
- age 1 4076.2 1486.8
<none> 4050.0 1487.2
- famsup 1 4085.3 1487.4
- goout 1 4087.2 1487.5
- health 1 4093.1 1487.9
- higher 1 4097.1 1488.2
- famsize 1 4109.0 1488.9
- studytime 1 4109.2 1488.9
- absences 1 4112.6 1489.1
- schoolsup 1 4118.8 1489.5
- romantic 1 4119.3 1489.6
- Medu 1 4160.0 1492.1
- sex 1 4173.6 1493.0
- Mjob 4 4285.8 1493.8
- failures 1 4311.9 1501.4
Step: AIC=1486.52
G3 ~ sex + age + famsize + Medu + Mjob + studytime + failures +
schoolsup + famsup + higher + romantic + goout + health +
absences
Df Deviance AIC
- famsup 1 4094.6 1486.0
- age 1 4096.7 1486.1
<none> 4071.2 1486.5
- goout 1 4107.2 1486.8
- health 1 4118.8 1487.5
- higher 1 4126.2 1488.0
- famsize 1 4129.2 1488.2
- absences 1 4130.3 1488.2
- studytime 1 4136.1 1488.6
- schoolsup 1 4136.3 1488.6
- romantic 1 4138.3 1488.8
- Medu 1 4182.0 1491.5
- sex 1 4189.9 1492.0
- Mjob 4 4299.9 1492.7
- failures 1 4347.4 1501.5
Step: AIC=1486
G3 ~ sex + age + famsize + Medu + Mjob + studytime + failures +
schoolsup + higher + romantic + goout + health + absences
Df Deviance AIC
- age 1 4114.5 1485.3
<none> 4094.6 1486.0
- goout 1 4135.3 1486.6
- higher 1 4145.4 1487.2
- health 1 4147.2 1487.3
- absences 1 4151.1 1487.5
- studytime 1 4156.5 1487.9
- schoolsup 1 4158.9 1488.0
- romantic 1 4166.2 1488.5
- famsize 1 4170.8 1488.8
- Medu 1 4199.6 1490.6
- Mjob 4 4322.9 1492.1
- sex 1 4241.6 1493.1
- failures 1 4378.1 1501.3
Step: AIC=1485.26
G3 ~ sex + famsize + Medu + Mjob + studytime + failures + schoolsup +
higher + romantic + goout + health + absences
Df Deviance AIC
<none> 4114.5 1485.3
- absences 1 4162.9 1486.3
- health 1 4164.0 1486.4
- schoolsup 1 4164.2 1486.4
- goout 1 4165.8 1486.5
- higher 1 4172.9 1486.9
- studytime 1 4173.0 1486.9
- famsize 1 4187.0 1487.8
- romantic 1 4203.3 1488.8
- Medu 1 4229.5 1490.4
- Mjob 4 4343.3 1491.3
- sex 1 4268.9 1492.8
- failures 1 4447.4 1503.4
Stepwise variable reduction: linear model output based on the standard threshold
# Print the coefficient table and fit statistics of the model returned by step().
summary(step.math.lm)
Call:
glm(formula = G3 ~ sex + famsize + Medu + Mjob + studytime +
failures + schoolsup + higher + romantic + goout + health +
absences, family = "gaussian", data = math.train)
Deviance Residuals:
Min 1Q Median 3Q Max
-11.7901 -1.8104 0.3114 2.9874 7.7140
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.59026 1.77241 4.282 2.66e-05 ***
sexM 1.73983 0.57600 3.021 0.00279 **
famsizeLE3 1.19372 0.57669 2.070 0.03952 *
Medu 0.84212 0.32314 2.606 0.00973 **
Mjobhealth 0.37399 1.22879 0.304 0.76112
Mjobother -1.26355 0.80875 -1.562 0.11951
Mjobservices 0.50424 0.91135 0.553 0.58057
Mjobteacher -2.27170 1.23298 -1.842 0.06663 .
studytime 0.60745 0.32659 1.860 0.06410 .
failures -1.67838 0.37847 -4.435 1.40e-05 ***
schoolsupyes -1.40571 0.82011 -1.714 0.08780 .
higheryes 2.07726 1.11806 1.858 0.06439 .
romanticyes -1.26152 0.55074 -2.291 0.02284 *
goout -0.39622 0.22746 -1.742 0.08279 .
health -0.31522 0.18421 -1.711 0.08831 .
absences 0.05206 0.03077 1.692 0.09201 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 16.93193)
Null deviance: 5724.9 on 258 degrees of freedom
Residual deviance: 4114.5 on 243 degrees of freedom
AIC: 1485.3
Number of Fisher Scoring iterations: 2
# Linear regression: predictions on the training and test sets
PredictTrain <- predict(step.math.lm, newdata = math.train)
PredictTest <- predict(step.math.lm, newdata = math.test)

# Linear regression KPIs
# The training-set mean is the baseline for both the in-sample R2 and the
# out-of-sample R2 (OSR2). SSTTrain and SSTTest are reused by later models.
mean_train <- mean(math.train$G3)
SSTTrain <- sum((math.train$G3 - mean_train)^2)
SSTTest <- sum((math.test$G3 - mean_train)^2)
SSETrain <- sum((PredictTrain - math.train$G3)^2)
SSETest <- sum((PredictTest - math.test$G3)^2)
R2 <- 1 - SSETrain / SSTTrain
OSR2 <- 1 - SSETest / SSTTest

# The variables RMSE and MAE share their names with the caret functions, so
# the function calls are namespace-qualified for clarity.
RMSE <- caret::RMSE(PredictTest, math.test$G3)
MAE <- caret::MAE(PredictTest, math.test$G3)

KPILM <- data.frame(Model = "LM", R2 = R2, OSR2 = OSR2, RMSE = RMSE, MAE = MAE)
KPILM
# Lasso
# Design matrices (formula drops the intercept column) and response vectors
# in the matrix/vector form that glmnet expects.
x.math.train <- model.matrix(G3 ~ . - 1, data = math.train)
y.math.train <- math.train[["G3"]]
x.math.test <- model.matrix(G3 ~ . - 1, data = math.test)
y.math.test <- math.test[["G3"]]

# Candidate penalties: exp(5) down to exp(-5) on a log-spaced grid.
lasso.lambdas <- exp(seq(from = 5, to = -5, by = -0.1))

# 10-fold CV (seeded) picks the lambda with the lowest CV error; the final
# lasso (alpha = 1) is then refit at that single lambda.
set.seed(1)
cv.lasso <- cv.glmnet(
  x.math.train, y.math.train,
  alpha = 1, lambda = lasso.lambdas, nfolds = 10
)
bestlambda <- cv.lasso$lambda.min
lasso <- glmnet(x.math.train, y.math.train, alpha = 1, lambda = bestlambda)

# Lasso predictions
pred.train.lasso <- predict(lasso, newx = x.math.train)
pred.test.lasso <- predict(lasso, newx = x.math.test)

# Lasso KPIs
# SSTTrain / SSTTest are the baseline sums of squares computed in the linear
# regression KPI step, so R2 / OSR2 are measured against the same baseline.
SSE.train.lasso <- sum((pred.train.lasso - y.math.train)^2)
SSE.test.lasso <- sum((pred.test.lasso - y.math.test)^2)
R2.Lasso <- 1 - SSE.train.lasso / SSTTrain
OSR2.lasso <- 1 - SSE.test.lasso / SSTTest
RMSE.lasso <- caret::RMSE(pred.test.lasso, math.test$G3)
MAE.lasso <- caret::MAE(pred.test.lasso, math.test$G3)
KPILasso <- data.frame(
  Model = "Lasso", R2 = R2.Lasso, OSR2 = OSR2.lasso,
  RMSE = RMSE.lasso, MAE = MAE.lasso
)
KPILasso
# CART: regression tree for G3 grown with rpart's default control settings.
default_tree <- rpart(
  G3 ~ school + sex + age + address + famsize + Pstatus + Medu + Fedu +
    Mjob + Fjob + reason + guardian + traveltime + studytime + failures +
    schoolsup + famsup + paid + activities + nursery + higher + internet +
    romantic + famrel + freetime + goout + Dalc + Walc + health + absences,
  data = math.train
)
# Set all four plot margins to one line before drawing the tree, then print
# the splits with three significant digits.
par(mar = c(1, 1, 1, 1))
prp(default_tree)
print(default_tree, digits = 3)
n= 259
node), split, n, deviance, yval
* denotes terminal node
1) root 259 5720.0 10.40
2) failures>=0.5 54 1410.0 7.15
4) absences< 1 19 219.0 1.42 *
5) absences>=1 35 233.0 10.30 *
3) failures< 0.5 205 3600.0 11.20
6) Medu< 2.5 79 1400.0 10.10
12) absences< 0.5 22 875.0 7.64
24) guardian=mother,other 13 373.0 4.08 *
25) guardian=father 9 99.6 12.80 *
13) absences>=0.5 57 342.0 11.00 *
7) Medu>=2.5 126 2030.0 11.90
14) schoolsup=yes 12 169.0 8.67 *
15) schoolsup=no 114 1720.0 12.30
30) age>=15.5 90 1420.0 11.80
60) studytime< 2.5 68 1030.0 11.20
120) famsup=yes 41 560.0 10.40
240) Mjob=at_home,other,teacher 26 408.0 9.46
480) health>=1.5 19 247.0 8.42 *
481) health< 1.5 7 85.4 12.30 *
241) Mjob=health,services 15 92.9 11.90 *
121) famsup=no 27 393.0 12.50 *
61) studytime>=2.5 22 302.0 13.50
122) Walc>=1.5 9 35.6 10.20 *
123) Walc< 1.5 13 102.0 15.80 *
31) age< 15.5 24 194.0 14.20 *
# CART cross-validation
# Summary function handed to trainControl(summaryFunction = ...).
#   data : data frame with columns `obs` (observed) and `pred` (predicted)
#   lev, model, ... : accepted for signature compatibility; not used here
# Returns a named numeric vector c(RSq = <squared correlation of obs and pred>).
RSquared <- function(data, lev = NULL, model = NULL, ...) {
  observed <- data$obs
  predicted <- data$pred
  c(RSq = cor(observed, predicted)^2)
}
# Tune rpart's complexity parameter cp with 10-fold cross-validation, choosing
# the value that maximises the RSq metric returned by RSquared().
# The seed fixes the fold assignment so the selected cp is reproducible even
# when this step is run on its own, matching the other seeded CV steps.
set.seed(1)
cv.trees <- train(
  G3 ~ school + sex + age + address + famsize + Pstatus + Medu + Fedu +
    Mjob + Fjob + reason + guardian + traveltime + studytime + failures +
    schoolsup + famsup + paid + activities + nursery + higher + internet +
    romantic + famrel + freetime + goout + Dalc + Walc + health + absences,
  data = math.train,
  method = "rpart",
  trControl = trainControl(
    method = "cv", number = 10, summaryFunction = RSquared
  ),
  metric = "RSq", maximize = TRUE,
  tuneGrid = data.frame(cp = seq(0, 0.0004, 0.00001))
)

# CART best tree: refit one tree on the whole training set at the selected cp.
best_cp <- cv.trees$bestTune$cp
best_tree <- rpart(
  G3 ~ school + sex + age + address + famsize + Pstatus + Medu + Fedu +
    Mjob + Fjob + reason + guardian + traveltime + studytime + failures +
    schoolsup + famsup + paid + activities + nursery + higher + internet +
    romantic + famrel + freetime + goout + Dalc + Walc + health + absences,
  data = math.train,
  cp = best_cp
)
best_tree
n= 259
node), split, n, deviance, yval
* denotes terminal node
1) root 259 5724.91900 10.378380
2) failures>=0.5 54 1412.81500 7.148148
4) absences< 1 19 218.63160 1.421053 *
5) absences>=1 35 232.68570 10.257140
10) Walc>=1.5 22 155.31820 9.590909
20) Mjob=health,other 8 29.87500 8.375000 *
21) Mjob=at_home,services,teacher 14 106.85710 10.285710 *
11) Walc< 1.5 13 51.07692 11.384620 *
3) failures< 0.5 205 3600.22400 11.229270
6) Medu< 2.5 79 1400.38000 10.088610
12) absences< 0.5 22 875.09090 7.636364
24) guardian=mother,other 13 372.92310 4.076923 *
25) guardian=father 9 99.55556 12.777780 *
13) absences>=0.5 57 341.92980 11.035090
26) health>=2.5 44 222.72730 10.727270
52) romantic=no 28 151.71430 10.285710
104) sex=F 16 67.93750 9.562500 *
105) sex=M 12 64.25000 11.250000 *
53) romantic=yes 16 56.00000 11.500000 *
27) health< 2.5 13 100.92310 12.076920 *
7) Medu>=2.5 126 2032.61100 11.944440
14) schoolsup=yes 12 168.66670 8.666667 *
15) schoolsup=no 114 1721.44700 12.289470
30) age>=15.5 90 1415.55600 11.777780
60) studytime< 2.5 68 1027.69100 11.220590
120) famsup=yes 41 559.51220 10.365850
240) Mjob=at_home,other,teacher 26 408.46150 9.461538
480) health>=1.5 19 246.63160 8.421053 *
481) health< 1.5 7 85.42857 12.285710 *
241) Mjob=health,services 15 92.93333 11.933330 *
121) famsup=no 27 392.74070 12.518520
242) health>=3.5 18 334.27780 11.611110 *
243) health< 3.5 9 14.00000 14.333330 *
61) studytime>=2.5 22 301.50000 13.500000
122) Walc>=1.5 9 35.55556 10.222220 *
123) Walc< 1.5 13 102.30770 15.769230 *
31) age< 15.5 24 193.95830 14.208330
62) studytime>=2.5 7 28.00000 13.000000 *
63) studytime< 2.5 17 151.52940 14.705880 *
# Draw the tuned tree, then print its splits with three significant digits.
rpart.plot::prp(best_tree)
print(x = best_tree, digits = 3)
n= 259
node), split, n, deviance, yval
* denotes terminal node
1) root 259 5720.0 10.40
2) failures>=0.5 54 1410.0 7.15
4) absences< 1 19 219.0 1.42 *
5) absences>=1 35 233.0 10.30
10) Walc>=1.5 22 155.0 9.59
20) Mjob=health,other 8 29.9 8.38 *
21) Mjob=at_home,services,teacher 14 107.0 10.30 *
11) Walc< 1.5 13 51.1 11.40 *
3) failures< 0.5 205 3600.0 11.20
6) Medu< 2.5 79 1400.0 10.10
12) absences< 0.5 22 875.0 7.64
24) guardian=mother,other 13 373.0 4.08 *
25) guardian=father 9 99.6 12.80 *
13) absences>=0.5 57 342.0 11.00
26) health>=2.5 44 223.0 10.70
52) romantic=no 28 152.0 10.30
104) sex=F 16 67.9 9.56 *
105) sex=M 12 64.2 11.20 *
53) romantic=yes 16 56.0 11.50 *
27) health< 2.5 13 101.0 12.10 *
7) Medu>=2.5 126 2030.0 11.90
14) schoolsup=yes 12 169.0 8.67 *
15) schoolsup=no 114 1720.0 12.30
30) age>=15.5 90 1420.0 11.80
60) studytime< 2.5 68 1030.0 11.20
120) famsup=yes 41 560.0 10.40
240) Mjob=at_home,other,teacher 26 408.0 9.46
480) health>=1.5 19 247.0 8.42 *
481) health< 1.5 7 85.4 12.30 *
241) Mjob=health,services 15 92.9 11.90 *
121) famsup=no 27 393.0 12.50
242) health>=3.5 18 334.0 11.60 *
243) health< 3.5 9 14.0 14.30 *
61) studytime>=2.5 22 302.0 13.50
122) Walc>=1.5 9 35.6 10.20 *
123) Walc< 1.5 13 102.0 15.80 *
31) age< 15.5 24 194.0 14.20
62) studytime>=2.5 7 28.0 13.00 *
63) studytime< 2.5 17 152.0 14.70 *
# CART KPIs
# Predictions of the default and the tuned tree on the training and test sets.
default_pred_train <- predict(default_tree, newdata = math.train)
best_pred_train <- predict(best_tree, newdata = math.train)
default_pred <- predict(default_tree, newdata = math.test)
best_pred <- predict(best_tree, newdata = math.test)
actualtrain <- math.train$G3
actual <- math.test$G3

# R2 / OSR2 are computed as 1 - SSE/SST with the training-set mean as the
# baseline, i.e. the same definition the LM, Lasso and RF KPIs use, so the
# rows of the final comparison table are directly comparable. (A squared
# correlation is not an out-of-sample R2: it ignores bias and scale errors.)
cart_baseline <- mean(actualtrain)
cart_sst_train <- sum((actualtrain - cart_baseline)^2)
cart_sst_test <- sum((actual - cart_baseline)^2)

# R2 (in-sample)
R2cart_default <- 1 - sum((default_pred_train - actualtrain)^2) / cart_sst_train
R2cart <- 1 - sum((best_pred_train - actualtrain)^2) / cart_sst_train
# OSR2 (out-of-sample)
OSR2cart_default <- 1 - sum((default_pred - actual)^2) / cart_sst_test
OSR2cart <- 1 - sum((best_pred - actual)^2) / cart_sst_test
# MAE
MAEcart_default <- Metrics::mae(actual, default_pred)
MAEcart <- Metrics::mae(actual, best_pred)
# RMSE
RMSEcart_default <- Metrics::rmse(actual, default_pred)
RMSEcart <- Metrics::rmse(actual, best_pred)

# Only the tuned tree is reported in the comparison table.
KPIcart <- data.frame(
  Model = "Cart", R2 = R2cart, OSR2 = OSR2cart,
  RMSE = RMSEcart, MAE = MAEcart
)
KPIcart
# Random forest: tune mtry over 10..30 with seeded 10-fold cross-validation.
# nodesize and ntree are forwarded by train() to randomForest(); the argument
# must be spelled `nodesize` exactly -- a misspelled name is not matched and
# the forest silently falls back to its default node size.
set.seed(1)
rf.cv.math <- train(
  y = math.train$G3,
  x = subset(math.train, select = -c(G3)),
  method = "rf",
  nodesize = 25,
  ntree = 80,
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = data.frame(mtry = seq(10, 30, 1))
)
rf.cv.math
Random Forest
259 samples
30 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 234, 234, 232, 232, 233, 233, ...
Resampling results across tuning parameters:
mtry RMSE Rsquared MAE
10 3.963764 0.3172931 3.077470
11 3.851030 0.3541703 2.958513
12 3.900203 0.3373594 3.028728
13 3.816625 0.3667195 2.960741
14 3.856354 0.3491972 2.953694
15 3.834740 0.3633247 2.957087
16 3.905774 0.3233089 3.004829
17 3.834463 0.3545586 2.982463
18 3.859076 0.3409640 3.010669
19 3.842274 0.3496722 2.990125
20 3.834634 0.3460991 2.980646
21 3.787923 0.3648602 2.916338
22 3.826180 0.3503290 2.933181
23 3.877063 0.3278590 3.007338
24 3.861419 0.3372604 2.981451
25 3.829523 0.3545664 2.980368
26 3.810090 0.3565369 2.954413
27 3.775078 0.3698736 2.906899
28 3.833981 0.3512126 2.960694
29 3.921614 0.3164576 3.026684
30 3.834718 0.3441478 2.943957
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was mtry = 27.
# Final random forest, fit on the whole training set with the mtry selected
# by the cross-validation in rf.cv.math (read from bestTune rather than
# copied by hand, so it cannot go stale when the CV is re-run). nodesize and
# ntree match the values used during tuning.
best_mtry <- rf.cv.math$bestTune$mtry
mod.rf.math <- randomForest(
  G3 ~ .,
  data = math.train,
  mtry = best_mtry,
  nodesize = 25,
  ntree = 80
)

# Variable importance table of the fitted forest.
important_vars_ames <- importance(mod.rf.math)
important_vars_ames
IncNodePurity
school 13.598025
sex 74.051025
age 117.017024
address 17.549492
famsize 22.460835
Pstatus 9.767272
Medu 177.078430
Fedu 151.125383
Mjob 175.574637
Fjob 90.998446
reason 96.759936
guardian 124.566849
traveltime 36.180535
studytime 61.785055
failures 811.508862
schoolsup 43.680819
famsup 39.000993
paid 12.276863
activities 17.551460
nursery 17.105788
higher 62.736530
internet 3.906562
romantic 33.805567
famrel 63.653592
freetime 86.209069
goout 85.048995
Dalc 31.216909
Walc 97.885504
health 164.735303
absences 1016.886700
# Random forest predictions on the training and test sets
pred.train.rf.math <- predict(mod.rf.math, newdata = math.train)
pred.test.rf.math <- predict(mod.rf.math, newdata = math.test)

# Random forest performance
# SSTTrain / SSTTest are the training-mean baseline sums of squares computed
# in the linear regression KPI step.
# In-sample metrics
R2.rf.math <- 1 - sum((pred.train.rf.math - math.train$G3)^2) / SSTTrain
MAE.rf.math <- mean(abs(pred.train.rf.math - math.train$G3))
RMSE.rf.math <- sqrt(mean((pred.train.rf.math - math.train$G3)^2))
# Out-of-sample metrics
OSR2.rf.math.test <- 1 - sum((pred.test.rf.math - math.test$G3)^2) / SSTTest
MAE.rf.math.test <- mean(abs(pred.test.rf.math - math.test$G3))
RMSE.rf.math.test <- sqrt(mean((pred.test.rf.math - math.test$G3)^2))

# Columns are ordered Model, R2, OSR2, RMSE, MAE like the other KPI frames.
KPIrf <- data.frame(
  Model = "RF", R2 = R2.rf.math, OSR2 = OSR2.rf.math.test,
  RMSE = RMSE.rf.math.test, MAE = MAE.rf.math.test
)
KPIrf

# All model KPIs side by side (one row per model).
rbind(KPILM, KPILasso, KPIcart, KPIrf)