Loading the Libaries and dataset

library(glmnet)
library(caret)
library(rpart)
library(klaR)
library(e1071)
library(caTools)
library(gbm)
library(ggplot2)
library(ggthemes)

setwd("~/Documents/Data Mining Slides/R and associated")
student_mat <- read.csv("student-mat.csv")
student_por <- read.csv("student-por.csv")
d3=merge(student_mat,student_por, all=TRUE)
print(nrow(d3)) 
## [1] 1044
students <- subset( d3, select = -c(G1, G2 ))

Distribution and Analysis

Let us delve into the analysis of data distribution. The state of parents’ education is routinely cited as an important predictor of student academic success in many student analytics literature. Below is the plot of the job distribution of mothers.

Mother’s Job

ggplot(data = students, aes(x = students$Mjob)) +
  geom_histogram(bins = 12, stat = "count", fill='dodgerblue1') + 
  #geom_density() + 
  ggtitle("Distribution of Mother's Job in student data") + 
  xlab("Mother's Job") +  
  ylab("Distribution of Mother's Job") +
  theme(plot.title=element_text(face="bold"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

Mother’s job is grouped into five categories - Housewife, Employment sectors (Health, Education, Other services), and Other. The classification into health and education is really important as these two are highly crucial in the upbringing of a child and a mother who is employed in these fields might help the child further. In addition, the distinction of housewife, employed and other is also important because mothers who are full-time homemakers might be able to focus more on their child’s needs. All these hyotheses need to be tested for veracity.

Now let us compare Mother’s Job and Student’s Grade.

Comparing Mother’s Job vs Student Grades

ggplot(students, aes(Mjob, fill=students$G3)) +
  geom_bar(aes(y = (..count..)/sum(..count..)), position ='stack', fill='dodgerblue1') + 
  ggtitle("Impact of Mother's Job on students' Grade") +
  labs(x="Mother's Job", y="Proportion", fill="Grades") +
  theme(plot.title=element_text(face="bold"))

We can observe an interesting pattern here. Kids of mothers who are housewives are more likely to perform below average than other kids. Meanwhile kids whose mothers are employed in the health sector are more likely to perform above average.

Willingness to Pursue Higher Education

Let us move on to the distribution of Higher Education. This variable represents the students’ willingness to pursue higher education.

ggplot(data = students, aes(x = students$higher)) +
  geom_histogram(bins = 12, stat = "count", fill="dodgerblue1") +
  #geom_density() +
  ggtitle("Distribution of Higher Education in student data") +
  xlab("Higher Education") +
  ylab("Distribution of Higher Education")+
  theme(plot.title=element_text(face="bold"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

It is pretty skewed as expected. A lot of students have answered yes. We might have to equalize the distribution before finding whether this yields any significant information about the grades.

Target Variable - Distribution of Grades

Let us analyze the target variable Grades in its original numeric form firstly.

ggplot(data = d3, aes(x = d3$G3)) +
  geom_histogram(bins = 12, stat = "count", color='black', fill='dodgerblue1') +
  geom_density() +
  ggtitle("Distribution of Grade in student data") +
  xlab("Grade") +
  ylab("Distribution of Grade")+
  theme(plot.title=element_text(face="bold"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

We can see that the range 10 is the approximate average where the grades of most students fall in. There is one odd pattern though - there are no students in the grades 2 and 3. This seems like an issue in data quality, given our sample size.

History of Failures

Distribution of previous failures.

ggplot(data = students, aes(x = students$failures)) +
  geom_histogram(bins = 12, stat = "count", colo='black', fill='dodgerblue1') +
  geom_density() +
  ggtitle("Distribution of previous failures in student data") +
  xlab("Failures") +
  ylab("Distribution of Failures")+
  theme(plot.title=element_text(face="bold"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad, colo

This might turn out to be an important variable like the previous default history variable in the loan default prediction project.

Relations between variables

Failures vs Grades

Let us find the correlation between Failures and Grades.

cor(d3$G3, d3$failures)
## [1] -0.3831453
plot(x=d3$failures, y=d3$G3, xlab = "Failures", ylab = "Rank", col = "blue")

Age vs Grades

Correlation between Age and Grades

cor(d3$age, d3$G3)
## [1] -0.1252824
plot(x=d3$age, y=d3$G3, xlab = "Age", ylab = "Rank", col = "red")

Grades vs Study Time

Boxplot between Grades and Study Time

boxplot(d3$studytime ~ d3$G3, data = d3,  xlab = "Rank", ylab = "Study Time", main = "Box plot of Rank vs Study Time" ,col = "red")

Model Building

Kitchen Sink Linear Regression

Before deciding on any model for our dataset, let us throw in all the variables and build a kitchen sink linear regression keeping the target as a numeric variable. The goal of this particular exercise is to aid in the selection of important variables.

set.seed (199)
trainindex=sample (1:nrow(students), nrow(students)/2)
train <- students[trainindex, ]
test <- students[-trainindex, ]
lm.model <- lm(G3 ~ ., data = train)
lm.y <- predict.lm(lm.model, test)
mean((lm.y - test$G3)^2)
## [1] 11.83529
test$G3[test$G3 >=0 & test$G3 <=5] <- "Low"
test$G3[test$G3 == 10 | test$G3 ==6 | test$G3 == 7 | test$G3 == 8 | test$G3 == 9] <- "BelowAverage"
test$G3[test$G3 >10 & test$G3 <=15] <- "Average"
test$G3[test$G3 >15 & test$G3 <=20] <- "High"
table(test$G3)
## 
##      Average BelowAverage         High          Low 
##          274          166           57           25
test$G3 <- as.factor(test$G3)
lm.y <- round(lm.y)
lm.y <- as.numeric(lm.y)
lm.y[lm.y >=0 & lm.y <=5] <- "Low"
lm.y[lm.y== 10 | lm.y ==6 | lm.y == 7 | lm.y == 8 | lm.y == 9] <- "BelowAverage"
lm.y[lm.y>10 & lm.y <=15] <- "Average"
lm.y[lm.y >15 & lm.y <=20] <- "High"
table(lm.y)
## lm.y
##      Average BelowAverage         High          Low 
##          393          120            5            4
lm.y <- as.factor(lm.y)
regconfusion <- confusionMatrix(lm.y, test$G3)
accuracyReg <- regconfusion$overall[1]
accuracyReg
##  Accuracy 
## 0.5651341

So the linear regression model with all the variables has yielded an accuracy of 56%.

Removing Multicoolinearity

Let us build a correlation matrix for all numeric variables to eliminate multicollinearity

##Correlation matrix with numbers
library(corrplot)
M <- cor(students[,c(3,7,8,13,14,15,24:31)])
corrplot(M, method="number")

3- fold cross-validated Linear Regression

Now let us rebuild linear regression after removing correlated variables, low p-value variables and by applying 3-fold cross-validation.

#linear regression with 3-fold cross validation
set.seed(1234)
students <- students[sample(nrow(students)),]
folds <- cut(seq(1,nrow(students)),breaks=3,labels=FALSE)
accuracyReg <- rep(0,3)
for(i in 1:3){
  testIndexes <- which(folds==i,arr.ind=TRUE)
  testData <- students[testIndexes, ]
  trainData <- students[-testIndexes, ]
  lm.model <- lm(G3 ~ ., data = trainData)
  lm.y <- predict.lm(lm.model, testData)
  testData$G3 <- cut(testData$G3, c(-Inf, 5, 10, 15, Inf), labels =c("Low", "BelowAverage", "Average","High"))
  testData$G3 <- as.factor(testData$G3)
  lm.y <- round(lm.y)
  lm.y <- as.numeric(lm.y)
  lm.y <- cut(lm.y, c(-Inf,5,10,15, Inf), labels =c("Low", "BelowAverage", "Average","High"))
  lm.y <- as.factor(lm.y)
  regconfusion <- confusionMatrix(lm.y, testData$G3)
  accuracyReg[i] <- regconfusion$overall[1]
  print(accuracyReg)
}
## [1] 0.5775862 0.0000000 0.0000000
## [1] 0.5775862 0.5574713 0.0000000
## [1] 0.5775862 0.5574713 0.5402299

The highest accuracy that we get is 59%. So cross-validation has increased our accuracy by 3%.

Lasso Regularization

Now let us move on to application of lasso regularization to combine the variable selection. This step is especially necessary because of the small sample size of our dataset.

#Lasso regularization 
students <- students[,-c(8,28)]
x=model.matrix (G3~., students)[,-1]
y=students$G3
set.seed (199)
train=sample (1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
lasso.mod = glmnet (x[train,], y[train], alpha =1)
cv.out = cv.glmnet (x[train,], y[train], alpha =1)
bestlam =cv.out$lambda.min
lasso.pred = predict (lasso.mod, s=bestlam, newx=x[test,])
y.test[y.test >=0 & y.test <=5] <- "Low"
y.test[y.test == 10 | y.test ==6 | y.test == 7 | y.test == 8 | y.test == 9] <- "BelowAverage"
y.test[y.test >10 & y.test <=15] <- "Average"
y.test[y.test >15 & y.test <=20] <- "High"
y.test <- as.factor(y.test)
y.test = factor(y.test,levels(y.test)[c(1,2,4,3)])
predLassoValues <- round(lasso.pred[,1])
predLassoValues <- as.numeric(predLassoValues)
predLassoValues[predLassoValues >=0 & predLassoValues <=5] <- "Low"
predLassoValues[predLassoValues== 10 | predLassoValues ==6 | predLassoValues == 7 | predLassoValues == 8 | predLassoValues == 9] <- "BelowAverage"
predLassoValues[predLassoValues>10 & predLassoValues <=15] <- "Average"
predLassoValues[predLassoValues >15 & predLassoValues <=20] <- "High"
predLassoValues <- as.factor(predLassoValues)
levels(predLassoValues)[4] <- "High"
lassoregconfusion <- confusionMatrix(predLassoValues, y.test)
accuracyLasso <- lassoregconfusion$overall[1]
accuracyLasso
##  Accuracy 
## 0.5842912
#predict(lasso.mod, s=bestlam, type = "coefficients")[1:28,]

Lasso regularization does not have much of an impact on our linear regression. The accuracy hovers around 54%, infact reducing my 2 percentage points.

Our target variable is a continuous numeric - grades ranging from 0 as the lowest to 20 as the highest. We are going to categorize the target variable into different bins, because we are not focused on determining a child’s exact grade, rather on whether it is low, average, or best.

Categorization

Categorization of the target variable into Low, Below Average, Average/Above Average and High.

students$G3[students$G3>=0 & students$G3 <=5] <- "Low"
students$G3[students$G3==10] <- "BelowAverage"
students$G3[students$G3==9] <- "BelowAverage"
students$G3[students$G3==8] <- "BelowAverage"
students$G3[students$G3==7] <- "BelowAverage"
students$G3[students$G3==6] <- "BelowAverage"
students$G3[students$G3 > 10 & students$G3 <=14] <- "Average"
students$G3[students$G3 >14 & students$G3 <=20] <- "High"
students$G3 <- as.factor(students$G3)

Let us build classification models to see if they improve in performance over our regression models.

Decision Trees and Naive Bayes with 3-fold cross-validation

Decision Trees and Naive Bayes

#RP, NB & GB with 3-fold cross-validation
set.seed(1234)
students <- students[sample(nrow(students)),]
folds <- cut(seq(1,nrow(students)),breaks=3,labels=FALSE)
accuracylist_nb <- rep(0,3)
accuracylist_rp <- rep(0,3)
accuracylist_gb <- rep(0,3)
for(i in 1:3){
  testIndexes <- which(folds==i,arr.ind=TRUE)
  testData <- students[testIndexes, ]
  trainData <- students[-testIndexes, ]
  nbmodel <- naiveBayes(G3~., data = trainData)
  rpmodel <- rpart(G3~., data =trainData, cp=0)
  gbmodel <-  gbm(G3~., data=trainData, n.trees=200, interaction.depth=6, shrinkage=0.01)
  predictionNB <- predict(nbmodel, testData)
  predictionRP <- predict(rpmodel, testData, type = 'class')
  nboutput <- confusionMatrix(testData$G3, predictionNB)
  rpoutput <- confusionMatrix(testData$G3, predictionRP)
  accuracylist_nb[i] <- paste0(nboutput$overall[1])
  accuracylist_rp[i] <- paste0(rpoutput$overall[1])
}
## Distribution not specified, assuming multinomial ...
## Distribution not specified, assuming multinomial ...
## Distribution not specified, assuming multinomial ...
print(accuracylist_nb)
## [1] "0.456896551724138" "0.387931034482759" "0.448275862068966"
print(accuracylist_rp)
## [1] "0.46264367816092"  "0.448275862068966" "0.451149425287356"

Both the models have performed poorly compared to the linear regression model with 3-fold classification.

#Visualizing the rpart tree
plot(rpmodel, compress = TRUE)
text(rpmodel, use.n = TRUE)