Reading the datasets:
#Reading the datasets
train <- read.csv("diabetes_train-std.csv")
test <- read.csv("diabetes_test-std.csv")
attach(train)
#Converting the class variable to a factor
train$classvariable <- as.factor(train$classvariable)
test$classvariable <- as.factor(test$classvariable)
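Before modeling, a quick look at the class balance is worth adding; the proportion of 1's in the training set (154/400 = 0.385) reappears below both as the root node error and as the GBM classification cutoff.
#Checking the class balance; the proportion of 1's (0.385) is reused later
table(train$classvariable)
prop.table(table(train$classvariable))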
library(rpart)
#Fitting rpart
cfit <- rpart(classvariable ~ numpreg + plasmacon + bloodpress + skinfold +
                seruminsulin + BMI + pedigreefunction + age,
              data = train, method = "class")
par(mar=c(1,1,0.25,1))
plot(cfit, branch = 0.4, uniform = TRUE, compress = TRUE)
text(cfit)
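As an optional aside (not part of the original analysis), the rpart.plot package draws a more readable tree, with class labels and node proportions; a minimal sketch:
#Optional: a more readable tree rendering via the rpart.plot package
library(rpart.plot)
rpart.plot(cfit, type = 2, extra = 104)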
#Prediction on test set
rpart.predict <- predict(cfit, newdata = test, type = "class")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
cf <- confusionMatrix(rpart.predict, test$classvariable)
#Mean error rate (the "Accuracy" name printed below is inherited from cf$overall)
mean.error.rate.rpart <- 1- cf$overall[1]
mean.error.rate.rpart
## Accuracy
## 0.2408964
Plotting the CP (complexity parameter) curve and reading off the value to determine whether pruning is necessary.
#How to read plotcp - http://www.wekaleamstudios.co.uk/posts/classification-trees-using-the-rpart-function/#m3mLNpeke0I
par(mar=c(3,3,3,3))
plotcp(cfit, lty = 3, col = 1)
printcp(cfit)
##
## Classification tree:
## rpart(formula = classvariable ~ numpreg + plasmacon + bloodpress +
## skinfold + seruminsulin + BMI + pedigreefunction + age, data = train,
## method = "class")
##
## Variables actually used in tree construction:
## [1] age BMI pedigreefunction plasmacon
##
## Root node error: 154/400 = 0.385
##
## n= 400
##
## CP nsplit rel error xerror xstd
## 1 0.279221 0 1.00000 1.00000 0.063194
## 2 0.032468 1 0.72078 0.85714 0.061067
## 3 0.016234 6 0.55195 0.81818 0.060327
## 4 0.012987 8 0.51948 0.84416 0.060828
## 5 0.010000 10 0.49351 0.83766 0.060706
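As an alternative to eyeballing the plot, the 1-SE rule can be applied directly to the CP table; on the table above this selects CP ≈ 0.032, matching the visual read. A minimal sketch, using the cfit object fitted above:
#Picking the CP programmatically via the 1-SE rule: the smallest tree whose
#xerror is within one standard error of the minimum cross-validated error
cp.tab <- cfit$cptable
min.row <- which.min(cp.tab[, "xerror"])
thresh <- cp.tab[min.row, "xerror"] + cp.tab[min.row, "xstd"]
best.cp <- cp.tab[which(cp.tab[, "xerror"] <= thresh)[1], "CP"]
best.cp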
The plot shows a tree of size 2 (a single split) as the optimum. The corresponding CP value is around 0.03. Pruning and replotting the tree:
#After pruning
cfit2 <- prune(cfit, cp = 0.03)
par(mar=c(1,1,0.25,1))
plot(cfit2, branch = 0.4, uniform = TRUE, compress = TRUE)
text(cfit2)
Using the pruned tree to predict on the test set, then pulling up the mean error rate and confusion matrix:
#Prediction on test set
rpart.prune.predict <- predict(cfit2, newdata = test, type = "class")
cf.prune <- confusionMatrix(rpart.prune.predict, test$classvariable)
#Mean error rate
mean.error.rate.rpart.prune <- 1- cf.prune$overall[1]
mean.error.rate.rpart.prune
## Accuracy
## 0.2156863
#Confusion Matrix
cf.prune$table
## Reference
## Prediction 0 1
## 0 205 37
## 1 40 75
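caret's confusionMatrix object also carries per-class statistics; a quick look at the sensitivity and specificity of the pruned tree (caret treats the first factor level, "0", as the positive class by default):
#Sensitivity and specificity for the pruned tree
cf.prune$byClass[c("Sensitivity", "Specificity")]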
Running a random forest with default settings:
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
rf.diab <- randomForest(classvariable ~ numpreg + plasmacon + bloodpress +
                          skinfold + seruminsulin + BMI + pedigreefunction + age,
                        data = train, importance = TRUE)
rf.yhat <- predict(rf.diab,newdata = test)
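With the defaults, randomForest uses mtry = floor(sqrt(p)), i.e. 2 of the 8 predictors per split here. As an optional sketch (not part of the original run), tuneRF can search for a better mtry using the OOB error:
#Optional sketch: tune mtry on the OOB error
set.seed(1)
predictors <- train[, c("numpreg", "plasmacon", "bloodpress", "skinfold",
                        "seruminsulin", "BMI", "pedigreefunction", "age")]
tuneRF(predictors, train$classvariable, ntreeTry = 500,
       stepFactor = 1.5, improve = 0.01)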
Plotting the error rates from the random forest model:
par(mar=c(3,3,3,3))
plot(rf.diab, type="l")
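The plot draws one curve per column of rf.diab$err.rate (the OOB error plus one per class) but no legend; one can be added to match matplot's default colours and line types:
#Label the error curves (OOB plus per-class)
legend("topright", legend = colnames(rf.diab$err.rate),
       col = 1:ncol(rf.diab$err.rate), lty = 1:ncol(rf.diab$err.rate))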
The most important variables are:
varImpPlot(rf.diab,main = "Important Variables")
importance(rf.diab)
## 0 1 MeanDecreaseAccuracy
## numpreg 8.092932 0.1835287 6.8698447
## plasmacon 21.855425 20.3960578 28.1393381
## bloodpress -1.908877 -2.2502017 -3.0025119
## skinfold 1.017169 -0.3356204 0.7924919
## seruminsulin 3.488566 0.4599741 3.1783688
## BMI 10.749431 12.6523482 16.4091620
## pedigreefunction 5.638079 4.5883538 7.3318656
## age 5.706137 9.0374097 10.7430051
## MeanDecreaseGini
## numpreg 15.36864
## plasmacon 43.56476
## bloodpress 16.70306
## skinfold 14.41862
## seruminsulin 14.20837
## BMI 33.86676
## pedigreefunction 25.02633
## age 24.93435
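The same table sorted by mean decrease in accuracy makes the ranking easier to read:
#Ranking predictors by mean decrease in accuracy
imp <- importance(rf.diab)
imp[order(imp[, "MeanDecreaseAccuracy"], decreasing = TRUE), ]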
Confusion Matrix and the mean error rate:
rf.cm <- confusionMatrix(rf.yhat, test$classvariable)
#Mean error rate
mean.error.rate.rf <- 1- rf.cm$overall[1]
mean.error.rate.rf
## Accuracy
## 0.1960784
#Confusion Matrix
rf.cm$table
## Reference
## Prediction 0 1
## 0 218 43
## 1 27 69
#Since gbm's bernoulli distribution needs a numeric 0/1 outcome rather than a factor, re-reading the datasets
train <- read.csv("diabetes_train-std.csv")
test <- read.csv("diabetes_test-std.csv")
attach(train)
## The following objects are masked from train (pos = 8):
##
## age, bloodpress, BMI, classvariable, numpreg,
## pedigreefunction, plasmacon, seruminsulin, skinfold
Modeling gradient boosting with the following parameters: interaction depth is the maximum depth of variable interactions, while shrinkage is the learning rate, a scaling factor applied to each tree in the expansion. Using these parameters gives much better accuracy than the default settings.
library(gbm)
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
gbm.diab <- gbm(classvariable ~ numpreg + plasmacon + bloodpress + skinfold +
                  seruminsulin + BMI + pedigreefunction + age,
                data = train, distribution = "bernoulli", n.trees = 500,
                interaction.depth = 4, shrinkage = 0.005)
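Fixing n.trees at 500 is somewhat arbitrary; as a sketch (an addition, not part of the original fit), gbm's built-in cross-validation via cv.folds together with gbm.perf can pick the tree count:
#Sketch: choose the number of trees by 5-fold cross-validation
gbm.cv <- gbm(classvariable ~ numpreg + plasmacon + bloodpress + skinfold +
                seruminsulin + BMI + pedigreefunction + age,
              data = train, distribution = "bernoulli", n.trees = 500,
              interaction.depth = 4, shrinkage = 0.005, cv.folds = 5)
best.iter <- gbm.perf(gbm.cv, method = "cv")
best.iter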
Most important variables:
par(mar=c(4,4,4,4))
summary(gbm.diab)
## var rel.inf
## plasmacon plasmacon 39.375942
## BMI BMI 23.012549
## pedigreefunction pedigreefunction 11.966060
## age age 11.577427
## numpreg numpreg 4.698251
## seruminsulin seruminsulin 4.489544
## skinfold skinfold 2.606234
## bloodpress bloodpress 2.273994
Confusion Matrix and Mean Error Rate:
Since predict.gbm returns continuous scores rather than hard classes (by default on the log-odds scale, as the negative values in the summary below show), I have selected a cutoff of 0.385 to separate the 1's from the 0's, 0.385 being the class proportion of 1's in the training set.
gbm.yhat <- predict(gbm.diab, newdata = test, n.trees = 500)
summary(gbm.yhat)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.50600 -1.62200 -0.95600 -0.75740 -0.02431 1.99200
gbm.yhat1 <- ifelse(gbm.yhat < 0.385, 0, 1)
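A cleaner alternative (a sketch only; it will not reproduce the classes above exactly, since the cutoff there is applied on the log-odds scale) is to request probabilities with type = "response" and apply the 0.385 cutoff on the probability scale:
#Alternative sketch: predict probabilities directly and cut at 0.385
gbm.prob <- predict(gbm.diab, newdata = test, n.trees = 500, type = "response")
gbm.class <- ifelse(gbm.prob < 0.385, 0, 1)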
library(caret)
gbm.cm <- confusionMatrix(gbm.yhat1,test$classvariable)
#Mean error rate
mean.error.rate.gbm <- 1- gbm.cm$overall[1]
mean.error.rate.gbm
## Accuracy
## 0.2044818
#Confusion Matrix
gbm.cm$table
## Reference
## Prediction 0 1
## 0 229 57
## 1 16 55
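Finally, collecting the three test-set error rates computed above for a side-by-side comparison:
#Comparing mean test error rates across the three models
data.frame(model = c("rpart (pruned)", "random forest", "gbm"),
           error.rate = c(mean.error.rate.rpart.prune,
                          mean.error.rate.rf,
                          mean.error.rate.gbm))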