In this problem we analyze the diabetes dataset to predict whether or not an individual suffers from diabetes. We will use a decision tree classifier with and without the meta-methods bagging and boosting.

Reading the datasets

#Reading the dataset
train <- read.csv("diabetes_train-std.csv")
test <- read.csv("diabetes_test-std.csv")

attach(train)

#Converting class variable to factor
train$classvariable <- as.factor(train$classvariable)
test$classvariable <- as.factor(test$classvariable)
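
Before fitting any models, it is worth checking the class balance in the training set. Per the rpart output in part (a), 154 of the 400 training cases are positive (38.5%), and this proportion reappears as the classification threshold in part (c). A quick check:

#Class balance: the positive-class proportion (0.385) is used
#as the classification threshold in part (c)
table(train$classvariable)
prop.table(table(train$classvariable))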

(a) Fit a classification tree. Plot the tree, and report the mean error rate (fraction of incorrect labels) on the test data. Report the confusion matrix. We use rpart in R to fit the decision tree.

library(rpart)

#Fitting rpart
cfit <- rpart(classvariable ~ numpreg + plasmacon + bloodpress + skinfold +
                seruminsulin + BMI + pedigreefunction + age,
              data = train, method = "class")

par(mar=c(1,1,0.25,1))
plot(cfit, branch = 0.4, uniform = TRUE, compress = TRUE)
text(cfit)

#Prediction on test set
rpart.predict <- predict(cfit, newdata = test, type = "class")

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

cf <- confusionMatrix(rpart.predict, test$classvariable)

#Mean error rate
mean.error.rate.rpart <- 1 - cf$overall[1]
mean.error.rate.rpart
##  Accuracy 
## 0.2408964
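
The confusion matrix for the unpruned tree can be printed from the same caret object:

#Confusion Matrix for the unpruned tree
cf$table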

Plotting the complexity parameter (CP) curve and reading off the value to determine whether pruning is necessary.

#How to read plotcp - http://www.wekaleamstudios.co.uk/posts/classification-trees-using-the-rpart-function/#m3mLNpeke0I

par(mar=c(3,3,3,3))
plotcp(cfit, lty = 3, col = 1)

printcp(cfit)
## 
## Classification tree:
## rpart(formula = classvariable ~ numpreg + plasmacon + bloodpress + 
##     skinfold + seruminsulin + BMI + pedigreefunction + age, data = train, 
##     method = "class")
## 
## Variables actually used in tree construction:
## [1] age              BMI              pedigreefunction plasmacon       
## 
## Root node error: 154/400 = 0.385
## 
## n= 400 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.279221      0   1.00000 1.00000 0.063194
## 2 0.032468      1   0.72078 0.85714 0.061067
## 3 0.016234      6   0.55195 0.81818 0.060327
## 4 0.012987      8   0.51948 0.84416 0.060828
## 5 0.010000     10   0.49351 0.83766 0.060706

The plot shows a tree of size 2 (a single split) as optimal; the corresponding CP value is around 0.03. Pruning and replotting the tree:
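
Reading the value off the plot this way corresponds to the 1-SE rule: choose the simplest tree whose cross-validated error (xerror) is within one standard error of the minimum. A minimal sketch of doing this programmatically from the cptable above:

#1-SE rule: smallest tree with xerror within one standard error
#of the minimum cross-validated error
cptab <- cfit$cptable
min.row <- which.min(cptab[, "xerror"])
threshold <- cptab[min.row, "xerror"] + cptab[min.row, "xstd"]
best.cp <- cptab[which(cptab[, "xerror"] <= threshold)[1], "CP"]
best.cp   #~0.032, matching the value read off the plot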

#After pruning
cfit2 <- prune(cfit, cp = 0.03)

par(mar=c(1,1,0.25,1))
plot(cfit2, branch = 0.4, uniform = TRUE, compress = TRUE)
text(cfit2)

Using the pruned tree to predict on the test set and reporting the mean error rate and confusion matrix:

#Prediction on test set
rpart.prune.predict <- predict(cfit2, newdata = test, type = "class")

cf.prune <- confusionMatrix(rpart.prune.predict, test$classvariable)

#Mean error rate
mean.error.rate.rpart.prune <- 1 - cf.prune$overall[1]
mean.error.rate.rpart.prune
##  Accuracy 
## 0.2156863
#Confusion Matrix
cf.prune$table
##           Reference
## Prediction   0   1
##          0 205  37
##          1  40  75
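
As a quick sanity check, the same error rate can be recovered directly from the confusion matrix:

#Off-diagonal counts over the total: (37 + 40) / 357
1 - sum(diag(cf.prune$table)) / sum(cf.prune$table)
## [1] 0.2156863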

(b) Analyze the data using random forests. Report the mean error rate and the confusion matrix.

Running a random forest with default settings:

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
rf.diab <- randomForest(classvariable ~ numpreg + plasmacon + bloodpress +
                          skinfold + seruminsulin + BMI + pedigreefunction + age,
                        data = train, importance = TRUE)

rf.yhat <- predict(rf.diab, newdata = test)

Plotting the errors from Random Forest model:

par(mar=c(3,3,3,3))
plot(rf.diab, type="l")
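
The black curve is the out-of-bag (OOB) error, which gives an internal estimate of the test error without touching the holdout set. Its final value can be pulled directly from the fitted object (a small sketch):

#Final OOB error estimate (last row of the error-rate matrix)
tail(rf.diab$err.rate[, "OOB"], 1)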

Important variables are:

varImpPlot(rf.diab,main = "Important Variables")

importance(rf.diab)
##                          0          1 MeanDecreaseAccuracy
## numpreg           8.092932  0.1835287            6.8698447
## plasmacon        21.855425 20.3960578           28.1393381
## bloodpress       -1.908877 -2.2502017           -3.0025119
## skinfold          1.017169 -0.3356204            0.7924919
## seruminsulin      3.488566  0.4599741            3.1783688
## BMI              10.749431 12.6523482           16.4091620
## pedigreefunction  5.638079  4.5883538            7.3318656
## age               5.706137  9.0374097           10.7430051
##                  MeanDecreaseGini
## numpreg                  15.36864
## plasmacon                43.56476
## bloodpress               16.70306
## skinfold                 14.41862
## seruminsulin             14.20837
## BMI                      33.86676
## pedigreefunction         25.02633
## age                      24.93435

Confusion Matrix and the mean error rate:

rf.cm <- confusionMatrix(rf.yhat, test$classvariable)

#Mean error rate
mean.error.rate.rf <- 1 - rf.cm$overall[1]
mean.error.rate.rf
##  Accuracy 
## 0.1960784
#Confusion Matrix
rf.cm$table
##           Reference
## Prediction   0   1
##          0 218  43
##          1  27  69
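
The forest above uses the default mtry (floor(sqrt(p)), i.e. 2 of the 8 predictors tried at each split). If one wanted to tune it, randomForest provides tuneRF; a minimal sketch, with illustrative stepFactor and improve values that were not part of the original analysis:

#Tune mtry on the OOB error (illustrative settings)
set.seed(1)
predictors <- train[, c("numpreg", "plasmacon", "bloodpress", "skinfold",
                        "seruminsulin", "BMI", "pedigreefunction", "age")]
tuneRF(predictors, train$classvariable,
       ntreeTry = 500, stepFactor = 1.5, improve = 0.01)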

(c) Use a gradient boosted decision tree (GBDT) to analyze the data. Report the mean error rate and the confusion matrix. We use the gbm package in R.

#Since GBM does not work on factor outcomes, re-reading the dataset
train <- read.csv("diabetes_train-std.csv")
test <- read.csv("diabetes_test-std.csv")

attach(train)
## The following objects are masked from train (pos = 8):
## 
##     age, bloodpress, BMI, classvariable, numpreg,
##     pedigreefunction, plasmacon, seruminsulin, skinfold

Modeling gradient boosting with the following parameters:

  1. 500 trees
  2. Interaction depth = 4
  3. Shrinkage = 0.005

Interaction depth is the maximum depth of variable interactions, while shrinkage is the learning rate, a multiplier applied to each tree's contribution to the expansion. These parameters give much better accuracy than the default settings.

library(gbm)
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
gbm.diab <- gbm(classvariable ~ numpreg + plasmacon + bloodpress + skinfold +
                  seruminsulin + BMI + pedigreefunction + age,
                data = train, distribution = "bernoulli", n.trees = 500,
                interaction.depth = 4, shrinkage = 0.005)
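
The choice of n.trees can also be checked by cross-validation: refitting with cv.folds set and asking gbm.perf for the iteration with the lowest CV error. A sketch, assuming 5 folds (not part of the original fit):

#Cross-validated choice of the number of trees (sketch)
gbm.cv <- gbm(classvariable ~ numpreg + plasmacon + bloodpress + skinfold +
                seruminsulin + BMI + pedigreefunction + age,
              data = train, distribution = "bernoulli", n.trees = 500,
              interaction.depth = 4, shrinkage = 0.005, cv.folds = 5)
best.iter <- gbm.perf(gbm.cv, method = "cv")  #iteration minimizing CV error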

Most important variables:

par(mar=c(4,4,4,4))
summary(gbm.diab)

##                               var   rel.inf
## plasmacon               plasmacon 39.375942
## BMI                           BMI 23.012549
## pedigreefunction pedigreefunction 11.966060
## age                           age 11.577427
## numpreg                   numpreg  4.698251
## seruminsulin         seruminsulin  4.489544
## skinfold                 skinfold  2.606234
## bloodpress             bloodpress  2.273994

Confusion Matrix and Mean Error Rate:

With distribution = "bernoulli", gbm predicts on the log-odds scale by default rather than returning probabilities (note the range of the summary below). I have selected a threshold of 0.385 on this scale as the cutoff between 1's and 0's, since 0.385 was the class proportion of 1's in the training data.

gbm.yhat <- predict(gbm.diab, newdata = test, n.trees = 500)
summary(gbm.yhat)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.50600 -1.62200 -0.95600 -0.75740 -0.02431  1.99200
gbm.yhat1 <- ifelse(gbm.yhat < 0.385, 0, 1)
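
Equivalently, predict can return values on the probability scale with type = "response"; since the logistic function is monotone, thresholding there at plogis(0.385) (about 0.595) yields the same labels as the link-scale cutoff above:

#Same labels via the probability scale
gbm.prob <- predict(gbm.diab, newdata = test, n.trees = 500,
                    type = "response")
gbm.yhat2 <- ifelse(gbm.prob < plogis(0.385), 0, 1)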

library(caret)
#confusionMatrix expects factor inputs in recent caret versions
gbm.cm <- confusionMatrix(factor(gbm.yhat1), factor(test$classvariable))

#Mean error rate
mean.error.rate.gbm <- 1 - gbm.cm$overall[1]
mean.error.rate.gbm
##  Accuracy 
## 0.2044818
#Confusion Matrix
gbm.cm$table
##           Reference
## Prediction   0   1
##          0 229  57
##          1  16  55
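
Finally, the error rates of the three models can be collected side by side (using the values computed above):

#Test error rates from the runs above
data.frame(model = c("pruned rpart", "random forest", "gbm"),
           test.error = unname(c(mean.error.rate.rpart.prune,
                                 mean.error.rate.rf,
                                 mean.error.rate.gbm)))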