Start by downloading the data set from Kaggle; it was originally collected by the National Institute of Diabetes and Digestive and Kidney Diseases.
diabetes <- read.csv("~/Downloads/diabetes 3.csv")
#attach(diabetes)
colnames(diabetes)[9] <- "diabetic" #this renames the ninth column from Outcome to diabetic
head(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age diabetic
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
summary(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## diabetic
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
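Note the zero minimums for Glucose, BloodPressure, SkinThickness, Insulin, and BMI: a value of 0 is physiologically impossible for these measurements and marks a missing reading in this data set. A minimal cleaning sketch, not applied in the analysis below (which keeps the zeros as-is):
# Sketch: recode impossible zeros as NA in a copy of the data
zero_cols <- c("Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI")
diabetes_clean <- diabetes # work on a copy; the original is used below
diabetes_clean[zero_cols] <- lapply(diabetes_clean[zero_cols],
                                    function(x) replace(x, x == 0, NA))
colSums(is.na(diabetes_clean[zero_cols])) # how many zeros were recoded per column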
str(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ diabetic : int 1 0 1 0 1 0 1 0 1 1 ...
#install.packages('ggplot2')
#install.packages('tidyverse')
library(ggplot2)
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ✔ purrr 0.3.3
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
HISTOGRAMS TO VISUALIZE THE DISTRIBUTION OF EACH VARIABLE
Age <- diabetes$Age
hist(Age)
Glucose <- diabetes$Glucose
hist(Glucose)
BMI <- diabetes$BMI
hist(BMI)
BloodPressure <- diabetes$BloodPressure
hist(BloodPressure)
SkinThickness <- diabetes$SkinThickness
hist(SkinThickness)
Insulin <- diabetes$Insulin
hist(Insulin)
DiabetesPedigree <- diabetes$DiabetesPedigreeFunction
hist(DiabetesPedigree)
Pregnancies <- diabetes$Pregnancies
hist(Pregnancies)
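Because tidyverse is already loaded, the same eight histograms can also be drawn as a single faceted panel; a minimal sketch, assuming the column names shown by str() above:
# Sketch: all eight predictor histograms in one faceted plot
diabetes %>%
  pivot_longer(-diabetic, names_to = "variable", values_to = "value") %>%
  ggplot(aes(value)) +
  geom_histogram(bins = 30) +
  facet_wrap(~ variable, scales = "free")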
Diabetesdf <- data.frame(Age = diabetes$Age,
                         Glucose = diabetes$Glucose,
                         BloodPressure = diabetes$BloodPressure,
                         BMI = diabetes$BMI,
                         DiabetesPedigree = diabetes$DiabetesPedigreeFunction,
                         Insulin = diabetes$Insulin,
                         SkinThickness = diabetes$SkinThickness,
                         Pregnancies = diabetes$Pregnancies)
# note: use = (not <-) inside data.frame() so the columns get clean names
pairs(Diabetesdf)
Correlation Visualization
# Compute the correlation matrix of the eight predictors, rounded to 1 decimal
diabetes_cor <- round(cor(diabetes[1:8]), 1)
#install.packages("ggcorrplot")
library(ggcorrplot)
ggcorrplot(diabetes_cor)
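ggcorrplot can also print the coefficients on the tiles; a small optional variation using its lab argument:
# Same heatmap with the correlation values printed on each tile
ggcorrplot(diabetes_cor, lab = TRUE)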
# Inspect the rounded correlation matrix computed above
diabetes_cor
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.0 0.1 0.1 -0.1
## Glucose 0.1 1.0 0.2 0.1
## BloodPressure 0.1 0.2 1.0 0.2
## SkinThickness -0.1 0.1 0.2 1.0
## Insulin -0.1 0.3 0.1 0.4
## BMI 0.0 0.2 0.3 0.4
## DiabetesPedigreeFunction 0.0 0.1 0.0 0.2
## Age 0.5 0.3 0.2 -0.1
## Insulin BMI DiabetesPedigreeFunction Age
## Pregnancies -0.1 0.0 0.0 0.5
## Glucose 0.3 0.2 0.1 0.3
## BloodPressure 0.1 0.3 0.0 0.2
## SkinThickness 0.4 0.4 0.2 -0.1
## Insulin 1.0 0.2 0.2 0.0
## BMI 0.2 1.0 0.1 0.0
## DiabetesPedigreeFunction 0.2 0.1 1.0 0.0
## Age 0.0 0.0 0.0 1.0
LOGISTIC REGRESSION
#install.packages('dplyr')
library(dplyr)
# set the seed to make the partition reproducible
set.seed(1)
# Model Fitting
model_algorithm <- glm(diabetic ~ Pregnancies +
                         Glucose +
                         BloodPressure +
                         SkinThickness +
                         Insulin +
                         BMI +
                         DiabetesPedigreeFunction +
                         Age,
                       family = binomial(link = 'logit'), data = diabetes)
print(summary(model_algorithm))
##
## Call:
## glm(formula = diabetic ~ Pregnancies + Glucose + BloodPressure +
## SkinThickness + Insulin + BMI + DiabetesPedigreeFunction +
## Age, family = binomial(link = "logit"), data = diabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5566 -0.7274 -0.4159 0.7267 2.9297
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.4046964 0.7166359 -11.728 < 2e-16 ***
## Pregnancies 0.1231823 0.0320776 3.840 0.000123 ***
## Glucose 0.0351637 0.0037087 9.481 < 2e-16 ***
## BloodPressure -0.0132955 0.0052336 -2.540 0.011072 *
## SkinThickness 0.0006190 0.0068994 0.090 0.928515
## Insulin -0.0011917 0.0009012 -1.322 0.186065
## BMI 0.0897010 0.0150876 5.945 2.76e-09 ***
## DiabetesPedigreeFunction 0.9451797 0.2991475 3.160 0.001580 **
## Age 0.0148690 0.0093348 1.593 0.111192
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 723.45 on 759 degrees of freedom
## AIC: 741.45
##
## Number of Fisher Scoring iterations: 5
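The coefficients are on the log-odds scale; exponentiating turns them into odds ratios. A quick sketch using the fitted model above:
# Odds ratios from the logistic coefficients
round(exp(coef(model_algorithm)), 3)
# e.g. exp(0.1232) ≈ 1.13: each additional pregnancy multiplies the odds
# of diabetes by about 1.13, holding the other predictors fixed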
print(anova(model_algorithm, test="Chisq"))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: diabetic
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 767 993.48
## Pregnancies 1 37.274 766 956.21 1.026e-09 ***
## Glucose 1 171.260 765 784.95 < 2.2e-16 ***
## BloodPressure 1 0.888 764 784.06 0.3460418
## SkinThickness 1 3.999 763 780.06 0.0455212 *
## Insulin 1 1.972 762 778.09 0.1602210
## BMI 1 41.243 761 736.85 1.344e-10 ***
## DiabetesPedigreeFunction 1 10.880 760 725.97 0.0009719 ***
## Age 1 2.522 759 723.45 0.1122535
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_final <- glm(diabetic ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
                   family = binomial(link = 'logit'), data = diabetes)
print(summary(model_final))
##
## Call:
## glm(formula = diabetic ~ Glucose + BMI + Pregnancies + DiabetesPedigreeFunction,
## family = binomial(link = "logit"), data = diabetes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7581 -0.7349 -0.4264 0.7580 2.9008
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.415851 0.656908 -12.811 < 2e-16 ***
## Glucose 0.033826 0.003345 10.112 < 2e-16 ***
## BMI 0.078097 0.013771 5.671 1.42e-08 ***
## Pregnancies 0.141926 0.027105 5.236 1.64e-07 ***
## DiabetesPedigreeFunction 0.901294 0.291696 3.090 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 734.31 on 763 degrees of freedom
## AIC: 744.31
##
## Number of Fisher Scoring iterations: 5
print(anova(model_final, test="Chisq"))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: diabetic
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 767 993.48
## Glucose 1 184.764 766 808.72 < 2.2e-16 ***
## BMI 1 37.317 765 771.40 1.004e-09 ***
## Pregnancies 1 27.278 764 744.12 1.762e-07 ***
## DiabetesPedigreeFunction 1 9.819 763 734.31 0.001727 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
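Before moving on to trees, it is worth checking how well model_final classifies. A minimal in-sample sketch (the 0.5 cutoff is an assumption, and in-sample accuracy is optimistic):
# Sketch: in-sample classification accuracy of the reduced model
pred_prob <- predict(model_final, type = "response") # fitted probabilities
pred_class <- ifelse(pred_prob > 0.5, 1, 0)          # assumed 0.5 cutoff
table(pred_class, diabetes$diabetic)                 # confusion matrix
mean(pred_class == diabetes$diabetic)                # accuracy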
CLASSIFICATION TREE
#install.packages('tree')
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
set.seed(2)
names(diabetes)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "diabetic"
diabetes$diabetic <- as.factor(diabetes$diabetic)
set.seed(2)
train <- sample(nrow(diabetes), floor(nrow(diabetes)/2))
test <- diabetes[-train,]
tree.diabetes <- tree(diabetic ~ ., diabetes, subset = train)
summary(tree.diabetes)
##
## Classification tree:
## tree(formula = diabetic ~ ., data = diabetes, subset = train)
## Variables actually used in tree construction:
## [1] "Glucose" "BMI"
## [3] "Pregnancies" "Age"
## [5] "SkinThickness" "DiabetesPedigreeFunction"
## [7] "BloodPressure"
## Number of terminal nodes: 24
## Residual mean deviance: 0.5656 = 203.6 / 360
## Misclassification error rate: 0.1484 = 57 / 384
This is the unpruned tree:
plot(tree.diabetes)
text(tree.diabetes, pretty=0)
# size is number of terminal nodes
# dev is the error
# k is the cost complexity parameter (alpha)
tree.diabetes.cv <- cv.tree(tree.diabetes, FUN = prune.misclass)
tree.diabetes.cv # gives each tree size and its corresponding error
## $size
## [1] 24 17 14 7 3 2 1
##
## $dev
## [1] 122 119 117 117 120 120 130
##
## $k
## [1] -Inf 0.000000 1.000000 2.571429 4.500000 5.000000 31.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# plot CV error against tree size
plot(tree.diabetes.cv$size, tree.diabetes.cv$dev, type="b")
Plot the pruned tree with a desired size of 5 terminal nodes. The CV error is lowest (117) at sizes 14 and 7, but a 5-node tree gives up little accuracy for a far more readable plot. Labels: 1 = diabetic, 0 = non-diabetic.
prune.diabetes <- prune.misclass(tree.diabetes, best = 5)
plot(prune.diabetes)
text(prune.diabetes, pretty = 0)
tree.pred2 <- predict(prune.diabetes, newdata = diabetes, type = "class")
test.pred <- tree.pred2[-train]
diabetic.test <- diabetes$diabetic[-train]
cm <- table(test.pred, diabetic.test)
cm
## diabetic.test
## test.pred 0 1
## 0 216 66
## 1 32 70
#Accuracy (proportion correctly classified)
sum(diag(cm))/sum(cm)
## [1] 0.7447917
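The error rate and the class-specific rates follow directly from the confusion matrix; a short sketch (rows of cm are predictions, columns are the truth):
1 - sum(diag(cm))/sum(cm)     # test error rate: ~0.255
cm["1", "1"] / sum(cm[, "1"]) # sensitivity: 70/136 ≈ 0.51
cm["0", "0"] / sum(cm[, "0"]) # specificity: 216/248 ≈ 0.87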
Random Forest & Bagging
#install.packages("randomForest")
#install.packages("caret")
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
diabetes_data <- read.csv("~/Downloads/diabetes 4.csv") # re-read the raw data, so the ninth column is again named Outcome
set.seed(1)
rows <- sample(nrow(diabetes_data))
diabetes_data <- diabetes_data[rows, ]
split <- round(nrow(diabetes_data) * 0.80)
train_data <- diabetes_data[1:split, ]
test_data <- diabetes_data[(split + 1):nrow(diabetes_data), ]
rf.diabetes<-train(Outcome ~ .,
train_data,
method="ranger",
tuneLength=2,
trControl=trainControl(method="cv",number=5,verboseIter=TRUE))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values. Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## + Fold1: mtry=2, min.node.size=5, splitrule=variance
## - Fold1: mtry=2, min.node.size=5, splitrule=variance
## + Fold1: mtry=8, min.node.size=5, splitrule=variance
## - Fold1: mtry=8, min.node.size=5, splitrule=variance
## + Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold1: mtry=8, min.node.size=5, splitrule=extratrees
## - Fold1: mtry=8, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=2, min.node.size=5, splitrule=variance
## - Fold2: mtry=2, min.node.size=5, splitrule=variance
## + Fold2: mtry=8, min.node.size=5, splitrule=variance
## - Fold2: mtry=8, min.node.size=5, splitrule=variance
## + Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold2: mtry=8, min.node.size=5, splitrule=extratrees
## - Fold2: mtry=8, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=2, min.node.size=5, splitrule=variance
## - Fold3: mtry=2, min.node.size=5, splitrule=variance
## + Fold3: mtry=8, min.node.size=5, splitrule=variance
## - Fold3: mtry=8, min.node.size=5, splitrule=variance
## + Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold3: mtry=8, min.node.size=5, splitrule=extratrees
## - Fold3: mtry=8, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=2, min.node.size=5, splitrule=variance
## - Fold4: mtry=2, min.node.size=5, splitrule=variance
## + Fold4: mtry=8, min.node.size=5, splitrule=variance
## - Fold4: mtry=8, min.node.size=5, splitrule=variance
## + Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold4: mtry=8, min.node.size=5, splitrule=extratrees
## - Fold4: mtry=8, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=2, min.node.size=5, splitrule=variance
## - Fold5: mtry=2, min.node.size=5, splitrule=variance
## + Fold5: mtry=8, min.node.size=5, splitrule=variance
## - Fold5: mtry=8, min.node.size=5, splitrule=variance
## + Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=2, min.node.size=5, splitrule=extratrees
## + Fold5: mtry=8, min.node.size=5, splitrule=extratrees
## - Fold5: mtry=8, min.node.size=5, splitrule=extratrees
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 8, splitrule = extratrees, min.node.size = 5 on full training set
print(rf.diabetes)
## Random Forest
##
## 614 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 492, 491, 491, 491, 491
## Resampling results across tuning parameters:
##
## mtry splitrule RMSE Rsquared MAE
## 2 variance 0.4121997 0.2580207 0.3388697
## 2 extratrees 0.4081891 0.2792656 0.3511579
## 8 variance 0.4172275 0.2452707 0.3286029
## 8 extratrees 0.4068651 0.2766567 0.3334482
##
## Tuning parameter 'min.node.size' was held constant at a value of 5
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were mtry = 8, splitrule = extratrees
## and min.node.size = 5.
plot(rf.diabetes)
model_glm<-train(Outcome ~ .,
train_data,
method="glm",
tuneLength=2,
trControl=trainControl(method="cv",number=5,verboseIter=TRUE))
## Warning in train.default(x, y, weights = w, ...): You are trying to do
## regression and your outcome only has two possible values. Are you trying to do
## classification? If so, use a 2 level factor as your outcome column.
## + Fold1: parameter=none
## - Fold1: parameter=none
## + Fold2: parameter=none
## - Fold2: parameter=none
## + Fold3: parameter=none
## - Fold3: parameter=none
## + Fold4: parameter=none
## - Fold4: parameter=none
## + Fold5: parameter=none
## - Fold5: parameter=none
## Aggregating results
## Fitting final model on full training set
print(model_glm)
## Generalized Linear Model
##
## 614 samples
## 8 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 491, 491, 492, 491, 491
## Resampling results:
##
## RMSE Rsquared MAE
## 0.4115855 0.2631804 0.3451513
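Both caret models above were fit as regressions, because Outcome was read in as numeric 0/1; that is exactly what the warnings flag, and why RMSE, R-squared, and MAE are reported instead of accuracy. A minimal sketch of the fix on a copy (diabetes_data2 is a hypothetical name; the Boosting section below applies the same idea by hand):
# Sketch: recode the numeric outcome as a two-level factor so that
# caret::train() performs classification instead of regression
diabetes_data2 <- diabetes_data
diabetes_data2$Outcome <- factor(diabetes_data2$Outcome, labels = c("No", "Yes"))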
Boosting
Here the 0/1 outcome is recoded as a No/Yes factor: twoClassSummary computes ROC from class probabilities, and caret requires factor levels that are valid R names.
objControl <- trainControl(method='cv',
number=3,
returnResamp='none',
summaryFunction = twoClassSummary,
classProbs = TRUE)
outcomeName <- 'Outcome'
train_data1 <- train_data
train_data1$Outcome <- ifelse(train_data1$Outcome == 1, 'Yes', 'No')
train_data1$Outcome <- as.factor(train_data1$Outcome)
test_data$Outcome <- ifelse(test_data$Outcome == 1, 'Yes', 'No')
test_data$Outcome <- as.factor(test_data$Outcome)
names(train_data1)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
predictorsNames <- names(train_data1)[names(train_data1) != outcomeName]
predictorsNames
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
head(train_data1)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 679 3 121 52 0 0 36.0
## 129 1 117 88 24 145 34.5
## 509 2 84 50 23 76 30.4
## 471 1 144 82 40 0 41.3
## 299 14 100 78 25 184 36.6
## 270 2 146 0 0 0 27.5
## DiabetesPedigreeFunction Age Outcome
## 679 0.127 25 Yes
## 129 0.403 40 Yes
## 509 0.968 21 No
## 471 0.607 28 No
## 299 0.412 46 Yes
## 270 0.240 28 Yes
set.seed(1)
model_gbm <- train(train_data1[, predictorsNames], train_data1[, outcomeName],
                   method = "gbm",
                   trControl = objControl,
                   metric = "ROC",
                   preProc = c("center", "scale"))
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2625 nan 0.1000 0.0144
## 2 1.2428 nan 0.1000 0.0082
## 3 1.2159 nan 0.1000 0.0126
## 4 1.1940 nan 0.1000 0.0087
## 5 1.1763 nan 0.1000 0.0060
## 6 1.1631 nan 0.1000 0.0020
## 7 1.1443 nan 0.1000 0.0057
## 8 1.1322 nan 0.1000 0.0042
## 9 1.1214 nan 0.1000 0.0034
## 10 1.1091 nan 0.1000 0.0056
## 20 1.0305 nan 0.1000 0.0006
## 40 0.9528 nan 0.1000 0.0004
## 60 0.9180 nan 0.1000 -0.0015
## 80 0.8929 nan 0.1000 -0.0031
## 100 0.8707 nan 0.1000 -0.0030
## 120 0.8492 nan 0.1000 -0.0024
## 140 0.8357 nan 0.1000 -0.0015
## 150 0.8285 nan 0.1000 -0.0006
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2591 nan 0.1000 0.0211
## 2 1.2275 nan 0.1000 0.0127
## 3 1.2019 nan 0.1000 0.0124
## 4 1.1748 nan 0.1000 0.0095
## 5 1.1490 nan 0.1000 0.0080
## 6 1.1297 nan 0.1000 0.0034
## 7 1.1095 nan 0.1000 0.0055
## 8 1.0948 nan 0.1000 0.0046
## 9 1.0791 nan 0.1000 0.0063
## 10 1.0641 nan 0.1000 0.0023
## 20 0.9701 nan 0.1000 -0.0031
## 40 0.8800 nan 0.1000 -0.0013
## 60 0.8184 nan 0.1000 -0.0010
## 80 0.7825 nan 0.1000 -0.0012
## 100 0.7498 nan 0.1000 -0.0019
## 120 0.7220 nan 0.1000 -0.0029
## 140 0.6966 nan 0.1000 -0.0020
## 150 0.6812 nan 0.1000 -0.0014
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2464 nan 0.1000 0.0194
## 2 1.2042 nan 0.1000 0.0166
## 3 1.1717 nan 0.1000 0.0108
## 4 1.1374 nan 0.1000 0.0110
## 5 1.1092 nan 0.1000 0.0092
## 6 1.0895 nan 0.1000 0.0056
## 7 1.0654 nan 0.1000 0.0103
## 8 1.0473 nan 0.1000 0.0042
## 9 1.0329 nan 0.1000 0.0037
## 10 1.0157 nan 0.1000 0.0064
## 20 0.9153 nan 0.1000 -0.0033
## 40 0.8112 nan 0.1000 -0.0010
## 60 0.7420 nan 0.1000 -0.0017
## 80 0.6838 nan 0.1000 -0.0019
## 100 0.6381 nan 0.1000 -0.0011
## 120 0.5983 nan 0.1000 -0.0013
## 140 0.5670 nan 0.1000 -0.0023
## 150 0.5481 nan 0.1000 -0.0023
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2628 nan 0.1000 0.0172
## 2 1.2341 nan 0.1000 0.0157
## 3 1.2065 nan 0.1000 0.0110
## 4 1.1838 nan 0.1000 0.0090
## 5 1.1657 nan 0.1000 0.0062
## 6 1.1480 nan 0.1000 0.0067
## 7 1.1343 nan 0.1000 0.0072
## 8 1.1195 nan 0.1000 0.0047
## 9 1.1061 nan 0.1000 0.0014
## 10 1.0946 nan 0.1000 0.0041
## 20 1.0105 nan 0.1000 0.0009
## 40 0.9202 nan 0.1000 -0.0016
## 60 0.8738 nan 0.1000 -0.0015
## 80 0.8422 nan 0.1000 -0.0028
## 100 0.8148 nan 0.1000 -0.0022
## 120 0.7951 nan 0.1000 -0.0017
## 140 0.7788 nan 0.1000 -0.0020
## 150 0.7730 nan 0.1000 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2527 nan 0.1000 0.0213
## 2 1.2180 nan 0.1000 0.0154
## 3 1.1843 nan 0.1000 0.0151
## 4 1.1555 nan 0.1000 0.0127
## 5 1.1281 nan 0.1000 0.0100
## 6 1.1081 nan 0.1000 0.0067
## 7 1.0889 nan 0.1000 0.0082
## 8 1.0725 nan 0.1000 0.0053
## 9 1.0589 nan 0.1000 0.0037
## 10 1.0410 nan 0.1000 0.0076
## 20 0.9337 nan 0.1000 0.0034
## 40 0.8344 nan 0.1000 -0.0000
## 60 0.7781 nan 0.1000 -0.0034
## 80 0.7358 nan 0.1000 -0.0035
## 100 0.7044 nan 0.1000 -0.0016
## 120 0.6759 nan 0.1000 -0.0027
## 140 0.6447 nan 0.1000 -0.0016
## 150 0.6307 nan 0.1000 -0.0012
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2428 nan 0.1000 0.0215
## 2 1.1934 nan 0.1000 0.0141
## 3 1.1577 nan 0.1000 0.0158
## 4 1.1264 nan 0.1000 0.0094
## 5 1.1058 nan 0.1000 0.0079
## 6 1.0828 nan 0.1000 0.0081
## 7 1.0590 nan 0.1000 0.0036
## 8 1.0387 nan 0.1000 0.0054
## 9 1.0202 nan 0.1000 0.0010
## 10 1.0034 nan 0.1000 0.0028
## 20 0.8772 nan 0.1000 -0.0015
## 40 0.7752 nan 0.1000 -0.0033
## 60 0.7133 nan 0.1000 -0.0018
## 80 0.6574 nan 0.1000 -0.0034
## 100 0.6100 nan 0.1000 -0.0041
## 120 0.5637 nan 0.1000 -0.0022
## 140 0.5369 nan 0.1000 -0.0021
## 150 0.5217 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2694 nan 0.1000 0.0142
## 2 1.2444 nan 0.1000 0.0091
## 3 1.2128 nan 0.1000 0.0094
## 4 1.1853 nan 0.1000 0.0110
## 5 1.1710 nan 0.1000 0.0055
## 6 1.1528 nan 0.1000 0.0076
## 7 1.1377 nan 0.1000 0.0077
## 8 1.1224 nan 0.1000 0.0053
## 9 1.1060 nan 0.1000 0.0065
## 10 1.0937 nan 0.1000 0.0056
## 20 0.9937 nan 0.1000 0.0027
## 40 0.8968 nan 0.1000 0.0008
## 60 0.8542 nan 0.1000 -0.0013
## 80 0.8289 nan 0.1000 -0.0016
## 100 0.8132 nan 0.1000 -0.0015
## 120 0.7967 nan 0.1000 -0.0011
## 140 0.7820 nan 0.1000 -0.0022
## 150 0.7768 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2518 nan 0.1000 0.0176
## 2 1.2098 nan 0.1000 0.0161
## 3 1.1754 nan 0.1000 0.0148
## 4 1.1462 nan 0.1000 0.0109
## 5 1.1237 nan 0.1000 0.0109
## 6 1.0980 nan 0.1000 0.0118
## 7 1.0827 nan 0.1000 0.0042
## 8 1.0613 nan 0.1000 0.0067
## 9 1.0453 nan 0.1000 0.0038
## 10 1.0280 nan 0.1000 0.0052
## 20 0.9204 nan 0.1000 -0.0008
## 40 0.8190 nan 0.1000 -0.0022
## 60 0.7723 nan 0.1000 -0.0044
## 80 0.7373 nan 0.1000 -0.0038
## 100 0.7001 nan 0.1000 -0.0030
## 120 0.6701 nan 0.1000 -0.0020
## 140 0.6514 nan 0.1000 -0.0023
## 150 0.6376 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2415 nan 0.1000 0.0227
## 2 1.1949 nan 0.1000 0.0209
## 3 1.1556 nan 0.1000 0.0157
## 4 1.1241 nan 0.1000 0.0109
## 5 1.0974 nan 0.1000 0.0095
## 6 1.0651 nan 0.1000 0.0081
## 7 1.0425 nan 0.1000 0.0034
## 8 1.0203 nan 0.1000 0.0078
## 9 0.9996 nan 0.1000 0.0077
## 10 0.9834 nan 0.1000 0.0028
## 20 0.8661 nan 0.1000 -0.0041
## 40 0.7582 nan 0.1000 -0.0014
## 60 0.6912 nan 0.1000 -0.0038
## 80 0.6414 nan 0.1000 -0.0027
## 100 0.5951 nan 0.1000 -0.0024
## 120 0.5625 nan 0.1000 -0.0028
## 140 0.5230 nan 0.1000 -0.0016
## 150 0.5071 nan 0.1000 -0.0004
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.2557 nan 0.1000 0.0184
## 2 1.2159 nan 0.1000 0.0147
## 3 1.1851 nan 0.1000 0.0122
## 4 1.1567 nan 0.1000 0.0081
## 5 1.1360 nan 0.1000 0.0096
## 6 1.1178 nan 0.1000 0.0047
## 7 1.0971 nan 0.1000 0.0091
## 8 1.0803 nan 0.1000 0.0056
## 9 1.0632 nan 0.1000 0.0062
## 10 1.0506 nan 0.1000 0.0030
## 20 0.9681 nan 0.1000 -0.0021
## 40 0.8851 nan 0.1000 -0.0017
## 50 0.8615 nan 0.1000 -0.0000
summary(model_gbm)
## var rel.inf
## Glucose Glucose 44.081631
## BMI BMI 18.548428
## Age Age 15.853752
## DiabetesPedigreeFunction DiabetesPedigreeFunction 8.496028
## Pregnancies Pregnancies 4.586239
## BloodPressure BloodPressure 3.612479
## Insulin Insulin 3.137974
## SkinThickness SkinThickness 1.683469
print(model_gbm)
## Stochastic Gradient Boosting
##
## 614 samples
## 8 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (8), scaled (8)
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 410, 409, 409
## Resampling results across tuning parameters:
##
## interaction.depth n.trees ROC Sens Spec
## 1 50 0.8108507 0.8766614 0.5303780
## 1 100 0.8185576 0.8566112 0.5670345
## 1 150 0.8098644 0.8515417 0.5623415
## 2 50 0.8214690 0.8515227 0.5440766
## 2 100 0.8091650 0.8364282 0.5669711
## 2 150 0.8063956 0.8389534 0.5993151
## 3 50 0.8158474 0.8364092 0.5901826
## 3 100 0.8085575 0.8439850 0.5715373
## 3 150 0.7985773 0.8314157 0.5347539
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## ROC was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 50, interaction.depth =
## 2, shrinkage = 0.1 and n.minobsinnode = 10.
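The 20% hold-out (test_data, whose Outcome was recoded to No/Yes above) was never scored. A minimal out-of-sample sketch using caret's predict and confusionMatrix:
# Sketch: class predictions for the held-out 20% and a full confusion matrix
gbm_pred <- predict(model_gbm, newdata = test_data[, predictorsNames])
confusionMatrix(gbm_pred, test_data$Outcome, positive = "Yes")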