Objective:
The objective of this exercise is to predict if a patient has diabetes for the Pima Indians Diabetes Data set using multiple ML algorithms.
Version 01: We will build and train following Machine Learning models during this analysis. The predictor variable will be Age 1. Logistic Regression (glm)
Version 02: We will build more Machine Learning Model during this analysis, namely LDA< Random Forest, SVM, KNN, AdaBoost. the variable to predict is Outcome
Data Set:
The data set is comprised of 768 observations and 9 variables. It is available in the packAge mlbench. We will be using diabetes as our response/target variable.
Data Description for the 9 variables are as follows.
pregnant - Number of times pregnant glucose - Plasma glucose concentration (glucose tolerance test) pressure - Diastolic blood pressure (mm Hg) triceps - Triceps skin fold thickness (mm) insulin - 2-Hour serum insulin (mu U/ml) mass - Body mass index (weight in kg/(height in m)^2) pedigree - Diabetes pedigree function Age - Age (years) diabetes - Class variable (test for diabetes) More information on the dataset can be read by accessing the website: http://math.furman.edu/~dcs/courses/math47/R/library/mlbench/html/PimaIndiansDiabetes.html
library(knitr)
#library(tidyverse) #Required for analysis, visualization
library(plotly) # Required for interactive plotting of graphs
## Loading required package: ggplot2
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggsci) #Themes packAge for the plots
# This R environment comes with all of CRAN preinstalled, as well as many other helpful packAges
# The environment is defined by the kaggle/rstats docker imAge: https://github.com/kaggle/docker-rstats
# For example, here's several helpful packAges to load in
library(ggplot2) # Data visualization
library(readr) # CSV file I/O, e.g. the read_csv function
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
library("ggcorrplot")
library("ROCR")
# Any results you write to the current directory are saved as output.
diab<-read.csv("diabetes.csv") # It reads the CSV file and assigns to diab object
Version 01:
head(diab) # Elements of first few rows
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
tail(diab)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 763 9 89 62 0 0 22.5
## 764 10 101 76 48 180 32.9
## 765 2 122 70 27 0 36.8
## 766 5 121 72 23 112 26.2
## 767 1 126 60 0 0 30.1
## 768 1 93 70 31 0 30.4
## DiabetesPedigreeFunction Age Outcome
## 763 0.142 33 0
## 764 0.171 63 0
## 765 0.340 27 0
## 766 0.245 30 0
## 767 0.349 47 1
## 768 0.315 23 0
colnames(diab)
## [1] "Pregnancies" "Glucose"
## [3] "BloodPressure" "SkinThickness"
## [5] "Insulin" "BMI"
## [7] "DiabetesPedigreeFunction" "Age"
## [9] "Outcome"
str(diab)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
summary(diab)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
diab$Outcome<-factor(diab$Outcome)
class(diab$Outcome)
## [1] "factor"
# Create Age Category column
diab$Age_Cat <- ifelse(diab$Age < 21, "<21",
ifelse((diab$Age>=21) & (diab$Age<=25), "21-25",
ifelse((diab$Age>25) & (diab$Age<=30), "25-30",
ifelse((diab$Age>30) & (diab$Age<=35), "30-35",
ifelse((diab$Age>35) & (diab$Age<=40), "35-40",
ifelse((diab$Age>40) & (diab$Age<=50), "40-50",
ifelse((diab$Age>50) & (diab$Age<=60), "50-60",">60")))))))
diab$Age_Cat <- factor(diab$Age_Cat, levels = c('<21','21-25','25-30','30-35','35-40','40-50','50-60','>60'))
table(diab$Age_Cat)
##
## <21 21-25 25-30 30-35 35-40 40-50 50-60 >60
## 0 267 150 81 76 113 54 27
Most of the subjects are in between the Ages 21 - 30
# Histogram of Age
library(ggplot2)
ggplot(aes(x = Age), data=diab) +
geom_histogram(binwidth=1, color='black', fill = "#F79420") +
scale_x_continuous(limits=c(20,90), breaks=seq(20,90,5)) +
xlab("Age") +
ylab("No of people by age")
## Warning: Removed 2 rows containing missing values (geom_bar).
# Barplot by Age
library(ggplot2)
ggplot(aes(x = Age_Cat), data = diab) +
geom_bar(fill='steelblue')
# box plot of Age vs BMI
library(ggplot2)
ggplot(aes(x=Age_Cat, y = BMI), data = diab) +
geom_boxplot() +
coord_cartesian(ylim = c(0,70))
by(diab$BMI, diab$Age, summary)
## diab$Age: 21
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 24.00 27.40 27.82 33.40 43.50
## ------------------------------------------------------------
## diab$Age: 22
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 25.27 28.70 29.51 33.30 57.30
## ------------------------------------------------------------
## diab$Age: 23
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.10 25.45 30.25 31.50 35.05 52.30
## ------------------------------------------------------------
## diab$Age: 24
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.12 32.80 32.57 37.40 45.30
## ------------------------------------------------------------
## diab$Age: 25
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.70 32.95 31.94 36.02 59.40
## ------------------------------------------------------------
## diab$Age: 26
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 28.60 34.80 34.92 43.20 67.10
## ------------------------------------------------------------
## diab$Age: 27
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.20 26.75 31.15 31.95 37.83 49.30
## ------------------------------------------------------------
## diab$Age: 28
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.50 28.85 34.50 33.64 38.00 52.90
## ------------------------------------------------------------
## diab$Age: 29
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.30 29.00 32.00 33.54 37.70 45.60
## ------------------------------------------------------------
## diab$Age: 30
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 26.20 31.20 30.03 34.90 42.90
## ------------------------------------------------------------
## diab$Age: 31
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.80 27.98 32.45 34.02 37.52 49.70
## ------------------------------------------------------------
## diab$Age: 32
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.20 28.57 30.00 32.32 35.05 48.30
## ------------------------------------------------------------
## diab$Age: 33
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.50 29.70 32.80 32.34 34.70 43.30
## ------------------------------------------------------------
## diab$Age: 34
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 28.62 30.70 31.16 33.83 38.70
## ------------------------------------------------------------
## diab$Age: 35
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 32.08 34.10 33.78 35.73 43.40
## ------------------------------------------------------------
## diab$Age: 36
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.50 27.30 30.05 31.72 33.23 50.00
## ------------------------------------------------------------
## diab$Age: 37
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 26.55 31.30 32.08 37.50 48.80
## ------------------------------------------------------------
## diab$Age: 38
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 30.00 32.27 34.05 35.57 39.42 45.60
## ------------------------------------------------------------
## diab$Age: 39
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.70 28.45 32.25 31.98 35.45 40.60
## ------------------------------------------------------------
## diab$Age: 40
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.00 27.60 33.70 33.54 37.30 52.30
## ------------------------------------------------------------
## diab$Age: 41
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.20 32.80 35.20 35.26 38.33 43.30
## ------------------------------------------------------------
## diab$Age: 42
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.40 31.40 33.15 34.98 39.00 46.70
## ------------------------------------------------------------
## diab$Age: 43
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.80 34.00 37.10 36.89 39.40 47.90
## ------------------------------------------------------------
## diab$Age: 44
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 30.30 34.10 34.16 40.05 42.30
## ------------------------------------------------------------
## diab$Age: 45
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.60 32.55 32.80 34.96 36.90 46.80
## ------------------------------------------------------------
## diab$Age: 46
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.10 29.70 32.90 34.52 36.60 46.20
## ------------------------------------------------------------
## diab$Age: 47
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.50 30.18 32.95 34.57 39.55 45.00
## ------------------------------------------------------------
## diab$Age: 48
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.80 22.70 28.80 29.98 35.30 42.30
## ------------------------------------------------------------
## diab$Age: 49
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.60 30.00 30.40 32.02 33.60 38.50
## ------------------------------------------------------------
## diab$Age: 50
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.30 29.18 31.35 31.23 34.05 36.10
## ------------------------------------------------------------
## diab$Age: 51
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.80 31.48 36.25 33.98 37.60 39.00
## ------------------------------------------------------------
## diab$Age: 52
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.00 32.15 34.60 33.48 35.77 37.70
## ------------------------------------------------------------
## diab$Age: 53
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.3 24.8 30.3 30.5 30.5 43.6
## ------------------------------------------------------------
## diab$Age: 54
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 29.23 34.50 30.80 40.83 45.40
## ------------------------------------------------------------
## diab$Age: 55
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.10 23.43 24.65 27.02 28.25 37.70
## ------------------------------------------------------------
## diab$Age: 56
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.1 29.0 34.9 31.7 36.0 37.1
## ------------------------------------------------------------
## diab$Age: 57
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.2 24.9 27.1 29.7 36.8 37.5
## ------------------------------------------------------------
## diab$Age: 58
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 26.00 32.00 32.43 36.45 46.50
## ------------------------------------------------------------
## diab$Age: 59
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.50 25.40 27.30 26.97 28.70 30.10
## ------------------------------------------------------------
## diab$Age: 60
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.60 23.80 30.10 28.74 34.20 36.00
## ------------------------------------------------------------
## diab$Age: 61
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28.8 29.4 30.0 30.0 30.6 31.2
## ------------------------------------------------------------
## diab$Age: 62
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.50 26.65 27.30 28.95 29.60 34.70
## ------------------------------------------------------------
## diab$Age: 63
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28.00 29.35 31.10 30.77 32.52 32.90
## ------------------------------------------------------------
## diab$Age: 64
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25 25 25 25 25 25
## ------------------------------------------------------------
## diab$Age: 65
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.90 27.80 33.70 31.60 36.45 39.20
## ------------------------------------------------------------
## diab$Age: 66
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.60 27.50 29.70 30.38 32.58 35.50
## ------------------------------------------------------------
## diab$Age: 67
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.70 23.90 26.10 28.77 32.30 38.50
## ------------------------------------------------------------
## diab$Age: 68
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.6 35.6 35.6 35.6 35.6 35.6
## ------------------------------------------------------------
## diab$Age: 69
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 6.7 13.4 13.4 20.1 26.8
## ------------------------------------------------------------
## diab$Age: 70
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.5 32.5 32.5 32.5 32.5 32.5
## ------------------------------------------------------------
## diab$Age: 72
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.6 19.6 19.6 19.6 19.6 19.6
## ------------------------------------------------------------
## diab$Age: 81
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.9 25.9 25.9 25.9 25.9 25.9
# Compute correlation matrix
diab_cor <- round(cor(diab[1:8]),1)
diab_cor
## Pregnancies Glucose BloodPressure SkinThickness
## Pregnancies 1.0 0.1 0.1 -0.1
## Glucose 0.1 1.0 0.2 0.1
## BloodPressure 0.1 0.2 1.0 0.2
## SkinThickness -0.1 0.1 0.2 1.0
## Insulin -0.1 0.3 0.1 0.4
## BMI 0.0 0.2 0.3 0.4
## DiabetesPedigreeFunction 0.0 0.1 0.0 0.2
## Age 0.5 0.3 0.2 -0.1
## Insulin BMI DiabetesPedigreeFunction Age
## Pregnancies -0.1 0.0 0.0 0.5
## Glucose 0.3 0.2 0.1 0.3
## BloodPressure 0.1 0.3 0.0 0.2
## SkinThickness 0.4 0.4 0.2 -0.1
## Insulin 1.0 0.2 0.2 0.0
## BMI 0.2 1.0 0.1 0.0
## DiabetesPedigreeFunction 0.2 0.1 1.0 0.0
## Age 0.0 0.0 0.0 1.0
library(ggcorrplot)
ggcorrplot(diab_cor)
No strong correlation observed between variables. So, no need to drop any of them for analysis
# Split dataset into train and test sets
require(caTools)
## Loading required package: caTools
set.seed(3)
sample = sample.split(diab$Outcome, SplitRatio=0.75)
train = subset(diab, sample==TRUE)
test = subset(diab, sample==FALSE)
nrow(diab)
## [1] 768
nrow(train)
## [1] 576
nrow(test)
## [1] 192
# distribution of Age category in Train set
table(train$Age)
##
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## 46 49 26 35 41 23 25 20 22 17 20 14 12 11 7 11 14 11 8 8 17 12 9 8 13 11
## 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 72 81
## 6 4 4 7 6 6 4 5 3 2 4 3 3 5 2 4 4 1 1 3 3 1 2 1 1 1
# Structure of train set
str(train)
## 'data.frame': 576 obs. of 10 variables:
## $ Pregnancies : int 6 1 1 0 5 3 10 2 8 10 ...
## $ Glucose : int 148 85 89 137 116 78 115 197 125 168 ...
## $ BloodPressure : int 72 66 66 40 74 50 0 70 96 74 ...
## $ SkinThickness : int 35 29 23 35 0 32 0 45 0 0 ...
## $ Insulin : int 0 0 94 168 0 88 0 543 0 0 ...
## $ BMI : num 33.6 26.6 28.1 43.1 25.6 31 35.3 30.5 0 38 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.167 2.288 0.201 ...
## $ Age : int 50 31 21 33 30 26 29 53 54 34 ...
## $ Outcome : Factor w/ 2 levels "0","1": 2 1 1 2 1 2 1 2 2 2 ...
## $ Age_Cat : Factor w/ 8 levels "<21","21-25",..: 6 4 2 4 3 3 3 7 7 4 ...
# Baseline model
table(diab$Outcome)
##
## 0 1
## 500 268
# Baseline accuracy
baseline <- round(500/nrow(diab),2)
baseline
## [1] 0.65
Do not select a model whose accuracy is lower than the baseline model. In this case, it is 0.65
# Fit model - using all independent variables
AllVar <- glm(Outcome ~ ., data = train, family = binomial)
summary(AllVar)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5680 -0.7074 -0.4311 0.7421 2.8979
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.140633 1.442614 -4.257 2.08e-05 ***
## Pregnancies 0.043779 0.039640 1.104 0.269414
## Glucose 0.034511 0.004255 8.111 5.00e-16 ***
## BloodPressure -0.015237 0.006253 -2.437 0.014818 *
## SkinThickness 0.004535 0.007823 0.580 0.562170
## Insulin -0.001881 0.001034 -1.819 0.068962 .
## BMI 0.066069 0.017458 3.785 0.000154 ***
## DiabetesPedigreeFunction 1.042778 0.353860 2.947 0.003210 **
## Age -0.048658 0.052789 -0.922 0.356659
## Age_Cat25-30 0.557992 0.418509 1.333 0.182439
## Age_Cat30-35 1.370840 0.626633 2.188 0.028697 *
## Age_Cat35-40 1.542470 0.884172 1.745 0.081066 .
## Age_Cat40-50 2.662067 1.208167 2.203 0.027567 *
## Age_Cat50-60 2.709033 1.748276 1.550 0.121251
## Age_Cat>60 1.850745 2.301301 0.804 0.421272
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 745.11 on 575 degrees of freedom
## Residual deviance: 541.06 on 561 degrees of freedom
## AIC: 571.06
##
## Number of Fisher Scoring iterations: 5
# Let's predict outcome on Training dataset
PredictTrain <- predict(AllVar, type = "response")
summary(PredictTrain)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0024 0.1162 0.2667 0.3490 0.5489 0.9876
# This computes the averAge prediction for each of the two outcomes
tapply(PredictTrain, train$Outcome, mean)
## 0 1
## 0.2339362 0.5635518
Confusion Matrix: Compares the actual outcomes with the predicted ones
| Predicted = 0 | Predicted = 1 | |
|---|---|---|
| Actual = 0 | True Negatives (TN) | False Positives (FP) |
| Actual = 1 | False Negatives (FN) | True Positives (TP) |
Sensitivity = TP / TP + FN (True Positive rate)
Specificity = TN / TN + FP (True Negative rate)
The model with a higher threshold has lower Sensitivity but higher Specificity.
The model with a lower threshold has higher Sensitivity but lower Specificity.
The outcome of a logistic regression model is a probability.
We can do this using a threshold value t
- If P(y=1) >= t, predict 1 - If P(y=1) < t, predict 0
What value should be selected for t?
Often selected based on which errors are better
If t is large, predict P(y=1) rarely (when P(y=1) is large)
If t is small, predict P(y=0) rarely (when P(y=1) is small)
With no preference between errors, select t=0.5. Predicts the more likely outcome
# Build confusion matrix with a threshold value of 0.5
threshold_0.5 <- table(train$Outcome, PredictTrain > 0.5)
threshold_0.5
##
## FALSE TRUE
## 0 333 42
## 1 83 118
# Accuracy
accuracy_0.5 <- round(sum(diag(threshold_0.5))/sum(threshold_0.5),2)
sprintf("Accuracy is %s",accuracy_0.5)
## [1] "Accuracy is 0.78"
# Mis-classification error rate
MC_0.5 <- 1-accuracy_0.5
sprintf("Mis-classification error is %s",MC_0.5)
## [1] "Mis-classification error is 0.22"
sensitivity0.5 <- round(118/(83+118),2)
specificity0.5 <- round(333/(333+42),2)
sprintf("Sensitivity at 0.5 threshold: %s", sensitivity0.5)
## [1] "Sensitivity at 0.5 threshold: 0.59"
sprintf("Specificity at 0.5 threshold: %s", specificity0.5)
## [1] "Specificity at 0.5 threshold: 0.89"
# Build confusion matrix with a threshold value of 0.7
threshold_0.7 <- table(train$Outcome, PredictTrain > 0.7)
threshold_0.7
##
## FALSE TRUE
## 0 359 16
## 1 123 78
# Accuracy
accuracy_0.7 <- round(sum(diag(threshold_0.7))/sum(threshold_0.7),2)
sprintf('Accuracy is %s', accuracy_0.7)
## [1] "Accuracy is 0.76"
# Mis-classification error rate
MC_0.7 <- 1-accuracy_0.7
sprintf("Mis-classification error is %s",MC_0.7)
## [1] "Mis-classification error is 0.24"
sensitivity0.7 <- round(78/(123+78),2)
specificity0.7 <- round(359/(359+16),2)
sprintf("Sensitivity at 0.7 threshold: %s", sensitivity0.7)
## [1] "Sensitivity at 0.7 threshold: 0.39"
sprintf("Specificity at 0.7 threshold: %s", specificity0.7)
## [1] "Specificity at 0.7 threshold: 0.96"
# Build confusion matrix with a threshold value of 0.2
threshold_0.2 <- table(train$Outcome, PredictTrain > 0.2)
threshold_0.2
##
## FALSE TRUE
## 0 215 160
## 1 21 180
# Accuracy
accuracy_0.2 <- round(sum(diag(threshold_0.2))/sum(threshold_0.2),2)
sprintf("Accuracy is %s", accuracy_0.2)
## [1] "Accuracy is 0.69"
# Mis-classification error rate
MC_0.2 <- 1-accuracy_0.2
sprintf("Mis-classification error is %s",MC_0.2)
## [1] "Mis-classification error is 0.31"
sensitivity0.2 <- round(180/(21+180),2)
specificity0.2 <- round(215/(215+160),2)
sprintf("Sensitivity at 0.2 threshold: %s",sensitivity0.2)
## [1] "Sensitivity at 0.2 threshold: 0.9"
sprintf("Specificity at 0.2 threshold: %s",specificity0.2)
## [1] "Specificity at 0.2 threshold: 0.57"
ROC Curve will help us decide as which threshold is best
High threshold: - High specificity - Low sensitivity
Low threshold: - Low specificity - High sensitivity
# Generate ROC Curves
library(ROCR)
ROCRpred = prediction(PredictTrain, train$Outcome)
ROCRperf = performance(ROCRpred, "tpr", "fpr")
# Adding threshold labels
plot(ROCRperf, colorize=TRUE, print.cutoffs.at = seq(0,1,0.1), text.adj = c(-0.2, 1.7))
abline(a=0, b=1)
auc_train <- round(as.numeric(performance(ROCRpred, "auc")@y.values),2)
legend(.8, .2, auc_train, title = "AUC", cex=1)
AUC (Area under the ROC curve): Absolute value of quality of prediction
AUC = Maximum of 1 (Perfect prediction)
AUC = minimum of 0.5 (just guessing)
| Predicted class = 0 | Predicted class = 1 | |
|---|---|---|
| Actual class = 0 | True Negatives (TN) | False Positives (FP) |
| Actual class = 1 | False Negatives (FN) | True Positives (TP) |
N = Number of obervations
Overall accuracy = (TN + TP) / N
Overall error rate = (FP + FN) / N
False positive error rate = 1 - specificity ## Make predictions on test set
# Making predictions on test set
PredictTest <- predict(AllVar, type = "response", newdata = test)
# Convert probabilities to values using the below
## Based on ROC curve above, selected a threshold of 0.5
test_tab <- table(test$Outcome, PredictTest > 0.5)
test_tab
##
## FALSE TRUE
## 0 115 10
## 1 25 42
accuracy_test <- round(sum(diag(test_tab))/sum(test_tab),2)
sprintf("Accuracy on test set is %s", accuracy_test)
## [1] "Accuracy on test set is 0.82"
# Compute test set AUC
ROCRPredTest = prediction(PredictTest, test$Outcome)
auc = round(as.numeric(performance(ROCRPredTest, "auc")@y.values),2)
auc
## [1] 0.89
Version 02:
p3<-ggplot(diab,aes(x=Pregnancies))+geom_density(aes(fill=Outcome),alpha=0.6)+
geom_vline(aes(xintercept=mean(Pregnancies)),
color="blue", linetype="dashed", size=1)+facet_grid(.~Outcome)+scale_fill_aaas()
ggplotly(p3)
p3<-ggplot(diab,aes(x=Age, y=Pregnancies, size=Glucose, fill=BloodPressure))+geom_point(alpha=0.2)+
facet_grid(.~Outcome)+geom_jitter(width = 0.4)+scale_x_continuous(limits = c(18, 80))+scale_fill_material("red")
ggplotly(p3)
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
## Warning in L$marker$color[idx] <- aes2plotly(data, params, "fill")[idx]: number
## of items to replace is not a multiple of replacement length
library(caret) # ML packAge for various methods
## Loading required package: lattice
# Create the training and test datasets
set.seed(100)
hci<-diab
# Step 1: Get row numbers for the training data
trainRowNumbers <- createDataPartition(hci$Outcome, p=0.8, list=FALSE) # Data partition for dividing the dataset into training and testing data set. This is useful for cross validation
# Step 2: Create the training dataset
trainData <- hci[trainRowNumbers,]
# Step 3: Create the test dataset
testData <- hci[-trainRowNumbers,]
# Store X and Y for later use.
x = trainData[, 1:8]
y=trainData$Outcome
xt= testData[, 1:8]
yt=testData$Outcome
# # See the structure of the new dataset
preProcess_range_modeltr <- preProcess(trainData, method='range')
preProcess_range_modelts <- preProcess(testData, method='range')
trainData <- predict(preProcess_range_modeltr, newdata = trainData)
testData <- predict(preProcess_range_modelts, newdata = testData)
# Append the Y variable
trainData$Outcome <- y
testData$Outcome<-yt
levels(trainData$Outcome) <- c("Class0", "Class1") # Convert binary outcome into character for caret packAge
levels(testData$Outcome) <- c("Class0", "Class1")
#apply(trainData[, 1:8], 2, FUN=function(x){c('min'=min(x), 'max'=max(x))})
#str(trainData)
#fit control
fitControl <- trainControl(
method = 'cv', # k-fold cross validation
number = 5, # number of folds
savePredictions = 'final', # saves predictions for optimal tuning parameter
classProbs = T, # should class probabilities be returned
summaryFunction=twoClassSummary # results summary function
)
# Step 1: Tune hyper parameters by setting tuneLength
set.seed(100)
model1 = train(Outcome ~ ., data=trainData, method='lda', tuneLength = 5, metric='ROC', trControl = fitControl)
model2 = train(Outcome ~ ., data=trainData, method='knn', tuneLength=2, trControl = fitControl) #KNN Model
model3 = train(Outcome ~ ., data=trainData, method='svmRadial', tuneLength=2, trControl = fitControl) #SVM
model4 = train(Outcome ~ ., data=trainData, method='rpart', tuneLength=2, trControl = fitControl) #RandomForest
model5 = train(Outcome ~ ., data=trainData, method='adaboost', tuneLength=2, trControl = fitControl) #Adaboost
# Compare model performances using resample()
models_compare <- resamples(list(LDA=model1,KNN=model2,SVM=model3,RF=model4, ADA=model5))
# Summary of the models performances
summary(models_compare)
##
## Call:
## summary.resamples(object = models_compare)
##
## Models: LDA, KNN, SVM, RF, ADA
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8052326 0.8191860 0.8316860 0.8357558 0.8421512 0.8805233 0
## KNN 0.7337209 0.7495640 0.7530523 0.7662791 0.7943314 0.8007267 0
## SVM 0.8183140 0.8191860 0.8328488 0.8402907 0.8473837 0.8837209 0
## RF 0.6478198 0.6702035 0.6879360 0.6915698 0.6879360 0.7639535 0
## ADA 0.7287791 0.7482558 0.8040698 0.7788953 0.8049419 0.8084302 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.8125 0.8625 0.8750 0.8725 0.9000 0.9125 0
## KNN 0.8250 0.8625 0.8750 0.8675 0.8875 0.8875 0
## SVM 0.8125 0.8625 0.8875 0.8700 0.8875 0.9000 0
## RF 0.7125 0.7375 0.8625 0.8175 0.8875 0.8875 0
## ADA 0.7375 0.7875 0.8000 0.7925 0.8125 0.8250 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LDA 0.5581395 0.6046512 0.6279070 0.6186047 0.6511628 0.6511628 0
## KNN 0.3953488 0.3953488 0.3953488 0.4697674 0.5116279 0.6511628 0
## SVM 0.5348837 0.5813953 0.6046512 0.6093023 0.6511628 0.6744186 0
## RF 0.4883721 0.4883721 0.5581395 0.5534884 0.6046512 0.6279070 0
## ADA 0.5581395 0.5581395 0.5581395 0.5767442 0.5813953 0.6279070 0
# Draw box plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
bwplot(models_compare, scales=scales)