Load required Libraries

library(ggplot2)
library(psych)
library(dplyr)
library(caret)

Read Provided data

diabetes <- read.csv("E:\\R Code files\\R Data Sets\\diabetes.csv")
str(diabetes)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
head(diabetes)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
diabetes$Outcome <- as.factor(diabetes$Outcome)

All 8 independent variables are numeric. There are tow outcomes, this data is good for classifciation. Lets change Outcome to categorical Variable

Exploratory Data analysis

Lets understand the data and its distribution.

Pregnancies data

table(diabetes$Pregnancies)
## 
##   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  17 
## 111 135 103  75  68  57  50  45  38  28  24  11   9  10   2   1   1
ggplot(data = diabetes,aes(x = Pregnancies)) +
  geom_histogram(binwidth = 0.5,aes(fill = Outcome),position = "dodge") +
  ggtitle("Pregnancies Data Distribution") + ylab("OutCode Counts") +
  theme_gray() +
  theme_update(plot.title = element_text(hjust = 0.5))

Pregnancies data is right skewed.

ggplot(data = diabetes,aes(x = Outcome, y = Pregnancies)) +
  geom_boxplot( aes(fill= Outcome)) +
  scale_y_continuous(breaks = seq(1,12,1),limits = c(0,12)) +
  ggtitle("Pregnancies Histogram") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() + 
  theme_update(plot.title = element_text(hjust = 0.5))

Box plot shows, woman who had more pregnancies are more prone to diabetes.This may be important variable for model.

Glucose

ggplot(data = diabetes,aes(x = Outcome, y = Glucose)) +
  geom_boxplot( aes(fill= Outcome)) +
  scale_y_continuous(breaks = seq(80,200,10),limits = c(80,200)) +
  ggtitle("Glucose Histogram") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() + 
  theme_update(plot.title = element_text(hjust = 0.5))

Diabetics woman have high Plasma glucose concentration. On average this value is 140 for diabetics woman while this is quite low for non-diabetics.

Blood Pressure

table(diabetes$BloodPressure)
## 
##   0  24  30  38  40  44  46  48  50  52  54  55  56  58  60  61  62  64 
##  35   1   2   1   1   4   2   5  13  11  11   2  12  21  37   1  34  43 
##  65  66  68  70  72  74  75  76  78  80  82  84  85  86  88  90  92  94 
##   7  30  45  57  44  52   8  39  45  40  30  23   6  21  25  22   8   6 
##  95  96  98 100 102 104 106 108 110 114 122 
##   1   4   3   3   1   2   3   2   3   1   1

There are 35 peoples with 0 Blood pressure. It is not medically possiable. Let 0 blood pressure with the median value.

# Replace 0 blood pressure with median blood pressuure
diabetes$BloodPressure <- ifelse(diabetes$BloodPressure == 0, 
                                 median(diabetes$BloodPressure,na.rm = TRUE),
                                 diabetes$BloodPressure
                                 )
ggplot(data = diabetes,aes(x = Outcome, y = BloodPressure)) +
  geom_boxplot( aes(fill= Outcome)) +
  scale_y_continuous(breaks = seq(60,110,10),limits = c(60,110)) +
  ylab("Blood Pressure") +
  ggtitle("Blood Pressure Histogram") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() +
  theme_update(plot.title = element_text(hjust = 0.5))

Diastolic blood pressure for diabetic woman is higher compare to non-diabetics.

Triceps skin fold thickness

Triceps skin-fold thickness normal value for female 23

table(diabetes$SkinThickness)
## 
##   0   7   8  10  11  12  13  14  15  16  17  18  19  20  21  22  23  24 
## 227   2   2   5   6   7  11   6  14   6  14  20  18  13  10  16  22  12 
##  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42 
##  16  16  23  20  17  27  19  31  20   8  15  14  16   7  18  16  15  11 
##  43  44  45  46  47  48  49  50  51  52  54  56  60  63  99 
##   6   5   6   8   4   4   3   3   1   2   2   1   1   1   1

There are 227 observation shows its value 0. Which is not medically true. Let’s replace this value with the median value.

diabetes$SkinThickness <- ifelse(
  diabetes$SkinThickness == 0 , 
  median(diabetes$SkinThickness,na.rm = TRUE),
  diabetes$SkinThickness)
ggplot(data = diabetes,aes(x = Outcome, y = SkinThickness)) +
  geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
  scale_y_continuous(breaks = seq(0,100,10),limits = c(0,100)) +
  ylab("Triceps skin fold thickness") +
  ggtitle("Skin Thickness Histogram") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() +
  theme_update(plot.title = element_text(hjust = 0.5))

Boxplot shows that diabetics woman normally has high skin thickness. Red big dots are outlier but ignoring this outlier to consider the extreme case.

Body mass index

table(diabetes$BMI == 0)
## 
## FALSE  TRUE 
##   757    11

There are 11 observation where 0 BMI information provided. It is medically not possible. Let’s replace with healthy BMI at higher end to 30.

diabetes$BMI <- ifelse(diabetes$BMI == 0, 32, diabetes$BMI)

ggplot(data = diabetes,aes(x = Outcome, y = BMI)) +
  geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
  scale_y_continuous(breaks = seq(20,70,5),limits = c(20,70)) +
  ylab("BMI") +
  ggtitle("Body mass index Histogram") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() +
  theme_update(plot.title = element_text(hjust = 0.5))

BMI for diabetics’ woman is high compare to non-diabetics. There are few outlier, let not treat them to consider the extreme cases of BMI.

Diabetes pedigree function

ggplot(data = diabetes,aes(x = Outcome, y = DiabetesPedigreeFunction)) +
  geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
  scale_y_continuous(breaks = seq(0,2,0.2),limits = c(0,2)) +
  ylab("Diabetes Pedigree Function") +
  ggtitle("Diabetes Pedigree Function") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() +
  theme_update(plot.title = element_text(hjust = 0.5))

Age

ggplot(data = diabetes,aes(x = Outcome, y = Age)) +
  geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
  scale_y_continuous(breaks = seq(20,80,10),limits = c(20,80)) +
  ylab("Age") +
  ggtitle("Diabetes Pedigree Function") +
  stat_summary(fun.y=mean, colour="darkred", geom="point", 
                           shape=18, size=3,show.legend = TRUE) +
  theme_gray() +
  theme_update(plot.title = element_text(hjust = 0.5))

Older people has more chance to be diabetics. There are more outliers for non-diabetics that seems normal.

Proptional table

prop.table(table(diabetes$Outcome)) 
## 
##         0         1 
## 0.6510417 0.3489583

There are 65% non-diabetics and 35% are diabetics cases in sample set of data.

Correlation

pairs.panels(diabetes)

Glucose,BMI age and pregnanies are highly and moderately correlated with the outcomes.

Split data into training set and test data set

set.seed(2017)

trainIndex <- createDataPartition(diabetes$Outcome, p = .8, 
                                  list = FALSE, 
                                  times = 1)


diabetes$Outcome <- as.factor(diabetes$Outcome)

diabetes.train <- diabetes[trainIndex,]
diabetes.test <- diabetes[-trainIndex,]

# Training data Proption
prop.table(table(diabetes.train$Outcome))
## 
##         0         1 
## 0.6504065 0.3495935
# Test data Proption
prop.table(table(diabetes.test$Outcome))
## 
##         0         1 
## 0.6535948 0.3464052

Fit Random Forest Model in training set data

ctl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10
) 

# Performance Parameters Setting
grid <- expand.grid(mtry  = c(3,4,5))

model.Random.Forest <- train(
  Outcome ~ ., 
  data = diabetes.train, 
  method = "rf",
  tuneGrid = grid,
  trControl = ctl)

model.Random.Forest
## Random Forest 
## 
## 615 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 554, 554, 553, 554, 553, 554, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   3     0.7597171  0.4536582
##   4     0.7546616  0.4439400
##   5     0.7540243  0.4427589
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# The final value used for the model was mtry = 4.

# Plot Model
plot(model.Random.Forest)

Evaluate Model Performance

predict.Random.Forest <- predict(model.Random.Forest,diabetes.test)

confusionMatrix(predict.Random.Forest,diabetes.test$Outcome) 
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 80 16
##          1 20 37
##                                           
##                Accuracy : 0.7647          
##                  95% CI : (0.6894, 0.8294)
##     No Information Rate : 0.6536          
##     P-Value [Acc > NIR] : 0.001988        
##                                           
##                   Kappa : 0.4894          
##  Mcnemar's Test P-Value : 0.617075        
##                                           
##             Sensitivity : 0.8000          
##             Specificity : 0.6981          
##          Pos Pred Value : 0.8333          
##          Neg Pred Value : 0.6491          
##              Prevalence : 0.6536          
##          Detection Rate : 0.5229          
##    Detection Prevalence : 0.6275          
##       Balanced Accuracy : 0.7491          
##                                           
##        'Positive' Class : 0               
##