library(ggplot2)
library(psych)
library(dplyr)
library(caret)
diabetes <- read.csv("E:\\R Code files\\R Data Sets\\diabetes.csv")
str(diabetes)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
head(diabetes)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
diabetes$Outcome <- as.factor(diabetes$Outcome)
All 8 independent variables are numeric. There are tow outcomes, this data is good for classifciation. Lets change Outcome to categorical Variable
Lets understand the data and its distribution.
Pregnancies data
table(diabetes$Pregnancies)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 17
## 111 135 103 75 68 57 50 45 38 28 24 11 9 10 2 1 1
ggplot(data = diabetes,aes(x = Pregnancies)) +
geom_histogram(binwidth = 0.5,aes(fill = Outcome),position = "dodge") +
ggtitle("Pregnancies Data Distribution") + ylab("OutCode Counts") +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Pregnancies data is right skewed.
ggplot(data = diabetes,aes(x = Outcome, y = Pregnancies)) +
geom_boxplot( aes(fill= Outcome)) +
scale_y_continuous(breaks = seq(1,12,1),limits = c(0,12)) +
ggtitle("Pregnancies Histogram") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Box plot shows, woman who had more pregnancies are more prone to diabetes.This may be important variable for model.
Glucose
ggplot(data = diabetes,aes(x = Outcome, y = Glucose)) +
geom_boxplot( aes(fill= Outcome)) +
scale_y_continuous(breaks = seq(80,200,10),limits = c(80,200)) +
ggtitle("Glucose Histogram") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Diabetics woman have high Plasma glucose concentration. On average this value is 140 for diabetics woman while this is quite low for non-diabetics.
Blood Pressure
table(diabetes$BloodPressure)
##
## 0 24 30 38 40 44 46 48 50 52 54 55 56 58 60 61 62 64
## 35 1 2 1 1 4 2 5 13 11 11 2 12 21 37 1 34 43
## 65 66 68 70 72 74 75 76 78 80 82 84 85 86 88 90 92 94
## 7 30 45 57 44 52 8 39 45 40 30 23 6 21 25 22 8 6
## 95 96 98 100 102 104 106 108 110 114 122
## 1 4 3 3 1 2 3 2 3 1 1
There are 35 peoples with 0 Blood pressure. It is not medically possiable. Let 0 blood pressure with the median value.
# Replace 0 blood pressure with median blood pressuure
diabetes$BloodPressure <- ifelse(diabetes$BloodPressure == 0,
median(diabetes$BloodPressure,na.rm = TRUE),
diabetes$BloodPressure
)
ggplot(data = diabetes,aes(x = Outcome, y = BloodPressure)) +
geom_boxplot( aes(fill= Outcome)) +
scale_y_continuous(breaks = seq(60,110,10),limits = c(60,110)) +
ylab("Blood Pressure") +
ggtitle("Blood Pressure Histogram") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Diastolic blood pressure for diabetic woman is higher compare to non-diabetics.
Triceps skin fold thickness
Triceps skin-fold thickness normal value for female 23
table(diabetes$SkinThickness)
##
## 0 7 8 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 227 2 2 5 6 7 11 6 14 6 14 20 18 13 10 16 22 12
## 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
## 16 16 23 20 17 27 19 31 20 8 15 14 16 7 18 16 15 11
## 43 44 45 46 47 48 49 50 51 52 54 56 60 63 99
## 6 5 6 8 4 4 3 3 1 2 2 1 1 1 1
There are 227 observation shows its value 0. Which is not medically true. Let’s replace this value with the median value.
diabetes$SkinThickness <- ifelse(
diabetes$SkinThickness == 0 ,
median(diabetes$SkinThickness,na.rm = TRUE),
diabetes$SkinThickness)
ggplot(data = diabetes,aes(x = Outcome, y = SkinThickness)) +
geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
scale_y_continuous(breaks = seq(0,100,10),limits = c(0,100)) +
ylab("Triceps skin fold thickness") +
ggtitle("Skin Thickness Histogram") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Boxplot shows that diabetics woman normally has high skin thickness. Red big dots are outlier but ignoring this outlier to consider the extreme case.
Body mass index
table(diabetes$BMI == 0)
##
## FALSE TRUE
## 757 11
There are 11 observation where 0 BMI information provided. It is medically not possible. Let’s replace with healthy BMI at higher end to 30.
diabetes$BMI <- ifelse(diabetes$BMI == 0, 32, diabetes$BMI)
ggplot(data = diabetes,aes(x = Outcome, y = BMI)) +
geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
scale_y_continuous(breaks = seq(20,70,5),limits = c(20,70)) +
ylab("BMI") +
ggtitle("Body mass index Histogram") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
BMI for diabetics’ woman is high compare to non-diabetics. There are few outlier, let not treat them to consider the extreme cases of BMI.
Diabetes pedigree function
ggplot(data = diabetes,aes(x = Outcome, y = DiabetesPedigreeFunction)) +
geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
scale_y_continuous(breaks = seq(0,2,0.2),limits = c(0,2)) +
ylab("Diabetes Pedigree Function") +
ggtitle("Diabetes Pedigree Function") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Age
ggplot(data = diabetes,aes(x = Outcome, y = Age)) +
geom_boxplot( aes(fill= Outcome),outlier.colour = "red", outlier.size = 5) +
scale_y_continuous(breaks = seq(20,80,10),limits = c(20,80)) +
ylab("Age") +
ggtitle("Diabetes Pedigree Function") +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show.legend = TRUE) +
theme_gray() +
theme_update(plot.title = element_text(hjust = 0.5))
Older people has more chance to be diabetics. There are more outliers for non-diabetics that seems normal.
Proptional table
prop.table(table(diabetes$Outcome))
##
## 0 1
## 0.6510417 0.3489583
There are 65% non-diabetics and 35% are diabetics cases in sample set of data.
Correlation
pairs.panels(diabetes)
Glucose,BMI age and pregnanies are highly and moderately correlated with the outcomes.
set.seed(2017)
trainIndex <- createDataPartition(diabetes$Outcome, p = .8,
list = FALSE,
times = 1)
diabetes$Outcome <- as.factor(diabetes$Outcome)
diabetes.train <- diabetes[trainIndex,]
diabetes.test <- diabetes[-trainIndex,]
# Training data Proption
prop.table(table(diabetes.train$Outcome))
##
## 0 1
## 0.6504065 0.3495935
# Test data Proption
prop.table(table(diabetes.test$Outcome))
##
## 0 1
## 0.6535948 0.3464052
ctl <- trainControl(
method = "repeatedcv",
number = 10,
repeats = 10
)
# Performance Parameters Setting
grid <- expand.grid(mtry = c(3,4,5))
model.Random.Forest <- train(
Outcome ~ .,
data = diabetes.train,
method = "rf",
tuneGrid = grid,
trControl = ctl)
model.Random.Forest
## Random Forest
##
## 615 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 554, 554, 553, 554, 553, 554, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 3 0.7597171 0.4536582
## 4 0.7546616 0.4439400
## 5 0.7540243 0.4427589
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# The final value used for the model was mtry = 4.
# Plot Model
plot(model.Random.Forest)
predict.Random.Forest <- predict(model.Random.Forest,diabetes.test)
confusionMatrix(predict.Random.Forest,diabetes.test$Outcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 80 16
## 1 20 37
##
## Accuracy : 0.7647
## 95% CI : (0.6894, 0.8294)
## No Information Rate : 0.6536
## P-Value [Acc > NIR] : 0.001988
##
## Kappa : 0.4894
## Mcnemar's Test P-Value : 0.617075
##
## Sensitivity : 0.8000
## Specificity : 0.6981
## Pos Pred Value : 0.8333
## Neg Pred Value : 0.6491
## Prevalence : 0.6536
## Detection Rate : 0.5229
## Detection Prevalence : 0.6275
## Balanced Accuracy : 0.7491
##
## 'Positive' Class : 0
##