Context: This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, based on certain diagnostic measurements included in the dataset. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
Problem Statement: Build a model to accurately predict whether the patients in the dataset have diabetes or not?
Dataset Description: The datasets consists of several medical predictor variables and one target variable, Outcome. Predictor variables includes the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.
Pregnancies: Number of times pregnant
Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test
BloodPressure: Diastolic blood pressure (mm Hg)
SkinThickness: Triceps skin fold thickness (mm)
Insulin: 2-Hour serum insulin (mu U/ml)
BMI: Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction: Diabetes pedigree function
Age: Age (years)
Outcome: Class variable (0 or 1) 268 of 768 are 1, the others are 0
Approach: Following pointers will be helpful to structure your findings.
•Glucose •BloodPressure •SkinThickness •Insulin •BMI#Read the file from the location where you have stored it.
diab_case=read.csv(file.choose(),header = T)
#It will show first 5 observations from the data frame
head(diab_case)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
#Following are all the required libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
library(class)
library(corrplot)
## corrplot 0.84 loaded
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(e1071)
diab_case1=select(diab_case,Glucose,BloodPressure,SkinThickness,Insulin,BMI)
head(diab_case1)
## Glucose BloodPressure SkinThickness Insulin BMI
## 1 148 72 35 0 33.6
## 2 85 66 29 0 26.6
## 3 183 64 0 0 23.3
## 4 89 66 23 94 28.1
## 5 137 40 35 168 43.1
## 6 116 74 0 0 25.6
diab_case1[diab_case1=='0']=NA#Replacing 0 with NA
summary(diab_case1)#This will show the NA present in every individual variable
## Glucose BloodPressure SkinThickness Insulin
## Min. : 44.0 Min. : 24.00 Min. : 7.00 Min. : 14.00
## 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:22.00 1st Qu.: 76.25
## Median :117.0 Median : 72.00 Median :29.00 Median :125.00
## Mean :121.7 Mean : 72.41 Mean :29.15 Mean :155.55
## 3rd Qu.:141.0 3rd Qu.: 80.00 3rd Qu.:36.00 3rd Qu.:190.00
## Max. :199.0 Max. :122.00 Max. :99.00 Max. :846.00
## NA's :5 NA's :35 NA's :227 NA's :374
## BMI
## Min. :18.20
## 1st Qu.:27.50
## Median :32.30
## Mean :32.46
## 3rd Qu.:36.60
## Max. :67.10
## NA's :11
sum(is.na(diab_case1))#To check total number is NA present
## [1] 652
diab_case1=kNN(diab_case1,k=sqrt(nrow(diab_case1)))#KNN Imputation method to remove NA
diab_case1=diab_case1[,1:5]#Removing dummy values
#Replacing the treated variables with the untreated variables present in main data
diab_case$Glucose=diab_case1$Glucose
diab_case$BloodPressure=diab_case1$BloodPressure
diab_case$SkinThickness=diab_case1$SkinThickness
diab_case$Insulin=diab_case1$Insulin
diab_case$BMI=diab_case1$BMI
summary(diab_case)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.0 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 64.00 1st Qu.:23.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :30.00
## Mean : 3.845 Mean :121.6 Mean : 72.51 Mean :29.02
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:33.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 14.0 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 85.0 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median :125.0 Median :32.40 Median :0.3725 Median :29.00
## Mean :142.2 Mean :32.49 Mean :0.4719 Mean :33.24
## 3rd Qu.:168.0 3rd Qu.:36.62 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
datatype=lapply(diab_case,class)#Generate List
class(datatype)
## [1] "list"
datatype=data.frame(unlist(datatype))#It will convert list in to data frame
barplot(table(datatype),main = 'datatype',col='blue',ylab ='count of variables')
pairs(diab_case,main="Scatter Plot",col='red')#Scatter Plot
diab_case_cor=cor(diab_case)#Creating correlatin matrix
heatmap(diab_case_cor,main='Heat Map')#Plotting heatmap
corrplot(diab_case_cor,method = 'number',type = 'upper',main='Co-Relation Plot')
diab_case$Outcome=factor(diab_case$Outcome)#It will convert variable Outcome into factor as it is required for classification purpose
diab_case2=diab_case[,-9]#We'll be removing dependant variable Outcome as KNN works with numeric data by generating an another subset
#Normalization/standardization/scaling of variables
normalize=function(x){
return((x-min(x))/(max(x)-min(x)))
}
diab_case2=normalize(diab_case2)
#Holdout Cross Validation method
#Divide data in training and testing
indexdiab=sample(nrow(diab_case2),0.75*nrow(diab_case2))
train_diab=diab_case2[indexdiab,]#75% Training data for KNN
test_diab=diab_case2[-indexdiab,]#25% Testing Data for KNN
#Generate outcome vectors for traindata and testdata
ytrain=diab_case$Outcome[indexdiab]
ytest=diab_case$Outcome[-indexdiab]
#for generating KNN model
knnmodel=knn(train_diab,test_diab,k=sqrt(nrow(train_diab)),cl=ytrain);knnmodel
## [1] 1 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1
## [36] 1 0 1 0 1 0 0 1 0 1 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0
## [71] 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## [106] 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 1 0 0 0
## [141] 1 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 0 0 0 0 0
## [176] 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 1 0
## Levels: 0 1
ytest=factor(ytest)
confusionMatrix(ytest,knnmodel)# Accuracy : 0.7448 of patients being correctly classified
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 101 18
## 1 36 37
##
## Accuracy : 0.7188
## 95% CI : (0.6495, 0.7811)
## No Information Rate : 0.7135
## P-Value [Acc > NIR] : 0.4727
##
## Kappa : 0.3734
##
## Mcnemar's Test P-Value : 0.0207
##
## Sensitivity : 0.7372
## Specificity : 0.6727
## Pos Pred Value : 0.8487
## Neg Pred Value : 0.5068
## Prevalence : 0.7135
## Detection Rate : 0.5260
## Detection Prevalence : 0.6198
## Balanced Accuracy : 0.7050
##
## 'Positive' Class : 0
##
index=sample(nrow(diab_case),0.75*nrow(diab_case))#Here we are dividing data in 75:25 ratio
train_diab<-diab_case[index,]#Selecting 75% of the data
test_diab<-diab_case[-index,]#Selecting remaining 25% of the data
diabmodel<-glm(Outcome~.,data = train_diab,family = "binomial")#Generating Logistic Regression Model
summary(diabmodel)#It shows the summary of the generated model
##
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = train_diab)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7121 -0.7034 -0.3764 0.7044 2.4824
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.6674352 0.9739099 -9.926 < 2e-16 ***
## Pregnancies 0.1068938 0.0366690 2.915 0.003556 **
## Glucose 0.0384667 0.0048416 7.945 1.94e-15 ***
## BloodPressure -0.0021258 0.0101516 -0.209 0.834128
## SkinThickness 0.0091882 0.0161532 0.569 0.569481
## Insulin -0.0007412 0.0013911 -0.533 0.594167
## BMI 0.0763412 0.0227738 3.352 0.000802 ***
## DiabetesPedigreeFunction 0.9211206 0.3319629 2.775 0.005524 **
## Age 0.0228689 0.0111368 2.053 0.040029 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 743.86 on 575 degrees of freedom
## Residual deviance: 525.68 on 567 degrees of freedom
## AIC: 543.68
##
## Number of Fisher Scoring iterations: 5
#It will calculate the probability of patients being correct classified using Binary Logistic Regression
#Steps for ROCR Curve
train_diab$predprob<-fitted(diabmodel)#Predicting the probability values of patient being diabetic with the help of model we built
head(train_diab)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 210 7 184 84 33 207 35.5
## 662 1 199 76 43 274 42.9
## 699 4 127 88 11 155 34.5
## 655 1 106 70 28 135 34.2
## 299 14 100 78 25 184 36.6
## 273 3 122 78 22 105 23.0
## DiabetesPedigreeFunction Age Outcome predprob
## 210 0.355 41 1 0.89135326
## 662 1.394 22 1 0.96034087
## 699 0.598 28 0 0.32507422
## 655 0.142 22 0 0.09709836
## 299 0.412 46 1 0.45730808
## 273 0.254 40 0 0.14298056
pred<-prediction(train_diab$predprob,train_diab$Outcome)#Predicting the probabilistic values for the ROC curve
perf<-performance(pred,"tpr","fpr")#It will check the performance
plot(perf,colorize=T,print.cutoffs.at=seq(0.1,by=0.05),main='ROC Curve Plot for Training Data')#Plotting the curve
train_diab$predoutcome=ifelse(train_diab$predprob>0.35,1,0)#Filtering the values that are above the threshold value=0.35
train_diab$predoutcome=factor(train_diab$predoutcome)#Convert it into factor so that it can be compared with actual Outcome
confusionMatrix(train_diab$Outcome,train_diab$predoutcome)#Confusion matrix is generated for training data
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 293 83
## 1 54 146
##
## Accuracy : 0.7622
## 95% CI : (0.7252, 0.7964)
## No Information Rate : 0.6024
## P-Value [Acc > NIR] : 3.933e-16
##
## Kappa : 0.4925
##
## Mcnemar's Test P-Value : 0.01675
##
## Sensitivity : 0.8444
## Specificity : 0.6376
## Pos Pred Value : 0.7793
## Neg Pred Value : 0.7300
## Prevalence : 0.6024
## Detection Rate : 0.5087
## Detection Prevalence : 0.6528
## Balanced Accuracy : 0.7410
##
## 'Positive' Class : 0
##
#For train data Accuracy : 0.7569
diab_auctrain=performance(pred,"auc")#To find AUC(Area Under The Curve)
diab_auctrain@y.values#AUC Value is in y.values
## [[1]]
## [1] 0.8512234
#Process is same as that of training data except
test_diab$predprob=predict(diabmodel,test_diab,type = "response")#type='response because predict function will generate sigmoid values and we are interested in values of probaility
pred_test=prediction(test_diab$predprob,test_diab$Outcome)
perf_test<-performance(pred_test,"tpr","fpr")
plot(perf_test,colorize=T,print.cutoffs.at=seq(0.1,by=0.05),main='ROC Curve For Testing Data')
test_diab$predoutcome=ifelse(test_diab$predprob<0.35,0,1)
test_diab$predoutcome=factor(test_diab$predoutcome)
confusionMatrix(test_diab$Outcome,test_diab$predoutcome)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 99 25
## 1 24 44
##
## Accuracy : 0.7448
## 95% CI : (0.677, 0.8048)
## No Information Rate : 0.6406
## P-Value [Acc > NIR] : 0.001345
##
## Kappa : 0.444
##
## Mcnemar's Test P-Value : 1.000000
##
## Sensitivity : 0.8049
## Specificity : 0.6377
## Pos Pred Value : 0.7984
## Neg Pred Value : 0.6471
## Prevalence : 0.6406
## Detection Rate : 0.5156
## Detection Prevalence : 0.6458
## Balanced Accuracy : 0.7213
##
## 'Positive' Class : 0
##
diab_auctest=performance(pred_test,"auc")
diab_auctest@y.values#Area Under The ROC Curve for train data
## [[1]]
## [1] 0.8235294
#For test data Accuracy : 0.7552
train_diab$predoutcome=NULL
train_diab$predprob=NULL
test_diab$predprob=NULL
test_diab$predoutcome=NULL
diab_caserf=diab_case#Generated seperate subset for random forest
set.seed(100)
Diab_RF=randomForest(Outcome~.,data=train_diab,ntree=10);Diab_RF#Generating random forest classifier
##
## Call:
## randomForest(formula = Outcome ~ ., data = train_diab, ntree = 10)
## Type of random forest: classification
## Number of trees: 10
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 27.85%
## Confusion matrix:
## 0 1 class.error
## 0 295 77 0.2069892
## 1 82 117 0.4120603
plot(Diab_RF)#This will plot the random forest graph
#It will calculate the probability of patients being correct classified using Random Forest
predrf_train=predict(Diab_RF,train_diab)
confusionMatrix(train_diab$Outcome,predrf_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 373 3
## 1 7 193
##
## Accuracy : 0.9826
## 95% CI : (0.9683, 0.9916)
## No Information Rate : 0.6597
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9615
##
## Mcnemar's Test P-Value : 0.3428
##
## Sensitivity : 0.9816
## Specificity : 0.9847
## Pos Pred Value : 0.9920
## Neg Pred Value : 0.9650
## Prevalence : 0.6597
## Detection Rate : 0.6476
## Detection Prevalence : 0.6528
## Balanced Accuracy : 0.9831
##
## 'Positive' Class : 0
##
#For train data Accuracy : 0.9861
predrf_test=predict(Diab_RF,test_diab)
confusionMatrix(test_diab$Outcome,predrf_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 106 18
## 1 34 34
##
## Accuracy : 0.7292
## 95% CI : (0.6605, 0.7906)
## No Information Rate : 0.7292
## P-Value [Acc > NIR] : 0.53725
##
## Kappa : 0.3747
##
## Mcnemar's Test P-Value : 0.03751
##
## Sensitivity : 0.7571
## Specificity : 0.6538
## Pos Pred Value : 0.8548
## Neg Pred Value : 0.5000
## Prevalence : 0.7292
## Detection Rate : 0.5521
## Detection Prevalence : 0.6458
## Balanced Accuracy : 0.7055
##
## 'Positive' Class : 0
##
#For test data Accuracy : 0.75
naive_model_diab=naiveBayes(Outcome~.,data = train_diab);naive_model_diab#Method to build naive bayes classification model
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## 0 1
## 0.6527778 0.3472222
##
## Conditional probabilities:
## Pregnancies
## Y [,1] [,2]
## 0 3.265957 3.054790
## 1 4.910000 3.788624
##
## Glucose
## Y [,1] [,2]
## 0 110.6144 24.53075
## 1 143.2950 29.51787
##
## BloodPressure
## Y [,1] [,2]
## 0 70.81649 12.10932
## 1 75.80000 11.62652
##
## SkinThickness
## Y [,1] [,2]
## 0 27.30585 8.880515
## 1 32.18500 8.733392
##
## Insulin
## Y [,1] [,2]
## 0 120.6995 81.77884
## 1 184.4450 102.50004
##
## BMI
## Y [,1] [,2]
## 0 30.87128 6.651193
## 1 34.85100 6.239427
##
## DiabetesPedigreeFunction
## Y [,1] [,2]
## 0 0.4272021 0.3065820
## 1 0.5624750 0.3801887
##
## Age
## Y [,1] [,2]
## 0 31.07447 11.41133
## 1 37.58500 10.71010
#It will calculate the probability of patients being correct classified using Naive Bayes
predi_train_naive=predict(naive_model_diab,train_diab)
confusionMatrix(train_diab$Outcome,predi_train_naive)#Training Accuracy : 0.7743
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 313 63
## 1 68 132
##
## Accuracy : 0.7726
## 95% CI : (0.7361, 0.8062)
## No Information Rate : 0.6615
## P-Value [Acc > NIR] : 3.843e-09
##
## Kappa : 0.4953
##
## Mcnemar's Test P-Value : 0.7267
##
## Sensitivity : 0.8215
## Specificity : 0.6769
## Pos Pred Value : 0.8324
## Neg Pred Value : 0.6600
## Prevalence : 0.6615
## Detection Rate : 0.5434
## Detection Prevalence : 0.6528
## Balanced Accuracy : 0.7492
##
## 'Positive' Class : 0
##
predi_test_naive=predict(naive_model_diab,test_diab)
confusionMatrix(test_diab$Outcome,predi_test_naive)# Testing Accuracy : 0.7604
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 107 17
## 1 32 36
##
## Accuracy : 0.7448
## 95% CI : (0.677, 0.8048)
## No Information Rate : 0.724
## P-Value [Acc > NIR] : 0.2889
##
## Kappa : 0.4129
##
## Mcnemar's Test P-Value : 0.0455
##
## Sensitivity : 0.7698
## Specificity : 0.6792
## Pos Pred Value : 0.8629
## Neg Pred Value : 0.5294
## Prevalence : 0.7240
## Detection Rate : 0.5573
## Detection Prevalence : 0.6458
## Balanced Accuracy : 0.7245
##
## 'Positive' Class : 0
##
svm_case_diab=svm(Outcome~.,data = train_diab,kernel='linear',scale = F);svm_case_diab#Genrating SVM with scale=F as data is already normalized
##
## Call:
## svm(formula = Outcome ~ ., data = train_diab, kernel = "linear",
## scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 295
#It will calculate the probability of patients being correct classified using SVM
predict_svm_casestudy=predict(svm_case_diab,train_diab)
confusionMatrix(train_diab$Outcome,predict_svm_casestudy)# Accuracy : 0.7569
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 333 43
## 1 88 112
##
## Accuracy : 0.7726
## 95% CI : (0.7361, 0.8062)
## No Information Rate : 0.7309
## P-Value [Acc > NIR] : 0.0125899
##
## Kappa : 0.4704
##
## Mcnemar's Test P-Value : 0.0001209
##
## Sensitivity : 0.7910
## Specificity : 0.7226
## Pos Pred Value : 0.8856
## Neg Pred Value : 0.5600
## Prevalence : 0.7309
## Detection Rate : 0.5781
## Detection Prevalence : 0.6528
## Balanced Accuracy : 0.7568
##
## 'Positive' Class : 0
##
predict_svm_casestudy_train=predict(svm_case_diab,test_diab)
confusionMatrix(test_diab$Outcome,predict_svm_casestudy_train) #Accuracy : 0.7552
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 110 14
## 1 32 36
##
## Accuracy : 0.7604
## 95% CI : (0.6937, 0.8189)
## No Information Rate : 0.7396
## P-Value [Acc > NIR] : 0.28551
##
## Kappa : 0.443
##
## Mcnemar's Test P-Value : 0.01219
##
## Sensitivity : 0.7746
## Specificity : 0.7200
## Pos Pred Value : 0.8871
## Neg Pred Value : 0.5294
## Prevalence : 0.7396
## Detection Rate : 0.5729
## Detection Prevalence : 0.6458
## Balanced Accuracy : 0.7473
##
## 'Positive' Class : 0
##