Import Libraries

library(caTools)
## Warning: package 'caTools' was built under R version 3.6.1
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.1
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess

Import Data

# Load the dataset from db.csv in the current working directory.
pimaData <- read.csv(file = "db.csv")

Data Exploration

# Preview the first rows to check that the columns were parsed as expected.
head(pimaData)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
# Per-column quartiles and means. NOTE(review): several measurement columns
# report a minimum of 0, which may be a placeholder for a missing reading --
# confirm against the data source before modelling.
summary(pimaData)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# Show the dimensions of the data frame and the storage type of each column.
str(pimaData)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

Column Names

# Replace the original headers with shorter names, in the same column order.
names(pimaData) <- c(
  "pregnancies",
  "glucose",
  "bp",
  "skinThickness",
  "insulin",
  "bmi",
  "dpf",
  "age",
  "outcome"
)

Missing Values

# Count NA flags over every cell of the data frame; a table containing only
# FALSE means no cell is NA.
table(is.na(pimaData))
## 
## FALSE 
##  6912

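There are no missing (NA) values in the dataset. Note, however, that the summary above shows a minimum of 0 for glucose, blood pressure, skin thickness, insulin and BMI, which suggests that missing measurements may be recorded as 0.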

Splitting Data

# Fix the RNG seed so the train/test partition, and every result derived from
# it, can be reproduced.
set.seed(123)
# sample.split() expects the vector of labels, not the whole data frame. Given
# a data frame it returns one flag per column, and that short vector is then
# silently recycled across the rows when subsetting. Splitting on the outcome
# column yields exactly one flag per row and keeps the class proportions
# similar in both partitions.
splitData <- sample.split(pimaData$outcome, SplitRatio = 0.8)

Training and Testing Data

# splitData is a logical vector, so use it directly as the row filter instead
# of comparing it with the strings 'TRUE'/'FALSE', which only works through
# implicit logical-to-character coercion.
train <- subset(pimaData, splitData)
test <- subset(pimaData, !splitData)

Data Modeling

# Fit a logistic regression of outcome on all other columns of the training
# set, then print the coefficient table and deviance statistics.
model <- glm(
  formula = outcome ~ .,
  family = "binomial",
  data = train
)
summary(model)
## 
## Call:
## glm(formula = outcome ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7035  -0.7241  -0.4041   0.7044   2.8671  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -8.8335216  0.8520401 -10.367  < 2e-16 ***
## pregnancies    0.1385463  0.0361898   3.828 0.000129 ***
## glucose        0.0342450  0.0043732   7.831 4.86e-15 ***
## bp            -0.0111904  0.0061003  -1.834 0.066597 .  
## skinThickness -0.0009522  0.0080535  -0.118 0.905879    
## insulin       -0.0010204  0.0010980  -0.929 0.352740    
## bmi            0.0961369  0.0171581   5.603 2.11e-08 ***
## dpf            1.1910481  0.3549444   3.356 0.000792 ***
## age            0.0167324  0.0110002   1.521 0.128233    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 768.89  on 597  degrees of freedom
## Residual deviance: 560.13  on 589  degrees of freedom
## AIC: 578.13
## 
## Number of Fisher Scoring iterations: 5

Prediction

# Score the held-out rows; type = "response" returns values on the probability
# scale rather than the link (log-odds) scale.
pred <- predict(model, newdata = test, type = "response")

Choosing Threshold

# Pair the predicted probabilities with the true test labels for ROCR.
rocPred <- prediction(predictions = pred, labels = test$outcome)
# True-positive rate against false-positive rate across all cutoffs, i.e. the
# points of the ROC curve.
rocPref <- performance(rocPred, measure = "tpr", x.measure = "fpr")

Plotting Threshold

# Draw the ROC curve coloured by cutoff and label the cutoffs 0.1, 0.2, ..., 1.
# The argument must be spelled print.cutoffs.at: a misspelled name is not
# matched to ROCR's parameter, so no cutoff labels would be drawn.
plot(rocPref, colorize = TRUE, print.cutoffs.at = seq(0.1, 1, by = 0.1))

Predict Again Using a 0.3 Threshold

# Confusion matrix on the test set: rows are the true outcome, columns say
# whether the predicted probability exceeds the 0.3 cutoff.
table(
  ActualValue = test$outcome,
  PredictedValue = pred > 0.3
)
##            PredictedValue
## ActualValue FALSE TRUE
##           0    73   34
##           1    11   52