Load the necessary packages

library(cdata)  # data wrangling
## Warning: package 'cdata' was built under R version 4.0.5
## Loading required package: wrapr
## Warning: package 'wrapr' was built under R version 4.0.5
library(ggplot2) # elegant plots
## Warning: package 'ggplot2' was built under R version 4.0.5
library(knitr)    # display table 
## Warning: package 'knitr' was built under R version 4.0.5
library(reshape2)  # data wrangling
## Warning: package 'reshape2' was built under R version 4.0.5
library(WVPlots)   # ROC plot, double density plot and enrichment/recall plot
## Warning: package 'WVPlots' was built under R version 4.0.5
library(sigr)      # AUC calculation
## Warning: package 'sigr' was built under R version 4.0.5

Next, import data into the working environment as data frame. Get the statistical summaries of each variables. summary() syntax is also great in checking the presence of missing value. str() characterize the format in which each variable is stored

data=read.csv('diabetes.csv')
summary(data)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
str(data)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...

The numerical outcome column is changed to factor for modeling purpose later.

data$Outcome=as.factor(data$Outcome)
summary(data$Outcome)
##   0   1 
## 500 268

Split data in training (80%) and test data (20%)

# for reproducibility
set.seed(123)
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]

Data visualization

target='Outcome'
vars=setdiff(colnames(train),target)

# moving data from wide to tall form
data_long=unpivot_to_blocks(train,nameForNewKeyColumn = "variables",
                            nameForNewValueColumn = "values",columnsToTakeFrom = vars)
# str(data_long)
# plot the histogram
ggplot(data_long,aes(x=values)) + 
  geom_histogram(bins=10,fill="gray") +
  facet_wrap(~variables,ncol = 3,scales="free")

# boxplot for each variables with regards to group
ggplot(data_long,aes(x=Outcome,y=values)) +
  geom_boxplot(color="blue",fill="blue",alpha=0.2,notch=TRUE,
               outlier.color="red",outlier.fill = "red",outlier.size = 2) +
  facet_wrap(~variables,ncol = 3,scales = "free")
## notch went outside hinges. Try setting notch=FALSE.

# Correlation matrix
cormat=cor(train[,vars])

cormat[upper.tri(cormat)]=NA
melted_cormat=melt(cormat,na.rm = TRUE)
# heat-map
ggplot(data=melted_cormat,aes(Var1,Var2,fill=value)) +
  geom_tile(color='white') +
  scale_fill_gradient2(low = "blue", high="red",mid="white",midpoint = 0,
                       limit=c(-1,1),space = "Lab",name="Correlation") +
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45,vjust = 1,size = 12,hjust = 1)) +
  coord_fixed() +
  geom_text(aes(Var1,Var2,label=round(value,2)),color="black",size=3)

# plot grouped scatter plot
# Create the plots
pairs(data[,vars], pch=20,cex=0.3,col=data$Outcome, 
      lower.panel = NULL)

Model training and evaluation

# Apply linear model (logistic regression) to predict onset of diabetes
model=glm(Outcome~.,data=train,family = binomial("logit"))
summary(model)
## 
## Call:
## glm(formula = Outcome ~ ., family = binomial("logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5779  -0.7064  -0.3995   0.6919   2.9952  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.691834   0.823617 -10.553  < 2e-16 ***
## Pregnancies               0.094582   0.037563   2.518   0.0118 *  
## Glucose                   0.036772   0.004306   8.540  < 2e-16 ***
## BloodPressure            -0.014473   0.006276  -2.306   0.0211 *  
## SkinThickness            -0.001349   0.007795  -0.173   0.8626    
## Insulin                  -0.001391   0.001054  -1.321   0.1866    
## BMI                       0.095024   0.017468   5.440 5.33e-08 ***
## DiabetesPedigreeFunction  0.853762   0.343927   2.482   0.0131 *  
## Age                       0.020382   0.011001   1.853   0.0639 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 784.01  on 612  degrees of freedom
## Residual deviance: 559.51  on 604  degrees of freedom
## AIC: 577.51
## 
## Number of Fisher Scoring iterations: 5

Store the prediction of model in ‘train’ and ‘test’ data frame

# prediction
train$pred=predict(model,newdata = train,type = "response")
test$pred=predict(model,newdata=test,type="response")

# confusion matrix for model evaluation
(confmat=table(truth=train$Outcome,predict=train$pred>0.5))
##      predict
## truth FALSE TRUE
##     0   363   43
##     1    85  122
(acc=sum(diag(confmat))/sum(confmat))
## [1] 0.7911909
(confmat_test=table(truth=test$Outcome,predict=test$pred>0.5))
##      predict
## truth FALSE TRUE
##     0    80   14
##     1    27   34
(acc_test=sum(diag(confmat_test))/sum(confmat_test))
## [1] 0.7354839
(precision=confmat_test[2,2]/sum(confmat_test[,2]))
## [1] 0.7083333
(recall=confmat_test[2,2]/sum(confmat_test[2,]))
## [1] 0.557377

Visualize the model performance

# Double density plot
plt=DoubleDensityPlot(test,xvar = "pred",truthVar = "Outcome",
                  title = "Distribution of scores of LR model in test data")
plt+geom_vline(xintercept = 0.5, color="red",linetype=2)

# ROC curve
test$Outcome_numeric=as.numeric(test$Outcome)-1
ROCPlot(test,xvar='pred',truthVar = "Outcome_numeric",truthTarget = TRUE,
        title = "Logistic regression test performance")

calcAUC(test$pred,test$Outcome_numeric)
## [1] 0.8004883

Posterior probability threshold for modeling trade-off

# PRT plot (modeling trade-off) to select other threshold
train$Outcome_numeric=as.numeric(train$Outcome)-1
plt=PRTPlot(test,'pred','Outcome_numeric',TRUE,plotvars = c('enrichment','recall'),
        thresholdrange = c(0,1),title = 'Enrichment/recall vs threshold for LR model')
plt+geom_vline(xintercept = 0.375,color="red",linetype=2)

# 0.375 is seleted as threshold
# Confusion matrix of test data
(confmat_test=table(truth=test$Outcome,predict=test$pred>0.375))
##      predict
## truth FALSE TRUE
##     0    74   20
##     1    18   43
(acc_test=sum(diag(confmat_test))/sum(confmat_test))
## [1] 0.7548387
(precision=confmat_test[2,2]/sum(confmat_test[,2]))
## [1] 0.6825397
(recall=confmat_test[2,2]/sum(confmat_test[2,]))
## [1] 0.704918