Load the necessary packages
library(cdata) # data wrangling
## Warning: package 'cdata' was built under R version 4.0.5
## Loading required package: wrapr
## Warning: package 'wrapr' was built under R version 4.0.5
library(ggplot2) # elegant plots
## Warning: package 'ggplot2' was built under R version 4.0.5
library(knitr) # display table
## Warning: package 'knitr' was built under R version 4.0.5
library(reshape2) # data wrangling
## Warning: package 'reshape2' was built under R version 4.0.5
library(WVPlots) # ROC plot, double density plot and enrichment/recall plot
## Warning: package 'WVPlots' was built under R version 4.0.5
library(sigr) # AUC calculation
## Warning: package 'sigr' was built under R version 4.0.5
Next, import data into the working environment as data frame. Get the statistical summaries of each variables. summary() syntax is also great in checking the presence of missing value. str() characterize the format in which each variable is stored
data=read.csv('diabetes.csv')
summary(data)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
str(data)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
The numerical outcome column is changed to factor for modeling purpose later.
data$Outcome=as.factor(data$Outcome)
summary(data$Outcome)
## 0 1
## 500 268
Split data in training (80%) and test data (20%)
# for reproducibility
set.seed(123)
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]
target='Outcome'
vars=setdiff(colnames(train),target)
# moving data from wide to tall form
data_long=unpivot_to_blocks(train,nameForNewKeyColumn = "variables",
nameForNewValueColumn = "values",columnsToTakeFrom = vars)
# str(data_long)
# plot the histogram
ggplot(data_long,aes(x=values)) +
geom_histogram(bins=10,fill="gray") +
facet_wrap(~variables,ncol = 3,scales="free")
# boxplot for each variables with regards to group
ggplot(data_long,aes(x=Outcome,y=values)) +
geom_boxplot(color="blue",fill="blue",alpha=0.2,notch=TRUE,
outlier.color="red",outlier.fill = "red",outlier.size = 2) +
facet_wrap(~variables,ncol = 3,scales = "free")
## notch went outside hinges. Try setting notch=FALSE.
# Correlation matrix
cormat=cor(train[,vars])
cormat[upper.tri(cormat)]=NA
melted_cormat=melt(cormat,na.rm = TRUE)
# heat-map
ggplot(data=melted_cormat,aes(Var1,Var2,fill=value)) +
geom_tile(color='white') +
scale_fill_gradient2(low = "blue", high="red",mid="white",midpoint = 0,
limit=c(-1,1),space = "Lab",name="Correlation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45,vjust = 1,size = 12,hjust = 1)) +
coord_fixed() +
geom_text(aes(Var1,Var2,label=round(value,2)),color="black",size=3)
# plot grouped scatter plot
# Create the plots
pairs(data[,vars], pch=20,cex=0.3,col=data$Outcome,
lower.panel = NULL)
# Apply linear model (logistic regression) to predict onset of diabetes
model=glm(Outcome~.,data=train,family = binomial("logit"))
summary(model)
##
## Call:
## glm(formula = Outcome ~ ., family = binomial("logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5779 -0.7064 -0.3995 0.6919 2.9952
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.691834 0.823617 -10.553 < 2e-16 ***
## Pregnancies 0.094582 0.037563 2.518 0.0118 *
## Glucose 0.036772 0.004306 8.540 < 2e-16 ***
## BloodPressure -0.014473 0.006276 -2.306 0.0211 *
## SkinThickness -0.001349 0.007795 -0.173 0.8626
## Insulin -0.001391 0.001054 -1.321 0.1866
## BMI 0.095024 0.017468 5.440 5.33e-08 ***
## DiabetesPedigreeFunction 0.853762 0.343927 2.482 0.0131 *
## Age 0.020382 0.011001 1.853 0.0639 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 784.01 on 612 degrees of freedom
## Residual deviance: 559.51 on 604 degrees of freedom
## AIC: 577.51
##
## Number of Fisher Scoring iterations: 5
Store the prediction of model in ‘train’ and ‘test’ data frame
# prediction
train$pred=predict(model,newdata = train,type = "response")
test$pred=predict(model,newdata=test,type="response")
# confusion matrix for model evaluation
(confmat=table(truth=train$Outcome,predict=train$pred>0.5))
## predict
## truth FALSE TRUE
## 0 363 43
## 1 85 122
(acc=sum(diag(confmat))/sum(confmat))
## [1] 0.7911909
(confmat_test=table(truth=test$Outcome,predict=test$pred>0.5))
## predict
## truth FALSE TRUE
## 0 80 14
## 1 27 34
(acc_test=sum(diag(confmat_test))/sum(confmat_test))
## [1] 0.7354839
(precision=confmat_test[2,2]/sum(confmat_test[,2]))
## [1] 0.7083333
(recall=confmat_test[2,2]/sum(confmat_test[2,]))
## [1] 0.557377
Visualize the model performance
# Double density plot
plt=DoubleDensityPlot(test,xvar = "pred",truthVar = "Outcome",
title = "Distribution of scores of LR model in test data")
plt+geom_vline(xintercept = 0.5, color="red",linetype=2)
# ROC curve
test$Outcome_numeric=as.numeric(test$Outcome)-1
ROCPlot(test,xvar='pred',truthVar = "Outcome_numeric",truthTarget = TRUE,
title = "Logistic regression test performance")
calcAUC(test$pred,test$Outcome_numeric)
## [1] 0.8004883
# PRT plot (modeling trade-off) to select other threshold
train$Outcome_numeric=as.numeric(train$Outcome)-1
plt=PRTPlot(test,'pred','Outcome_numeric',TRUE,plotvars = c('enrichment','recall'),
thresholdrange = c(0,1),title = 'Enrichment/recall vs threshold for LR model')
plt+geom_vline(xintercept = 0.375,color="red",linetype=2)
# 0.375 is seleted as threshold
# Confusion matrix of test data
(confmat_test=table(truth=test$Outcome,predict=test$pred>0.375))
## predict
## truth FALSE TRUE
## 0 74 20
## 1 18 43
(acc_test=sum(diag(confmat_test))/sum(confmat_test))
## [1] 0.7548387
(precision=confmat_test[2,2]/sum(confmat_test[,2]))
## [1] 0.6825397
(recall=confmat_test[2,2]/sum(confmat_test[2,]))
## [1] 0.704918