The dataset:

The National Institute of Diabetes and Digestive and Kidney Diseases conducted a study of 768 adult female Pima Indians living near Phoenix, AZ. The purpose of the study was to investigate the factors related to diabetes. You can obtain the dataset from the URL https://www.kaggle.com/uciml/pima-indians-diabetes-database

library(skimr)
## Warning: package 'skimr' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(tidyr)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(ggplot2)

Exploratory data analysis:

# Load the Pima diabetes data; the relative path is resolved against the
# current working directory.
pima <- readr::read_csv(file = "diabetes_pima.csv")
## Rows: 768 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (9): Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, D...
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Coerce to a tibble, then print per-column summary statistics.
pima <- pima %>% as_tibble()
# skim() is called on the object by name (not piped) so the summary header
# is labelled with the data set's name.
skim(pima)
Data summary
Name pima
Number of rows 768
Number of columns 9
_______________________
Column type frequency:
numeric 9
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Pregnancies 0 1 3.85 3.37 0.00 1.00 3.00 6.00 17.00 ▇▃▂▁▁
Glucose 0 1 120.89 31.97 0.00 99.00 117.00 140.25 199.00 ▁▁▇▆▂
BloodPressure 0 1 69.11 19.36 0.00 62.00 72.00 80.00 122.00 ▁▁▇▇▁
SkinThickness 0 1 20.54 15.95 0.00 0.00 23.00 32.00 99.00 ▇▇▂▁▁
Insulin 0 1 79.80 115.24 0.00 0.00 30.50 127.25 846.00 ▇▁▁▁▁
BMI 0 1 31.99 7.88 0.00 27.30 32.00 36.60 67.10 ▁▃▇▂▁
DiabetesPedigreeFunction 0 1 0.47 0.33 0.08 0.24 0.37 0.63 2.42 ▇▃▁▁▁
Age 0 1 33.24 11.76 21.00 24.00 29.00 41.00 81.00 ▇▃▁▁▁
Outcome 0 1 0.35 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
library(data.table)
## Warning: package 'data.table' was built under R version 4.0.5
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
  # Count the exact zeros in every column. The data set has no NA values, so
  # zeros are inspected here as possible placeholders for missing measurements.
  count_zeros <- function(column) sum(column == 0)
  as.data.table(pima)[, lapply(.SD, count_zeros)]
##    Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1:         111       5            35           227     374  11
##    DiabetesPedigreeFunction Age Outcome
## 1:                        0   0     500
# Replace zeros in the clinical measurements with the column mean.
# NOTE(review): the mean is computed over the whole column, zeros included,
# so the fill value is pulled towards zero (most visibly for Insulin and
# SkinThickness). It is also computed before the train/test split. Confirm
# whether a mean over the non-zero values of the training rows is intended.
zero_as_missing <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
for (col in zero_as_missing) {
  values <- pima[[col]]
  # The right-hand side is evaluated on the untouched column first.
  values[values == 0] <- mean(values)
  pima[[col]] <- values
}

# Re-summarise to check the effect of the replacement.
skim(pima)
Data summary
Name pima
Number of rows 768
Number of columns 9
_______________________
Column type frequency:
numeric 9
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Pregnancies 0 1 3.85 3.37 0.00 1.00 3.00 6.00 17.00 ▇▃▂▁▁
Glucose 0 1 121.68 30.44 44.00 99.75 117.00 140.25 199.00 ▁▇▇▃▂
BloodPressure 0 1 72.25 12.12 24.00 64.00 72.00 80.00 122.00 ▁▂▇▂▁
SkinThickness 0 1 26.61 9.63 7.00 20.54 23.00 32.00 99.00 ▇▆▁▁▁
Insulin 0 1 118.66 93.08 14.00 79.80 79.80 127.25 846.00 ▇▁▁▁▁
BMI 0 1 32.45 6.88 18.20 27.50 32.00 36.60 67.10 ▅▇▃▁▁
DiabetesPedigreeFunction 0 1 0.47 0.33 0.08 0.24 0.37 0.63 2.42 ▇▃▁▁▁
Age 0 1 33.24 11.76 21.00 24.00 29.00 41.00 81.00 ▇▃▁▁▁
Outcome 0 1 0.35 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
# Stack the eight predictors (column 9, Outcome, is dropped) into long format:
# one row per observation/variable pair, with columns `variable` and `value`.
# reshape2's melt() is called explicitly because data.table is attached after
# reshape2 and masks melt(); data.table's generic only redirects a tibble to
# reshape2 with a deprecation warning that is slated to become an error.
d <- reshape2::melt(pima[, -9])
## No id variables; using all as measure variables
# One histogram per predictor; each panel gets its own x-axis range because
# the variables are on very different scales.
predictor_histograms <- ggplot(data = d, mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(~variable, scales = "free_x")
predictor_histograms
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

library(GGally)
## Warning: package 'GGally' was built under R version 4.0.5
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise correlation plot over every column of pima (Outcome included).
ggcorr(data = pima) + theme_bw()

# Pairs-plot matrix over the same columns.
ggpairs(data = pima) + theme_bw()

GLM MODEL

# 80/20 train/test split; the seed is fixed so the split is reproducible.
samp_size <- floor(0.8 * nrow(pima))
set.seed(123)
# Drawing from 1..nrow(pima) directly yields the same indices as sampling
# from seq_len(nrow(pima)).
train_ind <- sample.int(nrow(pima), size = samp_size)
train_pima <- pima[train_ind, ]
test_pima <- pima[-train_ind, ]

# Logistic regression of Outcome on all remaining columns.
glm_pima <- glm(Outcome ~ ., data = train_pima, family = "binomial")
summary(glm_pima)
## 
## Call:
## glm(formula = Outcome ~ ., family = "binomial", data = train_pima)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4859  -0.7197  -0.4066   0.7037   2.3872  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.896798   0.892546  -9.968  < 2e-16 ***
## Pregnancies               0.118979   0.036385   3.270  0.00108 ** 
## Glucose                   0.039610   0.004481   8.840  < 2e-16 ***
## BloodPressure            -0.013688   0.009661  -1.417  0.15652    
## SkinThickness            -0.006079   0.012557  -0.484  0.62830    
## Insulin                  -0.001314   0.001152  -1.141  0.25403    
## BMI                       0.100306   0.019772   5.073 3.91e-07 ***
## DiabetesPedigreeFunction  0.738823   0.322400   2.292  0.02193 *  
## Age                       0.013945   0.010580   1.318  0.18751    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 796.42  on 613  degrees of freedom
## Residual deviance: 568.64  on 605  degrees of freedom
## AIC: 586.64
## 
## Number of Fisher Scoring iterations: 5
# Exponentiate the log-odds coefficients to obtain odds ratios.
exp(coef(glm_pima))
##              (Intercept)              Pregnancies                  Glucose 
##             0.0001368264             1.1263466266             1.0404047028 
##            BloodPressure            SkinThickness                  Insulin 
##             0.9864053267             0.9939393364             0.9986865211 
##                      BMI DiabetesPedigreeFunction                      Age 
##             1.1055088515             2.0934703253             1.0140425991

Interpretation of the model:

  1. A one-unit increase in Pregnancies, with all other predictors held constant, multiplies the odds of the outcome by about 1.13, i.e. increases the odds by about 12.6%.
  2. A one-unit increase in Glucose, with all other predictors held constant, multiplies the odds of the outcome by about 1.04, i.e. increases the odds by about 4%.
  3. A one-unit increase in BloodPressure, with all other predictors held constant, multiplies the odds of the outcome by about 0.986, i.e. decreases the odds by about 1.4%.
  4. A one-unit increase in SkinThickness, with all other predictors held constant, multiplies the odds of the outcome by about 0.994, i.e. decreases the odds by about 0.6%.
  5. A one-unit increase in Insulin, with all other predictors held constant, multiplies the odds of the outcome by about 0.9987, i.e. decreases the odds by about 0.13%.
  6. A one-unit increase in BMI, with all other predictors held constant, multiplies the odds of the outcome by about 1.106, i.e. increases the odds by about 10.6%.
# Reduced model: keep only the predictors that were significant at the 5%
# level in the full model's summary.
glm_pima.update <- glm(
  Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
  data = train_pima,
  family = "binomial"
)
summary(glm_pima.update)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction, 
##     family = "binomial", data = train_pima)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7485  -0.7198  -0.4013   0.7055   2.4333  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.101547   0.775057 -11.743  < 2e-16 ***
## Pregnancies               0.135980   0.030832   4.410 1.03e-05 ***
## Glucose                   0.037916   0.004026   9.417  < 2e-16 ***
## BMI                       0.085389   0.016601   5.144 2.70e-07 ***
## DiabetesPedigreeFunction  0.733512   0.317770   2.308    0.021 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 796.42  on 613  degrees of freedom
## Residual deviance: 573.05  on 609  degrees of freedom
## AIC: 583.05
## 
## Number of Fisher Scoring iterations: 5
# Chi-square analysis-of-deviance test comparing the full model with the
# reduced one fitted on the same training data.
anova(glm_pima, glm_pima.update, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: Outcome ~ Pregnancies + Glucose + BloodPressure + SkinThickness + 
##     Insulin + BMI + DiabetesPedigreeFunction + Age
## Model 2: Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       605     568.64                     
## 2       609     573.05 -4  -4.4069   0.3537

Prediction

 # Predicted probabilities for the held-out rows, taken from the full model
 # (glm_pima), then cross-tabulated against the actual outcome using a 0.5
 # probability cut-off.
 glm_predict <- predict(glm_pima, newdata = test_pima, type = "response")
 tab <- table(
   ActualValue = test_pima$Outcome,
   PredictedValue = glm_predict > 0.5
 )
 print(tab)
##            PredictedValue
## ActualValue FALSE TRUE
##           0    88   14
##           1    23   29