library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.4.1     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Exploratory Data Analysis

Importing dataset

df <- read.csv("data_diabetes.csv", header = TRUE, sep = ",")

Looking dataset

dim(df)
## [1] 100000      9
head(df)
##   gender age hypertension heart_disease smoking_history   bmi HbA1c_level
## 1 Female  80            0             1           never 25.19         6.6
## 2 Female  54            0             0         No Info 27.32         6.6
## 3   Male  28            0             0           never 27.32         5.7
## 4 Female  36            0             0         current 23.45         5.0
## 5   Male  76            1             1         current 20.14         4.8
## 6 Female  20            0             0           never 27.32         6.6
##   blood_glucose_level diabetes
## 1                 140        0
## 2                  80        0
## 3                 158        0
## 4                 155        0
## 5                 155        0
## 6                  85        0
str(df)
## 'data.frame':    100000 obs. of  9 variables:
##  $ gender             : chr  "Female" "Female" "Male" "Female" ...
##  $ age                : num  80 54 28 36 76 20 44 79 42 32 ...
##  $ hypertension       : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ heart_disease      : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ smoking_history    : chr  "never" "No Info" "never" "current" ...
##  $ bmi                : num  25.2 27.3 27.3 23.4 20.1 ...
##  $ HbA1c_level        : num  6.6 6.6 5.7 5 4.8 6.6 6.5 5.7 4.8 5 ...
##  $ blood_glucose_level: int  140 80 158 155 155 85 200 85 145 100 ...
##  $ diabetes           : int  0 0 0 0 0 0 1 0 0 0 ...
BasicSummary <- function(df, dgts=3){
  m <- ncol(df)
  varNames <- colnames(df)
  varType <- vector("character",m)
  topLevel <- vector("character",m)
  topCount <- vector("numeric",m)
  missCount <- vector("numeric",m)
  levels <- vector("numeric",m)
  
  for(i in 1:m){
    x<-df[,i]
    varType[i] <- class(x)
    xtab <- table(x,useNA="ifany")
    levels[i] <- length(xtab)
    nums <- as.numeric(xtab)
    maxnum <- max(nums)
    topCount[i] <- maxnum
    maxIndex <- which.max(nums)
    lvls <- names(xtab)
    topLevel[i] <- lvls[maxIndex]
    missIndex <- which((is.na(x)) | (x== "") | (x == " "))
    missCount[i] <- length(missIndex)
  }
  
  n <- nrow(df)
  topFrac <- round(topCount/n, digits = dgts)
  missFrac <- round(missCount/n, digits = dgts)
  summaryFrame <- data.frame(variable = varNames, type = varType, levels = levels, topLevel = topLevel, topCount = topCount, topFrac = topFrac, missFreq = missCount, missFrac = missFrac)
  return(summaryFrame)
}
BasicSummary(df)
##              variable      type levels topLevel topCount topFrac missFreq
## 1              gender character      3   Female    58552   0.586        0
## 2                 age   numeric    102       80     5621   0.056        0
## 3        hypertension   integer      2        0    92515   0.925        0
## 4       heart_disease   integer      2        0    96058   0.961        0
## 5     smoking_history character      6  No Info    35816   0.358        0
## 6                 bmi   numeric   4247    27.32    25495   0.255        0
## 7         HbA1c_level   numeric     18      6.6     8540   0.085        0
## 8 blood_glucose_level   integer     18      130     7794   0.078        0
## 9            diabetes   integer      2        0    91500   0.915        0
##   missFrac
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0
## 7        0
## 8        0
## 9        0
summary(df)
##     gender               age         hypertension     heart_disease    
##  Length:100000      Min.   : 0.08   Min.   :0.00000   Min.   :0.00000  
##  Class :character   1st Qu.:24.00   1st Qu.:0.00000   1st Qu.:0.00000  
##  Mode  :character   Median :43.00   Median :0.00000   Median :0.00000  
##                     Mean   :41.89   Mean   :0.07485   Mean   :0.03942  
##                     3rd Qu.:60.00   3rd Qu.:0.00000   3rd Qu.:0.00000  
##                     Max.   :80.00   Max.   :1.00000   Max.   :1.00000  
##  smoking_history         bmi         HbA1c_level    blood_glucose_level
##  Length:100000      Min.   :10.01   Min.   :3.500   Min.   : 80.0      
##  Class :character   1st Qu.:23.63   1st Qu.:4.800   1st Qu.:100.0      
##  Mode  :character   Median :27.32   Median :5.800   Median :140.0      
##                     Mean   :27.32   Mean   :5.528   Mean   :138.1      
##                     3rd Qu.:29.58   3rd Qu.:6.200   3rd Qu.:159.0      
##                     Max.   :95.69   Max.   :9.000   Max.   :300.0      
##     diabetes    
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.085  
##  3rd Qu.:0.000  
##  Max.   :1.000
  1. 100000 observations with 9 attributes.
  2. Three is three data type: integer, character, numeric.
  3. Integer data type: hypertension, heart_disease, blood_glucose_level, diabetes.
  4. Character data type: gender and smoking_history.
  5. Numeric data type: age, bmi, and HbA1c_level.
  6. There are no missing value in every variable

Checking Duplicates

duplicate_count <- sum(duplicated(df))
print(duplicate_count)
## [1] 3854
duplicate_rows <- duplicated(df)

df <- df[!duplicate_rows, ]
dim(df)
## [1] 96146     9

There are 3854 duplicates removed. The reason we removed the duplicate is to reduce the amount of inaccuracies

We’ll be looking at outliers in age, bmi, HbA1c_level, and blood_glucose_level

meanAge <- mean(df$age)
sdAge <- sd(df$age)
par(mfrow=c(1,2), pty="m")

plot(df$age, main='Three Sigma Edit Rule', xlab='Record',ylab='age',ylim=c(-50,150))
abline(h=meanAge+3*(sdAge),col='purple',lwd=3,lty='dotdash')
abline(h=meanAge-3*(sdAge),col='purple',lwd=3,lty='dotdash')

boxplot(df$age, main = "Boxplot of age", ylab = "Value")
outliers <- boxplot.stats(df$age)$out
points(rep(1, length(outliers)), outliers, col = "red", pch = 16)
legend("topright", legend = "Outliers", col = "red", pch = 16)

meanHb <- mean(df$HbA1c_level)
sdHb <- sd(df$HbA1c_level)
par(mfrow=c(1,2), pty="m")

plot(df$HbA1c_level, main='Three Sigma Edit Rule', xlab='Record',ylab='HbA1c',ylim=c(2,10))
abline(h=meanHb+3*(sdHb),col='purple',lwd=3,lty='dotdash')
abline(h=meanHb-3*(sdHb),col='purple',lwd=3,lty='dotdash')

boxplot(df$HbA1c_level, main = "Boxplot of HbA1c_level", ylab = "Value")
outliers <- boxplot.stats(df$HbA1c_level)$out
points(rep(1, length(outliers)), outliers, col = "red", pch = 16)
legend("topright", legend = "Outliers", col = "red", pch = 16)

meanBlood <- mean(df$blood_glucose_level)
sdBlood <- sd(df$blood_glucose_level)
par(mfrow=c(1,2), pty="m")

plot(df$blood_glucose_level, main='Three Sigma Edit Rule', xlab='Record',ylab='Blood glucose level',ylim=c(0,350))
abline(h=meanBlood+3*(sdBlood),col='purple',lwd=3,lty='dotdash')
abline(h=meanBlood-3*(sdBlood),col='purple',lwd=3,lty='dotdash')

boxplot(df$blood_glucose_level, main = "Boxplot of blood glucose level", ylab = "Value")
outliers <- boxplot.stats(df$blood_glucose_level)$out
points(rep(1, length(outliers)), outliers, col = "red", pch = 16)
legend("topright", legend = "Outliers", col = "red", pch = 16)

  1. There are no outliers in the age variable.
  2. There are some outliers in the HbA1c_level and blood glucose level, but this is a health related dataset, there are bound to be rare value in health related dataset.
par(mfrow = c(1,3))
qqnorm(df$age,main="Age Normality")
qqline(df$age)

qqnorm(df$HbA1c_level,main="HbA1c Normality")
qqline(df$HbA1c_level)

qqnorm(df$blood_glucose_level,main="Blood glucose level Normality")
qqline(df$blood_glucose_level)

##EDA

plot(df$age, df$heart_disease, main="Relation between age and heart disease")

plot(df$bmi, df$heart_disease, main="Relation between BMI and Heart disease")

plot(df$age, df$diabetes, main="Relation between age and diabetes")

plot(df$bmi, df$diabetes, main="Relation between BMI and diabetes")

plot(df$blood_glucose_level, df$diabetes, main="Relation between blood glucose level and diabetes")

plot(df$HbA1c_level, df$diabetes, main="Relation between HbA1c level and diabetes")

Based on the data obtained above, age has an effect on heart problems, but has no effect on diabetes. The BMI of people who have heart problems and diabetes have a nearly identical range, between 10 and 60. People who have diabetes have higher blood glucose levels and HbA1c levels than those who don’t have diabetes.

##Data anomalies

freq_table2 <- table(df$smoking_history)
freq_table2
## 
##     current        ever      former       never     No Info not current 
##        9197        3998        9299       34398       32887        6367

Because there are a lot of “no info” part on the smoking_history, it is better to drop the column.

#Dropping smoking_history table
df <- df[, -5]
head(df)
##   gender age hypertension heart_disease   bmi HbA1c_level blood_glucose_level
## 1 Female  80            0             1 25.19         6.6                 140
## 2 Female  54            0             0 27.32         6.6                  80
## 3   Male  28            0             0 27.32         5.7                 158
## 4 Female  36            0             0 23.45         5.0                 155
## 5   Male  76            1             1 20.14         4.8                 155
## 6 Female  20            0             0 27.32         6.6                  85
##   diabetes
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0
freq_table1 <- table(df$gender)
freq_table1
## 
## Female   Male  Other 
##  56161  39967     18

We want to encode the gender into 0 and 1 Because the “Other” gender is not specified it is better to drop evey record with the gender “other”

df <- df[df$gender != 'Other', ]
freq_table1 <- table(df$gender)
freq_table1
## 
## Female   Male 
##  56161  39967

encode the gender column into 0 and 1

df <- df %>% 
  mutate(gender = recode(gender, "Female" = 0, "Male" = 1))
head(df)
##   gender age hypertension heart_disease   bmi HbA1c_level blood_glucose_level
## 1      0  80            0             1 25.19         6.6                 140
## 2      0  54            0             0 27.32         6.6                  80
## 3      1  28            0             0 27.32         5.7                 158
## 4      0  36            0             0 23.45         5.0                 155
## 5      1  76            1             1 20.14         4.8                 155
## 6      0  20            0             0 27.32         6.6                  85
##   diabetes
## 1        0
## 2        0
## 3        0
## 4        0
## 5        0
## 6        0
freq_table1 <- table(df$gender)
freq_table1
## 
##     0     1 
## 56161 39967
str(df)
## 'data.frame':    96128 obs. of  8 variables:
##  $ gender             : num  0 0 1 0 1 0 0 0 1 0 ...
##  $ age                : num  80 54 28 36 76 20 44 79 42 32 ...
##  $ hypertension       : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ heart_disease      : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ bmi                : num  25.2 27.3 27.3 23.4 20.1 ...
##  $ HbA1c_level        : num  6.6 6.6 5.7 5 4.8 6.6 6.5 5.7 4.8 5 ...
##  $ blood_glucose_level: int  140 80 158 155 155 85 200 85 145 100 ...
##  $ diabetes           : int  0 0 0 0 0 0 1 0 0 0 ...

all the variable is in the right data type

Correlation between variables

library(corrplot)
## corrplot 0.92 loaded
cor_matrix <- cor(df)
corrplot(cor_matrix, method = "color")

print(cor_matrix)
##                          gender         age hypertension heart_disease
## gender               1.00000000 -0.02866998   0.01419567    0.07851230
## age                 -0.02866998  1.00000000   0.25729732    0.23844929
## hypertension         0.01419567  0.25729732   1.00000000    0.11997218
## heart_disease        0.07851230  0.23844929   0.11997218    1.00000000
## bmi                 -0.02349047  0.34477912   0.14812419    0.06138153
## HbA1c_level          0.01992833  0.10670828   0.08144289    0.06814212
## blood_glucose_level  0.01783067  0.11432338   0.08484144    0.07083787
## diabetes             0.03783691  0.26491789   0.19569612    0.17070111
##                             bmi HbA1c_level blood_glucose_level   diabetes
## gender              -0.02349047  0.01992833          0.01783067 0.03783691
## age                  0.34477912  0.10670828          0.11432338 0.26491789
## hypertension         0.14812419  0.08144289          0.08484144 0.19569612
## heart_disease        0.06138153  0.06814212          0.07083787 0.17070111
## bmi                  1.00000000  0.08444296          0.09259268 0.21495125
## HbA1c_level          0.08444296  1.00000000          0.17171716 0.40644594
## blood_glucose_level  0.09259268  0.17171716          1.00000000 0.42436644
## diabetes             0.21495125  0.40644594          0.42436644 1.00000000

From this pearson correlation, as we can see the highest correlation between diabetes and blood_glucose_level (0.42436644). With the second highest is between diabetes and HbA1c_level (0.40644594)

Model

library(mlbench)
library(dplyr)
library(rpart)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(MLmetrics)
## 
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
## 
##     Recall

Logistic Regression

Splitting dataset

set.seed(123)

n <- nrow(df)
train <- sample(n, round(0.7 * n))
DiaTrain <- df[train,]
DiaValidation <- df[-train,]
head(DiaTrain)
##       gender age hypertension heart_disease   bmi HbA1c_level
## 52822      1  36            0             0 27.32         5.8
## 59314      1  80            0             0 27.32         3.5
## 2987       1  13            0             0 27.32         4.8
## 30326      0  59            0             0 37.55         6.0
## 99044      0   6            0             0 16.63         4.0
## 70333      0  39            0             0 33.60         4.0
##       blood_glucose_level diabetes
## 52822                 100        0
## 59314                 160        0
## 2987                   80        0
## 30326                 100        0
## 99044                  90        0
## 70333                 140        0
head(DiaValidation)
##    gender age hypertension heart_disease   bmi HbA1c_level blood_glucose_level
## 3       1  28            0             0 27.32         5.7                 158
## 4       0  36            0             0 23.45         5.0                 155
## 9       1  42            0             0 33.64         4.8                 145
## 13      0  78            0             0 36.05         5.0                 130
## 19      0  42            0             0 27.32         5.7                  80
## 24      0  72            0             1 27.94         6.5                 130
##    diabetes
## 3         0
## 4         0
## 9         0
## 13        0
## 19        0
## 24        0
dim(DiaTrain)
## [1] 67290     8
dim(DiaValidation)
## [1] 28838     8

Model Fitting

For the first model we are using all the variable

logisticFull <- glm(diabetes ~., family="binomial", data = df)
summary(logisticFull)
## 
## Call:
## glm(formula = diabetes ~ ., family = "binomial", data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7576  -0.2029  -0.0702  -0.0183   3.6911  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.732e+01  2.898e-01 -94.278  < 2e-16 ***
## gender               2.671e-01  3.587e-02   7.446 9.59e-14 ***
## age                  4.708e-02  1.102e-03  42.701  < 2e-16 ***
## hypertension         7.601e-01  4.689e-02  16.210  < 2e-16 ***
## heart_disease        7.096e-01  6.037e-02  11.754  < 2e-16 ***
## bmi                  8.938e-02  2.528e-03  35.359  < 2e-16 ***
## HbA1c_level          2.334e+00  3.577e-02  65.261  < 2e-16 ***
## blood_glucose_level  3.329e-02  4.823e-04  69.014  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57377  on 96127  degrees of freedom
## Residual deviance: 22511  on 96120  degrees of freedom
## AIC: 22527
## 
## Number of Fisher Scoring iterations: 8

all the variables are highly statistically significant.

According to cdc people are at risk of type 2 diabetes if: have prediabetes, are overweight, are 45 years or older, have family relatives with type 2 diabetes, and physically not active. Thus, for the second model We choose some selected variable such as bmi, age, and HbA1c_level.

logistic1 <- glm(diabetes ~ age + bmi + HbA1c_level, family="binomial", data = df)
summary(logistic1)
## 
## Call:
## glm(formula = diabetes ~ age + bmi + HbA1c_level, family = "binomial", 
##     data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0878  -0.3071  -0.1103  -0.0303   3.7069  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.213e+01  2.248e-01  -98.43   <2e-16 ***
## age          5.331e-02  9.198e-04   57.96   <2e-16 ***
## bmi          9.296e-02  2.126e-03   43.72   <2e-16 ***
## HbA1c_level  2.318e+00  3.062e-02   75.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 57377  on 96127  degrees of freedom
## Residual deviance: 30870  on 96124  degrees of freedom
## AIC: 30878
## 
## Number of Fisher Scoring iterations: 8
phatFull <- predict(logisticFull, newdata = df, type = "response")
phatSelect <- predict(logistic1, newdata = df, type = "response")

Performance (logsitic)

ROC

ROCFull <- roc(df$diabetes, phatFull, plot=TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

ROCSelect <- roc(df$diabetes, phatSelect, plot=TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

### AUC

AUC(phatFull, DiaValidation$diabetes)
## [1] 5.990376
AUC(phatSelect, DiaValidation$diabetes)
## [1] 5.994437

Confusion Matrix & Accuracy

threshold <- table(df$diabetes, phatFull > 0.5)
accuracy <- round(sum(diag(threshold)) / sum(threshold), 2)
threshold
##    
##     FALSE  TRUE
##   0 86822   824
##   1  3139  5343
sprintf("Accuracy is %s", accuracy)
## [1] "Accuracy is 0.96"
threshold <- table(df$diabetes, phatSelect > 0.5)
accuracy <- round(sum(diag(threshold)) / sum(threshold), 2)
threshold
##    
##     FALSE  TRUE
##   0 87079   567
##   1  4863  3619
sprintf("Accuracy is %s", accuracy)
## [1] "Accuracy is 0.94"

Conclusion

For this logistic regression, We are using two kind of model, the first model with all the variable and the second model with age, bmi, and HbA1c_level as the variable. The first model that we were using are statistically high in significant and the secind model are too. The result show there are higher accuracy in the first model with 0.96 accuracy rather than the second model with 0.94 accuracy. Thus, the first model with all variable are outperforming second variable with three selected variable.