library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.1 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read.csv("data_diabetes.csv", header = TRUE, sep = ",")
dim(df)
## [1] 100000 9
head(df)
## gender age hypertension heart_disease smoking_history bmi HbA1c_level
## 1 Female 80 0 1 never 25.19 6.6
## 2 Female 54 0 0 No Info 27.32 6.6
## 3 Male 28 0 0 never 27.32 5.7
## 4 Female 36 0 0 current 23.45 5.0
## 5 Male 76 1 1 current 20.14 4.8
## 6 Female 20 0 0 never 27.32 6.6
## blood_glucose_level diabetes
## 1 140 0
## 2 80 0
## 3 158 0
## 4 155 0
## 5 155 0
## 6 85 0
str(df)
## 'data.frame': 100000 obs. of 9 variables:
## $ gender : chr "Female" "Female" "Male" "Female" ...
## $ age : num 80 54 28 36 76 20 44 79 42 32 ...
## $ hypertension : int 0 0 0 0 1 0 0 0 0 0 ...
## $ heart_disease : int 1 0 0 0 1 0 0 0 0 0 ...
## $ smoking_history : chr "never" "No Info" "never" "current" ...
## $ bmi : num 25.2 27.3 27.3 23.4 20.1 ...
## $ HbA1c_level : num 6.6 6.6 5.7 5 4.8 6.6 6.5 5.7 4.8 5 ...
## $ blood_glucose_level: int 140 80 158 155 155 85 200 85 145 100 ...
## $ diabetes : int 0 0 0 0 0 0 1 0 0 0 ...
BasicSummary <- function(df, dgts=3){
m <- ncol(df)
varNames <- colnames(df)
varType <- vector("character",m)
topLevel <- vector("character",m)
topCount <- vector("numeric",m)
missCount <- vector("numeric",m)
levels <- vector("numeric",m)
for(i in 1:m){
x<-df[,i]
varType[i] <- class(x)
xtab <- table(x,useNA="ifany")
levels[i] <- length(xtab)
nums <- as.numeric(xtab)
maxnum <- max(nums)
topCount[i] <- maxnum
maxIndex <- which.max(nums)
lvls <- names(xtab)
topLevel[i] <- lvls[maxIndex]
missIndex <- which((is.na(x)) | (x== "") | (x == " "))
missCount[i] <- length(missIndex)
}
n <- nrow(df)
topFrac <- round(topCount/n, digits = dgts)
missFrac <- round(missCount/n, digits = dgts)
summaryFrame <- data.frame(variable = varNames, type = varType, levels = levels, topLevel = topLevel, topCount = topCount, topFrac = topFrac, missFreq = missCount, missFrac = missFrac)
return(summaryFrame)
}
BasicSummary(df)
## variable type levels topLevel topCount topFrac missFreq
## 1 gender character 3 Female 58552 0.586 0
## 2 age numeric 102 80 5621 0.056 0
## 3 hypertension integer 2 0 92515 0.925 0
## 4 heart_disease integer 2 0 96058 0.961 0
## 5 smoking_history character 6 No Info 35816 0.358 0
## 6 bmi numeric 4247 27.32 25495 0.255 0
## 7 HbA1c_level numeric 18 6.6 8540 0.085 0
## 8 blood_glucose_level integer 18 130 7794 0.078 0
## 9 diabetes integer 2 0 91500 0.915 0
## missFrac
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
summary(df)
## gender age hypertension heart_disease
## Length:100000 Min. : 0.08 Min. :0.00000 Min. :0.00000
## Class :character 1st Qu.:24.00 1st Qu.:0.00000 1st Qu.:0.00000
## Mode :character Median :43.00 Median :0.00000 Median :0.00000
## Mean :41.89 Mean :0.07485 Mean :0.03942
## 3rd Qu.:60.00 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :80.00 Max. :1.00000 Max. :1.00000
## smoking_history bmi HbA1c_level blood_glucose_level
## Length:100000 Min. :10.01 Min. :3.500 Min. : 80.0
## Class :character 1st Qu.:23.63 1st Qu.:4.800 1st Qu.:100.0
## Mode :character Median :27.32 Median :5.800 Median :140.0
## Mean :27.32 Mean :5.528 Mean :138.1
## 3rd Qu.:29.58 3rd Qu.:6.200 3rd Qu.:159.0
## Max. :95.69 Max. :9.000 Max. :300.0
## diabetes
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.085
## 3rd Qu.:0.000
## Max. :1.000
duplicate_count <- sum(duplicated(df))
print(duplicate_count)
## [1] 3854
duplicate_rows <- duplicated(df)
df <- df[!duplicate_rows, ]
dim(df)
## [1] 96146 9
There are 3854 duplicates removed. The reason we removed the duplicate is to reduce the amount of inaccuracies
We’ll be looking at outliers in age, bmi, HbA1c_level, and blood_glucose_level
meanAge <- mean(df$age)
sdAge <- sd(df$age)
par(mfrow=c(1,2), pty="m")
plot(df$age, main='Three Sigma Edit Rule', xlab='Record',ylab='age',ylim=c(-50,150))
abline(h=meanAge+3*(sdAge),col='purple',lwd=3,lty='dotdash')
abline(h=meanAge-3*(sdAge),col='purple',lwd=3,lty='dotdash')
boxplot(df$age, main = "Boxplot of age", ylab = "Value")
outliers <- boxplot.stats(df$age)$out
points(rep(1, length(outliers)), outliers, col = "red", pch = 16)
legend("topright", legend = "Outliers", col = "red", pch = 16)
meanHb <- mean(df$HbA1c_level)
sdHb <- sd(df$HbA1c_level)
par(mfrow=c(1,2), pty="m")
plot(df$HbA1c_level, main='Three Sigma Edit Rule', xlab='Record',ylab='HbA1c',ylim=c(2,10))
abline(h=meanHb+3*(sdHb),col='purple',lwd=3,lty='dotdash')
abline(h=meanHb-3*(sdHb),col='purple',lwd=3,lty='dotdash')
boxplot(df$HbA1c_level, main = "Boxplot of HbA1c_level", ylab = "Value")
outliers <- boxplot.stats(df$HbA1c_level)$out
points(rep(1, length(outliers)), outliers, col = "red", pch = 16)
legend("topright", legend = "Outliers", col = "red", pch = 16)
meanBlood <- mean(df$blood_glucose_level)
sdBlood <- sd(df$blood_glucose_level)
par(mfrow=c(1,2), pty="m")
plot(df$blood_glucose_level, main='Three Sigma Edit Rule', xlab='Record',ylab='Blood glucose level',ylim=c(0,350))
abline(h=meanBlood+3*(sdBlood),col='purple',lwd=3,lty='dotdash')
abline(h=meanBlood-3*(sdBlood),col='purple',lwd=3,lty='dotdash')
boxplot(df$blood_glucose_level, main = "Boxplot of blood glucose level", ylab = "Value")
outliers <- boxplot.stats(df$blood_glucose_level)$out
points(rep(1, length(outliers)), outliers, col = "red", pch = 16)
legend("topright", legend = "Outliers", col = "red", pch = 16)
par(mfrow = c(1,3))
qqnorm(df$age,main="Age Normality")
qqline(df$age)
qqnorm(df$HbA1c_level,main="HbA1c Normality")
qqline(df$HbA1c_level)
qqnorm(df$blood_glucose_level,main="Blood glucose level Normality")
qqline(df$blood_glucose_level)
##EDA
plot(df$age, df$heart_disease, main="Relation between age and heart disease")
plot(df$bmi, df$heart_disease, main="Relation between BMI and Heart disease")
plot(df$age, df$diabetes, main="Relation between age and diabetes")
plot(df$bmi, df$diabetes, main="Relation between BMI and diabetes")
plot(df$blood_glucose_level, df$diabetes, main="Relation between blood glucose level and diabetes")
plot(df$HbA1c_level, df$diabetes, main="Relation between HbA1c level and diabetes")
Based on the data obtained above, age has an effect on heart problems,
but has no effect on diabetes. The BMI of people who have heart problems
and diabetes have a nearly identical range, between 10 and 60. People
who have diabetes have higher blood glucose levels and HbA1c levels than
those who don’t have diabetes.
##Data anomalies
freq_table2 <- table(df$smoking_history)
freq_table2
##
## current ever former never No Info not current
## 9197 3998 9299 34398 32887 6367
Because there are a lot of “no info” part on the smoking_history, it is better to drop the column.
#Dropping smoking_history table
df <- df[, -5]
head(df)
## gender age hypertension heart_disease bmi HbA1c_level blood_glucose_level
## 1 Female 80 0 1 25.19 6.6 140
## 2 Female 54 0 0 27.32 6.6 80
## 3 Male 28 0 0 27.32 5.7 158
## 4 Female 36 0 0 23.45 5.0 155
## 5 Male 76 1 1 20.14 4.8 155
## 6 Female 20 0 0 27.32 6.6 85
## diabetes
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
freq_table1 <- table(df$gender)
freq_table1
##
## Female Male Other
## 56161 39967 18
We want to encode the gender into 0 and 1 Because the “Other” gender is not specified it is better to drop evey record with the gender “other”
df <- df[df$gender != 'Other', ]
freq_table1 <- table(df$gender)
freq_table1
##
## Female Male
## 56161 39967
encode the gender column into 0 and 1
df <- df %>%
mutate(gender = recode(gender, "Female" = 0, "Male" = 1))
head(df)
## gender age hypertension heart_disease bmi HbA1c_level blood_glucose_level
## 1 0 80 0 1 25.19 6.6 140
## 2 0 54 0 0 27.32 6.6 80
## 3 1 28 0 0 27.32 5.7 158
## 4 0 36 0 0 23.45 5.0 155
## 5 1 76 1 1 20.14 4.8 155
## 6 0 20 0 0 27.32 6.6 85
## diabetes
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
freq_table1 <- table(df$gender)
freq_table1
##
## 0 1
## 56161 39967
str(df)
## 'data.frame': 96128 obs. of 8 variables:
## $ gender : num 0 0 1 0 1 0 0 0 1 0 ...
## $ age : num 80 54 28 36 76 20 44 79 42 32 ...
## $ hypertension : int 0 0 0 0 1 0 0 0 0 0 ...
## $ heart_disease : int 1 0 0 0 1 0 0 0 0 0 ...
## $ bmi : num 25.2 27.3 27.3 23.4 20.1 ...
## $ HbA1c_level : num 6.6 6.6 5.7 5 4.8 6.6 6.5 5.7 4.8 5 ...
## $ blood_glucose_level: int 140 80 158 155 155 85 200 85 145 100 ...
## $ diabetes : int 0 0 0 0 0 0 1 0 0 0 ...
all the variable is in the right data type
library(corrplot)
## corrplot 0.92 loaded
cor_matrix <- cor(df)
corrplot(cor_matrix, method = "color")
print(cor_matrix)
## gender age hypertension heart_disease
## gender 1.00000000 -0.02866998 0.01419567 0.07851230
## age -0.02866998 1.00000000 0.25729732 0.23844929
## hypertension 0.01419567 0.25729732 1.00000000 0.11997218
## heart_disease 0.07851230 0.23844929 0.11997218 1.00000000
## bmi -0.02349047 0.34477912 0.14812419 0.06138153
## HbA1c_level 0.01992833 0.10670828 0.08144289 0.06814212
## blood_glucose_level 0.01783067 0.11432338 0.08484144 0.07083787
## diabetes 0.03783691 0.26491789 0.19569612 0.17070111
## bmi HbA1c_level blood_glucose_level diabetes
## gender -0.02349047 0.01992833 0.01783067 0.03783691
## age 0.34477912 0.10670828 0.11432338 0.26491789
## hypertension 0.14812419 0.08144289 0.08484144 0.19569612
## heart_disease 0.06138153 0.06814212 0.07083787 0.17070111
## bmi 1.00000000 0.08444296 0.09259268 0.21495125
## HbA1c_level 0.08444296 1.00000000 0.17171716 0.40644594
## blood_glucose_level 0.09259268 0.17171716 1.00000000 0.42436644
## diabetes 0.21495125 0.40644594 0.42436644 1.00000000
From this pearson correlation, as we can see the highest correlation between diabetes and blood_glucose_level (0.42436644). With the second highest is between diabetes and HbA1c_level (0.40644594)
library(mlbench)
library(dplyr)
library(rpart)
library(rpart.plot)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
## The following object is masked from 'package:base':
##
## Recall
set.seed(123)
n <- nrow(df)
train <- sample(n, round(0.7 * n))
DiaTrain <- df[train,]
DiaValidation <- df[-train,]
head(DiaTrain)
## gender age hypertension heart_disease bmi HbA1c_level
## 52822 1 36 0 0 27.32 5.8
## 59314 1 80 0 0 27.32 3.5
## 2987 1 13 0 0 27.32 4.8
## 30326 0 59 0 0 37.55 6.0
## 99044 0 6 0 0 16.63 4.0
## 70333 0 39 0 0 33.60 4.0
## blood_glucose_level diabetes
## 52822 100 0
## 59314 160 0
## 2987 80 0
## 30326 100 0
## 99044 90 0
## 70333 140 0
head(DiaValidation)
## gender age hypertension heart_disease bmi HbA1c_level blood_glucose_level
## 3 1 28 0 0 27.32 5.7 158
## 4 0 36 0 0 23.45 5.0 155
## 9 1 42 0 0 33.64 4.8 145
## 13 0 78 0 0 36.05 5.0 130
## 19 0 42 0 0 27.32 5.7 80
## 24 0 72 0 1 27.94 6.5 130
## diabetes
## 3 0
## 4 0
## 9 0
## 13 0
## 19 0
## 24 0
dim(DiaTrain)
## [1] 67290 8
dim(DiaValidation)
## [1] 28838 8
For the first model we are using all the variable
logisticFull <- glm(diabetes ~., family="binomial", data = df)
summary(logisticFull)
##
## Call:
## glm(formula = diabetes ~ ., family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7576 -0.2029 -0.0702 -0.0183 3.6911
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.732e+01 2.898e-01 -94.278 < 2e-16 ***
## gender 2.671e-01 3.587e-02 7.446 9.59e-14 ***
## age 4.708e-02 1.102e-03 42.701 < 2e-16 ***
## hypertension 7.601e-01 4.689e-02 16.210 < 2e-16 ***
## heart_disease 7.096e-01 6.037e-02 11.754 < 2e-16 ***
## bmi 8.938e-02 2.528e-03 35.359 < 2e-16 ***
## HbA1c_level 2.334e+00 3.577e-02 65.261 < 2e-16 ***
## blood_glucose_level 3.329e-02 4.823e-04 69.014 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57377 on 96127 degrees of freedom
## Residual deviance: 22511 on 96120 degrees of freedom
## AIC: 22527
##
## Number of Fisher Scoring iterations: 8
all the variables are highly statistically significant.
According to cdc people are at risk of type 2 diabetes if: have prediabetes, are overweight, are 45 years or older, have family relatives with type 2 diabetes, and physically not active. Thus, for the second model We choose some selected variable such as bmi, age, and HbA1c_level.
logistic1 <- glm(diabetes ~ age + bmi + HbA1c_level, family="binomial", data = df)
summary(logistic1)
##
## Call:
## glm(formula = diabetes ~ age + bmi + HbA1c_level, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0878 -0.3071 -0.1103 -0.0303 3.7069
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.213e+01 2.248e-01 -98.43 <2e-16 ***
## age 5.331e-02 9.198e-04 57.96 <2e-16 ***
## bmi 9.296e-02 2.126e-03 43.72 <2e-16 ***
## HbA1c_level 2.318e+00 3.062e-02 75.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 57377 on 96127 degrees of freedom
## Residual deviance: 30870 on 96124 degrees of freedom
## AIC: 30878
##
## Number of Fisher Scoring iterations: 8
phatFull <- predict(logisticFull, newdata = df, type = "response")
phatSelect <- predict(logistic1, newdata = df, type = "response")
ROCFull <- roc(df$diabetes, phatFull, plot=TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROCSelect <- roc(df$diabetes, phatSelect, plot=TRUE, print.auc = TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
### AUC
AUC(phatFull, DiaValidation$diabetes)
## [1] 5.990376
AUC(phatSelect, DiaValidation$diabetes)
## [1] 5.994437
threshold <- table(df$diabetes, phatFull > 0.5)
accuracy <- round(sum(diag(threshold)) / sum(threshold), 2)
threshold
##
## FALSE TRUE
## 0 86822 824
## 1 3139 5343
sprintf("Accuracy is %s", accuracy)
## [1] "Accuracy is 0.96"
threshold <- table(df$diabetes, phatSelect > 0.5)
accuracy <- round(sum(diag(threshold)) / sum(threshold), 2)
threshold
##
## FALSE TRUE
## 0 87079 567
## 1 4863 3619
sprintf("Accuracy is %s", accuracy)
## [1] "Accuracy is 0.94"
For this logistic regression, We are using two kind of model, the first model with all the variable and the second model with age, bmi, and HbA1c_level as the variable. The first model that we were using are statistically high in significant and the secind model are too. The result show there are higher accuracy in the first model with 0.96 accuracy rather than the second model with 0.94 accuracy. Thus, the first model with all variable are outperforming second variable with three selected variable.