This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
Step 1. Data
This synthetic dataset is provided on Kaggle by Rabie El Kharoua for educational purposes. It was last updated on June 11, 2024, and can be accessed at: https://www.kaggle.com/datasets/rabieelkharoua/diabetes-health-dataset-analysis?resource=download
# Load the diabetes dataset from the working directory and preview
# the first six records.
diabetes <- read.csv(file = "diabetes_data.csv")
head(diabetes, n = 6L)
## PatientID Age Gender Ethnicity SocioeconomicStatus EducationLevel BMI
## 1 6000 44 0 1 2 1 32.98528
## 2 6001 51 1 0 1 2 39.91676
## 3 6002 89 1 0 1 3 19.78225
## 4 6003 21 1 1 1 2 32.37688
## 5 6004 27 1 0 1 3 16.80860
## 6 6005 65 0 0 0 0 15.82081
## Smoking AlcoholConsumption PhysicalActivity DietQuality SleepQuality
## 1 1 4.499365 2.443385 4.898831 4.049885
## 2 0 1.578919 8.301264 8.941093 7.508150
## 3 0 1.177301 6.103395 7.722543 7.708387
## 4 1 1.714621 8.645465 4.804044 6.286548
## 5 0 15.462549 4.629383 2.532756 9.771125
## 6 1 17.781024 9.252522 2.309158 9.869401
## FamilyHistoryDiabetes GestationalDiabetes PolycysticOvarySyndrome
## 1 1 1 0
## 2 0 0 0
## 3 1 0 0
## 4 1 1 0
## 5 0 0 0
## 6 0 0 0
## PreviousPreDiabetes Hypertension SystolicBP DiastolicBP FastingBloodSugar
## 1 0 0 93 73 163.68716
## 2 0 0 165 99 188.34707
## 3 0 0 119 91 127.70365
## 4 1 0 169 87 82.68842
## 5 0 0 165 69 90.74339
## 6 0 0 144 64 119.59384
## HbA1c SerumCreatinine BUNLevels CholesterolTotal CholesterolLDL
## 1 9.283631 2.6656067 28.190147 254.2707 86.99363
## 2 7.326870 4.1721767 32.149491 155.3588 110.05611
## 3 4.083426 1.9731682 10.018375 231.6089 62.03579
## 4 6.516645 3.0577965 44.123281 176.5924 68.23841
## 5 5.607222 4.1503535 7.757117 157.3441 66.47622
## 6 8.523665 0.7330912 35.797135 250.0019 65.20200
## CholesterolHDL CholesterolTriglycerides AntihypertensiveMedications Statins
## 1 70.80147 190.33583 0 0
## 2 39.90011 81.17247 0 0
## 3 62.48067 279.80907 1 1
## 4 46.97782 112.75140 0 0
## 5 40.05975 381.52878 1 1
## 6 24.70504 395.49481 0 1
## AntidiabeticMedications FrequentUrination ExcessiveThirst
## 1 1 0 0
## 2 0 0 0
## 3 0 0 0
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
## UnexplainedWeightLoss FatigueLevels BlurredVision SlowHealingSores
## 1 0 9.534169 0 0
## 2 0 0.123214 0 0
## 3 0 9.643320 0 0
## 4 0 3.403557 0 0
## 5 0 2.924687 0 0
## 6 0 1.973642 0 0
## TinglingHandsFeet QualityOfLifeScore HeavyMetalsExposure
## 1 1 73.76511 0
## 2 0 91.44575 0
## 3 0 54.48574 0
## 4 0 77.86676 0
## 5 0 37.73181 0
## 6 0 86.37897 0
## OccupationalExposureChemicals WaterQuality MedicalCheckupsFrequency
## 1 0 0 1.782724
## 2 0 1 3.381070
## 3 0 0 2.701019
## 4 0 1 1.409056
## 5 0 0 1.218452
## 6 0 0 1.535161
## MedicationAdherence HealthLiteracy Diagnosis DoctorInCharge
## 1 4.486980 7.211349 1 Confidential
## 2 5.961705 5.024612 1 Confidential
## 3 8.950821 7.034944 0 Confidential
## 4 3.124769 4.717774 0 Confidential
## 5 6.977741 7.887940 0 Confidential
## 6 9.682226 2.744281 0 Confidential
Step 2. Data Quality
# Inspect the dimensions of the data frame and the storage type of each column.
str(object = diabetes)
## 'data.frame': 1879 obs. of 46 variables:
## $ PatientID : int 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 ...
## $ Age : int 44 51 89 21 27 65 61 74 54 82 ...
## $ Gender : int 0 1 1 1 1 0 1 1 0 1 ...
## $ Ethnicity : int 1 0 0 1 0 0 2 3 0 0 ...
## $ SocioeconomicStatus : int 2 1 1 1 1 0 1 0 1 1 ...
## $ EducationLevel : int 1 2 3 2 3 0 3 3 2 1 ...
## $ BMI : num 33 39.9 19.8 32.4 16.8 ...
## $ Smoking : int 1 0 0 1 0 1 0 0 0 1 ...
## $ AlcoholConsumption : num 4.5 1.58 1.18 1.71 15.46 ...
## $ PhysicalActivity : num 2.44 8.3 6.1 8.65 4.63 ...
## $ DietQuality : num 4.9 8.94 7.72 4.8 2.53 ...
## $ SleepQuality : num 4.05 7.51 7.71 6.29 9.77 ...
## $ FamilyHistoryDiabetes : int 1 0 1 1 0 0 0 0 0 0 ...
## $ GestationalDiabetes : int 1 0 0 1 0 0 0 1 0 0 ...
## $ PolycysticOvarySyndrome : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PreviousPreDiabetes : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Hypertension : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SystolicBP : int 93 165 119 169 165 144 109 128 172 95 ...
## $ DiastolicBP : int 73 99 91 87 69 64 96 98 66 85 ...
## $ FastingBloodSugar : num 163.7 188.3 127.7 82.7 90.7 ...
## $ HbA1c : num 9.28 7.33 4.08 6.52 5.61 ...
## $ SerumCreatinine : num 2.67 4.17 1.97 3.06 4.15 ...
## $ BUNLevels : num 28.19 32.15 10.02 44.12 7.76 ...
## $ CholesterolTotal : num 254 155 232 177 157 ...
## $ CholesterolLDL : num 87 110.1 62 68.2 66.5 ...
## $ CholesterolHDL : num 70.8 39.9 62.5 47 40.1 ...
## $ CholesterolTriglycerides : num 190.3 81.2 279.8 112.8 381.5 ...
## $ AntihypertensiveMedications : int 0 0 1 0 1 0 0 0 0 0 ...
## $ Statins : int 0 0 1 0 1 1 0 0 0 0 ...
## $ AntidiabeticMedications : int 1 0 0 1 0 0 0 0 0 0 ...
## $ FrequentUrination : int 0 0 0 0 0 0 1 0 0 0 ...
## $ ExcessiveThirst : int 0 0 0 0 0 0 0 0 0 1 ...
## $ UnexplainedWeightLoss : int 0 0 0 0 0 0 0 1 0 0 ...
## $ FatigueLevels : num 9.534 0.123 9.643 3.404 2.925 ...
## $ BlurredVision : int 0 0 0 0 0 0 0 0 1 1 ...
## $ SlowHealingSores : int 0 0 0 0 0 0 0 0 0 0 ...
## $ TinglingHandsFeet : int 1 0 0 0 0 0 0 0 0 0 ...
## $ QualityOfLifeScore : num 73.8 91.4 54.5 77.9 37.7 ...
## $ HeavyMetalsExposure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OccupationalExposureChemicals: int 0 0 0 0 0 0 0 0 0 0 ...
## $ WaterQuality : int 0 1 0 1 0 0 0 0 0 0 ...
## $ MedicalCheckupsFrequency : num 1.78 3.38 2.7 1.41 1.22 ...
## $ MedicationAdherence : num 4.49 5.96 8.95 3.12 6.98 ...
## $ HealthLiteracy : num 7.21 5.02 7.03 4.72 7.89 ...
## $ Diagnosis : int 1 1 0 0 0 0 0 0 1 0 ...
## $ DoctorInCharge : chr "Confidential" "Confidential" "Confidential" "Confidential" ...
# Per-column descriptive statistics (five-number summary and mean for
# numeric columns; length/class/mode for character columns).
summary(object = diabetes)
## PatientID Age Gender Ethnicity
## Min. :6000 Min. :20.00 Min. :0.0000 Min. :0.0000
## 1st Qu.:6470 1st Qu.:38.00 1st Qu.:0.0000 1st Qu.:0.0000
## Median :6939 Median :55.00 Median :0.0000 Median :0.0000
## Mean :6939 Mean :55.04 Mean :0.4875 Mean :0.7557
## 3rd Qu.:7408 3rd Qu.:73.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :7878 Max. :90.00 Max. :1.0000 Max. :3.0000
## SocioeconomicStatus EducationLevel BMI Smoking
## Min. :0.000 Min. :0.000 Min. :15.03 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:21.47 1st Qu.:0.0000
## Median :1.000 Median :2.000 Median :27.72 Median :0.0000
## Mean :0.992 Mean :1.699 Mean :27.69 Mean :0.2815
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:33.86 3rd Qu.:1.0000
## Max. :2.000 Max. :3.000 Max. :40.00 Max. :1.0000
## AlcoholConsumption PhysicalActivity DietQuality SleepQuality
## Min. :9.276e-04 Min. :0.004089 Min. :0.0008853 Min. :4.004
## 1st Qu.:4.790e+00 1st Qu.:2.751022 1st Qu.:2.4768017 1st Qu.:5.482
## Median :1.017e+01 Median :5.249002 Median :4.8885659 Median :7.095
## Mean :1.010e+01 Mean :5.200790 Mean :4.8958011 Mean :7.021
## 3rd Qu.:1.529e+01 3rd Qu.:7.671402 3rd Qu.:7.3560580 3rd Qu.:8.524
## Max. :2.000e+01 Max. :9.993893 Max. :9.9986774 Max. :9.989
## FamilyHistoryDiabetes GestationalDiabetes PolycysticOvarySyndrome
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.2384 Mean :0.09952 Mean :0.0447
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## PreviousPreDiabetes Hypertension SystolicBP DiastolicBP
## Min. :0.0000 Min. :0.0000 Min. : 90.0 Min. : 60.00
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:112.0 1st Qu.: 75.00
## Median :0.0000 Median :0.0000 Median :134.0 Median : 90.00
## Mean :0.1538 Mean :0.1533 Mean :134.1 Mean : 89.86
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:156.0 3rd Qu.:105.00
## Max. :1.0000 Max. :1.0000 Max. :179.0 Max. :119.00
## FastingBloodSugar HbA1c SerumCreatinine BUNLevels
## Min. : 70.07 Min. :4.003 Min. :0.5006 Min. : 5.01
## 1st Qu.:102.34 1st Qu.:5.444 1st Qu.:1.6545 1st Qu.:17.17
## Median :137.40 Median :7.096 Median :2.8551 Median :28.19
## Mean :135.20 Mean :6.976 Mean :2.7846 Mean :27.80
## 3rd Qu.:167.36 3rd Qu.:8.423 3rd Qu.:3.8981 3rd Qu.:38.51
## Max. :199.94 Max. :9.991 Max. :4.9940 Max. :49.98
## CholesterolTotal CholesterolLDL CholesterolHDL CholesterolTriglycerides
## Min. :150.1 Min. : 50.06 Min. :20.01 Min. : 50.15
## 1st Qu.:186.9 1st Qu.: 87.81 1st Qu.:40.01 1st Qu.:140.87
## Median :225.1 Median :124.92 Median :60.46 Median :228.42
## Mean :225.0 Mean :124.66 Mean :60.06 Mean :227.39
## 3rd Qu.:263.5 3rd Qu.:161.62 3rd Qu.:80.06 3rd Qu.:313.41
## Max. :300.0 Max. :199.90 Max. :99.96 Max. :399.89
## AntihypertensiveMedications Statins AntidiabeticMedications
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.2858 Mean :0.4039 Mean :0.2783
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## FrequentUrination ExcessiveThirst UnexplainedWeightLoss FatigueLevels
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 0.004977
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 2.417748
## Median :0.0000 Median :0.0000 Median :0.0000 Median : 4.851914
## Mean :0.1974 Mean :0.1932 Mean :0.1096 Mean : 4.949003
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 7.569772
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. : 9.999979
## BlurredVision SlowHealingSores TinglingHandsFeet QualityOfLifeScore
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. : 0.00239
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:23.97410
## Median :0.00000 Median :0.0000 Median :0.0000 Median :47.51969
## Mean :0.09526 Mean :0.1027 Mean :0.1112 Mean :48.50864
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:72.88318
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :99.78853
## HeavyMetalsExposure OccupationalExposureChemicals WaterQuality
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.05216 Mean :0.1032 Mean :0.2006
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
## MedicalCheckupsFrequency MedicationAdherence HealthLiteracy
## Min. :0.004013 Min. :0.005384 Min. :0.0003622
## 1st Qu.:1.057802 1st Qu.:2.420024 1st Qu.:2.4101127
## Median :1.987170 Median :4.843886 Median :5.0352081
## Mean :1.997101 Mean :4.957539 Mean :5.0117365
## 3rd Qu.:2.946019 3rd Qu.:7.513933 3rd Qu.:7.5868647
## Max. :3.999715 Max. :9.997165 Max. :9.9930290
## Diagnosis DoctorInCharge
## Min. :0.0000 Length:1879
## 1st Qu.:0.0000 Class :character
## Median :0.0000 Mode :character
## Mean :0.4002
## 3rd Qu.:1.0000
## Max. :1.0000
# Missing-data audit: first a single yes/no answer for the whole data frame,
# then the number of missing values in each individual column.
anyNA(diabetes)
## [1] FALSE
vapply(diabetes, function(column) sum(is.na(column)), numeric(1))
## PatientID Age
## 0 0
## Gender Ethnicity
## 0 0
## SocioeconomicStatus EducationLevel
## 0 0
## BMI Smoking
## 0 0
## AlcoholConsumption PhysicalActivity
## 0 0
## DietQuality SleepQuality
## 0 0
## FamilyHistoryDiabetes GestationalDiabetes
## 0 0
## PolycysticOvarySyndrome PreviousPreDiabetes
## 0 0
## Hypertension SystolicBP
## 0 0
## DiastolicBP FastingBloodSugar
## 0 0
## HbA1c SerumCreatinine
## 0 0
## BUNLevels CholesterolTotal
## 0 0
## CholesterolLDL CholesterolHDL
## 0 0
## CholesterolTriglycerides AntihypertensiveMedications
## 0 0
## Statins AntidiabeticMedications
## 0 0
## FrequentUrination ExcessiveThirst
## 0 0
## UnexplainedWeightLoss FatigueLevels
## 0 0
## BlurredVision SlowHealingSores
## 0 0
## TinglingHandsFeet QualityOfLifeScore
## 0 0
## HeavyMetalsExposure OccupationalExposureChemicals
## 0 0
## WaterQuality MedicalCheckupsFrequency
## 0 0
## MedicationAdherence HealthLiteracy
## 0 0
## Diagnosis DoctorInCharge
## 0 0
library(naniar)
## Warning: package 'naniar' was built under R version 4.5.1
# Visual confirmation of the missing-value counts, one row per variable.
naniar::gg_miss_var(x = diabetes)
Step 3. Data Visualization
# Class balance of the dependent variable
# (Diagnosis 0 = no_diabetes; Diagnosis 1 = diabetes).
# Diagnosis is a two-level categorical outcome, so the counts per class are
# drawn as a bar chart. hist() would bin the 0/1 codes as if they were a
# continuous measurement, and it fails outright once Diagnosis is a factor.
barplot(
  table(diabetes$Diagnosis),
  col = "blue",
  xlab = "Diagnosis",
  ylab = "Count"
)
# Box plots comparing the distribution of selected predictors between the
# two diagnosis groups; each predictor gets its own plot and fill colour.
boxplot(
  PhysicalActivity ~ Diagnosis,
  data = diabetes,
  col = "green"
)
boxplot(
  FastingBloodSugar ~ Diagnosis,
  data = diabetes,
  col = "red"
)
boxplot(
  Age ~ Diagnosis,
  data = diabetes,
  col = "yellow"
)
# Scatter plot of fasting blood sugar against age, coloured by diagnosis.
# Diagnosis 0 = no_diabetes; Diagnosis 1 = diabetes
# Diagnosis is wrapped in factor() so ggplot2 draws a discrete two-colour
# legend; a numeric 0/1 column would otherwise be mapped to a continuous
# colour gradient, which is misleading for a two-class outcome.
library(ggplot2)
ggplot(
  diabetes,
  aes(x = Age, y = FastingBloodSugar, color = factor(Diagnosis))
) +
  geom_point() +
  labs(color = "Diagnosis") +
  theme(axis.title = element_text(color = "blue"))
# Age vs. fasting blood sugar, one panel per gender, coloured by diagnosis.
# Gender: 0 means male, 1 means female (shown as facet strip labels).
# Diagnosis 0 = no_diabetes; Diagnosis 1 = diabetes
# factor() forces a discrete colour legend instead of the continuous
# gradient ggplot2 would use for a numeric 0/1 column.
ggplot(
  diabetes,
  aes(x = Age, y = FastingBloodSugar, color = factor(Diagnosis))
) +
  geom_point() +
  facet_wrap(
    ~Gender,
    labeller = as_labeller(c("0" = "Male", "1" = "Female"))
  ) +
  labs(
    title = "Age vs. Fasting Blood Sugar by Gender and Diagnosis",
    color = "Diagnosis"
  )
# Bar chart of patient counts per ethnicity, split by diagnosis.
# Diagnosis 0 = no_diabetes; Diagnosis 1 = diabetes
# Ethnicity: 0=Caucasian,1=African American,2=Asian,3=Other
diabetes$Diagnosis <- as.factor(diabetes$Diagnosis)
# NOTE: this variable masks grDevices::palette(); the name is kept so that any
# later code referring to `palette` keeps working.
palette <- c("0" = "#377EB8", "1" = "#E41A1C")
# Ethnicity is stored as integer codes. Mapping it as a labelled factor gives a
# discrete axis with readable category names instead of a continuous axis
# showing the raw numeric codes.
ethnicity_labels <- c("Caucasian", "African American", "Asian", "Other")
ggplot(
  diabetes,
  aes(
    x = factor(Ethnicity, levels = 0:3, labels = ethnicity_labels),
    fill = Diagnosis
  )
) +
  geom_bar(position = "dodge") +
  labs(x = "Ethnicity", y = "Count", fill = "Diagnosis") +
  scale_fill_manual(values = palette)