library(readr)
library(purrr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(corrplot)
## corrplot 0.92 loaded
library(rmarkdown)
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective is to predict based on diagnostic measurements whether a patient has diabetes.
Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
Pregnancies: Number of times pregnant
Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
BloodPressure: Diastolic blood pressure (mm Hg)
SkinThickness: Triceps skin fold thickness (mm)
Insulin: 2-Hour serum insulin (mu U/ml)
BMI: Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction: Diabetes pedigree function, a score for the likelihood of diabetes based on family history
Age: Age (years)
Outcome: Class variable (0 or 1). If patient had diabetes 1 = Yes, 0 = No.
The data were imported into R, and head() was run to view the first 6 rows. Summary statistics for every column follow.
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
We also looked at the dimensions of the data:
# Dimensions of the data frame: number of rows, then number of columns.
dim(diabetes)
## [1] 768 9
Next, we checked whether the data contained any missing (NA) values or any duplicated rows.
# Count the missing (NA) entries in each column; the result is a named list
# holding one count per column.
map(diabetes, function(column) sum(is.na(column)))
## $Pregnancies
## [1] 0
##
## $Glucose
## [1] 0
##
## $BloodPressure
## [1] 0
##
## $SkinThickness
## [1] 0
##
## $Insulin
## [1] 0
##
## $BMI
## [1] 0
##
## $DiabetesPedigreeFunction
## [1] 0
##
## $Age
## [1] 0
##
## $Outcome
## [1] 0
# Count the rows that are exact duplicates of an earlier row.
sum(duplicated(diabetes))
## [1] 0
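Since blood pressure, BMI, skin thickness and glucose cannot be zero (0), all zero values in these columns were replaced with the column mean, because dropping those rows would change the data. An age category column (age_cat) was also added, as the summary below shows.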
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 44.00 Min. : 24.00 Min. : 7.00
## 1st Qu.: 1.000 1st Qu.: 99.75 1st Qu.: 64.00 1st Qu.:20.54
## Median : 3.000 Median :117.00 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :121.68 Mean : 72.25 Mean :26.61
## 3rd Qu.: 6.000 3rd Qu.:140.25 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.00 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. :18.20 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.50 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :32.45 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome age_cat
## Min. :0.000 Length:768
## 1st Qu.:0.000 Class :character
## Median :0.000 Mode :character
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Number of patients in each age category, largest category first.
# tally() on grouped data is a summarise() with n(); `name` sets the output
# column and `sort = TRUE` orders the rows by descending count.
grouped_ages <- diabetes %>%
  group_by(age_cat) %>%
  tally(name = "count", sort = TRUE)
grouped_ages
## # A tibble: 6 × 2
## age_cat count
## <chr> <int>
## 1 Below_30 396
## 2 30s 165
## 3 40s 118
## 4 50s 57
## 5 60s 29
## 6 Above_70 3
# Bin BMI into labelled weight categories and store them in BMI_cat.
# case_when() uses the first condition that is TRUE for each row, so every
# branch only needs its upper bound: values below the previous cut-off have
# already been matched by an earlier branch.
# NOTE(review): the 25-30 band is labelled "Obese"; that range is
# conventionally called "Overweight" - confirm the intended wording before
# changing the label, since outputs and later code may rely on it.
diabetes <- diabetes %>%
  mutate(
    BMI_cat = case_when(
      BMI < 18.5 ~ "Underweight",
      BMI < 25 ~ "Normal",
      BMI < 30 ~ "Obese",
      BMI < 35 ~ "Moderate Obese",
      BMI < 40 ~ "Severe Obese",
      BMI < 45 ~ "Very severe Obese",
      BMI < 50 ~ "Morbid Obese",
      BMI <= 60 ~ "Super Obese",
      BMI > 60 ~ "Hyper obese",
      TRUE ~ "Unknown" # fallback when no comparison is TRUE (e.g. NA BMI)
    )
  )
# Number of patients in each BMI category, largest category first.
# tally() on grouped data is a summarise() with n(); `name` sets the output
# column and `sort = TRUE` orders the rows by descending count.
grouped_counts <- diabetes %>%
  group_by(BMI_cat) %>%
  tally(name = "count", sort = TRUE)
grouped_counts
## # A tibble: 9 × 2
## BMI_cat count
## <chr> <int>
## 1 Moderate Obese 235
## 2 Obese 179
## 3 Severe Obese 150
## 4 Normal 102
## 5 Very severe Obese 62
## 6 Morbid Obese 27
## 7 Super Obese 8
## 8 Underweight 4
## 9 Hyper obese 1
# Recode Outcome as a factor: 1 becomes "Yes" and 0 becomes "No", with "Yes"
# as the first level.
diabetes[["Outcome"]] <- factor(
  diabetes[["Outcome"]],
  levels = c(1, 0),
  labels = c("Yes", "No")
)

# Bar chart with one bar per Outcome level, coloured by Outcome.
ggplot(data = diabetes, mapping = aes(x = Outcome, fill = Outcome)) +
  geom_bar() +
  scale_fill_manual(values = c(Yes = "blue", No = "red")) +
  labs(title = "Distribution of Outcome", y = NULL) + # y = NULL: no y title
  theme_minimal() +
  theme(
    # Drop both sets of grid lines.
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    # Keep the x-axis tick labels horizontal and centred.
    axis.text.x = element_text(angle = 0, hjust = 0.5)
  )
# Bar chart of patient counts per age category, with each bar split by
# Outcome through the fill aesthetic.
ggplot(diabetes, aes(x = age_cat, fill = Outcome)) +
  geom_bar() +
  # age_cat is a character column, so ggplot would otherwise sort the bars
  # alphabetically ("30s", ..., "Above_70", "Below_30"). Fix the axis order so
  # the categories run from youngest to oldest.
  scale_x_discrete(
    limits = c("Below_30", "30s", "40s", "50s", "60s", "Above_70")
  ) +
  scale_fill_manual(values = c("Yes" = "blue", "No" = "red")) +
  labs(
    title = "Age Category vs. Outcome",
    x = " ",
    y = ""
  ) +
  theme_minimal() +
  theme(
    panel.grid.major = element_blank(), # Hide major grid lines
    panel.grid.minor = element_blank(), # Hide minor grid lines
    axis.text.x = element_text(angle = 0, hjust = 0.5) # Horizontal, centred
  )