# Inspect the structure of `data`: column names, types, and leading values.
str(data)
## spc_tbl_ [5,292 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:5292] 48 18 21 25 78 60 38 23 22 18 ...
## $ Gender : chr [1:5292] "Male" "Other" "Other" "Female" ...
## $ BMI : num [1:5292] 35.5 28.7 30 25.6 38.8 19.2 36.4 21.4 18.9 24.4 ...
## $ Family_History : chr [1:5292] "No" "Yes" "Yes" "No" ...
## $ Physical_Activity : chr [1:5292] "High" "Medium" "High" "Medium" ...
## $ Diet_Type : chr [1:5292] "Non-Vegetarian" "Non-Vegetarian" "Non-Vegetarian" "Vegetarian" ...
## $ Smoking_Status : chr [1:5292] "Never" "Current" "Current" "Former" ...
## $ Alcohol_Intake : chr [1:5292] "None" "Moderate" "Moderate" "Moderate" ...
## $ Stress_Level : chr [1:5292] "Medium" "High" "High" "High" ...
## $ Hypertension : chr [1:5292] "Yes" "No" "Yes" "Yes" ...
## $ Cholesterol_Level : num [1:5292] 112 131 295 159 215 ...
## $ Fasting_Blood_Sugar : num [1:5292] 141 83.1 159.9 133.3 164.9 ...
## $ Postprandial_Blood_Sugar : num [1:5292] 166 143 212 225 218 ...
## $ HBA1C : num [1:5292] 8.9 5.9 4.8 11.9 11.6 4.7 7.6 9.2 8.7 11.7 ...
## $ Heart_Rate : num [1:5292] 94 68 70 78 65 69 74 81 95 96 ...
## $ Waist_Hip_Ratio : num [1:5292] 0.91 0.96 0.88 0.98 0.85 0.88 1.11 1.16 0.87 0.73 ...
## $ Urban_Rural : chr [1:5292] "Urban" "Rural" "Rural" "Rural" ...
## $ Health_Insurance : chr [1:5292] "No" "Yes" "No" "No" ...
## $ Regular_Checkups : chr [1:5292] "No" "Yes" "No" "No" ...
## $ Medication_For_Chronic_Conditions: chr [1:5292] "No" "No" "Yes" "Yes" ...
## $ Pregnancies : num [1:5292] 0 0 0 1 0 0 2 0 0 9 ...
## $ Polycystic_Ovary_Syndrome : chr [1:5292] "0" "0" "0" "No" ...
## $ Glucose_Tolerance_Test_Result : num [1:5292] 124.3 151.4 106.1 85.6 77 ...
## $ Vitamin_D_Level : num [1:5292] 31.5 12.5 35.8 15.4 28.6 49 11.7 25.8 18.4 23.6 ...
## $ C_Protein_Level : num [1:5292] 7.46 5.64 7.2 6.53 0.58 1.83 8.24 3.39 3.86 2.16 ...
## $ Thyroid_Condition : chr [1:5292] "Yes" "Yes" "No" "Yes" ...
## $ Diabetes_Status : chr [1:5292] "Yes" "No" "Yes" "No" ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Gender = col_character(),
## .. BMI = col_double(),
## .. Family_History = col_character(),
## .. Physical_Activity = col_character(),
## .. Diet_Type = col_character(),
## .. Smoking_Status = col_character(),
## .. Alcohol_Intake = col_character(),
## .. Stress_Level = col_character(),
## .. Hypertension = col_character(),
## .. Cholesterol_Level = col_double(),
## .. Fasting_Blood_Sugar = col_double(),
## .. Postprandial_Blood_Sugar = col_double(),
## .. HBA1C = col_double(),
## .. Heart_Rate = col_double(),
## .. Waist_Hip_Ratio = col_double(),
## .. Urban_Rural = col_character(),
## .. Health_Insurance = col_character(),
## .. Regular_Checkups = col_character(),
## .. Medication_For_Chronic_Conditions = col_character(),
## .. Pregnancies = col_double(),
## .. Polycystic_Ovary_Syndrome = col_character(),
## .. Glucose_Tolerance_Test_Result = col_double(),
## .. Vitamin_D_Level = col_double(),
## .. C_Protein_Level = col_double(),
## .. Thyroid_Condition = col_character(),
## .. Diabetes_Status = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column summary: quantiles and mean for numeric columns, length/class
# for character columns.
summary(data)
## Age Gender BMI Family_History
## Min. :18.00 Length:5292 Min. :15.00 Length:5292
## 1st Qu.:33.00 Class :character 1st Qu.:21.20 Class :character
## Median :48.00 Mode :character Median :27.40 Mode :character
## Mean :48.42 Mean :27.46
## 3rd Qu.:64.00 3rd Qu.:33.60
## Max. :79.00 Max. :40.00
## Physical_Activity Diet_Type Smoking_Status Alcohol_Intake
## Length:5292 Length:5292 Length:5292 Length:5292
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Stress_Level Hypertension Cholesterol_Level Fasting_Blood_Sugar
## Length:5292 Length:5292 Min. :100.0 Min. : 70.0
## Class :character Class :character 1st Qu.:151.1 1st Qu.: 97.1
## Mode :character Mode :character Median :198.6 Median :124.0
## Mean :199.8 Mean :124.9
## 3rd Qu.:249.1 3rd Qu.:153.3
## Max. :300.0 Max. :180.0
## Postprandial_Blood_Sugar HBA1C Heart_Rate Waist_Hip_Ratio
## Min. : 90.1 Min. : 4.500 Min. : 60.00 Min. :0.7000
## 1st Qu.:128.2 1st Qu.: 6.300 1st Qu.: 75.00 1st Qu.:0.8200
## Median :164.9 Median : 8.200 Median : 90.00 Median :0.9500
## Mean :164.7 Mean : 8.227 Mean : 89.79 Mean :0.9494
## 3rd Qu.:201.0 3rd Qu.:10.100 3rd Qu.:105.00 3rd Qu.:1.0800
## Max. :240.0 Max. :12.000 Max. :119.00 Max. :1.2000
## Urban_Rural Health_Insurance Regular_Checkups
## Length:5292 Length:5292 Length:5292
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Medication_For_Chronic_Conditions Pregnancies Polycystic_Ovary_Syndrome
## Length:5292 Min. :0.000 Length:5292
## Class :character 1st Qu.:0.000 Class :character
## Mode :character Median :0.000 Mode :character
## Mean :1.493
## 3rd Qu.:2.000
## Max. :9.000
## Glucose_Tolerance_Test_Result Vitamin_D_Level C_Protein_Level
## Min. : 70.1 Min. :10.00 Min. : 0.500
## 1st Qu.:102.2 1st Qu.:20.00 1st Qu.: 2.880
## Median :136.0 Median :29.80 Median : 5.270
## Mean :135.0 Mean :29.91 Mean : 5.259
## 3rd Qu.:167.1 3rd Qu.:39.90 3rd Qu.: 7.660
## Max. :200.0 Max. :50.00 Max. :10.000
## Thyroid_Condition Diabetes_Status
## Length:5292 Length:5292
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Get the dimensions of the data set (rows, columns).
dim(data)
## [1] 5292 27
# Count the missing values (NA) in each column.
colSums(is.na(data))
## Age Gender
## 0 0
## BMI Family_History
## 0 0
## Physical_Activity Diet_Type
## 0 0
## Smoking_Status Alcohol_Intake
## 0 0
## Stress_Level Hypertension
## 0 0
## Cholesterol_Level Fasting_Blood_Sugar
## 0 0
## Postprandial_Blood_Sugar HBA1C
## 0 0
## Heart_Rate Waist_Hip_Ratio
## 0 0
## Urban_Rural Health_Insurance
## 0 0
## Regular_Checkups Medication_For_Chronic_Conditions
## 0 0
## Pregnancies Polycystic_Ovary_Syndrome
## 0 0
## Glucose_Tolerance_Test_Result Vitamin_D_Level
## 0 0
## C_Protein_Level Thyroid_Condition
## 0 0
## Diabetes_Status
## 0
# Drop exact duplicate rows.
data <- distinct(data)

# Tukey fences on BMI: keep rows within 1.5 * IQR of the quartiles.
Q1 <- quantile(data[["BMI"]], probs = 0.25, na.rm = TRUE)
Q3 <- quantile(data[["BMI"]], probs = 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1
data <- filter(
  data,
  BMI >= (Q1 - 1.5 * IQR_value),
  BMI <= (Q3 + 1.5 * IQR_value)
)
# Add a BMI_Category column.
# case_when() takes the first matching branch, so each cutoff only needs its
# upper bound. The previous paired bounds (< 24.9 then >= 25, < 29.9) left
# gaps: a BMI of 24.9 or 29.9 matched nothing and fell through to "Obese".
# Missing BMI stays NA instead of being labelled "Obese" by the catch-all.
data <- data %>%
  mutate(
    BMI_Category = case_when(
      is.na(BMI) ~ NA_character_,
      BMI < 18.5 ~ "Underweight",
      BMI < 25 ~ "Normal",
      BMI < 30 ~ "Overweight",
      TRUE ~ "Obese"
    )
  )
# Store Gender and Diabetes_Status as factors for grouping and plotting.
data <- mutate(
  data,
  Gender = as.factor(Gender),
  Diabetes_Status = as.factor(Diabetes_Status)
)
# Pairwise correlations between the numeric columns.
# select(where(...)) replaces the superseded select_if().
numeric_data <- data %>%
  select(where(is.numeric))
# "complete.obs" drops any row containing an NA before computing correlations.
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Mean BMI over all rows, ignoring missing values.
mean_BMI <- mean(data[["BMI"]], na.rm = TRUE)
print(paste("Mean BMI:", mean_BMI))
## [1] "Mean BMI: 27.4598828420257"
# Median fasting blood sugar, ignoring missing values.
median_FBS <- median(data[["Fasting_Blood_Sugar"]], na.rm = TRUE)
print(paste("Median Fasting Blood Sugar:", median_FBS))
## [1] "Median Fasting Blood Sugar: 124.05"
# Most frequent age: tabulate the ages and take the label of the largest
# count (the first such label when several ages tie).
mode_age <- as.numeric(names(which.max(table(data[["Age"]]))))
print(paste("Mode of Age:", mode_age))
## [1] "Mode of Age: 65"
# Extremes of the cholesterol level, ignoring missing values.
max_chol <- max(data[["Cholesterol_Level"]], na.rm = TRUE)
min_chol <- min(data[["Cholesterol_Level"]], na.rm = TRUE)
print(
  paste(
    "Max Cholesterol Level:", max_chol,
    "Min Cholesterol Level:", min_chol
  )
)
## [1] "Max Cholesterol Level: 300 Min Cholesterol Level: 100"
# Frequency of each diabetes status.
table(data[["Diabetes_Status"]])
##
## No Yes
## 2588 2704
# 1. Average BMI of individuals with and without diabetes.
data %>%
  group_by(Diabetes_Status) %>%
  summarise(
    Average_BMI = mean(BMI, na.rm = TRUE)
  )
## # A tibble: 2 × 2
## Diabetes_Status Average_BMI
## <fct> <dbl>
## 1 No 27.6
## 2 Yes 27.3
# 2. Average HBA1C level per gender, highest average first.
data %>%
  group_by(Gender) %>%
  summarise(
    Avg_HBA1C = mean(HBA1C, na.rm = TRUE)
  ) %>%
  arrange(desc(Avg_HBA1C))
## # A tibble: 3 × 2
## Gender Avg_HBA1C
## <fct> <dbl>
## 1 Female 8.26
## 2 Other 8.22
## 3 Male 8.20
# 3. Distribution of diabetes status across age groups.
# Bins are right-closed: (0, 30], (30, 45], (45, 60], (60, Inf].
# The last break is Inf so the "61+" label is truly open-ended; with the
# previous upper break of 80, any age above 80 would have become NA.
data %>%
  mutate(
    Age_Group = cut(
      Age,
      breaks = c(0, 30, 45, 60, Inf),
      labels = c("0–30", "31–45", "46–60", "61+")
    )
  ) %>%
  count(Age_Group, Diabetes_Status) %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = n,
    values_fill = 0
  )
## # A tibble: 4 × 3
## Age_Group No Yes
## <fct> <int> <int>
## 1 0–30 550 606
## 2 31–45 628 613
## 3 46–60 582 654
## 4 61+ 828 831
# 4. Average fasting and postprandial blood sugar, diabetic vs non-diabetic.
data %>%
  group_by(Diabetes_Status) %>%
  summarise(
    Avg_Fasting_Blood_Sugar = mean(Fasting_Blood_Sugar, na.rm = TRUE),
    Avg_Postprandial_Blood_Sugar = mean(Postprandial_Blood_Sugar, na.rm = TRUE)
  )
## # A tibble: 2 × 3
## Diabetes_Status Avg_Fasting_Blood_Sugar Avg_Postprandial_Blood_Sugar
## <fct> <dbl> <dbl>
## 1 No 125. 165.
## 2 Yes 125. 164.
# Average cholesterol level for each physical-activity level.
data %>%
  group_by(Physical_Activity) %>%
  summarise(
    Avg_Cholesterol = mean(Cholesterol_Level, na.rm = TRUE)
  )
## # A tibble: 3 × 2
## Physical_Activity Avg_Cholesterol
## <chr> <dbl>
## 1 High 201.
## 2 Low 199.
## 3 Medium 199.
# Hypertension counts among diabetic individuals, with each count's share of
# the diabetic total.
data %>%
  filter(Diabetes_Status == "Yes") %>%
  count(Hypertension) %>%
  mutate(Proportion = prop.table(n))
## # A tibble: 2 × 3
## Hypertension n Proportion
## <chr> <int> <dbl>
## 1 No 1338 0.495
## 2 Yes 1366 0.505
# Average age of individuals with and without diabetes.
data %>%
  group_by(Diabetes_Status) %>%
  summarise(
    Average_Age = mean(Age, na.rm = TRUE)
  )
## # A tibble: 2 × 2
## Diabetes_Status Average_Age
## <fct> <dbl>
## 1 No 48.4
## 2 Yes 48.4
# Cross-tabulate stress level against diabetes status: one row per stress
# level, one count column per status.
# count() returns an ungrouped result, so pivot_wider() no longer receives a
# grouped tibble and summarise() no longer emits its regrouping message.
data %>%
  count(Stress_Level, Diabetes_Status, name = "Count") %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = Count,
    values_fill = 0
  )
## # A tibble: 3 × 3
## Stress_Level No Yes
## <chr> <int> <int>
## 1 High 887 926
## 2 Low 834 916
## 3 Medium 867 862
# Diabetes status counts among rows flagged for polycystic ovary syndrome.
# The column mixes encodings, so both "Yes" and "1" are treated as positive.
pcos_positive <- c("Yes", "1")
data %>%
  filter(Polycystic_Ovary_Syndrome %in% pcos_positive) %>%
  count(Diabetes_Status)
## # A tibble: 2 × 2
## Diabetes_Status n
## <fct> <int>
## 1 No 408
## 2 Yes 441
# Number of diabetic individuals per gender, most frequent first.
# A stray "{r}" chunk marker was fused to the start of this statement, which
# made it a syntax error; it has been removed.
data %>%
  filter(Diabetes_Status == "Yes") %>%
  count(Gender, sort = TRUE)
# Average waist-to-hip ratio of individuals with and without diabetes.
data %>%
  group_by(Diabetes_Status) %>%
  summarise(
    Avg_WhR = mean(Waist_Hip_Ratio, na.rm = TRUE)
  )
## # A tibble: 2 × 2
## Diabetes_Status Avg_WhR
## <fct> <dbl>
## 1 No 0.951
## 2 Yes 0.948
# Cross-tabulate regular check-ups against diabetes status: one row per
# check-up answer, one count column per status.
# count() returns an ungrouped result, so pivot_wider() no longer receives a
# grouped tibble and summarise() no longer emits its regrouping message.
data %>%
  count(Regular_Checkups, Diabetes_Status, name = "Count") %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = Count,
    values_fill = 0
  )
## # A tibble: 2 × 3
## Regular_Checkups No Yes
## <chr> <int> <int>
## 1 No 1296 1341
## 2 Yes 1292 1363
# Diabetes status counts among individuals who have health insurance.
insured <- filter(data, Health_Insurance == "Yes")
count(insured, Diabetes_Status)
## # A tibble: 2 × 2
## Diabetes_Status n
## <fct> <int>
## 1 No 1305
## 2 Yes 1364
# Average vitamin D level among diabetic individuals only.
data %>%
  filter(Diabetes_Status == "Yes") %>%
  summarise(
    Avg_VitaminD = mean(Vitamin_D_Level, na.rm = TRUE)
  )
## # A tibble: 1 × 1
## Avg_VitaminD
## <dbl>
## 1 30.1
# Average C_Protein_Level of individuals with and without diabetes.
data %>%
  group_by(Diabetes_Status) %>%
  summarise(
    Avg_C_Protein = mean(C_Protein_Level, na.rm = TRUE)
  )
## # A tibble: 2 × 2
## Diabetes_Status Avg_C_Protein
## <fct> <dbl>
## 1 No 5.23
## 2 Yes 5.29
# Number of diabetic individuals who take medication for chronic conditions.
data %>%
  filter(
    Diabetes_Status == "Yes" & Medication_For_Chronic_Conditions == "Yes"
  ) %>%
  summarise(Count = n())
## # A tibble: 1 × 1
## Count
## <int>
## 1 1340
# Smoking status counts among diabetic individuals, most frequent first.
diabetic_rows <- filter(data, Diabetes_Status == "Yes")
count(diabetic_rows, Smoking_Status, sort = TRUE)
## # A tibble: 3 × 2
## Smoking_Status n
## <chr> <int>
## 1 Current 914
## 2 Former 895
## 3 Never 895
# Cross-tabulate alcohol intake against diabetes status: one row per intake
# level, one count column per status.
# count() returns an ungrouped result, so pivot_wider() no longer receives a
# grouped tibble and summarise() no longer emits its regrouping message.
data %>%
  count(Alcohol_Intake, Diabetes_Status, name = "Count") %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = Count,
    values_fill = 0
  )
## # A tibble: 3 × 3
## Alcohol_Intake No Yes
## <chr> <int> <int>
## 1 High 874 890
## 2 Moderate 839 909
## 3 None 875 905
# Number of diabetic individuals with at least one recorded pregnancy.
data %>%
  filter(Pregnancies > 0 & Diabetes_Status == "Yes") %>%
  summarise(Count = n())
## # A tibble: 1 × 1
## Count
## <int>
## 1 834
# Data Visualization ----
# ggplot2 must already be installed. The install.packages() call that was
# here has been removed: the recorded run only produced the warning
# "package 'ggplot2' is in use and will not be installed", and an analysis
# script should not modify the package library as a side effect.
library(ggplot2)
# 1. Distribution of Diabetes_Status by Urban/Rural area (grouped bar chart).
# count() yields an ungrouped table, which avoids the regrouping message that
# group_by() + summarise(n()) produced. geom_col() is the direct equivalent
# of geom_bar(stat = "identity").
data %>%
  count(Urban_Rural, Diabetes_Status, name = "Count") %>%
  ggplot(aes(x = Urban_Rural, y = Count, fill = Diabetes_Status)) +
  geom_col(position = "dodge") +
  labs(title = "Diabetes by Urban/Rural Area") +
  theme_minimal()
Interpretation:
This bar plot shows the number of diabetic and non-diabetic individuals
in urban and rural areas. Comparing the paired bars within each area
indicates whether diabetes is more or less common among urban residents
than among rural residents.
# 2. Boxplot of BMI for diabetic and non-diabetic patients.
ggplot(
  data,
  aes(x = Diabetes_Status, y = BMI, fill = Diabetes_Status)
) +
  geom_boxplot() +
  ggtitle("BMI Distribution for Diabetic and Non-Diabetic Patients") +
  theme_minimal()
Interpretation:
The boxplot compares the BMI distributions of diabetic and
non-diabetic patients. Higher median BMI in diabetic patients would
suggest that obesity is a significant risk factor for diabetes.
# 3. Histogram of age, one semi-transparent layer per diabetes status.
# position = "identity" overlays the two groups rather than stacking them.
ggplot(data, aes(x = Age, fill = Diabetes_Status)) +
  geom_histogram(
    binwidth = 5,
    alpha = 0.7,
    position = "identity"
  ) +
  ggtitle("Age Distribution of Diabetic vs. Non-Diabetic Patients") +
  theme_minimal()
Interpretation:
The histogram displays the age distribution for diabetic and
non-diabetic patients, helping to identify whether older individuals are
more affected by diabetes.
# 4. Scatter plot of HBA1C against fasting blood sugar, coloured by
# diabetes status.
ggplot(
  data,
  aes(x = Fasting_Blood_Sugar, y = HBA1C, color = Diabetes_Status)
) +
  geom_point() +
  ggtitle("Relationship Between Fasting Blood Sugar and HBA1C") +
  theme_minimal()
Interpretation:
The scatter plot shows a positive correlation between fasting blood
sugar and HBA1C levels, indicating that higher blood sugar levels are
associated with higher long-term glucose levels.
# 5. Grouped bar chart of diabetes status counts per gender.
ggplot(data, aes(x = Gender, fill = Diabetes_Status)) +
  geom_bar(position = "dodge") +
  ggtitle("Diabetes Cases by Gender") +
  theme_minimal()
Interpretation:
This bar chart highlights differences in the number of diabetic and
non-diabetic individuals across the three gender categories in the data
(Female, Male, and Other).
# 6. Share of diabetic vs non-diabetic individuals within each physical
# activity level (bars normalised to 100%).
ggplot(data, aes(x = Physical_Activity, fill = Diabetes_Status)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Physical Activity vs Diabetes Status",
    x = "Physical Activity Level",
    y = "Proportion"
  ) +
  theme_minimal()
Interpretation:
This plot shows, for each physical activity level, the proportion of
diabetic and non-diabetic individuals. A larger diabetic share at lower
activity levels would suggest that low activity is linked to higher
diabetes prevalence.
# 8. Stress level vs diabetes status (stacked bars normalised to 100%).
ggplot(data, aes(x = Stress_Level, fill = Diabetes_Status)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Stress Level by Diabetes Status") +
  theme_minimal()
Interpretation:
This plot shows, for each stress level, the proportion of diabetic and
non-diabetic individuals. A larger diabetic share at higher stress
levels would indicate a possible link between stress and diabetes.
# 9. Boxplot of waist-to-hip ratio by diabetes status.
ggplot(
  data,
  aes(x = Diabetes_Status, y = Waist_Hip_Ratio, fill = Diabetes_Status)
) +
  geom_boxplot() +
  ggtitle("Waist-Hip Ratio by Diabetes Status") +
  theme_minimal()
Interpretation:
The boxplot compares waist-to-hip ratios of diabetic and
non-diabetic individuals. Higher ratios in diabetic patients suggest
abdominal obesity as a risk factor.
# 10. Grouped bar chart of diabetes status counts per alcohol intake level.
ggplot(data, aes(x = Alcohol_Intake, fill = Diabetes_Status)) +
  geom_bar(position = "dodge") +
  ggtitle("Alcohol Intake vs Diabetes Status") +
  theme_minimal()
Interpretation:
This grouped bar chart shows how alcohol intake varies between
diabetic and non-diabetic individuals.
# 11. Pregnancy count among diabetic women (bar plot).
# The title describes women, so the rows are restricted to Gender == "Female";
# previously every gender with Pregnancies > 0 was included.
# NOTE(review): assumes "Female" is the intended population — confirm whether
# the "Other" category should be included as well.
data %>%
  filter(
    Diabetes_Status == "Yes",
    Gender == "Female",
    Pregnancies > 0
  ) %>%
  ggplot(aes(x = factor(Pregnancies))) +
  geom_bar(fill = "tomato") +
  labs(
    title = "Pregnancy Count in Diabetic Women",
    x = "Number of Pregnancies"
  ) +
  theme_minimal()
Interpretation:
This bar plot visualizes the distribution of the number of
pregnancies among diabetic women, which could hint at gestational
diabetes risk.
# 12. Smoking status among diabetic individuals (horizontal bar plot).
data %>%
  filter(Diabetes_Status == "Yes") %>%
  ggplot(aes(x = Smoking_Status)) +
  geom_bar(fill = "steelblue") +
  coord_flip() + # flip the axes so the bars run horizontally
  ggtitle("Smoking Status among Diabetics") +
  theme_minimal()
Interpretation:
This horizontal bar plot shows the distribution of smoking habits
among diabetic individuals, providing insights into the role of smoking
as a risk factor.
# 13. Health insurance coverage among diabetic individuals (pie chart).
insurance_data <- data %>%
  filter(Diabetes_Status == "Yes") %>%
  count(Health_Insurance)

# A single stacked column drawn in polar coordinates renders as a pie.
ggplot(
  insurance_data,
  aes(x = "", y = n, fill = Health_Insurance)
) +
  geom_col() +
  coord_polar(theta = "y") +
  ggtitle("Health Insurance among Diabetic Patients") +
  theme_void()
Interpretation:
The pie chart shows the proportion of diabetic patients with and
without health insurance coverage.