1. data understanding

check the structure of the data set

# Show each column's type together with its first few values.
data %>% str()
## spc_tbl_ [5,292 × 27] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                              : num [1:5292] 48 18 21 25 78 60 38 23 22 18 ...
##  $ Gender                           : chr [1:5292] "Male" "Other" "Other" "Female" ...
##  $ BMI                              : num [1:5292] 35.5 28.7 30 25.6 38.8 19.2 36.4 21.4 18.9 24.4 ...
##  $ Family_History                   : chr [1:5292] "No" "Yes" "Yes" "No" ...
##  $ Physical_Activity                : chr [1:5292] "High" "Medium" "High" "Medium" ...
##  $ Diet_Type                        : chr [1:5292] "Non-Vegetarian" "Non-Vegetarian" "Non-Vegetarian" "Vegetarian" ...
##  $ Smoking_Status                   : chr [1:5292] "Never" "Current" "Current" "Former" ...
##  $ Alcohol_Intake                   : chr [1:5292] "None" "Moderate" "Moderate" "Moderate" ...
##  $ Stress_Level                     : chr [1:5292] "Medium" "High" "High" "High" ...
##  $ Hypertension                     : chr [1:5292] "Yes" "No" "Yes" "Yes" ...
##  $ Cholesterol_Level                : num [1:5292] 112 131 295 159 215 ...
##  $ Fasting_Blood_Sugar              : num [1:5292] 141 83.1 159.9 133.3 164.9 ...
##  $ Postprandial_Blood_Sugar         : num [1:5292] 166 143 212 225 218 ...
##  $ HBA1C                            : num [1:5292] 8.9 5.9 4.8 11.9 11.6 4.7 7.6 9.2 8.7 11.7 ...
##  $ Heart_Rate                       : num [1:5292] 94 68 70 78 65 69 74 81 95 96 ...
##  $ Waist_Hip_Ratio                  : num [1:5292] 0.91 0.96 0.88 0.98 0.85 0.88 1.11 1.16 0.87 0.73 ...
##  $ Urban_Rural                      : chr [1:5292] "Urban" "Rural" "Rural" "Rural" ...
##  $ Health_Insurance                 : chr [1:5292] "No" "Yes" "No" "No" ...
##  $ Regular_Checkups                 : chr [1:5292] "No" "Yes" "No" "No" ...
##  $ Medication_For_Chronic_Conditions: chr [1:5292] "No" "No" "Yes" "Yes" ...
##  $ Pregnancies                      : num [1:5292] 0 0 0 1 0 0 2 0 0 9 ...
##  $ Polycystic_Ovary_Syndrome        : chr [1:5292] "0" "0" "0" "No" ...
##  $ Glucose_Tolerance_Test_Result    : num [1:5292] 124.3 151.4 106.1 85.6 77 ...
##  $ Vitamin_D_Level                  : num [1:5292] 31.5 12.5 35.8 15.4 28.6 49 11.7 25.8 18.4 23.6 ...
##  $ C_Protein_Level                  : num [1:5292] 7.46 5.64 7.2 6.53 0.58 1.83 8.24 3.39 3.86 2.16 ...
##  $ Thyroid_Condition                : chr [1:5292] "Yes" "Yes" "No" "Yes" ...
##  $ Diabetes_Status                  : chr [1:5292] "Yes" "No" "Yes" "No" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   BMI = col_double(),
##   ..   Family_History = col_character(),
##   ..   Physical_Activity = col_character(),
##   ..   Diet_Type = col_character(),
##   ..   Smoking_Status = col_character(),
##   ..   Alcohol_Intake = col_character(),
##   ..   Stress_Level = col_character(),
##   ..   Hypertension = col_character(),
##   ..   Cholesterol_Level = col_double(),
##   ..   Fasting_Blood_Sugar = col_double(),
##   ..   Postprandial_Blood_Sugar = col_double(),
##   ..   HBA1C = col_double(),
##   ..   Heart_Rate = col_double(),
##   ..   Waist_Hip_Ratio = col_double(),
##   ..   Urban_Rural = col_character(),
##   ..   Health_Insurance = col_character(),
##   ..   Regular_Checkups = col_character(),
##   ..   Medication_For_Chronic_Conditions = col_character(),
##   ..   Pregnancies = col_double(),
##   ..   Polycystic_Ovary_Syndrome = col_character(),
##   ..   Glucose_Tolerance_Test_Result = col_double(),
##   ..   Vitamin_D_Level = col_double(),
##   ..   C_Protein_Level = col_double(),
##   ..   Thyroid_Condition = col_character(),
##   ..   Diabetes_Status = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

check the summary of the data set

# Per-column summary: quantiles for numeric columns, length/class for text.
data %>% summary()
##       Age           Gender               BMI        Family_History    
##  Min.   :18.00   Length:5292        Min.   :15.00   Length:5292       
##  1st Qu.:33.00   Class :character   1st Qu.:21.20   Class :character  
##  Median :48.00   Mode  :character   Median :27.40   Mode  :character  
##  Mean   :48.42                      Mean   :27.46                     
##  3rd Qu.:64.00                      3rd Qu.:33.60                     
##  Max.   :79.00                      Max.   :40.00                     
##  Physical_Activity   Diet_Type         Smoking_Status     Alcohol_Intake    
##  Length:5292        Length:5292        Length:5292        Length:5292       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Stress_Level       Hypertension       Cholesterol_Level Fasting_Blood_Sugar
##  Length:5292        Length:5292        Min.   :100.0     Min.   : 70.0      
##  Class :character   Class :character   1st Qu.:151.1     1st Qu.: 97.1      
##  Mode  :character   Mode  :character   Median :198.6     Median :124.0      
##                                        Mean   :199.8     Mean   :124.9      
##                                        3rd Qu.:249.1     3rd Qu.:153.3      
##                                        Max.   :300.0     Max.   :180.0      
##  Postprandial_Blood_Sugar     HBA1C          Heart_Rate     Waist_Hip_Ratio 
##  Min.   : 90.1            Min.   : 4.500   Min.   : 60.00   Min.   :0.7000  
##  1st Qu.:128.2            1st Qu.: 6.300   1st Qu.: 75.00   1st Qu.:0.8200  
##  Median :164.9            Median : 8.200   Median : 90.00   Median :0.9500  
##  Mean   :164.7            Mean   : 8.227   Mean   : 89.79   Mean   :0.9494  
##  3rd Qu.:201.0            3rd Qu.:10.100   3rd Qu.:105.00   3rd Qu.:1.0800  
##  Max.   :240.0            Max.   :12.000   Max.   :119.00   Max.   :1.2000  
##  Urban_Rural        Health_Insurance   Regular_Checkups  
##  Length:5292        Length:5292        Length:5292       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##  Medication_For_Chronic_Conditions  Pregnancies    Polycystic_Ovary_Syndrome
##  Length:5292                       Min.   :0.000   Length:5292              
##  Class :character                  1st Qu.:0.000   Class :character         
##  Mode  :character                  Median :0.000   Mode  :character         
##                                    Mean   :1.493                            
##                                    3rd Qu.:2.000                            
##                                    Max.   :9.000                            
##  Glucose_Tolerance_Test_Result Vitamin_D_Level C_Protein_Level 
##  Min.   : 70.1                 Min.   :10.00   Min.   : 0.500  
##  1st Qu.:102.2                 1st Qu.:20.00   1st Qu.: 2.880  
##  Median :136.0                 Median :29.80   Median : 5.270  
##  Mean   :135.0                 Mean   :29.91   Mean   : 5.259  
##  3rd Qu.:167.1                 3rd Qu.:39.90   3rd Qu.: 7.660  
##  Max.   :200.0                 Max.   :50.00   Max.   :10.000  
##  Thyroid_Condition  Diabetes_Status   
##  Length:5292        Length:5292       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 

#get the dimensions of data set

# Number of rows followed by number of columns.
data %>% dim()
## [1] 5292   27

2. Data Cleaning & Handling Missing Values

Check for missing values

# Count the NA entries in every column.
data %>%
  is.na() %>%
  colSums()
##                               Age                            Gender 
##                                 0                                 0 
##                               BMI                    Family_History 
##                                 0                                 0 
##                 Physical_Activity                         Diet_Type 
##                                 0                                 0 
##                    Smoking_Status                    Alcohol_Intake 
##                                 0                                 0 
##                      Stress_Level                      Hypertension 
##                                 0                                 0 
##                 Cholesterol_Level               Fasting_Blood_Sugar 
##                                 0                                 0 
##          Postprandial_Blood_Sugar                             HBA1C 
##                                 0                                 0 
##                        Heart_Rate                   Waist_Hip_Ratio 
##                                 0                                 0 
##                       Urban_Rural                  Health_Insurance 
##                                 0                                 0 
##                  Regular_Checkups Medication_For_Chronic_Conditions 
##                                 0                                 0 
##                       Pregnancies         Polycystic_Ovary_Syndrome 
##                                 0                                 0 
##     Glucose_Tolerance_Test_Result                   Vitamin_D_Level 
##                                 0                                 0 
##                   C_Protein_Level                 Thyroid_Condition 
##                                 0                                 0 
##                   Diabetes_Status 
##                                 0

3. Remove Duplicates

# Keep only one copy of any fully duplicated row.
data <- distinct(data)

4. Detect and Handle Outliers

Remove BMI outliers (values more than 1.5 × IQR below the first quartile or above the third quartile)

# Quartiles and interquartile range of BMI.
Q1 <- quantile(data$BMI, probs = 0.25, na.rm = TRUE)
Q3 <- quantile(data$BMI, probs = 0.75, na.rm = TRUE)
IQR_value <- Q3 - Q1

# Keep rows whose BMI lies within 1.5 * IQR of the quartiles (both bounds
# inclusive); comma-separated filter conditions are combined with AND.
data <- data %>%
  filter(
    BMI >= Q1 - 1.5 * IQR_value,
    BMI <= Q3 + 1.5 * IQR_value
  )

5. Data Transformation & Feature Engineering

Create a new column for BMI categories

# Derive a categorical BMI band from the numeric BMI column.
# Bands: < 18.5 Underweight, 18.5 to < 25 Normal, 25 to < 30 Overweight,
# >= 30 Obese.
# case_when() evaluates conditions in order, so an upper bound per branch is
# sufficient. This closes the gap the previous ranges left between 24.9 and 25
# (which fell through to "Obese") and stops 29.9 being labelled "Obese".
# Missing BMI values stay NA rather than being swept into the fallback branch.
data <- data %>%
  mutate(BMI_Category = case_when(
    is.na(BMI) ~ NA_character_,
    BMI < 18.5 ~ "Underweight",
    BMI < 25 ~ "Normal",
    BMI < 30 ~ "Overweight",
    TRUE ~ "Obese"
  ))

Convert categorical variables to factors

# Store Gender and Diabetes_Status as factors instead of character vectors.
data <- data %>%
  mutate(
    Gender = as.factor(Gender),
    Diabetes_Status = as.factor(Diabetes_Status)
  )

6. Data Reduction - Feature Selection Using Correlation

Compute correlation matrix for numeric variables

# Keep only the numeric columns; select(where(...)) replaces the superseded
# select_if().
numeric_data <- data %>% select(where(is.numeric))

# Pairwise correlations, computed on rows with no missing values.
cor_matrix <- cor(numeric_data, use = "complete.obs")

7. Statistical Analysis (Mean, Median, Mode, Count, Max)

Mean BMI

# Average BMI over all rows, ignoring missing values.
mean_BMI <- data %>%
  pull(BMI) %>%
  mean(na.rm = TRUE)
print(paste("Mean BMI:", mean_BMI))
## [1] "Mean BMI: 27.4598828420257"

Median Fasting Blood Sugar

# Middle value of fasting blood sugar, ignoring missing values.
median_FBS <- median(data[["Fasting_Blood_Sugar"]], na.rm = TRUE)
print(paste("Median Fasting Blood Sugar:", median_FBS))
## [1] "Median Fasting Blood Sugar: 124.05"

Mode of Age

# Tabulate ages and take the most frequent one; on ties the first (lowest)
# age among the maxima is used.
age_counts <- table(data$Age)
mode_age <- as.numeric(names(age_counts)[which.max(age_counts)])
print(paste("Mode of Age:", mode_age))
## [1] "Mode of Age: 65"

Maximum and Minimum Cholesterol Level

  # range() returns c(minimum, maximum) in a single pass over the column.
  chol_range <- range(data$Cholesterol_Level, na.rm = TRUE)
  min_chol <- chol_range[1]
  max_chol <- chol_range[2]
  print(paste("Max Cholesterol Level:", max_chol, "Min Cholesterol Level:", min_chol))
## [1] "Max Cholesterol Level: 300 Min Cholesterol Level: 100"

Count of Diabetes Cases

# Frequency of each Diabetes_Status level.
table(data[["Diabetes_Status"]])
## 
##   No  Yes 
## 2588 2704

DESCRIPTIVE ANALYSIS

#1. What is the average BMI of individuals with and without diabetes?

# Mean BMI within each diabetes status.
summarise(
  group_by(data, Diabetes_Status),
  Average_BMI = mean(BMI, na.rm = TRUE)
)
## # A tibble: 2 × 2
##   Diabetes_Status Average_BMI
##   <fct>                 <dbl>
## 1 No                     27.6
## 2 Yes                    27.3

#2. How does average HBA1C level vary by gender?

# Mean HBA1C per gender, listed from highest to lowest.
data %>%
  group_by(Gender) %>%
  summarise(Avg_HBA1C = mean(HBA1C, na.rm = TRUE)) %>%
  arrange(desc(Avg_HBA1C))
## # A tibble: 3 × 2
##   Gender Avg_HBA1C
##   <fct>      <dbl>
## 1 Female      8.26
## 2 Other       8.22
## 3 Male        8.20

#3. What is the distribution of diabetes status across different age groups?

# Bin Age into four right-closed intervals, count each (group, status) pair,
# then spread the statuses into columns (absent combinations become 0).
data %>%
  mutate(
    Age_Group = cut(
      Age,
      breaks = c(0, 30, 45, 60, 80),
      labels = c("0–30", "31–45", "46–60", "61+")
    )
  ) %>%
  count(Age_Group, Diabetes_Status) %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = n,
    values_fill = 0
  )
## # A tibble: 4 × 3
##   Age_Group    No   Yes
##   <fct>     <int> <int>
## 1 0–30        550   606
## 2 31–45       628   613
## 3 46–60       582   654
## 4 61+         828   831

#4. What is the average fasting and postprandial blood sugar for diabetic vs non-diabetic individuals?

# Mean fasting and postprandial blood sugar within each diabetes status.
summarise(
  group_by(data, Diabetes_Status),
  Avg_Fasting_Blood_Sugar = mean(Fasting_Blood_Sugar, na.rm = TRUE),
  Avg_Postprandial_Blood_Sugar = mean(Postprandial_Blood_Sugar, na.rm = TRUE)
)
## # A tibble: 2 × 3
##   Diabetes_Status Avg_Fasting_Blood_Sugar Avg_Postprandial_Blood_Sugar
##   <fct>                             <dbl>                        <dbl>
## 1 No                                 125.                         165.
## 2 Yes                                125.                         164.

5. How does cholesterol level differ across different physical activity levels?

# Mean cholesterol level for each physical activity level.
data %>%
  group_by(Physical_Activity) %>%
  summarise(Avg_Cholesterol = mean(Cholesterol_Level, na.rm = TRUE))
## # A tibble: 3 × 2
##   Physical_Activity Avg_Cholesterol
##   <chr>                       <dbl>
## 1 High                         201.
## 2 Low                          199.
## 3 Medium                       199.

6. What proportion of diabetic patients have hypertension?

# Among diabetic rows, count each Hypertension value and express the counts
# as fractions of the diabetic total.
data %>%
  filter(Diabetes_Status == "Yes") %>%
  count(Hypertension) %>%
  mutate(Proportion = prop.table(n))
## # A tibble: 2 × 3
##   Hypertension     n Proportion
##   <chr>        <int>      <dbl>
## 1 No            1338      0.495
## 2 Yes           1366      0.505

7. What is the average age of diabetic vs non-diabetic individuals?

# Mean age within each diabetes status.
summarise(
  group_by(data, Diabetes_Status),
  Average_Age = mean(Age, na.rm = TRUE)
)
## # A tibble: 2 × 2
##   Diabetes_Status Average_Age
##   <fct>                 <dbl>
## 1 No                     48.4
## 2 Yes                    48.4

8. How does stress level relate to diabetes status?

# Cross-tabulate stress level against diabetes status.
# count() returns an ungrouped tibble, so no `.groups` message is emitted and
# the widened table is not left grouped by Stress_Level.
data %>%
  count(Stress_Level, Diabetes_Status, name = "Count") %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = Count,
    values_fill = 0
  )
## # A tibble: 3 × 3
##   Stress_Level    No   Yes
##   <chr>        <int> <int>
## 1 High           887   926
## 2 Low            834   916
## 3 Medium         867   862

9. How many individuals have Polycystic Ovary Syndrome and are diabetic?

# The PCOS column mixes two encodings, so both "Yes" and "1" are treated as
# positive; the matching rows are then counted per diabetes status.
data %>%
  filter(
    Polycystic_Ovary_Syndrome == "Yes" | Polycystic_Ovary_Syndrome == "1"
  ) %>%
  count(Diabetes_Status)
## # A tibble: 2 × 2
##   Diabetes_Status     n
##   <fct>           <int>
## 1 No                408
## 2 Yes               441

10. Which gender has the highest number of diabetic patients?

# Count diabetic patients per gender, largest group first.
# The stray `{r}` chunk marker that was fused onto this line has been removed;
# with it the line could not be parsed as R.
data %>%
  filter(Diabetes_Status == "Yes") %>%
  count(Gender, sort = TRUE)

11. What is the average waist-to-hip ratio by diabetes status?

# Mean waist-to-hip ratio within each diabetes status.
summarise(
  group_by(data, Diabetes_Status),
  Avg_WhR = mean(Waist_Hip_Ratio, na.rm = TRUE)
)
## # A tibble: 2 × 2
##   Diabetes_Status Avg_WhR
##   <fct>             <dbl>
## 1 No                0.951
## 2 Yes               0.948

12. Do regular checkups correlate with diabetes diagnosis?

# Cross-tabulate regular checkups against diabetes status.
# count() returns an ungrouped tibble, so no `.groups` message is emitted and
# the widened table is not left grouped by Regular_Checkups.
data %>%
  count(Regular_Checkups, Diabetes_Status, name = "Count") %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = Count,
    values_fill = 0
  )
## # A tibble: 2 × 3
##   Regular_Checkups    No   Yes
##   <chr>            <int> <int>
## 1 No                1296  1341
## 2 Yes               1292  1363

13. How many individuals with health insurance are diabetic?

# Restrict to insured individuals, then count them per diabetes status.
count(
  filter(data, Health_Insurance == "Yes"),
  Diabetes_Status
)
## # A tibble: 2 × 2
##   Diabetes_Status     n
##   <fct>           <int>
## 1 No               1305
## 2 Yes              1364

14. What is the average Vitamin D level of diabetic patients?

# Mean vitamin D level over diabetic rows only.
summarise(
  filter(data, Diabetes_Status == "Yes"),
  Avg_VitaminD = mean(Vitamin_D_Level, na.rm = TRUE)
)
## # A tibble: 1 × 1
##   Avg_VitaminD
##          <dbl>
## 1         30.1

15. What is the average C-Protein Level for diabetic and non-diabetic individuals?

# Mean C_Protein_Level within each diabetes status.
summarise(
  group_by(data, Diabetes_Status),
  Avg_C_Protein = mean(C_Protein_Level, na.rm = TRUE)
)
## # A tibble: 2 × 2
##   Diabetes_Status Avg_C_Protein
##   <fct>                   <dbl>
## 1 No                       5.23
## 2 Yes                      5.29

16. How many diabetic individuals take medication for chronic conditions?

# Number of rows that are both diabetic and on chronic-condition medication.
data %>%
  filter(
    Diabetes_Status == "Yes" & Medication_For_Chronic_Conditions == "Yes"
  ) %>%
  summarise(Count = n())
## # A tibble: 1 × 1
##   Count
##   <int>
## 1  1340

17. Which Smoking Status is most common among diabetic patients?

# Smoking status frequencies among diabetic rows, most common first.
count(
  filter(data, Diabetes_Status == "Yes"),
  Smoking_Status,
  sort = TRUE
)
## # A tibble: 3 × 2
##   Smoking_Status     n
##   <chr>          <int>
## 1 Current          914
## 2 Former           895
## 3 Never            895

18. Does alcohol intake show any relation with diabetes?

# Cross-tabulate alcohol intake against diabetes status.
# count() returns an ungrouped tibble, so no `.groups` message is emitted and
# the widened table is not left grouped by Alcohol_Intake.
data %>%
  count(Alcohol_Intake, Diabetes_Status, name = "Count") %>%
  pivot_wider(
    names_from = Diabetes_Status,
    values_from = Count,
    values_fill = 0
  )
## # A tibble: 3 × 3
##   Alcohol_Intake    No   Yes
##   <chr>          <int> <int>
## 1 High             874   890
## 2 Moderate         839   909
## 3 None             875   905

19. How many diabetic individuals have had at least one pregnancy?

# Count diabetic rows with at least one recorded pregnancy.
# NOTE(review): there is no Gender filter here, so every row with
# Pregnancies > 0 is counted regardless of the Gender value.
data %>%
  filter(Pregnancies > 0 & Diabetes_Status == "Yes") %>%
  summarise(Count = n())
## # A tibble: 1 × 1
##   Count
##   <int>
## 1   834

#Data Visualization

# Install ggplot2 only when it is not already available. An unconditional
# install.packages() hits the network on every run and warns when the package
# is already loaded.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

#1. What is the distribution of Diabetes_Status by Urban/Rural?

# Count each (area, diabetes status) pair and draw the counts as side-by-side
# bars. count() yields an ungrouped tibble, which avoids the `.groups` message
# that group_by() + summarise() produced; geom_col() plots the precomputed
# counts directly.
data %>%
  count(Urban_Rural, Diabetes_Status, name = "Count") %>%
  ggplot(aes(x = Urban_Rural, y = Count, fill = Diabetes_Status)) +
  geom_col(position = "dodge") +
  labs(title = "Diabetes by Urban/Rural Area") +
  theme_minimal()

Interpretation:
This bar plot shows how diabetes prevalence differs between urban and rural areas. We can observe if people living in urban areas have a higher or lower rate of diabetes compared to those in rural settings.

#2. Boxplot of BMI for Diabetic and Non-Diabetic Patients

# One BMI boxplot per diabetes status, coloured by status.
data %>%
  ggplot(aes(x = Diabetes_Status, y = BMI, fill = Diabetes_Status)) +
  geom_boxplot() +
  labs(title = "BMI Distribution for Diabetic and Non-Diabetic Patients") +
  theme_minimal()

Interpretation:
The boxplot compares the BMI distributions of diabetic and non-diabetic patients. Higher median BMI in diabetic patients would suggest that obesity is a significant risk factor for diabetes.


#3. Histogram of Age Distribution

# Overlaid (not stacked) semi-transparent age histograms, 5-year bins,
# one per diabetes status.
data %>%
  ggplot(aes(x = Age, fill = Diabetes_Status)) +
  geom_histogram(binwidth = 5, alpha = 0.7, position = "identity") +
  labs(title = "Age Distribution of Diabetic vs. Non-Diabetic Patients") +
  theme_minimal()

Interpretation:
The histogram displays the age distribution for diabetic and non-diabetic patients, helping to identify whether older individuals are more affected by diabetes.


#4. Scatter Plot of HBA1C vs. Fasting Blood Sugar

# HBA1C against fasting blood sugar, one point per row, coloured by status.
data %>%
  ggplot(aes(x = Fasting_Blood_Sugar, y = HBA1C, color = Diabetes_Status)) +
  geom_point() +
  labs(title = "Relationship Between Fasting Blood Sugar and HBA1C") +
  theme_minimal()

Interpretation:
The scatter plot shows a positive correlation between fasting blood sugar and HBA1C levels, indicating that higher blood sugar levels are associated with higher long-term glucose levels.


#5. Bar Chart of Diabetes Cases by Gender

# Row counts per gender, with diabetic and non-diabetic bars side by side.
data %>%
  ggplot(aes(x = Gender, fill = Diabetes_Status)) +
  geom_bar(position = "dodge") +
  labs(title = "Diabetes Cases by Gender") +
  theme_minimal()

Interpretation:
This bar chart highlights differences in the number of diabetic and non-diabetic cases across the three gender categories (Female, Male, Other).


#6. Proportion of Physical Activity Levels across Diabetic and Non-Diabetic Groups

# position = "fill" rescales every bar to height 1, so each bar shows the
# diabetic / non-diabetic split within one physical activity level.
data %>%
  ggplot(aes(x = Physical_Activity, fill = Diabetes_Status)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(
    title = "Physical Activity vs Diabetes Status",
    x = "Physical Activity Level",
    y = "Proportion"
  ) +
  theme_minimal()

Interpretation:
This plot shows, within each physical activity level, the proportion of diabetic and non-diabetic individuals; a larger diabetic share at lower activity levels would suggest that lower activity is linked to higher diabetes prevalence.


#8. Stress Level vs Diabetes Status (Stacked Bar Plot)

# position = "fill" rescales every bar to height 1, so each bar shows the
# diabetic / non-diabetic split within one stress level.
data %>%
  ggplot(aes(x = Stress_Level, fill = Diabetes_Status)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Stress Level by Diabetes Status") +
  theme_minimal()

Interpretation:
This plot shows, within each stress level, the proportion of diabetic and non-diabetic individuals; a larger diabetic share at higher stress levels would indicate a possible link between stress and diabetes.


#9. Waist-to-Hip Ratio vs Diabetes Status (Boxplot)

# One waist-to-hip ratio boxplot per diabetes status, coloured by status.
data %>%
  ggplot(aes(x = Diabetes_Status, y = Waist_Hip_Ratio, fill = Diabetes_Status)) +
  geom_boxplot() +
  labs(title = "Waist-Hip Ratio by Diabetes Status") +
  theme_minimal()

Interpretation:
The boxplot compares waist-to-hip ratios of diabetic and non-diabetic individuals. Higher ratios in diabetic patients suggest abdominal obesity as a risk factor.


#10. Alcohol Intake and Diabetes (Grouped Bar Plot)

# Row counts per alcohol intake level, with one bar per diabetes status.
data %>%
  ggplot(aes(x = Alcohol_Intake, fill = Diabetes_Status)) +
  geom_bar(position = "dodge") +
  labs(title = "Alcohol Intake vs Diabetes Status") +
  theme_minimal()

Interpretation:
This grouped bar chart shows how alcohol intake varies between diabetic and non-diabetic individuals.


#11. Pregnancy Count among Diabetics (Bar Plot)

# Bar per pregnancy count among diabetic rows with at least one pregnancy.
# Pregnancies is converted to a factor so each count gets its own bar.
ggplot(
  filter(data, Diabetes_Status == "Yes", Pregnancies > 0),
  aes(x = factor(Pregnancies))
) +
  geom_bar(fill = "tomato") +
  labs(title = "Pregnancy Count in Diabetic Women", x = "Number of Pregnancies") +
  theme_minimal()

Interpretation:
This bar plot visualizes the distribution of the number of pregnancies among diabetic women, which could hint at gestational diabetes risk.


#12. Smoking Status among Diabetics (Horizontal Bar Plot)

# Smoking status counts for diabetic rows; coord_flip() turns the bars
# horizontal.
ggplot(
  filter(data, Diabetes_Status == "Yes"),
  aes(x = Smoking_Status)
) +
  geom_bar(fill = "steelblue") +
  coord_flip() +
  labs(title = "Smoking Status among Diabetics") +
  theme_minimal()

Interpretation:
This horizontal bar plot shows the distribution of smoking habits among diabetic individuals, providing insights into the role of smoking as a risk factor.


#13. Health Insurance Coverage among Diabetics (Pie Chart)

# Number of diabetic rows for each Health_Insurance value.
insurance_data <- count(
  filter(data, Diabetes_Status == "Yes"),
  Health_Insurance
)

# A single stacked bar drawn in polar coordinates (angle mapped to y)
# renders as a pie chart.
insurance_data %>%
  ggplot(aes(x = "", y = n, fill = Health_Insurance)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y") +
  labs(title = "Health Insurance among Diabetic Patients") +
  theme_void()

Interpretation:
The pie chart shows the proportion of diabetic patients with and without health insurance coverage.