library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load necessary libraries
library(dplyr)
library(ggplot2)
# Read the comma-delimited BRFSS 2015 health-indicators file into a tibble
dataset <- read_delim(
  file = "C:/Users/Akshay Dembra/Downloads/Stats_Selected_Dataset/diabetes_binary_5050split_health_indicators_BRFSS2015_1.csv",
  delim = ","
)
## Rows: 70692 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (22): Diabetes_binary, HighBP, HighChol, CholCheck, BMI, Smoker, Stroke,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Auto-print the tibble to preview its first rows and column types
dataset

Group By and Analyze Data

1. Group by High Blood Pressure and Summarize BMI

# Average BMI within each HighBP level (missing BMI values are ignored)
group_bmi <- dataset %>%
  group_by(HighBP) %>%
  summarize(
    mean_bmi = mean(BMI, na.rm = TRUE)
  )

# Bar chart with one bar per HighBP level; bar height is the group mean
ggplot(data = group_bmi, mapping = aes(x = factor(HighBP), y = mean_bmi)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Mean BMI by High Blood Pressure Status",
    x = "High Blood Pressure (0 = No, 1 = Yes)",
    y = "Mean BMI"
  )

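Insight: Respondents with high blood pressure have a higher mean BMI than those without. This suggests that individuals with a higher BMI are more likely to have high blood pressure, although a difference in group means does not by itself establish causation.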

2. Group by Smoker Status and Count

# Number of rows for each Smoker value; the count is stored in column `n`
group_smoker <- count(dataset, Smoker)

# Bar chart with one bar per Smoker value; bar height is the row count
ggplot(data = group_smoker, mapping = aes(x = factor(Smoker), y = n)) +
  geom_col(fill = "purple") +
  labs(
    title = "Count of Smokers vs Non-Smokers",
    x = "Smoker (0 = No, 1 = Yes)",
    y = "Counts"
  )

Insight: The number of non-smokers is slightly higher than the number of smokers. This might indicate a trend towards non-smoking behaviors or successful public health campaigns.

3. Group by Sex and Summarize Physical Health Days

# Average PhysHlth within each Sex level (missing values are ignored)
group_physical_health <- dataset %>%
  group_by(Sex) %>%
  summarize(
    mean_physical_health_days = mean(PhysHlth, na.rm = TRUE)
  )

# Bar chart with one bar per Sex level; bar height is the group mean
ggplot(
  data = group_physical_health,
  mapping = aes(x = factor(Sex), y = mean_physical_health_days)
) +
  geom_col(fill = "lightblue") +
  labs(
    title = "Mean Physical Health by Sex",
    x = "Sex (0 = Female, 1 = Male)",
    y = "Mean Physical Health Days"
  )

Insight: Females report more days of poor physical health than males. This could suggest gender differences in health reporting or actual health disparities.

Analyze Categorical Combinations

Combination Analysis

# Count rows for every observed Sex x Smoker pair.
# count() returns an ungrouped tibble, whereas group_by() %>% tally() would
# leave the result grouped by Sex and silently affect later verbs.
combination_df <- dataset %>%
  count(Sex, Smoker)

# Build every possible Sex x Smoker pair from the values present in the data,
# then keep the pairs that have no rows in combination_df. The join keys are
# spelled out so dplyr does not have to guess them (and emit a message).
all_combinations <- expand.grid(
  Sex = unique(dataset$Sex),
  Smoker = unique(dataset$Smoker)
)
missing_combinations <- anti_join(
  all_combinations,
  combination_df,
  by = c("Sex", "Smoker")
)

# Plot combinations: within each Sex value, one dodged bar per Smoker value
ggplot(combination_df, aes(x = factor(Sex), y = n, fill = factor(Smoker))) +
  geom_col(position = "dodge") +
  labs(
    title = "Combinations of Sex and Smoker",
    x = "Sex",
    y = "Count",
    fill = "Smoker"
  )

Insight: The chart shows which Sex–Smoker combinations are most and least common. Any combination that is absent from the data (listed in `missing_combinations`) might indicate cultural or behavioral patterns.

Conclusion:

Based on these insights, females report more days of poor physical health than males, suggesting potential gender differences in health experiences or reporting. Additionally, individuals with high blood pressure tend to have a higher BMI, indicating a possible link between weight and hypertension. The data also show slightly more non-smokers than smokers, which could reflect successful public health initiatives or changing social norms around smoking. These findings highlight areas for further research into health disparities and lifestyle factors.