library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(tibble)
# Load the county-level diabetes/obesity data.
# The file is read by its full path rather than via setwd(), so loading
# the data does not change the session's working directory.
data_dir <- "C:/Users/ogutu/Desktop/Data 101 Project 1"
diabetes <- read.csv(file.path(data_dir, "diabetes.prev.csv"))
Which gender has the higher prevalence of diabetes in Maryland, the District of Columbia, and Virginia, and is that prevalence related to obesity in these states?
Diabetes is a well-known condition that is highly prevalent not only in the US but in other countries too. Although it can be genetically inherited, obesity is known to be a major risk factor for developing diabetes, as noted by the NIH. In 2012, the CDC did research to evaluate the relationship between obesity and diabetes at the state and county level in the US.
This dataset comprises 3,143 observations and 14 variables. These variables include: State, FIPS Codes, County, and the number of men and women with diabetes, obesity, and inactive leisure.
It is important to note that these figures are not exact, because the CDC did not document the population size in the year 2012 and all the data has been rounded to the nearest 100,000.
I will use the following variables to analyze whether the prevalence of diabetes is higher in women than in men, and how it is correlated with obesity, in the states of Maryland, the District of Columbia, and Virginia. In the end, I will visualize this data using a bar graph:
1. percent.women.diabetes 2. percent.men.diabetes 3. State 4. percent.women.obese 5. percent.men.obese
# Inspect the structure of the loaded data: column names, types, and a
# preview of the first values in each column.
diabetes |> str()
## 'data.frame': 3143 obs. of 14 variables:
## $ State : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ FIPS.Codes : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
## $ County : chr "Autauga County" "Baldwin County" "Barbour County" "Bibb County" ...
## $ num.men.diabetes : int 2224 8181 1440 1013 2865 693 1064 5589 1728 1371 ...
## $ percent.men.diabetes : num 12.1 12.4 12.9 11 14 15.3 15.4 13.5 14.4 14.1 ...
## $ num.women.diabetes : int 2336 8017 1505 893 2975 743 1400 6557 2132 1325 ...
## $ percent.women.diabetes : num 11.6 11.3 15.7 11.3 13.9 20.2 16.5 14.2 15.6 13.1 ...
## $ num.men.obese : int 5910 19990 4265 3738 6954 1822 2327 13013 4574 3355 ...
## $ percent.men.obese : num 31.3 29 37.7 40.2 33.5 39.9 33.7 31.5 37.8 33.9 ...
## $ num.women.obese : int 6274 18255 4217 3188 6834 1829 3187 15094 5727 3216 ...
## $ percent.women.obese : num 30.5 24.5 44.5 40 31.3 50.2 37.8 32.5 41.5 31.6 ...
## $ num.men.inactive.leisure : int 4902 15650 3242 2853 5177 1331 2096 12540 3716 2704 ...
## $ num.women.inactive.leisure : int 6406 20450 3587 2877 6952 1387 3175 16930 5301 3520 ...
## $ percent.women.inactive.liesure: num 31.1 27.5 37.9 36.1 31.8 38.1 37.7 36.5 38.4 34.6 ...
# Count the missing values in every column.
diabetes |>
  is.na() |>
  colSums()
## State FIPS.Codes
## 0 0
## County num.men.diabetes
## 0 0
## percent.men.diabetes num.women.diabetes
## 0 0
## percent.women.diabetes num.men.obese
## 0 0
## percent.men.obese num.women.obese
## 0 0
## percent.women.obese num.men.inactive.leisure
## 0 0
## num.women.inactive.leisure percent.women.inactive.liesure
## 0 0
# Per-row diabetes prevalence (percent) for men and women, restricted to
# Maryland, the District of Columbia, and Virginia.
diabetes_prevalence <- diabetes |>
  filter(State %in% c("Maryland", "District of Columbia", "Virginia")) |>
  select(
    Male = percent.men.diabetes,
    Female = percent.women.diabetes,
    State
  )
diabetes_prevalence
## Male Female State
## 1 7.9 8.3 District of Columbia
## 2 11.9 12.4 Maryland
## 3 10.0 8.5 Maryland
## 4 9.8 9.6 Maryland
## 5 10.0 9.0 Maryland
## 6 11.7 11.6 Maryland
## 7 9.2 8.3 Maryland
## 8 10.9 9.8 Maryland
## 9 10.4 9.6 Maryland
## 10 13.7 13.4 Maryland
## 11 9.1 8.4 Maryland
## 12 11.1 11.5 Maryland
## 13 10.3 9.5 Maryland
## 14 8.3 7.0 Maryland
## 15 11.1 10.7 Maryland
## 16 7.7 6.4 Maryland
## 17 11.0 11.2 Maryland
## 18 9.7 8.3 Maryland
## 19 9.7 8.8 Maryland
## 20 10.4 10.3 Maryland
## 21 10.8 9.1 Maryland
## 22 11.2 11.4 Maryland
## 23 11.4 10.6 Maryland
## 24 12.5 11.9 Maryland
## 25 12.1 12.3 Maryland
## 26 13.0 12.8 Virginia
## 27 11.3 10.1 Virginia
## 28 11.1 9.5 Virginia
## 29 11.5 11.0 Virginia
## 30 13.3 12.5 Virginia
## 31 12.8 12.1 Virginia
## 32 5.8 5.0 Virginia
## 33 10.6 9.1 Virginia
## 34 12.4 10.6 Virginia
## 35 10.6 8.7 Virginia
## 36 11.7 11.3 Virginia
## 37 11.4 9.5 Virginia
## 38 12.6 14.4 Virginia
## 39 12.7 11.0 Virginia
## 40 12.1 12.9 Virginia
## 41 10.7 9.8 Virginia
## 42 12.3 11.8 Virginia
## 43 12.7 11.1 Virginia
## 44 14.2 14.4 Virginia
## 45 13.0 12.5 Virginia
## 46 10.4 9.5 Virginia
## 47 11.8 10.0 Virginia
## 48 11.0 9.2 Virginia
## 49 10.6 9.5 Virginia
## 50 13.3 12.9 Virginia
## 51 12.5 11.3 Virginia
## 52 12.7 12.5 Virginia
## 53 13.5 13.0 Virginia
## 54 8.4 7.3 Virginia
## 55 11.2 9.5 Virginia
## 56 12.1 10.6 Virginia
## 57 11.1 9.0 Virginia
## 58 12.4 11.1 Virginia
## 59 9.8 8.2 Virginia
## 60 13.6 12.1 Virginia
## 61 12.7 11.4 Virginia
## 62 11.9 10.6 Virginia
## 63 13.8 12.5 Virginia
## 64 10.4 9.1 Virginia
## 65 10.6 13.8 Virginia
## 66 14.3 14.3 Virginia
## 67 9.6 8.6 Virginia
## 68 9.4 9.4 Virginia
## 69 13.3 13.0 Virginia
## 70 13.2 11.5 Virginia
## 71 13.6 13.1 Virginia
## 72 11.5 9.8 Virginia
## 73 12.6 12.2 Virginia
## 74 11.5 10.3 Virginia
## 75 10.8 9.9 Virginia
## 76 14.8 13.4 Virginia
## 77 11.3 10.7 Virginia
## 78 7.8 6.8 Virginia
## 79 11.8 10.6 Virginia
## 80 12.7 14.2 Virginia
## 81 11.5 10.0 Virginia
## 82 13.9 12.2 Virginia
## 83 12.5 13.1 Virginia
## 84 13.4 12.0 Virginia
## 85 7.1 6.6 Virginia
## 86 12.7 11.6 Virginia
## 87 11.1 10.1 Virginia
## 88 14.9 15.4 Virginia
## 89 14.3 12.6 Virginia
## 90 11.1 12.3 Virginia
## 91 12.4 11.2 Virginia
## 92 11.2 9.8 Virginia
## 93 12.9 11.4 Virginia
## 94 13.3 11.9 Virginia
## 95 10.7 9.6 Virginia
## 96 10.0 10.2 Virginia
## 97 12.2 13.1 Virginia
## 98 7.8 7.1 Virginia
## 99 11.8 10.7 Virginia
## 100 12.4 10.2 Virginia
## 101 11.0 12.8 Virginia
## 102 11.0 10.0 Virginia
## 103 13.7 12.2 Virginia
## 104 10.5 9.0 Virginia
## 105 12.9 12.1 Virginia
## 106 11.2 9.9 Virginia
## 107 12.7 11.2 Virginia
## 108 11.1 9.5 Virginia
## 109 14.2 13.8 Virginia
## 110 10.7 9.2 Virginia
## 111 8.2 7.4 Virginia
## 112 13.5 13.5 Virginia
## 113 11.7 15.4 Virginia
## 114 12.4 12.0 Virginia
## 115 12.1 11.0 Virginia
## 116 13.2 11.5 Virginia
## 117 13.2 12.4 Virginia
## 118 13.2 13.4 Virginia
## 119 12.2 10.8 Virginia
## 120 9.8 8.8 Virginia
## 121 6.7 6.0 Virginia
## 122 13.2 12.5 Virginia
## 123 11.9 10.9 Virginia
## 124 10.6 9.7 Virginia
## 125 7.3 7.1 Virginia
## 126 10.3 9.2 Virginia
## 127 11.7 10.9 Virginia
## 128 11.9 11.1 Virginia
## 129 12.3 12.5 Virginia
## 130 12.8 13.4 Virginia
## 131 9.2 8.5 Virginia
## 132 9.9 9.0 Virginia
## 133 12.3 12.3 Virginia
## 134 9.2 8.8 Virginia
## 135 11.2 10.5 Virginia
## 136 11.7 12.2 Virginia
## 137 7.6 7.0 Virginia
## 138 12.6 13.0 Virginia
## 139 8.4 10.0 Virginia
## 140 10.9 10.9 Virginia
## 141 8.9 8.5 Virginia
## 142 8.4 8.2 Virginia
## 143 13.4 13.5 Virginia
## 144 10.8 11.6 Virginia
## 145 9.0 10.5 Virginia
## 146 10.4 9.5 Virginia
## 147 14.2 16.7 Virginia
## 148 11.3 9.7 Virginia
## 149 13.2 14.3 Virginia
## 150 7.7 7.3 Virginia
## 151 9.1 10.0 Virginia
## 152 10.8 10.8 Virginia
## 153 11.5 10.3 Virginia
## 154 12.0 10.9 Virginia
## 155 12.1 11.8 Virginia
## 156 9.2 8.8 Virginia
## 157 11.4 10.5 Virginia
## 158 9.9 9.1 Virginia
## 159 10.4 9.6 Virginia
# Unweighted mean of the row-level diabetes percentages per state, for
# women and for men, in the three states of interest.
diabetes_prevalence_mean <- diabetes |>
  filter(State %in% c("Maryland", "District of Columbia", "Virginia")) |>
  group_by(State) |>
  summarise(
    mean_women = mean(percent.women.diabetes),
    mean_men = mean(percent.men.diabetes)
  )
print(diabetes_prevalence_mean)
## # A tibble: 3 × 3
## State mean_women mean_men
## <chr> <dbl> <dbl>
## 1 District of Columbia 8.3 7.9
## 2 Maryland 9.98 10.6
## 3 Virginia 10.9 11.5
# Unweighted mean of the row-level obesity percentages per state, for
# women and for men, in the three states of interest.
#
# Rows are selected on State only. The percent columns must not appear as
# terms in the logical filter: non-zero numerics coerce to TRUE, and `&`
# binds tighter than `|`, so such terms would apply to one state's rows
# only and would silently drop any row whose percentage is exactly 0.
diabetes_average_obese <- diabetes |>
  filter(State %in% c("Maryland", "District of Columbia", "Virginia")) |>
  group_by(State) |>
  summarize(
    mean_women_obese = mean(percent.women.obese),
    mean_men_obese = mean(percent.men.obese)
  )
print(diabetes_average_obese)
## # A tibble: 3 × 3
## State mean_women_obese mean_men_obese
## <chr> <dbl> <dbl>
## 1 District of Columbia 25 18.9
## 2 Maryland 29.8 30.4
## 3 Virginia 29.0 30.5
# Per-state means restricted to the three states of interest.
diabetes_prevalence_by_state <- diabetes_prevalence_mean |>
  filter(State %in% c("Maryland", "District of Columbia", "Virginia"))
diabetes_average_obese_by_state <- diabetes_average_obese |>
  filter(State %in% c("Maryland", "District of Columbia", "Virginia"))

# Combine the prevalence and obesity summaries into one table.
# The two tables are matched on State rather than by row position, so a
# re-ordered or missing state cannot pair a prevalence value with another
# state's obesity value. `relationship = "one-to-one"` makes the join fail
# loudly if either table has more than one row per state.
summary_stats <- diabetes_prevalence_by_state |>
  inner_join(
    diabetes_average_obese_by_state,
    by = "State",
    relationship = "one-to-one"
  ) |>
  select(
    Prevalence_Men = mean_men,
    Prevalence_Women = mean_women,
    Obesity_Men = mean_men_obese,
    Obesity_Women = mean_women_obese,
    State
  ) |>
  as.data.frame()
summary_stats
## Prevalence_Men Prevalence_Women Obesity_Men Obesity_Women
## 1 7.90000 8.300000 18.90000 25.0000
## 2 10.58333 9.983333 30.37917 29.8500
## 3 11.48433 10.858955 30.46119 28.9791
## State
## 1 District of Columbia
## 2 Maryland
## 3 Virginia
# Show the first rows (up to six) of the combined summary table.
summary_stats |> head(n = 6L)
## Prevalence_Men Prevalence_Women Obesity_Men Obesity_Women
## 1 7.90000 8.300000 18.90000 25.0000
## 2 10.58333 9.983333 30.37917 29.8500
## 3 11.48433 10.858955 30.46119 28.9791
## State
## 1 District of Columbia
## 2 Maryland
## 3 Virginia
# Show the last rows (up to six) of the combined summary table.
summary_stats |> tail(n = 6L)
## Prevalence_Men Prevalence_Women Obesity_Men Obesity_Women
## 1 7.90000 8.300000 18.90000 25.0000
## 2 10.58333 9.983333 30.37917 29.8500
## 3 11.48433 10.858955 30.46119 28.9791
## State
## 1 District of Columbia
## 2 Maryland
## 3 Virginia
# Grouped bar chart of mean diabetes prevalence and mean obesity by gender
# for each state in summary_stats.
#
# The four measure columns are reshaped to long form so that one
# geom_col() with position_dodge() places the bars side by side. Nudging
# four separate layers by hand put bars of width 0.2 only 0.16-0.17 apart,
# which made neighbouring bars overlap.

# Column name -> legend label, listed in left-to-right bar order.
measure_labels <- c(
  Prevalence_Women = "Prevalence Women",
  Prevalence_Men = "Prevalence Men",
  Obesity_Women = "Obesity Women",
  Obesity_Men = "Obesity Men"
)

# Legend label -> fill colour.
measure_colours <- c(
  "Prevalence Women" = "#FF9999",
  "Prevalence Men" = "#FF6666",
  "Obesity Women" = "#99CCFF",
  "Obesity Men" = "#6699FF"
)

# Shared dodge so the bars and their value labels line up.
bar_dodge <- position_dodge(width = 0.8)

summary_stats |>
  pivot_longer(
    cols = all_of(names(measure_labels)),
    names_to = "Measure",
    values_to = "Percentage"
  ) |>
  mutate(
    # A factor fixes both the bar order within a state and the legend order.
    Measure = factor(
      Measure,
      levels = names(measure_labels),
      labels = unname(measure_labels)
    )
  ) |>
  ggplot(aes(x = State, y = Percentage, fill = Measure)) +
  geom_col(position = bar_dodge, width = 0.75) +
  # Value labels sit 1.2 percentage points above the top of each bar.
  geom_text(
    aes(
      y = Percentage + 1.2,
      label = round(Percentage, 1),
      group = Measure
    ),
    position = bar_dodge,
    size = 3
  ) +
  scale_fill_manual(values = measure_colours) +
  labs(
    title = "Diabetes Prevalence and Obesity by Gender in Selected States",
    x = "State",
    y = "Percentage",
    fill = "Measure and Gender"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 15, hjust = 1),
    legend.position = "top"
  )
Based on my analysis of this data, men in Maryland and Virginia have both a higher percentage of obesity and a higher prevalence of diabetes than women, whereas in the District of Columbia, obesity is higher in women, and women also have a higher prevalence of diabetes.
Through my analysis, it appears that gender may be a risk factor for the prevalence of diabetes and obesity, and that socioeconomic factors, such as whether residents of a state have easier access to fast food or to nutritious food, may also play a role. For future research, I would suggest that the CDC look into the race or ethnicity and the socioeconomic status of individuals to get more specific data.