state_21_working_pudf <- read_sas("~/Every Texan/R/ADA/state_21_working_pudf.sas7bdat",
NULL)
view(state_21_working_pudf)Diabetes and Obesity Rates by Texas County 2021
Load BRFSS data.
Next, recode diabetes and obesity variables and filter to show the percentage of people that are diabetic and obese by county.
# Recode diabetes and obesity variables
brfss_2021 <- state_21_working_pudf %>%
mutate(
diabetic = recode(diabetes, "1"="Yes", "2"="No"),
obese = recode(bmi30, "1"="No", "2"="Yes")
) %>%
select(CountyName, diabetic, obese)
# Group by county and calculate the percentage of people who are diabetic and obese
diabetic_obese_by_county <- brfss_2021 %>%
group_by(CountyName) %>%
summarize(
across(c(diabetic, obese), ~ mean(. == "Yes"), .names = "{.col}_pct")
)
# View the results
print(diabetic_obese_by_county)# A tibble: 126 × 3
CountyName diabetic_pct obese_pct
<chr> <dbl> <dbl>
1 "" NA NA
2 "Angelina" NA NA
3 "Aransas" NA 0.6
4 "Atascosa" 0 1
5 "Austin" 0 0.25
6 "Bailey" 0 1
7 "Bastrop" 0.162 NA
8 "Bee" 0 0.5
9 "Bell" 0.148 NA
10 "Bexar" NA NA
# … with 116 more rows
Create lists showing the counties with the highest rates.
# Group by county and calculate the percentage of people who are diabetic
diabetic_by_county <- brfss_2021 %>%
group_by(CountyName) %>%
summarize(diabetic_pct = mean(diabetic == "Yes")) %>%
arrange(desc(diabetic_pct))
# View the results
print(diabetic_by_county)# A tibble: 126 × 2
CountyName diabetic_pct
<chr> <dbl>
1 Childress 1
2 Clay 1
3 Kleberg 1
4 Lavaca 1
5 San Jacinto 1
6 Washington 1
7 Fannin 0.5
8 Hutchinson 0.5
9 Webb 0.5
10 Bowie 0.4
# … with 116 more rows
# Group by county and calculate the percentage of people who are obese
obese_by_county <- brfss_2021 %>%
group_by(CountyName) %>%
summarize(obese_pct = mean(obese == "Yes")) %>%
arrange(desc(obese_pct))
# View the results
print(obese_by_county)# A tibble: 126 × 2
CountyName obese_pct
<chr> <dbl>
1 Atascosa 1
2 Bailey 1
3 Blanco 1
4 Cass 1
5 Clay 1
6 Crosby 1
7 Fannin 1
8 Hemphill 1
9 Jack 1
10 Oldham 1
# … with 116 more rows
Inspect the new data frame.
# View basic summary statistics
summary(diabetic_obese_by_county) CountyName diabetic_pct obese_pct
Length:126 Min. :0.0000 Min. :0.0000
Class :character 1st Qu.:0.0000 1st Qu.:0.0000
Mode :character Median :0.0000 Median :0.2500
Mean :0.1307 Mean :0.3919
3rd Qu.:0.1940 3rd Qu.:0.9643
Max. :1.0000 Max. :1.0000
NA's :13 NA's :48
# View min and max values for each column
range(diabetic_obese_by_county$diabetic_pct, na.rm = TRUE)[1] 0 1
range(diabetic_obese_by_county$obese_pct, na.rm = TRUE)[1] 0 1
It looks like there are multiple counties where 100% of surveyed adults are obese or have diabetes. Let’s see how many people from each county are in our sample.
# Group the data by county and count the number of observations in each group
n_people <- brfss_2021 %>%
group_by(CountyName) %>%
summarize(n_people_denominator = n())
# View the results
print(n_people)# A tibble: 126 × 2
CountyName n_people_denominator
<chr> <int>
1 "" 2182
2 "Angelina" 452
3 "Aransas" 5
4 "Atascosa" 2
5 "Austin" 4
6 "Bailey" 1
7 "Bastrop" 68
8 "Bee" 2
9 "Bell" 398
10 "Bexar" 430
# … with 116 more rows
We can see why there are so many counties where 100% of surveyed adults are obese – it’s because many counties only had 1 or 2 people surveyed! Let’s add the n_people_denominator variable to the diabetic_obese_by_county data frame so these numbers can be referenced in the final data set.
# Join the two data frames by CountyName
diabetic_obese_by_county <- left_join(diabetic_obese_by_county, n_people, by = "CountyName")Check to see which counties have missing values.
# Get list of counties with NA percentages for diabetes or obesity
na_counties <- diabetic_obese_by_county %>%
filter(is.na(diabetic_pct) | is.na(obese_pct)) %>%
pull(CountyName)
# Check if any counties have NA percentages
if (length(na_counties) == 0) {
cat("There are no counties with NA percentages.")
} else {
cat("Counties with NA percentages:\n")
print(na_counties)
}Counties with NA percentages:
[1] "" "Angelina" "Aransas" "Bastrop" "Bell"
[6] "Bexar" "Brazoria" "Cameron" "Collin" "Comal"
[11] "Coryell" "Dallas" "Deaf Smith" "Denton" "Ector"
[16] "El Paso" "Ellis" "Fort Bend" "Galveston" "Grayson"
[21] "Gregg" "Hardin" "Harris" "Hays" "Hidalgo"
[26] "Hood" "Howard" "Hunt" "Jefferson" "Kaufman"
[31] "Kendall" "Lavaca" "Liberty" "Lubbock" "McLennan"
[36] "Midland" "Montgomery" "Newton" "Nueces" "Orange"
[41] "Potter" "Randall" "San Patricio" "Tarrant" "Taylor"
[46] "Tom Green" "Travis" "Webb" "Williamson"
Export data into Excel spreadsheet.
# Export the data frame to an Excel file
write_xlsx(diabetic_obese_by_county, "diabetes_obesity_by_county_2021.xlsx")