Diabetes and Obesity Rates by Texas County 2021

Author

Kaitlan Wong

Load BRFSS data.
state_21_working_pudf <- read_sas("~/Every Texan/R/ADA/state_21_working_pudf.sas7bdat", 
    NULL)

view(state_21_working_pudf)
Next, recode diabetes and obesity variables and filter to show the percentage of people that are diabetic and obese by county.
# Recode diabetes and obesity variables
 brfss_2021 <- state_21_working_pudf %>%
   mutate(
     diabetic = recode(diabetes, "1"="Yes", "2"="No"),
     obese = recode(bmi30, "1"="No", "2"="Yes")
     ) %>%
   select(CountyName,  diabetic, obese) 
 
# Group by county and calculate the percentage of people who are diabetic and obese
diabetic_obese_by_county <- brfss_2021 %>%
  group_by(CountyName) %>%
  summarize(
    across(c(diabetic, obese), ~ mean(. == "Yes"), .names = "{.col}_pct")
  )

# View the results
print(diabetic_obese_by_county)
# A tibble: 126 × 3
   CountyName diabetic_pct obese_pct
   <chr>             <dbl>     <dbl>
 1 ""               NA         NA   
 2 "Angelina"       NA         NA   
 3 "Aransas"        NA          0.6 
 4 "Atascosa"        0          1   
 5 "Austin"          0          0.25
 6 "Bailey"          0          1   
 7 "Bastrop"         0.162     NA   
 8 "Bee"             0          0.5 
 9 "Bell"            0.148     NA   
10 "Bexar"          NA         NA   
# … with 116 more rows
Create lists showing the counties with the highest rates.
# Group by county and calculate the percentage of people who are diabetic
diabetic_by_county <- brfss_2021 %>%
  group_by(CountyName) %>%
  summarize(diabetic_pct = mean(diabetic == "Yes")) %>%
  arrange(desc(diabetic_pct))

# View the results
print(diabetic_by_county)
# A tibble: 126 × 2
   CountyName  diabetic_pct
   <chr>              <dbl>
 1 Childress            1  
 2 Clay                 1  
 3 Kleberg              1  
 4 Lavaca               1  
 5 San Jacinto          1  
 6 Washington           1  
 7 Fannin               0.5
 8 Hutchinson           0.5
 9 Webb                 0.5
10 Bowie                0.4
# … with 116 more rows
# Group by county and calculate the percentage of people who are obese
obese_by_county <- brfss_2021 %>%
  group_by(CountyName) %>%
  summarize(obese_pct = mean(obese == "Yes")) %>%
  arrange(desc(obese_pct))

# View the results
print(obese_by_county)
# A tibble: 126 × 2
   CountyName obese_pct
   <chr>          <dbl>
 1 Atascosa           1
 2 Bailey             1
 3 Blanco             1
 4 Cass               1
 5 Clay               1
 6 Crosby             1
 7 Fannin             1
 8 Hemphill           1
 9 Jack               1
10 Oldham             1
# … with 116 more rows
Inspect the new data frame.
# View basic summary statistics
summary(diabetic_obese_by_county)
  CountyName         diabetic_pct      obese_pct     
 Length:126         Min.   :0.0000   Min.   :0.0000  
 Class :character   1st Qu.:0.0000   1st Qu.:0.0000  
 Mode  :character   Median :0.0000   Median :0.2500  
                    Mean   :0.1307   Mean   :0.3919  
                    3rd Qu.:0.1940   3rd Qu.:0.9643  
                    Max.   :1.0000   Max.   :1.0000  
                    NA's   :13       NA's   :48      
# View min and max values for each column
range(diabetic_obese_by_county$diabetic_pct, na.rm = TRUE)
[1] 0 1
range(diabetic_obese_by_county$obese_pct, na.rm = TRUE)
[1] 0 1
It looks like there are multiple counties where 100% of surveyed adults are obese or have diabetes. Let’s see how many people from each county are in our sample.
# Group the data by county and count the number of observations in each group
n_people <- brfss_2021 %>%
  group_by(CountyName) %>%
  summarize(n_people_denominator = n())

# View the results
print(n_people)
# A tibble: 126 × 2
   CountyName n_people_denominator
   <chr>                     <int>
 1 ""                         2182
 2 "Angelina"                  452
 3 "Aransas"                     5
 4 "Atascosa"                    2
 5 "Austin"                      4
 6 "Bailey"                      1
 7 "Bastrop"                    68
 8 "Bee"                         2
 9 "Bell"                      398
10 "Bexar"                     430
# … with 116 more rows
We can see why there are so many counties where 100% of surveyed adults are obese – it’s because many counties only had 1 or 2 people surveyed! Let’s add the n_people_denominator variable to the diabetic_obese_by_county data frame so these numbers can be referenced in the final data set.
# Join the two data frames by CountyName
diabetic_obese_by_county <- left_join(diabetic_obese_by_county, n_people, by = "CountyName")
Check to see which counties have missing values.
# Get list of counties with NA percentages for diabetes or obesity
na_counties <- diabetic_obese_by_county %>%
  filter(is.na(diabetic_pct) | is.na(obese_pct)) %>%
  pull(CountyName)

# Check if any counties have NA percentages
if (length(na_counties) == 0) {
  cat("There are no counties with NA percentages.")
} else {
  cat("Counties with NA percentages:\n")
  print(na_counties)
}
Counties with NA percentages:
 [1] ""             "Angelina"     "Aransas"      "Bastrop"      "Bell"        
 [6] "Bexar"        "Brazoria"     "Cameron"      "Collin"       "Comal"       
[11] "Coryell"      "Dallas"       "Deaf Smith"   "Denton"       "Ector"       
[16] "El Paso"      "Ellis"        "Fort Bend"    "Galveston"    "Grayson"     
[21] "Gregg"        "Hardin"       "Harris"       "Hays"         "Hidalgo"     
[26] "Hood"         "Howard"       "Hunt"         "Jefferson"    "Kaufman"     
[31] "Kendall"      "Lavaca"       "Liberty"      "Lubbock"      "McLennan"    
[36] "Midland"      "Montgomery"   "Newton"       "Nueces"       "Orange"      
[41] "Potter"       "Randall"      "San Patricio" "Tarrant"      "Taylor"      
[46] "Tom Green"    "Travis"       "Webb"         "Williamson"  
Export data into Excel spreadsheet.
# Export the data frame to an Excel file
write_xlsx(diabetic_obese_by_county, "diabetes_obesity_by_county_2021.xlsx")