# Embed the 500 Cities logo image (referenced by its cdc.gov URL) in the report.
knitr::include_graphics("https://www.cdc.gov/places/about/500-cities-2016-2019/images/500-cities-logo.png")

This document uses the 500 Cities dataset from the Centers for Disease Control and Prevention (CDC). It describes how to load and handle a dataset that includes health indicators from various American cities. The dataset contains variables such as population counts, geographic locations, city names, state abbreviations, measurements of health outcomes and behaviors, and data values with confidence bounds.

Load the libraries and set the working directory

library(tidyverse)
library(tidyr)

# Read the 500 Cities CSV; the relative path is resolved against the
# current working directory.
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")

# Inspect the structure: column names, types, and the first few values.
str(cities500)

## spc_tbl_ [810,103 × 24] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Year                      : num [1:810103] 2017 2017 2017 2017 2017 ...
##  $ StateAbbr                 : chr [1:810103] "CA" "CA" "CA" "CA" ...
##  $ StateDesc                 : chr [1:810103] "California" "California" "California" "California" ...
##  $ CityName                  : chr [1:810103] "Hawthorne" "Hawthorne" "Hayward" "Hayward" ...
##  $ GeographicLevel           : chr [1:810103] "Census Tract" "City" "City" "City" ...
##  $ DataSource                : chr [1:810103] "BRFSS" "BRFSS" "BRFSS" "BRFSS" ...
##  $ Category                  : chr [1:810103] "Health Outcomes" "Unhealthy Behaviors" "Health Outcomes" "Unhealthy Behaviors" ...
##  $ UniqueID                  : chr [1:810103] "0632548-06037602504" "632548" "633000" "633000" ...
##  $ Measure                   : chr [1:810103] "Arthritis among adults aged >=18 Years" "Current smoking among adults aged >=18 Years" "Coronary heart disease among adults aged >=18 Years" "Obesity among adults aged >=18 Years" ...
##  $ Data_Value_Unit           : chr [1:810103] "%" "%" "%" "%" ...
##  $ DataValueTypeID           : chr [1:810103] "CrdPrv" "CrdPrv" "AgeAdjPrv" "CrdPrv" ...
##  $ Data_Value_Type           : chr [1:810103] "Crude prevalence" "Crude prevalence" "Age-adjusted prevalence" "Crude prevalence" ...
##  $ Data_Value                : num [1:810103] 14.6 15.4 4.8 24.2 78 22 17.7 6 12.7 82.5 ...
##  $ Low_Confidence_Limit      : num [1:810103] 13.9 15 4.7 24.1 77.6 21.1 17.5 5.8 12 82 ...
##  $ High_Confidence_Limit     : num [1:810103] 15.2 15.9 4.8 24.4 78.3 22.8 17.9 6.2 13.5 83 ...
##  $ Data_Value_Footnote_Symbol: chr [1:810103] NA NA NA NA ...
##  $ Data_Value_Footnote       : chr [1:810103] NA NA NA NA ...
##  $ PopulationCount           : num [1:810103] 4407 84293 144186 144186 78657 ...
##  $ GeoLocation               : chr [1:810103] "(33.905547923, -118.337332298)" "(33.914667701, -118.347667728)" "(37.6329591551, -122.077051051)" "(37.6329591551, -122.077051051)" ...
##  $ CategoryID                : chr [1:810103] "HLTHOUT" "UNHBEH" "HLTHOUT" "UNHBEH" ...
##  $ MeasureId                 : chr [1:810103] "ARTHRITIS" "CSMOKING" "CHD" "OBESITY" ...
##  $ CityFIPS                  : num [1:810103] 632548 632548 633000 633000 633182 ...
##  $ TractFIPS                 : num [1:810103] 6.04e+09 NA NA NA NA ...
##  $ Short_Question_Text       : chr [1:810103] "Arthritis" "Current Smoking" "Coronary Heart Disease" "Obesity" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Year = col_double(),
##   ..   StateAbbr = col_character(),
##   ..   StateDesc = col_character(),
##   ..   CityName = col_character(),
##   ..   GeographicLevel = col_character(),
##   ..   DataSource = col_character(),
##   ..   Category = col_character(),
##   ..   UniqueID = col_character(),
##   ..   Measure = col_character(),
##   ..   Data_Value_Unit = col_character(),
##   ..   DataValueTypeID = col_character(),
##   ..   Data_Value_Type = col_character(),
##   ..   Data_Value = col_double(),
##   ..   Low_Confidence_Limit = col_double(),
##   ..   High_Confidence_Limit = col_double(),
##   ..   Data_Value_Footnote_Symbol = col_character(),
##   ..   Data_Value_Footnote = col_character(),
##   ..   PopulationCount = col_number(),
##   ..   GeoLocation = col_character(),
##   ..   CategoryID = col_character(),
##   ..   MeasureId = col_character(),
##   ..   CityFIPS = col_double(),
##   ..   TractFIPS = col_double(),
##   ..   Short_Question_Text = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

The GeoLocation variable has (lat, long) format

Split GeoLocation (lat, long) into two columns: lat and long

# Strip the surrounding parentheses from GeoLocation, then split the
# remaining "lat, long" text into two columns. convert = TRUE lets
# separate() turn the resulting text into numeric columns.
latlong <- cities500 |>
  mutate(GeoLocation = str_remove_all(GeoLocation, "[()]")) |>
  separate(col = GeoLocation, into = c("lat", "long"), sep = ", ", convert = TRUE)

head(latlong)

## # A tibble: 6 × 25
##    Year StateAbbr StateDesc  CityName  GeographicLevel DataSource Category      
##   <dbl> <chr>     <chr>      <chr>     <chr>           <chr>      <chr>         
## 1  2017 CA        California Hawthorne Census Tract    BRFSS      Health Outcom…
## 2  2017 CA        California Hawthorne City            BRFSS      Unhealthy Beh…
## 3  2017 CA        California Hayward   City            BRFSS      Health Outcom…
## 4  2017 CA        California Hayward   City            BRFSS      Unhealthy Beh…
## 5  2017 CA        California Hemet     City            BRFSS      Prevention    
## 6  2017 CA        California Indio     Census Tract    BRFSS      Health Outcom…
## # ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
## #   DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
## #   Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
## #   Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
## #   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
## #   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

Filter the dataset

Remove the rows whose StateDesc is "United States", keep Prevention as the category of interest, keep only crude prevalence measurements, and keep only the year 2017.

# Keep only the rows that satisfy all four conditions: StateDesc is not
# "United States", the category is Prevention, the value is a crude
# prevalence, and the year is 2017.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",
    Category == "Prevention",
    Data_Value_Type == "Crude prevalence",
    Year == 2017
  )
head(latlong_clean)

## # A tibble: 6 × 25
##    Year StateAbbr StateDesc  CityName   GeographicLevel DataSource Category  
##   <dbl> <chr>     <chr>      <chr>      <chr>           <chr>      <chr>     
## 1  2017 AL        Alabama    Montgomery City            BRFSS      Prevention
## 2  2017 CA        California Concord    City            BRFSS      Prevention
## 3  2017 CA        California Concord    City            BRFSS      Prevention
## 4  2017 CA        California Fontana    City            BRFSS      Prevention
## 5  2017 CA        California Richmond   Census Tract    BRFSS      Prevention
## 6  2017 FL        Florida    Davie      Census Tract    BRFSS      Prevention
## # ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
## #   DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
## #   Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
## #   Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
## #   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
## #   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

What variables are included? (can any of them be removed?)

# List the column names of the filtered data to decide which ones to drop.
names(latlong_clean)

##  [1] "Year"                       "StateAbbr"                 
##  [3] "StateDesc"                  "CityName"                  
##  [5] "GeographicLevel"            "DataSource"                
##  [7] "Category"                   "UniqueID"                  
##  [9] "Measure"                    "Data_Value_Unit"           
## [11] "DataValueTypeID"            "Data_Value_Type"           
## [13] "Data_Value"                 "Low_Confidence_Limit"      
## [15] "High_Confidence_Limit"      "Data_Value_Footnote_Symbol"
## [17] "Data_Value_Footnote"        "PopulationCount"           
## [19] "lat"                        "long"                      
## [21] "CategoryID"                 "MeasureId"                 
## [23] "CityFIPS"                   "TractFIPS"                 
## [25] "Short_Question_Text"

Remove the variables that will not be used in the assignment

# Drop the columns that are not used in the assignment; every other column
# is kept in its original order.
prevention <- latlong_clean |>
  select(!c(
    DataSource,
    Data_Value_Unit,
    DataValueTypeID,
    Low_Confidence_Limit,
    High_Confidence_Limit,
    Data_Value_Footnote_Symbol,
    Data_Value_Footnote
  ))
head(prevention)

## # A tibble: 6 × 18
##    Year StateAbbr StateDesc  CityName  GeographicLevel Category UniqueID Measure
##   <dbl> <chr>     <chr>      <chr>     <chr>           <chr>    <chr>    <chr>  
## 1  2017 AL        Alabama    Montgome… City            Prevent… 151000   Choles…
## 2  2017 CA        California Concord   City            Prevent… 616000   Visits…
## 3  2017 CA        California Concord   City            Prevent… 616000   Choles…
## 4  2017 CA        California Fontana   City            Prevent… 624680   Visits…
## 5  2017 CA        California Richmond  Census Tract    Prevent… 0660620… Choles…
## 6  2017 FL        Florida    Davie     Census Tract    Prevent… 1216475… Choles…
## # ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
## #   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
## #   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

The new dataset, `prevention`, is now a manageable size.

For your assignment, work with the cleaned “Prevention” dataset

1. Once you run the above code, filter this dataset one more time for any particular subset.

Filter chunk here

# Inspect the structure of the cleaned data before choosing a subset.
str(prevention)

## tibble [113,983 × 18] (S3: tbl_df/tbl/data.frame)
##  $ Year               : num [1:113983] 2017 2017 2017 2017 2017 ...
##  $ StateAbbr          : chr [1:113983] "AL" "CA" "CA" "CA" ...
##  $ StateDesc          : chr [1:113983] "Alabama" "California" "California" "California" ...
##  $ CityName           : chr [1:113983] "Montgomery" "Concord" "Concord" "Fontana" ...
##  $ GeographicLevel    : chr [1:113983] "City" "City" "City" "City" ...
##  $ Category           : chr [1:113983] "Prevention" "Prevention" "Prevention" "Prevention" ...
##  $ UniqueID           : chr [1:113983] "151000" "616000" "616000" "624680" ...
##  $ Measure            : chr [1:113983] "Cholesterol screening among adults aged >=18 Years" "Visits to doctor for routine checkup within the past Year among adults aged >=18 Years" "Cholesterol screening among adults aged >=18 Years" "Visits to doctor for routine checkup within the past Year among adults aged >=18 Years" ...
##  $ Data_Value_Type    : chr [1:113983] "Crude prevalence" "Crude prevalence" "Crude prevalence" "Crude prevalence" ...
##  $ Data_Value         : num [1:113983] 80.2 64.5 80.1 66.1 81.8 84.5 73.5 81 79.1 19.7 ...
##  $ PopulationCount    : num [1:113983] 205764 122067 122067 196069 5706 ...
##  $ lat                : num [1:113983] 32.3 38 38 34.1 37.9 ...
##  $ long               : num [1:113983] -86.3 -122 -122 -117.5 -122.3 ...
##  $ CategoryID         : chr [1:113983] "PREVENT" "PREVENT" "PREVENT" "PREVENT" ...
##  $ MeasureId          : chr [1:113983] "CHOLSCREEN" "CHECKUP" "CHOLSCREEN" "CHECKUP" ...
##  $ CityFIPS           : num [1:113983] 151000 616000 616000 624680 660620 ...
##  $ TractFIPS          : num [1:113983] NA NA NA NA 6.01e+09 ...
##  $ Short_Question_Text: chr [1:113983] "Cholesterol Screening" "Annual Checkup" "Cholesterol Screening" "Annual Checkup" ...

# List each distinct prevention measure (short question text) in the data
prevention |>
  distinct(Short_Question_Text)

## # A tibble: 4 × 1
##   Short_Question_Text  
##   <chr>                
## 1 Cholesterol Screening
## 2 Annual Checkup       
## 3 Health Insurance     
## 4 Taking BP Medication

# List each distinct category ID that remains after filtering
prevention |>
  distinct(CategoryID)

## # A tibble: 1 × 1
##   CategoryID
##   <chr>     
## 1 PREVENT

# Keep only the rows for the cholesterol screening measure
cholesterol_subset <- filter(
  prevention,
  Short_Question_Text == "Cholesterol Screening"
)
head(cholesterol_subset)

## # A tibble: 6 × 18
##    Year StateAbbr StateDesc  CityName  GeographicLevel Category UniqueID Measure
##   <dbl> <chr>     <chr>      <chr>     <chr>           <chr>    <chr>    <chr>  
## 1  2017 AL        Alabama    Montgome… City            Prevent… 151000   Choles…
## 2  2017 CA        California Concord   City            Prevent… 616000   Choles…
## 3  2017 CA        California Richmond  Census Tract    Prevent… 0660620… Choles…
## 4  2017 FL        Florida    Davie     Census Tract    Prevent… 1216475… Choles…
## 5  2017 FL        Florida    Hialeah   Census Tract    Prevent… 1230000… Choles…
## 6  2017 FL        Florida    Miami Be… Census Tract    Prevent… 1245025… Choles…
## # ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
## #   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
## #   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.

First plot chunk here

# Keep only the rows that can actually be drawn: the coordinates and the
# prevalence value must be present. The NA check is limited to these columns
# because na.omit() on the whole tibble also discards rows that are NA in
# unrelated columns (for example, city-level rows have TractFIPS = NA).
# lat and long are already numeric (separate(..., convert = TRUE) above), so
# no as.numeric() coercion is needed.
cholesterol_subset <- cholesterol_subset |>
  drop_na(lat, long, Data_Value)

str(cholesterol_subset)

## tibble [27,210 × 18] (S3: tbl_df/tbl/data.frame)
##  $ Year               : num [1:27210] 2017 2017 2017 2017 2017 ...
##  $ StateAbbr          : chr [1:27210] "CA" "FL" "FL" "FL" ...
##  $ StateDesc          : chr [1:27210] "California" "Florida" "Florida" "Florida" ...
##  $ CityName           : chr [1:27210] "Richmond" "Davie" "Hialeah" "Miami Beach" ...
##  $ GeographicLevel    : chr [1:27210] "Census Tract" "Census Tract" "Census Tract" "Census Tract" ...
##  $ Category           : chr [1:27210] "Prevention" "Prevention" "Prevention" "Prevention" ...
##  $ UniqueID           : chr [1:27210] "0660620-06013380000" "1216475-12011070210" "1230000-12086000605" "1245025-12086004403" ...
##  $ Measure            : chr [1:27210] "Cholesterol screening among adults aged >=18 Years" "Cholesterol screening among adults aged >=18 Years" "Cholesterol screening among adults aged >=18 Years" "Cholesterol screening among adults aged >=18 Years" ...
##  $ Data_Value_Type    : chr [1:27210] "Crude prevalence" "Crude prevalence" "Crude prevalence" "Crude prevalence" ...
##  $ Data_Value         : num [1:27210] 81.8 84.5 81 79.1 83.7 75.6 77.1 73.7 83.6 76.8 ...
##  $ PopulationCount    : num [1:27210] 5706 3228 4806 2944 2348 ...
##  $ lat                : num [1:27210] 37.9 26.1 25.9 25.8 37.3 ...
##  $ long               : num [1:27210] -122.3 -80.3 -80.3 -80.1 -122 ...
##  $ CategoryID         : chr [1:27210] "PREVENT" "PREVENT" "PREVENT" "PREVENT" ...
##  $ MeasureId          : chr [1:27210] "CHOLSCREEN" "CHOLSCREEN" "CHOLSCREEN" "CHOLSCREEN" ...
##  $ CityFIPS           : num [1:27210] 660620 1216475 1230000 1245025 669084 ...
##  $ TractFIPS          : num [1:27210] 6.01e+09 1.20e+10 1.21e+10 1.21e+10 6.09e+09 ...
##  $ Short_Question_Text: chr [1:27210] "Cholesterol Screening" "Cholesterol Screening" "Cholesterol Screening" "Cholesterol Screening" ...
##  - attr(*, "na.action")= 'omit' Named int [1:1294] 1 2 7 11 15 16 19 26 30 31 ...
##   ..- attr(*, "names")= chr [1:1294] "1" "2" "7" "11" ...

# Scatter plot of the subset in longitude/latitude space, with each point
# coloured by its cholesterol screening prevalence value.
p1 <- cholesterol_subset |>
  ggplot(mapping = aes(x = long, y = lat, color = Data_Value)) +
  geom_point() +
  theme_classic() +
  labs(
    title = "Cholesterol Screening Prevalence by City",
    x = "Longitude",
    y = "Latitude",
    color = "Prevalence Rate (%)"
  )
print(p1)

Each point on the plot corresponds to one record in the subset (a city or a census tract within a city), placed at its latitude and longitude within the United States.

The color of each point reflects the prevalence rate (%) of cholesterol screening at that location. On ggplot2's default continuous color scale, lighter shades of blue indicate higher prevalence rates and darker shades indicate lower ones.

3. Now create a map of your subsetted dataset.

First map chunk here

library(leaflet)

## Warning: package 'leaflet' was built under R version 4.3.2

# Coordinates used to centre the initial map view on the US
us_lat <- 37.09
us_lon <- -95.71

# Create the map: one circle marker per row of the cholesterol subset
leaflet(cholesterol_subset) |>
  addProviderTiles(providers$OpenStreetMap) |>
  setView(lng = us_lon, lat = us_lat, zoom = 4) |>
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    # Marker radius is the base-10 log of the prevalence value
    radius = ~log10(Data_Value),
    # Label text: city name and prevalence value
    label = ~paste(CityName, ":", Data_Value),
    labelOptions = labelOptions(
      direction = "auto",
      # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned
      noHide = FALSE,
      textOnly = TRUE
    )
  )

4. Refine your map to include a mouseover tooltip

Refined map chunk here

The purpose of the visualizations is to shed light on the geographic distribution and prevalence of health-related behaviors, particularly cholesterol screening, among adults in various cities who are 18 years of age or older.

# Clustered marker map with a mouseover tooltip (label) and a click popup
leaflet(cholesterol_subset) |>
  setView(lng = us_lon, lat = us_lat, zoom = 4) |> # Set the view to the center of the US
  addTiles() |> # Add the default OpenStreetMap map tiles
  addMarkers(
    lng = ~long,
    lat = ~lat,
    # `label` is shown on mouseover; `popup` only opens when a marker is clicked
    label = ~paste0(CityName, ": ", Data_Value, "%"),
    # Data_Value is the screening prevalence (%), not a cholesterol level
    popup = ~paste(
      CityName, UniqueID,
      "Cholesterol screening prevalence (%):", Data_Value
    ),
    clusterOptions = markerClusterOptions()
  )

5. Write a paragraph

In a paragraph, describe the plots you created and what they show.

The points are dispersed over the United States’ geographic map, with a noteworthy concentration of points in the regions that correspond to the Midwest, the East Coast, the Gulf Coast, and the West Coast. The mountain states have fewer points, which might be because of lower population densities or a dearth of data points from these regions.

The color scale shows that different cities have different prevalence rates for cholesterol screening. The difference in hue indicates that some cities have significantly greater rates of cholesterol screening than others.

Particularly in areas with high population densities, there appear to be clusters of data points. This may suggest that access to healthcare facilities and services is better in more populated areas, which could result in higher screening rates.

CDC 500 Cities/PLACES GIS Data Analysis

Mingzhuo Tian

2023-11-08