Healthy Cities GIS Assignment

Author

Ryan Seabold

Load the libraries and set the working directory

library(tidyverse)
library(tidyr)
# Point the session at the assignment folder so the CSV below can be read with
# a relative file name.
# NOTE(review): this absolute path only exists on the author's machine; an
# RStudio project or here::here() would make the report portable - confirm
# before changing, since later chunks may rely on the working directory.
setwd("C:/Users/ryan/OneDrive/School/DATA 110/Homework/GIS Homework")

# Read the CDC 500 Cities export into a tibble named cities500.
# No data() call is needed afterwards: data() only loads datasets bundled with
# packages, and calling it on an object created by read_csv() just raises a
# "data set 'cities500' not found" warning.
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")

The GeoLocation variable has (lat, long) format

Split GeoLocation (lat, long) into two columns: lat and long

# Strip the surrounding parentheses from GeoLocation, then break the remaining
# text at the comma into lat and long columns (convert = TRUE lets separate()
# turn the two pieces into numbers)
latlong <- cities500 %>%
  mutate(
    GeoLocation = str_replace_all(
      GeoLocation,
      pattern = "[()]",
      replacement = ""
    )
  ) %>%
  separate(
    col = GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
head(latlong)
# A tibble: 6 × 25
   Year StateAbbr StateDesc  CityName  GeographicLevel DataSource Category      
  <dbl> <chr>     <chr>      <chr>     <chr>           <chr>      <chr>         
1  2017 CA        California Hawthorne Census Tract    BRFSS      Health Outcom…
2  2017 CA        California Hawthorne City            BRFSS      Unhealthy Beh…
3  2017 CA        California Hayward   City            BRFSS      Health Outcom…
4  2017 CA        California Hayward   City            BRFSS      Unhealthy Beh…
5  2017 CA        California Hemet     City            BRFSS      Prevention    
6  2017 CA        California Indio     Census Tract    BRFSS      Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
#   DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
#   Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
#   Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
#   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

Filter the dataset

Remove the rows where StateDesc is "United States", keep only the Prevention category (the category of interest), keep only crude-prevalence measurements, and keep only the year 2017.

# Keep only state-level rows (drop the national "United States" rows) that are
# in the Prevention category, reported as crude prevalence, for 2017
latlong_clean <- latlong %>%
  filter(
    StateDesc != "United States",
    Category == "Prevention",
    Data_Value_Type == "Crude prevalence",
    Year == 2017
  )
head(latlong_clean)
# A tibble: 6 × 25
   Year StateAbbr StateDesc  CityName   GeographicLevel DataSource Category  
  <dbl> <chr>     <chr>      <chr>      <chr>           <chr>      <chr>     
1  2017 AL        Alabama    Montgomery City            BRFSS      Prevention
2  2017 CA        California Concord    City            BRFSS      Prevention
3  2017 CA        California Concord    City            BRFSS      Prevention
4  2017 CA        California Fontana    City            BRFSS      Prevention
5  2017 CA        California Richmond   Census Tract    BRFSS      Prevention
6  2017 FL        Florida    Davie      Census Tract    BRFSS      Prevention
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
#   DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
#   Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
#   Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
#   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

What variables are included? (can any of them be removed?)

colnames(latlong_clean)
 [1] "Year"                       "StateAbbr"                 
 [3] "StateDesc"                  "CityName"                  
 [5] "GeographicLevel"            "DataSource"                
 [7] "Category"                   "UniqueID"                  
 [9] "Measure"                    "Data_Value_Unit"           
[11] "DataValueTypeID"            "Data_Value_Type"           
[13] "Data_Value"                 "Low_Confidence_Limit"      
[15] "High_Confidence_Limit"      "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote"        "PopulationCount"           
[19] "lat"                        "long"                      
[21] "CategoryID"                 "MeasureId"                 
[23] "CityFIPS"                   "TractFIPS"                 
[25] "Short_Question_Text"       

Remove the variables that will not be used in the assignment

# Drop the source, unit, value-type id, confidence-limit and footnote columns
prevention <- latlong_clean %>%
  select(
    -c(
      DataSource,
      Data_Value_Unit,
      DataValueTypeID,
      Low_Confidence_Limit,
      High_Confidence_Limit,
      Data_Value_Footnote_Symbol,
      Data_Value_Footnote
    )
  )
head(prevention)
# A tibble: 6 × 18
   Year StateAbbr StateDesc  CityName  GeographicLevel Category UniqueID Measure
  <dbl> <chr>     <chr>      <chr>     <chr>           <chr>    <chr>    <chr>  
1  2017 AL        Alabama    Montgome… City            Prevent… 151000   Choles…
2  2017 CA        California Concord   City            Prevent… 616000   Visits…
3  2017 CA        California Concord   City            Prevent… 616000   Choles…
4  2017 CA        California Fontana   City            Prevent… 624680   Visits…
5  2017 CA        California Richmond  Census Tract    Prevent… 0660620… Choles…
6  2017 FL        Florida    Davie     Census Tract    Prevent… 1216475… Choles…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
#   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Maryland-only subset of the prevention data
md <- filter(prevention, StateAbbr == "MD")
head(md)
# A tibble: 6 × 18
   Year StateAbbr StateDesc CityName  GeographicLevel Category  UniqueID Measure
  <dbl> <chr>     <chr>     <chr>     <chr>           <chr>     <chr>    <chr>  
1  2017 MD        Maryland  Baltimore Census Tract    Preventi… 2404000… "Chole…
2  2017 MD        Maryland  Baltimore Census Tract    Preventi… 2404000… "Visit…
3  2017 MD        Maryland  Baltimore Census Tract    Preventi… 2404000… "Visit…
4  2017 MD        Maryland  Baltimore Census Tract    Preventi… 2404000… "Curre…
5  2017 MD        Maryland  Baltimore Census Tract    Preventi… 2404000… "Curre…
6  2017 MD        Maryland  Baltimore Census Tract    Preventi… 2404000… "Visit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
#   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
unique(md[["CityName"]])
[1] "Baltimore"

The new dataset “Prevention” is a manageable dataset now.

For your assignment, work with a cleaned dataset.

1. Once you run the above code, filter this dataset one more time for any particular subset with no more than 900 observations.

Filter chunk here

# Year, Category, Data_Value_Type and CategoryID were all filtered to a single
# value earlier, so they carry no information any more - drop them
prevention <- prevention |>
  select(-c(Year, Category, Data_Value_Type, CategoryID))
head(prevention)
# A tibble: 6 × 14
  StateAbbr StateDesc  CityName   GeographicLevel UniqueID    Measure Data_Value
  <chr>     <chr>      <chr>      <chr>           <chr>       <chr>        <dbl>
1 AL        Alabama    Montgomery City            151000      Choles…       80.2
2 CA        California Concord    City            616000      Visits…       64.5
3 CA        California Concord    City            616000      Choles…       80.1
4 CA        California Fontana    City            624680      Visits…       66.1
5 CA        California Richmond   Census Tract    0660620-06… Choles…       81.8
6 FL        Florida    Davie      Census Tract    1216475-12… Choles…       84.5
# ℹ 7 more variables: PopulationCount <dbl>, lat <dbl>, long <dbl>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Keep only the Virginia rows for the "Health Insurance" question
va <- prevention |>
  filter(StateAbbr == "VA" & Short_Question_Text == "Health Insurance")
head(va)
# A tibble: 6 × 14
  StateAbbr StateDesc CityName     GeographicLevel UniqueID   Measure Data_Value
  <chr>     <chr>     <chr>        <chr>           <chr>      <chr>        <dbl>
1 VA        Virginia  Lynchburg    Census Tract    5147672-5… "Curre…       16.6
2 VA        Virginia  Norfolk      Census Tract    5157000-5… "Curre…        8.7
3 VA        Virginia  Portsmouth   Census Tract    5164000-5… "Curre…       17.4
4 VA        Virginia  Alexandria   Census Tract    5101000-5… "Curre…       29.9
5 VA        Virginia  Newport News Census Tract    5156000-5… "Curre…       24.3
6 VA        Virginia  Newport News Census Tract    5156000-5… "Curre…       24.5
# ℹ 7 more variables: PopulationCount <dbl>, lat <dbl>, long <dbl>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# State and measure columns are constant after the filter above, so drop them
va <- va |>
  select(-c(StateAbbr, StateDesc, Measure, MeasureId, Short_Question_Text))
head(va)
# A tibble: 6 × 9
  CityName     GeographicLevel UniqueID   Data_Value PopulationCount   lat  long
  <chr>        <chr>           <chr>           <dbl>           <dbl> <dbl> <dbl>
1 Lynchburg    Census Tract    5147672-5…       16.6            2115  37.4 -79.2
2 Norfolk      Census Tract    5157000-5…        8.7            1357  36.9 -76.3
3 Portsmouth   Census Tract    5164000-5…       17.4            4844  36.9 -76.4
4 Alexandria   Census Tract    5101000-5…       29.9            7165  38.8 -77.1
5 Newport News Census Tract    5156000-5…       24.3            2560  37.0 -76.4
6 Newport News Census Tract    5156000-5…       24.5            2138  37.0 -76.4
# ℹ 2 more variables: CityFIPS <dbl>, TractFIPS <dbl>

2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.

First plot chunk here

# Keep the city-level rows (not census tracts) and drop the columns that only
# distinguish tracts from cities
va_cities <- va |>
  filter(GeographicLevel == "City") |>
  select(-c(GeographicLevel, TractFIPS))

# Import necessary library
library(scales)

Attaching package: 'scales'
The following object is masked from 'package:purrr':

    discard
The following object is masked from 'package:readr':

    col_factor
# Scatterplot of city population (x) against the percentage of adults without
# health insurance (y) for Virginia cities only (not census tracts), with a
# linear-model trend line.
va_scatterplot <- va_cities %>%
  ggplot(aes(x = PopulationCount,
             y = Data_Value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # Use an explicit "\n" for the title line break: a string literal that is
  # wrapped across source lines also captures the code indentation, which
  # shows up as leading spaces on the second title line.
  labs(title = "Percentage of Virginia Adults Without\nHealth Insurance by City Population",
       x = "City Population",
       y = "Percentage of Adults Without Health Insurance",
       caption = "Source: Behavioral Risk Factor Surveillance System - CDC") +
  theme_minimal() +
  scale_x_continuous(labels = comma) # Remove scientific notation on x-axis
va_scatterplot
`geom_smooth()` using formula = 'y ~ x'

# City-level "Health Insurance" rows for every state, minus the tract and
# measure columns that are not needed for plotting
us_cities <- prevention |>
  filter(GeographicLevel == "City" & Short_Question_Text == "Health Insurance") |>
  select(-c(GeographicLevel, TractFIPS, Measure, MeasureId))

# Same scatterplot as the Virginia one, but for every city in the dataset, so
# Virginia can be compared with the rest of the United States.
us_scatterplot <- us_cities %>%
  ggplot(aes(x = PopulationCount,
             y = Data_Value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # Use an explicit "\n" for the title line break: a string literal that is
  # wrapped across source lines also captures the code indentation, which
  # shows up as leading spaces on the second title line.
  labs(title = "Percentage of American Adults Without\nHealth Insurance by City Population",
       x = "City Population",
       y = "Percentage of Adults Without Health Insurance",
       caption = "Source: Behavioral Risk Factor Surveillance System - CDC") +
  theme_minimal() +
  scale_x_continuous(labels = comma) # Remove scientific notation on x-axis
us_scatterplot
`geom_smooth()` using formula = 'y ~ x'

# Repeat the national scatterplot, limited to cities with a population between
# 75,000 and 450,000 (inclusive) so the comparison covers cities of roughly the
# same size as the Virginia ones.
us_scatterplot2 <- us_cities %>%
  filter(PopulationCount <= 450000 & PopulationCount >= 75000) %>%
  ggplot(aes(x = PopulationCount,
             y = Data_Value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  # Use an explicit "\n" for the title line break: a string literal that is
  # wrapped across source lines also captures the code indentation, which
  # shows up as leading spaces on the second title line.
  labs(title = "Percentage of American Adults Without\nHealth Insurance by City Population",
       x = "City Population",
       y = "Percentage of Adults Without Health Insurance",
       caption = "Source: Behavioral Risk Factor Surveillance System - CDC") +
  theme_minimal() +
  scale_x_continuous(labels = comma) # Remove scientific notation on x-axis
us_scatterplot2
`geom_smooth()` using formula = 'y ~ x'

3. Now create a map of your subsetted dataset.

First map chunk here

# Import necessary libraries
library(maps)

Attaching package: 'maps'
The following object is masked from 'package:purrr':

    map
# Pull the outline polygon for Virginia from the "state" map
va_map_data <- map_data(map = "state", region = "virginia")

# Build the static map: the state outline goes down first, then one point per
# city, sized by population and coloured by the uninsured percentage
va_map <- ggplot() +
  geom_polygon(
    data = va_map_data,
    mapping = aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_point(
    data = va_cities,
    mapping = aes(
      x = long,
      y = lat,
      size = PopulationCount,
      color = Data_Value
    ),
    alpha = 0.7
  ) +
  labs(
    title = "Map of Virginian Cities",
    size = "Population Count",
    color = "Percentage of Adults\nWithout Health Insurance",
    caption = "Source: Behavioral Risk Factor Surveillance System - CDC"
  ) +
  scale_size_continuous(labels = comma) +
  coord_fixed(ratio = 1.3) +
  # Theme tweaks: centre the title, hide the y-axis text and ticks, and shrink
  # the legend text so the legend takes up less room
  theme(
    plot.title = element_text(hjust = 0.5, size = 14),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "right",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    legend.box.margin = margin(0, 0, 0, 0)
  )
va_map

4. Refine your map to include a mouse-click tooltip

Refined map chunk here

# Import necessary library
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
# Convert the static ggplot map into an interactive plotly widget
va_map_interactive <- ggplotly(p = va_map)
va_map_interactive

5. Write a paragraph

The first graph compares city size and percentage of adults without health insurance. In Virginia, larger cities tend to have a population with a lower percentage of adults without health insurance. However, in the United States overall, the trend runs the opposite way, with larger cities performing worse in this category. For both comparisons, though, there was considerable variation in the data and the correlation was not very strong. The second plot, a map, shows the Virginian cities in the dataset, sized by population and colored by percentage of adults without health insurance. Overall, the difference between the highest and lowest percentages is only 4.3 percentage points. The dataset only contained data for 11 Virginia cities in terms of health insurance information.