library(tidyverse)
library(tidyr)
# NOTE(review): setwd() ties this script to one machine's directory layout.
# It is kept so the relative CSV path below still resolves; prefer running
# from a project whose working directory already contains the data file.
setwd("C:/Users/ryan/OneDrive/School/DATA 110/Homework/GIS Homework")

# Read the CDC 500 Cities local health indicators CSV into a tibble.
# The former data(cities500) call was dropped: data() looks up a dataset
# shipped inside a package, so it could only warn that none was found or,
# if one existed, overwrite the tibble that was just read from disk.
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")
Healthy Cities GIS Assignment
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
# Strip the parentheses from GeoLocation, then split it at the comma into
# separate `lat` and `long` columns; convert = TRUE lets separate() parse
# the two new columns as numbers instead of leaving them as text.
latlong <- cities500 |>
  mutate(GeoLocation = str_replace_all(GeoLocation, "[()]", "")) |>
  separate(GeoLocation, into = c("lat", "long"), sep = ",", convert = TRUE)

# Preview the first rows
head(latlong)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Filter the dataset
Remove the rows whose StateDesc is "United States", keep only the Prevention category (the category of interest), keep only crude-prevalence measurements, and keep only the year 2017.
# Keep only the rows of interest. Conditions passed to one filter() call
# are combined with AND, which is equivalent to chaining the filters.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",           # drop the national summary rows
    Category == "Prevention",               # category of interest
    Data_Value_Type == "Crude prevalence",  # crude prevalence only
    Year == 2017
  )

# Preview the first rows
head(latlong_clean)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 AL Alabama Montgomery City BRFSS Prevention
2 2017 CA California Concord City BRFSS Prevention
3 2017 CA California Concord City BRFSS Prevention
4 2017 CA California Fontana City BRFSS Prevention
5 2017 CA California Richmond Census Tract BRFSS Prevention
6 2017 FL Florida Davie Census Tract BRFSS Prevention
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
What variables are included? (can any of them be removed?)
# Print the column names of the filtered data to decide which can be dropped
names(latlong_clean)
[1] "Year" "StateAbbr"
[3] "StateDesc" "CityName"
[5] "GeographicLevel" "DataSource"
[7] "Category" "UniqueID"
[9] "Measure" "Data_Value_Unit"
[11] "DataValueTypeID" "Data_Value_Type"
[13] "Data_Value" "Low_Confidence_Limit"
[15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote" "PopulationCount"
[19] "lat" "long"
[21] "CategoryID" "MeasureId"
[23] "CityFIPS" "TractFIPS"
[25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Drop the columns that are not used in the rest of the assignment
prevention <- latlong_clean |>
  select(
    -DataSource,
    -Data_Value_Unit,
    -DataValueTypeID,
    -Low_Confidence_Limit,
    -High_Confidence_Limit,
    -Data_Value_Footnote_Symbol,
    -Data_Value_Footnote
  )

# Preview the first rows
head(prevention)
# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 AL Alabama Montgome… City Prevent… 151000 Choles…
2 2017 CA California Concord City Prevent… 616000 Visits…
3 2017 CA California Concord City Prevent… 616000 Choles…
4 2017 CA California Fontana City Prevent… 624680 Visits…
5 2017 CA California Richmond Census Tract Prevent… 0660620… Choles…
6 2017 FL Florida Davie Census Tract Prevent… 1216475… Choles…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Keep only the Maryland rows
md <- prevention |>
  filter(StateAbbr == "MD")

# Preview the first rows
head(md)
# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Chole…
2 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Visit…
3 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Visit…
4 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Curre…
5 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Curre…
6 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Visit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# List the distinct city names present in the Maryland subset
unique(md$CityName)
[1] "Baltimore"
The new dataset, `prevention`, is now a manageable size.
For your assignment, work with a cleaned dataset.
1. Once you run the above code, filter this dataset one more time for any particular subset with no more than 900 observations.
Filter chunk here
# Remove unnecessary variables.
# NOTE: this overwrites `prevention` in place, so running the chunk a second
# time in the same session fails because the columns are already gone.
prevention <- prevention %>%
  select(
    -Year,
    -Category,
    -Data_Value_Type,
    -CategoryID
  )

# Preview the first rows
head(prevention)
# A tibble: 6 × 14
StateAbbr StateDesc CityName GeographicLevel UniqueID Measure Data_Value
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 AL Alabama Montgomery City 151000 Choles… 80.2
2 CA California Concord City 616000 Visits… 64.5
3 CA California Concord City 616000 Choles… 80.1
4 CA California Fontana City 624680 Visits… 66.1
5 CA California Richmond Census Tract 0660620-06… Choles… 81.8
6 FL Florida Davie Census Tract 1216475-12… Choles… 84.5
# ℹ 7 more variables: PopulationCount <dbl>, lat <dbl>, long <dbl>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Filter for Virginia and lack of health insurance
va <- prevention %>%
  filter(
    StateAbbr == "VA",
    Short_Question_Text == "Health Insurance"
  )

# Preview the first rows
head(va)
# A tibble: 6 × 14
StateAbbr StateDesc CityName GeographicLevel UniqueID Measure Data_Value
<chr> <chr> <chr> <chr> <chr> <chr> <dbl>
1 VA Virginia Lynchburg Census Tract 5147672-5… "Curre… 16.6
2 VA Virginia Norfolk Census Tract 5157000-5… "Curre… 8.7
3 VA Virginia Portsmouth Census Tract 5164000-5… "Curre… 17.4
4 VA Virginia Alexandria Census Tract 5101000-5… "Curre… 29.9
5 VA Virginia Newport News Census Tract 5156000-5… "Curre… 24.3
6 VA Virginia Newport News Census Tract 5156000-5… "Curre… 24.5
# ℹ 7 more variables: PopulationCount <dbl>, lat <dbl>, long <dbl>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Remove unnecessary variables: after the filter above every remaining row
# has the same state and the same measure, so these columns carry no
# further information.
va <- va %>%
  select(
    -StateAbbr,
    -StateDesc,
    -Measure,
    -MeasureId,
    -Short_Question_Text
  )

# Preview the first rows
head(va)
# A tibble: 6 × 9
CityName GeographicLevel UniqueID Data_Value PopulationCount lat long
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Lynchburg Census Tract 5147672-5… 16.6 2115 37.4 -79.2
2 Norfolk Census Tract 5157000-5… 8.7 1357 36.9 -76.3
3 Portsmouth Census Tract 5164000-5… 17.4 4844 36.9 -76.4
4 Alexandria Census Tract 5101000-5… 29.9 7165 38.8 -77.1
5 Newport News Census Tract 5156000-5… 24.3 2560 37.0 -76.4
6 Newport News Census Tract 5156000-5… 24.5 2138 37.0 -76.4
# ℹ 2 more variables: CityFIPS <dbl>, TractFIPS <dbl>
2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
# Filter for only Virginia cities (city-level rows rather than census
# tracts), then drop the level and tract columns that are no longer needed
va_cities <- va %>%
  filter(GeographicLevel == "City") %>%
  select(
    -GeographicLevel,
    -TractFIPS
  )
# Import necessary library
library(scales)
Attaching package: 'scales'
The following object is masked from 'package:purrr':
discard
The following object is masked from 'package:readr':
col_factor
# Create a scatterplot comparing population and percent of the population
# that lacks access to health insurance, including only cities (not census
# tracts), with a fitted linear trend line
va_scatterplot <- va_cities %>%
  ggplot(aes(x = PopulationCount, y = Data_Value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    # Explicit "\n" keeps the two-line title without depending on how the
    # source line is wrapped or indented
    title = "Percentage of Virginia Adults Without\nHealth Insurance by City Population",
    x = "City Population",
    y = "Percentage of Adults Without Health Insurance",
    caption = "Source: Behavioral Risk Factor Surveillance System - CDC"
  ) +
  theme_minimal() +
  scale_x_continuous(labels = comma) # Remove scientific notation on x-axis

# Render the plot
va_scatterplot
`geom_smooth()` using formula = 'y ~ x'
# Filter for United States cities (city-level rows for the health insurance
# measure) and remove unnecessary variables
us_cities <- prevention %>%
  filter(
    GeographicLevel == "City",
    Short_Question_Text == "Health Insurance"
  ) %>%
  select(
    -GeographicLevel,
    -TractFIPS,
    -Measure,
    -MeasureId
  )
# Create the same scatterplot, but for the entire country, to compare
# Virginia to the rest of the United States
us_scatterplot <- us_cities %>%
  ggplot(aes(x = PopulationCount, y = Data_Value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    # Explicit "\n" keeps the two-line title without depending on how the
    # source line is wrapped or indented
    title = "Percentage of American Adults Without\nHealth Insurance by City Population",
    x = "City Population",
    y = "Percentage of Adults Without Health Insurance",
    caption = "Source: Behavioral Risk Factor Surveillance System - CDC"
  ) +
  theme_minimal() +
  scale_x_continuous(labels = comma) # Remove scientific notation on x-axis

# Render the plot
us_scatterplot
`geom_smooth()` using formula = 'y ~ x'
# Limit the cities to ones around the same size as Virginia cities
# (populations from 75,000 to 450,000 inclusive), then redraw the plot
us_scatterplot2 <- us_cities %>%
  filter(
    PopulationCount >= 75000,
    PopulationCount <= 450000
  ) %>%
  ggplot(aes(x = PopulationCount, y = Data_Value)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    # Explicit "\n" keeps the two-line title without depending on how the
    # source line is wrapped or indented
    title = "Percentage of American Adults Without\nHealth Insurance by City Population",
    x = "City Population",
    y = "Percentage of Adults Without Health Insurance",
    caption = "Source: Behavioral Risk Factor Surveillance System - CDC"
  ) +
  theme_minimal() +
  scale_x_continuous(labels = comma) # Remove scientific notation on x-axis

# Render the plot
us_scatterplot2
`geom_smooth()` using formula = 'y ~ x'
3. Now create a map of your subsetted dataset.
First map chunk here
# Import necessary libraries
library(maps)
Attaching package: 'maps'
The following object is masked from 'package:purrr':
map
# Load Virginia map data: the state outline as a data frame of boundary
# points that geom_polygon() can draw
va_map_data <- map_data("state", region = "virginia")
# Create the map: the state outline as a gray polygon layer, with one point
# per city sized by population and colored by the health insurance value
va_map <- ggplot() +
  geom_polygon(
    data = va_map_data,
    aes(x = long, y = lat, group = group),
    fill = "lightgray"
  ) +
  geom_point(
    data = va_cities,
    aes(
      x = long,
      y = lat,
      size = PopulationCount,
      color = Data_Value
    ),
    alpha = 0.7
  ) +
  labs(
    title = "Map of Virginian Cities",
    size = "Population Count",
    color = "Percentage of Adults\nWithout Health Insurance",
    caption = "Source: Behavioral Risk Factor Surveillance System - CDC"
  ) +
  scale_size_continuous(labels = comma) +
  coord_fixed(ratio = 1.3) +
  # Customize theme to reduce legend size and remove y-axis labels
  theme(
    plot.title = element_text(hjust = 0.5, size = 14),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    legend.position = "right",
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    legend.box.margin = margin(0, 0, 0, 0)
  )

# Render the static map
va_map
4. Refine your map to include a mouse-click tooltip
Refined map chunk here
# Import necessary library
library(plotly)
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
# Create the interactive map by converting the static ggplot into a plotly
# widget
va_map_interactive <- ggplotly(va_map)

# Render the interactive map
va_map_interactive
5. Write a paragraph
The first graph compares city size and the percentage of adults without health insurance. In Virginia, larger cities tend to have a lower percentage of adults without health insurance. However, in the United States overall, the trend runs the opposite way, with larger cities performing worse in this category. For both comparisons, though, there was considerable variation in the data and the correlation was not very strong. The second plot, a map, shows the Virginia cities in the dataset, sized by population and colored by the percentage of adults without health insurance. Overall, the difference between the highest and lowest percentages is only 4.3 percentage points. The dataset contained health insurance data for only 11 Virginia cities.