library(tidyverse)
library(tidyr)
library(webshot2)
library(leaflet)
# NOTE(review): setwd() ties this script to one machine's folder layout. It is
# kept so the relative file name below keeps resolving, but a project-relative
# path would be more portable.
setwd("~/Schol Stuff/Montgomery College 2025/Data 110 Data Visualization/GIS Tutorial 500 Healthy Cities")

# Read the CDC 500 Cities local health indicators CSV from the working
# directory into `cities500`.
# The former data(cities500) call was dropped: data() looks for datasets
# shipped with packages (or in a ./data directory), not for objects already in
# the workspace, so it only produced a "data set not found" warning.
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")

# Healthy Cities GIS Assignment
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
# Turn the single "(lat, long)" GeoLocation text column into two numeric
# columns, `lat` and `long`.
# - "[()]" is a character class: bare parentheses are regex grouping
#   operators, so the brackets are what make them match literally.
# - convert = TRUE converts the split text pieces to numbers.
# - GeoLocation itself disappears because separate() defaults to
#   remove = TRUE; pass remove = FALSE to keep the original column.
latlong <- cities500 |>
  mutate(GeoLocation = str_remove_all(GeoLocation, "[()]")) |>
  separate(
    col = GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
head(latlong)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Remember: North = + lat, South = - lat; East = + long, West = - long
Filter the dataset
Remove the rows whose StateDesc is "United States", select Unhealthy Behaviors as the category (of interest), keep only the crude prevalence measurements, and select only 2017. Finally, filter for just Connecticut.
# Keep only Connecticut's 2017 crude-prevalence rows in the
# "Unhealthy Behaviors" category, dropping rows whose StateDesc is
# "United States". All conditions must hold for a row to be kept.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",
    Data_Value_Type == "Crude prevalence",
    Year == 2017,
    StateAbbr == "CT",
    Category == "Unhealthy Behaviors"
  )
head(latlong_clean)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CT Connecticut Bridgeport Census Tract BRFSS Unhealthy B…
2 2017 CT Connecticut Danbury City BRFSS Unhealthy B…
3 2017 CT Connecticut Norwalk Census Tract BRFSS Unhealthy B…
4 2017 CT Connecticut Bridgeport Census Tract BRFSS Unhealthy B…
5 2017 CT Connecticut Hartford Census Tract BRFSS Unhealthy B…
6 2017 CT Connecticut Waterbury Census Tract BRFSS Unhealthy B…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
What variables are included? (can any of them be removed?)
# List the column names to decide which ones can be dropped. The printed
# result starts on the next line; it was previously fused onto the call,
# which made the line unparseable.
names(latlong_clean)
# [1] "Year" "StateAbbr"
[3] "StateDesc" "CityName"
[5] "GeographicLevel" "DataSource"
[7] "Category" "UniqueID"
[9] "Measure" "Data_Value_Unit"
[11] "DataValueTypeID" "Data_Value_Type"
[13] "Data_Value" "Low_Confidence_Limit"
[15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote" "PopulationCount"
[19] "lat" "long"
[21] "CategoryID" "MeasureId"
[23] "CityFIPS" "TractFIPS"
[25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Drop the source, unit, confidence-limit and footnote columns, which are not
# used in the rest of the assignment.
latlong_clean2 <- latlong_clean |>
  select(
    -c(
      DataSource,
      Data_Value_Unit,
      DataValueTypeID,
      Low_Confidence_Limit,
      High_Confidence_Limit,
      Data_Value_Footnote_Symbol,
      Data_Value_Footnote
    )
  )
head(latlong_clean2)
# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CT Connecticut Bridgep… Census Tract Unhealt… 0908000… Obesit…
2 2017 CT Connecticut Danbury City Unhealt… 918430 Obesit…
3 2017 CT Connecticut Norwalk Census Tract Unhealt… 0955990… Obesit…
4 2017 CT Connecticut Bridgep… Census Tract Unhealt… 0908000… Curren…
5 2017 CT Connecticut Hartford Census Tract Unhealt… 0937000… Obesit…
6 2017 CT Connecticut Waterbu… Census Tract Unhealt… 0980000… Obesit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
The new dataset “latlong_clean2” is a manageable dataset now.
For your assignment, work with a cleaned dataset where you perform your own cleaning and filtering.
1. Once you run the above code and filter this complicated dataset, perform your own investigation by filtering this dataset however you choose so that you have a subset with no more than 900 observations through some inclusion/exclusion criteria.
Filter chunk here (you may need multiple chunks)
# Build the working data frame for my own analysis: strip the parentheses
# from GeoLocation, then split it at the comma into numeric `lat` and `long`
# columns (convert = TRUE does the text-to-number conversion).
location <- cities500 |>
  mutate(GeoLocation = str_remove_all(GeoLocation, "[()]")) |>
  separate(
    col = GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
head(location)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Filter for Year 2017, the Current Asthma measure, the “City” geographic level, and the states of New York, New Jersey, and Pennsylvania. I initially looked only at PA, but there weren’t enough asthma observations, so I expanded to two neighboring states.
# Subset to city-level 2017 current-asthma rows (MeasureId "CASTHMA") for
# New York, New Jersey and Pennsylvania. All conditions must hold.
location2 <- location |>
  filter(
    Year == 2017,
    MeasureId == "CASTHMA",
    GeographicLevel == "City",
    StateAbbr %in% c("NY", "NJ", "PA")
  )
head(location2)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 NJ New Jersey Camden City BRFSS Health Outc…
2 2017 NJ New Jersey Passaic City BRFSS Health Outc…
3 2017 NJ New Jersey Union City City BRFSS Health Outc…
4 2017 NJ New Jersey Jersey City City BRFSS Health Outc…
5 2017 NJ New Jersey Clifton City BRFSS Health Outc…
6 2017 NJ New Jersey Paterson City BRFSS Health Outc…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Clean df of unneeded columns:
# names(location2)

# Drop the columns that the plot and maps do not need. The assignment below
# used to share a line with the commented-out names() call above, which
# commented out the assignment itself and left `location3` undefined.
location3 <- location2 |>
  select(
    -DataSource,
    -UniqueID,
    -Data_Value_Unit,
    -DataValueTypeID,
    -Low_Confidence_Limit,
    -High_Confidence_Limit,
    -Data_Value_Footnote_Symbol,
    -Data_Value_Footnote,
    -CategoryID
  )
head(location3)
# A tibble: 6 × 16
Year StateAbbr StateDesc CityName GeographicLevel Category Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 NJ New Jersey Camden City Health Outcomes Curren…
2 2017 NJ New Jersey Passaic City Health Outcomes Curren…
3 2017 NJ New Jersey Union City City Health Outcomes Curren…
4 2017 NJ New Jersey Jersey City City Health Outcomes Curren…
5 2017 NJ New Jersey Clifton City Health Outcomes Curren…
6 2017 NJ New Jersey Paterson City Health Outcomes Curren…
# ℹ 9 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, MeasureId <chr>,
# CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
# Non-map plot: asthma percentage (Data_Value) against longitude for each
# city, coloured by state.
ggplot(location3, aes(x = long, y = Data_Value, color = StateAbbr)) +
  geom_point(size = 2.7, alpha = 0.7) +
  labs(
    # Title typo fixed: "Suffers" -> "Sufferers".
    title = "Percentage of Asthma Sufferers vs Longitude for NJ, NY, and PA",
    x = "Longitude",
    y = "Percentage with Asthma",
    color = "State",
    caption = "Source: 500 Healthy Cities, CDC"
  ) +
  theme_bw() +
  scale_color_brewer(palette = "Set2")

# 3. Now create a map of your subsetted dataset.
First map chunk here
# Initial map centre. The two values were previously assigned to the wrong
# names (latitude stored in start_long and vice versa) and then never used;
# they now hold the right coordinates and feed setView() directly.
start_long <- -76.764354
start_lat <- 40.848740

# First attempt: one circle per city, with the radius (in metres, as
# addCircles() interprets it) set to the estimated number of people with
# asthma rather than the percentage.
leaflet() |>
  setView(lng = start_long, lat = start_lat, zoom = 6) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = location3,
    # Data_Value is a percentage, so divide by 100 before scaling by population.
    radius = (location3$Data_Value / 100 * location3$PopulationCount)
  )
# Assuming "long" and "lat" are longitude and latitude, respectively
# Second attempt: same circles, but the radius grows with the square root of
# the estimated asthma count (times 30) so the largest cities do not swamp
# the map as badly.
leaflet() |>
  setView(lng = -75, lat = 41.848740, zoom = 6.5) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = location3,
    # Data_Value is a percentage, hence the division by 100.
    radius = with(location3, 30 * sqrt(Data_Value / 100 * PopulationCount))
  )
# Assuming "long" and "lat" are longitude and latitude, respectively
4. Refine your map to include a mouse-click tooltip
Refined map chunk here
# HTML popup text for each city: name, estimated number of asthma sufferers,
# asthma percentage and population.
# The sufferer count is rounded to whole people and formatted without
# scientific notation; the raw product used to print with long decimals or
# as e.g. "1e+05". The label typo "Suffers" is also fixed.
popasthma <- paste0(
  "<b>Location: </b>", location3$CityName, "<br>",
  "<b>Asthma Sufferers: </b>",
  format(
    round(location3$Data_Value / 100 * location3$PopulationCount),
    big.mark = ",",
    scientific = FALSE,
    trim = TRUE
  ),
  "<br>",
  "<b>Percentage: </b>", location3$Data_Value, "<br>",
  "<b>Population: </b>", location3$PopulationCount, "<br>"
)

# Refined map: the popup above is shown when a circle is clicked.
leaflet() |>
  setView(lng = -75.764354, lat = 40.848740, zoom = 7) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = location3,
    # Radius scales with the square root of the estimated asthmatic population.
    radius = 25 * sqrt(location3$Data_Value / 100 * location3$PopulationCount),
    color = "gray",
    fillColor = "#5ab5d1",
    fillOpacity = 0.30,
    popup = popasthma
  )
# Assuming "long" and "lat" are longitude and latitude, respectively
5. Write a paragraph
My plots show the prevalence of asthma in 2017 in New York, New Jersey, and Pennsylvania. I chose asthma because my brother had bad asthma as a child and some of my close friends have bad asthma now, and I have family in the Pennsylvania and New York areas. I scaled the radii of the circles to the total number of asthma sufferers in each city, which has the downside that the New York City circle covers up other circles, and I have not been able to fix that by changing the radius. I did try reverting to just the percentage, but I did not find a good scalar to make the circles visible.