library(tidyverse)
library(tidyr)
library(leaflet)
library(sf)
library(knitr)
library(scales)
# Load the 500 Cities local health indicators CSV into `cities500`.
# NOTE(review): the absolute path below is specific to one machine; confirm it
# (or switch to a project-relative path) before running elsewhere.
setwd("/Users/oworenibanseyo/Desktop/Data 110 2025/Datasets")
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")
# No `data(cities500)` call here: data() looks up datasets shipped with
# packages, so it cannot find an object that was read from a CSV and only
# emits a "data set not found" warning.
Healthy Cities GIS Assignment
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
# Split GeoLocation, stored as "(lat, long)", into separate `lat` and `long`
# columns: strip the parentheses first, then split on the comma.
latlong <- cities500 |>
  mutate(GeoLocation = str_replace_all(GeoLocation, "[()]", "")) |>
  separate(
    GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
    convert = TRUE
  )
head(latlong)# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Filter the dataset
Remove the rows where StateDesc is "United States", filter for only crude prevalence measurements, and select only 2017. (The Prevention category of interest is selected later, in the subsetting step.)
# Keep only 2017 crude-prevalence rows, excluding rows whose StateDesc is
# "United States". Conditions listed in one filter() call are combined with AND.
latlong_clean <- filter(
  latlong,
  StateDesc != "United States",
  Data_Value_Type == "Crude prevalence",
  Year == 2017
)
head(latlong_clean)# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Unhealthy Beh…
4 2017 CA California Indio Census Tract BRFSS Health Outcom…
5 2017 CA California Inglewood Census Tract BRFSS Health Outcom…
6 2017 CA California Lakewood City BRFSS Unhealthy Beh…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
What variables are included? (can any of them be removed?)
colnames(latlong_clean)
 [1] "Year" "StateAbbr"
[3] "StateDesc" "CityName"
[5] "GeographicLevel" "DataSource"
[7] "Category" "UniqueID"
[9] "Measure" "Data_Value_Unit"
[11] "DataValueTypeID" "Data_Value_Type"
[13] "Data_Value" "Low_Confidence_Limit"
[15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote" "PopulationCount"
[19] "lat" "long"
[21] "CategoryID" "MeasureId"
[23] "CityFIPS" "TractFIPS"
[25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Drop the source, unit, value-type ID, confidence-limit and footnote columns,
# which are not used in the rest of the assignment.
latlong_clean2 <- latlong_clean |>
  select(
    -c(
      DataSource,
      Data_Value_Unit,
      DataValueTypeID,
      Low_Confidence_Limit,
      High_Confidence_Limit,
      Data_Value_Footnote_Symbol,
      Data_Value_Footnote
    )
  )
head(latlong_clean2)# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract Health … 0632548… Arthri…
2 2017 CA California Hawthorne City Unhealt… 632548 Curren…
3 2017 CA California Hayward City Unhealt… 633000 Obesit…
4 2017 CA California Indio Census Tract Health … 0636448… Arthri…
5 2017 CA California Inglewood Census Tract Health … 0636546… Diagno…
6 2017 CA California Lakewood City Unhealt… 639892 Obesit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
The new dataset “Prevention” is a manageable dataset now.
For your assignment, work with a cleaned dataset.
1. Once you run the above code and learn how to filter in this format, filter this dataset however you choose so that you have a subset with no more than 900 observations.
Filter chunk here
I filtered my dataset to Texas specifically, then narrowed it down to the Prevention category for the city of El Paso.
# Subset to the Prevention-category rows reported for El Paso, Texas.
texas_data <- latlong_clean2 |>
  filter(
    StateDesc == "Texas",
    CategoryID == "PREVENT",
    CityName == "El Paso"
  )
# Summarise every column to check value ranges and missing values.
summary(texas_data)
 Year StateAbbr StateDesc CityName
Min. :2017 Length:547 Length:547 Length:547
1st Qu.:2017 Class :character Class :character Class :character
Median :2017 Mode :character Mode :character Mode :character
Mean :2017
3rd Qu.:2017
Max. :2017
GeographicLevel Category UniqueID Measure
Length:547 Length:547 Length:547 Length:547
Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character
Data_Value_Type Data_Value PopulationCount lat
Length:547 Min. :17.70 Min. : 2 Min. :31.63
Class :character 1st Qu.:53.25 1st Qu.: 3571 1st Qu.:31.76
Mode :character Median :67.65 Median : 4796 Median :31.79
Mean :63.53 Mean : 9494 Mean :31.80
3rd Qu.:77.90 3rd Qu.: 5958 3rd Qu.:31.85
Max. :89.00 Max. :649121 Max. :31.97
NA's :19
long CategoryID MeasureId CityFIPS
Min. :-106.6 Length:547 Length:547 Min. :4824000
1st Qu.:-106.5 Class :character Class :character 1st Qu.:4824000
Median :-106.4 Mode :character Mode :character Median :4824000
Mean :-106.4 Mean :4824000
3rd Qu.:-106.3 3rd Qu.:4824000
Max. :-106.2 Max. :4824000
TractFIPS Short_Question_Text
Min. :4.814e+10 Length:547
1st Qu.:4.814e+10 Class :character
Median :4.814e+10 Mode :character
Mean :4.814e+10
3rd Qu.:4.814e+10
Max. :4.814e+10
NA's :4
2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
# Print the El Paso subset to inspect its rows before plotting.
texas_data# A tibble: 547 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Curre…
2 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Takin…
3 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Visit…
4 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Visit…
5 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Takin…
6 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Visit…
7 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Curre…
8 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Chole…
9 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Visit…
10 2017 TX Texas El Paso Census Tract Preventi… 4824000… "Curre…
# ℹ 537 more rows
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Keep only the rows that have no missing value in any column.
texas_data <- drop_na(texas_data)
Dropping the whole dataset into a drop_na function is usually not recommended. I had visually inspected the preloaded dataset, ensuring NAs were a very small amount.
# Estimate how many people each preventive measure covers: each row's
# population times its prevalence percentage, summed per measure and sorted
# from largest to smallest.
pop_taking_by_measure <- texas_data |>
  group_by(MeasureId, Short_Question_Text) |>
  summarise(
    TotalPeople = sum(PopulationCount * Data_Value / 100),
    .groups = "drop"
  ) |>
  arrange(desc(TotalPeople))
pop_taking_by_measure# A tibble: 4 × 3
MeasureId Short_Question_Text TotalPeople
<chr> <chr> <dbl>
1 CHOLSCREEN Cholesterol Screening 520318.
2 BPMED Taking BP Medication 474888.
3 CHECKUP Annual Checkup 423036.
4 ACCESS2 Health Insurance 222238.
# Horizontal bar chart of the estimated number of people per preventive
# measure; bars are ordered by TotalPeople so the largest sits at the top.
thousands_label <- label_number(scale = 1/1000, suffix = "k", accuracy = 1)

pop_taking_by_measure |>
  ggplot(
    mapping = aes(
      x = reorder(Short_Question_Text, TotalPeople),
      y = TotalPeople
    )
  ) +
  geom_col(fill = "blue") +
  coord_flip() +
  scale_y_continuous(labels = thousands_label) +
  labs(
    title = "Estimated Number of People by Preventive Measure\n(El Paso Census Tracts, 2017)",
    x = "Preventive Measure",
    y = "Estimated Number of People",
    caption = "Source: CDC: 500 Cities_cdc.gov: El_paso 2017"
  ) +
  theme_minimal()
3. Now create a map of your subsetted dataset.
First map chunk here
Map of the city of El Paso, Texas, showing preventive health care measures by population count.
# First map: one circle per row of texas_data, with radius scaled from
# PopulationCount. No lng/lat is passed, so leaflet infers the coordinates
# from the `long` and `lat` columns (see the message printed below the chunk).
leaflet() |>
  setView(lng = -106.4850, lat = 31.7619, zoom = 10) |>
  addProviderTiles("OpenTopoMap") |>
  addCircles(
    data = texas_data,
    radius = ~PopulationCount / 8,
    fillOpacity = 0.2,
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    stroke = FALSE
  )
Assuming "long" and "lat" are longitude and latitude, respectively
4. Refine your map to include a mouse-click tooltip
Refined map chunk here
# Popup HTML for each row: the measure name and that row's population count.
El_paso <- paste0(
  "<b>Prevention Measure: </b>", texas_data$Short_Question_Text, "<br>",
  "<b>Population Count: </b>", texas_data$PopulationCount, "<br>"
)

# Colour palette keyed on the preventive measure name.
my_cols <- c("red", "blue", "green", "grey")
pal <- colorFactor(
  palette = my_cols,
  domain = texas_data$Short_Question_Text
)

# Refined map: same circles as the first map, now coloured by measure and
# showing the popup text on click. Coordinates are again inferred by leaflet
# from the `long` and `lat` columns.
leaflet() |>
  setView(lng = -106.4850, lat = 31.7619, zoom = 10) |>
  addProviderTiles("OpenTopoMap") |>
  addCircles(
    data = texas_data,
    radius = ~PopulationCount / 8,
    fillColor = ~pal(Short_Question_Text),
    color = ~pal(Short_Question_Text),
    fillOpacity = 0.4,
    # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
    stroke = FALSE,
    popup = El_paso
  )
Assuming "long" and "lat" are longitude and latitude, respectively
5. Write a paragraph
In a paragraph, describe the plots you created and what they show.
My first plot shows which preventative measure is most prevalent, by estimated number of people, in El Paso. Having lived there for three years, I haven’t worked with much information on the positive side of El Paso. On the map, I included the entire subset, adjusting the scaling and filtering by trial and error to achieve a suitable fit for the circle markers. The map provides a clear visual of the population distribution in El Paso, showing that most areas have access to preventative healthcare; as they say, prevention is better than cure. Factors such as the large US military bases (the gap in the northeast of the map), factories, mountain elevation, and desert storms already make everyday living in El Paso detrimental to health. I removed the thick strokes from my circles because there was a lot of clustering, and many data points would otherwise have been completely overshadowed or covered.
Maptile Source: https://rstudio.github.io/leaflet/articles/basemaps.html#:~:text=As%20a%20convenience%2C%20leaflet%20also,view%20all%20of%20the%20options.
El Paso Coordinates: Google