library(tidyverse)
library(tidyr)
library(leaflet)
library(sf)
library(knitr)
# Point the session at the course working directory, then read the CDC
# 500 Cities local health indicators CSV from it by its relative file name.
# NOTE(review): the absolute path only exists on the author's machine; anyone
# else running this has to edit it before the read below can succeed.
setwd(dir = "/Users/emiliodifilippantonio/Desktop/DATA 110/DATA 110 Working Directory")
cities500 <- read_csv(file = "500CitiesLocalHealthIndicators.cdc.csv")
# Healthy Cities GIS Assignment
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
# GeoLocation holds text of the form "(lat, long)". Remove the parentheses,
# then split the remainder on the comma into separate `lat` and `long`
# columns; convert = TRUE asks separate() to re-type the new columns rather
# than leaving them as character.
latlong <- cities500 |>
  mutate(
    GeoLocation = str_replace_all(
      string = GeoLocation,
      pattern = "[()]",
      replacement = ""
    )
  ) |>
  separate(
    col = GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
head(latlong)# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Filter the dataset
Remove the rows whose StateDesc is "United States", keep only the Prevention category (the category of interest), keep only the crude prevalence measurements, and keep only the year 2017.
# Drop the "United States" rows and keep only 2017 crude-prevalence records
# in the Prevention category. filter() combines comma-separated conditions
# with AND, so a single call keeps exactly the rows that pass every test.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",
    Category == "Prevention",
    Data_Value_Type == "Crude prevalence",
    Year == 2017
  )
head(latlong_clean)# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 AL Alabama Montgomery City BRFSS Prevention
2 2017 CA California Concord City BRFSS Prevention
3 2017 CA California Concord City BRFSS Prevention
4 2017 CA California Fontana City BRFSS Prevention
5 2017 CA California Richmond Census Tract BRFSS Prevention
6 2017 FL Florida Davie Census Tract BRFSS Prevention
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
What variables are included? (can any of them be removed?)
# List the column names of latlong_clean to decide which variables can be dropped
names(latlong_clean) [1] "Year" "StateAbbr"
[3] "StateDesc" "CityName"
[5] "GeographicLevel" "DataSource"
[7] "Category" "UniqueID"
[9] "Measure" "Data_Value_Unit"
[11] "DataValueTypeID" "Data_Value_Type"
[13] "Data_Value" "Low_Confidence_Limit"
[15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote" "PopulationCount"
[19] "lat" "long"
[21] "CategoryID" "MeasureId"
[23] "CityFIPS" "TractFIPS"
[25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Build the working `prevention` table by discarding the columns the
# assignment does not use: the data source, unit and value-type ID, both
# confidence limits, and the two footnote columns. Everything else is kept.
prevention <- latlong_clean |>
  select(
    !c(
      DataSource, Data_Value_Unit, DataValueTypeID,
      Low_Confidence_Limit, High_Confidence_Limit,
      Data_Value_Footnote_Symbol, Data_Value_Footnote
    )
  )
head(prevention)# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 AL Alabama Montgome… City Prevent… 151000 Choles…
2 2017 CA California Concord City Prevent… 616000 Visits…
3 2017 CA California Concord City Prevent… 616000 Choles…
4 2017 CA California Fontana City Prevent… 624680 Visits…
5 2017 CA California Richmond Census Tract Prevent… 0660620… Choles…
6 2017 FL Florida Davie Census Tract Prevent… 1216475… Choles…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Restrict the prevention table to the rows whose state abbreviation is "MD".
md <- filter(prevention, StateAbbr == "MD")
head(md)# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Chole…
2 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Visit…
3 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Visit…
4 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Curre…
5 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Curre…
6 2017 MD Maryland Baltimore Census Tract Preventi… 2404000… "Visit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
The new dataset “Prevention” is a manageable dataset now.
For your assignment, work with the cleaned “Prevention” dataset
1. Once you run the above code, filter this dataset one more time for any particular subset.
Filter chunk here
# Keep only the rows of `md` whose short question text is
# "Cholesterol Screening"
chol_screen <- filter(md, Short_Question_Text == "Cholesterol Screening")
# Print the filtered table
chol_screen# A tibble: 201 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
2 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
3 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
4 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
5 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
6 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
7 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
8 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
9 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
10 2017 MD Maryland Baltimore Census Tract Prevent… 2404000… Choles…
# ℹ 191 more rows
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
# Histogram of how often each Data_Value occurs in the chol_screen dataset.
# - bins = 30 spells out the bin count ggplot2 would otherwise choose
#   silently, so the picture is unchanged but stat_bin() no longer prompts
#   for a bin choice.
# - na.rm = TRUE drops rows with a missing Data_Value quietly instead of
#   emitting a "removed rows" warning.
# - The title previously misspelled "Cholesterol".
plot <- chol_screen |>
  ggplot(aes(x = Data_Value)) +
  geom_histogram(bins = 30, na.rm = TRUE) +
  labs(
    title = "Data Values of Cholesterol Screenings in Baltimore in 2017",
    caption = "Data Source: CDC",
    x = "Data Value",
    y = "Count"
  )
# Calling the plot
plot`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1 rows containing non-finite values (`stat_bin()`).
3. Now create a map of your subsetted dataset.
First map chunk here
# Map every row of chol_screen as a circle on an Esri street basemap.
# - The view is centred on the mean coordinates; na.rm = TRUE keeps the
#   centre defined even if a row has a missing coordinate.
# - lng/lat are passed explicitly so leaflet does not have to guess the
#   coordinate columns (when it guesses it prints: Assuming "long" and "lat"
#   are longitude and latitude, respectively).
# - The radius is Data_Value to the power of 10 divided by 1e17
#   (100 quadrillion): raising to the 10th power spreads out values that are
#   otherwise very similar in size, and the division scales the circles back
#   down to fit the map (see the end paragraph for the full explanation).
leaflet() |>
  setView(
    lng = mean(chol_screen$long, na.rm = TRUE),
    lat = mean(chol_screen$lat, na.rm = TRUE),
    zoom = 11
  ) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = chol_screen,
    lng = ~long,
    lat = ~lat,
    radius = ~Data_Value^10 / 1e17,
    color = "black",
    fillColor = "red",
    fillOpacity = 0.25
  )
4. Refine your map to include a mouseover tooltip
Refined map chunk here
# Same circle map as the previous visualization, with a popup on each circle
# showing that row's Data_Value and PopulationCount.
# - The view is centred on the mean coordinates; na.rm = TRUE keeps the
#   centre defined even if a row has a missing coordinate.
# - lng/lat are passed explicitly so leaflet does not have to guess the
#   coordinate columns (when it guesses it prints: Assuming "long" and "lat"
#   are longitude and latitude, respectively).
# - radius and popup use leaflet's formula interface, so both are evaluated
#   against the rows supplied through `data`.
# - 1e17 is 100 quadrillion, the same divisor as in the previous map.
leaflet() |>
  setView(
    lng = mean(chol_screen$long, na.rm = TRUE),
    lat = mean(chol_screen$lat, na.rm = TRUE),
    zoom = 11
  ) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = chol_screen,
    lng = ~long,
    lat = ~lat,
    radius = ~Data_Value^10 / 1e17,
    color = "black",
    fillColor = "red",
    fillOpacity = 0.25,
    popup = ~paste0(
      "<b>Data Value: </b>", Data_Value, "<br>",
      "<b>Population: </b>", PopulationCount, "<br>"
    )
  )
5. Write a paragraph
In a paragraph, describe the plots you created and what they show.
I filtered the data to look only at cholesterol screenings because high cholesterol is a risk factor for both heart disease and stroke, the first and fifth leading causes of death in the U.S., respectively (CDC). I then created a histogram to visualize how often each data value occurs in the chol_screen dataset. Next, I mapped the cholesterol screening records from the dataset and set the radius of each circle to the data value raised to the power of 10 and divided by 100 quadrillion. The data values are on a scale from 0 to 100; because the dataset was filtered to the "Crude prevalence" value type, each one is a crude prevalence estimate for cholesterol screening rather than a measure of how useful the data is. When I first mapped the points, I set the radii of the circles to the data values themselves. Although the circles fit the map quite well, it was very difficult to tell them apart, as their sizes were all very similar. To solve this problem, I raised the data values to the power of 10. Raising them to a power spread out the data points, because the greater the power to which a number is raised, the greater the gap between the numbers relative to the numbers themselves. The problem with this was that the circles became too big, so I divided the values by 100 quadrillion to make them fit the map. In the last visualization, I added interactivity by allowing viewers to see the population size and data value of each point.