library(tidyverse)
library(tidyr)
# Read the CDC 500 Cities local health indicators export. The CSV is looked up
# relative to the current working directory.
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")
# NOTE: the former `data(cities500)` call was removed. data() searches the
# data/ directories of packages (and of the working directory) for a dataset
# with that name; it does not refer to objects already in the workspace. Here
# it would at best warn that the data set is not found and at worst overwrite
# the tibble that was just read.
# Pennsylvania Diabetics - Data 110
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
latlong <- cities500 |>
  # Strip the surrounding parentheses so only "lat, long" text remains.
  mutate(GeoLocation = str_remove_all(GeoLocation, "[()]")) |>
  # Split at the comma into two columns; convert = TRUE re-types the pieces
  # after splitting instead of leaving them as character.
  separate(
    col = GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
#head(latlong)
# Filter the dataset
Choose a state for unhealthy behaviors - Pennsylvania
# Keep only the Pennsylvania rows of the diabetes measure. The three
# conditions are combined with AND, exactly as the chained filters were.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",
    MeasureId == "DIABETES",
    StateAbbr == "PA"
  )

# Number of observations left after filtering.
nrow(latlong_clean)
#> [1] 658
What variables are included? (can any of them be removed?)
# List the column names to decide which variables can be dropped.
names(latlong_clean)
# Console output recorded from the original run:
#>  [1] "Year" "StateAbbr"
#>  [3] "StateDesc" "CityName"
#>  [5] "GeographicLevel" "DataSource"
#>  [7] "Category" "UniqueID"
#>  [9] "Measure" "Data_Value_Unit"
#> [11] "DataValueTypeID" "Data_Value_Type"
#> [13] "Data_Value" "Low_Confidence_Limit"
#> [15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
#> [17] "Data_Value_Footnote" "PopulationCount"
#> [19] "lat" "long"
#> [21] "CategoryID" "MeasureId"
#> [23] "CityFIPS" "TractFIPS"
#> [25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Drop the source, unit, value-type, confidence-limit and footnote columns,
# which are not used in the rest of the assignment.
latlong_clean2 <- latlong_clean |>
  select(
    -DataSource,
    -Data_Value_Unit,
    -DataValueTypeID,
    -Low_Confidence_Limit,
    -High_Confidence_Limit,
    -Data_Value_Footnote_Symbol,
    -Data_Value_Footnote
  )
# The new dataset “latlong_clean2” is a manageable dataset now.
For your assignment, work with a cleaned dataset where you perform your own cleaning and filtering.
1. Once you run the above code and filter this complicated dataset, perform your own investigation by filtering this dataset however you choose so that you have a subset with no more than 900 observations through some inclusion/exclusion criteria.
Filter chunk here (you may need multiple chunks)
# Build the plotting subset: keep rows with a usable Data_Value, bin the value
# into five labelled categories, and drop rows without a bin or a Category.
latlong_plot <- latlong_clean2 |>
  filter(!is.na(Data_Value), is.finite(Data_Value)) |>
  mutate(
    diabetes_bin = cut(
      Data_Value,
      breaks = c(0, 10, 15, 20, 25, Inf),
      labels = c("Very Low", "Low", "Moderate", "High", "Very High"),
      # right = FALSE makes the bins left-closed: [0, 10), [10, 15), ...
      right = FALSE
    )
  ) |>
  # Values below 0 fall outside every bin and get NA, so they are removed here.
  filter(!is.na(diabetes_bin), !is.na(Category))
# 2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
# non map plot
library(RColorBrewer)
# Step 1: Names of the 7 PA cities with the highest mean diabetes value
top7 <- latlong |>
  filter(StateAbbr == "PA" & MeasureId == "DIABETES") |>
  group_by(CityName) |>
  summarise(avg_diabetes = mean(Data_Value, na.rm = TRUE)) |>
  # Highest averages first, then keep the first seven rows.
  arrange(desc(avg_diabetes)) |>
  head(7) |>
  pull(CityName)
# Step 2: Mean value of each selected measure within each top-7 city
bar_data <- latlong |>
  filter(
    StateAbbr == "PA",
    CityName %in% top7,
    MeasureId %in% c("DIABETES", "CSMOKING", "BINGE", "BPHIGH")
  ) |>
  # summarise() keeps only the grouping columns plus mean_value, so no
  # explicit column selection is needed beforehand.
  group_by(CityName, MeasureId) |>
  summarise(mean_value = mean(Data_Value, na.rm = TRUE), .groups = "drop")
# Step 3: Faceted bar chart with RColorBrewer palette
# One panel per city, one bar per measure. scales = "free_y" gives every panel
# its own y-axis, so bar heights are not directly comparable across panels.
ggplot(bar_data, aes(x = MeasureId, y = mean_value, fill = MeasureId)) +
  geom_col(width = 0.7, color = "white") +
  facet_wrap(~ CityName, scales = "free_y") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Health Indicators in Top 7 PA Cities (2017)",
    x = "Measure",
    y = "Average Rate (%)"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    # Angle the measure labels so they do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1),
    # The x-axis already names each measure, so the fill legend is redundant.
    legend.position = "none"
  )
# 3. Now create a map of your subsetted dataset.
First map chunk here
# leaflet()
library(leaflet)
# Step 1: One row per location with a column for each of the four measures
map_data <- latlong |>
  filter(StateAbbr == "PA") |>
  filter(MeasureId %in% c("DIABETES", "CSMOKING", "BINGE", "BPHIGH")) |>
  # Average duplicate rows per location and measure; summarise() keeps only
  # the grouping columns and the averaged Data_Value.
  group_by(TractFIPS, lat, long, CityName, MeasureId) |>
  summarise(Data_Value = mean(Data_Value, na.rm = TRUE), .groups = "drop") |>
  # Spread the measures into their own columns (DIABETES, CSMOKING, ...).
  pivot_wider(names_from = MeasureId, values_from = Data_Value) |>
  # A marker needs coordinates and a diabetes value to be drawn and sized.
  drop_na(lat, long, DIABETES)
# Step 2: Basic leaflet map
# Draw one circle per row of map_data, with the radius scaled by diabetes rate.
leaflet(map_data) |>
  # Initial view; presumably chosen to frame Pennsylvania.
  setView(lng = -77.1945, lat = 41.2033, zoom = 7) |>
  addProviderTiles("CartoDB.Positron") |>
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    # Halve the rate so the circles stay a readable size.
    radius = ~DIABETES / 2,
    color = "red",
    fillOpacity = 0.7,
    # No outline; only the filled circle is shown.
    stroke = FALSE
  )
# 4. Refine your map to include a mouse-click tooltip
Refined map chunk here
library(leaflet)
# Step 1: Rebuild the per-location table holding all four measures
map_data <- latlong |>
  filter(
    MeasureId %in% c("DIABETES", "CSMOKING", "BINGE", "BPHIGH"),
    StateAbbr == "PA"
  ) |>
  # Grouping by location and measure, then summarising, leaves exactly the
  # grouping columns plus the averaged Data_Value.
  group_by(TractFIPS, lat, long, CityName, MeasureId) |>
  summarise(Data_Value = mean(Data_Value, na.rm = TRUE), .groups = "drop") |>
  pivot_wider(names_from = MeasureId, values_from = Data_Value) |>
  # Discard rows missing a coordinate or the diabetes value.
  filter(!(is.na(lat) | is.na(long) | is.na(DIABETES)))
# Step 2: Create popup text
# Format a rate for display: rounded to one decimal and followed by "%", or
# "N/A" when the value is missing, so a popup never shows "NA%".
format_rate <- function(rate) {
  ifelse(is.na(rate), "N/A", paste0(round(rate, 1), "%"))
}

# Build the HTML shown in each marker's popup: city, the four rates, and the
# tract identifier, separated by line breaks.
map_data <- map_data |>
  mutate(
    popup_text = paste0(
      "<strong>City:</strong> ",
      ifelse(is.na(CityName), "Unknown", CityName), "<br>",
      "<strong>Diabetes Rate:</strong> ", format_rate(DIABETES), "<br>",
      "<strong>Smoking Rate:</strong> ", format_rate(CSMOKING), "<br>",
      "<strong>Binge Drinking Rate:</strong> ", format_rate(BINGE), "<br>",
      "<strong>High BP Rate:</strong> ", format_rate(BPHIGH), "<br>",
      "<strong>Tract FIPS:</strong> ", TractFIPS
    )
  )
# Step 3: Build leaflet map
# Same circle-marker map as before, now with a popup attached to each marker.
leaflet(map_data) |>
  # Initial view; presumably chosen to frame Pennsylvania.
  setView(lng = -77.1945, lat = 41.2033, zoom = 7) |>
  addProviderTiles("CartoDB.Positron") |>
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    # Halve the rate so the circles stay a readable size.
    radius = ~DIABETES / 2,
    color = "darkred",
    fillOpacity = 0.7,
    # No outline; only the filled circle is shown.
    stroke = FALSE,
    # Popup content comes from the popup_text column of map_data.
    popup = ~popup_text
  )
# 5. Write a paragraph
In a paragraph, describe the plots you created and the insights they show.
The visualizations I created offer a multifaceted view of diabetes prevalence and its relationship to other health indicators across Pennsylvania. I began with a faceted bar chart that displayed average rates of diabetes, smoking, binge drinking, and high blood pressure for the top seven cities with the highest diabetes prevalence. This layout allowed for side-by-side comparisons, revealing that cities with elevated diabetes rates often also exhibited high rates of smoking and hypertension, suggesting potential comorbidities or shared risk factors. The use of a bright, qualitative color palette from RColorBrewer ensured that each health measure stood out clearly within each city panel. I then constructed an interactive leaflet map to visualize diabetes prevalence geographically across the entire state. Initially, the map displayed circle markers scaled by diabetes rate, allowing users to identify spatial clusters or outliers. In a later version, I enriched the map with click-activated popups that included not just diabetes rates but also smoking, binge drinking, and high blood pressure rates for each census tract. This interactivity provided a more nuanced understanding of local health burdens and made it easier to explore patterns without cluttering the visual space. Together, these plots highlight both geographic and behavioral dimensions of chronic disease risk, and they underscore the value of combining static and interactive visualizations to support exploratory public health analysis.