library(tidyverse)
library(tidyr)
library(leaflet)
library(RColorBrewer)
# Read the CDC "500 Cities" local health indicators extract from the course
# folder.
# NOTE(review): setwd() ties the script to one machine's folder layout; it is
# kept because code outside this section may rely on the working directory.
setwd("~/DATA110")
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")
# The former data(cities500) call was removed: data() looks up bundled data
# sets by name and does nothing for an object already created by read_csv()
# (presumably no bundled data set of that name exists - confirm).
## 500 cities
## Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
# GeoLocation holds the text "(lat, long)". Strip the parentheses, then split
# on the comma into separate lat and long columns; convert = TRUE lets
# separate() convert the two new columns from text to their natural type.
latlong <- cities500 |>
  mutate(
    GeoLocation = str_replace_all(GeoLocation, "[()]", "")
  ) |>
  separate(
    GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
head(latlong) # A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Filter the dataset
# Keep only the rows that satisfy every condition below: 2017
# crude-prevalence values for Connecticut in the "Unhealthy Behaviors"
# category, skipping rows whose StateDesc is "United States".
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",
    Data_Value_Type == "Crude prevalence",
    Year == 2017,
    StateAbbr == "CT",
    Category == "Unhealthy Behaviors"
  )
head(latlong_clean) # A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CT Connecticut Bridgeport Census Tract BRFSS Unhealthy B…
2 2017 CT Connecticut Danbury City BRFSS Unhealthy B…
3 2017 CT Connecticut Norwalk Census Tract BRFSS Unhealthy B…
4 2017 CT Connecticut Bridgeport Census Tract BRFSS Unhealthy B…
5 2017 CT Connecticut Hartford Census Tract BRFSS Unhealthy B…
6 2017 CT Connecticut Waterbury Census Tract BRFSS Unhealthy B…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
What variables are included? (can any of them be removed?)
# List the column names of the filtered data. The printed result is kept
# below as "#>" comments so the call itself stays valid R.
names(latlong_clean)
#> [1] "Year" "StateAbbr"
#> [3] "StateDesc" "CityName"
#> [5] "GeographicLevel" "DataSource"
#> [7] "Category" "UniqueID"
#> [9] "Measure" "Data_Value_Unit"
#> [11] "DataValueTypeID" "Data_Value_Type"
#> [13] "Data_Value" "Low_Confidence_Limit"
#> [15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
#> [17] "Data_Value_Footnote" "PopulationCount"
#> [19] "lat" "long"
#> [21] "CategoryID" "MeasureId"
#> [23] "CityFIPS" "TractFIPS"
#> [25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Drop the seven columns that are not needed for the assignment; every other
# column of latlong_clean is kept as-is.
latlong_clean2 <- latlong_clean |>
  select(
    -c(
      DataSource,
      Data_Value_Unit,
      DataValueTypeID,
      Low_Confidence_Limit,
      High_Confidence_Limit,
      Data_Value_Footnote_Symbol,
      Data_Value_Footnote
    )
  )
head(latlong_clean2) # A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CT Connecticut Bridgep… Census Tract Unhealt… 0908000… Obesit…
2 2017 CT Connecticut Danbury City Unhealt… 918430 Obesit…
3 2017 CT Connecticut Norwalk Census Tract Unhealt… 0955990… Obesit…
4 2017 CT Connecticut Bridgep… Census Tract Unhealt… 0908000… Curren…
5 2017 CT Connecticut Hartford Census Tract Unhealt… 0937000… Obesit…
6 2017 CT Connecticut Waterbu… Census Tract Unhealt… 0980000… Obesit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
My own cleaning
# Build the analysis table: 2017 crude-prevalence "Health Outcomes" rows at
# census-tract level for cities named Springfield, leaving out Illinois,
# the arthritis measure, the CHD measure and rows without a Data_Value.
# NOTE(review): excluding Illinois is presumably what leaves the MA and MO
# Springfields compared in the plot title - confirm against the data.
Springfield <- latlong |>
  filter(
    StateDesc != "Illinois",
    Data_Value_Type == "Crude prevalence",
    GeographicLevel == "Census Tract",
    Year == 2017,
    CityName == "Springfield",
    Measure != "Arthritis among adults aged >=18 Years",
    MeasureId != "CHD",
    !is.na(Data_Value),
    Category == "Health Outcomes"
  )
head(Springfield) # A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 MA Massachusetts Springfield Census Tract BRFSS Health O…
2 2017 MA Massachusetts Springfield Census Tract BRFSS Health O…
3 2017 MA Massachusetts Springfield Census Tract BRFSS Health O…
4 2017 MA Massachusetts Springfield Census Tract BRFSS Health O…
5 2017 MA Massachusetts Springfield Census Tract BRFSS Health O…
6 2017 MA Massachusetts Springfield Census Tract BRFSS Health O…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
## Look at a variable
# Show the distinct measures left after filtering. The printed result is kept
# below as "#>" comments so the call itself stays valid R.
unique(Springfield$Measure)
#> [1] "Chronic obstructive pulmonary disease among adults aged >=18 Years"
#> [2] "Diagnosed diabetes among adults aged >=18 Years"
#> [3] "Stroke among adults aged >=18 Years"
#> [4] "Mental health not good for >=14 days among adults aged >=18 Years"
#> [5] "Cancer (excluding skin cancer) among adults aged >=18 Years"
#> [6] "Physical health not good for >=14 days among adults aged >=18 Years"
#> [7] "Chronic kidney disease among adults aged >=18 Years"
#> [8] "High cholesterol among adults aged >=18 Years who have been screened in the past 5 Years"
#> [9] "High blood pressure among adults aged >=18 Years"
#> [10] "Current asthma among adults aged >=18 Years"
Create a plot
library(ggplot2)
# Horizontal bar chart of Data_Value for each health measure, with one fill
# colour per state (bars dodged side by side). The long measure names are
# replaced on the axis by short multi-line labels.
Springfield |>
  ggplot(aes(x = Data_Value, y = Measure, fill = StateDesc)) +
  geom_col(position = "dodge") +
  labs(
    # State abbreviation corrected to "MO" (letter O); it was typed "M0".
    title = "Comparison of Health Indicators\nbetween Springfield from MA\nAND MO",
    x = "Value (%)",
    y = "Health Indicator",
    fill = "State"
  ) +
  scale_y_discrete(
    labels = c(
      "Stroke among adults aged >=18 Years" = "Stroke\n≥18y",
      "Physical health not good for >=14 days among adults aged >=18 Years" = "Physical health not good\n≥14 days ≥18y",
      "Mental health not good for >=14 days among adults aged >=18 Years" = "Mental health not good\n≥14 days ≥18y",
      "High cholesterol among adults aged >=18 Years who have been screened in the past 5 Years" = "High cholesterol\nscreened past 5y\n≥18y",
      "High blood pressure among adults aged >=18 Years" = "High blood pressure\n≥18y",
      "Diagnosed diabetes among adults aged >=18 Years" = "Diagnosed diabetes\n≥18y",
      "Current asthma among adults aged >=18 Years" = "Current asthma\n≥18y",
      "Chronic obstructive pulmonary disease among adults aged >=18 Years" = "Chronic obstructive\npulmonary disease\n≥18y",
      "Chronic kidney disease among adults aged >=18 Years" = "Chronic kidney disease\n≥18y",
      "Cancer (excluding skin cancer) among adults aged >=18 Years" = "Cancer (excl. skin)\n≥18y"
    )
  ) +
  theme_bw() +
  # Small axis text so the multi-line labels fit.
  theme(axis.text.y = element_text(size = 6.9))
## Set the geographical coordinates
# Reference coordinates in decimal degrees (presumably sma = Springfield, MA
# and smo = Springfield, MO - confirm).
# The longitudes are negative (west of Greenwich) so that they agree with the
# lng values passed to setView() for the maps; they were previously positive.
sma_long <- -72.590279
sma_lat <- 42.101391
smo_long <- -93.297256
smo_lat <- 37.210388
## Create my first Map Graph
# Street map centred on lng -72.5902 / lat 42.101 with one circle per row of
# Springfield. lng and lat are mapped explicitly, so leaflet no longer has to
# guess the columns (it used to report: Assuming "long" and "lat" are
# longitude and latitude, respectively).
leaflet(Springfield) |>
  setView(lng = -72.5902, lat = 42.101, zoom = 12) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    lng = ~long,
    lat = ~lat,
    # The radius grows exponentially with the row's Data_Value.
    radius = ~ sqrt(1.3^Data_Value) * 2
  )
Create the second Map graph
# Street map centred on lng -93.2972 / lat 37.2103 with one circle per row of
# Springfield. lng and lat are mapped explicitly, so leaflet no longer has to
# guess the columns (it used to report: Assuming "long" and "lat" are
# longitude and latitude, respectively). The stray trailing comma that passed
# an empty argument to addCircles() was removed.
leaflet() |>
  setView(lng = -93.2972, lat = 37.2103, zoom = 11) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = Springfield,
    lng = ~long,
    lat = ~lat,
    # The radius grows exponentially with the row's Data_Value.
    radius = ~ sqrt(1.3^Data_Value) * 2
  )
Set the popup
# HTML popup text, one string per row of Springfield: measure, category,
# value and year, each on its own line with a bold caption.
popup1 <- paste0(
  "<b>Measure: </b>", Springfield$Measure, "<br>",
  "<b>Category: </b>", Springfield$Category, "<br>",
  "<b>Value (%): </b>", Springfield$Data_Value, "<br>",
  "<b>Year: </b>", Springfield$Year
)
## Add the popup for my first graph
# Lookup from each full measure name to a short legend label.
label_names <- c(
  "Stroke among adults aged >=18 Years" = "Stroke ≥18y",
  "Physical health not good for >=14 days among adults aged >=18 Years" = "Physical health\n≥14 days ≥18y",
  "Mental health not good for >=14 days among adults aged >=18 Years" = "Mental health\n≥14 days ≥18y",
  "High cholesterol among adults aged >=18 Years who have been screened in the past 5 Years" = "High cholesterol\nscreened past 5y ≥18y",
  "High blood pressure among adults aged >=18 Years" = "High blood pressure ≥18y",
  "Diagnosed diabetes among adults aged >=18 Years" = "Diagnosed diabetes ≥18y",
  "Current asthma among adults aged >=18 Years" = "Current asthma ≥18y",
  "Chronic obstructive pulmonary disease among adults aged >=18 Years" = "COPD\n≥18y",
  "Chronic kidney disease among adults aged >=18 Years" = "Chronic kidney disease ≥18y",
  "Cancer (excluding skin cancer) among adults aged >=18 Years" = "Cancer (excl. skin) ≥18y"
)

# One colour per measure, taken from the 10-colour "Set3" Brewer palette.
pal <- colorFactor(
  palette = brewer.pal(10, "Set3"),
  domain = Springfield$Measure
)

# First map again (centred on lng -72.5902 / lat 42.101), now with circles
# coloured by measure, the click popup and a legend. lng and lat are mapped
# explicitly, so leaflet no longer has to guess the columns (it used to
# report: Assuming "long" and "lat" are longitude and latitude, respectively).
leaflet(Springfield) |>
  setView(lng = -72.5902, lat = 42.101, zoom = 12) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    lng = ~long,
    lat = ~lat,
    radius = ~ sqrt(1.3^Data_Value) * 2,
    color = ~ pal(Measure),
    popup = popup1
  ) |>
  addLegend(
    "topleft",
    pal = pal,
    values = Springfield$Measure,
    title = "Measure type",
    # Opacity is a 0-1 value; the previous 2 was out of range.
    opacity = 1,
    # transform swaps each full measure name for its short label in the
    # legend (approach suggested by an AI assistant).
    labFormat = labelFormat(transform = function(x) label_names[x])
  )
Add the popup for my second graph
# label_names and pal are reused from the first popup map, where they are
# defined with exactly the same values; the duplicate definitions that stood
# here were removed.

# Second map again (centred on lng -93.2972 / lat 37.2103), now with circles
# coloured by measure, a hover label, the click popup and a legend. lng and
# lat are mapped explicitly, so leaflet no longer has to guess the columns (it
# used to report: Assuming "long" and "lat" are longitude and latitude,
# respectively).
leaflet() |>
  setView(lng = -93.2972, lat = 37.2103, zoom = 11) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    data = Springfield,
    lng = ~long,
    lat = ~lat,
    radius = ~ sqrt(1.3^Data_Value) * 2,
    color = ~ pal(Measure),
    # One short label per row, looked up from that row's measure. Passing
    # the 10-element lookup vector itself did not line up with the rows.
    label = ~ unname(label_names[Measure]),
    popup = popup1
  ) |>
  addLegend(
    "topleft",
    pal = pal,
    values = Springfield$Measure,
    title = "Measure type",
    # Opacity is a 0-1 value; the previous 1.5 was out of range.
    opacity = 1,
    # transform swaps each full measure name for its short label in the
    # legend (approach suggested by an AI assistant).
    labFormat = labelFormat(transform = function(x) label_names[x])
  )
Essay
The first step was data cleaning to ensure accuracy and consistency. I started by removing the rows whose prevalence value (Data_Value) is missing (NA) to avoid any distortion in the results. I then filtered the data to crude-prevalence estimates at the census-tract level for the year 2017, ensuring that comparisons were consistent over the same time frame and the same type of estimate.
I also narrowed the dataset to include only records from cities named Springfield. Since multiple Springfields exist in the U.S., I specifically selected the ones located in Massachusetts and Missouri for a clear and focused comparison.
To refine the analysis further, I excluded some diseases that were not relevant to my study. For example, I removed “Arthritis among adults aged >=18 Years” and filtered out the Measure ID CHD, which represents Coronary Heart Disease. Instead, I concentrated on the other key health outcomes listed under the “Health Outcomes” category, focusing on chronic diseases that have a significant public health impact.
With the data cleaned and prepared, I created my first graph, a bar chart. This chart shows the percentage of the population affected by each disease in both Springfields—Massachusetts and Missouri. This visualization allowed me to easily compare the prevalence rates of various diseases between the two locations.
Following that, I added a geographic component to the project by using the latitude and longitude coordinates of both cities and of each census tract. This enabled me to create interactive maps, where each circle on the map represents one health measure in one census tract. The size of each circle increases with the percentage of the population affected by that disease, providing a clear visual representation of disease severity.
I created one map for Springfield, Massachusetts, and another for Springfield, Missouri, following the same logic for both. This made it easy to compare health conditions geographically between the two cities.
To enhance the interactivity of the maps, I implemented popups. These popups display detailed information when the user clicks a circle, including the disease name (Measure), the health category, the prevalence percentage (Value), and the year of the data; the second map also shows a short label when the user hovers over a circle. This makes the visualization more informative and user-friendly.
After finalizing the popups and making sure all the visual elements were clear and accessible, the project resulted in a set of visualizations that provide both a statistical and spatial understanding of disease distribution in these two cities.
In conclusion, my final output includes both a comparative bar chart and two interactive maps. Together, these visuals offer a comprehensive overview of how chronic diseases affect the populations of Springfield, Massachusetts, and Springfield, Missouri, helping to better understand health disparities between different regions sharing the same city name.