library(tidyverse)
library(tidyr)
library(leaflet)
library(webshot2)
setwd("C:/Users/thilo/OneDrive/Desktop/DATA 110")
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")
data(cities500)Healthy Cities GIS Assignment
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
latlong <- cities500|>
mutate(GeoLocation = str_replace_all(GeoLocation, "[()]", ""))|>
separate(GeoLocation, into = c("lat", "long"), sep = ",", convert = TRUE)
head(latlong)# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
For your assignment, work with a cleaned dataset where you perform your own cleaning and filtering.
1. Once you run the above code and filter this complicated dataset, perform your own investigation by filtering this dataset however you choose so that you have a subset with no more than 900 observations through some inclusion/exclusion criteria.
Filter chunk here (you may need multiple chunks)
Filtered out for Health Outcomes in Florida in 2017
clean_data <- latlong |>
filter(StateDesc != "United States") |>
filter(Year == 2017)|>
filter(StateAbbr =="FL") |>
filter(Data_Value_Type == "Age-adjusted prevalence") |>
filter(Category == "Health Outcomes")
head(clean_data)# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 FL Florida Cape Coral City BRFSS Health O…
2 2017 FL Florida Lakeland City BRFSS Health O…
3 2017 FL Florida Plantation City BRFSS Health O…
4 2017 FL Florida Plantation City BRFSS Health O…
5 2017 FL Florida Hialeah City BRFSS Health O…
6 2017 FL Florida Deerfield Beach City BRFSS Health O…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
To see is there any columns that can remove
names(clean_data) [1] "Year" "StateAbbr"
[3] "StateDesc" "CityName"
[5] "GeographicLevel" "DataSource"
[7] "Category" "UniqueID"
[9] "Measure" "Data_Value_Unit"
[11] "DataValueTypeID" "Data_Value_Type"
[13] "Data_Value" "Low_Confidence_Limit"
[15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote" "PopulationCount"
[19] "lat" "long"
[21] "CategoryID" "MeasureId"
[23] "CityFIPS" "TractFIPS"
[25] "Short_Question_Text"
Removing Unused Columns
clean_data2 <- clean_data |>
select(-DataSource,-Data_Value_Unit, -DataValueTypeID, -Low_Confidence_Limit, -High_Confidence_Limit, -Data_Value_Footnote_Symbol, -Data_Value_Footnote)
head(clean_data2 )# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 FL Florida Cape Coral City Health … 1210275 Corona…
2 2017 FL Florida Lakeland City Health … 1238250 Chroni…
3 2017 FL Florida Plantation City Health … 1257425 Cancer…
4 2017 FL Florida Plantation City Health … 1257425 Stroke…
5 2017 FL Florida Hialeah City Health … 1230000 Diagno…
6 2017 FL Florida Deerfield… City Health … 1216725 Mental…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
To see what are the health outcomes and city names
unique(clean_data2$MeasureId) [1] "CHD" "KIDNEY" "CANCER" "STROKE" "DIABETES" "MHLTH"
[7] "ARTHRITIS" "CASTHMA" "COPD" "HIGHCHOL" "PHLTH" "BPHIGH"
#unique(clean_data2$CityName)2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
Arranged the health outcomes by highest to lowest
top <- clean_data2 |>
group_by(Short_Question_Text) |>
summarise(mean_value = mean(Data_Value,na.rm=TRUE)) |>
arrange(desc(mean_value))
top# A tibble: 12 × 2
Short_Question_Text mean_value
<chr> <dbl>
1 High Blood Pressure 31.9
2 High Cholesterol 29.5
3 Arthritis 21.9
4 Mental Health 14.3
5 Physical Health 13.7
6 Diabetes 10.5
7 Current Asthma 8.67
8 COPD 7.71
9 Coronary Heart Disease 6.29
10 Cancer (except skin) 5.95
11 Stroke 3.47
12 Chronic Kidney Disease 3.15
Plot
# non map plot
health <- clean_data2 |>
filter(Short_Question_Text %in% c("High Blood Pressure","High Cholesterol","Arthritis","Mental Health","Physical Health"))
ggplot(health,aes(x=PopulationCount/10^6,y=Data_Value,color=Short_Question_Text))+
geom_point(alpha=0.05)+
scale_color_brewer(palette = "Set1")+
geom_jitter()+
facet_wrap(~ Short_Question_Text)+
labs(title="Most Common Health Conditions in Florida",x="Population Count (millions)", y= "Age Adjusted Prevalence(%)",caption = "Source:CDC")+
theme_bw()+
theme(plot.title = element_text(face="bold",size=14,hjust=0.5,family = "mono"),
axis.text.x = element_text(size = 9),
axis.text.y = element_text(size = 9),
legend.title = element_blank())3. Now create a map of your subsetted dataset.
First map chunk here
Getting the longtitude and latitude in Florida
mean_lat <- mean(health$lat,na.rm = TRUE)
mean_long <- mean(health$long,na.rm=TRUE)
mean_lat[1] 27.20351
mean_long[1] -80.97493
Map - Filtered only for high blood pressure in Florida
health1 <- health |>
filter(Short_Question_Text == "High Blood Pressure") |>
group_by(CityName,lat,long)|>
summarise(Data_Value=mean(Data_Value,na.rm=TRUE))`summarise()` has grouped output by 'CityName', 'lat'. You can override using
the `.groups` argument.
leaflet() |>
setView(lng=mean_long,lat=mean_lat,zoom =6)|>
addProviderTiles("Esri.WorldStreetMap")|>
addCircles(
data = health1,
radius = sqrt(health1$Data_Value) * 2000,
color = "black",
fillColor = "#7FFF00",
fillOpacity = 0.7,
)Assuming "long" and "lat" are longitude and latitude, respectively
4. Refine your map to include a mouse-click tooltip
Refined map chunk here
Tooltip
pophealth1 <- paste0("<b>", health1$CityName, "</b><br>",
"High Blood Pressure: ", round(health1$Data_Value, 1), "%")leaflet() |>
setView(lng=mean_long,lat=mean_lat,zoom =6)|>
addProviderTiles("Esri.WorldStreetMap")|>
addCircles(
data = health1,
radius = sqrt(health1$Data_Value) * 2000,
color = "black",
fillColor = "#7FFF00",
fillOpacity = 0.7,
popup = pophealth1
)Assuming "long" and "lat" are longitude and latitude, respectively
5. Write a paragraph
In a paragraph, describe the plots you created and the insights they show.
In Florida during 2017, the health outcomes with the highest age-adjusted prevalence were high blood pressure and high cholesterol, showing that these conditions were the most common among residents across Florida cities. To explore this further, I created a facet wrap scatterplot that compared five major health outcomes, high blood pressure, high cholesterol, arthritis, mental health, and physical health, against city population size. Each facet shows how the prevalence of each condition varies by population, making it easier to compare trends across different health issues.
For the map, I focused only on high blood pressure and visualized its distribution across Florida. Each circle represents a city, with its size reflecting the percentage of people affected by high blood pressure. One interesting detail is that the Miami area appears clustered,it’s just only because, the dataset includes several nearby municipalities such as Miami, Miami Beach, and Miami Gardens, which are all geographically close to one another, creating overlapping points in that region.