library(tidyverse)
library(tidyr)
# NOTE(review): this absolute path is machine-specific; change it before
# running the script on another computer.
setwd("C:/Users/mafok/OneDrive/Desktop/Data 110")
# Read the local health indicators CSV from the working directory.
cities500 <- read_csv("500CitiesLocalHealthIndicators.cdc.csv")
# The former `data(cities500)` call was dropped: data() looks up datasets
# shipped with packages, so it only produced a "data set not found" warning;
# cities500 is already in memory from read_csv() above.
# Healthy Cities GIS Assignment
Load the libraries and set the working directory
The GeoLocation variable has (lat, long) format
Split GeoLocation (lat, long) into two columns: lat and long
# GeoLocation holds "(lat, long)" text: strip the parentheses, then split on
# the comma into lat and long columns (convert = TRUE parses them as numbers).
latlong <- cities500 |>
  mutate(
    GeoLocation = str_replace_all(
      string = GeoLocation,
      pattern = "[()]",
      replacement = ""
    )
  ) |>
  separate(
    col = GeoLocation,
    into = c("lat", "long"),
    sep = ",",
    convert = TRUE
  )
head(latlong)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Health Outcom…
4 2017 CA California Hayward City BRFSS Unhealthy Beh…
5 2017 CA California Hemet City BRFSS Prevention
6 2017 CA California Indio Census Tract BRFSS Health Outcom…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
Filter the dataset
Remove the rows whose StateDesc is "United States", keep only rows measuring crude prevalence, and keep only 2017. Note: the code below does not filter on Category, so all categories (not only Prevention, the category of interest) remain in the result.
# Keep state-level rows only (drop the "United States" aggregate), restricted
# to crude-prevalence values from 2017. Conditions given to a single filter()
# call are combined with AND.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States",
    Data_Value_Type == "Crude prevalence",
    Year == 2017
  )
head(latlong_clean)
# A tibble: 6 × 25
Year StateAbbr StateDesc CityName GeographicLevel DataSource Category
<dbl> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract BRFSS Health Outcom…
2 2017 CA California Hawthorne City BRFSS Unhealthy Beh…
3 2017 CA California Hayward City BRFSS Unhealthy Beh…
4 2017 CA California Indio Census Tract BRFSS Health Outcom…
5 2017 CA California Inglewood Census Tract BRFSS Health Outcom…
6 2017 CA California Lakewood City BRFSS Unhealthy Beh…
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
# DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
# Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
# Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
What variables are included? (can any of them be removed?)
# Print the column names of the filtered data to decide which can be dropped.
names(latlong_clean) [1] "Year" "StateAbbr"
[3] "StateDesc" "CityName"
[5] "GeographicLevel" "DataSource"
[7] "Category" "UniqueID"
[9] "Measure" "Data_Value_Unit"
[11] "DataValueTypeID" "Data_Value_Type"
[13] "Data_Value" "Low_Confidence_Limit"
[15] "High_Confidence_Limit" "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote" "PopulationCount"
[19] "lat" "long"
[21] "CategoryID" "MeasureId"
[23] "CityFIPS" "TractFIPS"
[25] "Short_Question_Text"
Remove the variables that will not be used in the assignment
# Drop the data-source, unit, value-type-id, confidence-limit and footnote
# columns, which are not used later in the assignment.
latlong_clean2 <- latlong_clean |>
  select(
    -c(
      DataSource, Data_Value_Unit, DataValueTypeID,
      Low_Confidence_Limit, High_Confidence_Limit,
      Data_Value_Footnote_Symbol, Data_Value_Footnote
    )
  )
head(latlong_clean2)
# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract Health … 0632548… Arthri…
2 2017 CA California Hawthorne City Unhealt… 632548 Curren…
3 2017 CA California Hayward City Unhealt… 633000 Obesit…
4 2017 CA California Indio Census Tract Health … 0636448… Arthri…
5 2017 CA California Inglewood Census Tract Health … 0636546… Diagno…
6 2017 CA California Lakewood City Unhealt… 639892 Obesit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
The new dataset, latlong_clean2, is now a manageable size.
For your assignment, work with a cleaned dataset.
1. Once you run the above code and learn how to filter in this format, filter this dataset however you choose so that you have a subset with no more than 900 observations.
Filter chunk here
# Rebuild the cleaned data: non-"United States" rows, crude prevalence, 2017.
latlong_clean <- latlong |>
  filter(
    StateDesc != "United States" &
      Data_Value_Type == "Crude prevalence" &
      Year == 2017
  )
# Keep every column except the source, unit, type-id, confidence-limit and
# footnote fields.
latlong_clean2 <- latlong_clean |>
  select(
    !c(
      DataSource, Data_Value_Unit, DataValueTypeID,
      Low_Confidence_Limit, High_Confidence_Limit,
      Data_Value_Footnote_Symbol, Data_Value_Footnote
    )
  )
head(latlong_clean2)
# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Hawthorne Census Tract Health … 0632548… Arthri…
2 2017 CA California Hawthorne City Unhealt… 632548 Curren…
3 2017 CA California Hayward City Unhealt… 633000 Obesit…
4 2017 CA California Indio Census Tract Health … 0636448… Arthri…
5 2017 CA California Inglewood Census Tract Health … 0636546… Diagno…
6 2017 CA California Lakewood City Unhealt… 639892 Obesit…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Cholesterol-screening rows with a recorded value for New York, Los Angeles
# and Chicago.
bar_health <- latlong_clean2 |>
  filter(
    CityName %in% c("New York", "Los Angeles", "Chicago"),
    Measure == "Cholesterol screening among adults aged >=18 Years",
    !is.na(Data_Value)
  )
# Row count check: the printed result (3908) is above the assignment's
# 900-observation limit.
nrow(bar_health)
# [1] 3908
head(bar_health)
# A tibble: 6 × 18
Year StateAbbr StateDesc CityName GeographicLevel Category UniqueID Measure
<dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 2017 CA California Los Ange… Census Tract Prevent… 0644000… Choles…
2 2017 CA California Los Ange… Census Tract Prevent… 0644000… Choles…
3 2017 CA California Los Ange… Census Tract Prevent… 0644000… Choles…
4 2017 CA California Los Ange… Census Tract Prevent… 0644000… Choles…
5 2017 CA California Los Ange… Census Tract Prevent… 0644000… Choles…
6 2017 CA California Los Ange… Census Tract Prevent… 0644000… Choles…
# ℹ 10 more variables: Data_Value_Type <chr>, Data_Value <dbl>,
# PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
# MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>
# Count cholesterol-screening rows per city, largest first.
count(
  filter(
    latlong_clean2,
    Measure == "Cholesterol screening among adults aged >=18 Years"
  ),
  CityName,
  sort = TRUE
)
# A tibble: 474 × 2
CityName n
<chr> <int>
1 New York 2141
2 Los Angeles 1004
3 Chicago 798
4 Houston 603
5 Philadelphia 382
6 Phoenix 360
7 San Antonio 327
8 Dallas 320
9 Detroit 297
10 San Diego 297
# ℹ 464 more rows
# Cholesterol-screening rows with a recorded value for the five focus cities.
subset_900 <- latlong_clean2 |>
  filter(
    CityName %in% c("Los Angeles", "Philadelphia", "Phoenix", "Houston", "San Antonio"),
    Measure == "Cholesterol screening among adults aged >=18 Years",
    !is.na(Data_Value)
  )
# Confirm which cities made it into the subset.
unique(subset_900$CityName)
# [1] "Philadelphia" "Houston" "San Antonio" "Phoenix" "Los Angeles"
## Subsetting to 900 observations
# Fix the RNG seed so the random draw is the same on every run.
set.seed(110)
# NOTE(review): this draws from bar_health (New York, Los Angeles, Chicago)
# and overwrites any subset_900 built earlier -- confirm that is intended.
# slice_sample() replaces the superseded sample_n(); min() caps the request at
# the number of rows available.
subset_900 <- bar_health |>
  slice_sample(n = min(900, nrow(bar_health)))
# 2. Based on the GIS tutorial (Japan earthquakes), create one plot about something in your subsetted dataset.
First plot chunk here
# Subset used for the plots: the first 900 cholesterol-screening rows that
# have a recorded value, across all cities.
# NOTE(review): slice_head() keeps rows in their existing order, so which
# cities are represented depends on how the data happens to be ordered.
subset_900 <- latlong_clean2 |>
  filter(Measure == "Cholesterol screening among adults aged >=18 Years") |>
  filter(!is.na(Data_Value)) |>
  slice_head(n = 900)

library(dplyr)
## I used ChatGPT to do the dplyr summarise to reduce my search to a limited number of states
# Mean cholesterol-screening prevalence per focus city, taken from subset_900.
# NOTE(review): a focus city with no rows in subset_900 is silently missing
# from bar_data and therefore from the chart -- confirm all five are present.
bar_data <- subset_900 |>
  filter(CityName %in% c("Los Angeles", "Philadelphia", "Phoenix", "Houston", "San Antonio")) |>
  group_by(CityName) |>
  summarise(Data_Value = mean(Data_Value, na.rm = TRUE))

# One bar per city; geom_col() uses the y values as bar heights (the idiomatic
# form of geom_bar(stat = "identity")).
ggplot(bar_data, aes(x = CityName, y = Data_Value, fill = CityName)) +
  geom_col() +
  scale_fill_manual(
    values = c(
      "Los Angeles" = "#1f79b4",
      "Philadelphia" = "#ff7f0e",
      "Phoenix" = "#2ca32c",
      "Houston" = "#d62728",
      "San Antonio" = "#9437bd"
    ),
    name = "City"
  ) +
  labs(
    title = "Cholesterol Screening among Adults Aged ≥18 Years (2017)",
    x = "City",
    y = "Crude Prevalence (%)"
  ) +
  theme_minimal()
# This bar graph shows the percentage of adults aged 18 and older who got
# screened for cholesterol in 2017 across five U.S. cities: Los Angeles,
# Philadelphia, Phoenix, Houston, and San Antonio. Each bar represents one
# city, and the height of the bar shows how common cholesterol screening was
# in that location. The colors make it easier to tell the cities apart. We
# used custom colors to make the graph more visually clear and used meaningful
# labels so viewers can understand the message at a glance.
From the graph, we can see that Philadelphia has the highest prevalence rate of cholesterol screening, meaning more adults there got tested. In contrast, Phoenix and San Antonio have the lowest screening rates among the group. This tells us that people in Philadelphia may have better access to healthcare services or stronger health awareness programs. On the other hand, cities with lower rates may need more public health campaigns or easier access to screening. This kind of data helps identify where improvements are needed in community health.
3. Now create a map of your subsetted dataset.
First map chunk here
library(dplyr)
library(leaflet)
# Warning: package 'leaflet' was built under R version 4.4.3

# Step 1: Define your target cities
target_cities <- c("Los Angeles", "Philadelphia", "Phoenix", "Houston", "San Antonio")

# Step 2: Add the hover label (city, state, prevalence) and flag each row as
# belonging to a target city ("highlight") or not ("other")
subset_900 <- subset_900 |>
  mutate(
    label_text = paste0(CityName, ", ", StateDesc, ": ", Data_Value, "%"),
    is_target = ifelse(CityName %in% target_cities, "highlight", "other")
  )

# Step 3: Assign different colors based on whether the city is in your filtered list.
# `levels` pins "highlight" to the first colour and "other" to the second, so
# the mapping no longer depends on which categories are present in the data
# and always agrees with the hard-coded legend below.
city_colors <- colorFactor(
  palette = c("#3F88C5", "#F4A261"),
  domain = NULL,
  levels = c("highlight", "other")
)

# Step 4: Create the leaflet map
leaflet(subset_900) |>
  addTiles() |>
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    radius = ~Data_Value / 2, # marker size scales with the prevalence value
    color = ~city_colors(is_target), # I used chatgpt to help target
    stroke = FALSE,
    fillOpacity = 0.9,
    label = ~label_text, # shown on hover
    popup = ~paste( # shown on click
      "<b>City:</b>", CityName, "<br>",
      "<b>State:</b>", StateDesc, "<br>",
      "<b>Prevalence:</b>", Data_Value, "%"
    )
  ) |>
  addLegend(
    position = "bottomright",
    colors = c("#3F88C5", "#F4A261"), # same order as the palette levels
    labels = c("Filtered Cities", "Other Cities"),
    title = "City Category",
    opacity = 1
  )
# This interactive leaflet map shows cholesterol screening rates for adults
# aged 18 and older across different U.S. cities. The five cities we focused
# on—Los Angeles, Philadelphia, Phoenix, Houston, and San Antonio—are shown in
# blue, while the other cities appear in orange. Each circle on the map
# represents a city, and the size of the circle shows how high the screening
# rate is. When you hover over or click on a circle, a tooltip or popup gives
# more details like the city’s name, the state it’s in, and the exact
# percentage of people screened.
From this map, Philadelphia stands out with a larger blue circle, meaning it has a higher cholesterol screening rate compared to the other focus cities. This suggests that people in Philadelphia may have better access to health services or more awareness about heart health. Cities like San Antonio or Phoenix have smaller circles, which could mean they need more public health outreach. Overall, this map helps us easily compare screening rates and see which cities may need more attention when it comes to preventive healthcare.
4. Refine your map to include a mouse-click tooltip
Refined map chunk here
library(leaflet)
library(dplyr)
# Step 1: Define the target cities
target_cities <- c("Los Angeles", "Philadelphia", "Phoenix", "Houston", "San Antonio")

# Step 2: Filter dataset for selected cities
filtered_subset <- subset_900 |>
  filter(CityName %in% target_cities)

# Step 3: Create a color palette for the filtered cities (one colour per city
# name found in the filtered data)
city_colors <- colorFactor(
  palette = c("#E63946", "#2A9D8F", "#F4A261", "#A23E48", "#3F88C5"),
  domain = filtered_subset$CityName
)

# Step 4: Create the leaflet map
leaflet(filtered_subset) |>
  addTiles() |>
  addCircleMarkers(
    lng = ~long,
    lat = ~lat,
    radius = ~Data_Value / 2, # marker size scales with the prevalence value
    color = ~city_colors(CityName),
    stroke = FALSE,
    fillOpacity = 0.8,
    label = ~paste0(CityName, ", ", StateDesc, ": ", Data_Value, "%"), # tooltip label
    popup = ~paste( # mouse-click popup with the row's details
      "<b>City:</b>", CityName, "<br>",
      "<b>State:</b>", StateDesc, "<br>",
      "<b>Geographic Level:</b>", GeographicLevel, "<br>",
      "<b>Measure:</b>", Measure, "<br>",
      "<b>Prevalence:</b>", Data_Value, "%"
    )
  ) |>
  addLegend(
    position = "bottomright",
    pal = city_colors, # legend colours come from the same palette as the markers
    values = ~CityName,
    title = "City",
    opacity = 1
  )
# 5. Write a paragraph
In a paragraph, describe the plots you created and what they show.
This interactive map shows cholesterol screening rates among adults aged 18 and older in five U.S. cities: Los Angeles, Philadelphia, Phoenix, Houston, and San Antonio. Each city is marked by a colored circle, and the size of the circle reflects the screening rate—the larger the circle, the higher the percentage of people who got screened. When you hover over or click on a city, a label and popup appear showing the city’s name, state, and cholesterol screening percentage. This helps us understand and compare health trends visually across different cities.
From this map, we can see that Philadelphia has the largest circle, meaning it has the highest cholesterol screening rate among the five selected cities. This might suggest that people in Philadelphia have better access to preventive health services or stronger public health efforts. On the other hand, cities like Phoenix and San Antonio show smaller circles, indicating lower screening rates. This could mean there’s a need for more awareness or better healthcare access in those areas. Overall, the map is a helpful tool to identify where health support is working well and where improvements are still needed.
4o