Healthy Cities GIS Assignment

Author

Ava Haghighi

libraries

library(tidyverse)
library(tidyr)
library(knitr)
library(leaflet)
library(sf)
library(readr)
# Read the 500 Cities local health indicators CSV.
# The directory is kept in one variable and joined with file.path() instead of
# calling setwd(), so loading the data does not change the session's working
# directory. NOTE(review): this path is specific to one machine; point
# `data_dir` at wherever the CSV lives.
data_dir <- "C:/Users/ava/Downloads/data110"
cities500 <- read_csv(
  file.path(data_dir, "500CitiesLocalHealthIndicators.cdc.csv")
)

The GeoLocation variable has (lat, long) format

Split GeoLocation (lat, long) into two columns: lat and long

# GeoLocation holds text of the form "(lat, long)". Strip both parentheses,
# then split on the comma into `lat` and `long`; convert = TRUE asks
# separate() to type-convert the two new columns.
latlong <- separate(
  mutate(cities500, GeoLocation = str_replace_all(GeoLocation, "[()]", "")),
  col = GeoLocation,
  into = c("lat", "long"),
  sep = ",",
  convert = TRUE
)

Filter the dataset

Remove the rows whose StateDesc is "United States", keep only the Prevention category (the category of interest), keep only crude-prevalence measurements, and keep only the year 2017.

# Keep a row only when every condition holds: it is not the national
# "United States" record, it belongs to the Prevention category, it reports
# crude prevalence, and it is from 2017.
latlong_clean <- filter(
  latlong,
  StateDesc != "United States",
  Category == "Prevention",
  Data_Value_Type == "Crude prevalence",
  Year == 2017
)

# Preview the first rows of the filtered data.
head(latlong_clean)

# A tibble: 6 × 25
   Year StateAbbr StateDesc  CityName   GeographicLevel DataSource Category  
  <dbl> <chr>     <chr>      <chr>      <chr>           <chr>      <chr>     
1  2017 AL        Alabama    Montgomery City            BRFSS      Prevention
2  2017 CA        California Concord    City            BRFSS      Prevention
3  2017 CA        California Concord    City            BRFSS      Prevention
4  2017 CA        California Fontana    City            BRFSS      Prevention
5  2017 CA        California Richmond   Census Tract    BRFSS      Prevention
6  2017 FL        Florida    Davie      Census Tract    BRFSS      Prevention
# ℹ 18 more variables: UniqueID <chr>, Measure <chr>, Data_Value_Unit <chr>,
#   DataValueTypeID <chr>, Data_Value_Type <chr>, Data_Value <dbl>,
#   Low_Confidence_Limit <dbl>, High_Confidence_Limit <dbl>,
#   Data_Value_Footnote_Symbol <chr>, Data_Value_Footnote <chr>,
#   PopulationCount <dbl>, lat <dbl>, long <dbl>, CategoryID <chr>,
#   MeasureId <chr>, CityFIPS <dbl>, TractFIPS <dbl>, Short_Question_Text <chr>

# List the column names of the filtered data.
colnames(latlong_clean)

 [1] "Year"                       "StateAbbr"                 
 [3] "StateDesc"                  "CityName"                  
 [5] "GeographicLevel"            "DataSource"                
 [7] "Category"                   "UniqueID"                  
 [9] "Measure"                    "Data_Value_Unit"           
[11] "DataValueTypeID"            "Data_Value_Type"           
[13] "Data_Value"                 "Low_Confidence_Limit"      
[15] "High_Confidence_Limit"      "Data_Value_Footnote_Symbol"
[17] "Data_Value_Footnote"        "PopulationCount"           
[19] "lat"                        "long"                      
[21] "CategoryID"                 "MeasureId"                 
[23] "CityFIPS"                   "TractFIPS"                 
[25] "Short_Question_Text"

Remove the variables that will not be used in the assignment

# Drop the columns that are not used later in the assignment.
prevention <- select(
  latlong_clean,
  -c(
    DataSource,
    Data_Value_Unit,
    DataValueTypeID,
    Low_Confidence_Limit,
    High_Confidence_Limit,
    Data_Value_Footnote_Symbol,
    Data_Value_Footnote
  )
)

#md <- prevention |>
  ##filter(StateAbbr=="MD")
#head(md)

The new dataset

I have filtered the dataset for the state of Texas and selected three cities in this state to look at their population counts and how these relate to their answers to various survey questions.

# Show each state abbreviation present in the filtered data once.
unique(latlong_clean[["StateAbbr"]])

 [1] "AL" "CA" "FL" "CT" "IL" "MN" "NY" "PA" "NC" "OH" "OK" "OR" "TX" "RI" "SC"
[16] "SD" "TN" "UT" "VA" "WA" "AK" "WI" "AZ" "AR" "CO" "DE" "NV" "DC" "GA" "ID"
[31] "HI" "MA" "MI" "IN" "KS" "KY" "IA" "LA" "MD" "ME" "NH" "NJ" "NM" "MO" "MS"
[46] "NE" "MT" "ND" "WV" "VT" "WY"

# Keep only the Texas rows.
TX_data <- filter(prevention, StateAbbr == "TX")

# Keep only the city name, population count, and short question text.
selected_data <- select(
  TX_data,
  CityName,
  PopulationCount,
  Short_Question_Text
)

Cities: Austin, Houston, and Dallas

# Keep only the rows for the three cities of interest.
# `%in%` tests each CityName for membership in the set. The previous
# `CityName == c(...)` recycled the three names down the column, so a row was
# kept only when its city happened to line up with the recycled position,
# silently dropping most Austin/Houston/Dallas rows.
selected_city <- selected_data |>
  filter(CityName %in% c("Austin", "Houston", "Dallas"))

# Coerce the coordinates to numeric: `lat` is overwritten in place, and a new
# `lng` column is added alongside the existing `long` column.
TX_data <- mutate(
  TX_data,
  lat = as.numeric(lat),
  lng = as.numeric(long)
)

# Per-city summary intended for mapping: mean latitude, mean longitude, and
# the sum of PopulationCount over every row of that city (NAs ignored).
# NOTE(review): each city appears to have one row per measure (and possibly
# per census tract), so this sum may count the same people several times —
# confirm that a sum is what is wanted here.
map_data <- summarise(
  group_by(TX_data, CityName),
  lat = mean(lat, na.rm = TRUE),
  lng = mean(long, na.rm = TRUE),
  population = sum(PopulationCount, na.rm = TRUE)
)

Bar Chart

The bar charts below show, for each city, the population count associated with each question. The questions cover annual checkups, health insurance, taking BP medication, and cholesterol screening.

# Bar chart of PopulationCount per question, with one facet per city stacked
# in a single column. Each facet has its own y scale, and each question gets
# its own fill colour.
p2 <- ggplot(
  data = selected_city,
  mapping = aes(
    x = Short_Question_Text,
    y = PopulationCount,
    fill = Short_Question_Text
  )
) +
  geom_bar(stat = "identity") +
  facet_wrap(~CityName, scales = "free_y", ncol = 1) +
  labs(
    title = " 3 Citys Of Texas",
    x = "Question ",
    y = " Population"
  ) +
  # Right-justify the x-axis labels and hide the legend, which would only
  # repeat the x-axis categories.
  theme(
    axis.text.x = element_text(hjust = 1),
    legend.position = "none"
  )

p2

First Map

The map below shows the population of each city; therefore, the bigger the bubble, the bigger the population.

# Build one row per city holding its coordinates and a single population
# figure, so each circle's position and radius come from the same row.
# Previously three hard-coded coordinates were recycled against a radius
# computed for every row of `selected_city`, so bubble sizes were not tied to
# the correct city.
city_bubbles <- selected_city |>
  group_by(CityName) |>
  # NOTE(review): the data mixes "City" and "Census Tract" rows, so the
  # largest PopulationCount per city is presumably the city-level total —
  # confirm.
  summarise(PopulationCount = max(PopulationCount, na.rm = TRUE)) |>
  inner_join(
    tibble(
      CityName = c("Austin", "Dallas", "Houston"),
      lng = c(-97.7431, -96.7970, -95.3698),
      lat = c(30.2672, 32.7767, 29.7604)
    ),
    by = "CityName"
  )

# Bubble map: the circle radius grows with the square root of the population.
leaflet(data = city_bubbles) |>
  setView(lng = -98.5456116, lat = 31.2638905, zoom = 6) |>
  addProviderTiles("Esri.WorldStreetMap") |>
  addCircles(
    lng = ~lng,
    lat = ~lat,
    radius = ~sqrt(PopulationCount) * 100,
    color = "blue",
    fillColor = "#c0defa",
    fillOpacity = 0.5
  )

Second Map

The map below is a refined version of the first map: clicking a bubble shows, for that city, the average value for each question. To compute these averages, the rows for each question were first selected, and then `summarise()` was used to obtain a single number representing the average population count for that question in each city.

# Split the three-city data into one table per survey question.
short_quetion1 <- filter(
  selected_city,
  Short_Question_Text == "Annual Checkup"
)

short_quetion2 <- filter(
  selected_city,
  Short_Question_Text == "Health Insurance"
)

short_quetion3 <- filter(
  selected_city,
  Short_Question_Text == "Cholesterol Screening"
)

short_quetion4 <- filter(
  selected_city,
  Short_Question_Text == "Taking BP Medication"
)

# For each question, average PopulationCount within each city. Every result
# has one row per city and one column holding the mean.
avg1 <- summarise(
  group_by(short_quetion1, CityName),
  Annual_check = mean(PopulationCount)
)

avg2 <- summarise(
  group_by(short_quetion2, CityName),
  Health_insurance = mean(PopulationCount)
)

avg3 <- summarise(
  group_by(short_quetion3, CityName),
  Cholesterol_Screening = mean(PopulationCount)
)

avg4 <- summarise(
  group_by(short_quetion4, CityName),
  TakingBPMedication = mean(PopulationCount)
)

# Combine the four per-question averages into one row per city by joining on
# CityName, so every popup line refers to the same city. Pasting the four
# tables positionally would misalign (or silently recycle) values whenever a
# city is missing from one of them. Rows are sorted by CityName, so `popupq`
# holds one entry per city in alphabetical order.
question_avgs <- avg1 |>
  full_join(avg2, by = "CityName") |>
  full_join(avg3, by = "CityName") |>
  full_join(avg4, by = "CityName") |>
  arrange(CityName)

# HTML popup text, one element per city.
popupq <- paste0(
  "<b> Annual check up: </b>", question_avgs$Annual_check, "<br>",
  "<b> Health Insurance: </b>", question_avgs$Health_insurance, "<br>",
  "<b> Taking BP Medication: </b>", question_avgs$TakingBPMedication, "<br>",
  "<b> Cholesterol Screening: </b>", question_avgs$Cholesterol_Screening, "<br>"
)

  # Build one row per city holding its coordinates, popup text and a single
  # population figure, so each circle's position, radius and popup come from
  # the same row. Previously three hard-coded coordinates and a three-element
  # popup vector were recycled against a radius computed for every row of
  # `selected_city`.
  # `popupq` is expected to hold one entry per city in alphabetical CityName
  # order (group_by()/summarise() sort their groups); tibble() raises an error
  # if its length does not match the three cities.
  city_popups <- tibble(
    CityName = c("Austin", "Dallas", "Houston"),
    lng = c(-97.7431, -96.7970, -95.3698),
    lat = c(30.2672, 32.7767, 29.7604),
    popup = popupq
  ) |>
    inner_join(
      selected_city |>
        group_by(CityName) |>
        # NOTE(review): the largest PopulationCount per city is presumably
        # the city-level total (other rows may be census tracts) — confirm.
        summarise(PopulationCount = max(PopulationCount, na.rm = TRUE)),
      by = "CityName"
    )

  # Bubble map with a clickable popup of per-question averages on each city.
  leaflet(data = city_popups) |>
    setView(lng = -98.5456116, lat = 31.2638905, zoom = 7) |>
    addProviderTiles("Esri.WorldStreetMap") |>
    addCircles(
      lng = ~lng,
      lat = ~lat,
      radius = ~sqrt(PopulationCount) * 100,
      color = "blue",
      fillColor = "#c0defa",
      fillOpacity = 0.5,
      popup = ~popup
    )

5. Paragraph

The map I created initially displays the population of each of the three selected Texas cities. In the refined version of this map, I show the average number of people who responded to questions about health insurance, annual check-ups, taking BP medication, and cholesterol screening. It is evident that larger city populations correspond to more responses to these health-related questions. Furthermore, the related bar chart shows which health category is least followed in every city. Each category is shown in its own color to make comparisons easier, and cholesterol screening has the lowest participation rate.