Import libraries

library(dplyr)
library(sf)
library(sp)
library(tidyr)
library(spdep)
library(stringr)
library(dbscan)
library(ggplot2)
library(h3jsr)
library(GGally)
library(leaflet)
library(cluster)
library(clusterSim)
library(clValid)
library(gstat)
library(raster)
library(viridis)
library(viridisLite)
library(tidycensus)
library(pscl)

Load datasets

FARS_wide <- read.csv('FARS_NJ-NY-CT-18-22.csv')
SDOH_df <- read.csv('SDOH_NJ-NY-CT.csv')
combined_sf <- read.csv('combined_health_crash.csv')

sf_nj <- st_read('cb_2018_34_tract_500k/cb_2018_34_tract_500k.shp')
## Reading layer `cb_2018_34_tract_500k' from data source 
##   `/Users/erandoni/mechkar/cb_2018_34_tract_500k/cb_2018_34_tract_500k.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 2006 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -75.55961 ymin: 38.92852 xmax: -73.89398 ymax: 41.35742
## Geodetic CRS:  NAD83
sf_ny <- st_read('cb_2018_36_tract_500k/cb_2018_36_tract_500k.shp')
## Reading layer `cb_2018_36_tract_500k' from data source 
##   `/Users/erandoni/mechkar/cb_2018_36_tract_500k/cb_2018_36_tract_500k.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 4906 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.76215 ymin: 40.4961 xmax: -71.85621 ymax: 45.01585
## Geodetic CRS:  NAD83
sf_ct <- st_read('cb_2018_09_tract_500k/cb_2018_09_tract_500k.shp')
## Reading layer `cb_2018_09_tract_500k' from data source 
##   `/Users/erandoni/mechkar/cb_2018_09_tract_500k/cb_2018_09_tract_500k.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 830 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -73.72777 ymin: 40.98014 xmax: -71.78699 ymax: 42.05059
## Geodetic CRS:  NAD83
tristate_sf <- rbind(sf_nj, sf_ny, sf_ct)
# Connecticut tract GEOIDs (state FIPS 09) lose their leading zero when read in as numeric; restore it
combined_sf$GEOID[combined_sf$STATEFP == 9] <- paste0('0', combined_sf$GEOID[combined_sf$STATEFP == 9])

combined_sf <- merge(tristate_sf, combined_sf, by = 'GEOID', all.x = TRUE)

We want to find spatial clusters based on the following variables:
- Fair or poor self-rated health status among adults (GHLTH_CrudePrev)
- Current asthma prevalence among adults (CASTHMA_CrudePrev)
- High blood pressure among adults (BPHIGH_CrudePrev)
- High cholesterol among adults (HIGHCHOL_CrudePrev)
- Diagnosed diabetes among adults (DIABETES_CrudePrev)
- Current lack of health insurance among adults (ACCESS2_CrudePrev)

First let’s get a general look at these variables by county:

# group data into counties, plot on map

counties <- combined_sf %>%
  group_by(CountyCode) %>%
  summarize(
    GHLTH = mean(GHLTH_CrudePrev, na.rm = TRUE),
    ASTHMA = mean(CASTHMA_CrudePrev, na.rm = TRUE),
    BPHIGH = mean(BPHIGH_CrudePrev, na.rm = TRUE),
    HIGHCHOL = mean(HIGHCHOL_CrudePrev, na.rm = TRUE),
    DIABETES = mean(DIABETES_CrudePrev, na.rm = TRUE),
    INSURANCE = mean(ACCESS2_CrudePrev, na.rm = TRUE),
    geometry = st_union(geometry)
  )
ggplot(data = counties) +
  geom_sf(aes(fill = GHLTH)) +
  scale_fill_viridis_c(option = "viridis", name = "LOW GHLTH") +
  theme_minimal() +
  theme(legend.position = 'right') +
  labs(title = "Prevalence of low general health by county")

ggplot(data = counties) +
  geom_sf(aes(fill = ASTHMA)) +
  scale_fill_viridis_c(option = "viridis", name = "ASTHMA") +
  theme_minimal() +
  theme(legend.position = 'right') +
  labs(title = "Prevalence of asthma by county")

ggplot(data = counties) +
  geom_sf(aes(fill = BPHIGH)) +
  scale_fill_viridis_c(option = "viridis", name = "BPHIGH") +
  theme_minimal() +
  theme(legend.position = 'right') +
  labs(title = "Prevalence of high blood pressure by county")

ggplot(data = counties) +
  geom_sf(aes(fill = HIGHCHOL)) +
  scale_fill_viridis_c(option = "viridis", name = "HIGHCHOL") +
  theme_minimal() +
  theme(legend.position = 'right') +
  labs(title = "Prevalence of high cholestrol by county")

ggplot(data = counties) +
  geom_sf(aes(fill = DIABETES)) +
  scale_fill_viridis_c(option = "viridis", name = "DIABETES") +
  theme_minimal() +
  theme(legend.position = 'right') +
  labs(title = "Prevalence of diabetes by county")

ggplot(data = counties) +
  geom_sf(aes(fill = INSURANCE)) +
  scale_fill_viridis_c(option = "viridis", name = "INSURANCE") +
  theme_minimal() +
  theme(legend.position = 'right') +
  labs(title = "Prevalence of lack of insurance by county")
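
The six county maps above differ only in the fill variable and the title; a small helper function (a sketch using the counties object built above) would remove the repetition:

# hypothetical helper: one county-level choropleth per health variable
plot_county_map <- function(var, title) {
  ggplot(data = counties) +
    geom_sf(aes(fill = .data[[var]])) +
    scale_fill_viridis_c(option = "viridis", name = var) +
    theme_minimal() +
    theme(legend.position = 'right') +
    labs(title = title)
}

plot_county_map("DIABETES", "Prevalence of diabetes by county")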

The dataset contains a variable called health_intensity that is calculated from the 6 variables above as follows: for each variable, each census tract's percentile relative to the whole dataset is computed. For example, the median of BPHIGH_CrudePrev is 29.8, so a census tract whose BPHIGH_CrudePrev is 29.8 sits at the 50th percentile. The six percentiles are then summed, divided by 6, and assigned as the health_intensity variable. This variable is important because it lets us gauge the overall health of a census tract based on those 6 variables; since all six measure adverse outcomes, higher values indicate worse health.
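
A minimal sketch of that computation (health_intensity already ships with the dataset; this only illustrates the method, and NAs propagate for tracts with missing data):

health_vars <- c("GHLTH_CrudePrev", "CASTHMA_CrudePrev", "BPHIGH_CrudePrev",
                 "HIGHCHOL_CrudePrev", "DIABETES_CrudePrev", "ACCESS2_CrudePrev")

# percentile of each tract on each variable, via the empirical CDF
pct_ranks <- sapply(health_vars, function(v) ecdf(combined_sf[[v]])(combined_sf[[v]]) * 100)

# average of the six percentiles; this should approximately reproduce health_intensity
rowMeans(pct_ranks)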

Density plot of the health_intensity variable, with a matching normal curve overlaid in red:

ggplot(data = combined_sf, aes(x = health_intensity)) + 
  geom_density(fill = "blue", alpha = 0.5) +
  # overlay a normal curve with the sample mean and sd for comparison
  stat_function(fun = dnorm, args = list(mean = mean(combined_sf$health_intensity, na.rm = TRUE), sd = sd(combined_sf$health_intensity, na.rm = TRUE)), color = "red") +
  labs(title = "Density Plot of health intensity",
       x = "health intensity",
       y = "Density") +
  theme_minimal()

Health intensity by census tract:

palette <- colorNumeric(
  palette = colorRampPalette(c("green", "red"))(100),
  domain = combined_sf$health_intensity
)

leaflet(data = combined_sf, options = leafletOptions(minZoom = 6)) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~palette(health_intensity),
    fillOpacity = 0.7,
    color = "black",
    weight = 1,
    opacity = 1,
    highlightOptions = highlightOptions(
      weight = 3,
      color = "white",
      bringToFront = TRUE
    ),
    label = ~paste("Health Intensity:", health_intensity)
  ) %>%
  addLegend(
    pal = palette,
    values = ~health_intensity,
    title = "Health Intensity",
    position = "bottomright"
  ) %>%
  addControl(
    html = "<h2>Health Intensity by Census Tract</h2>", 
    position = "topright",
    className = "map-title"
  )

Next, we split health intensity into quintiles and categorize our census tracts into 5 health categories to make the visualization easier to read:

# split data into 5 quintiles of health intensity
health_quantiles <- quantile(combined_sf$health_intensity, probs = seq(0, 1, by = 0.2), na.rm=TRUE)

combined_sf$health_cat <- cut(combined_sf$health_intensity, breaks = health_quantiles, include.lowest = TRUE, labels=FALSE)#c('extremely high', 'high', 'moderate', 'low', 'extremely low'))

palette <- colorFactor(palette = colorRampPalette(c("green", "limegreen","orange", "red", "darkred"))(10), domain = combined_sf$health_cat)

leaflet(data = combined_sf, options = leafletOptions(minZoom = 6)) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~palette(health_cat),
    fillOpacity = 0.7,
    color = "black",
    weight = 1,
    opacity = 1,
    highlightOptions = highlightOptions(
      weight = 3,
      color = "white",
      bringToFront = TRUE
    ),
    label = ~paste("Health Category:", health_cat)
  ) %>%
  addLegend(
    pal = palette,
    values = ~health_cat,
    title = "Health Category",
    position = "bottomright"
  ) %>%
  addControl(
    html = "<h2>Health Category by Census Tract</h2>", 
    position = "topright",
    className = "map-title"
  )

We want to find clusters of areas with low health. We can manually separate all contiguous areas whose census tracts are in the lowest health category, then filter for a minimum population and density (this is done to eliminate extremely rural areas with few people and few or no pedestrians). This method yields decent results, but it also misses the full picture due to its simplicity.

# calculate population density per acre. The area variable (Ac_Total_mean) may not be
# fully accurate, since the areas sum too low, but it still appears correlated with
# the actual area of each census tract
combined_sf$density <- combined_sf$TotalPopulation / combined_sf$Ac_Total_mean

# split off the worst health quintile, keep clusters with over 50k population and density over 20 per acre
lowhealth <- combined_sf %>% filter(health_cat == 5)
adj_list <- poly2nb(as_Spatial(lowhealth))
clusters <- n.comp.nb(adj_list)
lowhealth$cluster <- clusters$comp.id

cluster_populations <- lowhealth %>%
  group_by(cluster) %>%
  summarize(total_population = sum(TotalPopulation),
            avg_density = mean(density))

large_clusters <- cluster_populations %>%
  filter(total_population >= 50000 & avg_density >= 20) %>%
  pull(cluster)
lowhealth <- lowhealth %>%
  filter(cluster %in% large_clusters)
leaflet(data = lowhealth, options = leafletOptions(minZoom = 6)) %>%
  addTiles() %>%
  addPolygons(fillColor = ~colorFactor(viridis(length(unique(lowhealth$cluster))), domain = lowhealth$cluster)(cluster),
              fillOpacity = 0.7,
              weight = 1,
              color = 'black',
              label = ~paste("Cluster:", cluster),
              labelOptions = labelOptions(direction = "auto")) %>%
  addControl(
    html = "<h2>Low Health Clusters - Heuristic Method</h2>", 
    position = "topright",
    className = "map-title"
  )

As we can see, there are 19 distinct clusters that come up: 7 in NJ, 9 in NY, 3 in CT.
- NJ: Paterson; Newark, Irvington, and the Oranges; South Jersey City and Bayonne; Elizabeth; Trenton; Camden and Pennsauken; the Atlantic City area
- NYC: The Bronx and Harlem; Washington Heights; Flushing; Jamaica; Chinatown/Two Bridges; Sunset Park/Borough Park; East New York, Brownsville, and Crown Heights
- Upstate NY: Rochester; Buffalo
- CT: Hartford; Waterbury; Bridgeport

Now, we will use agglomerative hierarchical clustering to find clusters of low health. This is the most successful clustering method for this data that we have found so far.

combined_sf <- combined_sf %>% filter(!is.na(ACCESS2_CrudePrev) & !is.na(GHLTH_CrudePrev) & !is.na(CASTHMA_CrudePrev) & !is.na(BPHIGH_CrudePrev) & !is.na(HIGHCHOL_CrudePrev) & !is.na(DIABETES_CrudePrev))

nyc_metro <- combined_sf #%>% filter(CSA_Name_max == 'New York-Newark, NY-NJ-CT-PA')

coords <- st_coordinates(st_centroid(nyc_metro))
scaled_coords <- scale(coords)
scaled_data <- as.data.frame(cbind(scale(nyc_metro$GHLTH_CrudePrev), scale(nyc_metro$CASTHMA_CrudePrev), scale(nyc_metro$BPHIGH_CrudePrev), scale(nyc_metro$HIGHCHOL_CrudePrev), scale(nyc_metro$DIABETES_CrudePrev), scale(nyc_metro$ACCESS2_CrudePrev)))

combined <- as.data.frame(cbind((coords), scale(nyc_metro$GHLTH_CrudePrev), scale(nyc_metro$CASTHMA_CrudePrev), scale(nyc_metro$BPHIGH_CrudePrev), scale(nyc_metro$HIGHCHOL_CrudePrev), scale(nyc_metro$DIABETES_CrudePrev), scale(nyc_metro$ACCESS2_CrudePrev)))

colnames(combined) <- c("x", "y", "GHLTH_CrudePrev", "CASTHMA_CrudePrev", "BPHIGH_CrudePrev", "HIGHCHOL_CrudePrev", "DIABETES_CrudePrev", "ACCESS2_CrudePrev")

dist_coords <- dist(scaled_coords)
dist_data <- dist(scaled_data)
  
# Weighting factor for spatial distance
spatial_weight <- 0.8  # Adjust this factor to increase or decrease the emphasis on spatial proximity
  
# Combine distance matrices with emphasis on spatial nearness
combined_dist <- (spatial_weight * dist_coords) + ((1 - spatial_weight) * dist_data)

agg_v2 <- function(combined, combined_dist, i, method, graph) {
  # Calculate distance matrices
  
  hc <- hclust(combined_dist, method)
  clusters <- cutree(hc, k = i)
  
  # Assign clusters back to the data
  combined$cluster <- as.integer(as.factor(clusters))
  nyc_metro$cluster <- combined$cluster

  sil_scores <- silhouette(nyc_metro$cluster, dist(as.matrix(combined)))
  mean_sil_score <- mean(as.data.frame(sil_scores)$sil_width)
  print(paste("Silhouette score:", mean_sil_score))
    
  # Calculate Davies-Bouldin index
  db_index <- index.DB(as.matrix(combined), nyc_metro$cluster)$DB
  print(paste("Davies-Bouldin index:", db_index))
  
  # Calculate Dunn index
  dunn_index <- dunn(dist(as.matrix(combined)), nyc_metro$cluster)
  print(paste("Dunn index:", dunn_index))
  
  palette <- colorFactor(palette = rainbow(length(unique(nyc_metro$cluster))), domain = nyc_metro$cluster)
  
  clus_means <- nyc_metro %>% group_by(cluster) %>% summarise(mean_health = mean(health_intensity), total_population = sum(TotalPopulation), avg_density = mean(density), total_tracts = n())
  
  clus_means <- clus_means %>% filter(mean_health >= 65 & avg_density >= 20) %>% pull(cluster)
  
  bad_sf <- nyc_metro %>% filter(cluster %in% clus_means)
  neighbors <- poly2nb(as_Spatial(bad_sf))
  no_neighbors <- which(card(neighbors) == 0)
  # drop tracts with no contiguous neighbor; guard against an empty index,
  # since bad_sf[-integer(0), ] would drop every row
  if (length(no_neighbors) > 0) bad_sf <- bad_sf[-no_neighbors, ]
  
  palette <- colorFactor(palette = colorRampPalette(c("green", "limegreen","orange", "red", "darkred"))(10), domain = combined_sf$health_cat)
  if (graph == 0){
  leaflet(bad_sf, options = leafletOptions(minZoom = 6)) %>%
      addTiles() %>%
      addPolygons(
        fillColor = ~palette(health_cat),
        fillOpacity = 0.7,
        color = "white",
        weight = 1,
        popup = ~paste("health category:", health_cat)
      ) %>%
      addLegend(
        "bottomright", 
        pal = palette, 
        values = ~health_cat,
        title = "Clustering based on health factors",
        opacity = 1
      ) %>%
    addControl(
      html = "<h2>Low Health Clusters - Agglomerative Method</h2>", 
      position = "topright",
      className = "map-title"
    )
    }
  else {
    return (bad_sf)
  }
}

agg_v2(combined, combined_dist, 200, 'ward.D2', 0)
## [1] "Silhouette score: 0.499632362521662"
## [1] "Davis-Bouldin index: 0.847814278955593"
## [1] "Dunn index: 0.152630003717111"

There are many ways to tweak the clustering methodology, and there are clear trade-offs:
- the more spatial compactness is prioritized, the worse the actual fit of the clusters
- more clusters give a better fit, but at the expense of spatial compactness, and the result is messier overall

As we can see, this method finds a lot of clusters that the heuristic method misses. A small parameter sweep, sketched below, makes these trade-offs concrete.
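
A hedged sketch of such a sweep (it reuses dist_coords and dist_data from above and silhouette from the cluster package; the weights and cluster counts are arbitrary example values):

# try several spatial weights and cluster counts, reporting mean silhouette width
for (w in c(0.6, 0.7, 0.8, 0.9)) {
  d <- (w * dist_coords) + ((1 - w) * dist_data)
  hc <- hclust(d, method = 'ward.D2')
  for (k in c(100, 200, 300)) {
    cl <- cutree(hc, k = k)
    sil <- silhouette(cl, d)
    print(paste('weight:', w, 'k:', k, 'mean silhouette:', round(mean(sil[, 'sil_width']), 3)))
  }
}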

Now that we’ve identified these clusters, we want to see if their pedestrian fatality rates are higher than the general pedestrian fatality rate relative to population.
Our question is: is there a statistical difference in the number of pedestrian crashes among census tracts with a very high intensity of poor health?

To do this, we need to look at the distribution of the number of crashes per census tract:

combined_sf$total_crashes <- ifelse(is.na(combined_sf$total_crashes), 0, combined_sf$total_crashes)
hist(combined_sf$total_crashes, breaks = 7)

mean(combined_sf$total_crashes)
## [1] 0.2749837
var(combined_sf$total_crashes)
## [1] 0.4045152

Since the data is evidently zero-inflated, we need a zero-inflated model. Additionally, the variance is higher than the mean (overdispersion), so a negative binomial count component is more appropriate than a Poisson one.
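
As a quick check on the overdispersion point, intercept-only Poisson and negative binomial fits can be compared by AIC (a sketch; glm.nb comes from the MASS package, which is not loaded above):

pois_fit <- glm(total_crashes ~ 1, data = combined_sf, family = poisson)
nb_fit <- MASS::glm.nb(total_crashes ~ 1, data = combined_sf)
AIC(pois_fit, nb_fit)  # the lower AIC should belong to the negative binomial fit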

lowhealth <- agg_v2(combined, combined_dist, 200, 'ward.D2', 1)
## [1] "Silhouette score: 0.499632362521662"
## [1] "Davis-Bouldin index: 0.847814278955593"
## [1] "Dunn index: 0.152630003717111"
combined_sf <- combined_sf %>% mutate(lh_cluster = ifelse(GEOID %in% lowhealth$GEOID, 1, 0))

zinb_model <- zeroinfl(total_crashes ~ lh_cluster, data = combined_sf, dist = "negbin")
summary(zinb_model)
## 
## Call:
## zeroinfl(formula = total_crashes ~ lh_cluster, data = combined_sf, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4743 -0.4294 -0.4294 -0.4294  9.5831 
## 
## Count model coefficients (negbin with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.35708    0.03059 -44.365  < 2e-16 ***
## lh_cluster   0.35334    0.11766   3.003  0.00267 ** 
## Log(theta)  -0.42951    0.09256  -4.640 3.48e-06 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   -7.812     20.611  -0.379    0.705
## lh_cluster     4.438     20.595   0.215    0.829
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.6508 
## Number of iterations in BFGS optimization: 73 
## Log-likelihood: -5061 on 5 Df
combined_sf$anycrash <- ifelse(combined_sf$total_crashes > 0, 'Crashes', 'No Crashes')
combined_sf$lowhealth <- ifelse(combined_sf$lh_cluster == 1, 'Low Health', 'High Health')
table(combined_sf$lowhealth, combined_sf$anycrash)
##              
##               Crashes No Crashes
##   High Health    1225       5036
##   Low Health      340       1054
prop.table(table(combined_sf$lowhealth, combined_sf$anycrash)) * 100
##              
##                 Crashes No Crashes
##   High Health 16.002613  65.787067
##   Low Health   4.441541  13.768779
palette <- colorFactor(palette = colorRampPalette(c("green", "limegreen","orange", "red", "darkred"))(10), domain = combined_sf$health_cat)

w <- combined_sf %>% filter(anycrash == 'Crashes' & lowhealth == 'Low Health')
leaflet(w) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~palette(health_cat),
    fillOpacity = 0.7,
    color="black",
    weight = 1
  )
Next, we cluster the fatal crash points themselves on latitude/longitude with DBSCAN:

# drop rows with placeholder coordinates (FARS codes unknown coordinates with out-of-range values)
FARS_wide <- FARS_wide %>% filter(LONGITUD < 200)
crashes_clustered <- dbscan(FARS_wide[, c("LATITUDE", "LONGITUD")], eps = 0.01, minPts = 3)
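
Note that eps is in degrees here (0.01° of latitude is roughly 1.1 km). A common way to sanity-check eps is the k-nearest-neighbor distance ("elbow") plot from the dbscan package; a sketch using the same points:

kNNdistplot(FARS_wide[, c("LATITUDE", "LONGITUD")], k = 3)
abline(h = 0.01, lty = 2)  # candidate eps near the elbow of the curve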

FARS_wide$cluster <- crashes_clustered$cluster
clusres <- FARS_wide %>% filter(cluster > 0)

pal <- colorFactor(rainbow(length(unique(clusres$cluster))), domain = clusres$cluster)
  
leaflet(clusres) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~LONGITUD,
    lat = ~LATITUDE,
    color = ~pal(cluster),
    radius = 5,
    stroke = FALSE,
    fillOpacity = 1,
    label = ~cluster
  ) %>%
  addLegend(
    position = 'bottomright',
    pal = pal,
    values = ~cluster,
    title = 'cluster'
  )
cc <- merge(combined_sf, clusres, by.x = 'GEOID', by.y = 'CENSUS_TRACT', all.y = TRUE)
cc <- st_as_sf(cc)
cc_polygons <- cc %>%
  filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON"))

leaflet(cc_polygons) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(cluster),
    fillOpacity = 0.7,
    color="black",
    weight = 1,
    label = ~paste('cluster:', cluster)
  )
table(combined_sf$lh_cluster)
## 
##    0    1 
## 6261 1394
sum(combined_sf$total_crashes[combined_sf$lh_cluster == 1])
## [1] 494
sum(combined_sf$total_crashes[combined_sf$lh_cluster == 0])
## [1] 1611

Interpretation of results: there are two parts, the count model and the zero-inflation model.
The count model is a negative binomial regression with a log link, which predicts total_crashes based on the predictor of belonging to a low health cluster.
The intercept shows that the expected number of crashes outside of low health clusters is exp(-1.35708) ≈ 0.257.
The coefficient for lh_cluster indicates that being in a low health cluster is associated with a 42.4% increase in the expected number of crashes (exp(0.35334) ≈ 1.424). The p-value is 0.00267, which indicates this is statistically significant.
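
A hedged sketch of pulling that incidence-rate ratio with a Wald 95% confidence interval from the fitted object (coefficient names follow pscl's count_/zero_ prefixes):

est <- unname(coef(zinb_model)["count_lh_cluster"])
se <- unname(sqrt(diag(vcov(zinb_model)))["count_lh_cluster"])
exp(c(IRR = est, lower = est - 1.96 * se, upper = est + 1.96 * se))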

The zero-inflation model is a binomial logistic regression, predicting the probability of an excess (structural) zero in total_crashes from the same low health cluster predictor.
The p-values for both the intercept and lh_cluster are too high to draw any statistically significant conclusions.

We need to check that this higher pedestrian death rate in low-health areas is not the result of another variable, such as higher exposure (more walking). To do this, we divide the census tracts into high, medium, and low walking-commute groups.
We then run a chi-square test within each group, with the cluster designation as the grouping variable and the (categorized) number of pedestrian crashes per thousand population as the outcome. We can estimate walking exposure with ACS data pulled through the Census API.

# get walking data (ACS table B08134). The data is structured as counts of people who
# walked to work within a range of commute times, so we pull every bracket at once and
# later approximate the average walking time from the bracket midpoints
walk_vars <- c(
  Total_walkers = "B08134_101",
  under10       = "B08134_102",
  ten_14        = "B08134_103",
  fifteen_19    = "B08134_104",
  twenty_24     = "B08134_105",
  twenty5_29    = "B08134_106",
  thirty_34     = "B08134_107",
  thirty5_44    = "B08134_108",
  fourty5_59    = "B08134_109",
  over60        = "B08134_110"
)

walk_data <- get_acs(
  geography = "tract",
  variables = walk_vars,
  year = 2020,
  state = c("New Jersey", "New York", "Connecticut")
) %>%
  dplyr::select(GEOID, variable, estimate) %>%   # dplyr:: because raster masks select()
  pivot_wider(names_from = variable, values_from = estimate)

# approximation of average walking time by taking the midpoint of each range
walk_data$AvgWalktime <- ((walk_data$under10 * 5) + (walk_data$ten_14 * 12) + (walk_data$fifteen_19 * 17) + (walk_data$twenty_24 * 22) + (walk_data$twenty5_29 * 27) + (walk_data$thirty_34 * 32) + (walk_data$thirty5_44 * 40) + (walk_data$fourty5_59 * 53) + (walk_data$over60 * 65)) / walk_data$Total_walkers

combined_sf <- merge(combined_sf, walk_data, by = 'GEOID', all.x = TRUE)

combined_sf$ped_crashes_category <- ifelse(combined_sf$crash_per1k == 0, 0, NA)

non_zero_values <- combined_sf$crash_per1k[combined_sf$crash_per1k > 0]
quantiles <- quantile(non_zero_values, probs = seq(0, 1, 0.2), na.rm = TRUE)

combined_sf$ped_crashes_category[combined_sf$crash_per1k > 0] <- cut(
  combined_sf$crash_per1k[combined_sf$crash_per1k > 0],
  breaks = quantiles,
  include.lowest = TRUE
)

# break average walking time into categories for chi sq test
combined_sf <- combined_sf %>%
  mutate(walk_cat = cut(AvgWalktime, breaks = quantile(AvgWalktime, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE), include.lowest = TRUE, labels = c(1, 2, 3)))

# split different walking length categories, run chi sq on each one
w1 <- combined_sf %>% filter(walk_cat == 1)
w2 <- combined_sf %>% filter(walk_cat == 2)
w3 <- combined_sf %>% filter(walk_cat == 3)

chisq.test(table(w1$lh_cluster, w1$ped_crashes_category))
## 
##  Pearson's Chi-squared test
## 
## data:  table(w1$lh_cluster, w1$ped_crashes_category)
## X-squared = 23.064, df = 5, p-value = 0.0003281
chisq.test(table(w2$lh_cluster, w2$ped_crashes_category))
## 
##  Pearson's Chi-squared test
## 
## data:  table(w2$lh_cluster, w2$ped_crashes_category)
## X-squared = 25.408, df = 5, p-value = 0.0001162
chisq.test(table(w3$lh_cluster, w3$ped_crashes_category))
## 
##  Pearson's Chi-squared test
## 
## data:  table(w3$lh_cluster, w3$ped_crashes_category)
## X-squared = 7.5222, df = 5, p-value = 0.1846

The low p-values for the low and medium walking-commute groups indicate a significant association between low health cluster membership and the number of pedestrian crashes per 1k in those groups. This suggests that, at least for those commute lengths, the distribution of pedestrian crashes is not independent of low health cluster membership even after conditioning on the amount of walking that would expose pedestrians to a crash.

The p-value in the high walking-commute group is not low enough to be statistically significant, which implies that health cluster membership does not have a significant effect on the likelihood of pedestrian crashes there. This may be because individuals with particularly long walking commutes tend to live outside the low-health clusters identified by our algorithm.
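
To see which cells drive the significant results, we can inspect the standardized residuals of the test; a sketch for the low walking-commute group (cells with |residual| > 2 deviate notably from independence):

res <- chisq.test(table(w1$lh_cluster, w1$ped_crashes_category))
round(res$stdres, 2)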

Mean crashes per 1k population for each walking category, average walk times overall and within the low health clusters, and a ZINB model of total crashes on (rounded) average walk time:

mean(w1$crash_per1k)
## [1] 0.07657081
mean(w2$crash_per1k)
## [1] 0.09331295
mean(w3$crash_per1k)
## [1] 0.09013194
mean(combined_sf$AvgWalktime, na.rm = TRUE)
## [1] 14.48157
mean(combined_sf$AvgWalktime[combined_sf$lh_cluster == 1], na.rm = TRUE)
## [1] 16.89633
combined_sf$AvgWalktime_r <- round(combined_sf$AvgWalktime)
zinb_model <- zeroinfl(total_crashes ~ AvgWalktime_r, data = combined_sf, dist = "negbin")
summary(zinb_model) 
## 
## Call:
## zeroinfl(formula = total_crashes ~ AvgWalktime_r, data = combined_sf, 
##     dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4937 -0.4841 -0.4721 -0.4482 10.8385 
## 
## Count model coefficients (negbin with log link):
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.1498171  0.1040463 -11.051   <2e-16 ***
## AvgWalktime_r  0.0006829  0.0041089   0.166    0.868    
## Log(theta)    -0.0792634  0.1389694  -0.570    0.568    
## 
## Zero-inflation model coefficients (binomial with logit link):
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -0.9181     1.0473  -0.877    0.381
## AvgWalktime_r  -0.2050     0.2103  -0.975    0.330
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.9238 
## Number of iterations in BFGS optimization: 33 
## Log-likelihood: -4189 on 5 Df

We can see that the pedestrian crash rate is actually slightly lower in the areas with the longest average walking commutes, and that the average walking time in low health clusters is only slightly higher than overall. The zero-inflated negative binomial model shows no significant association between average walk time and the number of crashes.

One variable that is heavily correlated with health is income:

income_code <- "B19019_001"
income_data <- get_acs(
  geography = "tract",
  variables <- income_code,
  year = 2020,
  state = c("Connecticut", "New York", "New Jersey")
)
income_data <- income_data %>%
  rename(AvgIncome = estimate)

combined_sf <- merge(combined_sf, income_data, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, income_data, by = 'GEOID', all.x = TRUE)

ggplot(data = combined_sf, aes(x = AvgIncome, y = health_intensity)) +
  geom_point() +
  geom_smooth(method = 'lm', color = 'red') +
  theme_minimal()

cleaned <- combined_sf %>% filter(!is.na(AvgIncome) & !is.na(health_intensity))
cor(cleaned$AvgIncome, cleaned$health_intensity)
## [1] -0.7358396

As we can see, there is a very strong negative correlation (≈ -0.74): tracts with higher incomes tend to have lower health intensity, i.e., better health.

And the average income in the low health clusters identified by our algorithm, vs. the overall average income:

mean(lowhealth$AvgIncome, na.rm = TRUE)
## [1] 46350.8
mean(combined_sf$AvgIncome, na.rm = TRUE)
## [1] 82813.55
cor(combined_sf$lh_cluster, combined_sf$AvgIncome, use = 'complete.obs')
## [1] -0.4245338

Evidently, these low health clusters have much lower average incomes, and being in a low health cluster is correlated with lower incomes. This lines up with what we would expect based on previous work tying health to income.
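
A formal two-sample comparison of incomes inside vs. outside the clusters could look like this (a sketch; t.test drops missing values and treats the 0/1 cluster flag as the grouping variable):

t.test(AvgIncome ~ lh_cluster, data = combined_sf)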

Many studies also look at correlations between pedestrian fatalities and race. Let’s examine the racial characteristics of these low health clusters and whether they align with previous research showing associations between certain races and pedestrian fatalities. We can get racial demographic data from the ACS.

# total population plus racial/ethnic counts, pulled in one ACS request
race_vars <- c(
  Population      = "B01003_001",
  white           = "B02001_002",
  black           = "B02001_003",
  hispanic        = "B03001_003",
  asian           = "B02001_005",
  native_american = "B02001_004"
)

race_data <- get_acs(
  geography = "tract",
  variables = race_vars,
  year = 2020,
  state = c("Connecticut", "New York", "New Jersey")
) %>%
  dplyr::select(GEOID, variable, estimate) %>%
  pivot_wider(names_from = variable, values_from = estimate)

combined_sf <- merge(combined_sf, race_data, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, race_data, by = 'GEOID', all.x = TRUE)

combined_sf$pct_white <- combined_sf$white / combined_sf$Population * 100
combined_sf$pct_black <- combined_sf$black / combined_sf$Population * 100
combined_sf$pct_hispanic <- combined_sf$hispanic / combined_sf$Population * 100
combined_sf$pct_asian <- combined_sf$asian / combined_sf$Population * 100
combined_sf$pct_nai <- combined_sf$native_american / combined_sf$Population * 100

lowhealth$pct_white <- lowhealth$white / lowhealth$Population * 100
lowhealth$pct_black <- lowhealth$black / lowhealth$Population * 100
lowhealth$pct_hispanic <- lowhealth$hispanic / lowhealth$Population * 100
lowhealth$pct_asian <- lowhealth$asian / lowhealth$Population * 100
lowhealth$pct_nai <- lowhealth$native_american / lowhealth$Population * 100

print(paste("Pct White:", mean(combined_sf$pct_white, na.rm = TRUE)))
## [1] "Pct White: 63.1981774879091"
print(paste("Pct Black:", mean(combined_sf$pct_black, na.rm = TRUE)))
## [1] "Pct Black: 15.8611376151714"
print(paste("Pct Hispanic:", mean(combined_sf$pct_hispanic, na.rm = TRUE)))
## [1] "Pct Hispanic: 18.7670760990548"
print(paste("Pct Asian:", mean(combined_sf$pct_asian, na.rm = TRUE)))
## [1] "Pct Asian: 8.2045496817234"
print(paste("Pct Native American:", mean(combined_sf$pct_nai, na.rm = TRUE)))
## [1] "Pct Native American: 0.389235294364415"
print(paste("Pct White:", mean(lowhealth$pct_white, na.rm = TRUE)))
## [1] "Pct White: 26.6241976816869"
print(paste("Pct Black:", mean(lowhealth$pct_black, na.rm = TRUE)))
## [1] "Pct Black: 44.7663379484941"
print(paste("Pct Hispanic:", mean(lowhealth$pct_hispanic, na.rm = TRUE)))
## [1] "Pct Hispanic: 33.0485437162817"
print(paste("Pct Asian:", mean(lowhealth$pct_asian, na.rm = TRUE)))
## [1] "Pct Asian: 6.22139770797221"
print(paste("Pct Native American:", mean(lowhealth$pct_nai, na.rm = TRUE)))
## [1] "Pct Native American: 0.666675398381168"

As you would expect, the share of the population in the clusters we identified that is African American, Hispanic, or Native American is significantly higher than average, while the Asian share is slightly lower and the White share is significantly lower.

We can also calculate the overall percentages from total population (population-weighted, and therefore more accurate than averaging tract-level percentages):

print(paste("Overall White Percentage:", sum(combined_sf$white, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall White Percentage: 63.8845730506845"
print(paste("Overall Black Percentage:", sum(combined_sf$black, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Black Percentage: 14.8059997252873"
print(paste("Overall Hispanic Percentage:", sum(combined_sf$hispanic, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Hispanic Percentage: 19.4770935396731"
print(paste("Overall Asian Percentage:", sum(combined_sf$asian, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Asian Percentage: 8.29764571219267"
print(paste("Overall Native American Percentage:", sum(combined_sf$native_american, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Native American Percentage: 0.333050684492468"
print(paste("Low Health Cluster White Percentage:", sum(lowhealth$white, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster White Percentage: 26.765636557467"
print(paste("Low Health Cluster Black Percentage:", sum(lowhealth$black, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Black Percentage: 42.6034542321656"
print(paste("Low Health Cluster Hispanic Percentage:", sum(lowhealth$hispanic, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Hispanic Percentage: 36.2149686219737"
print(paste("Low Health Cluster Asian Percentage:", sum(lowhealth$asian, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Asian Percentage: 6.42525756262359"
print(paste("Low Health Cluster Native American Percentage:", sum(lowhealth$native_american, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Native American Percentage: 0.56414745235012"

ZINB models for racial demographics and crashes:

combined_sf$pct_white_r <- round(combined_sf$pct_white)
summary(zeroinfl(total_crashes ~ pct_white_r, data = combined_sf, dist = 'negbin'))
## 
## Call:
## zeroinfl(formula = total_crashes ~ pct_white_r, data = combined_sf, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5164 -0.5025 -0.4549 -0.3427  9.5783 
## 
## Count model coefficients (negbin with log link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.999781   0.063293 -15.796   <2e-16 ***
## pct_white_r -0.001217   0.001258  -0.968    0.333    
## Log(theta)  -0.031904   0.106268  -0.300    0.764    
## 
## Zero-inflation model coefficients (binomial with logit link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -12.11527    2.92173  -4.147 3.37e-05 ***
## pct_white_r   0.12217    0.03018   4.048 5.16e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.9686 
## Number of iterations in BFGS optimization: 33 
## Log-likelihood: -4845 on 5 Df
combined_sf$pct_black_r <- round(combined_sf$pct_black)
summary(zeroinfl(total_crashes ~ pct_black_r, data = combined_sf, dist = 'negbin'))
## 
## Call:
## zeroinfl(formula = total_crashes ~ pct_black_r, data = combined_sf, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5248 -0.4984 -0.4377 -0.3958  9.6593 
## 
## Count model coefficients (negbin with log link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.099137   0.057766 -19.028   <2e-16 ***
## pct_black_r  0.001450   0.001389   1.044    0.297    
## Log(theta)  -0.034579   0.111522  -0.310    0.757    
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.7640     0.2336  -3.271  0.00107 **
## pct_black_r  -0.3375     0.1449  -2.330  0.01981 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.966 
## Number of iterations in BFGS optimization: 30 
## Log-likelihood: -4857 on 5 Df
combined_sf$pct_hispanic_r <- round(combined_sf$pct_hispanic)
summary(zeroinfl(total_crashes ~ pct_hispanic_r, data = combined_sf, dist = 'negbin'))
## 
## Call:
## zeroinfl(formula = total_crashes ~ pct_hispanic_r, data = combined_sf, 
##     dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5869 -0.4945 -0.4445 -0.3637  9.0815 
## 
## Count model coefficients (negbin with log link):
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.205711   0.069014 -17.471  < 2e-16 ***
## pct_hispanic_r  0.005954   0.001661   3.584 0.000338 ***
## Log(theta)     -0.017888   0.111473  -0.160 0.872510    
## 
## Zero-inflation model coefficients (binomial with logit link):
##                Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -0.28396    0.26325  -1.079  0.28074   
## pct_hispanic_r -0.22050    0.07957  -2.771  0.00559 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.9823 
## Number of iterations in BFGS optimization: 31 
## Log-likelihood: -4832 on 5 Df
combined_sf$pct_asian_r <- round(combined_sf$pct_asian)
summary(zeroinfl(total_crashes ~ pct_asian_r, data = combined_sf, dist = 'negbin'))
## 
## Call:
## zeroinfl(formula = total_crashes ~ pct_asian_r, data = combined_sf, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4911 -0.4782 -0.4582 -0.4425 10.3161 
## 
## Count model coefficients (negbin with log link):
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.038108   0.102763 -10.102   <2e-16 ***
## pct_asian_r -0.004440   0.003168  -1.401    0.161    
## Log(theta)  -0.038241   0.186448  -0.205    0.837    
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -1.3639     0.4703  -2.900  0.00373 **
## pct_asian_r  -0.1779     0.1532  -1.162  0.24543   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.9625 
## Number of iterations in BFGS optimization: 31 
## Log-likelihood: -4876 on 5 Df
combined_sf$pct_nai_r <- round(combined_sf$pct_nai)
summary(zeroinfl(total_crashes ~ pct_nai_r, data = combined_sf, dist = 'negbin'))
## 
## Call:
## zeroinfl(formula = total_crashes ~ pct_nai_r, data = combined_sf, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5031 -0.4648 -0.4648 -0.4648 10.3376 
## 
## Count model coefficients (negbin with log link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.20008    0.02727 -44.000  < 2e-16 ***
## pct_nai_r    0.02129    0.02827   0.753  0.45139    
## Log(theta)  -0.26944    0.08350  -3.227  0.00125 ** 
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.4416    23.2327  -0.449    0.653
## pct_nai_r     0.5159     1.4017   0.368    0.713
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.7638 
## Number of iterations in BFGS optimization: 48 
## Log-likelihood: -4878 on 5 Df

These results are interesting:
- For the white percentage of a census tract's population, there is no significant effect in the count model, but a significant effect in the zero-inflation model, where higher percentages of white residents are associated with a higher probability of a structural zero, likely due to overrepresentation of white residents in rural areas.
- For the black percentage, again no significant effect in the count model, and the opposite effect in the zero-inflation model: higher percentages of black residents are associated with lower probabilities of a structural zero, likely due to overrepresentation of black residents in urban areas.
- For the hispanic percentage, there is a significant effect in both components, with higher hispanic percentages associated with both higher crash counts and lower probabilities of a structural zero.
- For the asian and native american percentages, no significant associations are found.

However, it is important to note that these are unadjusted single-predictor models; without clustering based on race, no strong conclusions can be drawn from these results.
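
For scale, the significant hispanic-percentage count coefficient translates to roughly a 6% increase in expected crashes for a 10-point increase in hispanic share:

exp(0.005954 * 10)  # ≈ 1.06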

Another variable that is often associated with pedestrian fatalities in many studies is density:

combined_sf$density_r <- round(combined_sf$density)
summary(zeroinfl(total_crashes ~ density_r, data = combined_sf, dist = "negbin"))
## 
## Call:
## zeroinfl(formula = total_crashes ~ density_r, data = combined_sf, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4738 -0.4675 -0.4379 -0.3538 10.5746 
## 
## Count model coefficients (negbin with log link):
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.1251408  0.0381208 -29.515  < 2e-16 ***
## density_r   -0.0006642  0.0001662  -3.997 6.42e-05 ***
## Log(theta)  -0.2607655  0.0921737  -2.829  0.00467 ** 
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.39902    0.20156  -1.980  0.04774 * 
## density_r   -0.18550    0.06091  -3.045  0.00232 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 0.7705 
## Number of iterations in BFGS optimization: 40 
## Log-likelihood: -5047 on 5 Df

The results from this model are significant and actually show that as density increases, both the number of crashes and the probability of excess zeros decrease. This is probably because very low density rural areas have very few people and pedestrians, and as a result no pedestrian fatalities, while higher density areas in general will have at least some crashes, but as they get denser they are likely to have fewer.
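
A hedged sketch of visualizing that fitted relationship (it refits the same model under an assumed name, then predicts expected crashes over a grid of densities):

dens_model <- zeroinfl(total_crashes ~ density_r, data = combined_sf, dist = "negbin")
grid <- data.frame(density_r = seq(0, max(combined_sf$density_r, na.rm = TRUE), by = 1))
grid$expected <- predict(dens_model, newdata = grid, type = "response")
plot(grid$density_r, grid$expected, type = "l",
     xlab = "population density (per acre)", ylab = "expected crashes per tract")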

This is contrary to what is found in many other studies, and may be due to different built environment dynamics in the tri-state area, where overall density is higher and dense areas may be more walkable and safer for pedestrians.

One possible hypothesis for why pedestrian fatality rates are higher in lower health areas is longer EMS response times, as emergency services may lack funding in poorer areas.

We create two variables: (1) diffEMSarrive, the difference between the EMS scene-arrival time and the crash time, and (2) diffEMShospital, the difference between the hospital-arrival time and the EMS scene-arrival time.

# clean for rows that have data for crash time, ems arrival, and hospital arrival
FARS_ems <- FARS_wide %>% filter(HOUR < 24 & ARR_HOUR < 24 & HOSP_HR < 24 & HOSPITAL == 5)

# convert clock times to minutes since midnight
FARS_ems$crashtime <- (FARS_ems$HOUR * 60) + FARS_ems$MINUTE
FARS_ems$EMStime <- (FARS_ems$ARR_HOUR * 60) + FARS_ems$ARR_MIN
FARS_ems$hospitaltime <- (FARS_ems$HOSP_HR * 60) + FARS_ems$HOSP_MN

# crash-to-scene interval, wrapping intervals that straddle midnight
FARS_ems$diffEMSarrive <- FARS_ems$EMStime - FARS_ems$crashtime
FARS_ems$diffEMSarrive <- ifelse(FARS_ems$diffEMSarrive < 0, FARS_ems$diffEMSarrive + 1440, FARS_ems$diffEMSarrive)

# scene-to-hospital interval, with the same midnight wraparound
FARS_ems$diffEMShospital <- FARS_ems$hospitaltime - FARS_ems$EMStime
FARS_ems$diffEMShospital <- ifelse(FARS_ems$diffEMShospital < 0, FARS_ems$diffEMShospital + 1440, FARS_ems$diffEMShospital)

FARS_ems <- merge(FARS_ems, combined_sf, by.x = "CENSUS_TRACT", by.y = "GEOID", all.x = TRUE)

h1 <- FARS_ems %>% filter(health_cat == 1)
h2 <- FARS_ems %>% filter(health_cat == 2)
h3 <- FARS_ems %>% filter(health_cat == 3)
h4 <- FARS_ems %>% filter(health_cat == 4)
h5 <- FARS_ems %>% filter(health_cat == 5)

ggplot(data = FARS_ems, aes(x = health_intensity, y = diffEMSarrive)) +
  geom_point() +
  ylim(0, 200) +
  geom_smooth(method = 'lm', color = 'red')

ggplot(data = FARS_ems, aes(x = health_intensity, y = diffEMShospital)) +
  geom_point() +
  ylim(0, 200) +
  geom_smooth(method = 'lm', color = 'red')

ggplot(data = FARS_ems, aes(x = AvgIncome, y = diffEMSarrive)) +
  geom_point() +
  ylim(0, 200) +
  geom_smooth(method = 'lm', color = 'red')

ggplot(data = FARS_ems, aes(x = AvgIncome, y = diffEMShospital)) +
  geom_point() +
  ylim(0, 200) +
  geom_smooth(method = 'lm', color = 'red')

FARS_ems <- FARS_ems %>% filter(diffEMSarrive < 120)

hist(FARS_ems$diffEMSarrive, breaks = 40, xlab = 'EMS Arrival Time Difference (minutes)', main = 'Distribution of EMS arrival time to crash')

hist(FARS_ems$diffEMShospital, breaks = 40, xlab = 'EMS Arrival Time Difference to Hospital (minutes)', main = 'Distribution of EMS arrival time to hospital')

#FARS_ems <- FARS_ems %>% mutate(lh_cluster = ifelse(CENSUS_TRACT %in% lowhealth$GEOID, 1, 0))

#mean(FARS_ems$diffEMSarrive[FARS_ems$lh_cluster == 0])
#mean(FARS_ems$diffEMSarrive[FARS_ems$lh_cluster == 1])
#mean(FARS_ems$diffEMShospital[FARS_ems$lh_cluster == 0])
#mean(FARS_ems$diffEMShospital[FARS_ems$lh_cluster == 1])

As we can see, the crash-to-scene and scene-to-hospital times are approximately normally distributed, and show no correlation with the overall health of an area nor with income. In fact, the average crash-to-scene and scene-to-hospital times are actually slightly lower in the low health clusters.
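
A quick formal check of those (absent) associations; a sketch, noting that cor.test drops incomplete pairs automatically:

cor.test(FARS_ems$health_intensity, FARS_ems$diffEMSarrive)
cor.test(FARS_ems$AvgIncome, FARS_ems$diffEMSarrive)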

(NEW) added 10/9/24 Agglomerative clustering of census tracts based on total crashes per tract:

combined_sf <- combined_sf %>% filter(!is.na(ACCESS2_CrudePrev) & !is.na(GHLTH_CrudePrev) & !is.na(CASTHMA_CrudePrev) & !is.na(BPHIGH_CrudePrev) & !is.na(HIGHCHOL_CrudePrev) & !is.na(DIABETES_CrudePrev))

nyc_metro <- combined_sf #%>% filter(CSA_Name_max == 'New York-Newark, NY-NJ-CT-PA')

coords <- st_coordinates(st_centroid(nyc_metro))
scaled_coords <- scale(coords)
scaled_data <- as.data.frame(nyc_metro$total_crashes)  # note: raw crash counts, not scaled, despite the name

combined <- as.data.frame(cbind((coords), nyc_metro$total_crashes))

dist_coords <- dist(scaled_coords)
dist_data <- dist(scaled_data)
  
# Weighting factor for spatial distance
spatial_weight <- 0.85  # Adjust this factor to increase or decrease the emphasis on spatial proximity
  
# Combine distance matrices with emphasis on spatial nearness
combined_dist <- (spatial_weight * dist_coords) + ((1 - spatial_weight) * dist_data)

agg_v3 <- function(combined, combined_dist, i, method, graph) {
  # Calculate distance matrices
  
  hc <- hclust(combined_dist, method)
  clusters <- cutree(hc, k = i)
  
  # Assign clusters back to the data
  combined$cluster <- as.integer(as.factor(clusters))
  nyc_metro$cluster <- combined$cluster

  sil_scores <- silhouette(nyc_metro$cluster, dist(as.matrix(combined)))
  mean_sil_score <- mean(as.data.frame(sil_scores)$sil_width)
  print(paste("Silhouette score:", mean_sil_score))
    
  # Calculate Davies-Bouldin index
  db_index <- index.DB(as.matrix(combined), nyc_metro$cluster)$DB
  print(paste("Davies-Bouldin index:", db_index))
  
  # Calculate Dunn index
  dunn_index <- dunn(dist(as.matrix(combined)), nyc_metro$cluster)
  print(paste("Dunn index:", dunn_index))
  
  clus_means <- nyc_metro %>% group_by(cluster) %>% summarise(mean_crash = mean(total_crashes), total_tracts = n())
  
  clus_means <- clus_means %>% filter(mean_crash > 0.25) %>% pull(cluster)
  
  bad_sf <- nyc_metro %>% filter(cluster %in% clus_means)
  neighbors <- poly2nb(as_Spatial(bad_sf))
  no_neighbors <- which(card(neighbors) == 0)
  # guard against an empty index, which would otherwise drop every row
  if (length(no_neighbors) > 0) bad_sf <- bad_sf[-no_neighbors, ]
  
  #palette <- colorFactor(palette = rainbow(length(unique(nyc_metro$cluster))), domain = nyc_metro$cluster)
  
  palette <- colorFactor(palette = colorRampPalette(c("yellow", "red"))(10), domain = combined_sf$total_crashes)

  if (graph == 0){
  leaflet(bad_sf, options = leafletOptions(minZoom = 6)) %>%
      addTiles() %>%
      addPolygons(
        fillColor = ~palette(total_crashes),
        fillOpacity = 0.7,
        color = "white",
        weight = 1,
        popup = ~paste("cluster:", cluster)
      ) %>%
      addLegend(
        "bottomright", 
        pal = palette, 
        values = ~total_crashes,
        title = "Clustering based on Crashes per Tract",
        opacity = 1
      ) %>%
    addControl(
      html = "<h2>Crash Clusters - Agglomerative Method</h2>", 
      position = "topright",
      className = "map-title"
    )
    }
  else {
    return (bad_sf)
  }
}

agg_v3(combined, combined_dist, 100, 'ward.D2', 0)
## [1] "Silhouette score: 0.851287180380529"
## [1] "Davis-Bouldin index: 0.446776471513434"
## [1] "Dunn index: 0.247161464573612"

(NEW) added 10/9/24 Mapping of all tracts with more than 1 crash:

cc <- combined_sf %>% filter(total_crashes > 1)

palette <- colorFactor(palette = colorRampPalette(c("yellow", "red"))(10), domain = combined_sf$total_crashes)

leaflet(data = cc, options = leafletOptions(minZoom = 6)) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~palette(total_crashes),
    fillOpacity = 0.7,
    color = "black",
    weight = 1,
    opacity = 1,
    highlightOptions = highlightOptions(
      weight = 3,
      color = "white",
      bringToFront = TRUE
    ),
    label = ~paste("total crashes:", total_crashes)
  ) %>%
  addLegend(
    pal = palette,
    values = ~total_crashes,
    title = "total crashes",
    position = "bottomright"
  ) %>%
  addControl(
    html = "<h2>Census tracts with more than 1 crash</h2>", 
    position = "topright",
    className = "map-title"
  )