Import libraries:
library(dplyr)
library(sf)
library(sp)
library(tidyr)
library(spdep)
library(stringr)
library(dbscan)
library(ggplot2)
library(h3jsr)
library(GGally)
library(leaflet)
library(cluster)
library(clusterSim)
library(clValid)
library(gstat)
library(raster)
library(viridis)
library(viridisLite)
library(tidycensus)
library(pscl)
Load datasets:
FARS_wide <- read.csv('FARS_NJ-NY-CT-18-22.csv')
SDOH_df <- read.csv('SDOH_NJ-NY-CT.csv')
combined_sf <- read.csv('combined_health_crash.csv')
sf_nj <- st_read('cb_2018_34_tract_500k/cb_2018_34_tract_500k.shp')
## Reading layer `cb_2018_34_tract_500k' from data source
## `/Users/erandoni/mechkar/cb_2018_34_tract_500k/cb_2018_34_tract_500k.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 2006 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.55961 ymin: 38.92852 xmax: -73.89398 ymax: 41.35742
## Geodetic CRS: NAD83
sf_ny <- st_read('cb_2018_36_tract_500k/cb_2018_36_tract_500k.shp')
## Reading layer `cb_2018_36_tract_500k' from data source
## `/Users/erandoni/mechkar/cb_2018_36_tract_500k/cb_2018_36_tract_500k.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 4906 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -79.76215 ymin: 40.4961 xmax: -71.85621 ymax: 45.01585
## Geodetic CRS: NAD83
sf_ct <- st_read('cb_2018_09_tract_500k/cb_2018_09_tract_500k.shp')
## Reading layer `cb_2018_09_tract_500k' from data source
## `/Users/erandoni/mechkar/cb_2018_09_tract_500k/cb_2018_09_tract_500k.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 830 features and 9 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -73.72777 ymin: 40.98014 xmax: -71.78699 ymax: 42.05059
## Geodetic CRS: NAD83
tristate_sf <- rbind(sf_nj, sf_ny, sf_ct)
combined_sf$GEOID[combined_sf$STATEFP == 9] <- paste0("0", combined_sf$GEOID[combined_sf$STATEFP == 9])  # restore the leading zero dropped from CT GEOIDs
combined_sf <- merge(tristate_sf, combined_sf, by = 'GEOID', all.x = TRUE)
We want to find spatial clusters based on the following
variables:
- Fair or poor self-rated health status among adults (GHLTH_CrudePrev)
- Current asthma prevalence among adults (CASTHMA_CrudePrev)
- High blood pressure among adults (BPHIGH_CrudePrev)
- High cholesterol (HIGHCHOL_CrudePrev)
- Diagnosed diabetes among adults (DIABETES_CrudePrev)
- Current lack of health insurance among adults (ACCESS2_CrudePrev)
First let’s get a general look at these variables by county:
# group data into counties, plot on map
counties <- combined_sf %>%
group_by(CountyCode) %>%
summarize(
GHLTH = mean(GHLTH_CrudePrev, na.rm = TRUE),
ASTHMA = mean(CASTHMA_CrudePrev, na.rm = TRUE),
BPHIGH = mean(BPHIGH_CrudePrev, na.rm = TRUE),
HIGHCHOL = mean(HIGHCHOL_CrudePrev, na.rm = TRUE),
DIABETES = mean(DIABETES_CrudePrev, na.rm = TRUE),
INSURANCE = mean(ACCESS2_CrudePrev, na.rm = TRUE),
geometry = (st_union(geometry))
)
ggplot(data = counties) +
geom_sf(aes(fill = GHLTH)) +
scale_fill_viridis_c(option = "viridis", name = "LOW GHLTH") +
theme_minimal() +
theme(legend.position = 'right') +
labs(title = "Prevalence of low general health by county")
ggplot(data = counties) +
geom_sf(aes(fill = ASTHMA)) +
scale_fill_viridis_c(option = "viridis", name = "ASTHMA") +
theme_minimal() +
theme(legend.position = 'right') +
labs(title = "Prevalence of asthma by county")
ggplot(data = counties) +
geom_sf(aes(fill = BPHIGH)) +
scale_fill_viridis_c(option = "viridis", name = "BPHIGH") +
theme_minimal() +
theme(legend.position = 'right') +
labs(title = "Prevalence of high blood pressure by county")
ggplot(data = counties) +
geom_sf(aes(fill = HIGHCHOL)) +
scale_fill_viridis_c(option = "viridis", name = "HIGHCHOL") +
theme_minimal() +
theme(legend.position = 'right') +
labs(title = "Prevalence of high cholestrol by county")
ggplot(data = counties) +
geom_sf(aes(fill = DIABETES)) +
scale_fill_viridis_c(option = "viridis", name = "DIABETES") +
theme_minimal() +
theme(legend.position = 'right') +
labs(title = "Prevalence of diabetes by county")
ggplot(data = counties) +
geom_sf(aes(fill = INSURANCE)) +
scale_fill_viridis_c(option = "viridis", name = "INSURANCE") +
theme_minimal() +
theme(legend.position = 'right') +
labs(title = "Prevalence of lack of insurance by county")
The dataset contains a variable called health_intensity that is calculated from the six variables above as follows: for each tract and each variable, we compute the percentile of that tract's value relative to the whole dataset. For example, the median of BPHIGH_CrudePrev is 29.8, so a census tract whose BPHIGH_CrudePrev is 29.8 sits at the 50th percentile. The six percentiles are then summed, divided by six, and assigned as the health_intensity variable. This variable is important because it gives a single gauge of the overall health of a census tract based on those six measures.
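As a sanity check, the construction described above can be reproduced approximately with percent_rank. This is only a sketch; the exact percentile definition used to build the dataset may differ slightly (for example, percent_rank versus an empirical CDF).
# Sketch: recompute health_intensity from the six prevalence variables as described above
health_vars <- c("GHLTH_CrudePrev", "CASTHMA_CrudePrev", "BPHIGH_CrudePrev",
                 "HIGHCHOL_CrudePrev", "DIABETES_CrudePrev", "ACCESS2_CrudePrev")
health_check <- combined_sf %>%
  st_drop_geometry() %>%
  mutate(across(all_of(health_vars), ~ percent_rank(.x) * 100, .names = "pct_{.col}")) %>%
  transmute(GEOID, health_intensity_check = rowMeans(across(starts_with("pct_"))))
head(health_check)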
Density plot of the health_intensity variable:
ggplot(data = combined_sf, aes(x = health_intensity)) +
geom_density(fill = "blue", alpha = 0.5) +
stat_function(fun = dnorm, args = list(mean = mean(combined_sf$health_intensity, na.rm = TRUE), sd = sd(combined_sf$health_intensity, na.rm = TRUE)), color = "red") +
labs(title = "Density Plot of health intensity",
x = "health intensity",
y = "Density") +
theme_minimal()
Health intensity by census tract:
palette <- colorNumeric(
palette = colorRampPalette(c("green", "red"))(100),
domain = combined_sf$health_intensity
)
leaflet(data = combined_sf, options = leafletOptions(minZoom = 6)) %>%
addTiles() %>%
addPolygons(
fillColor = ~palette(health_intensity),
fillOpacity = 0.7,
color = "black",
weight = 1,
opacity = 1,
highlightOptions = highlightOptions(
weight = 3,
color = "white",
bringToFront = TRUE
),
label = ~paste("Health Intensity:", health_intensity)
) %>%
addLegend(
pal = palette,
values = ~health_intensity,
title = "Health Intensity",
position = "bottomright"
) %>%
addControl(
html = "<h2>Health Intensity by Census Tract</h2>",
position = "topright",
className = "map-title"
)
Next, we split health intensity into quintiles and categorize our census tracts into 5 health categories to make the visualization easier to interpret:
# split up data into 5 quantiles of health intensity
health_quantiles <- quantile(combined_sf$health_intensity, probs = seq(0, 1, by = 0.2), na.rm=TRUE)
combined_sf$health_cat <- cut(combined_sf$health_intensity, breaks = health_quantiles, include.lowest = TRUE, labels=FALSE)#c('extremely high', 'high', 'moderate', 'low', 'extremely low'))
palette <- colorFactor(palette = colorRampPalette(c("green", "limegreen","orange", "red", "darkred"))(10), domain = combined_sf$health_cat)
leaflet(data = combined_sf, options = leafletOptions(minZoom = 6)) %>%
addTiles() %>%
addPolygons(
fillColor = ~palette(health_cat),
fillOpacity = 0.7,
color = "black",
weight = 1,
opacity = 1,
highlightOptions = highlightOptions(
weight = 3,
color = "white",
bringToFront = TRUE
),
label = ~paste("Health Category:", health_cat)
) %>%
addLegend(
pal = palette,
values = ~health_cat,
title = "Health Category",
position = "bottomright"
) %>%
addControl(
html = "<h2>Health Category by Census Tract</h2>",
position = "topright",
className = "map-title"
)
We want to find clusters of areas with low health. We can manually separate all contiguous areas where census tracts are in the lowest health category, and filter for a minimum population and density (this is done to eliminate extremely rural areas with few people and few or no pedestrians). This method yields decent results, but it also misses part of the picture due to its simplicity.
# calculate population density per acre. The area variable (Ac_Total_mean) may not be fully accurate (the total across tracts seems too low), but it still appears to track actual tract area reasonably well
combined_sf$density <- combined_sf$TotalPopulation / (combined_sf$Ac_Total_mean)
# split off worst health quantile, filter for clusters with over 50k pop and density over 20
lowhealth <- combined_sf %>% filter(health_cat == 5)
adj_list <- poly2nb(as_Spatial(lowhealth))
clusters <- n.comp.nb(adj_list)
lowhealth$cluster <- clusters$comp.id
cluster_populations <- lowhealth %>%
group_by(cluster) %>%
summarize(total_population = sum(TotalPopulation),
avg_density = mean(density))
large_clusters <- cluster_populations %>%
filter(total_population >= 50000 & avg_density >= 20) %>%
pull(cluster)
lowhealth <- lowhealth %>%
filter(cluster %in% large_clusters)
leaflet(data = lowhealth, options = leafletOptions(minZoom = 6)) %>%
addTiles() %>%
addPolygons(fillColor = ~colorFactor(viridis(10), domain = lowhealth$cluster)(cluster),
fillOpacity = 0.7,
weight = 1,
color = 'black',
label = ~paste("Cluster:", cluster),
labelOptions = labelOptions(direction = "auto")) %>%
addControl(
html = "<h2>Low Health Clusters - Heuristic Method</h2>",
position = "topright",
className = "map-title"
)
As we can see, there are 19 distinct clusters that come up: 7 in NJ, 9 in NY, and 3 in CT.
- NJ: Paterson; Newark + Irvington and the Oranges; South JC + Bayonne; Elizabeth; Trenton; Camden + Pennsauken; Atlantic City area
- NYC: The Bronx + Harlem; Washington Heights; Flushing; Jamaica; Chinatown/Two Bridges; Sunset/Borough Park; East NY + Brownsville + Crown Heights
- Upstate NY: Rochester; Buffalo
- CT: Hartford; Waterbury; Bridgeport
Now, we will use agglomerative hierarchical clustering to find clusters of low health. This is the most successful clustering method for this data that we have found so far.
combined_sf <- combined_sf %>% filter(!is.na(ACCESS2_CrudePrev) & !is.na(GHLTH_CrudePrev) & !is.na(CASTHMA_CrudePrev) & !is.na(BPHIGH_CrudePrev) & !is.na(HIGHCHOL_CrudePrev) & !is.na(DIABETES_CrudePrev))
nyc_metro <- combined_sf #%>% filter(CSA_Name_max == 'New York-Newark, NY-NJ-CT-PA')
coords <- st_coordinates(st_centroid(nyc_metro))
scaled_coords <- scale(coords)
scaled_data <- as.data.frame(cbind(scale(nyc_metro$GHLTH_CrudePrev), scale(nyc_metro$CASTHMA_CrudePrev), scale(nyc_metro$BPHIGH_CrudePrev), scale(nyc_metro$HIGHCHOL_CrudePrev), scale(nyc_metro$DIABETES_CrudePrev), scale(nyc_metro$ACCESS2_CrudePrev)))
combined <- as.data.frame(cbind((coords), scale(nyc_metro$GHLTH_CrudePrev), scale(nyc_metro$CASTHMA_CrudePrev), scale(nyc_metro$BPHIGH_CrudePrev), scale(nyc_metro$HIGHCHOL_CrudePrev), scale(nyc_metro$DIABETES_CrudePrev), scale(nyc_metro$ACCESS2_CrudePrev)))
colnames(combined) <- c("x", "y", "GHLTH_CrudePrev", "CASTHMA_CrudePrev", "BPHIGH_CrudePrev", "HIGHCHOL_CrudePrev", "DIABETES_CrudePrev", "ACCESS2_CrudePrev")
dist_coords <- dist(scaled_coords)
dist_data <- dist(scaled_data)
# Weighting factor for spatial distance
spatial_weight <- 0.8 # Adjust this factor to increase or decrease the emphasis on spatial proximity
# Combine distance matrices with emphasis on spatial nearness
combined_dist <- (spatial_weight * dist_coords) + ((1 - spatial_weight) * dist_data)
agg_v2 <- function(combined, combined_dist, i, method, graph) {
# Hierarchical clustering on the pre-computed combined distance matrix
hc <- hclust(combined_dist, method)
clusters <- cutree(hc, k = i)
# Assign clusters back to the data
combined$cluster <- as.integer(as.factor(clusters))
nyc_metro$cluster <- combined$cluster
sil_scores <- silhouette(nyc_metro$cluster, dist(as.matrix(combined)))
mean_sil_score <- mean(as.data.frame(sil_scores)$sil_width)
print(paste("Silhouette score:", mean_sil_score))
# Calculate Davies-Bouldin index
db_index <- index.DB(as.matrix(combined), nyc_metro$cluster)$DB
print(paste("Davies-Bouldin index:", db_index))
# Calculate Dunn index
dunn_index <- dunn(dist(as.matrix(combined)), nyc_metro$cluster)
print(paste("Dunn index:", dunn_index))
palette <- colorFactor(palette = rainbow(length(unique(nyc_metro$cluster))), domain = nyc_metro$cluster)
clus_means <- nyc_metro %>% group_by(cluster) %>% summarise(mean_health = mean(health_intensity), total_population = sum(TotalPopulation), avg_density = mean(density), total_tracts = n())
clus_means <- clus_means %>% filter(mean_health >= 65 & avg_density >= 20) %>% pull(cluster)
bad_sf <- nyc_metro %>% filter(cluster %in% clus_means)
neighbors <- poly2nb(as_Spatial(bad_sf))
no_neighbors <- which(card(neighbors) == 0)
if (length(no_neighbors) > 0) bad_sf <- bad_sf[-no_neighbors, ]  # guard: indexing with -integer(0) would drop every row
palette <- colorFactor(palette = colorRampPalette(c("green", "limegreen","orange", "red", "darkred"))(10), domain = combined_sf$health_cat)
if (graph == 0){
leaflet(bad_sf, options = leafletOptions(minZoom = 6)) %>%
addTiles() %>%
addPolygons(
fillColor = ~palette(health_cat),
fillOpacity = 0.7,
color = "white",
weight = 1,
popup = ~paste("health category:", health_cat)
) %>%
addLegend(
"bottomright",
pal = palette,
values = ~health_cat,
title = "Clustering based on health factors",
opacity = 1
) %>%
addControl(
html = "<h2>Low Health Clusters - Agglomerative Method</h2>",
position = "topright",
className = "map-title"
)
}
else {
return (bad_sf)
}
}
agg_v2(combined, combined_dist, 200, 'ward.D2', 0)
## [1] "Silhouette score: 0.499632362521662"
## [1] "Davis-Bouldin index: 0.847814278955593"
## [1] "Dunn index: 0.152630003717111"
There are many ways to tweak the clustering methodology, and there are some clear trade-offs:
- The more spatial compactness is prioritized, the worse the actual fit of the clusters.
- More clusters give a better fit, but at the expense of spatial compactness, and the result is messier overall.
As we can see, this method finds a lot of clusters that the heuristic
method misses.
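To make this trade-off concrete, one can sweep the spatial weight and compare the validity indices; a minimal sketch (agg_v2 prints the indices for each weighting, and graph = 1 skips the map):
# Sketch: sweep the spatial weight to see how compactness vs. fit trades off
for (w in c(0.5, 0.65, 0.8, 0.95)) {
  cat("spatial_weight =", w, "\n")
  d <- (w * dist_coords) + ((1 - w) * dist_data)
  invisible(agg_v2(combined, d, 200, 'ward.D2', 1))
}
Very low weights may produce spatially fragmented clusters, so the filtering inside agg_v2 can behave differently across runs.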
Now that we’ve identified these clusters, we want to see if their
pedestrian fatality rates are higher than the general pedestrian
fatality rates relative to population.
Our question is: is there a statistically significant difference in the number of pedestrian crashes for census tracts with a very high intensity of poor health?
To do this, we need to look at the distribution of the number of
crashes per census tract:
combined_sf$total_crashes <- ifelse(is.na(combined_sf$total_crashes), 0, combined_sf$total_crashes)
hist(combined_sf$total_crashes, breaks = 7)
mean(combined_sf$total_crashes)
## [1] 0.2749837
var(combined_sf$total_crashes)
## [1] 0.4045152
Since the data is evidently zero-inflated, we need a zero-inflated count model; and since the variance is higher than the mean, a negative binomial specification is more appropriate than a Poisson.
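A quick sketch of the checks behind this choice: the Pearson dispersion statistic of a plain Poisson fit should sit well above 1, and the observed share of zero-crash tracts should exceed what the Poisson fit implies.
# Sketch: overdispersion and excess-zero checks on an intercept-only Poisson fit
pois_fit <- glm(total_crashes ~ 1, data = combined_sf, family = poisson)
# Pearson dispersion statistic; values well above 1 mean the Poisson mean = variance assumption fails
sum(residuals(pois_fit, type = "pearson")^2) / df.residual(pois_fit)
# Observed share of zero-crash tracts vs. the share implied by the Poisson fit
mean(combined_sf$total_crashes == 0)
mean(dpois(0, fitted(pois_fit)))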
lowhealth <- agg_v2(combined, combined_dist, 200, 'ward.D2', 1)
## [1] "Silhouette score: 0.499632362521662"
## [1] "Davis-Bouldin index: 0.847814278955593"
## [1] "Dunn index: 0.152630003717111"
combined_sf <- combined_sf %>% mutate(lh_cluster = ifelse(GEOID %in% lowhealth$GEOID, 1, 0))
zinb_model <- zeroinfl(total_crashes ~ lh_cluster, data = combined_sf, dist = "negbin")
summary(zinb_model)
##
## Call:
## zeroinfl(formula = total_crashes ~ lh_cluster, data = combined_sf, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.4743 -0.4294 -0.4294 -0.4294 9.5831
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.35708 0.03059 -44.365 < 2e-16 ***
## lh_cluster 0.35334 0.11766 3.003 0.00267 **
## Log(theta) -0.42951 0.09256 -4.640 3.48e-06 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.812 20.611 -0.379 0.705
## lh_cluster 4.438 20.595 0.215 0.829
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.6508
## Number of iterations in BFGS optimization: 73
## Log-likelihood: -5061 on 5 Df
combined_sf$total_crashes <- ifelse(is.na(combined_sf$total_crashes), 0, combined_sf$total_crashes)
combined_sf$anycrash <- ifelse(combined_sf$total_crashes > 0, 'Crashes', 'No Crashes')
combined_sf$lowhealth <- ifelse(combined_sf$lh_cluster == 1, 'Low Health', 'High Health')
table(combined_sf$lowhealth, combined_sf$anycrash)
##
## Crashes No Crashes
## High Health 1225 5036
## Low Health 340 1054
prop.table(table(combined_sf$lowhealth, combined_sf$anycrash)) * 100
##
## Crashes No Crashes
## High Health 16.002613 65.787067
## Low Health 4.441541 13.768779
palette <- colorFactor(palette = colorRampPalette(c("green", "limegreen","orange", "red", "darkred"))(10), domain = combined_sf$health_cat)
w <- combined_sf %>% filter(anycrash == 'Crashes' & lowhealth == 'Low Health')
leaflet(w) %>%
addTiles() %>%
addPolygons(
fillColor = ~palette(health_cat),
fillOpacity = 0.7,
color="black",
weight = 1
)
FARS_wide <- FARS_wide %>% filter(LONGITUD < 200)
crashes_clustered <- dbscan(FARS_wide[, c("LATITUDE", "LONGITUD")], eps = 0.01, minPts = 3)
FARS_wide$cluster <- crashes_clustered$cluster
clusres <- FARS_wide %>% filter(cluster > 0)
pal <- colorFactor(rainbow(length(unique(clusres$cluster))), domain = clusres$cluster)
leaflet(clusres) %>%
addTiles() %>%
addCircleMarkers(
lng = ~LONGITUD,
lat = ~LATITUDE,
color = ~pal(cluster),
radius = 5,
stroke = FALSE,
fillOpacity = 1,
label = ~cluster
) %>%
addLegend(
position = 'bottomright',
pal = pal,
values = ~cluster,
title = 'Cluster'
)
cc <- merge(combined_sf, clusres, by.x = 'GEOID', by.y = 'CENSUS_TRACT', all.y = TRUE)
cc <- st_as_sf(cc)
cc_polygons <- cc %>%
filter(st_geometry_type(geometry) %in% c("POLYGON", "MULTIPOLYGON"))
leaflet(cc_polygons) %>%
addTiles() %>%
addPolygons(
fillColor = ~pal(cluster),
fillOpacity = 0.7,
color="black",
weight = 1,
label = ~paste('cluster:', cluster)
)
table(combined_sf$lh_cluster)
##
## 0 1
## 6261 1394
sum(combined_sf$total_crashes[combined_sf$lh_cluster == 1])
## [1] 494
sum(combined_sf$total_crashes[combined_sf$lh_cluster == 0])
## [1] 1611
Interpretation of results: there are two parts, the count model and the zero-inflation model.
The count model is a negative binomial regression with a log link, which predicts total_crashes from the predictor of belonging to a low-health cluster.
The intercept shows that the expected number of crashes outside of low-health clusters is exp(-1.35708) ≈ 0.257.
The coefficient for lh_cluster indicates that being in a low-health cluster is associated with a 42.4% increase in the expected number of crashes (exp(0.35334) ≈ 1.424). The p-value is 0.00267, which indicates this is statistically significant.
The zero-inflation model is a binomial logistic regression predicting the probability of excess zeros in total_crashes from the same low-health-cluster indicator.
The p-values for both the intercept and lh_cluster are too high to draw any statistically significant conclusions.
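The count-model effect can also be reported as a rate ratio with an approximate Wald 95% confidence interval; a short sketch:
# Sketch: rate ratios for the count model and a Wald 95% CI for the lh_cluster effect
exp(coef(zinb_model, model = "count"))
cm <- summary(zinb_model)$coefficients$count
exp(cm["lh_cluster", "Estimate"] + c(-1.96, 1.96) * cm["lh_cluster", "Std. Error"])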
We need to check that this higher pedestrian death rate in low-health areas is not the result of another variable, such as higher exposure (more walking). To do this, we divide the census tracts into high, medium, and low walking-commute groups.
We then run a chi-square test of independence within each group, with the cluster designation as the grouping variable and the binned number of pedestrian crashes per thousand population as the outcome. We can estimate walking exposure using ACS data retrieved through the Census API.
# get walking data. The ACS reports the number of people who walked to work within each travel-time range, so we pull each range separately and use range midpoints to approximate the average walking time
walk_code <- "B08134_101"
walk_data <- get_acs(
geography = "tract",
variables = walk_code,
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
walk_data <- walk_data %>%
rename(Total_walkers = estimate)
under10 <- get_acs(
geography = "tract",
variables <- "B08134_102",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
ten_14 <- get_acs(
geography = "tract",
variables <- "B08134_103",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
fifteen_19 <- get_acs(
geography = "tract",
variables <- "B08134_104",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
twenty_24 <- get_acs(
geography = "tract",
variables <- "B08134_105",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
twenty5_29 <- get_acs(
geography = "tract",
variables <- "B08134_106",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
thirty_34 <- get_acs(
geography = "tract",
variables <- "B08134_107",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
thirty5_44 <- get_acs(
geography = "tract",
variables <- "B08134_108",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
fourty5_59 <- get_acs(
geography = "tract",
variables <- "B08134_109",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
over60 <- get_acs(
geography = "tract",
variables <- "B08134_110",
year = 2020,
state = c("New Jersey", "New York", "Connecticut")
)
under10 <- under10 %>%
rename(under10 = estimate)
ten_14 <- ten_14 %>%
rename(ten_14 = estimate)
fifteen_19 <- fifteen_19 %>%
rename(fifteen_19 = estimate)
twenty_24 <- twenty_24 %>%
rename(twenty_24 = estimate)
twenty5_29 <- twenty5_29 %>%
rename(twenty5_29 = estimate)
thirty_34 <- thirty_34 %>%
rename(thirty_34 = estimate)
thirty5_44 <- thirty5_44 %>%
rename(thirty5_44 = estimate)
fourty5_59 <- fourty5_59 %>%
rename(fourty5_59 = estimate)
over60 <- over60 %>%
rename(over60 = estimate)
walk_data <- merge(walk_data, under10, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, ten_14, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, fifteen_19, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, twenty_24, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, twenty5_29, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, thirty_34, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, thirty5_44, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, fourty5_59, by = 'GEOID', all.x = TRUE)
walk_data <- merge(walk_data, over60, by = 'GEOID', all.x = TRUE)
# approximation of average walking time by taking midpoint of ranges
walk_data$AvgWalktime <- ((walk_data$under10 * 5) + (walk_data$ten_14 * 12) + (walk_data$fifteen_19 * 17) + (walk_data$twenty_24 * 22) + (walk_data$twenty5_29 * 27) + (walk_data$thirty_34 * 32) + (walk_data$thirty5_44 * 40) + (walk_data$fourty5_59 * 53) + (walk_data$over60 * 65)) / walk_data$Total_walkers
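For reference, the same pull can be done more compactly with a single get_acs call using a named vector of variables and output = "wide"; this is a hedged sketch assuming the same table codes (tidycensus suffixes estimate columns with E in wide output).
# Sketch: one call for all travel-time bins, then the same midpoint-weighted average
walk_vars <- c(Total_walkers = "B08134_101", under10 = "B08134_102", ten_14 = "B08134_103",
               fifteen_19 = "B08134_104", twenty_24 = "B08134_105", twenty5_29 = "B08134_106",
               thirty_34 = "B08134_107", thirty5_44 = "B08134_108", fourty5_59 = "B08134_109",
               over60 = "B08134_110")
walk_wide <- get_acs(geography = "tract", variables = walk_vars, year = 2020,
                     state = c("New Jersey", "New York", "Connecticut"), output = "wide")
midpoints <- c(5, 12, 17, 22, 27, 32, 40, 53, 65)
bin_cols <- paste0(names(walk_vars)[-1], "E")  # estimate columns for each bin
walk_wide$AvgWalktime <- as.numeric(as.matrix(walk_wide[bin_cols]) %*% midpoints) / walk_wide$Total_walkersE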
combined_sf <- merge(combined_sf, walk_data, by = 'GEOID', all.x = TRUE)
combined_sf$ped_crashes_category <- ifelse(combined_sf$crash_per1k == 0, 0, NA)
non_zero_values <- combined_sf$crash_per1k[combined_sf$crash_per1k > 0]
quantiles <- quantile(non_zero_values, probs = seq(0, 1, 0.2), na.rm = TRUE)
combined_sf$ped_crashes_category[combined_sf$crash_per1k > 0] <- cut(
combined_sf$crash_per1k[combined_sf$crash_per1k > 0],
breaks = quantiles,
include.lowest = TRUE
)
# break average walking time into categories for chi sq test
combined_sf <- combined_sf %>%
mutate(walk_cat = cut(AvgWalktime, breaks = quantile(AvgWalktime, probs = c(0, 1/3, 2/3, 1), na.rm = TRUE), include.lowest = TRUE, labels = c(1, 2, 3)))
# split different walking length categories, run chi sq on each one
w1 <- combined_sf %>% filter(walk_cat == 1)
w2 <- combined_sf %>% filter(walk_cat == 2)
w3 <- combined_sf %>% filter(walk_cat == 3)
chisq.test(table(w1$lh_cluster, w1$ped_crashes_category))
##
## Pearson's Chi-squared test
##
## data: table(w1$lh_cluster, w1$ped_crashes_category)
## X-squared = 23.064, df = 5, p-value = 0.0003281
chisq.test(table(w2$lh_cluster, w2$ped_crashes_category))
##
## Pearson's Chi-squared test
##
## data: table(w2$lh_cluster, w2$ped_crashes_category)
## X-squared = 25.408, df = 5, p-value = 0.0001162
chisq.test(table(w3$lh_cluster, w3$ped_crashes_category))
##
## Pearson's Chi-squared test
##
## data: table(w3$lh_cluster, w3$ped_crashes_category)
## X-squared = 7.5222, df = 5, p-value = 0.1846
The low p-values for the low and medium walking-commute groups indicate a significant association between low-health-cluster membership and the number of pedestrian crashes per 1k in those groups. This suggests that, for most levels of walking exposure that would put pedestrians at risk of a crash, the distribution of pedestrian crashes is not independent of low-health-cluster membership.
The p-value in the high walking-commute group is not low enough to be statistically significant, which implies that health-cluster membership does not have a significant effect on the likelihood of pedestrian crashes in that group. This may be because individuals with particularly long walking commutes tend to live outside the low-health clusters identified by our algorithm.
Means of crashes per 1k population for each walking category, average walk times overall and within the low-health clusters, and a ZINB model of crashes on walk time (the correlation between walk time and crashes per 1k is computed in the sketch after the model output):
mean(w1$crash_per1k)
## [1] 0.07657081
mean(w2$crash_per1k)
## [1] 0.09331295
mean(w3$crash_per1k)
## [1] 0.09013194
mean(combined_sf$AvgWalktime, na.rm = TRUE)
## [1] 14.48157
mean(combined_sf$AvgWalktime[combined_sf$lh_cluster == 1], na.rm = TRUE)
## [1] 16.89633
combined_sf$AvgWalktime_r <- round(combined_sf$AvgWalktime)
zinb_model <- zeroinfl(total_crashes ~ AvgWalktime_r, data = combined_sf, dist = "negbin")
summary(zinb_model)
##
## Call:
## zeroinfl(formula = total_crashes ~ AvgWalktime_r, data = combined_sf,
## dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.4937 -0.4841 -0.4721 -0.4482 10.8385
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1498171 0.1040463 -11.051 <2e-16 ***
## AvgWalktime_r 0.0006829 0.0041089 0.166 0.868
## Log(theta) -0.0792634 0.1389694 -0.570 0.568
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.9181 1.0473 -0.877 0.381
## AvgWalktime_r -0.2050 0.2103 -0.975 0.330
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.9238
## Number of iterations in BFGS optimization: 33
## Log-likelihood: -4189 on 5 Df
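The correlation between walk time and crashes per 1k referenced above can be computed directly; a one-line sketch:
cor(combined_sf$AvgWalktime, combined_sf$crash_per1k, use = "complete.obs")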
We can see that the pedestrian crash rate is actually slightly lower in the areas with the longest average walking commutes, and that the average walking time in low-health clusters is only slightly higher than the overall average. The zero-inflated negative binomial model shows no significant association between average walk time and the number of crashes.
One variable that is strongly correlated with health is income:
income_code <- "B19019_001"
income_data <- get_acs(
geography = "tract",
variables = income_code,
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
income_data <- income_data %>%
rename(AvgIncome = estimate)
combined_sf <- merge(combined_sf, income_data, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, income_data, by = 'GEOID', all.x = TRUE)
ggplot(data = combined_sf, aes(x = AvgIncome, y = health_intensity)) +
geom_point() +
geom_smooth(method = 'lm', color = 'red') +
theme_minimal()
cleaned <- combined_sf %>% filter(!is.na(AvgIncome) & !is.na(health_intensity))
cor(cleaned$AvgIncome, cleaned$health_intensity)
## [1] -0.7358396
As we can see, there is a strong negative correlation (roughly -0.74): tracts with higher incomes tend to have lower health intensity.
And the average income in the low-health clusters identified by our algorithm vs. the overall average income:
mean(lowhealth$AvgIncome, na.rm = TRUE)
## [1] 46350.8
mean(combined_sf$AvgIncome, na.rm = TRUE)
## [1] 82813.55
cor(combined_sf$lh_cluster, combined_sf$AvgIncome, use = 'complete.obs')
## [1] -0.4245338
Evidently, these low-health clusters have much lower average incomes, and being in a low-health cluster is correlated with lower income. This lines up with what we would expect based on previous work tying health to income.
Many studies also look at correlations between pedestrian fatalities and race. Let's examine the racial characteristics of these low-health clusters and whether they align with previous research showing associations between certain races and pedestrian fatalities. We can get racial demographic data from the ACS.
pop_data <- get_acs(
geography = "tract",
variables <- "B01003_001",
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
pop_data <- pop_data %>%
rename(Population = estimate)
combined_sf <- merge(combined_sf, pop_data, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, pop_data, by = 'GEOID', all.x = TRUE)
race_white <- get_acs(
geography = "tract",
variables <- "B02001_002",
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
race_white <- race_white %>%
rename(white = estimate)
combined_sf <- merge(combined_sf, race_white, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, race_white, by = 'GEOID', all.x = TRUE)
race_blk <- get_acs(
geography = "tract",
variables <- "B02001_003",
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
race_blk <- race_blk %>%
rename(black = estimate)
combined_sf <- merge(combined_sf, race_blk, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, race_blk, by = 'GEOID', all.x = TRUE)
race_hispanic <- get_acs(
geography = "tract",
variables <- "B03001_003",
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
race_hispanic <- race_hispanic %>%
rename(hispanic = estimate)
combined_sf <- merge(combined_sf, race_hispanic, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, race_hispanic, by = 'GEOID', all.x = TRUE)
race_asian <- get_acs(
geography = "tract",
variables <- "B02001_005",
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
race_asian <- race_asian %>%
rename(asian = estimate)
combined_sf <- merge(combined_sf, race_asian, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, race_asian, by = 'GEOID', all.x = TRUE)
race_nai <- get_acs(
geography = "tract",
variables <- "B02001_004",
year = 2020,
state = c("Connecticut", "New York", "New Jersey")
)
race_nai <- race_nai %>%
rename(native_american = estimate)
combined_sf <- merge(combined_sf, race_nai, by = 'GEOID', all.x = TRUE)
lowhealth <- merge(lowhealth, race_nai, by = 'GEOID', all.x = TRUE)
combined_sf$pct_white <- combined_sf$white / combined_sf$Population * 100
combined_sf$pct_black <- combined_sf$black / combined_sf$Population * 100
combined_sf$pct_hispanic <- combined_sf$hispanic / combined_sf$Population * 100
combined_sf$pct_asian <- combined_sf$asian / combined_sf$Population * 100
combined_sf$pct_nai <- combined_sf$native_american / combined_sf$Population * 100
lowhealth$pct_white <- lowhealth$white / lowhealth$Population * 100
lowhealth$pct_black <- lowhealth$black / lowhealth$Population * 100
lowhealth$pct_hispanic <- lowhealth$hispanic / lowhealth$Population * 100
lowhealth$pct_asian <- lowhealth$asian / lowhealth$Population * 100
lowhealth$pct_nai <- lowhealth$native_american / lowhealth$Population * 100
print(paste("Pct White:", mean(combined_sf$pct_white, na.rm = TRUE)))
## [1] "Pct White: 63.1981774879091"
print(paste("Pct Black:", mean(combined_sf$pct_black, na.rm = TRUE)))
## [1] "Pct Black: 15.8611376151714"
print(paste("Pct Hispanic:", mean(combined_sf$pct_hispanic, na.rm = TRUE)))
## [1] "Pct Hispanic: 18.7670760990548"
print(paste("Pct Asian:", mean(combined_sf$pct_asian, na.rm = TRUE)))
## [1] "Pct Asian: 8.2045496817234"
print(paste("Pct Native American:", mean(combined_sf$pct_nai, na.rm = TRUE)))
## [1] "Pct Native American: 0.389235294364415"
print(paste("Pct White:", mean(lowhealth$pct_white, na.rm = TRUE)))
## [1] "Pct White: 26.6241976816869"
print(paste("Pct Black:", mean(lowhealth$pct_black, na.rm = TRUE)))
## [1] "Pct Black: 44.7663379484941"
print(paste("Pct Hispanic:", mean(lowhealth$pct_hispanic, na.rm = TRUE)))
## [1] "Pct Hispanic: 33.0485437162817"
print(paste("Pct Asian:", mean(lowhealth$pct_asian, na.rm = TRUE)))
## [1] "Pct Asian: 6.22139770797221"
print(paste("Pct Native American:", mean(lowhealth$pct_nai, na.rm = TRUE)))
## [1] "Pct Native American: 0.666675398381168"
As you would expect, the percentage of the population in the clusters we identified which is African American, Hispanic, or Native American is significantly higher than average, while the percentage that is Asian is slightly lower and the percentage that is White is significantly lower.
We can also calculate the overall percentages from total population, which weights tracts by population and is therefore more accurate:
print(paste("Overall White Percentage:", sum(combined_sf$white, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall White Percentage: 63.8845730506845"
print(paste("Overall Black Percentage:", sum(combined_sf$black, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Black Percentage: 14.8059997252873"
print(paste("Overall Hispanic Percentage:", sum(combined_sf$hispanic, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Hispanic Percentage: 19.4770935396731"
print(paste("Overall Asian Percentage:", sum(combined_sf$asian, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Asian Percentage: 8.29764571219267"
print(paste("Overall Native American Percentage:", sum(combined_sf$native_american, na.rm = TRUE) / sum(combined_sf$Population, na.rm = TRUE) * 100))
## [1] "Overall Native American Percentage: 0.333050684492468"
print(paste("Low Health Cluster White Percentage:", sum(lowhealth$white, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster White Percentage: 26.765636557467"
print(paste("Low Health Cluster Black Percentage:", sum(lowhealth$black, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Black Percentage: 42.6034542321656"
print(paste("Low Health Cluster Hispanic Percentage:", sum(lowhealth$hispanic, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Hispanic Percentage: 36.2149686219737"
print(paste("Low Health Cluster Asian Percentage:", sum(lowhealth$asian, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Asian Percentage: 6.42525756262359"
print(paste("Low Health Cluster Native American Percentage:", sum(lowhealth$native_american, na.rm = TRUE) / sum(lowhealth$Population, na.rm = TRUE) * 100))
## [1] "Low Health Cluster Native American Percentage: 0.56414745235012"
ZINB models for racial demographics and crashes:
combined_sf$pct_white_r <- round(combined_sf$pct_white)
summary(zeroinfl(total_crashes ~ pct_white_r, data = combined_sf, dist = 'negbin'))
##
## Call:
## zeroinfl(formula = total_crashes ~ pct_white_r, data = combined_sf, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.5164 -0.5025 -0.4549 -0.3427 9.5783
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.999781 0.063293 -15.796 <2e-16 ***
## pct_white_r -0.001217 0.001258 -0.968 0.333
## Log(theta) -0.031904 0.106268 -0.300 0.764
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.11527 2.92173 -4.147 3.37e-05 ***
## pct_white_r 0.12217 0.03018 4.048 5.16e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.9686
## Number of iterations in BFGS optimization: 33
## Log-likelihood: -4845 on 5 Df
combined_sf$pct_black_r <- round(combined_sf$pct_black)
summary(zeroinfl(total_crashes ~ pct_black_r, data = combined_sf, dist = 'negbin'))
##
## Call:
## zeroinfl(formula = total_crashes ~ pct_black_r, data = combined_sf, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.5248 -0.4984 -0.4377 -0.3958 9.6593
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.099137 0.057766 -19.028 <2e-16 ***
## pct_black_r 0.001450 0.001389 1.044 0.297
## Log(theta) -0.034579 0.111522 -0.310 0.757
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.7640 0.2336 -3.271 0.00107 **
## pct_black_r -0.3375 0.1449 -2.330 0.01981 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.966
## Number of iterations in BFGS optimization: 30
## Log-likelihood: -4857 on 5 Df
combined_sf$pct_hispanic_r <- round(combined_sf$pct_hispanic)
summary(zeroinfl(total_crashes ~ pct_hispanic_r, data = combined_sf, dist = 'negbin'))
##
## Call:
## zeroinfl(formula = total_crashes ~ pct_hispanic_r, data = combined_sf,
## dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.5869 -0.4945 -0.4445 -0.3637 9.0815
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.205711 0.069014 -17.471 < 2e-16 ***
## pct_hispanic_r 0.005954 0.001661 3.584 0.000338 ***
## Log(theta) -0.017888 0.111473 -0.160 0.872510
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.28396 0.26325 -1.079 0.28074
## pct_hispanic_r -0.22050 0.07957 -2.771 0.00559 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.9823
## Number of iterations in BFGS optimization: 31
## Log-likelihood: -4832 on 5 Df
combined_sf$pct_asian_r <- round(combined_sf$pct_asian)
summary(zeroinfl(total_crashes ~ pct_asian_r, data = combined_sf, dist = 'negbin'))
##
## Call:
## zeroinfl(formula = total_crashes ~ pct_asian_r, data = combined_sf, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.4911 -0.4782 -0.4582 -0.4425 10.3161
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.038108 0.102763 -10.102 <2e-16 ***
## pct_asian_r -0.004440 0.003168 -1.401 0.161
## Log(theta) -0.038241 0.186448 -0.205 0.837
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.3639 0.4703 -2.900 0.00373 **
## pct_asian_r -0.1779 0.1532 -1.162 0.24543
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.9625
## Number of iterations in BFGS optimization: 31
## Log-likelihood: -4876 on 5 Df
combined_sf$pct_nai_r <- round(combined_sf$pct_nai)
summary(zeroinfl(total_crashes ~ pct_nai_r, data = combined_sf, dist = 'negbin'))
##
## Call:
## zeroinfl(formula = total_crashes ~ pct_nai_r, data = combined_sf, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.5031 -0.4648 -0.4648 -0.4648 10.3376
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.20008 0.02727 -44.000 < 2e-16 ***
## pct_nai_r 0.02129 0.02827 0.753 0.45139
## Log(theta) -0.26944 0.08350 -3.227 0.00125 **
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -10.4416 23.2327 -0.449 0.653
## pct_nai_r 0.5159 1.4017 0.368 0.713
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.7638
## Number of iterations in BFGS optimization: 48
## Log-likelihood: -4878 on 5 Df
These results are interesting:
- For the white percentage of a census tract's population, there is no significant effect in the count model, but a significant effect in the zero-inflation model, where higher white percentages are associated with a higher probability of a zero count, likely due to the overrepresentation of white residents in rural areas.
- For the Black percentage, there is again no significant effect in the count model, and the opposite effect in the zero-inflation model: higher Black percentages are associated with lower probabilities of a zero count, likely due to the overrepresentation of Black residents in urban areas.
- For the Hispanic percentage, there are significant effects in both the count and zero-inflation models, with higher Hispanic percentages associated with both higher crash counts and lower probabilities of zero crashes.
- For the Asian and Native American percentages, no significant associations are found.
However, it is important to note that these are single-predictor models; without a clustering based on race, no strong conclusions can be drawn from these results.
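As a first step toward disentangling these associations, one could fit a single model with several of the predictors examined above. This is only a sketch, not a substitute for a proper confounding analysis; the variable choice is illustrative and the zero part is restricted to an intercept to keep the fit stable.
# Sketch: joint ZINB with several predictors already constructed above
joint_zinb <- zeroinfl(total_crashes ~ lh_cluster + density + pct_black_r + pct_hispanic_r + AvgWalktime_r | 1,
                       data = combined_sf, dist = "negbin")
summary(joint_zinb)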
Another variable that is often associated with pedestrian fatalities in many studies is density:
combined_sf$density_r <- round(combined_sf$density)
summary(zeroinfl(total_crashes ~ density_r, data = combined_sf, dist = "negbin"))
##
## Call:
## zeroinfl(formula = total_crashes ~ density_r, data = combined_sf, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -0.4738 -0.4675 -0.4379 -0.3538 10.5746
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.1251408 0.0381208 -29.515 < 2e-16 ***
## density_r -0.0006642 0.0001662 -3.997 6.42e-05 ***
## Log(theta) -0.2607655 0.0921737 -2.829 0.00467 **
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.39902 0.20156 -1.980 0.04774 *
## density_r -0.18550 0.06091 -3.045 0.00232 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 0.7705
## Number of iterations in BFGS optimization: 40
## Log-likelihood: -5047 on 5 Df
The results from this model are significant and actually show that as density increases, both the number of crashes and the probability of excess zeros decrease. This is probably because very low-density rural areas have very few people and pedestrians, and therefore no pedestrian fatalities, while higher-density areas will in general have at least some crashes but tend to have fewer as they get denser.
This is contrary to what many other studies find, and may be due to different built-environment dynamics in the tri-state area, where overall density is higher and dense areas may be more walkable and safer for pedestrians.
One possible hypothesis for why pedestrian fatality rates are higher in lower-health areas is longer EMS response times, since emergency services may lack funding in poorer areas.
We create two variables: (1) diffEMSarrive, the difference between EMS arrival time at the scene and the crash time, and (2) diffEMShospital, the difference between arrival time at the hospital and EMS arrival time at the scene.
# clean for rows that have data for crash time, ems arrival, and hospital arrival
FARS_ems <- FARS_wide %>% filter(HOUR < 24 & ARR_HOUR < 24 & HOSP_HR < 24 & HOSPITAL == 5)
FARS_ems$crashtime <- (FARS_ems$HOUR * 60) + FARS_ems$MINUTE
FARS_ems$EMStime <- (FARS_ems$ARR_HOUR * 60) + FARS_ems$ARR_MIN
FARS_ems$hospitaltime <- (FARS_ems$HOSP_HR * 60) + FARS_ems$HOSP_MN
FARS_ems$diffEMSarrive <- FARS_ems$EMStime - FARS_ems$crashtime
FARS_ems$diffEMSarrive <- ifelse(FARS_ems$diffEMSarrive < 0, FARS_ems$diffEMSarrive + 1440, FARS_ems$diffEMSarrive)
FARS_ems$diffEMShospital <- FARS_ems$hospitaltime - FARS_ems$EMStime
FARS_ems$diffEMShospital <- ifelse(FARS_ems$diffEMShospital < 0, FARS_ems$diffEMShospital + 1440, FARS_ems$diffEMShospital)
FARS_ems <- merge(FARS_ems, combined_sf, by.x = "CENSUS_TRACT", by.y = "GEOID", all.x = TRUE)
h1 <- FARS_ems %>% filter(health_cat == 1)
h2 <- FARS_ems %>% filter(health_cat == 2)
h3 <- FARS_ems %>% filter(health_cat == 3)
h4 <- FARS_ems %>% filter(health_cat == 4)
h5 <- FARS_ems %>% filter(health_cat == 5)
ggplot(data = FARS_ems, aes(x = health_intensity, y = diffEMSarrive)) +
geom_point() +
ylim(0, 200) +
geom_smooth(method = 'lm', color = 'red')
ggplot(data = FARS_ems, aes(x = health_intensity, y = diffEMShospital)) +
geom_point() +
ylim(0, 200) +
geom_smooth(method = 'lm', color = 'red')
ggplot(data = FARS_ems, aes(x = AvgIncome, y = diffEMSarrive)) +
geom_point() +
ylim(0, 200) +
geom_smooth(method = 'lm', color = 'red')
ggplot(data = FARS_ems, aes(x = AvgIncome, y = diffEMShospital)) +
geom_point() +
ylim(0, 200) +
geom_smooth(method = 'lm', color = 'red')
FARS_ems <- FARS_ems %>% filter(diffEMSarrive < 120)
hist(FARS_ems$diffEMSarrive, breaks = 40, xlab = 'EMS Arrival Time Difference (minutes)', main = 'Distribution of EMS arrival time to crash')
hist(FARS_ems$diffEMShospital, breaks = 40, xlab = 'EMS Arrival Time Difference to Hospital (minutes)', main = 'Distribution of EMS arrival time to hospital')
#FARS_ems <- FARS_ems %>% mutate(lh_cluster = ifelse(CENSUS_TRACT %in% lowhealth$GEOID, 1, 0))
#mean(FARS_ems$diffEMSarrive[FARS_ems$lh_cluster == 0])
#mean(FARS_ems$diffEMSarrive[FARS_ems$lh_cluster == 1])
#mean(FARS_ems$diffEMShospital[FARS_ems$lh_cluster == 0])
#mean(FARS_ems$diffEMShospital[FARS_ems$lh_cluster == 1])
As we can see, the arrival times to the scene and to the hospital are roughly normally distributed, and show no correlation with the overall health of an area or with income. In fact, the average times from crash to EMS arrival and from EMS arrival to the hospital are slightly lower in the low-health clusters.
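A hedged sketch of a formal comparison, reusing the cluster flag from the commented-out lines above (a new column name avoids clashing with the lh_cluster column merged in from combined_sf):
# Sketch: two-sample t-tests of EMS timings inside vs. outside the low-health clusters
FARS_ems <- FARS_ems %>% mutate(lh_flag = ifelse(CENSUS_TRACT %in% lowhealth$GEOID, 1, 0))
t.test(diffEMSarrive ~ lh_flag, data = FARS_ems)
t.test(diffEMShospital ~ lh_flag, data = FARS_ems)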
(NEW) added 10/9/24 Agglomerative clustering of census tracts based on total crashes per tract:
combined_sf <- combined_sf %>% filter(!is.na(ACCESS2_CrudePrev) & !is.na(GHLTH_CrudePrev) & !is.na(CASTHMA_CrudePrev) & !is.na(BPHIGH_CrudePrev) & !is.na(HIGHCHOL_CrudePrev) & !is.na(DIABETES_CrudePrev))
nyc_metro <- combined_sf #%>% filter(CSA_Name_max == 'New York-Newark, NY-NJ-CT-PA')
coords <- st_coordinates(st_centroid(nyc_metro))
scaled_coords <- scale(coords)
scaled_data <- as.data.frame(nyc_metro$total_crashes)  # note: crash counts are used unscaled here, unlike the health variables earlier
combined <- as.data.frame(cbind((coords), nyc_metro$total_crashes))
dist_coords <- dist(scaled_coords)
dist_data <- dist(scaled_data)
# Weighting factor for spatial distance
spatial_weight <- 0.85 # Adjust this factor to increase or decrease the emphasis on spatial proximity
# Combine distance matrices with emphasis on spatial nearness
combined_dist <- (spatial_weight * dist_coords) + ((1 - spatial_weight) * dist_data)
agg_v3 <- function(combined, combined_dist, i, method, graph) {
# Hierarchical clustering on the pre-computed combined distance matrix
hc <- hclust(combined_dist, method)
clusters <- cutree(hc, k = i)
# Assign clusters back to the data
combined$cluster <- as.integer(as.factor(clusters))
nyc_metro$cluster <- combined$cluster
sil_scores <- silhouette(nyc_metro$cluster, dist(as.matrix(combined)))
mean_sil_score <- mean(as.data.frame(sil_scores)$sil_width)
print(paste("Silhouette score:", mean_sil_score))
# Calculate Davies-Bouldin index
db_index <- index.DB(as.matrix(combined), nyc_metro$cluster)$DB
print(paste("Davies-Bouldin index:", db_index))
# Calculate Dunn index
dunn_index <- dunn(dist(as.matrix(combined)), nyc_metro$cluster)
print(paste("Dunn index:", dunn_index))
clus_means <- nyc_metro %>% group_by(cluster) %>% summarise(mean_crash = mean(total_crashes), total_tracts = n())
clus_means <- clus_means %>% filter(mean_crash > 0.25) %>% pull(cluster)
bad_sf <- nyc_metro %>% filter(cluster %in% clus_means)
neighbors <- poly2nb(as_Spatial(bad_sf))
no_neighbors <- which(card(neighbors) == 0)
if (length(no_neighbors) > 0) bad_sf <- bad_sf[-no_neighbors, ]  # guard: indexing with -integer(0) would drop every row
#palette <- colorFactor(palette = rainbow(length(unique(nyc_metro$cluster))), domain = nyc_metro$cluster)
palette <- colorFactor(palette = colorRampPalette(c("yellow", "red"))(10), domain = combined_sf$total_crashes)
if (graph == 0){
leaflet(bad_sf, options = leafletOptions(minZoom = 6)) %>%
addTiles() %>%
addPolygons(
fillColor = ~palette(total_crashes),
fillOpacity = 0.7,
color = "white",
weight = 1,
popup = ~paste("cluster:", cluster)
) %>%
addLegend(
"bottomright",
pal = palette,
values = ~total_crashes,
title = "Clustering based on Crashes per Tract",
opacity = 1
) %>%
addControl(
html = "<h2>Crash Clusters - Agglomerative Method</h2>",
position = "topright",
className = "map-title"
)
}
else {
return (bad_sf)
}
}
agg_v3(combined, combined_dist, 100, 'ward.D2', 0)
## [1] "Silhouette score: 0.851287180380529"
## [1] "Davis-Bouldin index: 0.446776471513434"
## [1] "Dunn index: 0.247161464573612"
(NEW) added 10/9/24 Mapping of all tracts with more than 1 crash:
cc <- combined_sf %>% filter(total_crashes > 1)
palette <- colorFactor(palette = colorRampPalette(c("yellow", "red"))(10), domain = combined_sf$total_crashes)
leaflet(data = cc, options = leafletOptions(minZoom = 6)) %>%
addTiles() %>%
addPolygons(
fillColor = ~palette(total_crashes),
fillOpacity = 0.7,
color = "black",
weight = 1,
opacity = 1,
highlightOptions = highlightOptions(
weight = 3,
color = "white",
bringToFront = TRUE
),
label = ~paste("total crashes:", total_crashes)
) %>%
addLegend(
pal = palette,
values = ~total_crashes,
title = "total crashes",
position = "bottomright"
) %>%
addControl(
html = "<h2>Census tracts with more than 1 crash</h2>",
position = "topright",
className = "map-title"
)