library(spdep)
library(sf)
library(dplyr)
library(tidyverse)
library(viridis)
library(sfweight)
library(tmap)
library(tidycensus)
library(sf)
library(gridExtra)
q5 <- function(variable) {as.factor(ntile(variable, 5))}
palette5 <- c("#f0f9e8","#bae4bc","#7bccc4","#43a2ca","#0868ac")
mapTheme <- function(base_size = 12, title_size = 13) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.text.x = element_text(size = 14))
}
qBr <- function(df, variable, rnd) {
if (missing(rnd)) {
as.character(quantile(round(df[[variable]],0),
c(.01,.2,.4,.6,.8), na.rm=T))
} else if (rnd == FALSE | rnd == F) {
as.character(formatC(quantile(df[[variable]],
c(.01,.2,.4,.6,.8), na.rm=T),
digits = 3))
}
}
plotTheme <- function(base_size = 12, title_size = 13) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size, colour = "black"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=12),
axis.text = element_text(size=10),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"),
strip.text.x = element_text(size = 14)
)
}
This project aims to investigate health disparities in Dallas, Texas. In order to investigate the correlation between economic variables and health outcomes, we need to first look at different variables related to health. The variables I will explore are diabetes, asthma, coronary heart disease, and low life expectancy.
PLACES_shp<- read_sf(dsn = "C:/Users/halie/OneDrive/Desktop/Get PHIT/1.0-shapefile-codebook/usa")
dallas_places<- PLACES_shp[PLACES_shp$SF == "Texas" & PLACES_shp$CF == "Dallas County",] %>%
st_as_sf(.) %>%
rename(Diabetes = DF_PFS, Asthma = AF_PFS, HeartDisease = HDF_PFS, LLE = LLEF_PFS) %>%
mutate(Diabetes = Diabetes * 100,
Asthma = Asthma * 100,
HeartDisease = HeartDisease * 100,
LLE = LLE * 100) %>%
dplyr::select(GEOID10, SF, CF, Diabetes, Asthma, HeartDisease, LLE, TPF, UI_EXP, THRHLD, geometry)
diabetes <-
ggplot() +
geom_sf(data = dallas_places, aes(fill = q5(Diabetes))) +
scale_fill_manual(values=palette5,
labels = qBr(dallas_places, "Diabetes"),
name = "Diabetes Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Diagnosed Diabetes",
subtitle= "Among Adults 18 or Older") +
mapTheme() + theme(legend.key.size = unit(0.45, "cm"))
asthma <-
ggplot() +
geom_sf(data = dallas_places, aes(fill = q5(Asthma))) +
scale_fill_manual(values=palette5,
labels = qBr(dallas_places, "Asthma"),
name = "Asthma Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Current Asthma",
subtitle= "Among Adults 18 or Older") +
mapTheme() + theme(legend.key.size = unit(0.45, "cm"))
CHD <-
ggplot() +
geom_sf(data = dallas_places, aes(fill = q5(HeartDisease))) +
scale_fill_manual(values=palette5,
labels = qBr(dallas_places, "HeartDisease"),
name = "CHD Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Coronary Heart Disease",
subtitle= "Among Adults 18 or Older") +
mapTheme() + theme(legend.key.size = unit(0.45, "cm"))
LLE <-
ggplot() +
geom_sf(data = dallas_places, aes(fill = q5(LLE))) +
scale_fill_manual(values=palette5,
labels = qBr(dallas_places, "LLE"),
name = "LLE Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Low Life Expectancy",
subtitle = "Among Whole Population") +
mapTheme() + theme(legend.key.size = unit(0.45, "cm"))
grid.arrange(diabetes, asthma, CHD, LLE)
From the maps, we can see that some areas have higher percentages of these health outcomes than other areas. For instance, the southeast side of Dallas tends to have higher percentages, while the northern side has lower percentages. The places with the highest percentages of these health outcomes tends to be in the same place no matter which outcome is being investigated.
Moran’s I is a statistical test we can do to examine spatial clustering. The null hypothesis is that the variable is randomly distributed. I values can be between -1 and 1, where 1 represents positive spatial autocorrelation, 0 represents no autocorrelation, and -1 represents negative spatial autocorrelation. If the p-value is statistically significant, we reject the null hypothesis and conclude that there is clustering. There are two types that we will be investigating: Global Moran and Local Moran.
The figure below shows the frequency of 999 randomly permuted I values, with the observed I shown by the orange line for each of the four health variables. Because the observed I is higher than the randomly generated I values for each of the variables, we can conclude that there is spatial autocorrelation for each of these variables.
dallas_0 <- dallas_places %>%
mutate(Diabetes = replace_na(Diabetes, 0),
Asthma = replace_na(Asthma, 0),
HeartDisease = replace_na(HeartDisease, 0),
LLE = replace_na(LLE, 0))
dallas.nb<- poly2nb(dallas_0, queen=TRUE)
dallas.lw<- nb2listw(dallas.nb, zero.policy = TRUE)
dallas_diabetes_moran<- moran.mc(dallas_0$Diabetes, dallas.lw, nsim = 999)
dallas_asthma_moran<- moran.mc(dallas_0$Asthma, dallas.lw, nsim = 999)
dallas_CHD_moran<- moran.mc(dallas_0$HeartDisease, dallas.lw, nsim = 999)
dallas_LLE_moran<- moran.mc(dallas_0$LLE, dallas.lw, nsim = 999)
gmDiabetes <-
ggplot(as.data.frame(dallas_diabetes_moran$res[c(1:999)]), aes(dallas_diabetes_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = dallas_diabetes_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - Diabetes",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
gmAsthma <-
ggplot(as.data.frame(dallas_asthma_moran$res[c(1:999)]), aes(dallas_asthma_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = dallas_asthma_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - Asthma",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
gmCHD <-
ggplot(as.data.frame(dallas_CHD_moran$res[c(1:999)]), aes(dallas_CHD_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = dallas_CHD_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - CHD",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
gmLLE <-
ggplot(as.data.frame(dallas_LLE_moran$res[c(1:999)]), aes(dallas_LLE_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = dallas_LLE_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - LLE",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
grid.arrange(gmDiabetes, gmAsthma, gmCHD, gmLLE)
The other type of Moran test is Local Moran, or LISA. The null hypothesis for this test is that the variable at a given location is randomly dispersed relative to its immediate neighbors. There are four different categories mapped in the figures below. 1. HH: statistically significant cluster of high values 2. LL: statistically significant cluster of low values 3. HL: high value surrounded mostly by low values 4. LH: low value surrounded mostly by high values
From the figure below, we can see that there tends to be clusters of high values of these variables in the southern part of the city and clusters of low values in the northern part. It appears that the southern part of the city has higher incidents of diabetes, asthma, coronary heart disease, and low life expectancy.
diabetes_local_DF<- dallas_places %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(Diabetes %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(Diabetes %>% replace(.,is.na(.), 0), lag_DF))
asthma_local_DF<- dallas_places %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(Asthma %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(Asthma %>% replace(.,is.na(.), 0), lag_DF))
CHD_local_DF<- dallas_places %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(HeartDisease %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(HeartDisease %>% replace(.,is.na(.), 0), lag_DF))
LLE_local_DF<- dallas_places %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(LLE %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(LLE %>% replace(.,is.na(.), 0), lag_DF))
lmDiabetes <-
tm_shape(diabetes_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c(0.8, 0.3), main.title = "Local Moran's LISA - Diabetes",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
lmAsthma <-
tm_shape(asthma_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c(0.8, 0.3), main.title = "Local Moran's LISA - Asthma",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
lmCHD <-
tm_shape(CHD_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c(0.8, 0.3), main.title = "Local Moran's LISA - CHD",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
lmLLE <-
tm_shape(LLE_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c(0.8, 0.3), main.title = "Local Moran's LISA - LLE",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
tmap_arrange(lmDiabetes, lmAsthma, lmCHD, lmLLE)
Now that we have investigated certain health-related variables, we should look at economic variables such as income, rent, and poverty.
poverty <-
ggplot() +
geom_sf(data = tracts15, aes(fill = q5(pctPoverty))) +
scale_fill_manual(values = palette5,
labels = qBr(tracts15, "pctPoverty"),
name = "Poverty (%)\n(Quintile Breaks)") +
labs(title = "Percent Poverty") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 7))
income <-
ggplot() +
geom_sf(data = tracts15, aes(fill = q5(MedHHInc))) +
scale_fill_manual(values = palette5,
labels = qBr(tracts15, "MedHHInc"),
name = "Income\n(Quintile Breaks)") +
labs(title = "Median Household Income") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 7))
rent <-
ggplot() +
geom_sf(data = tracts15, aes(fill = q5(MedRent))) +
scale_fill_manual(values = palette5,
labels = qBr(tracts15, "MedRent"),
name = "Rent\n(Quintile Breaks)") +
labs(title = "Median Rent") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 7))
grid.arrange(rent, income, poverty)
From these maps, we can see that the places where there is high rent is similar to places where there is a high household income and places with a low percent poverty.
Now that we have data for the health-related variables and the economic variables, we will see whether there is any correlation between these.
everything <-
left_join(dallas_places, st_drop_geometry(tracts15), by = "GEOID10")
correlation.long <-
st_drop_geometry(everything) %>%
dplyr::select(Diabetes, Asthma, HeartDisease, LLE, pctPoverty) %>%
gather(Variable, Value, -pctPoverty)
correlation.cor <-
correlation.long %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, pctPoverty, use = "complete.obs"))
ggplot(correlation.long, aes(Value, pctPoverty)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Percent Poverty as a function of health factors") +
plotTheme()
There appears to be a positive correlation between percent poverty and each of the health factors. In other words, as percent poverty increases, so does asthma, diabetes, heart disease, and low life expectancy. This relationship is strongest between percent poverty and asthma (r = 0.69) and weakest between percent poverty and heart disease (r = 0.43).
correlation.long2 <-
st_drop_geometry(everything) %>%
dplyr::select(Diabetes, Asthma, HeartDisease, LLE, MedHHInc) %>%
gather(Variable, Value, -MedHHInc)
correlation.cor2 <-
correlation.long2 %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, MedHHInc, use = "complete.obs"))
ggplot(correlation.long2, aes(Value, MedHHInc)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor2, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Median Household Income as a function of health factors") +
plotTheme()
There appears to be a negative correlation between median household income and each of the health outcomes. In other words, as median household income increases, the percentage of diabetes, asthma, heart disease, and low life expectancy go down. The strongest relationship is between income and low life expectancy (r = -0.61), and the weakest relationship is between income and heart disease (r = -0.34).
correlation.long3 <-
st_drop_geometry(everything) %>%
dplyr::select(Diabetes, Asthma, HeartDisease, LLE, MedRent) %>%
gather(Variable, Value, -MedRent)
correlation.cor3 <-
correlation.long3 %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, MedRent, use = "complete.obs"))
ggplot(correlation.long3, aes(Value, MedRent)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor3, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Median Rent as a function of health factors") +
plotTheme()
Once again, there appears to be a negative linear correlation between rent and the health outcomes. As rent increases, the percentage of diabetes, asthma, heart disease, and low life expectancy decreases. This relationship is strongest between diabetes and rent (r = -0.55) and weakest between heart disease and rent (r = -0.31).
Interestingly, heart disease has the weakest correlation with each of the economic variables. Still, each of the charts show a correlation that is at least moderate, and some even have a strong correlation.
From the correlation coefficients given for each of the tests, we can see that there does appear to be a correlation between certain health variables with economic variables. There appears to be higher percentages of asthma and diabetes in places with more poverty and lower rent and income. This matches what we already know about health disparities: for a variety of reasons, people with a lower socioeconomic status tend to have poorer health. Although the tests done here show a correlation, they do not necessarily show causation. More tests would have to be done to determine if these economic variables are the cause of poorer health, or if they are simply a correlating factor.