library(spdep)
library(sf)
library(dplyr)
library(tidyverse)
library(viridis)
library(sfweight)
library(tmap)
library(tidycensus)
library(sf)
library(gridExtra)
q5 <- function(variable) {as.factor(ntile(variable, 5))}
palette5 <- c("#f0f9e8","#bae4bc","#7bccc4","#43a2ca","#0868ac")
mapTheme <- function(base_size = 12, title_size = 13) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size,colour = "black"),
plot.subtitle=element_text(face="italic"),
plot.caption=element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),axis.title = element_blank(),
axis.text = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.text.x = element_text(size = 14))
}
qBr <- function(df, variable, rnd) {
if (missing(rnd)) {
as.character(quantile(round(df[[variable]],0),
c(.01,.2,.4,.6,.8), na.rm=T))
} else if (rnd == FALSE | rnd == F) {
as.character(formatC(quantile(df[[variable]],
c(.01,.2,.4,.6,.8), na.rm=T),
digits = 3))
}
}
plotTheme <- function(base_size = 12, title_size = 13) {
theme(
text = element_text( color = "black"),
plot.title = element_text(size = title_size, colour = "black"),
plot.subtitle = element_text(face="italic"),
plot.caption = element_text(hjust=0),
axis.ticks = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=2),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=12),
axis.title = element_text(size=12),
axis.text = element_text(size=10),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(colour = "black", face = "italic"),
legend.text = element_text(colour = "black", face = "italic"),
strip.text.x = element_text(size = 14)
)
}
The goal of this project is to investigate the health disparities that exist in the south Texas region including Cameron County, Hidalgo County, Starr County, and Willacy County. Before we look at the correlation between health outcomes and economic variables, it is important to understand the distribution of the health outcomes themselves. The variables I will explore are diabetes, asthma, coronary heart disease, and low life expectancy.
PLACES_shp<- read_sf(dsn = "C:/Users/halie/OneDrive/Desktop/Get PHIT/1.0-shapefile-codebook/usa")
cameron_places<- PLACES_shp[PLACES_shp$SF == "Texas" & (PLACES_shp$CF == "Cameron County"|PLACES_shp$CF == "Willacy County" | PLACES_shp$CF == "Starr County" |PLACES_shp$CF == "Hidalgo County"),] %>%
st_as_sf(.) %>%
rename(Diabetes = DF_PFS, Asthma = AF_PFS, HeartDisease = HDF_PFS, LLE = LLEF_PFS) %>%
mutate(Diabetes = Diabetes * 100,
Asthma = Asthma * 100,
HeartDisease = HeartDisease * 100,
LLE = LLE * 100) %>%
dplyr::select(GEOID10, SF, CF, Diabetes, Asthma, HeartDisease, LLE, TPF, UI_EXP, THRHLD, geometry)
diabetes <-
ggplot() +
geom_sf(data = cameron_places, aes(fill = q5(Diabetes))) +
scale_fill_manual(values=palette5,
labels = qBr(cameron_places, "Diabetes"),
name = "Diabetes Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Diagnosed Diabetes",
subtitle= "Among Adults 18 or Older") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title=element_text(size = 9))
asthma <-
ggplot() +
geom_sf(data = cameron_places, aes(fill = q5(Asthma))) +
scale_fill_manual(values=palette5,
labels = qBr(cameron_places, "Asthma"),
name = "Asthma Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Current Asthma",
subtitle= "Among Adults 18 or Older") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title=element_text(size = 9))
CHD <-
ggplot() +
geom_sf(data = cameron_places, aes(fill = q5(HeartDisease))) +
scale_fill_manual(values=palette5,
labels = qBr(cameron_places, "HeartDisease"),
name = "CHD Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Coronary Heart Disease",
subtitle= "Among Adults 18 or Older") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title=element_text(size = 9))
LLE <-
ggplot() +
geom_sf(data = cameron_places, aes(fill = q5(LLE))) +
scale_fill_manual(values=palette5,
labels = qBr(cameron_places, "LLE"),
name = "LLE Percentage\n(Quintile Breaks)") +
labs(title = "Percentage of Low Life Expectancy",
subtitle = "Among Whole Population") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title=element_text(size = 9))
grid.arrange(diabetes, asthma, CHD, LLE)
From these maps, we can see that some areas have higher percentages of these health issues than others. It appears that the places with higher percentages of these health outcome cluster in some cases and are dispersed in other cases. In order to investigate that clustering further, we should look at the Moran’s I.
Moran’s I is a statistical test we can do to examine spatial clustering. I values can be between -1 and 1, where 1 represents positive spatial autocorrelation, 0 represents no autocorrelation, and -1 represents negative spatial autocorrelation. There are two types that we will be investigating: Global Moran and Local Moran.
For a Global Moran, the null hypothesis is that the variable is randomly distributed. If the p-value is statistically significant, we reject the null hypothesis and conclude that there is clustering. The figure below shows the frequency of 999 randomly permuted I values, with the observed I shown by the orange line for each of the four health variables. The observed I is higher than the randomly generated I values for each of the variables, but there is variation in the observed I values. For diabetes and low life expectancy, the observed I is a lot closer to 0, so the clustering is weaker than for coronary heart disease and asthma. None of the observed I values are particularly strong, so we can conclude that although there is some positive autocorrelation, this data leans more towards a random spatial arrangement - especially for diabetes and low life expectancy.
cameron_0 <- cameron_places %>%
mutate(Diabetes = replace_na(Diabetes, 0),
Asthma = replace_na(Asthma, 0),
HeartDisease = replace_na(HeartDisease, 0),
LLE = replace_na(LLE, 0)) %>%
filter(!row_number() %in% c(147,36))
cameron.nb<- poly2nb(cameron_0, queen=TRUE)
cameron.lw<- nb2listw(cameron.nb, zero.policy = TRUE)
cameron_diabetes_moran<- moran.mc(cameron_0$Diabetes, cameron.lw, nsim = 999)
cameron_asthma_moran<- moran.mc(cameron_0$Asthma, cameron.lw, nsim = 999)
cameron_CHD_moran<- moran.mc(cameron_0$HeartDisease, cameron.lw, nsim = 999)
cameron_LLE_moran<- moran.mc(cameron_0$LLE, cameron.lw, nsim = 999)
gmDiabetes <-
ggplot(as.data.frame(cameron_diabetes_moran$res[c(1:999)]), aes(cameron_diabetes_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = cameron_diabetes_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - Diabetes",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
gmAsthma <-
ggplot(as.data.frame(cameron_asthma_moran$res[c(1:999)]), aes(cameron_asthma_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = cameron_asthma_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - Asthma",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
gmCHD <-
ggplot(as.data.frame(cameron_CHD_moran$res[c(1:999)]), aes(cameron_CHD_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = cameron_CHD_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - CHD",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
gmLLE <-
ggplot(as.data.frame(cameron_LLE_moran$res[c(1:999)]), aes(cameron_LLE_moran$res[c(1:999)])) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = cameron_LLE_moran$statistic), colour = "#FA7800",size=1) +
scale_x_continuous(limits = c(-1, 1)) +
labs(title="Permuted Moran's I - LLE",
subtitle= "Observed Moran's I in orange",
x="Moran's I",
y="Count") +
plotTheme()
grid.arrange(gmDiabetes, gmAsthma, gmCHD, gmLLE)
The other type of Moran test is Local Moran, or LISA. The null hypothesis for this test is that the variable at a given location is randomly dispersed relative to its immediate neighbors. There are four different categories mapped in the figures below. 1. HH: statistically significant cluster of high values 2. LL: statistically significant cluster of low values 3. HL: high value surrounded mostly by low values 4. LH: low value surrounded mostly by high values
From the figure below, we can see that diabetes and coronary heart disease do cluster. High values of these variables tend to cluster together, as do low values of these variables. For asthma and low life expectancy, there is less clustering of similar values. We see a lot more high values surrounded by low values, or low values surrounded by high values.
diabetes_local_DF<- cameron_0 %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(Diabetes %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(Diabetes %>% replace(.,is.na(.), 0), lag_DF))
asthma_local_DF<- cameron_0 %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(Asthma %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(Asthma %>% replace(.,is.na(.), 0), lag_DF))
CHD_local_DF<- cameron_0 %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(HeartDisease %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(HeartDisease %>% replace(.,is.na(.), 0), lag_DF))
LLE_local_DF<- cameron_0 %>% mutate(nb = st_neighbors(geometry),
wts = st_weights(nb), lag_DF = st_lag(LLE %>% replace(.,is.na(.), 0), nb, wts),
lisa = categorize_lisa(LLE %>% replace(.,is.na(.), 0), lag_DF))
lmDiabetes <-
tm_shape(diabetes_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c("left", "bottom"), main.title = "Local Moran's LISA - Diabetes",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
lmAsthma <-
tm_shape(asthma_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c("left", "bottom"), main.title = "Local Moran's LISA - Asthma",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
lmCHD <-
tm_shape(CHD_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c("left", "bottom"), main.title = "Local Moran's LISA - CHD",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
lmLLE <-
tm_shape(LLE_local_DF) + tm_fill(col="lisa", alpha=0.2) + tm_borders() +
tm_layout(legend.position =c("left", "bottom"), main.title = "Local Moran's LISA - LLE",
main.title.position = c("left", .3), main.title.size = 1.25, legend.title.color = "white", legend.text.size = .75)
tmap_arrange(lmDiabetes, lmAsthma, lmCHD, lmLLE)
Now that we have investigated certain health-related variables, we should look at economic variables such as income, rent, and poverty.
poverty <-
ggplot() +
geom_sf(data = tracts15, aes(fill = q5(pctPoverty))) +
scale_fill_manual(values = palette5,
labels = qBr(tracts15, "pctPoverty"),
name = "Poverty (%)\n(Quintile Breaks)") +
labs(title = "Percent Poverty") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 7))
income <-
ggplot() +
geom_sf(data = tracts15, aes(fill = q5(MedHHInc))) +
scale_fill_manual(values = palette5,
labels = qBr(tracts15, "MedHHInc"),
name = "Income\n(Quintile Breaks)") +
labs(title = "Median Household Income") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 7))
rent <-
ggplot() +
geom_sf(data = tracts15, aes(fill = q5(MedRent))) +
scale_fill_manual(values = palette5,
labels = qBr(tracts15, "MedRent"),
name = "Rent\n(Quintile Breaks)") +
labs(title = "Median Rent") +
mapTheme() + theme(legend.key.size = unit(0.3, "cm"), legend.title = element_text(size = 8), legend.text = element_text(size = 7))
grid.arrange(rent, income, poverty)
From these maps, we see that each of the economic variables appears to be randomly dispersed. There are also no apparent similarities between each of the variables. In other words, the places with high income are not necessarily the same places with high rent.
Now that we have data for the health-related variables and the economic variables, we will see whether there is any correlation between these.
everything <-
left_join(cameron_places, st_drop_geometry(tracts15), by = "GEOID10")
correlation.long <-
st_drop_geometry(everything) %>%
dplyr::select(Diabetes, Asthma, HeartDisease, LLE, pctPoverty) %>%
gather(Variable, Value, -pctPoverty)
correlation.cor <-
correlation.long %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, pctPoverty, use = "complete.obs"))
ggplot(correlation.long, aes(Value, pctPoverty)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Percent Poverty as a function of health factors") +
plotTheme()
There appears to be a positive linear correlation between percent poverty and each of the health factors. In other words, as percent poverty increases, so does asthma, diabetes, heart disease, and low life expectancy. This relationship is strongest between asthma and percent poverty (r = 0.74).
correlation.long2 <-
st_drop_geometry(everything) %>%
dplyr::select(Diabetes, Asthma, HeartDisease, LLE, MedHHInc) %>%
gather(Variable, Value, -MedHHInc)
correlation.cor2 <-
correlation.long2 %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, MedHHInc, use = "complete.obs"))
ggplot(correlation.long2, aes(Value, MedHHInc)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor2, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Median Household Income as a function of health factors") +
plotTheme()
There appears to be a negative linear correlation between median household income and each of the health outcomes. As income increases, the percentage of diabetes, asthma, heart disease, and low life expectancy decreases. This relationship is strongest between diabetes and income (r = -0.7).
correlation.long3 <-
st_drop_geometry(everything) %>%
dplyr::select(Diabetes, Asthma, HeartDisease, LLE, MedRent) %>%
gather(Variable, Value, -MedRent)
correlation.cor3 <-
correlation.long3 %>%
group_by(Variable) %>%
summarize(correlation = cor(Value, MedRent, use = "complete.obs"))
ggplot(correlation.long3, aes(Value, MedRent)) +
geom_point(size = 0.1) +
geom_text(data = correlation.cor3, aes(label = paste("r =", round(correlation, 2))),
x=-Inf, y=Inf, vjust = 1.5, hjust = -.1) +
geom_smooth(method = "lm", se = FALSE, colour = "black") +
facet_wrap(~Variable, ncol = 2, scales = "free") +
labs(title = "Median Rent as a function of health factors") +
plotTheme()
Once again, we see a negative linear correlation between median rent and the health outcomes. As rent increases, the percentage of diabetes, asthma, heart disease, and low life expectancy decreases. This relationship is strongest between diabetes and rent (r = -0.65).
Interestingly, low life expectancy has the weakest correlation with each of the economic variables. Still, each of the charts show a correlation that is at least moderate, and some even have a strong correlation.
From the correlation coefficients given for each of the tests, we can see that there does appear to be a correlation between certain health variables with economic variables. There appears to be higher percentages of asthma and diabetes in places with more poverty and lower rent and income. This matches what we already know about health disparities: for a variety of reasons, people with a lower socioeconomic status tend to have poorer health. Although the tests done here show a correlation, they do not necessarily show causation. More tests would have to be done to determine if these economic variables are the cause of poorer health, or if they are simply a correlating factor.