# Load and clean Data

depression <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Depression (adults) (full table).csv") %>%
  clean_names()
## Rows: 96 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): TimePeriod, GeoType, Geography, Age-adjusted percent, Percent
## dbl (2): GeoID, GeoRank
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
psychillness <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Psychiatric hospitalizations (adults) (full table).csv") %>%
  clean_names()
## Rows: 144 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GeoType, Geography
## dbl (3): TimePeriod, GeoID, GeoRank
## num (3): Age-adjusted rate per 100,000 residents, Number, Rate per 100,000 r...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
seriousdistress <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Serious psychological distress (adults) (full table).csv") %>%
  clean_names()
## Rows: 192 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): TimePeriod, GeoType, Geography, Age-adjusted percent, Number, Percent
## dbl (2): GeoID, GeoRank
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
evictions <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Evictions (court-ordered) (full table).csv") %>%
  clean_names()
## Rows: 2122 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): GeoType, Geography, Estimated annual rate per 10,000 homes
## dbl (3): TimePeriod, GeoID, GeoRank
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
crowding <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Household crowding (full table).csv") %>%
  clean_names()
## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
occupiedhomes <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Owner-occupied homes (full table).csv") %>%
  clean_names()
## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rentburdened <- read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Rent-burdened households (full table).csv") %>%
  clean_names()
## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Depression X Burden

depression <- depression %>%
  rename(
    depression_percent = percent,
    depression_number = number
  )

rentburdened <- rentburdened %>%
  rename(
    rentburdened_percent = percent,
    rentburdened_number = number
  )

# Filter by year to try to align years as best as possible, Depression only has 2017-18 and 2021-22

RBMatching2017 <- rentburdened %>%
  filter(time_period == "2014-18",) 

Depression2017 <- depression %>%
  filter(time_period == "2017-18",)

# Depression rate was in a weird format and was not numeric

Depression2017 <- Depression2017 %>%
  mutate(
    depression_percent_clean = as.numeric(str_extract(depression_percent, "^[0-9.]+"))
  )

# Left join by neighborhood

depressionxburden2018 <- left_join(Depression2017, RBMatching2017, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

depressionxburden2018UHF42 <- depressionxburden2018 %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model1 <- lm(depression_percent_clean ~ rentburdened_percent, data = depressionxburden2018UHF42)
summary(model1)
## 
## Call:
## lm(formula = depression_percent_clean ~ rentburdened_percent, 
##     data = depressionxburden2018UHF42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0315 -2.8398 -0.5448  2.3106  9.0990 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.15338    4.54670   0.254   0.8011  
## rentburdened_percent  0.17355    0.08933   1.943   0.0593 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.65 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08824,    Adjusted R-squared:  0.06486 
## F-statistic: 3.775 on 1 and 39 DF,  p-value: 0.05928

p = 0.059, marginally significant

# Scatterplot

ggplot(depressionxburden2018UHF42, aes(x = rentburdened_percent, y = depression_percent_clean)) +
  geom_point(color = "blue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "red") +  # Regression line with confidence interval
  labs(
    title = "Depression vs Rent Burden",
    x = "Number of Rent-Burdened People",
    y = "Number of People with Depression"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Mapped Data

uhf42shp <- st_read("C:\\Users\\zim13\\Downloads\\UHF_42_DOHMH\\UHF_42_DOHMH.shp")
## Reading layer `UHF_42_DOHMH' from data source 
##   `C:\Users\zim13\Downloads\UHF_42_DOHMH\UHF_42_DOHMH.shp' using driver `ESRI Shapefile'
## Simple feature collection with 43 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913090.8 ymin: 120053.5 xmax: 1067310 ymax: 272932
## Projected CRS: NAD83 / New York Long Island (ftUS)
map_data <- left_join(uhf42shp, depressionxburden2018UHF42, by = c("UHF" = "geo_id.x"))

# Depression Map

map1 <- ggplot(map_data) +
  geom_sf(aes(fill = depression_percent_clean)) +
  scale_fill_viridis_c(name = "Depression") + 
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Depression Rates by Area")

# Rent Burdened Map

map2 <- ggplot(map_data) +
  geom_sf(aes(fill = rentburdened_percent)) +
  scale_fill_viridis_c(name = "Rent Burdened") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rent Burden Rate by Area")

ggarrange(map2, map1, nrow = 1, ncol = 2, align ="h")

Depression X Crowding

crowding2017 <- crowding %>%
  filter(time_period == "2014-18",) 

depressionxcrowding2018 <- left_join(Depression2017, crowding2017, by = c("geo_type","geography"))

depressionxcrowding2018UHF42 <- depressionxcrowding2018 %>%
  filter(geo_type == "UHF42")

model2 <- lm(depression_percent_clean ~ percent, data = depressionxcrowding2018UHF42)
summary(model2)
## 
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxcrowding2018UHF42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2403 -3.0550 -0.6823  2.2074  9.1148 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.0999     1.2885   6.286 2.07e-07 ***
## percent       0.2068     0.1310   1.578    0.123    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.706 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.06005,    Adjusted R-squared:  0.03595 
## F-statistic: 2.492 on 1 and 39 DF,  p-value: 0.1225

p > 0.05, not significant

ggplot(depressionxcrowding2018UHF42, aes(x = percent, y = depression_percent_clean)) +
  geom_point(color = "red") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "blue") +  # Regression line with confidence interval
  labs(
    title = "Depression vs Crowding",
    x = "Rate of Crowding",
    y = "Rate of Depression"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

map_data2 <- left_join(uhf42shp, depressionxcrowding2018UHF42, by = c("UHF" = "geo_id.x"))

map1.1 <- ggplot(map_data2) +
  geom_sf(aes(fill = depression_percent_clean)) +
  scale_fill_gradient(name = "Depression", low = "pink", high = "red") + 
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Depression Rates by Area")

map3 <- ggplot(map_data2) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Crowding", low = "pink", high ="red") + 
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Crowding Rates by Area")

ggarrange(map3, map1.1, nrow = 1, ncol = 2, align ="h")

Depression X Evictions

# Eviction rate was not numeric

evictions <- evictions %>%
  mutate(
    eviction_rate = as.numeric(str_extract(estimated_annual_rate_per_10_000_homes, "\\d+\\.?\\d*"))
  )

evictions2018 <- evictions %>%
  filter(time_period == "2018",)

depressionxevictions2018 <- left_join(Depression2017, evictions2018, by = c("geo_type","geography"))

depressionxevictions2018UHF42 <- depressionxevictions2018 %>%
  filter(geo_type == "UHF42")

model3 <- lm(depression_percent_clean ~ eviction_rate, data = depressionxevictions2018UHF42)
summary(model3)
## 
## Call:
## lm(formula = depression_percent_clean ~ eviction_rate, data = depressionxevictions2018UHF42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6615 -2.5718 -0.6506  1.6967  8.5149 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.12265    0.81618   8.727 1.04e-10 ***
## eviction_rate  0.04794    0.01117   4.291 0.000114 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.15 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3207, Adjusted R-squared:  0.3033 
## F-statistic: 18.41 on 1 and 39 DF,  p-value: 0.0001136

p < 0.01

ggplot(depressionxevictions2018UHF42, aes(x = eviction_rate, y = depression_percent_clean)) +
  geom_point(color = "black") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "yellow") +  # Regression line with confidence interval
  labs(
    title = "Depression vs Evictions",
    x = "Rate of Evictions",
    y = "Rate of Depression"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

map_data3 <- left_join(uhf42shp, depressionxevictions2018UHF42, by = c("UHF" = "geo_id.x"))

map1.2 <- ggplot(map_data3) +
  geom_sf(aes(fill = depression_percent_clean)) +
  scale_fill_gradient(name = "Depression", low = "lightyellow", high = "yellow") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Depression Rates by Area")

map4 <- ggplot(map_data3) +
  geom_sf(aes(fill = eviction_rate)) +
  scale_fill_gradient(name = "Evictions", low = "lightyellow", high ="yellow") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Eviction Rates by Area")

ggarrange(map4, map1.2, nrow = 1, ncol = 2, align ="h")

Depression X Owner Occupied Homes

occupiedhomes2018 <- occupiedhomes %>%
  filter(time_period == "2014-18", geo_type == "UHF42") 

depressionxoccupiedhomes2018 <- left_join(Depression2017, occupiedhomes2018, by = c("geo_type","geography"))

depressionxoccupiedhomes2018 <- depressionxoccupiedhomes2018 %>%
  filter(geo_type == "UHF42")

model4 <- lm(depression_percent_clean ~ percent, data = depressionxoccupiedhomes2018)
summary(model4)
## 
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxoccupiedhomes2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6796 -2.2080 -0.0741  1.6938  7.1311 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0117     0.9429  14.860  < 2e-16 ***
## percent      -0.1179     0.0236  -4.996 1.27e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.985 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3902, Adjusted R-squared:  0.3746 
## F-statistic: 24.96 on 1 and 39 DF,  p-value: 1.269e-05

p < 0.01

ggplot(depressionxoccupiedhomes2018, aes(x = percent, y = depression_percent_clean)) +
  geom_point(color = "purple") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "green") +  # Regression line with confidence interval
  labs(
    title = "Depression vs Home Ownership",
    x = "Number of Homes Owned",
    y = "Number of People with Depression"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

map_data4 <- left_join(uhf42shp, depressionxoccupiedhomes2018, by = c("UHF" = "geo_id.x"))

map1.3 <- ggplot(map_data4) +
  geom_sf(aes(fill = depression_percent_clean)) +
  scale_fill_gradient(name = "Depression", low = "lightgreen", high = "darkgreen") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Depression Rates by Area")

map5 <- ggplot(map_data4) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Occpied by Owners", low = "lightgreen", high ="darkgreen") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rates of Living in Owned Homes")

ggarrange(map5, map1.3, nrow = 1, ncol = 2, align ="h")

All Scatterplots in One

# Combined dataset
combined <- bind_rows(
  depressionxevictions2018 %>%
    select(depression_percent_clean, percent = eviction_rate) %>%
    mutate(type = "Evictions"),
  
  depressionxcrowding2018UHF42 %>%
    select(depression_percent_clean, percent = percent) %>%
    mutate(type = "Crowding"),
  
  depressionxburden2018UHF42 %>%
    select(depression_percent_clean, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),
  
  depressionxoccupiedhomes2018 %>%
    select(depression_percent_clean, percent) %>%
    mutate(type = "Home Ownership")
)

ggplot(combined, aes(x = percent, y = depression_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Depression Percent",
    title = "Regression Lines of Depression % by Housing Factors"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

bigplot1 <- ggplot(combined, aes(x = percent, y = depression_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Depression Percent",
    title = "Regression Lines of Depression % by Housing Factors"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

Evictions make the other ones hard to see

combined2 <- bind_rows(
  depressionxcrowding2018UHF42 %>%
    select(depression_percent_clean, percent = percent) %>%
    mutate(type = "Crowding"),
  
  depressionxburden2018UHF42 %>%
    select(depression_percent_clean, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),
  
  depressionxoccupiedhomes2018 %>%
    select(depression_percent_clean, percent) %>%
    mutate(type = "Owner Occupied Homes")
)

ggplot(combined2, aes(x = percent, y = depression_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Depression Percent",
    title = "Regression Lines of Depression % by Housing Factors W/O Evictions"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Owner occupied homes makes the other factors hard to see

combined3 <- bind_rows(
  depressionxcrowding2018UHF42 %>%
    select(depression_percent_clean, percent = percent) %>%
    mutate(type = "Crowding"),
  
  depressionxburden2018UHF42 %>%
    select(depression_percent_clean, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),
  
)

ggplot(combined3, aes(x = percent, y = depression_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Depression Percent",
    title = "Regression Lines of Depression % by Housing Factors W/O Evictions/Owner Occupied Homes"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Multi regression Analysis weighs all of the variables together instead of individually to see which variable may have a stronger association to our dependent variable.

depression_df <- depressionxevictions2018UHF42
crowding_df <- depressionxcrowding2018UHF42
burden_df <- depressionxburden2018UHF42
occupied_df <- depressionxoccupiedhomes2018

names(crowding_df)[names(crowding_df) == 'percent'] <- 'crowding_percent'
names(occupied_df)[names(occupied_df) == 'percent'] <- 'occupied_percent'

# Join them step by step (assumes same key column, e.g., "UHF_code")
combined_df <- depression_df %>%
  select(geo_id.x, depression_percent_clean) %>%
  left_join(depression_df %>% select(geo_id.x, eviction_rate), by ="geo_id.x") %>%
  left_join(crowding_df %>% select(geo_id.x, crowding_percent), by = "geo_id.x") %>%
  left_join(burden_df %>% select(geo_id.x, rentburdened_percent), by = "geo_id.x") %>%
  left_join(occupied_df %>% select(geo_id.x, occupied_percent), by = "geo_id.x")



model <- lm(depression_percent_clean ~ occupied_percent + rentburdened_percent + crowding_percent + eviction_rate, data = combined_df)
summary(model)
## 
## Call:
## lm(formula = depression_percent_clean ~ occupied_percent + rentburdened_percent + 
##     crowding_percent + eviction_rate, data = combined_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8779 -1.4755 -0.6063  1.6966  5.3983 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.38097    4.94700   1.492 0.144409    
## occupied_percent     -0.11622    0.02845  -4.086 0.000235 ***
## rentburdened_percent  0.15929    0.13522   1.178 0.246533    
## crowding_percent     -0.34450    0.17652  -1.952 0.058797 .  
## eviction_rate         0.02667    0.01457   1.831 0.075406 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.686 on 36 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.4934 
## F-statistic: 10.74 on 4 and 36 DF,  p-value: 7.826e-06

Shows that rent burden Overlaps too much with other factors and skews weight of those other factors

Regression Analysis W/O Rent Burden

combined_df1 <- depression_df %>%
  select(geo_id.x, depression_percent_clean) %>%
  left_join(depression_df %>% select(geo_id.x, eviction_rate), by ="geo_id.x") %>%
  left_join(crowding_df %>% select(geo_id.x, crowding_percent), by = "geo_id.x") %>%
  left_join(occupied_df %>% select(geo_id.x, occupied_percent), by = "geo_id.x")



model_1 <- lm(depression_percent_clean ~ occupied_percent + crowding_percent + eviction_rate, data = combined_df)
summary(model_1)
## 
## Call:
## lm(formula = depression_percent_clean ~ occupied_percent + crowding_percent + 
##     eviction_rate, data = combined_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9195 -1.6084 -0.5021  1.7613  6.3610 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      12.86863    1.67326   7.691 3.48e-09 ***
## occupied_percent -0.10018    0.02511  -3.990  0.00030 ***
## crowding_percent -0.18721    0.11606  -1.613  0.11523    
## eviction_rate     0.03726    0.01153   3.232  0.00258 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.7 on 37 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5265, Adjusted R-squared:  0.4881 
## F-statistic: 13.71 on 3 and 37 DF,  p-value: 3.621e-06

Taking out rent burden shows us that eviction rate is also responsible for depression rates.

Hopitalization x Rent Burden

psychillness <- psychillness %>%
  rename(
    psychillness_rateper100k = rate_per_100_000_residents,
    psychillness_number = number
  )

# Filter by year to try to align years as best as possible, Depression only has 2017-18 and 2021-22

rentburndened2019 <- rentburdened %>%
  filter(time_period == "2015-19",) 

psychillness2019 <- psychillness %>%
  filter(time_period == "2019",)

# Left join by neighborhood

illnessxburden <- left_join(psychillness2019, rentburndened2019, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

illnessxburdenufh42 <- illnessxburden %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model5 <- lm(psychillness_rateper100k ~ rentburdened_percent, data = illnessxburdenufh42)
summary(model5)
## 
## Call:
## lm(formula = psychillness_rateper100k ~ rentburdened_percent, 
##     data = illnessxburdenufh42)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -373.5 -187.5  -34.3  135.8 1121.6 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          -178.669    318.164  -0.562   0.5775  
## rentburdened_percent   15.943      6.415   2.485   0.0172 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 283.8 on 40 degrees of freedom
## Multiple R-squared:  0.1338, Adjusted R-squared:  0.1121 
## F-statistic: 6.178 on 1 and 40 DF,  p-value: 0.01722

p = 0.017

ggplot(illnessxburdenufh42, aes(x = rentburdened_percent, y = psychillness_rateper100k)) +
  geom_point(color = "blue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "red") +  # Regression line with confidence interval
  labs(
    title = "Psych Hospitalizations vs Rent Burden",
    x = "Number of Rent-Burdened People",
    y = "Number of Hospitalized People"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

map_data5 <- left_join(uhf42shp, illnessxburdenufh42, by = c("UHF" = "geo_id.x"))

map1.4 <- ggplot(map_data5) +
  geom_sf(aes(fill = psychillness_rateper100k)) +
  scale_fill_gradient(name = "Hospitalization", low = "violet", high = "purple") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Hopitalization Rates by Area")

map6 <- ggplot(map_data5) +
  geom_sf(aes(fill = rentburdened_percent)) +
  scale_fill_gradient(name = "Rent Burden", low = "violet", high ="purple") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rent Burden Rate by Area")

ggarrange(map6, map1.4, nrow = 1, ncol = 2, align ="h")

Depression X Crowding

# Filter by year to try to align years as best as possible, Depression only has 2017-18 and 2021-22

crowding2019 <- crowding %>%
  filter(time_period == "2015-19",) 

crowding2019uhf <- crowding2019 %>%
  filter(geo_type == "UHF42" )

# Left join by neighborhood

illnessxcrowding <- left_join(psychillness2019, crowding2019, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

illnessxcrowdingufh42 <- illnessxcrowding %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model6 <- lm(psychillness_rateper100k ~ percent, data = illnessxcrowdingufh42)
summary(model6)
## 
## Call:
## lm(formula = psychillness_rateper100k ~ percent, data = illnessxcrowdingufh42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -350.27 -231.62  -70.95  157.24 1126.43 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   480.28     103.29   4.650 3.59e-05 ***
## percent        14.17      10.54   1.344    0.186    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 298.2 on 40 degrees of freedom
## Multiple R-squared:  0.04322,    Adjusted R-squared:  0.01931 
## F-statistic: 1.807 on 1 and 40 DF,  p-value: 0.1864

p > 0.05, not significant

ggplot(illnessxcrowdingufh42, aes(x = percent, y = psychillness_rateper100k)) +
  geom_point(color = "blue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "purple") +  # Regression line with confidence interval
  labs(
    title = "Psych Hospitalizations vs Crowding",
    x = "Percent of Crowded Homes",
    y = "Number of Hospitalized People"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

map_data6 <- left_join(uhf42shp, illnessxcrowdingufh42, by = c("UHF" = "geo_id.x"))

map1.5 <- ggplot(map_data6) +
  geom_sf(aes(fill = psychillness_rateper100k)) +
  scale_fill_gradient(name = "Hospitalization", low = "pink", high = "purple") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Hopitalization Rates by Area")

map7 <- ggplot(map_data6) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Crowding", low = "pink", high ="purple") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Crowding Rate by Area")

ggarrange(map7, map1.5, nrow = 1, ncol = 2, align ="h")

Depression X Evictions

# Filter by year to try to align years as best as possible, Depression only has 2017-18 and 2021-22

evictions2019 <- evictions %>%
  filter(time_period == "2019",) 

# Left join by neighborhood

illnessxevictions <- left_join(psychillness2019, evictions2019, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

illnessxevictionsufh42 <- illnessxevictions %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model7 <- lm(psychillness_rateper100k ~ eviction_rate, data = illnessxevictionsufh42)
summary(model7)
## 
## Call:
## lm(formula = psychillness_rateper100k ~ eviction_rate, data = illnessxevictionsufh42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -288.12 -122.22  -72.23   65.42 1093.55 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   347.2102    58.5101   5.934 5.85e-07 ***
## eviction_rate   5.2760     0.9545   5.527 2.18e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 229.6 on 40 degrees of freedom
## Multiple R-squared:  0.433,  Adjusted R-squared:  0.4189 
## F-statistic: 30.55 on 1 and 40 DF,  p-value: 2.18e-06

p < 0.01

ggplot(illnessxevictionsufh42, aes(x = eviction_rate, y = psychillness_rateper100k)) +
  geom_point(color = "darkblue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "black") +  # Regression line with confidence interval
  labs(
    title = "Psych Hospitalizations vs Evictions",
    x = "Rate of Eviction of Area",
    y = "Rate of Hospitalizations"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

map_data7 <- left_join(uhf42shp, illnessxevictionsufh42, by = c("UHF" = "geo_id.x"))

map1.6 <- ggplot(map_data7) +
  geom_sf(aes(fill = psychillness_rateper100k)) +
  scale_fill_gradient(name = "Hospitalization", low = "lightblue", high = "darkblue") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Hopitalization Rates by Area")

map8 <- ggplot(map_data7) +
  geom_sf(aes(fill = eviction_rate)) +
  scale_fill_gradient(name = "Evictions", low = "lightblue", high ="darkblue") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Eviction Rate by Area")

ggarrange(map8, map1.6, nrow = 1, ncol = 2, align ="h")

Hospitalization X Home Ownership

occupiedhomes2019 <- occupiedhomes %>%
  filter(time_period == "2015-19",) 

# Left join by neighborhood

illnessxoccupiedhomes <- left_join(psychillness2019, occupiedhomes2019, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

illnessxoccupiedhomesufh42 <- illnessxoccupiedhomes %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model8 <- lm(psychillness_rateper100k ~ percent, data = illnessxoccupiedhomesufh42)
summary(model8)
## 
## Call:
## lm(formula = psychillness_rateper100k ~ percent, data = illnessxoccupiedhomesufh42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -421.80 -185.84  -31.08  148.64  915.05 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  848.817     85.053   9.980 2.05e-12 ***
## percent       -7.065      2.144  -3.295  0.00207 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 270.4 on 40 degrees of freedom
## Multiple R-squared:  0.2135, Adjusted R-squared:  0.1938 
## F-statistic: 10.86 on 1 and 40 DF,  p-value: 0.002066

p < 0.01

ggplot(illnessxoccupiedhomesufh42, aes(x = percent, y = psychillness_rateper100k)) +
  geom_point(color = "brown") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "black") +  # Regression line with confidence interval
  labs(
    title = "Psych Hospitalizations vs Home Ownership",
    x = "Rate of Home Ownership of Area",
    y = "Rate of Hospitalizations"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

map_data8 <- left_join(uhf42shp, illnessxoccupiedhomesufh42, by = c("UHF" = "geo_id.x"))

map1.7 <- ggplot(map_data8) +
  geom_sf(aes(fill = psychillness_rateper100k)) +
  scale_fill_gradient(name = "Hospitalization", low = "lightyellow", high = "orange") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Hopitalization Rates by Area")

map9 <- ggplot(map_data8) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Owner Occupied Homes", low = "lightyellow", high ="orange") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rate of Owner Occupied Homes by Area")

ggarrange(map9, map1.7, nrow = 1, ncol = 2, align ="h")

Scatterplot of Hospitalizations and all varbiables

# Combined dataset
combined4 <- bind_rows(
  illnessxevictionsufh42 %>%
    select(psychillness_rateper100k, percent = eviction_rate) %>%
    mutate(type = "Evictions"),
  
  illnessxcrowdingufh42 %>%
    select(psychillness_rateper100k, percent = percent) %>%
    mutate(type = "Crowding"),
  
  illnessxburdenufh42 %>%
    select(psychillness_rateper100k, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),
  
  illnessxoccupiedhomesufh42 %>%
    select(psychillness_rateper100k, percent) %>%
    mutate(type = "Home Ownership")
)


ggplot(combined4, aes(x = percent, y = psychillness_rateper100k, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Hospitalization Percent",
    title = "Regression Lines of Hospitalizations % by Housing Factors"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'

bigplot2 <- ggplot(combined4, aes(x = percent, y = psychillness_rateper100k, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Hospitalization Percent",
    title = "Regression Lines of Hospitalization % by Housing Factors"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

Hospitalization Multi Regression Analysis

illness_df2 <- illnessxevictionsufh42
crowding_df2 <- illnessxcrowdingufh42
burden_df2 <- illnessxburdenufh42
occupied_df2 <- illnessxoccupiedhomesufh42

names(crowding_df2)[names(crowding_df2) == 'percent'] <- 'crowding_percent'
names(occupied_df2)[names(occupied_df2) == 'percent'] <- 'occupied_percent'

# Join them step by step (assumes same key column, e.g., "UHF_code")
combined_df2 <- illness_df2 %>%
  select(geo_id.x, psychillness_rateper100k) %>%
  left_join(illness_df2 %>% select(geo_id.x, eviction_rate), by ="geo_id.x") %>%
  left_join(crowding_df2 %>% select(geo_id.x, crowding_percent), by = "geo_id.x") %>%
  left_join(burden_df2 %>% select(geo_id.x, rentburdened_percent), by = "geo_id.x") %>%
  left_join(occupied_df2 %>% select(geo_id.x, occupied_percent), by = "geo_id.x")



model_2 <- lm(psychillness_rateper100k ~ occupied_percent + rentburdened_percent + crowding_percent + eviction_rate, data = combined_df2)
summary(model_2)
## 
## Call:
## lm(formula = psychillness_rateper100k ~ occupied_percent + rentburdened_percent + 
##     crowding_percent + eviction_rate, data = combined_df2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -249.10 -131.00  -31.56   39.59  934.43 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           480.449    340.891   1.409  0.16707   
## occupied_percent       -5.097      2.263  -2.252  0.03034 * 
## rentburdened_percent    5.144      9.583   0.537  0.59467   
## crowding_percent      -20.247     13.087  -1.547  0.13035   
## eviction_rate           4.617      1.407   3.281  0.00226 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 220.9 on 37 degrees of freedom
## Multiple R-squared:  0.5144, Adjusted R-squared:  0.4619 
## F-statistic: 9.798 on 4 and 37 DF,  p-value: 1.654e-05

p < 0.05, meaning that the housing variables predict rates of psychiatric hospitalization, among these, eviction and home ownership plays the biggest role, but crowding also seems to play a role. Rent burden does not seem to be doing so.

Let’s check collilinearity here as well.

check_collinearity(model_2)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##                  Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##      occupied_percent 1.67 [1.28, 2.61]         1.29      0.60     [0.38, 0.78]
##  rentburdened_percent 3.68 [2.48, 5.86]         1.92      0.27     [0.17, 0.40]
##      crowding_percent 2.81 [1.95, 4.43]         1.68      0.36     [0.23, 0.51]
##         eviction_rate 2.35 [1.68, 3.68]         1.53      0.43     [0.27, 0.60]

Shows that Rent Burden Overlaps too much with other factors and may skew weight of those other factors

Multiregression W/O Rent Burden

combined_df2.1 <- illness_df2 %>%
  select(geo_id.x, psychillness_rateper100k) %>%
  left_join(illness_df2 %>% select(geo_id.x, eviction_rate), by ="geo_id.x") %>%
  left_join(crowding_df2 %>% select(geo_id.x, crowding_percent), by = "geo_id.x") %>%
  left_join(occupied_df2 %>% select(geo_id.x, occupied_percent), by = "geo_id.x")



model_2.1 <- lm(psychillness_rateper100k ~ occupied_percent + crowding_percent + eviction_rate, data = combined_df2.1)
summary(model_2.1)
## 
## Call:
## lm(formula = psychillness_rateper100k ~ occupied_percent + crowding_percent + 
##     eviction_rate, data = combined_df2.1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -251.30 -131.92  -25.92   40.61  949.28 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       648.192    134.859   4.806 2.43e-05 ***
## occupied_percent   -4.577      2.026  -2.259   0.0297 *  
## crowding_percent  -15.327      9.254  -1.656   0.1059    
## eviction_rate       5.105      1.063   4.803 2.46e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 218.8 on 38 degrees of freedom
## Multiple R-squared:  0.5106, Adjusted R-squared:  0.472 
## F-statistic: 13.22 on 3 and 38 DF,  p-value: 4.657e-06

Taking out rent burden increased the weight of the other variables but did not change significance of anything.

Distress X Rent Burden

seriousdistress <- seriousdistress %>%
  rename(
    distress_percent = percent,
    distress_number = number
  )

seriousdistress <- seriousdistress %>%
mutate(
    distress_percent_clean = as.numeric(str_extract(distress_percent, "^[0-9.]+"))
  )

# Filter by year to try to align years as best as possible, Depression only has 2017-18 and 2021-22

rentburndened2020 <- rentburdened %>%
  filter(time_period == "2016-20",) 

seriousdistress2020 <- seriousdistress %>%
  filter(time_period == "2019-20",)

# Left join by neighborhood

distressxburden <- left_join(seriousdistress2020, rentburndened2020, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

distressxburdenufh42 <- distressxburden %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model9 <- lm(distress_percent_clean ~ rentburdened_percent, data = distressxburdenufh42)
summary(model9)
## 
## Call:
## lm(formula = distress_percent_clean ~ rentburdened_percent, data = distressxburdenufh42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3548 -1.2705 -0.1239  1.4838  3.7973 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          -0.85934    2.58876  -0.332   0.7418  
## rentburdened_percent  0.13466    0.05207   2.586   0.0137 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.062 on 38 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1497, Adjusted R-squared:  0.1273 
## F-statistic: 6.689 on 1 and 38 DF,  p-value: 0.01366

p = 0.014, significant

ggplot(distressxburdenufh42, aes(x = rentburdened_percent, y = distress_percent_clean)) +
  geom_point(color = "orange") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "darkgreen") +  # Regression line with confidence interval
  labs(
    title = "Serious Psychological Distress vs Rent Burden",
    x = "Rate of Rent Burden by Area",
    y = "Rate of Serious Psychological Distress"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

map_data9 <- left_join(uhf42shp, distressxburdenufh42, by = c("UHF" = "geo_id.x"))

map1.8 <- ggplot(map_data9) +
  geom_sf(aes(fill = distress_percent_clean)) +
  scale_fill_gradient(name = "Serious Psychological Distress", low = "pink", high = "red") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Serious Psychological Distress")

map10 <- ggplot(map_data9) +
  geom_sf(aes(fill = rentburdened_percent)) +
  scale_fill_gradient(name = "Rate of Rent Burden", low = "pink", high ="red") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rate of Rent Burden by Area")

ggarrange(map10, map1.8, nrow = 1, ncol = 2, align ="h")

Distress X Crowding

crowding2020 <- crowding %>%
  filter(time_period == "2016-20",) 

distressxcrowding <- left_join(seriousdistress2020, crowding2020, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

distressxcrowdingufh42 <- distressxcrowding %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model10 <- lm(distress_percent_clean ~ percent, data = distressxcrowdingufh42)
summary(model10)
## 
## Call:
## lm(formula = distress_percent_clean ~ percent, data = distressxcrowdingufh42)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.555 -1.433 -0.196  1.708  4.368 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.94279    0.78896   6.265 2.47e-07 ***
## percent      0.09507    0.08021   1.185    0.243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.195 on 38 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.03565,    Adjusted R-squared:  0.01027 
## F-statistic: 1.405 on 1 and 38 DF,  p-value: 0.2433

p > 0.05, not significant

ggplot(distressxcrowdingufh42, aes(x = percent, y = distress_percent_clean)) +
  geom_point(color = "orange") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "darkgreen") +  # Regression line with confidence interval
  labs(
    title = "Serious Psychological Distress vs Rent Burden",
    x = "Rate of Crowding by Area",
    y = "Rate of Serious Psychological Distress"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

map_data10 <- left_join(uhf42shp, distressxcrowdingufh42, by = c("UHF" = "geo_id.x"))

map1.9 <- ggplot(map_data10) +
  geom_sf(aes(fill = distress_percent_clean)) +
  scale_fill_gradient(name = "Serious Psychological Distress", low = "lightgreen", high = "darkgreen") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Serious Psychological Distress")

map11 <- ggplot(map_data10) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Rate of Crowding", low = "lightgreen", high ="darkgreen") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rate of Crowding by Area")

ggarrange(map11, map1.9, nrow = 1, ncol = 2, align ="h")

Distress X Evictions

evictions2020 <- evictions %>%
  filter(time_period == "2020",) 

distressxevictions <- left_join(seriousdistress2020, evictions2020, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

distressxevictionsufh42 <- distressxevictions %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model11 <- lm(distress_percent_clean ~ eviction_rate, data = distressxevictionsufh42)
summary(model11)
## 
## Call:
## lm(formula = distress_percent_clean ~ eviction_rate, data = distressxevictionsufh42)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.386 -1.538 -0.164  1.304  3.854 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.38828    0.53529   8.198 6.28e-10 ***
## eviction_rate  0.15635    0.04866   3.213  0.00268 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.982 on 38 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2136, Adjusted R-squared:  0.1929 
## F-statistic: 10.32 on 1 and 38 DF,  p-value: 0.002676

p < 0.01

ggplot(distressxevictionsufh42, aes(x = eviction_rate, y = distress_percent_clean)) +
  geom_point(color = "blue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "purple") +  # Regression line with confidence interval
  labs(
    title = "Serious Psychological Distress vs Evictions",
    x = "Rate of Evictions by Area",
    y = "Rate of Serious Psychological Distress"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

map_data11 <- left_join(uhf42shp, distressxevictionsufh42, by = c("UHF" = "geo_id.x"))

map1.91 <- ggplot(map_data11) +
  geom_sf(aes(fill = distress_percent_clean)) +
  scale_fill_gradient(name = "Serious Psychological Distress", low = "lightblue", high = "darkblue") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Serious Psychological Distress")

map12 <- ggplot(map_data11) +
  geom_sf(aes(fill = eviction_rate)) +
  scale_fill_gradient(name = "Rate of Evictions", low = "lightblue", high ="darkblue") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rate of Evictions by Area")

ggarrange(map12, map1.91, nrow = 1, ncol = 2, align ="h")

Distress X Owner Occupied Homes

occupiedhomes2020 <- occupiedhomes %>%
  filter(time_period == "2016-20",) 

distressxoccupiedhomes <- left_join(seriousdistress2020, occupiedhomes2020, by = c("geo_type","geography"))

# Depression data was mostly in UHF42, so we use that

distressxoccupiedhomesufh42 <- distressxoccupiedhomes %>%
  filter(geo_type == "UHF42")

# Regression Analysis

model12 <- lm(distress_percent_clean ~ percent, data = distressxoccupiedhomesufh42)
summary(model12)
## 
## Call:
## lm(formula = distress_percent_clean ~ percent, data = distressxoccupiedhomesufh42)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -3.782 -1.633  0.019  1.687  4.310 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.101682   0.722905   8.441 3.04e-10 ***
## percent     -0.009376   0.018543  -0.506    0.616    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.228 on 38 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.006683,   Adjusted R-squared:  -0.01946 
## F-statistic: 0.2557 on 1 and 38 DF,  p-value: 0.616

p > 0.05, not significant

ggplot(distressxoccupiedhomesufh42, aes(x = percent, y = distress_percent_clean)) +
  geom_point(color = "blue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "purple") +  # Regression line with confidence interval
  labs(
    title = "Serious Psychological Distress vs Owner Occupied Homes",
    x = "Rate of Owner Occupied Homes by Area",
    y = "Rate of Serious Psychological Distress"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

map_data12 <- left_join(uhf42shp, distressxoccupiedhomesufh42, by = c("UHF" = "geo_id.x"))

map1.92 <- ggplot(map_data12) +
  geom_sf(aes(fill = distress_percent_clean)) +
  scale_fill_gradient(name = "Serious Psychological Distress", low = "lightyellow", high = "orange") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Serious Psychological Distress")

map13 <- ggplot(map_data12) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Rate of Owner Occupied Homes", low = "lightyellow", high ="orange") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rate of Owner Occupied Homes by Area")

ggarrange(map13, map1.92, nrow = 1, ncol = 2, align ="h")

Scatterplot of Psychiatric Distress and All Varbiables

# Combined dataset
combined5 <- bind_rows(
  distressxevictionsufh42 %>%
    select(distress_percent_clean, percent = eviction_rate) %>%
    mutate(type = "Evictions"),
  
  distressxcrowdingufh42 %>%
    select(distress_percent_clean, percent = percent) %>%
    mutate(type = "Crowding"),
  
  distressxburdenufh42 %>%
    select(distress_percent_clean, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),
  
  distressxoccupiedhomesufh42 %>%
    select(distress_percent_clean, percent) %>%
    mutate(type = "Home Ownership")
)


ggplot(combined5, aes(x = percent, y = distress_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Psychiatric Distress Percent",
    title = "Regression Lines of Psychiatric Distress % by Housing Factors"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_point()`).

bigplot3 <- ggplot(combined5, aes(x = percent, y = distress_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Percent",
    y = "Psychiatric Distress Percent"  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

Multi Regression Analysis W/ Distress

distress_df3 <- distressxevictionsufh42
crowding_df3 <- distressxcrowdingufh42
burden_df3 <- distressxburdenufh42
occupied_df3 <- distressxoccupiedhomesufh42

names(crowding_df3)[names(crowding_df3) == 'percent'] <- 'crowding_percent'
names(occupied_df3)[names(occupied_df3) == 'percent'] <- 'occupied_percent'

# Join them step by step (assumes same key column, e.g., "UHF_code")
combined_df3 <- distress_df3 %>%
  select(geo_id.x, distress_percent_clean) %>%
  left_join(distress_df3 %>% select(geo_id.x, eviction_rate), by ="geo_id.x") %>%
  left_join(crowding_df3 %>% select(geo_id.x, crowding_percent), by = "geo_id.x") %>%
  left_join(burden_df3 %>% select(geo_id.x, rentburdened_percent), by = "geo_id.x") %>%
  left_join(occupied_df3 %>% select(geo_id.x, occupied_percent), by = "geo_id.x")



model_3 <- lm(distress_percent_clean ~ occupied_percent + rentburdened_percent + crowding_percent + eviction_rate, data = combined_df3)
summary(model_3)
## 
## Call:
## lm(formula = distress_percent_clean ~ occupied_percent + rentburdened_percent + 
##     crowding_percent + eviction_rate, data = combined_df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6833 -1.4285 -0.0899  1.4064  3.6992 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)           0.806346   3.691866   0.218    0.828
## occupied_percent      0.004976   0.021930   0.227    0.822
## rentburdened_percent  0.088300   0.103752   0.851    0.401
## crowding_percent     -0.079201   0.130455  -0.607    0.548
## eviction_rate         0.129090   0.076465   1.688    0.100
## 
## Residual standard error: 2.029 on 35 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2413, Adjusted R-squared:  0.1546 
## F-statistic: 2.783 on 4 and 35 DF,  p-value: 0.04161

p < 0.05, meaning the housing variables are responsible for psychiatric distress, but none of the variables individually play a huge role, with eviction being the most relevant.

Checking collinearity again.

check_collinearity(model_2)
## # Check for Multicollinearity
## 
## Low Correlation
## 
##                  Term  VIF   VIF 95% CI Increased SE Tolerance Tolerance 95% CI
##      occupied_percent 1.67 [1.28, 2.61]         1.29      0.60     [0.38, 0.78]
##  rentburdened_percent 3.68 [2.48, 5.86]         1.92      0.27     [0.17, 0.40]
##      crowding_percent 2.81 [1.95, 4.43]         1.68      0.36     [0.23, 0.51]
##         eviction_rate 2.35 [1.68, 3.68]         1.53      0.43     [0.27, 0.60]

Shows that rent burden overlaps too much with other factors again

Multi Regression W/O Rent Burden

model_3.1 <- lm(distress_percent_clean ~ occupied_percent + crowding_percent + eviction_rate, data = combined_df3)
summary(model_3.1)
## 
## Call:
## lm(formula = distress_percent_clean ~ occupied_percent + crowding_percent + 
##     eviction_rate, data = combined_df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4096 -1.4082 -0.1602  1.2577  4.0171 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)   
## (Intercept)      3.758088   1.260333   2.982  0.00511 **
## occupied_percent 0.013816   0.019240   0.718  0.47731   
## crowding_percent 0.002753   0.087670   0.031  0.97512   
## eviction_rate    0.171544   0.057730   2.972  0.00525 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.021 on 36 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2256, Adjusted R-squared:  0.1611 
## F-statistic: 3.496 on 3 and 36 DF,  p-value: 0.02524

p value of model is even more significant, and eviction rate is significant, meaning that eviction may affect psychiatric distress.

Depression X Housing Variables - (All Data From 2018)

require(leaflet)
## Loading required package: leaflet
## Warning: package 'leaflet' was built under R version 4.4.3
library(RColorBrewer)

depressionxhousingsf <- uhf42shp %>%
  left_join(combined_df, by = c("UHFCODE" = "geo_id.x"))

depressionxhousingsf <- st_as_sf(depressionxhousingsf)

depressionxhousingsf <- st_transform(depressionxhousingsf, crs = 4326)

# Create color palettes
pal_dep <- colorNumeric("Blues", domain = depressionxhousingsf$depression_percent_clean)
pal_rent <- colorNumeric("Reds", domain = depressionxhousingsf$rentburdened_percent)
pal_crowd <- colorNumeric("Greens", domain = depressionxhousingsf$crowding_percent)
pal_evict <- colorNumeric("Oranges", domain = depressionxhousingsf$eviction_rate)
pal_own <- colorNumeric("Purples", domain = depressionxhousingsf$occupied_percent)

# Build leaflet map
leaflet(depressionxhousingsf) %>%
  addTiles() %>%

  # Depression layer
  addPolygons(
    fillColor = ~pal_dep(depression_percent_clean),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Depression (%)",
    label = ~paste0("Depression: ", depression_percent_clean, "%")
  ) %>%

  # Rent Burden layer
  addPolygons(
    fillColor = ~pal_rent(rentburdened_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Rent Burdened (%)",
    label = ~paste0("Rent Burdened: ", rentburdened_percent, "%")
  ) %>%

  # Crowding layer
  addPolygons(
    fillColor = ~pal_crowd(crowding_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Crowding (%)",
    label = ~paste0("Crowding: ", crowding_percent, "%")
  ) %>%
  
  # Crowding layer
  addPolygons(
    fillColor = ~pal_evict(eviction_rate),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Eviction (%)",
    label = ~paste0("Eviction: ", eviction_rate, "%")
  ) %>%

  # Crowding layer
  addPolygons(
    fillColor = ~pal_own(occupied_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Owner Occupied Homes (%)",
    label = ~paste0("Occupied: ", occupied_percent, "%")
  ) %>%
  
  # Layer control
  addLayersControl(
    baseGroups = c("Depression (%)", "Rent Burdened (%)", "Crowding (%)", "Evictions (%)", "Owner Occupied Homes (%)"),
    options = layersControlOptions(collapsed = FALSE)
  )

Psychiatric Hospitalizations X Housing Variables - (All Data From 2019)

hospitalizationsxhousingsf <- uhf42shp %>%
  left_join(combined_df2, by = c("UHFCODE" = "geo_id.x"))

hospitalizationsxhousingsf <- st_as_sf(hospitalizationsxhousingsf)

hospitalizationsxhousingsf <- st_transform(hospitalizationsxhousingsf, crs = 4326)


# Create color palettes
pal_dep2 <- colorNumeric("Oranges", domain = hospitalizationsxhousingsf$psychillness_rateper100k)
pal_rent <- colorNumeric("Reds", domain = hospitalizationsxhousingsf$rentburdened_percent)
pal_crowd <- colorNumeric("Greens", domain = hospitalizationsxhousingsf$crowding_percent)
pal_evict <- colorNumeric("Blues", domain = hospitalizationsxhousingsf$eviction_rate)
pal_own <- colorNumeric("Purples", domain = hospitalizationsxhousingsf$occupied_percent)

# Build leaflet map
leaflet(hospitalizationsxhousingsf) %>%
  addTiles() %>%

  # Depression layer
  addPolygons(
    fillColor = ~pal_dep2(psychillness_rateper100k),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Psychiatric Hospitalization (%)",
    label = ~paste0("Psychiatric Hospitalizations: ", psychillness_rateper100k, "%")
  ) %>%

  # Rent Burden layer
  addPolygons(
    fillColor = ~pal_rent(rentburdened_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Rent Burdened (%)",
    label = ~paste0("Rent Burdened: ", rentburdened_percent, "%")
  ) %>%

  # Crowding layer
  addPolygons(
    fillColor = ~pal_crowd(crowding_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Crowding (%)",
    label = ~paste0("Crowding: ", crowding_percent, "%")
  ) %>%
  
  # Crowding layer
  addPolygons(
    fillColor = ~pal_evict(eviction_rate),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Eviction (%)",
    label = ~paste0("Eviction: ", eviction_rate, "%")
  ) %>%

  # Crowding layer
  addPolygons(
    fillColor = ~pal_own(occupied_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Owner Occupied Homes (%)",
    label = ~paste0("Occupied: ", occupied_percent, "%")
  ) %>%
  
  # Layer control
  addLayersControl(
    baseGroups = c("Psychiatric Hospitalizations (%)", "Rent Burdened (%)", "Crowding (%)", "Evictions (%)", "Owner Occupied Homes (%)"),
    options = layersControlOptions(collapsed = FALSE)
  )

Distress X Housing Variables - (All Data From 2020)

distressxhousingsf <- uhf42shp %>%
  left_join(combined_df3, by = c("UHFCODE" = "geo_id.x"))

distressxhousingsf <- st_as_sf(distressxhousingsf)

distressxhousingsf <- st_transform(distressxhousingsf, crs = 4326)


# Create color palettes
pal_dep3 <- colorNumeric("Blues", domain = distressxhousingsf$distress_percent_clean)
pal_rent <- colorNumeric("Reds", domain = distressxhousingsf$rentburdened_percent)
pal_crowd <- colorNumeric("Greens", domain = distressxhousingsf$crowding_percent)
pal_evict <- colorNumeric("Oranges", domain = distressxhousingsf$eviction_rate)
pal_own <- colorNumeric("Purples", domain = distressxhousingsf$occupied_percent)

# Build leaflet map
leaflet(distressxhousingsf) %>%
  addTiles() %>%

  # Depression layer
  addPolygons(
    fillColor = ~pal_dep3(distress_percent_clean),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Psychological Distress (%)",
    label = ~paste0("Psychological Distress: ", distress_percent_clean, "%")
  ) %>%

  # Rent Burden layer
  addPolygons(
    fillColor = ~pal_rent(rentburdened_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Rent Burdened (%)",
    label = ~paste0("Rent Burdened: ", rentburdened_percent, "%")
  ) %>%

  # Crowding layer
  addPolygons(
    fillColor = ~pal_crowd(crowding_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Crowding (%)",
    label = ~paste0("Crowding: ", crowding_percent, "%")
  ) %>%
  
  # Crowding layer
  addPolygons(
    fillColor = ~pal_evict(eviction_rate),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Eviction (%)",
    label = ~paste0("Eviction: ", eviction_rate, "%")
  ) %>%

  # Crowding layer
  addPolygons(
    fillColor = ~pal_own(occupied_percent),
    color = "white",
    weight = 1,
    fillOpacity = 0.7,
    group = "Owner Occupied Homes (%)",
    label = ~paste0("Occupied: ", occupied_percent, "%")
  ) %>%
  
  # Layer control
  addLayersControl(
    baseGroups = c("Psychological Distress (%)", "Rent Burdened (%)", "Crowding (%)", "Evictions (%)", "Owner Occupied Homes (%)"),
    options = layersControlOptions(collapsed = FALSE)
  )