Project Report

R Spatial Lab Assignment # 1

Don’t use a single chunk for the entire assignment. Break it into multiple chunks.

task 1:

# Load and clean data

# Adult depression table exported from the NYC EH Data Portal.
# clean_names() rewrites the headers to snake_case (TimePeriod -> time_period).
depression <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Depression (adults) (full table).csv")
)

## Rows: 96 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): TimePeriod, GeoType, Geography, Age-adjusted percent, Percent
## dbl (2): GeoID, GeoRank
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Adult psychiatric hospitalizations table; clean_names() standardizes the
# column names to snake_case.
psychillness <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Psychiatric hospitalizations (adults) (full table).csv")
)

## Rows: 144 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GeoType, Geography
## dbl (3): TimePeriod, GeoID, GeoRank
## num (3): Age-adjusted rate per 100,000 residents, Number, Rate per 100,000 r...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Adult serious psychological distress table; clean_names() standardizes the
# column names to snake_case.
seriousdistress <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Serious psychological distress (adults) (full table).csv")
)

## Rows: 192 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): TimePeriod, GeoType, Geography, Age-adjusted percent, Number, Percent
## dbl (2): GeoID, GeoRank
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Court-ordered evictions table. After clean_names() the rate column is
# addressed as estimated_annual_rate_per_10_000_homes.
evictions <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Evictions (court-ordered) (full table).csv")
)

## Rows: 2122 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): GeoType, Geography, Estimated annual rate per 10,000 homes
## dbl (3): TimePeriod, GeoID, GeoRank
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Household crowding table; clean_names() standardizes the column names to
# snake_case.
crowding <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Household crowding (full table).csv")
)

## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Owner-occupied homes table; clean_names() standardizes the column names to
# snake_case.
occupiedhomes <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Owner-occupied homes (full table).csv")
)

## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Rent-burdened households table; clean_names() standardizes the column names
# to snake_case.
rentburdened <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Rent-burdened households (full table).csv")
)

## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

task 2: Depression X Burden

# Prefix the generic measure columns with their dataset name so the two
# tables' percent/number columns stay distinguishable after joining.
depression <- rename(
  depression,
  depression_percent = percent,
  depression_number = number
)

rentburdened <- rename(
  rentburdened,
  rentburdened_percent = percent,
  rentburdened_number = number
)

# Align the years as closely as possible: depression is only reported for
# 2017-18 and 2021-22, so the 2014-18 rent-burden rows are paired with the
# 2017-18 depression rows.

RBMatching2017 <- filter(rentburdened, time_period == "2014-18")

# The depression percent is stored as text, so keep only the leading
# digits/decimal point and convert that to a number.

Depression2017 <- depression %>%
  filter(time_period == "2017-18") %>%
  mutate(
    depression_percent_clean = as.numeric(str_extract(depression_percent, "^[0-9.]+"))
  )

# Left join by neighborhood (geography type + geography name)

depressionxburden2018 <- Depression2017 %>%
  left_join(RBMatching2017, by = c("geo_type", "geography"))

# Depression data was mostly in UHF42, so we use that

depressionxburden2018UHF42 <- filter(depressionxburden2018, geo_type == "UHF42")

# Regression analysis: depression percent as a linear function of the
# rent-burdened percent

model1 <- lm(
  depression_percent_clean ~ rentburdened_percent,
  data = depressionxburden2018UHF42
)
summary(model1)

## 
## Call:
## lm(formula = depression_percent_clean ~ rentburdened_percent, 
##     data = depressionxburden2018UHF42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.0315 -2.8398 -0.5448  2.3106  9.0990 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)  
## (Intercept)           1.15338    4.54670   0.254   0.8011  
## rentburdened_percent  0.17355    0.08933   1.943   0.0593 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.65 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.08824,    Adjusted R-squared:  0.06486 
## F-statistic: 3.775 on 1 and 39 DF,  p-value: 0.05928

# Scatterplot of depression percent against rent-burdened percent, with the
# fitted linear regression line and its confidence band.

ggplot(depressionxburden2018UHF42, aes(x = rentburdened_percent, y = depression_percent_clean)) +
  geom_point(color = "blue") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "red") +  # Regression line with confidence interval
  labs(
    title = "Depression vs Rent Burden",
    # Both plotted variables are percentages, not counts, so label them as such.
    x = "Rent-Burdened Households (%)",
    y = "Adults with Depression (%)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Mapped data: read the UHF 42 polygon shapefile used as the base layer for
# the maps.

uhf42shp <- st_read(dsn = "C:\\Users\\zim13\\Downloads\\UHF_42_DOHMH\\UHF_42_DOHMH.shp")

## Reading layer `UHF_42_DOHMH' from data source 
##   `C:\Users\zim13\Downloads\UHF_42_DOHMH\UHF_42_DOHMH.shp' using driver `ESRI Shapefile'
## Simple feature collection with 43 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 913090.8 ymin: 120053.5 xmax: 1067310 ymax: 272932
## Projected CRS: NAD83 / New York Long Island (ftUS)

# Attach the depression / rent-burden values to each UHF polygon. geo_id.x is
# the depression table's geo_id (suffixed by the earlier join).
map_data <- uhf42shp %>%
  left_join(depressionxburden2018UHF42, by = c("UHF" = "geo_id.x"))

# Depression map

map1 <- ggplot(data = map_data) +
  geom_sf(mapping = aes(fill = depression_percent_clean)) +
  labs(title = "Depression Rates by Area") +
  scale_fill_viridis_c(name = "Depression") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Rent-burdened map

map2 <- ggplot(data = map_data) +
  geom_sf(mapping = aes(fill = rentburdened_percent)) +
  labs(title = "Rent Burden Rate by Area") +
  scale_fill_viridis_c(name = "Rent Burdened") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Show the two maps side by side, rent burden on the left.
ggarrange(map2, map1, nrow = 1, ncol = 2, align = "h")

Depression X Crowding

# 2014-18 household crowding rows, paired with the 2017-18 depression rows.
crowding2017 <- filter(crowding, time_period == "2014-18")

depressionxcrowding2018 <- Depression2017 %>%
  left_join(crowding2017, by = c("geo_type", "geography"))

depressionxcrowding2018UHF42 <- filter(depressionxcrowding2018, geo_type == "UHF42")

# `percent` is the crowding percent here; the depression percent lives in
# depression_percent / depression_percent_clean.
model2 <- lm(
  depression_percent_clean ~ percent,
  data = depressionxcrowding2018UHF42
)
summary(model2)

## 
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxcrowding2018UHF42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2403 -3.0550 -0.6823  2.2074  9.1148 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.0999     1.2885   6.286 2.07e-07 ***
## percent       0.2068     0.1310   1.578    0.123    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.706 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.06005,    Adjusted R-squared:  0.03595 
## F-statistic: 2.492 on 1 and 39 DF,  p-value: 0.1225

# Depression percent against crowding percent: points plus a linear fit with
# its confidence interval.
ggplot(
  data = depressionxcrowding2018UHF42,
  mapping = aes(x = percent, y = depression_percent_clean)
) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    x = "Rate of Crowding",
    y = "Rate of Depression",
    title = "Depression vs Crowding"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Attach the depression / crowding values to each UHF polygon.
map_data2 <- uhf42shp %>%
  left_join(depressionxcrowding2018UHF42, by = c("UHF" = "geo_id.x"))

# Depression map on a pink-to-red gradient
map1.1 <- ggplot(data = map_data2) +
  geom_sf(mapping = aes(fill = depression_percent_clean)) +
  labs(title = "Depression Rates by Area") +
  scale_fill_gradient(name = "Depression", low = "pink", high = "red") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Crowding map on the same gradient
map3 <- ggplot(data = map_data2) +
  geom_sf(mapping = aes(fill = percent)) +
  labs(title = "Crowding Rates by Area") +
  scale_fill_gradient(name = "Crowding", low = "pink", high = "red") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Side by side, crowding on the left.
ggarrange(map3, map1.1, nrow = 1, ncol = 2, align = "h")

Depression X Evictions

# Eviction rate was not numeric: extract the first number from the text
# column and convert it.

evictions <- evictions %>%
  mutate(
    eviction_rate = as.numeric(str_extract(estimated_annual_rate_per_10_000_homes, "\\d+\\.?\\d*"))
  )

# Keep the 2018 eviction rows to pair with the 2017-18 depression rows.
evictions2018 <- evictions %>%
  filter(time_period == "2018")

depressionxevictions2018 <- left_join(Depression2017, evictions2018, by = c("geo_type", "geography"))

depressionxevictions2018UHF42 <- depressionxevictions2018 %>%
  filter(geo_type == "UHF42")

# Fit and report the eviction regression. The summary must be taken of
# model3; summarising model2 here would re-print the crowding regression.
model3 <- lm(depression_percent_clean ~ eviction_rate, data = depressionxevictions2018UHF42)
summary(model3)

## 
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxcrowding2018UHF42)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.2403 -3.0550 -0.6823  2.2074  9.1148 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.0999     1.2885   6.286 2.07e-07 ***
## percent       0.2068     0.1310   1.578    0.123    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.706 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.06005,    Adjusted R-squared:  0.03595 
## F-statistic: 2.492 on 1 and 39 DF,  p-value: 0.1225

# Depression percent against eviction rate: points plus a linear fit with its
# confidence interval.
ggplot(
  data = depressionxevictions2018UHF42,
  mapping = aes(x = eviction_rate, y = depression_percent_clean)
) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", se = TRUE, color = "yellow") +
  labs(
    x = "Rate of Evictions",
    y = "Rate of Depression",
    title = "Depression vs Evictions"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Attach the depression / eviction values to each UHF polygon.
map_data3 <- uhf42shp %>%
  left_join(depressionxevictions2018UHF42, by = c("UHF" = "geo_id.x"))

# Depression map on a light-yellow-to-yellow gradient
map1.2 <- ggplot(data = map_data3) +
  geom_sf(mapping = aes(fill = depression_percent_clean)) +
  labs(title = "Depression Rates by Area") +
  scale_fill_gradient(name = "Depression", low = "lightyellow", high = "yellow") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Eviction map on the same gradient
map4 <- ggplot(data = map_data3) +
  geom_sf(mapping = aes(fill = eviction_rate)) +
  labs(title = "Eviction Rates by Area") +
  scale_fill_gradient(name = "Evictions", low = "lightyellow", high = "yellow") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Side by side, evictions on the left.
ggarrange(map4, map1.2, nrow = 1, ncol = 2, align = "h")

Depression X Owner Occupied Homes

# 2014-18 owner-occupied rows, already restricted to UHF42 areas.
occupiedhomes2018 <- filter(
  occupiedhomes,
  time_period == "2014-18",
  geo_type == "UHF42"
)

# Join onto the depression rows and keep only the UHF42 areas.
depressionxoccupiedhomes2018 <- Depression2017 %>%
  left_join(occupiedhomes2018, by = c("geo_type", "geography")) %>%
  filter(geo_type == "UHF42")

# `percent` is the owner-occupied percent here.
model4 <- lm(
  depression_percent_clean ~ percent,
  data = depressionxoccupiedhomes2018
)
summary(model4)

## 
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxoccupiedhomes2018)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6796 -2.2080 -0.0741  1.6938  7.1311 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.0117     0.9429  14.860  < 2e-16 ***
## percent      -0.1179     0.0236  -4.996 1.27e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.985 on 39 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3902, Adjusted R-squared:  0.3746 
## F-statistic: 24.96 on 1 and 39 DF,  p-value: 1.269e-05

# Depression percent against owner-occupied percent, with the fitted linear
# regression line and its confidence band.
ggplot(depressionxoccupiedhomes2018, aes(x = percent, y = depression_percent_clean)) +
  geom_point(color = "purple") +              # Scatterplot points
  geom_smooth(method = "lm", se = TRUE, color = "green") +  # Regression line with confidence interval
  labs(
    title = "Depression vs Home Ownership",
    # Both plotted variables are percentages, not counts, so label them as such.
    x = "Owner-Occupied Homes (%)",
    y = "Adults with Depression (%)"
  ) +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Attach the depression / owner-occupied values to each UHF polygon.
map_data4 <- left_join(uhf42shp, depressionxoccupiedhomes2018, by = c("UHF" = "geo_id.x"))

# Depression map on a light-to-dark green gradient
map1.3 <- ggplot(map_data4) +
  geom_sf(aes(fill = depression_percent_clean)) +
  scale_fill_gradient(name = "Depression", low = "lightgreen", high = "darkgreen") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Depression Rates by Area")

# Owner-occupied map on the same gradient (legend title typo "Occpied" fixed)
map5 <- ggplot(map_data4) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Occupied by Owners", low = "lightgreen", high = "darkgreen") +  # Customize the color scale
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rates of Living in Owned Homes")

# Side by side, home ownership on the left.
ggarrange(map5, map1.3, nrow = 1, ncol = 2, align = "h")

All Regressions in One

# Combined dataset: stack the four predictor tables into one long table with
# a shared `percent` column and a `type` label, so all regressions can be
# drawn on one plot.
# The evictions component uses the UHF42-filtered table, like the other three
# components, so every series covers the same set of areas.
# NOTE(review): eviction_rate is a rate per 10,000 homes rather than a
# percent, so it is on a different scale from the other predictors.
combined <- bind_rows(
  depressionxevictions2018UHF42 %>%
    select(depression_percent_clean, percent = eviction_rate) %>%
    mutate(type = "Evictions"),

  depressionxcrowding2018UHF42 %>%
    select(depression_percent_clean, percent = percent) %>%
    mutate(type = "Crowding"),

  depressionxburden2018UHF42 %>%
    select(depression_percent_clean, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),

  depressionxoccupiedhomes2018 %>%
    select(depression_percent_clean, percent) %>%
    mutate(type = "Home Ownership")
)


# One colour per predictor type: semi-transparent points plus a linear fit
# (no confidence band) for each type.
ggplot(
  data = combined,
  mapping = aes(x = percent, y = depression_percent_clean, color = type)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Regression Lines of Depression % by Housing Factors",
    x = "Predictor Percent",
    y = "Depression Percent"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Long table of crowding, rent burden and home ownership (evictions left
# out), each labelled by `type`.
combined2 <- bind_rows(list(
  mutate(
    select(depressionxcrowding2018UHF42, depression_percent_clean, percent),
    type = "Crowding"
  ),
  mutate(
    select(depressionxburden2018UHF42, depression_percent_clean, percent = rentburdened_percent),
    type = "Rent Burden"
  ),
  mutate(
    select(depressionxoccupiedhomes2018, depression_percent_clean, percent),
    type = "Home Ownership"
  )
))

# One colour per predictor type: semi-transparent points plus a linear fit
# (no confidence band) for each type.
ggplot(
  data = combined2,
  mapping = aes(x = percent, y = depression_percent_clean, color = type)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Regression Lines of Depression % by Housing Factors W/O Evictions",
    x = "Predictor Percent",
    y = "Depression Percent"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Long table of crowding and rent burden only, each labelled by `type`.
combined3 <- bind_rows(list(
  mutate(
    select(depressionxcrowding2018UHF42, depression_percent_clean, percent),
    type = "Crowding"
  ),
  mutate(
    select(depressionxburden2018UHF42, depression_percent_clean, percent = rentburdened_percent),
    type = "Rent Burden"
  )
))

# One colour per predictor type: semi-transparent points plus a linear fit
# (no confidence band) for each type.
ggplot(
  data = combined3,
  mapping = aes(x = percent, y = depression_percent_clean, color = type)
) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Regression Lines of Depression % by Housing Factors W/O Evictions/Owner Occupied Homes",
    x = "Predictor Percent",
    y = "Depression Percent"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Project Report

Zim Zim

5/2/2025

Explanation of the template

R Spatial Lab Assignment # 1

task 1:

task 2: Depression X Burden

Depression X Crowding

Depression X Evictions

Depression X Owner Occupied Homes

All Regressions in One