Update the title with your information. Make sure to include identification information so that we know it is your submission.
Also update the author name and date accordingly.
Check out the source code from the `</> Code` menu in the top-right corner.
Don’t use a single chunk for the entire assignment. Break it into multiple chunks.
# Load and clean data ----
# Adult depression table. clean_names() normalises the column headers
# (e.g. TimePeriod becomes time_period, the form the later filters rely on).
depression <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Depression (adults) (full table).csv")
)
## Rows: 96 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): TimePeriod, GeoType, Geography, Age-adjusted percent, Percent
## dbl (2): GeoID, GeoRank
## num (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Adult psychiatric hospitalizations table, with normalised column names.
psychillness <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Psychiatric hospitalizations (adults) (full table).csv")
)
## Rows: 144 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): GeoType, Geography
## dbl (3): TimePeriod, GeoID, GeoRank
## num (3): Age-adjusted rate per 100,000 residents, Number, Rate per 100,000 r...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Adult serious psychological distress table, with normalised column names.
seriousdistress <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Serious psychological distress (adults) (full table).csv")
)
## Rows: 192 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): TimePeriod, GeoType, Geography, Age-adjusted percent, Number, Percent
## dbl (2): GeoID, GeoRank
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Court-ordered evictions table, with normalised column names.
evictions <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Evictions (court-ordered) (full table).csv")
)
## Rows: 2122 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): GeoType, Geography, Estimated annual rate per 10,000 homes
## dbl (3): TimePeriod, GeoID, GeoRank
## num (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Household crowding table, with normalised column names.
crowding <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Household crowding (full table).csv")
)
## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Owner-occupied homes table, with normalised column names.
occupiedhomes <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Owner-occupied homes (full table).csv")
)
## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rent-burdened households table, with normalised column names.
rentburdened <- clean_names(
  read_csv("C:\\Users\\zim13\\Downloads\\NYC EH Data Portal - Rent-burdened households (full table).csv")
)
## Rows: 3758 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): TimePeriod, GeoType, Geography
## dbl (3): GeoID, GeoRank, Percent
## num (1): Number
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Give the generic `percent` / `number` columns dataset-specific names so
# they stay distinguishable once the two tables are joined.
depression <- rename(
  depression,
  depression_percent = percent,
  depression_number = number
)
rentburdened <- rename(
  rentburdened,
  rentburdened_percent = percent,
  rentburdened_number = number
)
# Align the reporting periods as closely as the data allows: depression is
# only published for 2017-18 and 2021-22, so it is paired with the 2014-18
# rent-burden period.
RBMatching2017 <- filter(rentburdened, time_period == "2014-18")

# The depression percent column is read as text, so keep only its leading
# run of digits / decimal points and convert that to a number.
Depression2017 <- depression %>%
  filter(time_period == "2017-18") %>%
  mutate(
    depression_percent_clean = as.numeric(
      str_extract(depression_percent, "^[0-9.]+")
    )
  )

# Match the two tables on geography type and neighborhood name.
depressionxburden2018 <- Depression2017 %>%
  left_join(RBMatching2017, by = c("geo_type", "geography"))

# Depression is mostly reported at UHF42 level, so analyse that level only.
depressionxburden2018UHF42 <- filter(
  depressionxburden2018,
  geo_type == "UHF42"
)

# Regression analysis ----
# Simple linear regression of depression percent on rent-burden percent.
model1 <- lm(
  formula = depression_percent_clean ~ rentburdened_percent,
  data = depressionxburden2018UHF42
)
summary(model1)
##
## Call:
## lm(formula = depression_percent_clean ~ rentburdened_percent,
## data = depressionxburden2018UHF42)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.0315 -2.8398 -0.5448 2.3106 9.0990
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.15338 4.54670 0.254 0.8011
## rentburdened_percent 0.17355 0.08933 1.943 0.0593 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.65 on 39 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.08824, Adjusted R-squared: 0.06486
## F-statistic: 3.775 on 1 and 39 DF, p-value: 0.05928
# Scatterplot of depression against rent burden, with a fitted linear trend.
# Both plotted variables are percentages (rentburdened_percent and
# depression_percent_clean), so the axes are labelled as percentages rather
# than as counts of people.
ggplot(
  depressionxburden2018UHF42,
  aes(x = rentburdened_percent, y = depression_percent_clean)
) +
  geom_point(color = "blue") + # One point per row of the UHF42 table
  geom_smooth(method = "lm", se = TRUE, color = "red") + # Linear fit with confidence band
  labs(
    title = "Depression vs Rent Burden",
    x = "Rent-Burdened Households (%)",
    y = "Adults with Depression (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Mapped data ----
# Read the UHF 42 polygons that the choropleth maps are drawn on.
uhf42shp <- st_read(
  dsn = "C:\\Users\\zim13\\Downloads\\UHF_42_DOHMH\\UHF_42_DOHMH.shp"
)
## Reading layer `UHF_42_DOHMH' from data source
## `C:\Users\zim13\Downloads\UHF_42_DOHMH\UHF_42_DOHMH.shp' using driver `ESRI Shapefile'
## Simple feature collection with 43 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 913090.8 ymin: 120053.5 xmax: 1067310 ymax: 272932
## Projected CRS: NAD83 / New York Long Island (ftUS)
# Attach the depression / rent-burden table to the polygons. `geo_id.x` is
# the depression table's geo_id, suffixed by the earlier join.
map_data <- uhf42shp %>%
  left_join(depressionxburden2018UHF42, by = c("UHF" = "geo_id.x"))

# Rent-burden choropleth.
map2 <- ggplot(data = map_data) +
  geom_sf(mapping = aes(fill = rentburdened_percent)) +
  labs(title = "Rent Burden Rate by Area") +
  scale_fill_viridis_c(name = "Rent Burdened") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Depression choropleth.
map1 <- ggplot(data = map_data) +
  geom_sf(mapping = aes(fill = depression_percent_clean)) +
  labs(title = "Depression Rates by Area") +
  scale_fill_viridis_c(name = "Depression") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Show the two maps side by side, rent burden on the left.
ggarrange(map2, map1, ncol = 2, nrow = 1, align = "h")
# Crowding for the 2014-18 period, joined to 2017-18 depression by
# geography type and neighborhood name, then restricted to UHF42 rows.
crowding2017 <- filter(crowding, time_period == "2014-18")
depressionxcrowding2018 <- Depression2017 %>%
  left_join(crowding2017, by = c("geo_type", "geography"))
depressionxcrowding2018UHF42 <- filter(
  depressionxcrowding2018,
  geo_type == "UHF42"
)

# Simple linear regression of depression percent on crowding percent.
model2 <- lm(
  formula = depression_percent_clean ~ percent,
  data = depressionxcrowding2018UHF42
)
summary(model2)
##
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxcrowding2018UHF42)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2403 -3.0550 -0.6823 2.2074 9.1148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.0999 1.2885 6.286 2.07e-07 ***
## percent 0.2068 0.1310 1.578 0.123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.706 on 39 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.06005, Adjusted R-squared: 0.03595
## F-statistic: 2.492 on 1 and 39 DF, p-value: 0.1225
# Scatterplot of depression against crowding with a fitted linear trend.
ggplot(
  data = depressionxcrowding2018UHF42,
  mapping = aes(x = percent, y = depression_percent_clean)
) +
  theme_minimal() +
  labs(
    x = "Rate of Crowding",
    y = "Rate of Depression",
    title = "Depression vs Crowding"
  ) +
  geom_point(colour = "red") + # Observed values
  geom_smooth(method = "lm", se = TRUE, colour = "blue") # Linear fit with confidence band
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Attach the depression / crowding table to the UHF 42 polygons.
map_data2 <- uhf42shp %>%
  left_join(depressionxcrowding2018UHF42, by = c("UHF" = "geo_id.x"))

# Crowding choropleth.
map3 <- ggplot(data = map_data2) +
  geom_sf(mapping = aes(fill = percent)) +
  labs(title = "Crowding Rates by Area") +
  scale_fill_gradient(name = "Crowding", low = "pink", high = "red") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Depression choropleth on the same pink-to-red scale.
map1.1 <- ggplot(data = map_data2) +
  geom_sf(mapping = aes(fill = depression_percent_clean)) +
  labs(title = "Depression Rates by Area") +
  scale_fill_gradient(name = "Depression", low = "pink", high = "red") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Side by side, crowding on the left.
ggarrange(map3, map1.1, ncol = 2, nrow = 1, align = "h")
# The eviction rate column is read as text, so extract the first number
# (digits with an optional decimal part) and convert it to numeric.
# NOTE(review): a value with a thousands separator (e.g. "1,234.5") would be
# cut off at the comma by this pattern; confirm none occur in the data.
evictions <- evictions %>%
  mutate(
    eviction_rate = as.numeric(
      str_extract(estimated_annual_rate_per_10_000_homes, "\\d+\\.?\\d*")
    )
  )

# 2018 evictions joined to 2017-18 depression by geography type and
# neighborhood name, then restricted to UHF42 rows.
evictions2018 <- evictions %>%
  filter(time_period == "2018")
depressionxevictions2018 <- left_join(
  Depression2017,
  evictions2018,
  by = c("geo_type", "geography")
)
depressionxevictions2018UHF42 <- depressionxevictions2018 %>%
  filter(geo_type == "UHF42")

# Simple linear regression of depression percent on eviction rate.
model3 <- lm(
  depression_percent_clean ~ eviction_rate,
  data = depressionxevictions2018UHF42
)
# Report the eviction model. This previously called summary(model2), which
# re-printed the crowding regression, so any printed output captured from
# that run shows the crowding model and must be regenerated.
summary(model3)
##
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxcrowding2018UHF42)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2403 -3.0550 -0.6823 2.2074 9.1148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.0999 1.2885 6.286 2.07e-07 ***
## percent 0.2068 0.1310 1.578 0.123
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.706 on 39 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.06005, Adjusted R-squared: 0.03595
## F-statistic: 2.492 on 1 and 39 DF, p-value: 0.1225
# Scatterplot of depression against eviction rate with a fitted linear trend.
ggplot(
  data = depressionxevictions2018UHF42,
  mapping = aes(x = eviction_rate, y = depression_percent_clean)
) +
  theme_minimal() +
  labs(
    x = "Rate of Evictions",
    y = "Rate of Depression",
    title = "Depression vs Evictions"
  ) +
  geom_point(colour = "black") + # Observed values
  geom_smooth(method = "lm", se = TRUE, colour = "yellow") # Linear fit with confidence band
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Attach the depression / evictions table to the UHF 42 polygons.
map_data3 <- uhf42shp %>%
  left_join(depressionxevictions2018UHF42, by = c("UHF" = "geo_id.x"))

# Eviction-rate choropleth on a light-yellow-to-yellow scale.
map4 <- ggplot(data = map_data3) +
  geom_sf(mapping = aes(fill = eviction_rate)) +
  labs(title = "Eviction Rates by Area") +
  scale_fill_gradient(name = "Evictions", low = "lightyellow", high = "yellow") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Depression choropleth on the same colour scale.
map1.2 <- ggplot(data = map_data3) +
  geom_sf(mapping = aes(fill = depression_percent_clean)) +
  labs(title = "Depression Rates by Area") +
  scale_fill_gradient(name = "Depression", low = "lightyellow", high = "yellow") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Side by side, evictions on the left.
ggarrange(map4, map1.2, ncol = 2, nrow = 1, align = "h")
# Owner-occupied homes for 2014-18 at UHF42 level.
occupiedhomes2018 <- filter(
  occupiedhomes,
  time_period == "2014-18" & geo_type == "UHF42"
)

# Join onto 2017-18 depression and drop the non-UHF42 depression rows that
# the left join keeps.
depressionxoccupiedhomes2018 <- Depression2017 %>%
  left_join(occupiedhomes2018, by = c("geo_type", "geography")) %>%
  filter(geo_type == "UHF42")

# Simple linear regression of depression percent on owner-occupied percent.
model4 <- lm(
  formula = depression_percent_clean ~ percent,
  data = depressionxoccupiedhomes2018
)
summary(model4)
##
## Call:
## lm(formula = depression_percent_clean ~ percent, data = depressionxoccupiedhomes2018)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6796 -2.2080 -0.0741 1.6938 7.1311
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.0117 0.9429 14.860 < 2e-16 ***
## percent -0.1179 0.0236 -4.996 1.27e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.985 on 39 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3902, Adjusted R-squared: 0.3746
## F-statistic: 24.96 on 1 and 39 DF, p-value: 1.269e-05
# Scatterplot of depression against owner occupancy, with a fitted linear
# trend. Both plotted variables are percentages (`percent` from the
# owner-occupied homes table and depression_percent_clean), so the axes are
# labelled as percentages rather than as counts.
ggplot(
  depressionxoccupiedhomes2018,
  aes(x = percent, y = depression_percent_clean)
) +
  geom_point(color = "purple") + # One point per row of the joined table
  geom_smooth(method = "lm", se = TRUE, color = "green") + # Linear fit with confidence band
  labs(
    title = "Depression vs Home Ownership",
    x = "Owner-Occupied Homes (%)",
    y = "Adults with Depression (%)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Attach the depression / owner-occupied table to the UHF 42 polygons.
map_data4 <- left_join(
  uhf42shp,
  depressionxoccupiedhomes2018,
  by = c("UHF" = "geo_id.x")
)

# Depression choropleth on a light-to-dark green scale.
map1.3 <- ggplot(map_data4) +
  geom_sf(aes(fill = depression_percent_clean)) +
  scale_fill_gradient(name = "Depression", low = "lightgreen", high = "darkgreen") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Depression Rates by Area")

# Owner-occupancy choropleth on the same scale. The legend title previously
# read "Occpied by Owners"; the spelling is corrected here.
map5 <- ggplot(map_data4) +
  geom_sf(aes(fill = percent)) +
  scale_fill_gradient(name = "Occupied by Owners", low = "lightgreen", high = "darkgreen") +
  theme_minimal() +
  theme(legend.position = "bottom") +
  labs(title = "Rates of Living in Owned Homes")

# Side by side, owner occupancy on the left.
ggarrange(map5, map1.3, nrow = 1, ncol = 2, align = "h")
# Combined dataset ----
# Stack the four predictor tables into one long table with a shared
# `percent` column and a `type` label, so one plot can show a regression
# line per housing factor.
# The evictions slice now uses the UHF42-filtered table like the other three;
# it previously used depressionxevictions2018, which still holds every
# geography type present in the depression data.
combined <- bind_rows(
  depressionxevictions2018UHF42 %>%
    select(depression_percent_clean, percent = eviction_rate) %>%
    mutate(type = "Evictions"),
  depressionxcrowding2018UHF42 %>%
    select(depression_percent_clean, percent) %>%
    mutate(type = "Crowding"),
  depressionxburden2018UHF42 %>%
    select(depression_percent_clean, percent = rentburdened_percent) %>%
    mutate(type = "Rent Burden"),
  depressionxoccupiedhomes2018 %>%
    select(depression_percent_clean, percent) %>%
    mutate(type = "Home Ownership")
)

# One colour and one fitted line per housing factor. Evictions are a rate
# per 10,000 homes rather than a percent, which the x-axis label states.
ggplot(combined, aes(x = percent, y = depression_percent_clean, color = type)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "Predictor Value (percent; evictions per 10,000 homes)",
    y = "Depression Percent",
    title = "Regression Lines of Depression % by Housing Factors"
  ) +
  theme_minimal() +
  theme(legend.title = element_blank())
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Long table of the three percent-based predictors (evictions left out),
# each slice labelled with its housing factor in `type`.
combined2 <- bind_rows(
  mutate(
    select(depressionxcrowding2018UHF42, depression_percent_clean, percent),
    type = "Crowding"
  ),
  mutate(
    select(
      depressionxburden2018UHF42,
      depression_percent_clean,
      percent = rentburdened_percent
    ),
    type = "Rent Burden"
  ),
  mutate(
    select(depressionxoccupiedhomes2018, depression_percent_clean, percent),
    type = "Home Ownership"
  )
)

# One colour and one fitted line per housing factor.
ggplot(
  data = combined2,
  mapping = aes(x = percent, y = depression_percent_clean, colour = type)
) +
  theme_minimal() +
  theme(legend.title = element_blank()) +
  labs(
    title = "Regression Lines of Depression % by Housing Factors W/O Evictions",
    x = "Predictor Percent",
    y = "Depression Percent"
  ) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Long table of crowding and rent burden only, each slice labelled with its
# housing factor in `type`.
combined3 <- bind_rows(
  mutate(
    select(depressionxcrowding2018UHF42, depression_percent_clean, percent),
    type = "Crowding"
  ),
  mutate(
    select(
      depressionxburden2018UHF42,
      depression_percent_clean,
      percent = rentburdened_percent
    ),
    type = "Rent Burden"
  )
)

# One colour and one fitted line per housing factor.
ggplot(
  data = combined3,
  mapping = aes(x = percent, y = depression_percent_clean, colour = type)
) +
  theme_minimal() +
  theme(legend.title = element_blank()) +
  labs(
    title = "Regression Lines of Depression % by Housing Factors W/O Evictions/Owner Occupied Homes",
    x = "Predictor Percent",
    y = "Depression Percent"
  ) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).