This project focuses on the housing affordability throughout the boroughs of New York City, analyzed through the lens of median rent prices and how neighborhood characteristics predict higher rents.
First, load in libraries.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(corrplot)
## corrplot 0.95 loaded
library(arm)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: lme4
##
## arm (Version 1.14-4, built: 2024-4-1)
## Working directory is C:/Users/withe/OneDrive/Desktop/DIDA 380D/tutorials
##
## Attaching package: 'arm'
## The following object is masked from 'package:corrplot':
##
## corrplot
## The following object is masked from 'package:car':
##
## logit
library(cowplot)
Set working directory, load in the data, and view it.
setwd("C:/Users/withe/OneDrive/Desktop/DIDA 380D/data")
data <- read.csv("coredata.csv")
head(data)
## region_id region_display region_name region_type year adlt_incar_rt
## 1 606 NYC New York City City 2000 1,340.70
## 2 606 NYC New York City City 2001
## 3 606 NYC New York City City 2002
## 4 606 NYC New York City City 2003
## 5 606 NYC New York City City 2004
## 6 606 NYC New York City City 2005 1,048.20
## afford_le030_rct afford_le080_rct afford_le120_rct crime_all_rt crime_prop_rt
## 1 23.1 15.4
## 2 20.3 13.6
## 3 19.3 12.9
## 4 18.4 12.4
## 5 17.7 12.1
## 6 5.2% 43.1% 76.3% 16.9 11.3
## crime_viol_rt gross_rent_0_1beds gross_rent_2_3beds hh_alone_pct
## 1 7.6
## 2 6.8
## 3 6.4
## 4 6.0
## 5 5.7
## 6 5.6 33.1%
## hh_inc_med_adj hh_inc_own_med_adj hh_inc_rent_med_adj hh_u18_pct
## 1 $72,300 34.0%
## 2
## 3
## 4
## 5
## 6 $53,780 $87,930 $41,010 32.7%
## hp_first_fhava_pct hp_first_hi_pct hp_first_orig hp_first_orig_lmi_app_pct
## 1
## 2
## 3
## 4
## 5 2.2% 8.4% 59659 7.2%
## 6 0.6% 20.1% 59,169 5.8%
## hp_first_orig_lmi_nbhd_pct hp_first_orig_rt hpi_1f hpi_4f hpi_al hpi_cn
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 23.9% 48.1 NA NA NA NA
## 6 25.5% 46.8 NA NA NA NA
## hpi_ot income_diversity_ratio lp_all lp_fam14condo_initial lp_fam14condo_rate
## 1 NA NA 7,351 5,168 9.0
## 2 NA NA 7,308 5,185 9.0
## 3 NA NA 7,993 5,701 10.0
## 4 NA NA 7,645 5,287 9.4
## 5 NA NA 6,951 4,885 8.4
## 6 NA 6.1 6,885 5,023 8.3
## lp_fam14condo_repeat med_r_1f med_r_4f med_r_cn med_r_ot nb_permit_res_units
## 1 1,315 $402,880 $225,860 $492,080 $79,360 15,544
## 2 1,379 $437,630 $243,820 $525,270 $86,340 15,744
## 3 1,613 $470,230 $257,750 $574,720 $94,920 18,879
## 4 1,697 $541,530 $304,040 $623,640 $113,210 20,050
## 5 1,544 $603,800 $342,690 $775,140 $130,720 9
## 6 1,400 $667,790 $380,330 $824,920 $152,220 97
## pct_prof_ela pct_prof_math pfn_fam14condo pfn_fam14condo_rate pop_65p_pct
## 1 NA 11.7%
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA 11.9%
## pop_bornstate_pct pop_commute_carfree_pct pop_commute_time_avg
## 1 63.8% 40.0
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 49.5% 67.0% 39.1
## pop_disabled_pct pop_discon_youth_pct pop_edu_collp_pct pop_edu_nohs_pct
## 1 27.4%
## 2
## 3
## 4
## 5
## 6 32.2% 21.0%
## pop_foreign_pct pop_num pop_pov_65p_pct pop_pov_pct pop_pov_u18_pct
## 1 35.9% 8,008,278 21.2%
## 2
## 3
## 4
## 5
## 6 36.6% 7,956,113 20.3% 19.1% 27.7%
## pop_race_asian_pct pop_race_black_pct pop_race_div_idx pop_race_hisp_pct
## 1 9.7% 24.5% 0.74 27.0%
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 11.5% 23.8% 0.73 27.9%
## pop_race_white_pct pop16_laborforce_pct pop16_unemp_pct population_density
## 1 35.0% 9.57% NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 34.5% 62.0% 8.42% NA
## priv_evic_amt_sought_med_adj priv_evic_filing_rt priv_evic_filings
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## prox_park_pct prox_subway_pct rburden_mod_ami_81_120 rburden_mod_ami_le80
## 1
## 2
## 3
## 4
## 5
## 6 21.6% 31.1%
## rburden_sev_ami_81_120 rburden_sev_ami_le80 refi_hi_pct refi_orig
## 1
## 2
## 3
## 4
## 5 15.4% 63811
## 6 3.2% 45.6% 29.9% 59,603
## refi_orig_rt rent_burden_med rent_burden_mod_pct rent_burden_sev_pct
## 1 NA 26.2% 23.7%
## 2 NA
## 3 NA
## 4 NA
## 5 51.4
## 6 47.2 31.0% 23.8% 27.9%
## rent_change rent_gross_med_adj rent_gross_recent_med_adj rent_pct_nycha reo
## 1 $1,290 987
## 2 966
## 3 695
## 4 580
## 5 338
## 6 $1,430 $1,810 189
## serious_viol_rate shd_420c_props shd_420c_units shd_421a_props shd_421a_units
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 NA NA NA NA NA
## 5 47.7 NA NA NA NA
## 6 63.1 NA NA NA NA
## shd_all_sub_props shd_all_sub_units shd_ex_2025_2030_props
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## shd_ex_2025_2030_units shd_ex_2031_2040_props shd_ex_2031_2040_units
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## shd_ex_2041_later_props shd_ex_2041_later_units shd_hud_finins_props
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## shd_hud_finins_units shd_hud_pbrap_props shd_hud_pbrap_units shd_lihtc_props
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## shd_lihtc_units shd_ml_props shd_ml_units shd_nyc_prog_props shd_ph_props
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## shd_ph_units total_viol_rate unit_le50_tar unit_le50_tar_per_capita unit_num
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA 212.4
## 6 NA 313.7 3275412
## unit_occ_own_pct unit_occ_rent_sevcrowd_pct unit_vac_rent_pct units_cert
## 1 30.2% 13,603
## 2 14,289
## 3 15,024
## 4 17,192
## 5 19,052
## 6 33.1% 3.7% 19,688
## volume_1f volume_4f volume_al volume_cn volume_ot voucher_pct
## 1 13,586 13,702 33,454 4842 1324
## 2 13,753 14,165 34,578 5284 1376
## 3 16,933 17,581 44,954 8431 2009
## 4 15,893 16,812 44,992 10421 1866
## 5 16,641 17,442 46,592 10649 1860
## 6 17,134 19,451 50,977 11973 2419
Here, I select the variables of interest. The dependent variable is the median gross rent, and its predictors are: year, percent of total Asian, black, Hispanic, and white populations, the population density, and the amount of all major felonies committed in an area per 1,000 residents.
I load it into data_clean, filtering for the years 2021-2023. I also delete City and Boroughs, since they’re too broad, leaving only Sub-Boroughs and Community Districts. In this data set, these two are pretty much the same, just different methods through which governing bodies split up the NYC region. Later, they’ll all be consolidated into one entry. For now, there are many duplicates which have the same region IDs. In order to better figure out how to analyze them, I select for region_id, region_name, and region_type. Region_display will be used later on to create a borough field.
data_clean <- data %>% dplyr::select(region_id, region_display, region_name,
region_type, year, rent_gross_med_adj,
pop_race_asian_pct,pop_race_black_pct,
pop_race_hisp_pct, pop_race_white_pct,
population_density, crime_all_rt) %>%
filter(year == "2023" | year == "2022" | year == "2021") %>%
filter(region_type != "City" & region_type != "Borough") %>%
arrange(region_id)
head(data_clean)
## region_id region_display region_name
## 1 101 MN 01 Greenwich Village/Financial District
## 2 101 MN 01 Greenwich Village/Financial District
## 3 101 MN 01 Greenwich Village/Financial District
## 4 101 MN 01 Financial District
## 5 101 MN 01 Financial District
## 6 101 MN 01 Financial District
## region_type year rent_gross_med_adj pop_race_asian_pct
## 1 Sub-Borough Area 2021 $3,580 16.6%
## 2 Sub-Borough Area 2022 $3,630 16.1%
## 3 Sub-Borough Area 2023 $3,630 15.8%
## 4 Community District 2021
## 5 Community District 2022
## 6 Community District 2023
## pop_race_black_pct pop_race_hisp_pct pop_race_white_pct population_density
## 1 4.6% 8.4% 66.4% 50.8
## 2 3.7% 9.2% 65.8% 51.6
## 3 4.9% 9.9% 63.9% 52.7
## 4 NA
## 5 NA
## 6 NA
## crime_all_rt
## 1 NA
## 2 NA
## 3 NA
## 4 12.8
## 5 16.8
## 6 17.5
Above you can see the duplicate entries for the Financial District as an example. To combine them, I grouped by region_id and year, to preserve the pair for summarize, and then summarized the duplicate entries into one. All missing values were removed and the first non-missing value was kept. Conveniently, the Community District entries had all NA fields except for the crime rate, which was all that was included. I put it all into one new data frame called data_combined.
Source: https://stackoverflow.com/questions/67050106/how-to-combine-repeated-rows-with-missing-fields-r
data_combined <- data_clean %>%
group_by(region_id, year) %>%
summarize(across(everything(), ~ first(na.omit(.x)))) %>%
drop_na()
## `summarise()` has grouped output by 'region_id'. You can override using the
## `.groups` argument.
head(data_combined)
## # A tibble: 6 × 12
## # Groups: region_id [2]
## region_id year region_display region_name region_type rent_gross_med_adj
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 101 2021 MN 01 Greenwich Villa… Sub-Boroug… $3,580
## 2 101 2022 MN 01 Greenwich Villa… Sub-Boroug… $3,630
## 3 101 2023 MN 01 Greenwich Villa… Sub-Boroug… $3,630
## 4 102 2021 MN 02 Lower East Side… Sub-Boroug… $1,350
## 5 102 2022 MN 02 Lower East Side… Sub-Boroug… $1,420
## 6 102 2023 MN 02 Lower East Side… Sub-Boroug… $1,340
## # ℹ 6 more variables: pop_race_asian_pct <chr>, pop_race_black_pct <chr>,
## # pop_race_hisp_pct <chr>, pop_race_white_pct <chr>,
## # population_density <dbl>, crime_all_rt <dbl>
Then, I converted the region_displays into keeping only their borough, minus the additional number identifier. I did so by using substr(), which extracts substrings in a char. It was simple since each entry began with the abbreviated borough consisting of two letters, so I just kept the first two elements. I added these to a new column called borough.
Source: https://www.rdocumentation.org/packages/base/versions/3.6.2/topics/substr
data_combined$borough <- substr(data_combined$region_display, 1, 2)
head(data_combined)
## # A tibble: 6 × 13
## # Groups: region_id [2]
## region_id year region_display region_name region_type rent_gross_med_adj
## <int> <chr> <chr> <chr> <chr> <chr>
## 1 101 2021 MN 01 Greenwich Villa… Sub-Boroug… $3,580
## 2 101 2022 MN 01 Greenwich Villa… Sub-Boroug… $3,630
## 3 101 2023 MN 01 Greenwich Villa… Sub-Boroug… $3,630
## 4 102 2021 MN 02 Lower East Side… Sub-Boroug… $1,350
## 5 102 2022 MN 02 Lower East Side… Sub-Boroug… $1,420
## 6 102 2023 MN 02 Lower East Side… Sub-Boroug… $1,340
## # ℹ 7 more variables: pop_race_asian_pct <chr>, pop_race_black_pct <chr>,
## # pop_race_hisp_pct <chr>, pop_race_white_pct <chr>,
## # population_density <dbl>, crime_all_rt <dbl>, borough <chr>
Then, I had to turn borough and year into indicator variables as they were categorical.
data_combined <- data_combined %>%
mutate(borough = as.factor(borough)) %>%
mutate(year = as.factor(year))
Then, for the purposes of cleaning up before regression, I removed the no longer needed region identifier variables.
data_combined <- data_combined %>%
dplyr::select(-c(region_display, region_name, region_type))
In order to properly perform a regression, I needed to convert all the strings with percentages, commas, and dollar signs into plain numeric numbers. I did so by selecting the relevant columns and using gsub() to remove the characters, and then as.numeric() to convert them into a usable format. The function is applied to columns 2-8, replacing each unnecessary character with a blank space in the respective column.
Source: https://www.digitalocean.com/community/tutorials/sub-and-gsub-function-r
data_combined <- data_combined %>%
mutate(across(2:8, ~ as.numeric(gsub("[\\$,%]", "", .))))
I created a summary table that displayed the average statistics of each predictor and outcome as well as the median rent, grouped by the borough. It indicated that MN had the highest average rent, density, and crime rates.
summary <- data_combined %>%
group_by(borough) %>%
summarise(avg_rent = mean(rent_gross_med_adj, na.rm=T),
med_rent = median(rent_gross_med_adj, na.rm=T),
avg_asian = mean(pop_race_asian_pct, na.rm=T),
avg_black = mean(pop_race_black_pct, na.rm=T),
avg_hisp = mean(pop_race_hisp_pct, na.rm=T),
avg_white = mean(pop_race_white_pct, na.rm=T),
avg_density = mean(population_density, na.rm=T),
avg_crime = mean(crime_all_rt, na.rm=T))
summary
## # A tibble: 5 × 9
## borough avg_rent med_rent avg_asian avg_black avg_hisp avg_white avg_density
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MN 2258 2120 12.5 13.8 26.4 42.3 74.4
## 2 BX 1470 1505 3.91 27.5 55.7 9.67 43.8
## 3 BK 1921. 1805 11.7 27.0 19.2 35.8 42.9
## 4 QN 1945. 1960 25.9 14.2 28.9 24.6 25.6
## 5 SI 1771. 1750 13.2 8.36 18.5 56.5 9.18
## # ℹ 1 more variable: avg_crime <dbl>
Finally, I was able to move onto the regression itself. This regression indicated that rents don’t vary by much across years compared to the baseline of 2021. Asian, black, and Hispanic percentages has a significant negative impact on rent, while white percentages had a weak/uncertain effect. Interestingly, none of the other predictors were statistically significant besides being in Queens or Staten Island, which both tend to have lower rents than Manhattan. The R-squared values indicate that ~52% of variation in rents is explained by the model, and doesn’t include too many predictors. The F-statistic indicates a high value with a small p-value, meaning the model performs poorly against the null hypothesis, rejecting it.
model <- lm(rent_gross_med_adj ~ year + pop_race_asian_pct + pop_race_black_pct + pop_race_hisp_pct + pop_race_white_pct + population_density + crime_all_rt + borough, data = data_combined)
summary(model)
##
## Call:
## lm(formula = rent_gross_med_adj ~ year + pop_race_asian_pct +
## pop_race_black_pct + pop_race_hisp_pct + pop_race_white_pct +
## population_density + crime_all_rt + borough, data = data_combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -927.07 -184.20 -15.52 197.54 1010.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4371.431 1160.481 3.767 0.000236 ***
## year2022 -5.428 70.125 -0.077 0.938403
## year2023 14.258 70.115 0.203 0.839136
## pop_race_asian_pct -28.976 12.001 -2.414 0.016950 *
## pop_race_black_pct -32.545 12.700 -2.563 0.011361 *
## pop_race_hisp_pct -32.528 11.734 -2.772 0.006265 **
## pop_race_white_pct -15.120 12.169 -1.243 0.215953
## population_density 1.580 1.717 0.920 0.358947
## crime_all_rt 3.223 2.897 1.112 0.267723
## boroughBX -71.931 132.537 -0.543 0.588117
## boroughBK -176.828 106.321 -1.663 0.098344 .
## boroughQN 20.954 138.868 0.151 0.880263
## boroughSI -526.355 185.139 -2.843 0.005084 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 364.6 on 152 degrees of freedom
## Multiple R-squared: 0.5549, Adjusted R-squared: 0.5197
## F-statistic: 15.79 on 12 and 152 DF, p-value: < 2.2e-16
The residuals vs fitted plot has a relatively straight line, meaning the outcome and predictor have a linear relationship. The QQ plot indicates that there is some non-normality in the predictors of the model due to the curving off at the ends. The scale-location plot indicates a slight upward curve, but otherwise has an acceptable homogeneity of variance. Lastly, the residuals vs leverage plot shows that there aren’t any outliers in the data that significantly impact the model.
plot(model)
To test for multicollinearity, I used vif() on the model. I found that the race population percentages had extremely high values of GVIF, indicating that they were extremely predictable with each other. This makes sense, as each percentage takes away from the other. In order to fix this, I’ll drop white percentage from the model so that every other percentage uses it as a baseline instead of depending on one another.
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## year 1.023543 2 1.005835
## pop_race_asian_pct 26.220555 1 5.120601
## pop_race_black_pct 75.376061 1 8.681939
## pop_race_hisp_pct 59.663593 1 7.724221
## pop_race_white_pct 83.927760 1 9.161210
## population_density 2.199829 1 1.483182
## crime_all_rt 1.371548 1 1.171131
## borough 9.922496 4 1.332225
In order to check for normality, I created histograms for each predictor and the outcome. I noticed some of them like crime or black pct were heavily skewed, while most of them were relatively flat.
hist_data <- data_combined %>% dplyr::select(pop_race_asian_pct,pop_race_black_pct,pop_race_hisp_pct, population_density,crime_all_rt, rent_gross_med_adj)
## Adding missing grouping variables: `region_id`
ggplot(gather(hist_data), aes(value)) +
geom_histogram(bins = 30, fill = "skyblue") +
facet_wrap(~key, scales = 'free_x')+
theme_dark()
I ran a boxcox() function on all predictors and the outcome to determine the best transformation to make them more normal. Most of them either indicated that they required logs or a sqrt.
boxcox(lm(pop_race_asian_pct~1, data = data_combined))
boxcox(lm(pop_race_black_pct~1, data = data_combined))
boxcox(lm(pop_race_hisp_pct~1, data = data_combined))
boxcox(lm(population_density~1, data = data_combined))
boxcox(lm(crime_all_rt~1, data = data_combined))
boxcox(lm(rent_gross_med_adj~1, data = data_combined))
I tested all the possible likely transformations, and landed on the following to make the plots more normal.
a <- ggplot(hist_data, aes(log(pop_race_asian_pct))) +
geom_histogram(bins = 30, fill = "skyblue") +
theme_dark()
b <- ggplot(hist_data, aes(log(pop_race_black_pct))) +
geom_histogram(bins = 30, fill = "skyblue") +
theme_dark()
c <- ggplot(hist_data, aes(log(pop_race_hisp_pct))) +
geom_histogram(bins = 30, fill = "skyblue") +
theme_dark()
d <- ggplot(hist_data, aes(log(population_density))) +
geom_histogram(bins = 30, fill = "skyblue") +
theme_dark()
e <- ggplot(hist_data, aes(log(crime_all_rt))) +
geom_histogram(bins = 30, fill = "skyblue") +
theme_dark()
f <- ggplot(hist_data, aes(log(rent_gross_med_adj))) +
geom_histogram(bins = 30, fill = "skyblue") +
theme_dark()
plot_grid(a, b, c, d, e, f,
nrow = 2,
labels = c("A", "B", "C", "D", "E", "F"))
The new model mostly indicates a slight improvement in the QQ plot, scale-location, and the leverage plot. The points are less clustered on the leverage plot and the scale-location plot doesn’t curve up as much anymore.
model2 <- lm(log(rent_gross_med_adj) ~ year + log(pop_race_asian_pct) + log(pop_race_black_pct) + log(pop_race_hisp_pct) + log(population_density) + log(crime_all_rt) + borough, data = data_combined)
plot(model2)
Here, I confirmed that the new model doesn’t have any crazy multicollinearity issues.
vif(model2)
## GVIF Df GVIF^(1/(2*Df))
## year 1.072232 2 1.017588
## log(pop_race_asian_pct) 2.642762 1 1.625657
## log(pop_race_black_pct) 1.843886 1 1.357898
## log(pop_race_hisp_pct) 1.806260 1 1.343972
## log(population_density) 2.703565 1 1.644252
## log(crime_all_rt) 1.978462 1 1.406578
## borough 9.583876 4 1.326455
The new model now assumes a baseline of white populations in Manhattan in 2021. While only the black percentage, Hispanic percentage, and crime percentages have a significant effect on rent, it seems that population density and borough location does not play as big of a role. It’s also interesting that being in Staten Island and Asian percentages mattered less after controlling for these other transformed factors.
summary(model2)
##
## Call:
## lm(formula = log(rent_gross_med_adj) ~ year + log(pop_race_asian_pct) +
## log(pop_race_black_pct) + log(pop_race_hisp_pct) + log(population_density) +
## log(crime_all_rt) + borough, data = data_combined)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.54701 -0.08997 0.00928 0.09170 0.41989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.83170 0.21475 36.469 < 2e-16 ***
## year2022 -0.02785 0.03658 -0.761 0.4476
## year2023 -0.02613 0.03659 -0.714 0.4762
## log(pop_race_asian_pct) 0.03287 0.02176 1.511 0.1329
## log(pop_race_black_pct) -0.08113 0.01652 -4.911 2.31e-06 ***
## log(pop_race_hisp_pct) -0.18495 0.03022 -6.121 7.50e-09 ***
## log(population_density) 0.03185 0.03698 0.861 0.3903
## log(crime_all_rt) 0.12462 0.04019 3.101 0.0023 **
## boroughBX -0.03688 0.06871 -0.537 0.5922
## boroughBK -0.02427 0.05160 -0.470 0.6387
## boroughQN 0.02730 0.06541 0.417 0.6770
## boroughSI -0.09583 0.11160 -0.859 0.3919
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1869 on 153 degrees of freedom
## Multiple R-squared: 0.4994, Adjusted R-squared: 0.4634
## F-statistic: 13.88 on 11 and 153 DF, p-value: < 2.2e-16
hist_data %>%
mutate(id = row_number()) %>%
gather(variable, value, -id, -rent_gross_med_adj) %>%
ggplot(aes(y = rent_gross_med_adj, x = log(value)))+
geom_point(color = "green3")+
geom_smooth(method = "lm", color = "black")+
facet_wrap(~variable, scales = "free_x")+
theme_dark()+
labs(y = "Gross Rent", x = "Variable")
## `geom_smooth()` using formula = 'y ~ x'