library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach broom (used for glance()) and dplyr with library() rather than
# require(): library() stops with an error if a package is not installed,
# whereas require() only warns and returns FALSE, deferring the failure to
# the first call that needs the package.
library(broom)
library(dplyr)
library(modelr)
##
## Attaching package: 'modelr'
##
## The following object is masked from 'package:broom':
##
## bootstrap
# Read the housing CSV from its remote location, then print the tibble
housing <- read_csv(file = "https://bit.ly/49QWVuc")
housing
## Rows: 2626 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Neighborhood, Building.Classification, Boro
## dbl (10): Total.Units, Year.Built, Gross.SqFt, Estimated.Gross.Income, Gross...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2,626 × 13
## Neighborhood Building.Classification Total.Units Year.Built Gross.SqFt
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 FINANCIAL R9-CONDOMINIUM 42 1920 36500
## 2 FINANCIAL R4-CONDOMINIUM 78 1985 126420
## 3 FINANCIAL RR-CONDOMINIUM 500 NA 554174
## 4 FINANCIAL R4-CONDOMINIUM 282 1930 249076
## 5 TRIBECA R4-CONDOMINIUM 239 1985 219495
## 6 TRIBECA R4-CONDOMINIUM 133 1986 139719
## 7 TRIBECA R4-CONDOMINIUM 109 1985 105000
## 8 TRIBECA R4-CONDOMINIUM 107 1986 87479
## 9 TRIBECA R4-CONDOMINIUM 247 1987 255845
## 10 TRIBECA R4-CONDOMINIUM 121 1985 106129
## # ℹ 2,616 more rows
## # ℹ 8 more variables: Estimated.Gross.Income <dbl>,
## # Gross.Income.per.SqFt <dbl>, Estimated.Expense <dbl>,
## # Expense.per.SqFt <dbl>, Net.Operating.Income <dbl>,
## # Full.Market.Value <dbl>, Market.Value.per.SqFt <dbl>, Boro <chr>
# Keep only the four modelling variables, giving each a shorter name
# (select() renames while selecting, so no separate rename() step is needed)
housing <- housing %>%
  select(
    value = Market.Value.per.SqFt,
    boro = Boro,
    units = Total.Units,
    SqFt = Gross.SqFt
  )
Model 1: value = 𝛼 + 𝛽 * boro + ε
Model 2: value = 𝛼 + 𝛽1 * boro + 𝛽2 * units + 𝛽3 * SqFt + ε
# Model 1: regress market value per square foot on borough only,
# then print the coefficient table and fit statistics
model1 <- lm(formula = value ~ boro, data = housing)
summary(object = model1)
##
## Call:
## lm(formula = value ~ boro, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.891 -23.033 -0.223 27.580 262.661
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.932 5.332 8.989 < 2e-16 ***
## boroBrooklyn 31.961 5.583 5.725 1.15e-08 ***
## boroManhattan 132.648 5.464 24.277 < 2e-16 ***
## boroQueens 29.526 5.740 5.144 2.89e-07 ***
## boroStaten Island -6.070 10.193 -0.596 0.552
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.29 on 2621 degrees of freedom
## Multiple R-squared: 0.5831, Adjusted R-squared: 0.5825
## F-statistic: 916.5 on 4 and 2621 DF, p-value: < 2.2e-16
# Model 2: extend Model 1 with the number of units and the gross square
# footage as additional predictors, then print the regression summary
model2 <- lm(formula = value ~ boro + units + SqFt, data = housing)
summary(object = model2)
##
## Call:
## lm(formula = value ~ boro + units + SqFt, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -164.418 -22.692 1.416 26.972 261.122
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.329e+01 5.330e+00 8.122 6.97e-16 ***
## boroBrooklyn 3.456e+01 5.535e+00 6.244 4.95e-10 ***
## boroManhattan 1.310e+02 5.385e+00 24.327 < 2e-16 ***
## boroQueens 3.299e+01 5.663e+00 5.827 6.35e-09 ***
## boroStaten Island -3.630e+00 9.993e+00 -0.363 0.716
## units -1.881e-01 2.210e-02 -8.511 < 2e-16 ***
## SqFt 2.103e-04 2.087e-05 10.079 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.35 on 2619 degrees of freedom
## Multiple R-squared: 0.6009, Adjusted R-squared: 0.6
## F-statistic: 657.2 on 6 and 2619 DF, p-value: < 2.2e-16
According to the results below, R² increases from 0.583 (model1) to 0.601 (model2), and adjusted R² increases from 0.582 to 0.600, so model2 fits better than model1 even after accounting for the extra predictors. It can therefore be said that the size and the number of units did ‘add value’ to the prediction.
# Compare the fit of the 2 models: stack the one-row glance() summary of
# each model into a single plain data frame (row 1 = model1, row 2 = model2)
model12_fit <- list(model1, model2) %>%
  lapply(glance) %>%
  bind_rows() %>%
  as.data.frame()
# Display every fit statistic rounded to three decimal places
round(model12_fit, digits = 3)
## r.squared adj.r.squared sigma statistic p.value df logLik AIC
## 1 0.583 0.582 44.293 916.473 0 4 -13678.32 27368.63
## 2 0.601 0.600 43.353 657.220 0 6 -13621.02 27258.03
## BIC deviance df.residual nobs
## 1 27403.87 5141963 2621 2626
## 2 27305.02 4922389 2619 2626
According to the results below, model2 predicts a market value of 164.1197 per Sq Ft for a condominium in Manhattan with 300 units and 220000 Sq Ft.
# Predict the market value of the condominium: build a one-row tibble
# describing it and append model2's fitted value as the `pred` column
prediction <- add_predictions(
  data = tibble(boro = "Manhattan", SqFt = 220000, units = 300),
  model = model2
)
prediction
## # A tibble: 1 × 4
## boro SqFt units pred
## <chr> <dbl> <dbl> <dbl>
## 1 Manhattan 220000 300 164.
Answer: Based on the results above, I would
recommend investing in this condominium, since its predicted value of
164.1197 per Sq Ft is higher than 140 per Sq Ft.