Data from the tax authority of the City of Boston to perform a re-assessment of Taxes of residential single family homes in the greater Boston area. I am creating and comparing two linear regression models to predict the av_total (assessed value) of these properties in the greater Boston area. I will be using the following data sets:
Boston.csv Boston_1k.csv Boston_zips.csv
options(scipen = 999)
library(tidyverse) # our main library
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(broom) # clean up the messy output of built-in functions in R return tibbles
## Warning: package 'broom' was built under R version 4.0.4
library(modelr) # contains several often-used model quality metrics
##
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
##
## bootstrap
library(skimr) # quick and dirty profile of data
library(janitor) # clean up any column names
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(vip) # make a nice variable importance plot
## Warning: package 'vip' was built under R version 4.0.4
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
boston_train <- read_csv("Boston_2020.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_character(),
## PID = col_double(),
## AV_TOTAL = col_double(),
## LAND_SF = col_double(),
## YR_BUILT = col_double(),
## YR_REMOD = col_double(),
## LIVING_AREA = col_double(),
## NUM_FLOORS = col_double(),
## STRUCTURE_CLASS = col_logical(),
## R_TOTAL_RMS = col_double(),
## R_BDRMS = col_double(),
## R_FULL_BTH = col_double(),
## R_HALF_BTH = col_double(),
## R_FPLACE = col_double()
## )
## i Use `spec()` for the full column specifications.
boston_predictions <- read.csv("Boston_1k.csv")
boston_join_demographics <- read.csv("Boston_zips.csv")
boston_train %>%
skim_without_charts()
| Name | Piped data |
| Number of rows | 13479 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 15 |
| logical | 1 |
| numeric | 12 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ZIPCODE | 0 | 1 | 5 | 5 | 0 | 5 | 0 |
| OWN_OCC | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| R_BLDG_STYL | 0 | 1 | 9 | 17 | 0 | 16 | 0 |
| R_ROOF_TYP | 0 | 1 | 7 | 11 | 0 | 7 | 0 |
| R_EXT_FIN | 0 | 1 | 9 | 17 | 0 | 10 | 0 |
| R_BTH_STYLE | 0 | 1 | 10 | 17 | 0 | 4 | 0 |
| R_KITCH | 0 | 1 | 20 | 20 | 0 | 4 | 0 |
| R_KITCH_STYLE | 0 | 1 | 10 | 17 | 0 | 4 | 0 |
| R_HEAT_TYP | 0 | 1 | 8 | 18 | 0 | 7 | 0 |
| R_AC | 0 | 1 | 8 | 15 | 0 | 3 | 0 |
| R_EXT_CND | 0 | 1 | 8 | 13 | 0 | 5 | 0 |
| R_OVRALL_CND | 0 | 1 | 8 | 13 | 0 | 5 | 0 |
| R_INT_CND | 0 | 1 | 8 | 13 | 0 | 5 | 0 |
| R_INT_FIN | 0 | 1 | 10 | 15 | 0 | 3 | 0 |
| R_VIEW | 0 | 1 | 8 | 13 | 0 | 5 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| STRUCTURE_CLASS | 13479 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| PID | 0 | 1.00 | 1853541231.84 | 184340306.35 | 702991020 | 1805608500.0 | 1900692000 | 2004482500 | 2012270010 |
| AV_TOTAL | 0 | 1.00 | 514393.30 | 180966.94 | 135400 | 393000.0 | 477000 | 585200 | 1490200 |
| LAND_SF | 0 | 1.00 | 5896.93 | 2880.59 | 0 | 4332.0 | 5325 | 6750 | 107158 |
| YR_BUILT | 2 | 1.00 | 1934.21 | 30.46 | 1710 | 1910.0 | 1932 | 1955 | 2018 |
| YR_REMOD | 8676 | 0.36 | 2003.29 | 14.14 | 1900 | 1998.0 | 2007 | 2014 | 2019 |
| LIVING_AREA | 0 | 1.00 | 1682.06 | 575.42 | 403 | 1306.0 | 1567 | 1939 | 8623 |
| NUM_FLOORS | 0 | 1.00 | 1.73 | 0.45 | 1 | 1.5 | 2 | 2 | 3 |
| R_TOTAL_RMS | 1 | 1.00 | 7.14 | 1.58 | 3 | 6.0 | 7 | 8 | 18 |
| R_BDRMS | 1 | 1.00 | 3.36 | 0.92 | 0 | 3.0 | 3 | 4 | 9 |
| R_FULL_BTH | 0 | 1.00 | 1.38 | 0.59 | 1 | 1.0 | 1 | 2 | 6 |
| R_HALF_BTH | 0 | 1.00 | 0.57 | 0.54 | 0 | 0.0 | 1 | 1 | 4 |
| R_FPLACE | 0 | 1.00 | 0.60 | 0.63 | 0 | 0.0 | 1 | 1 | 9 |
boston_predictions %>%
skim_without_charts()
| Name | Piped data |
| Number of rows | 1000 |
| Number of columns | 28 |
| _______________________ | |
| Column type frequency: | |
| character | 14 |
| logical | 1 |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| OWN_OCC | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| R_BLDG_STYL | 0 | 1 | 9 | 17 | 0 | 13 | 0 |
| R_ROOF_TYP | 0 | 1 | 7 | 11 | 0 | 6 | 0 |
| R_EXT_FIN | 0 | 1 | 9 | 17 | 0 | 11 | 0 |
| R_BTH_STYLE | 0 | 1 | 10 | 17 | 0 | 4 | 0 |
| R_KITCH | 0 | 1 | 15 | 20 | 0 | 4 | 0 |
| R_KITCH_STYLE | 0 | 1 | 10 | 17 | 0 | 4 | 0 |
| R_HEAT_TYP | 0 | 1 | 12 | 18 | 0 | 5 | 0 |
| R_AC | 0 | 1 | 8 | 15 | 0 | 3 | 0 |
| R_EXT_CND | 0 | 1 | 8 | 11 | 0 | 4 | 0 |
| R_OVRALL_CND | 0 | 1 | 8 | 11 | 0 | 4 | 0 |
| R_INT_CND | 0 | 1 | 8 | 13 | 0 | 5 | 0 |
| R_INT_FIN | 0 | 1 | 10 | 13 | 0 | 2 | 0 |
| R_VIEW | 0 | 1 | 8 | 13 | 0 | 5 | 0 |
Variable type: logical
| skim_variable | n_missing | complete_rate | mean | count |
|---|---|---|---|---|
| STRUCTURE_CLASS | 1000 | 0 | NaN | : |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| PID | 0 | 1.00 | 1857142653.36 | 185832590.79 | 1001514020 | 1805613000.00 | 1901717500.0 | 2005864000.00 | 2012260000 |
| ZIPCODE | 0 | 1.00 | 2131.42 | 3.63 | 2124 | 2131.00 | 2132.0 | 2132.00 | 2136 |
| AV_TOTAL | 0 | 1.00 | 520804.80 | 187980.95 | 179400 | 398650.00 | 472500.0 | 586475.00 | 1463500 |
| LAND_SF | 0 | 1.00 | 6122.25 | 3070.67 | 0 | 4484.75 | 5464.5 | 6860.75 | 34522 |
| YR_BUILT | 0 | 1.00 | 1936.04 | 31.14 | 1779 | 1914.75 | 1935.0 | 1956.00 | 2018 |
| YR_REMOD | 644 | 0.36 | 2003.87 | 13.74 | 1950 | 1998.75 | 2008.0 | 2014.00 | 2018 |
| LIVING_AREA | 0 | 1.00 | 1668.95 | 574.77 | 572 | 1296.00 | 1559.5 | 1928.00 | 5017 |
| NUM_FLOORS | 0 | 1.00 | 1.72 | 0.46 | 1 | 1.50 | 2.0 | 2.00 | 3 |
| R_TOTAL_RMS | 0 | 1.00 | 7.10 | 1.58 | 3 | 6.00 | 7.0 | 8.00 | 16 |
| R_BDRMS | 0 | 1.00 | 3.33 | 0.92 | 1 | 3.00 | 3.0 | 4.00 | 7 |
| R_FULL_BTH | 0 | 1.00 | 1.39 | 0.60 | 1 | 1.00 | 1.0 | 2.00 | 4 |
| R_HALF_BTH | 0 | 1.00 | 0.59 | 0.54 | 0 | 0.00 | 1.0 | 1.00 | 2 |
| R_FPLACE | 0 | 1.00 | 0.64 | 0.68 | 0 | 0.00 | 1.0 | 1.00 | 5 |
boston_join_demographics %>%
skim_without_charts()
| Name | Piped data |
| Number of rows | 6 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| City_State | 0 | 1 | 13 | 21 | 0 | 5 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| ZIP | 0 | 1 | 2131.83 | 4.92 | 2124 | 2130.25 | 2131.5 | 2135 | 2138 |
| Population | 0 | 1 | 35687.67 | 6831.66 | 28488 | 31219.75 | 35857.5 | 36314 | 47783 |
| Pop_Density | 0 | 1 | 11790.83 | 3283.07 | 6207 | 10839.75 | 12378.0 | 13251 | 15913 |
| Median_Income | 0 | 1 | 66848.00 | 11084.55 | 48841 | 60851.25 | 71090.5 | 75446 | 75730 |
boston_train %>%
summarise(min = min(AV_TOTAL, na.rm = TRUE),
max = max(AV_TOTAL,na.rm = TRUE),
mean = mean(AV_TOTAL, na.rm = TRUE),
null = sum(is.na(AV_TOTAL)))
boston_train %>%
ggplot(aes(AV_TOTAL)) +
geom_histogram(bins = 50)
boston_train %>%
ggplot(aes(x=ZIPCODE, y=AV_TOTAL)) +
geom_boxplot()
boston_train %>%
ggplot(aes(x=as.factor(OWN_OCC), y=AV_TOTAL)) +
geom_boxplot()
boston_train %>%
top_n(5,AV_TOTAL) %>%
arrange(desc(AV_TOTAL))
boston_train %>%
#na.omit() %>%
select_if(is.numeric) %>%
cor() %>%
as.data.frame() %>%
rownames_to_column(var="variable")
boston_train %>%
ggplot(aes(x=LAND_SF, y=AV_TOTAL)) +
geom_point() +
geom_smooth(method="lm") +
labs(title = "Does LAND_SF influence AV_TOTAL? ",
subtitle = "",
x = "LAND_SF",
y = "AV_TOTAL",
caption = "")
## `geom_smooth()` using formula 'y ~ x'
boston_train %>%
ggplot(aes(x=LIVING_AREA, y=AV_TOTAL)) +
geom_point() +
geom_smooth(method="lm") +
labs(title = "Does LIVING_AREA influence AV_TOTAL? ",
subtitle = "",
x = "LIVING_AREA",
y = "AV_TOTAL",
caption = "")
## `geom_smooth()` using formula 'y ~ x'
boston_train %>%
ggplot(aes(x=R_TOTAL_RMS, y=AV_TOTAL)) +
geom_point() +
geom_smooth(method="lm") +
labs(title = "Does R_TOTAL_RMS influence AV_TOTAL? ",
subtitle = "",
x = "R-TOTAL_RMS",
y = "AV_TOTAL",
caption = "")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
boston_train %>%
ggplot(aes(x=ZIPCODE, y= mean(AV_TOTAL))) +
geom_col() +
geom_smooth(method="lm") +
labs(title = "Does ZIPCODE influence average AV_TOTAL? ",
subtitle = "",
x = "ZIPCODE",
y = "AVG_AV_TOTAL",
caption = "")
## `geom_smooth()` using formula 'y ~ x'
boston_train %>%
ggplot(aes(x=OWN_OCC, y= mean(AV_TOTAL))) +
geom_col() +
geom_smooth(method="lm") +
labs(title = "Does OWN_OCC influence average AV_TOTAL? ",
subtitle = "",
x = "OWN_OCC",
y = "AVG_AV_TOTAL",
caption = "")
## `geom_smooth()` using formula 'y ~ x'
Boston_2020 <- boston_train %>%
mutate(ZIPCODE = as.integer(ZIPCODE)) %>%
mutate(zipcode = ZIPCODE)
Boston_zips <- boston_join_demographics %>%
mutate(ZIP = as.integer(ZIP)) %>%
mutate(zipcode = ZIP)
Boston_1k <- boston_predictions %>%
mutate(ZIPCODE = as.integer(ZIPCODE)) %>%
mutate(zipcode = ZIPCODE)
Boston_prep <- Boston_2020 %>%
inner_join(Boston_zips, by = "zipcode") %>%
mutate(R_TOTAL_RMS = replace_na(R_TOTAL_RMS, 7 )) %>%
mutate(R_BDRMS = replace_na(R_BDRMS, 3.5 )) %>%
mutate(YR_BUILT = replace_na(YR_BUILT, 1934 )) %>%
mutate(YR_REMOD = replace_na(YR_REMOD, 0)) %>%
mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT)) %>%
mutate(ZIPCODE = as.factor(ZIPCODE))
BOSTON_1K_PREP <- Boston_1k %>%
inner_join(Boston_zips, by = "zipcode") %>%
mutate(R_TOTAL_RMS = replace_na(R_TOTAL_RMS, 7 )) %>%
mutate(R_BDRMS = replace_na(R_BDRMS, 3.5 )) %>%
mutate(YR_BUILT = replace_na(YR_BUILT, 1934 )) %>%
mutate(YR_REMOD = replace_na(YR_REMOD, 0)) %>%
mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT))
split Boston_prep into 70/30 percent train/test split
sample <- sample.int(n = nrow(Boston_prep), size = floor(0.7*nrow(Boston_prep)), replace = F)
train <- Boston_prep[sample, ]
test <- Boston_prep[-sample,]
evaluating first AV_TOTAL model using the following predictors on the “train” data set:
model_1 <- lm(AV_TOTAL ~ LAND_SF + LIVING_AREA+ AGE + R_TOTAL_RMS, data=train)
summary(model_1)
##
## Call:
## lm(formula = AV_TOTAL ~ LAND_SF + LIVING_AREA + AGE + R_TOTAL_RMS,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1031592 -75656 -5186 60832 821144
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 193097.798 6854.920 28.169 <0.0000000000000002 ***
## LAND_SF 9.163 0.514 17.826 <0.0000000000000002 ***
## LIVING_AREA 187.595 3.417 54.908 <0.0000000000000002 ***
## AGE -824.979 32.987 -25.010 <0.0000000000000002 ***
## R_TOTAL_RMS 260.708 1201.359 0.217 0.828
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 130300 on 9430 degrees of freedom
## Multiple R-squared: 0.4849, Adjusted R-squared: 0.4847
## F-statistic: 2219 on 4 and 9430 DF, p-value: < 0.00000000000000022
glance(model_1) %>%
mutate(across(where(is.numeric), round, 3))
tidy(model_1, conf.int = TRUE) %>%
mutate(across(where(is.numeric), round, 3))
par(mfrow = c(2, 2))
plot(model_1)
sprintf("Training RSQUARE: %.4f and RMSE: %.3f",
rsquare(model_1, train),
rmse(model_1, train))
## [1] "Training RSQUARE: 0.4849 and RMSE: 130304.412"
sprintf("Testing r-square: %.4f and RMSE: %.3f",
rsquare(model_1, test),
rmse(model_1, test))
## [1] "Testing r-square: 0.4801 and RMSE: 129500.672"
model_2 <- lm(AV_TOTAL ~ AGE + Median_Income + R_TOTAL_RMS + LIVING_AREA + LAND_SF + R_BDRMS + R_FULL_BTH + Population + zipcode, data=train)
summary(model_2)
##
## Call:
## lm(formula = AV_TOTAL ~ AGE + Median_Income + R_TOTAL_RMS + LIVING_AREA +
## LAND_SF + R_BDRMS + R_FULL_BTH + Population + zipcode, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -869410 -56882 -4412 49606 706891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43690342.0938 1372972.7947 31.822 < 0.0000000000000002 ***
## AGE -569.9180 26.7000 -21.345 < 0.0000000000000002 ***
## Median_Income 8.4891 0.1137 74.675 < 0.0000000000000002 ***
## R_TOTAL_RMS 4047.7155 1084.0196 3.734 0.00019 ***
## LIVING_AREA 162.7125 2.8473 57.146 < 0.0000000000000002 ***
## LAND_SF 9.6722 0.4177 23.154 < 0.0000000000000002 ***
## R_BDRMS -5307.5418 1664.5501 -3.189 0.00143 **
## R_FULL_BTH 26303.9336 2042.0708 12.881 < 0.0000000000000002 ***
## Population -6.7337 0.3636 -18.520 < 0.0000000000000002 ***
## zipcode -20570.1944 639.4033 -32.171 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 100800 on 9425 degrees of freedom
## Multiple R-squared: 0.6922, Adjusted R-squared: 0.6919
## F-statistic: 2355 on 9 and 9425 DF, p-value: < 0.00000000000000022
glance(model_2) %>%
mutate(across(where(is.numeric), round, 3))
tidy(model_2, conf.int = TRUE) %>%
mutate(across(where(is.numeric), round, 3))
par(mfrow = c(2, 2))
plot(model_2)
sprintf("Training RSQUARE: %.4f and RMSE: %.3f",
rsquare(model_2, train),
rmse(model_2, train))
## [1] "Training RSQUARE: 0.6922 and RMSE: 100725.930"
sprintf("Testing r-square: %.4f and RMSE: %.3f",
rsquare(model_2, test),
rmse(model_2, test))
## [1] "Testing r-square: 0.6860 and RMSE: 100657.962"
identify properties where the model does well and not so well
top_10 <- BOSTON_1K_PREP %>%
add_predictions(model_2) %>%
mutate(error = abs(AV_TOTAL - pred)) %>%
top_n(10,-error)
bottom_10 <- BOSTON_1K_PREP %>%
add_predictions(model_2) %>%
mutate(error = abs(AV_TOTAL - pred)) %>%
top_n(-10,-error)