Background

I am using data from the City of Boston's tax authority to perform a re-assessment of taxes on residential single-family homes in the greater Boston area. I create and compare two linear regression models that predict the av_total (assessed value) of these properties. I will be using the following data sets:

Boston_2020.csv, Boston_1k.csv, Boston_zips.csv

Loading Libraries

options(scipen = 999)
library(tidyverse)  # our main library 
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)      # tidies the messy output of built-in R functions into tibbles
## Warning: package 'broom' was built under R version 4.0.4
library(modelr)     # contains several often-used model quality metrics 
## 
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
## 
##     bootstrap
library(skimr)      # quick and dirty profile of data 
library(janitor)    # clean up any column names
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(vip)        # make a nice variable importance plot
## Warning: package 'vip' was built under R version 4.0.4
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi

Importing the Data

boston_train <- read_csv("Boston_2020.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   PID = col_double(),
##   AV_TOTAL = col_double(),
##   LAND_SF = col_double(),
##   YR_BUILT = col_double(),
##   YR_REMOD = col_double(),
##   LIVING_AREA = col_double(),
##   NUM_FLOORS = col_double(),
##   STRUCTURE_CLASS = col_logical(),
##   R_TOTAL_RMS = col_double(),
##   R_BDRMS = col_double(),
##   R_FULL_BTH = col_double(),
##   R_HALF_BTH = col_double(),
##   R_FPLACE = col_double()
## )
## i Use `spec()` for the full column specifications.
boston_predictions <- read.csv("Boston_1k.csv")
boston_join_demographics <- read.csv("Boston_zips.csv")
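
janitor was loaded above to clean up column names. Its clean_names() function would convert the ALL_CAPS headers to snake_case if desired; it is not applied here, so the rest of the analysis keeps the original column names. A minimal sketch:

boston_train %>%
  clean_names() %>%   # e.g. AV_TOTAL becomes av_total, LAND_SF becomes land_sf
  names()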

Skim of the data

boston_train %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 13479
Number of columns 28
_______________________
Column type frequency:
character 15
logical 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ZIPCODE 0 1 5 5 0 5 0
OWN_OCC 0 1 1 1 0 2 0
R_BLDG_STYL 0 1 9 17 0 16 0
R_ROOF_TYP 0 1 7 11 0 7 0
R_EXT_FIN 0 1 9 17 0 10 0
R_BTH_STYLE 0 1 10 17 0 4 0
R_KITCH 0 1 20 20 0 4 0
R_KITCH_STYLE 0 1 10 17 0 4 0
R_HEAT_TYP 0 1 8 18 0 7 0
R_AC 0 1 8 15 0 3 0
R_EXT_CND 0 1 8 13 0 5 0
R_OVRALL_CND 0 1 8 13 0 5 0
R_INT_CND 0 1 8 13 0 5 0
R_INT_FIN 0 1 10 15 0 3 0
R_VIEW 0 1 8 13 0 5 0

Variable type: logical

skim_variable n_missing complete_rate mean count
STRUCTURE_CLASS 13479 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
PID 0 1.00 1853541231.84 184340306.35 702991020 1805608500.0 1900692000 2004482500 2012270010
AV_TOTAL 0 1.00 514393.30 180966.94 135400 393000.0 477000 585200 1490200
LAND_SF 0 1.00 5896.93 2880.59 0 4332.0 5325 6750 107158
YR_BUILT 2 1.00 1934.21 30.46 1710 1910.0 1932 1955 2018
YR_REMOD 8676 0.36 2003.29 14.14 1900 1998.0 2007 2014 2019
LIVING_AREA 0 1.00 1682.06 575.42 403 1306.0 1567 1939 8623
NUM_FLOORS 0 1.00 1.73 0.45 1 1.5 2 2 3
R_TOTAL_RMS 1 1.00 7.14 1.58 3 6.0 7 8 18
R_BDRMS 1 1.00 3.36 0.92 0 3.0 3 4 9
R_FULL_BTH 0 1.00 1.38 0.59 1 1.0 1 2 6
R_HALF_BTH 0 1.00 0.57 0.54 0 0.0 1 1 4
R_FPLACE 0 1.00 0.60 0.63 0 0.0 1 1 9
boston_predictions %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 1000
Number of columns 28
_______________________
Column type frequency:
character 14
logical 1
numeric 13
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
OWN_OCC 0 1 1 1 0 2 0
R_BLDG_STYL 0 1 9 17 0 13 0
R_ROOF_TYP 0 1 7 11 0 6 0
R_EXT_FIN 0 1 9 17 0 11 0
R_BTH_STYLE 0 1 10 17 0 4 0
R_KITCH 0 1 15 20 0 4 0
R_KITCH_STYLE 0 1 10 17 0 4 0
R_HEAT_TYP 0 1 12 18 0 5 0
R_AC 0 1 8 15 0 3 0
R_EXT_CND 0 1 8 11 0 4 0
R_OVRALL_CND 0 1 8 11 0 4 0
R_INT_CND 0 1 8 13 0 5 0
R_INT_FIN 0 1 10 13 0 2 0
R_VIEW 0 1 8 13 0 5 0

Variable type: logical

skim_variable n_missing complete_rate mean count
STRUCTURE_CLASS 1000 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
PID 0 1.00 1857142653.36 185832590.79 1001514020 1805613000.00 1901717500.0 2005864000.00 2012260000
ZIPCODE 0 1.00 2131.42 3.63 2124 2131.00 2132.0 2132.00 2136
AV_TOTAL 0 1.00 520804.80 187980.95 179400 398650.00 472500.0 586475.00 1463500
LAND_SF 0 1.00 6122.25 3070.67 0 4484.75 5464.5 6860.75 34522
YR_BUILT 0 1.00 1936.04 31.14 1779 1914.75 1935.0 1956.00 2018
YR_REMOD 644 0.36 2003.87 13.74 1950 1998.75 2008.0 2014.00 2018
LIVING_AREA 0 1.00 1668.95 574.77 572 1296.00 1559.5 1928.00 5017
NUM_FLOORS 0 1.00 1.72 0.46 1 1.50 2.0 2.00 3
R_TOTAL_RMS 0 1.00 7.10 1.58 3 6.00 7.0 8.00 16
R_BDRMS 0 1.00 3.33 0.92 1 3.00 3.0 4.00 7
R_FULL_BTH 0 1.00 1.39 0.60 1 1.00 1.0 2.00 4
R_HALF_BTH 0 1.00 0.59 0.54 0 0.00 1.0 1.00 2
R_FPLACE 0 1.00 0.64 0.68 0 0.00 1.0 1.00 5
boston_join_demographics %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 6
Number of columns 5
_______________________
Column type frequency:
character 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
City_State 0 1 13 21 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
ZIP 0 1 2131.83 4.92 2124 2130.25 2131.5 2135 2138
Population 0 1 35687.67 6831.66 28488 31219.75 35857.5 36314 47783
Pop_Density 0 1 11790.83 3283.07 6207 10839.75 12378.0 13251 15913
Median_Income 0 1 66848.00 11084.55 48841 60851.25 71090.5 75446 75730

Exploring AV_TOTAL

boston_train %>%
  summarise(min = min(AV_TOTAL, na.rm = TRUE),
            max = max(AV_TOTAL,na.rm = TRUE),
            mean = mean(AV_TOTAL, na.rm = TRUE),
            null = sum(is.na(AV_TOTAL)))
boston_train %>%
  ggplot(aes(AV_TOTAL)) +
  geom_histogram(bins = 50)

boston_train %>%
  ggplot(aes(x=ZIPCODE, y=AV_TOTAL)) +
  geom_boxplot()

boston_train %>%
  ggplot(aes(x=as.factor(OWN_OCC), y=AV_TOTAL)) +
  geom_boxplot()

boston_train %>%
  top_n(5,AV_TOTAL) %>%
  arrange(desc(AV_TOTAL))

Exploring Likely Predictors

boston_train %>%
  select_if(is.numeric) %>%
  cor(use = "pairwise.complete.obs") %>%   # a few YR_BUILT / R_TOTAL_RMS / R_BDRMS values are missing
  as.data.frame() %>%
  rownames_to_column(var = "variable")
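
To make that wide correlation matrix easier to scan, a short follow-up (a sketch, not part of the original output) keeps only the correlations with AV_TOTAL and ranks them by absolute size:

boston_train %>%
  select_if(is.numeric) %>%
  cor(use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column(var = "variable") %>%
  select(variable, AV_TOTAL) %>%
  arrange(desc(abs(AV_TOTAL)))
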
boston_train %>%
  ggplot(aes(x=LAND_SF, y=AV_TOTAL)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(title = "Does LAND_SF influence AV_TOTAL?  ",
       subtitle = "",
       x = "LAND_SF",
       y = "AV_TOTAL",
       caption = "")
## `geom_smooth()` using formula 'y ~ x'

boston_train %>%
  ggplot(aes(x=LIVING_AREA, y=AV_TOTAL)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(title = "Does LIVING_AREA influence AV_TOTAL?  ",
       subtitle = "",
       x = "LIVING_AREA",
       y = "AV_TOTAL",
       caption = "")
## `geom_smooth()` using formula 'y ~ x'

boston_train %>%
  ggplot(aes(x=R_TOTAL_RMS, y=AV_TOTAL)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(title = "Does R_TOTAL_RMS influence AV_TOTAL?  ",
       subtitle = "",
       x = "R-TOTAL_RMS",
       y = "AV_TOTAL",
       caption = "")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

boston_train %>%
  group_by(ZIPCODE) %>%
  summarise(AVG_AV_TOTAL = mean(AV_TOTAL, na.rm = TRUE)) %>%
  ggplot(aes(x = ZIPCODE, y = AVG_AV_TOTAL)) +
  geom_col() +
  labs(title = "Does ZIPCODE influence average AV_TOTAL?",
       subtitle = "",
       x = "ZIPCODE",
       y = "AVG_AV_TOTAL",
       caption = "")

boston_train %>%
  group_by(OWN_OCC) %>%
  summarise(AVG_AV_TOTAL = mean(AV_TOTAL, na.rm = TRUE)) %>%
  ggplot(aes(x = OWN_OCC, y = AVG_AV_TOTAL)) +
  geom_col() +
  labs(title = "Does OWN_OCC influence average AV_TOTAL?",
       subtitle = "",
       x = "OWN_OCC",
       y = "AVG_AV_TOTAL",
       caption = "")

Prepping the Data

# standardize the zip-code column so all three tables can be joined on "zipcode"
Boston_2020 <- boston_train %>%
  mutate(ZIPCODE = as.integer(ZIPCODE)) %>%
  mutate(zipcode = ZIPCODE)

Boston_zips <- boston_join_demographics %>%
  mutate(ZIP = as.integer(ZIP)) %>%
  mutate(zipcode = ZIP)

Boston_1k <- boston_predictions %>%
  mutate(ZIPCODE = as.integer(ZIPCODE)) %>%
  mutate(zipcode = ZIPCODE)

# join in the zip-level demographics, impute the few missing values with values
# close to the training-set averages, and derive AGE from the more recent of
# YR_BUILT and YR_REMOD
Boston_prep <- Boston_2020 %>%
  inner_join(Boston_zips, by = "zipcode") %>%
  mutate(R_TOTAL_RMS = replace_na(R_TOTAL_RMS, 7)) %>%
  mutate(R_BDRMS = replace_na(R_BDRMS, 3.5)) %>%
  mutate(YR_BUILT = replace_na(YR_BUILT, 1934)) %>%
  mutate(YR_REMOD = replace_na(YR_REMOD, 0)) %>%    # 0 means never remodeled
  mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT)) %>%
  mutate(ZIPCODE = as.factor(ZIPCODE))

# apply the same preparation to the 1k scoring sample
BOSTON_1K_PREP <- Boston_1k %>%
  inner_join(Boston_zips, by = "zipcode") %>%
  mutate(R_TOTAL_RMS = replace_na(R_TOTAL_RMS, 7)) %>%
  mutate(R_BDRMS = replace_na(R_BDRMS, 3.5)) %>%
  mutate(YR_BUILT = replace_na(YR_BUILT, 1934)) %>%
  mutate(YR_REMOD = replace_na(YR_REMOD, 0)) %>%
  mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT))
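
As a quick sanity check of the AGE logic (the values below are made up for illustration), a remodeled home should age from its remodel year and everything else from its build year:

tibble(YR_BUILT = c(1930, 1930, 2005),
       YR_REMOD = c(2010, 0, 0)) %>%   # 0 = never remodeled, per the replace_na() above
  mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT))
# AGE comes out as 11, 91, and 16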

Partition Boston_prep into Train / Test Split

Split Boston_prep into a 70/30 train/test split.

# note: calling set.seed() before sampling would make this split reproducible
sample <- sample.int(n = nrow(Boston_prep), size = floor(0.7 * nrow(Boston_prep)), replace = FALSE)
train <- Boston_prep[sample, ]
test  <- Boston_prep[-sample, ]
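
A quick check (not in the original output) confirms the split proportions:

nrow(train) / nrow(Boston_prep)   # should be roughly 0.7
nrow(test)  / nrow(Boston_prep)   # should be roughly 0.3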

Build Model 1

Fitting the first AV_TOTAL model on the "train" data set using the following predictors:

model_1 <- lm(AV_TOTAL ~ LAND_SF + LIVING_AREA + AGE + R_TOTAL_RMS, data = train)

summary(model_1)
## 
## Call:
## lm(formula = AV_TOTAL ~ LAND_SF + LIVING_AREA + AGE + R_TOTAL_RMS, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1031592   -75656    -5186    60832   821144 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 193097.798   6854.920  28.169 <0.0000000000000002 ***
## LAND_SF          9.163      0.514  17.826 <0.0000000000000002 ***
## LIVING_AREA    187.595      3.417  54.908 <0.0000000000000002 ***
## AGE           -824.979     32.987 -25.010 <0.0000000000000002 ***
## R_TOTAL_RMS    260.708   1201.359   0.217               0.828    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130300 on 9430 degrees of freedom
## Multiple R-squared:  0.4849, Adjusted R-squared:  0.4847 
## F-statistic:  2219 on 4 and 9430 DF,  p-value: < 0.00000000000000022
glance(model_1) %>%
   mutate(across(where(is.numeric), round, 3))
tidy(model_1, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), round, 3))
par(mfrow = c(2, 2))   # arrange the four lm diagnostic plots in a 2x2 grid
plot(model_1)

Evaluate Model 1 on Train and Test Datasets

sprintf("Training RSQUARE: %.4f and RMSE: %.3f",
        rsquare(model_1, train),  
        rmse(model_1, train))
## [1] "Training RSQUARE: 0.4849 and RMSE: 130304.412"
sprintf("Testing r-square: %.4f and RMSE: %.3f",
        rsquare(model_1, test),  
        rmse(model_1, test))
## [1] "Testing r-square: 0.4801 and RMSE: 129500.672"

Build Model 2

model_2 <- lm(AV_TOTAL ~ AGE + Median_Income + R_TOTAL_RMS + LIVING_AREA + LAND_SF + R_BDRMS + R_FULL_BTH + Population + zipcode, data=train)

summary(model_2)
## 
## Call:
## lm(formula = AV_TOTAL ~ AGE + Median_Income + R_TOTAL_RMS + LIVING_AREA + 
##     LAND_SF + R_BDRMS + R_FULL_BTH + Population + zipcode, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -869410  -56882   -4412   49606  706891 
## 
## Coefficients:
##                    Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)   43690342.0938  1372972.7947  31.822 < 0.0000000000000002 ***
## AGE               -569.9180       26.7000 -21.345 < 0.0000000000000002 ***
## Median_Income        8.4891        0.1137  74.675 < 0.0000000000000002 ***
## R_TOTAL_RMS       4047.7155     1084.0196   3.734              0.00019 ***
## LIVING_AREA        162.7125        2.8473  57.146 < 0.0000000000000002 ***
## LAND_SF              9.6722        0.4177  23.154 < 0.0000000000000002 ***
## R_BDRMS          -5307.5418     1664.5501  -3.189              0.00143 ** 
## R_FULL_BTH       26303.9336     2042.0708  12.881 < 0.0000000000000002 ***
## Population          -6.7337        0.3636 -18.520 < 0.0000000000000002 ***
## zipcode         -20570.1944      639.4033 -32.171 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 100800 on 9425 degrees of freedom
## Multiple R-squared:  0.6922, Adjusted R-squared:  0.6919 
## F-statistic:  2355 on 9 and 9425 DF,  p-value: < 0.00000000000000022
glance(model_2) %>%
   mutate(across(where(is.numeric), round, 3))
tidy(model_2, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), round, 3))
par(mfrow = c(2, 2))   # arrange the four lm diagnostic plots in a 2x2 grid
plot(model_2)
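
The vip package loaded at the top can also show which predictors drive model_2; for a plain lm it ranks terms by the size of their t-statistics. A quick sketch:

vip(model_2, num_features = 9)   # one bar per predictor in model_2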

Evaluate Model 2 on Train and Test Datasets

sprintf("Training RSQUARE: %.4f and RMSE: %.3f",
        rsquare(model_2, train),  
        rmse(model_2, train))
## [1] "Training RSQUARE: 0.6922 and RMSE: 100725.930"
sprintf("Testing r-square: %.4f and RMSE: %.3f",
        rsquare(model_2, test),  
        rmse(model_2, test))
## [1] "Testing r-square: 0.6860 and RMSE: 100657.962"

Make Predictions on the 1k Sample

Identify the properties where the model predicts well and where it does not:

# 10 properties with the smallest absolute prediction error (the model does well here)
top_10 <- BOSTON_1K_PREP %>%
  add_predictions(model_2) %>%
  mutate(error = abs(AV_TOTAL - pred)) %>%
  top_n(10, -error)

# 10 properties with the largest absolute prediction error (the model does poorly here)
bottom_10 <- BOSTON_1K_PREP %>%
  add_predictions(model_2) %>%
  mutate(error = abs(AV_TOTAL - pred)) %>%
  top_n(-10, -error)
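
To inspect them, a minimal follow-up sketch (not part of the original output) pulls the key columns for each group:

# best predictions first
top_10 %>%
  select(PID, ZIPCODE, AV_TOTAL, pred, error) %>%
  arrange(error)

# worst predictions first
bottom_10 %>%
  select(PID, ZIPCODE, AV_TOTAL, pred, error) %>%
  arrange(desc(error))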