Background

I am using data from the City of Boston's tax authority to perform a re-assessment of taxes on residential single-family homes in the greater Boston area. I create and compare two linear regression models that predict the av_total (assessed value) of these properties. I will be using the following data sets:

Boston_2020.csv, Boston_1k.csv, Boston_zips.csv

Loading Libraries

options(scipen = 999)
library(tidyverse)  # our main library 
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(broom)      # tidies the messy output of built-in R functions into tibbles
## Warning: package 'broom' was built under R version 4.0.4
library(modelr)     # contains several often-used model quality metrics 
## 
## Attaching package: 'modelr'
## The following object is masked from 'package:broom':
## 
##     bootstrap
library(skimr)      # quick and dirty profile of data 
library(janitor)    # clean up any column names
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(vip)        # make a nice variable importance plot
## Warning: package 'vip' was built under R version 4.0.4
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi

Importing the Data

boston_train <- read_csv("Boston_2020.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_character(),
##   PID = col_double(),
##   AV_TOTAL = col_double(),
##   LAND_SF = col_double(),
##   YR_BUILT = col_double(),
##   YR_REMOD = col_double(),
##   LIVING_AREA = col_double(),
##   NUM_FLOORS = col_double(),
##   STRUCTURE_CLASS = col_logical(),
##   R_TOTAL_RMS = col_double(),
##   R_BDRMS = col_double(),
##   R_FULL_BTH = col_double(),
##   R_HALF_BTH = col_double(),
##   R_FPLACE = col_double()
## )
## i Use `spec()` for the full column specifications.
boston_predictions <- read.csv("Boston_1k.csv")
boston_join_demographics <- read.csv("Boston_zips.csv")
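
janitor was loaded above to clean up column names. Its clean_names() function would convert the ALL_CAPS headers to snake_case if desired; it is not applied here, so the rest of the analysis keeps the original column names. A minimal sketch:

boston_train %>%
  clean_names() %>%   # e.g. AV_TOTAL becomes av_total, LAND_SF becomes land_sf
  names()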

Skim of the data

boston_train %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 13479
Number of columns 28
_______________________
Column type frequency:
character 15
logical 1
numeric 12
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ZIPCODE 0 1 5 5 0 5 0
OWN_OCC 0 1 1 1 0 2 0
R_BLDG_STYL 0 1 9 17 0 16 0
R_ROOF_TYP 0 1 7 11 0 7 0
R_EXT_FIN 0 1 9 17 0 10 0
R_BTH_STYLE 0 1 10 17 0 4 0
R_KITCH 0 1 20 20 0 4 0
R_KITCH_STYLE 0 1 10 17 0 4 0
R_HEAT_TYP 0 1 8 18 0 7 0
R_AC 0 1 8 15 0 3 0
R_EXT_CND 0 1 8 13 0 5 0
R_OVRALL_CND 0 1 8 13 0 5 0
R_INT_CND 0 1 8 13 0 5 0
R_INT_FIN 0 1 10 15 0 3 0
R_VIEW 0 1 8 13 0 5 0

Variable type: logical

skim_variable n_missing complete_rate mean count
STRUCTURE_CLASS 13479 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
PID 0 1.00 1853541231.84 184340306.35 702991020 1805608500.0 1900692000 2004482500 2012270010
AV_TOTAL 0 1.00 514393.30 180966.94 135400 393000.0 477000 585200 1490200
LAND_SF 0 1.00 5896.93 2880.59 0 4332.0 5325 6750 107158
YR_BUILT 2 1.00 1934.21 30.46 1710 1910.0 1932 1955 2018
YR_REMOD 8676 0.36 2003.29 14.14 1900 1998.0 2007 2014 2019
LIVING_AREA 0 1.00 1682.06 575.42 403 1306.0 1567 1939 8623
NUM_FLOORS 0 1.00 1.73 0.45 1 1.5 2 2 3
R_TOTAL_RMS 1 1.00 7.14 1.58 3 6.0 7 8 18
R_BDRMS 1 1.00 3.36 0.92 0 3.0 3 4 9
R_FULL_BTH 0 1.00 1.38 0.59 1 1.0 1 2 6
R_HALF_BTH 0 1.00 0.57 0.54 0 0.0 1 1 4
R_FPLACE 0 1.00 0.60 0.63 0 0.0 1 1 9
boston_predictions %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 1000
Number of columns 28
_______________________
Column type frequency:
character 14
logical 1
numeric 13
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
OWN_OCC 0 1 1 1 0 2 0
R_BLDG_STYL 0 1 9 17 0 13 0
R_ROOF_TYP 0 1 7 11 0 6 0
R_EXT_FIN 0 1 9 17 0 11 0
R_BTH_STYLE 0 1 10 17 0 4 0
R_KITCH 0 1 15 20 0 4 0
R_KITCH_STYLE 0 1 10 17 0 4 0
R_HEAT_TYP 0 1 12 18 0 5 0
R_AC 0 1 8 15 0 3 0
R_EXT_CND 0 1 8 11 0 4 0
R_OVRALL_CND 0 1 8 11 0 4 0
R_INT_CND 0 1 8 13 0 5 0
R_INT_FIN 0 1 10 13 0 2 0
R_VIEW 0 1 8 13 0 5 0

Variable type: logical

skim_variable n_missing complete_rate mean count
STRUCTURE_CLASS 1000 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
PID 0 1.00 1857142653.36 185832590.79 1001514020 1805613000.00 1901717500.0 2005864000.00 2012260000
ZIPCODE 0 1.00 2131.42 3.63 2124 2131.00 2132.0 2132.00 2136
AV_TOTAL 0 1.00 520804.80 187980.95 179400 398650.00 472500.0 586475.00 1463500
LAND_SF 0 1.00 6122.25 3070.67 0 4484.75 5464.5 6860.75 34522
YR_BUILT 0 1.00 1936.04 31.14 1779 1914.75 1935.0 1956.00 2018
YR_REMOD 644 0.36 2003.87 13.74 1950 1998.75 2008.0 2014.00 2018
LIVING_AREA 0 1.00 1668.95 574.77 572 1296.00 1559.5 1928.00 5017
NUM_FLOORS 0 1.00 1.72 0.46 1 1.50 2.0 2.00 3
R_TOTAL_RMS 0 1.00 7.10 1.58 3 6.00 7.0 8.00 16
R_BDRMS 0 1.00 3.33 0.92 1 3.00 3.0 4.00 7
R_FULL_BTH 0 1.00 1.39 0.60 1 1.00 1.0 2.00 4
R_HALF_BTH 0 1.00 0.59 0.54 0 0.00 1.0 1.00 2
R_FPLACE 0 1.00 0.64 0.68 0 0.00 1.0 1.00 5
boston_join_demographics %>%
  skim_without_charts()
Data summary
Name Piped data
Number of rows 6
Number of columns 5
_______________________
Column type frequency:
character 1
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
City_State 0 1 13 21 0 5 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
ZIP 0 1 2131.83 4.92 2124 2130.25 2131.5 2135 2138
Population 0 1 35687.67 6831.66 28488 31219.75 35857.5 36314 47783
Pop_Density 0 1 11790.83 3283.07 6207 10839.75 12378.0 13251 15913
Median_Income 0 1 66848.00 11084.55 48841 60851.25 71090.5 75446 75730

Exploring AV_TOTAL

boston_train %>%
  summarise(min = min(AV_TOTAL, na.rm = TRUE),
            max = max(AV_TOTAL,na.rm = TRUE),
            mean = mean(AV_TOTAL, na.rm = TRUE),
            null = sum(is.na(AV_TOTAL)))
boston_train %>%
  ggplot(aes(AV_TOTAL)) +
  geom_histogram(bins = 50)

boston_train %>%
  ggplot(aes(x=ZIPCODE, y=AV_TOTAL)) +
  geom_boxplot()

boston_train %>%
  ggplot(aes(x=as.factor(OWN_OCC), y=AV_TOTAL)) +
  geom_boxplot()

boston_train %>%
  top_n(5,AV_TOTAL) %>%
  arrange(desc(AV_TOTAL))

Exploring Likely Predictors

boston_train %>%
  select_if(is.numeric) %>%
  cor(use = "pairwise.complete.obs") %>%   # a few YR_BUILT / R_TOTAL_RMS / R_BDRMS values are missing
  as.data.frame() %>%
  rownames_to_column(var = "variable")
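
To make that wide correlation matrix easier to scan, a short follow-up (a sketch, not part of the original output) keeps only the correlations with AV_TOTAL and ranks them by absolute size:

boston_train %>%
  select_if(is.numeric) %>%
  cor(use = "pairwise.complete.obs") %>%
  as.data.frame() %>%
  rownames_to_column(var = "variable") %>%
  select(variable, AV_TOTAL) %>%
  arrange(desc(abs(AV_TOTAL)))
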
boston_train %>%
  ggplot(aes(x=LAND_SF, y=AV_TOTAL)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(title = "Does LAND_SF influence AV_TOTAL?  ",
       subtitle = "",
       x = "LAND_SF",
       y = "AV_TOTAL",
       caption = "")
## `geom_smooth()` using formula 'y ~ x'

boston_train %>%
  ggplot(aes(x=LIVING_AREA, y=AV_TOTAL)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(title = "Does LIVING_AREA influence AV_TOTAL?  ",
       subtitle = "",
       x = "LIVING_AREA",
       y = "AV_TOTAL",
       caption = "")
## `geom_smooth()` using formula 'y ~ x'

boston_train %>%
  ggplot(aes(x=R_TOTAL_RMS, y=AV_TOTAL)) +
  geom_point() +
  geom_smooth(method="lm") +
  labs(title = "Does R_TOTAL_RMS influence AV_TOTAL?  ",
       subtitle = "",
       x = "R-TOTAL_RMS",
       y = "AV_TOTAL",
       caption = "")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

boston_train %>%
  group_by(ZIPCODE) %>%
  summarise(AVG_AV_TOTAL = mean(AV_TOTAL, na.rm = TRUE)) %>%
  ggplot(aes(x = ZIPCODE, y = AVG_AV_TOTAL)) +
  geom_col() +
  labs(title = "Does ZIPCODE influence average AV_TOTAL?",
       subtitle = "",
       x = "ZIPCODE",
       y = "AVG_AV_TOTAL",
       caption = "")

boston_train %>%
  group_by(OWN_OCC) %>%
  summarise(AVG_AV_TOTAL = mean(AV_TOTAL, na.rm = TRUE)) %>%
  ggplot(aes(x = OWN_OCC, y = AVG_AV_TOTAL)) +
  geom_col() +
  labs(title = "Does OWN_OCC influence average AV_TOTAL?",
       subtitle = "",
       x = "OWN_OCC",
       y = "AVG_AV_TOTAL",
       caption = "")

Prepping the Data

# standardize the zip-code column so all three tables can be joined on "zipcode"
Boston_2020 <- boston_train %>%
  mutate(ZIPCODE = as.integer(ZIPCODE)) %>%
  mutate(zipcode = ZIPCODE)

Boston_zips <- boston_join_demographics %>%
  mutate(ZIP = as.integer(ZIP)) %>%
  mutate(zipcode = ZIP)

Boston_1k <- boston_predictions %>%
  mutate(ZIPCODE = as.integer(ZIPCODE)) %>%
  mutate(zipcode = ZIPCODE)

# join in the zip-level demographics, impute the few missing values with values
# close to the training-set averages, and derive AGE from the more recent of
# YR_BUILT and YR_REMOD
Boston_prep <- Boston_2020 %>%
  inner_join(Boston_zips, by = "zipcode") %>%
  mutate(R_TOTAL_RMS = replace_na(R_TOTAL_RMS, 7)) %>%
  mutate(R_BDRMS = replace_na(R_BDRMS, 3.5)) %>%
  mutate(YR_BUILT = replace_na(YR_BUILT, 1934)) %>%
  mutate(YR_REMOD = replace_na(YR_REMOD, 0)) %>%    # 0 means never remodeled
  mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT)) %>%
  mutate(ZIPCODE = as.factor(ZIPCODE))

# apply the same preparation to the 1k scoring sample
BOSTON_1K_PREP <- Boston_1k %>%
  inner_join(Boston_zips, by = "zipcode") %>%
  mutate(R_TOTAL_RMS = replace_na(R_TOTAL_RMS, 7)) %>%
  mutate(R_BDRMS = replace_na(R_BDRMS, 3.5)) %>%
  mutate(YR_BUILT = replace_na(YR_BUILT, 1934)) %>%
  mutate(YR_REMOD = replace_na(YR_REMOD, 0)) %>%
  mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT))
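
As a quick sanity check of the AGE logic (the values below are made up for illustration), a remodeled home should age from its remodel year and everything else from its build year:

tibble(YR_BUILT = c(1930, 1930, 2005),
       YR_REMOD = c(2010, 0, 0)) %>%   # 0 = never remodeled, per the replace_na() above
  mutate(AGE = if_else(YR_BUILT < YR_REMOD, 2021 - YR_REMOD, 2021 - YR_BUILT))
# AGE comes out as 11, 91, and 16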

Partition Boston_prep into Train / Test Split

Split Boston_prep into a 70/30 train/test split.

# note: calling set.seed() before sampling would make this split reproducible
sample <- sample.int(n = nrow(Boston_prep), size = floor(0.7 * nrow(Boston_prep)), replace = FALSE)
train <- Boston_prep[sample, ]
test  <- Boston_prep[-sample, ]
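
A quick check (not in the original output) confirms the split proportions:

nrow(train) / nrow(Boston_prep)   # should be roughly 0.7
nrow(test)  / nrow(Boston_prep)   # should be roughly 0.3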

Build Model 1

Fitting the first AV_TOTAL model on the "train" data set using the following predictors:

model_1 <- lm(AV_TOTAL ~ LAND_SF + LIVING_AREA + AGE + R_TOTAL_RMS, data = train)

summary(model_1)
## 
## Call:
## lm(formula = AV_TOTAL ~ LAND_SF + LIVING_AREA + AGE + R_TOTAL_RMS, 
##     data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1031592   -75656    -5186    60832   821144 
## 
## Coefficients:
##               Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 193097.798   6854.920  28.169 <0.0000000000000002 ***
## LAND_SF          9.163      0.514  17.826 <0.0000000000000002 ***
## LIVING_AREA    187.595      3.417  54.908 <0.0000000000000002 ***
## AGE           -824.979     32.987 -25.010 <0.0000000000000002 ***
## R_TOTAL_RMS    260.708   1201.359   0.217               0.828    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130300 on 9430 degrees of freedom
## Multiple R-squared:  0.4849, Adjusted R-squared:  0.4847 
## F-statistic:  2219 on 4 and 9430 DF,  p-value: < 0.00000000000000022
glance(model_1) %>%
   mutate(across(where(is.numeric), round, 3))
tidy(model_1, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), round, 3))
par(mfrow = c(2, 2))   # arrange the four lm diagnostic plots in a 2x2 grid
plot(model_1)

Evaluate Model 1 on Train and Test Datasets

sprintf("Training RSQUARE: %.4f and RMSE: %.3f",
        rsquare(model_1, train),  
        rmse(model_1, train))
## [1] "Training RSQUARE: 0.4849 and RMSE: 130304.412"
sprintf("Testing r-square: %.4f and RMSE: %.3f",
        rsquare(model_1, test),  
        rmse(model_1, test))
## [1] "Testing r-square: 0.4801 and RMSE: 129500.672"

Build Model 2

model_2 <- lm(AV_TOTAL ~ AGE + Median_Income + R_TOTAL_RMS + LIVING_AREA + LAND_SF + R_BDRMS + R_FULL_BTH + Population + zipcode, data=train)

summary(model_2)
## 
## Call:
## lm(formula = AV_TOTAL ~ AGE + Median_Income + R_TOTAL_RMS + LIVING_AREA + 
##     LAND_SF + R_BDRMS + R_FULL_BTH + Population + zipcode, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -869410  -56882   -4412   49606  706891 
## 
## Coefficients:
##                    Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)   43690342.0938  1372972.7947  31.822 < 0.0000000000000002 ***
## AGE               -569.9180       26.7000 -21.345 < 0.0000000000000002 ***
## Median_Income        8.4891        0.1137  74.675 < 0.0000000000000002 ***
## R_TOTAL_RMS       4047.7155     1084.0196   3.734              0.00019 ***
## LIVING_AREA        162.7125        2.8473  57.146 < 0.0000000000000002 ***
## LAND_SF              9.6722        0.4177  23.154 < 0.0000000000000002 ***
## R_BDRMS          -5307.5418     1664.5501  -3.189              0.00143 ** 
## R_FULL_BTH       26303.9336     2042.0708  12.881 < 0.0000000000000002 ***
## Population          -6.7337        0.3636 -18.520 < 0.0000000000000002 ***
## zipcode         -20570.1944      639.4033 -32.171 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 100800 on 9425 degrees of freedom
## Multiple R-squared:  0.6922, Adjusted R-squared:  0.6919 
## F-statistic:  2355 on 9 and 9425 DF,  p-value: < 0.00000000000000022
glance(model_2) %>%
   mutate(across(where(is.numeric), round, 3))
tidy(model_2, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), round, 3))
par(mfrow = c(2, 2))   # arrange the four lm diagnostic plots in a 2x2 grid
plot(model_2)
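
The vip package loaded at the top can also show which predictors drive model_2; for a plain lm it ranks terms by the size of their t-statistics. A quick sketch:

vip(model_2, num_features = 9)   # one bar per predictor in model_2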

Evaluate Model 2 on Train and Test Datasets

sprintf("Training RSQUARE: %.4f and RMSE: %.3f",
        rsquare(model_2, train),  
        rmse(model_2, train))
## [1] "Training RSQUARE: 0.6922 and RMSE: 100725.930"
sprintf("Testing r-square: %.4f and RMSE: %.3f",
        rsquare(model_2, test),  
        rmse(model_2, test))
## [1] "Testing r-square: 0.6860 and RMSE: 100657.962"

Make Predictions on the 1k Sample

Identify the properties where the model predicts well and where it does not:

# 10 properties with the smallest absolute prediction error (the model does well here)
top_10 <- BOSTON_1K_PREP %>%
  add_predictions(model_2) %>%
  mutate(error = abs(AV_TOTAL - pred)) %>%
  top_n(10, -error)

# 10 properties with the largest absolute prediction error (the model does poorly here)
bottom_10 <- BOSTON_1K_PREP %>%
  add_predictions(model_2) %>%
  mutate(error = abs(AV_TOTAL - pred)) %>%
  top_n(-10, -error)
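
To inspect them, a minimal follow-up sketch (not part of the original output) pulls the key columns for each group:

# best predictions first
top_10 %>%
  select(PID, ZIPCODE, AV_TOTAL, pred, error) %>%
  arrange(error)

# worst predictions first
bottom_10 %>%
  select(PID, ZIPCODE, AV_TOTAL, pred, error) %>%
  arrange(desc(error))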