# Libraries
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(urca)
library(lubridate)
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(ISLR2)
library(leaps)
# Set Working Directory
setwd("~/BUDT730")
# Load ZHVI Data
load("~/BUDT730/ZHVI.RData")
attach(ZHVI)
## The following object is masked from package:carData:
##
## Florida
###### Question 1 ######
# See Datatypes For Each Column
str(ZHVI)
## spc_tbl_ [310 × 52] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : Date[1:310], format: "2000-01-01" "2000-02-01" ...
## $ Florida : num [1:310] 107862 108096 108379 108955 109575 ...
## $ California : num [1:310] 188374 189007 189862 191718 193875 ...
## $ Virginia : num [1:310] 124133 124454 124795 125438 126059 ...
## $ Texas : num [1:310] 112619 112680 112710 112857 112952 ...
## $ New York : num [1:310] 153885 154433 154961 156095 157269 ...
## $ Colorado : num [1:310] 176602 177229 177922 179471 181200 ...
## $ Massachusetts : num [1:310] 196903 197561 198343 199883 201551 ...
## $ Pennsylvania : num [1:310] 98957 99168 99367 99773 100189 ...
## $ New Jersey : num [1:310] 170361 170821 171259 172230 173219 ...
## $ Arizona : num [1:310] 135754 136000 136316 136963 137695 ...
## $ Michigan : num [1:310] 115957 116021 116255 116721 117407 ...
## $ Washington : num [1:310] 174648 175042 175444 176255 177140 ...
## $ Utah : num [1:310] 169263 169356 169502 169867 171191 ...
## $ the District of Columbia: num [1:310] 161758 162288 163038 164648 166434 ...
## $ New Hampshire : num [1:310] 133315 133767 134308 135120 136008 ...
## $ Illinois : num [1:310] 128077 128180 128402 128924 129543 ...
## $ Georgia : num [1:310] 126067 126354 126684 127379 128146 ...
## $ Connecticut : num [1:310] 162193 162811 163456 164549 165448 ...
## $ North Carolina : num [1:310] 128827 129023 129225 129663 130151 ...
## $ Nevada : num [1:310] 154865 154884 155124 155673 156373 ...
## $ Idaho : num [1:310] 124002 123927 123974 124073 124396 ...
## $ Maine : num [1:310] 106572 106926 107243 107940 108597 ...
## $ Minnesota : num [1:310] 124611 125030 125420 126336 127341 ...
## $ Alabama : num [1:310] 100026 100200 100317 100578 100870 ...
## $ Hawaii : num [1:310] 194143 194053 194351 194868 195881 ...
## $ Oregon : num [1:310] 151506 151839 152143 152782 153352 ...
## $ South Carolina : num [1:310] 114514 114767 114973 115445 115886 ...
## $ Missouri : num [1:310] 96716 96829 96999 97387 97888 ...
## $ Maryland : num [1:310] 151575 151554 151661 151920 152463 ...
## $ Vermont : num [1:310] 115574 115768 116072 116669 117357 ...
## $ Tennessee : num [1:310] 112258 112351 112447 112677 112942 ...
## $ Montana : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
## $ Iowa : num [1:310] 90600 90773 90937 91290 91680 ...
## $ West Virginia : num [1:310] 72810 72807 72831 72890 73006 ...
## $ Louisiana : num [1:310] 109297 109508 109672 110041 110433 ...
## $ Rhode Island : num [1:310] 131633 132231 132860 134102 135340 ...
## $ Mississippi : num [1:310] 89088 89039 89037 89101 89414 ...
## $ Indiana : num [1:310] 97563 97429 97278 96820 96599 ...
## $ Nebraska : num [1:310] 115309 115493 115664 115929 116064 ...
## $ Wyoming : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
## $ Ohio : num [1:310] 104765 104829 104926 105211 105647 ...
## $ North Dakota : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
## $ Alaska : num [1:310] 135910 136064 136218 136399 136672 ...
## $ Kansas : num [1:310] 88644 88662 88764 88917 89148 ...
## $ Wisconsin : num [1:310] 120259 120225 120359 120552 120938 ...
## $ Delaware : num [1:310] 148207 148438 148725 149381 150130 ...
## $ New Mexico : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
## $ Arkansas : num [1:310] 89099 89218 89354 89642 89927 ...
## $ Kentucky : num [1:310] 88517 88562 88646 88852 89127 ...
## $ Oklahoma : num [1:310] 83296 83331 83491 83783 84174 ...
## $ South Dakota : num [1:310] 107833 107604 107283 106851 106671 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_date(format = ""),
## .. Florida = col_double(),
## .. California = col_double(),
## .. Virginia = col_double(),
## .. Texas = col_double(),
## .. `New York` = col_double(),
## .. Colorado = col_double(),
## .. Massachusetts = col_double(),
## .. Pennsylvania = col_double(),
## .. `New Jersey` = col_double(),
## .. Arizona = col_double(),
## .. Michigan = col_double(),
## .. Washington = col_double(),
## .. Utah = col_double(),
## .. `the District of Columbia` = col_double(),
## .. `New Hampshire` = col_double(),
## .. Illinois = col_double(),
## .. Georgia = col_double(),
## .. Connecticut = col_double(),
## .. `North Carolina` = col_double(),
## .. Nevada = col_double(),
## .. Idaho = col_double(),
## .. Maine = col_double(),
## .. Minnesota = col_double(),
## .. Alabama = col_double(),
## .. Hawaii = col_double(),
## .. Oregon = col_double(),
## .. `South Carolina` = col_double(),
## .. Missouri = col_double(),
## .. Maryland = col_double(),
## .. Vermont = col_double(),
## .. Tennessee = col_double(),
## .. Montana = col_double(),
## .. Iowa = col_double(),
## .. `West Virginia` = col_double(),
## .. Louisiana = col_double(),
## .. `Rhode Island` = col_double(),
## .. Mississippi = col_double(),
## .. Indiana = col_double(),
## .. Nebraska = col_double(),
## .. Wyoming = col_double(),
## .. Ohio = col_double(),
## .. `North Dakota` = col_double(),
## .. Alaska = col_double(),
## .. Kansas = col_double(),
## .. Wisconsin = col_double(),
## .. Delaware = col_double(),
## .. `New Mexico` = col_double(),
## .. Arkansas = col_double(),
## .. Kentucky = col_double(),
## .. Oklahoma = col_double(),
## .. `South Dakota` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# All Data Numeric (Except Time Index)
# Convert Columns Names To Snake Case
ZHVI <- ZHVI |> clean_names()
# Change Time Index Name
colnames(ZHVI)[1] <- "date"
# Count Null Values
colSums(is.na(ZHVI))
## date florida california
## 0 0 0
## virginia texas new_york
## 0 0 0
## colorado massachusetts pennsylvania
## 0 0 0
## new_jersey arizona michigan
## 0 1 0
## washington utah the_district_of_columbia
## 0 0 0
## new_hampshire illinois georgia
## 0 0 0
## connecticut north_carolina nevada
## 0 0 0
## idaho maine minnesota
## 1 0 0
## alabama hawaii oregon
## 0 0 0
## south_carolina missouri maryland
## 0 0 0
## vermont tennessee montana
## 0 0 61
## iowa west_virginia louisiana
## 0 1 0
## rhode_island mississippi indiana
## 0 0 0
## nebraska wyoming ohio
## 0 27 0
## north_dakota alaska kansas
## 108 1 0
## wisconsin delaware new_mexico
## 0 0 27
## arkansas kentucky oklahoma
## 0 0 0
## south_dakota
## 1
# Some Regions Like North Dakota, New Mexico, and Wyoming Have Significant Null Values. Something To Keep In Mind For Further Analysis.
### Define Regions
regions <- list(
northeast = c(
"new_york","new_jersey","massachusetts","pennsylvania",
"connecticut","rhode_island","vermont","new_hampshire","maine"
),
south = c(
"florida","texas","virginia","north_carolina","south_carolina","georgia",
"maryland","delaware","west_virginia","alabama","mississippi","louisiana",
"tennessee","kentucky","arkansas","oklahoma","the_district_of_columbia"
),
midwest = c(
"illinois","indiana","michigan","ohio","wisconsin","minnesota",
"iowa","missouri","kansas","nebraska","south_dakota","north_dakota"
),
west = c(
"california","colorado","arizona","nevada","utah","washington",
"oregon","idaho","montana","wyoming","new_mexico","alaska","hawaii"
)
)
### Pipeline To Create Dataset With Regional Averages
regional_df <- ZHVI |>
# Convert From Wide To Long Format
pivot_longer(
cols = -date,
names_to = "state",
values_to = "zhvi"
) |>
# Group States By Region
mutate(
region = case_when(
state %in% regions$northeast ~ "Northeast",
state %in% regions$south ~ "South",
state %in% regions$midwest ~ "Midwest",
state %in% regions$west ~ "West",
TRUE ~ NA_character_
)
) |>
# Calculate Avergae Zestimate Per Region For Each Year
group_by(date, region) |>
summarise(
regional_zhvi = mean(zhvi, na.rm = TRUE), # Ignore Null Values
.groups = "drop"
) |>
# Convert Back To Wide Format
pivot_wider(
names_from = region,
values_from = regional_zhvi
)
# Detach Old Dataset
detach(ZHVI)
# Inspect New Data
head(regional_df)
## # A tibble: 6 × 5
## date Midwest Northeast South West
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2000-01-01 108212. 141044. 112938. 160507.
## 2 2000-02-01 108279. 141499. 113098. 160740.
## 3 2000-03-01 108390. 141985. 113293. 161086.
## 4 2000-04-01 108631. 142929. 113721. 161807.
## 5 2000-05-01 108993. 143886. 114217. 162777.
## 6 2000-06-01 109433. 144868. 114757. 163783.
str(regional_df)
## tibble [310 × 5] (S3: tbl_df/tbl/data.frame)
## $ date : Date[1:310], format: "2000-01-01" "2000-02-01" ...
## $ Midwest : num [1:310] 108212 108279 108390 108631 108993 ...
## $ Northeast: num [1:310] 141044 141499 141985 142929 143886 ...
## $ South : num [1:310] 112938 113098 113293 113721 114217 ...
## $ West : num [1:310] 160507 160740 161086 161807 162777 ...
colSums(is.na(regional_df)) # No Null Values
## date Midwest Northeast South West
## 0 0 0 0 0
# Convert To Long Format To Plot
regional_df_long <- regional_df |>
pivot_longer(
cols = -date,
names_to = "regions",
values_to = "zhvi"
)
### Plot Data
ggplot(data = regional_df_long) +
geom_line(mapping = aes(x = date, y = zhvi, color = regions)) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "ZHVI ($)", labels = scales::comma) +
scale_color_discrete(name = "Regions", guide = guide_legend(position = "bottom")) +
ggtitle("Zillow Home Value Index (ZHVI) Across U.S. Regions, 2000 - 2025",
subtitle = "Source: Zillow Home Value Index (ZHVI) – Kaggle (https://www.kaggle.com/datasets/robikscube/zillow-home-value-index)") +
annotate("rect", xmin = as.Date("2020-03-01"), xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.1, color = "grey76") + # Comment This Line To Make Shaded Area Disappear
theme_bw()
## Warning in scale_x_date(name = "Date"): A <numeric> value was passed to a Date scale.
## ℹ The value was converted to a <Date> object.

# Examine Mean Values Pre- And Post-COVID
pre_covid <- regional_df |> filter(date < as.Date("2020-03-01"))
post_covid <- regional_df |> filter(date >= as.Date("2020-03-01"))
### Generate Summary Statistics
summary(pre_covid[, 2:5])
## Midwest Northeast South West
## Min. :108212 Min. :141044 Min. :112938 Min. :160507
## 1st Qu.:131330 1st Qu.:224410 1st Qu.:156373 1st Qu.:209271
## Median :141822 Median :239975 Median :170507 Median :239893
## Mean :143339 Mean :237055 Mean :169395 Mean :244737
## 3rd Qu.:150219 3rd Qu.:266764 3rd Qu.:188109 3rd Qu.:277938
## Max. :189565 Max. :301939 Max. :226418 Max. :357832
summary(post_covid[, 2:5])
## Midwest Northeast South West
## Min. :190670 Min. :303963 Min. :227843 Min. :360304
## 1st Qu.:223091 1st Qu.:367064 1st Qu.:267172 1st Qu.:443934
## Median :245927 Median :407316 Median :298972 Median :494271
## Mean :239593 Mean :400650 Mean :285119 Mean :468519
## 3rd Qu.:260250 3rd Qu.:447695 3rd Qu.:306756 3rd Qu.:504816
## Max. :271580 Max. :465234 Max. :310230 Max. :513018
sd(pre_covid$Midwest)
## [1] 18699.46
sd(pre_covid$Northeast)
## [1] 38485.3
sd(pre_covid$South)
## [1] 28155.61
sd(pre_covid$West)
## [1] 49948.57
sd(post_covid$Midwest)
## [1] 24863.35
sd(post_covid$Northeast)
## [1] 51062.95
sd(post_covid$South)
## [1] 27338.77
sd(post_covid$West)
## [1] 50156.82
### Boxplot
regional_df_long$COVID_19 <- ifelse(regional_df_long$date < as.Date("2020-03-01"), "Pre-COVID", "Post-COVID")
# Convert To Ordered Factor
regional_df_long$COVID_19 <- factor(
regional_df_long$COVID_19,
levels = c("Pre-COVID", "Post-COVID")
)
# Midwest
regional_df_long |>
filter(regions == "Midwest") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "Midwestern Region",
subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
x = "Period",
y = "ZHVI ($)"
) +
theme_bw() +
theme(legend.position = "none")

# Northeast
regional_df_long |>
filter(regions == "Northeast") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "Northeastern Region",
subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
x = "Period",
y = "ZHVI ($)"
) +
theme_bw() +
theme(legend.position = "none")

# South
regional_df_long |>
filter(regions == "South") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "Southern Region",
subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
x = "Period",
y = "ZHVI ($)"
) +
theme_bw() +
theme(legend.position = "none")

# West
regional_df_long |>
filter(regions == "West") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "Western Region",
subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
x = "Period",
y = "ZHVI ($)"
) +
theme_bw() +
theme(legend.position = "none")

### Test Stationarity
# Midwest
midwest_df <- regional_df_long[regional_df_long$regions == "Midwest", ]
midwest_series <- midwest_df$zhvi
# Northeast
northeast_df <- regional_df_long[regional_df_long$regions == "Northeast", ]
northeast_series <- northeast_df$zhvi
# South
south_df <- regional_df_long[regional_df_long$regions == "South", ]
south_series <- south_df$zhvi
# West
west_df <- regional_df_long[regional_df_long$regions == "West", ]
west_series <- west_df$zhvi
# ADF Test
adf_midwest <- ur.df(midwest_series, type = "trend", selectlags = "AIC")
summary(adf_midwest)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1688.2 -108.3 -1.1 92.8 3202.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.070e+01 1.043e+02 0.294 0.769
## z.lag.1 -2.454e-04 9.901e-04 -0.248 0.804
## tt 5.736e-01 4.856e-01 1.181 0.238
## z.diff.lag 8.548e-01 3.035e-02 28.165 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 366.2 on 304 degrees of freedom
## Multiple R-squared: 0.7674, Adjusted R-squared: 0.7651
## F-statistic: 334.3 on 3 and 304 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -0.2479 3.6337 1.7561
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
adf_northeast <- ur.df(northeast_series, type = "trend", selectlags = "AIC")
summary(adf_northeast)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2565.02 -158.37 9.35 136.48 1981.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.995e+02 1.123e+02 1.776 0.0767 .
## z.lag.1 -1.117e-03 6.405e-04 -1.744 0.0823 .
## tt 9.795e-01 5.581e-01 1.755 0.0802 .
## z.diff.lag 9.577e-01 1.759e-02 54.437 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 460.9 on 304 degrees of freedom
## Multiple R-squared: 0.9128, Adjusted R-squared: 0.9119
## F-statistic: 1061 on 3 and 304 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -1.7435 1.9497 1.6633
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
adf_south <- ur.df(south_series, type = "trend", selectlags = "AIC")
summary(adf_south)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8511.4 -173.4 8.1 122.8 4301.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 226.069153 186.310714 1.213 0.226
## z.lag.1 -0.001317 0.001578 -0.835 0.404
## tt 1.113322 0.974357 1.143 0.254
## z.diff.lag 0.772630 0.036691 21.058 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 662.5 on 304 degrees of freedom
## Multiple R-squared: 0.6039, Adjusted R-squared: 0.6
## F-statistic: 154.5 on 3 and 304 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -0.8349 3.8943 0.7453
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
adf_west <- ur.df(west_series, type = "trend", selectlags = "AIC")
summary(adf_west)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12094.5 -351.2 28.5 222.0 7584.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 313.750485 277.781190 1.129 0.260
## z.lag.1 -0.001707 0.001683 -1.014 0.311
## tt 2.801416 1.977836 1.416 0.158
## z.diff.lag 0.779656 0.036067 21.617 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1451 on 304 degrees of freedom
## Multiple R-squared: 0.6197, Adjusted R-squared: 0.616
## F-statistic: 165.1 on 3 and 304 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -1.0144 3.069 1.1174
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
# All regions are non-stationary, so we difference each
midwest_series <- regional_df$Midwest
northeast_series <- regional_df$Northeast
south_series <- regional_df$South
west_series <- regional_df$West
# Difference each series (monthly delta)
midwest_diff <- diff(midwest_series)
northeast_diff <- diff(northeast_series)
south_diff <- diff(south_series)
west_diff <- diff(west_series)
# Confirm Differencing Worked
adf_midwest_diff <- ur.df(midwest_diff, type = "trend", selectlags = "AIC")
summary(adf_midwest_diff)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1762.3 -108.4 -2.0 94.5 3261.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.96934 42.14805 0.165 0.8688
## z.lag.1 -0.15274 0.03107 -4.916 1.45e-06 ***
## tt 0.49393 0.25625 1.928 0.0548 .
## z.diff.lag 0.04246 0.05740 0.740 0.4600
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 366.5 on 303 degrees of freedom
## Multiple R-squared: 0.07494, Adjusted R-squared: 0.06578
## F-statistic: 8.182 on 3 and 303 DF, p-value: 2.969e-05
##
##
## Value of test-statistic is: -4.9161 8.0631 12.0843
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
adf_northeast_diff <- ur.df(northeast_diff, type = "trend", selectlags = "AIC")
summary(adf_northeast_diff)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1344.04 -173.64 -3.24 113.40 1480.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.77070 36.26521 1.014 0.311
## z.lag.1 -0.08262 0.01189 -6.949 2.26e-11 ***
## tt 0.33344 0.20601 1.619 0.107
## z.diff.lag 0.74154 0.03868 19.170 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 311.9 on 303 degrees of freedom
## Multiple R-squared: 0.5588, Adjusted R-squared: 0.5544
## F-statistic: 127.9 on 3 and 303 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -6.9489 16.1009 24.1465
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
adf_south_diff <- ur.df(south_diff, type = "trend", selectlags = "AIC")
summary(adf_south_diff)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6930.4 -128.0 21.1 126.4 4423.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.79199 71.02938 0.828 0.40848
## z.lag.1 -0.13697 0.03583 -3.822 0.00016 ***
## tt 0.17359 0.39884 0.435 0.66371
## z.diff.lag -0.39865 0.05278 -7.553 5.05e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 609.4 on 303 degrees of freedom
## Multiple R-squared: 0.2546, Adjusted R-squared: 0.2472
## F-statistic: 34.49 on 3 and 303 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -3.8224 4.891 7.3363
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
adf_west_diff <- ur.df(west_diff, type = "trend", selectlags = "AIC")
summary(adf_west_diff)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10585.3 -274.8 23.2 251.4 7301.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 68.36159 158.13705 0.432 0.666
## z.lag.1 -0.14709 0.03611 -4.073 5.93e-05 ***
## tt 0.63362 0.90165 0.703 0.483
## z.diff.lag -0.33671 0.05414 -6.219 1.66e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1371 on 303 degrees of freedom
## Multiple R-squared: 0.2116, Adjusted R-squared: 0.2038
## F-statistic: 27.11 on 3 and 303 DF, p-value: 1.472e-15
##
##
## Value of test-statistic is: -4.0729 5.5348 8.3019
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
### Confirm if increase in Zestimates Statistically Significant Pre- and Post-COVID Periods
break_date <- as.Date("2020-03-01")
date_diff <- regional_df$date[-1] # drop first date to match diff length
# Midwest
mid_pre <- midwest_diff[ date_diff < break_date ]
mid_post <- midwest_diff[ date_diff >= break_date ]
# Northeast
ne_pre <- northeast_diff[ date_diff < break_date ]
ne_post <- northeast_diff[ date_diff >= break_date ]
# South
so_pre <- south_diff[ date_diff < break_date ]
so_post <- south_diff[ date_diff >= break_date ]
# West
we_pre <- west_diff[ date_diff < break_date ]
we_post <- west_diff[ date_diff >= break_date ]
# T-tests To Compare Pre- and Post-COVID
t.test(mid_pre, mid_post)
##
## Welch Two Sample t-test
##
## data: mid_pre and mid_post
## t = -6.473, df = 75.06, p-value = 8.846e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1135.829 -601.244
## sample estimates:
## mean of x mean of y
## 337.5636 1206.1003
t.test(ne_pre, ne_post)
##
## Welch Two Sample t-test
##
## data: ne_pre and ne_post
## t = -6.587, df = 77.667, p-value = 4.829e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2257.839 -1209.734
## sample estimates:
## mean of x mean of y
## 667.616 2401.402
t.test(so_pre, so_post)
##
## Welch Two Sample t-test
##
## data: so_pre and so_post
## t = -3.7076, df = 77.825, p-value = 0.0003901
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -1092.4171 -329.0953
## sample estimates:
## mean of x mean of y
## 470.8712 1181.6274
t.test(we_pre, we_post)
##
## Welch Two Sample t-test
##
## data: we_pre and we_post
## t = -2.9748, df = 75.55, p-value = 0.003934
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2256.7037 -446.6315
## sample estimates:
## mean of x mean of y
## 818.7777 2170.4453
# Results confirm at the mean monthly growth rate in home values increased significantly after the onset of COVID-19.
### Plot Differenced Data
# Create Wide Dataframe
differenced_data_df <- data.frame(date_diff, midwest_diff, northeast_diff, south_diff, west_diff)
# Create Long Dataframe
differenced_long <- differenced_data_df |>
pivot_longer(
cols = -date_diff,
names_to = "regions",
values_to = "diff_zhvi"
)
# Time-Series Plot
ggplot(data = differenced_long) +
geom_line(mapping = aes(x = date_diff, y = diff_zhvi, color = regions)) +
scale_x_date(name = "Date") +
scale_y_continuous(name = "Monthly ZHVI Change ($)", labels = scales::comma) +
scale_color_discrete(name = "Regions", guide = guide_legend(position = "bottom")) +
ggtitle("Differenced ZHVI Across U.S. Regions, 2000 - 2025",
subtitle = "Source: Zillow Home Value Index (ZHVI) – Kaggle (https://www.kaggle.com/datasets/robikscube/zillow-home-value-index)") +
annotate("rect", xmin = as.Date("2020-03-01"), xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.1, color = "grey76") + # Comment This Line To Make Shaded Area Disappear
theme_bw()
## Warning in scale_x_date(name = "Date"): A <numeric> value was passed to a Date scale.
## ℹ The value was converted to a <Date> object.

# Boxplots
differenced_long$COVID_19 <- ifelse(differenced_long$date_diff < as.Date("2020-03-01"), "Pre-COVID", "Post-COVID")
# Convert To Ordered Factor
differenced_long$COVID_19 <- factor(
differenced_long$COVID_19,
levels = c("Pre-COVID", "Post-COVID")
)
# Midwest
differenced_long |>
filter(regions == "midwest_diff") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "Midwest: Change in Monthly Home Values",
subtitle = "Differenced ZHVI (Pre vs Post COVID)",
x = "Period",
y = "Monthly ZHVI Change"
) +
theme_bw() +
theme(legend.position = "none")

# Northeast
differenced_long |>
filter(regions == "northeast_diff") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "Northeast: Change in Monthly Home Values",
subtitle = "Differenced ZHVI (Pre vs Post COVID)",
x = "Period",
y = "Monthly ZHVI Change"
) +
theme_bw() +
theme(legend.position = "none")

# South
differenced_long |>
filter(regions == "south_diff") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "South: Change in Monthly Home Values",
subtitle = "Differenced ZHVI (Pre vs Post COVID)",
x = "Period",
y = "Monthly ZHVI Change"
) +
theme_bw() +
theme(legend.position = "none")

# West
differenced_long |>
filter(regions == "west_diff") |>
ggplot() +
geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
scale_y_continuous(labels = scales::comma) +
scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
labs(
title = "West: Change in Monthly Home Values",
subtitle = "Differenced ZHVI (Pre vs Post COVID)",
x = "Period",
y = "Monthly ZHVI Change"
) +
theme_bw() +
theme(legend.position = "none")

# Statistical Summary
pre_covid_diff <- differenced_data_df |> filter(date_diff < as.Date("2020-03-01"))
post_covid_diff <- differenced_data_df |> filter(date_diff >= as.Date("2020-03-01"))
# Pre-COVID
summary(pre_covid_diff[, 2:5])
## midwest_diff northeast_diff south_diff west_diff
## Min. :-1113.0 Min. :-1937.08 Min. :-5733.4 Min. :-6987.5
## 1st Qu.: 132.7 1st Qu.: -51.33 1st Qu.: 194.9 1st Qu.: 159.6
## Median : 506.5 Median : 827.98 Median : 617.0 Median : 1117.5
## Mean : 337.6 Mean : 667.62 Mean : 470.9 Mean : 818.8
## 3rd Qu.: 661.3 3rd Qu.: 1405.68 3rd Qu.: 867.4 3rd Qu.: 1764.7
## Max. : 2335.6 Max. : 2681.90 Max. : 3462.4 Max. :10441.7
sd(pre_covid_diff$midwest_diff)
## [1] 491.4823
sd(pre_covid_diff$northeast_diff)
## [1] 1096.314
sd(pre_covid_diff$south_diff)
## [1] 803.7902
sd(pre_covid_diff$west_diff)
## [1] 1710.409
# Post-COVID
summary(post_covid_diff[, 2:5])
## midwest_diff northeast_diff south_diff west_diff
## Min. :-850.3 Min. :-1856.3 Min. :-1009.7 Min. :-4977.0
## 1st Qu.: 406.3 1st Qu.: 820.4 1st Qu.: 51.7 1st Qu.: 245.9
## Median :1029.9 Median : 1851.2 Median : 792.8 Median : 1380.3
## Mean :1206.1 Mean : 2401.4 Mean : 1181.6 Mean : 2170.4
## 3rd Qu.:1983.6 3rd Qu.: 3811.9 3rd Qu.: 2490.7 3rd Qu.: 4945.2
## Max. :3810.2 Max. : 7175.5 Max. : 4736.1 Max. :10197.3
sd(post_covid_diff$midwest_diff)
## [1] 1075.219
sd(post_covid_diff$northeast_diff)
## [1] 2090.93
sd(post_covid_diff$south_diff)
## [1] 1522.056
sd(post_covid_diff$west_diff)
## [1] 3634.978
###### Question 2 ######
# National Average ZHVI For Each Month
national_df <- ZHVI |>
mutate(national_zhvi = rowMeans(across(-date), na.rm = TRUE)) |>
dplyr::select(date, national_zhvi)
# Difference For Stationarity
national_df <- national_df |>
mutate(d_zhvi = national_zhvi - lag(national_zhvi)) |>
filter(!is.na(d_zhvi))
# Remove Last Month So All Series Have Same Time Period
national_df <- national_df[-309,]
# Pull FRED Data
getSymbols(c("MORTGAGE30US", "UNRATE", "CPIAUCSL", "FEDFUNDS"), src = "FRED")
## [1] "MORTGAGE30US" "UNRATE" "CPIAUCSL" "FEDFUNDS"
# Ensure ALl Series Have Same Start/End Date
common_dates <- seq(as.Date("2000-02-01"), as.Date("2025-09-01"), by = "month")
# Common Date For Differenced Series
common_dates_differenced <- seq(as.Date("2000-01-01"), as.Date("2025-09-01"), by = "month")
MORT_monthly <- to.monthly(MORTGAGE30US, indexAt = "firstof", OHLC = FALSE)
MORT_monthly <- MORT_monthly[common_dates]
UNRATE_monthly <- to.monthly(UNRATE, indexAt = "firstof", OHLC = FALSE)
UNRATE_monthly <- UNRATE_monthly[common_dates]
CPI_monthly <- to.monthly(CPIAUCSL, indexAt = "firstof", OHLC = FALSE)
CPI_monthly <- CPI_monthly[common_dates_differenced]
FEDFUNDS_monthly <- to.monthly(FEDFUNDS, indexAt = "firstof", OHLC = FALSE)
FEDFUNDS_monthly <- FEDFUNDS_monthly[common_dates]
# Difference Inflation (Non-Stationary)
cpi_values <- as.numeric(CPI_monthly$CPIAUCSL)
inflation_diff <- diff(cpi_values)
# Create Dataframe For Regressors
regressors_df <- data.frame(
date = common_dates,
mortgage_rate = as.numeric(MORT_monthly$MORTGAGE30US),
unemployment_rate = as.numeric(UNRATE_monthly$UNRATE),
inflation_rate = inflation_diff,
fed_funds_rate = as.numeric(FEDFUNDS_monthly$FEDFUNDS)
)
# Merge With national_df Dataset
df <- national_df |>
inner_join(regressors_df, by = "date")
# Ensure No Null Values
colSums(is.na(df))
## date national_zhvi d_zhvi mortgage_rate
## 0 0 0 0
## unemployment_rate inflation_rate fed_funds_rate
## 0 0 0
# Create Regression
model <- lm(df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate + df$inflation_rate + df$fed_funds_rate, data = df)
summary_model <- summary(model)
summary_model
##
## Call:
## lm(formula = df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate +
## df$inflation_rate + df$fed_funds_rate, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3365.3 -665.6 -132.2 618.6 4443.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3895.12 402.00 9.689 < 2e-16 ***
## df$mortgage_rate -313.41 80.35 -3.900 0.000118 ***
## df$unemployment_rate -275.92 42.52 -6.489 3.52e-10 ***
## df$inflation_rate 446.01 89.59 4.978 1.08e-06 ***
## df$fed_funds_rate -61.97 64.01 -0.968 0.333758
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1107 on 303 degrees of freedom
## Multiple R-squared: 0.2904, Adjusted R-squared: 0.281
## F-statistic: 30.99 on 4 and 303 DF, p-value: < 2.2e-16
# Multicolinearlity?
vif(model)
## df$mortgage_rate df$unemployment_rate df$inflation_rate
## 3.130566 1.720394 1.049248
## df$fed_funds_rate
## 4.213044
# Correlation Matrix
numeric_vars <- df[, c("mortgage_rate", "unemployment_rate", "inflation_rate", "fed_funds_rate")]
cor_matrix <- cor(numeric_vars)
cor_matrix
## mortgage_rate unemployment_rate inflation_rate fed_funds_rate
## mortgage_rate 1.000000000 -0.3698296 0.007063664 0.81116953
## unemployment_rate -0.369829597 1.0000000 -0.203309373 -0.60046869
## inflation_rate 0.007063664 -0.2033094 1.000000000 0.08289704
## fed_funds_rate 0.811169530 -0.6004687 0.082897035 1.00000000
# Scatterplot of Fitted Vs. Actual
ggplot(data = df, mapping = aes(x = model$fitted.values, y = d_zhvi)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue4") +
labs(
title = "Scatterplot of Fitted and Actual Values of Differenced ZHVI",
x = "Fitted Values",
y = "Actual Values"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# Scatterplot of Residuals and Fitted Values
ggplot(data = df, mapping = aes(x = model$fitted.values, y = model$residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red4") +
labs(
title = "Scatterplot of Fitted and Residual Values of Differenced ZHVI",
x = "Fitted Values",
y = "Residual Values"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# Q-Q Plot
ggplot(mapping = aes(sample = model$residuals)) +
stat_qq() +
stat_qq_line(color = "blue4") +
theme_bw() +
labs(
title = "Q-Q Plot of Residuals",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
)

# Histogram Of Residuals
ggplot() +
geom_histogram(aes(x = model$residuals), bins = 30, fill = "steelblue", color = "white") +
theme_bw() +
labs(
title = "Histogram of Regression Residuals",
x = "Residuals",
y = "Count"
)

### Model 2
getSymbols(c("PERMIT", "HOUST", "DSPIC96", "UMCSENT"), src = "FRED")
## [1] "PERMIT" "HOUST" "DSPIC96" "UMCSENT"
# Test stationarity
# # Building Permits
# permit_monthly <- to.monthly(PERMIT, indexAt = "firstof", OHLC = FALSE)
# permit_monthly <- permit_monthly[common_dates]
# permit_series <- as.numeric(permit_monthly$PERMIT)
# adf_permit <- ur.df(permit_series, type = "trend", selectlags = "AIC")
# summary(adf_permit)
#
# # Housing Starts
# houst_monthly <- to.monthly(HOUST, indexAt = "firstof", OHLC = FALSE)
# houst_monthly <- houst_monthly[common_dates]
# houst_series <- as.numeric(houst_monthly$HOUST)
# adf_houst <- ur.df(houst_series, type = "trend", selectlags = "AIC")
# summary(adf_houst)
#
# # Disposable Income
# income_monthly <- to.monthly(DSPIC96, indexAt = "firstof", OHLC = FALSE)
# income_monthly <- income_monthly[common_dates]
# income_series <- as.numeric(income_monthly$DSPIC96)
# adf_income <- ur.df(income_series, type = "trend", selectlags = "AIC")
# summary(adf_income)
#
# # Consumer Sentiment
# sent_monthly <- to.monthly(UMCSENT, indexAt = "firstof", OHLC = FALSE)
# sent_monthly <- sent_monthly[common_dates]
# sent_series <- as.numeric(sent_monthly$UMCSENT)
# adf_sent <- ur.df(sent_series, type = "trend", selectlags = "AIC")
# summary(adf_sent)
# Ensure Same Timeframe
common_dates_diff_2 <- seq(as.Date("2000-01-01"), as.Date("2025-08-01"), by = "month")
PERMIT_monthly <- to.monthly(PERMIT, indexAt = "firstof", OHLC = FALSE)
PERMIT_monthly <- PERMIT_monthly[common_dates_diff_2]
HOUST_monthly <- to.monthly(HOUST, indexAt = "firstof", OHLC = FALSE)
HOUST_monthly <- HOUST_monthly[common_dates_diff_2]
DSPIC96_monthly <- to.monthly(DSPIC96, indexAt = "firstof", OHLC = FALSE)
DSPIC96_monthly <- DSPIC96_monthly[common_dates_diff_2]
UMCSENT_monthly <- to.monthly(UMCSENT, indexAt = "firstof", OHLC = FALSE)
## Warning in to.period(x, "months", indexAt = indexAt, name = name, ...): missing
## values removed from data
UMCSENT_monthly <- UMCSENT_monthly[common_dates_diff_2]
# Difference Series
permit_values <- as.numeric(PERMIT_monthly$PERMIT)
permit_diff <- diff(permit_values)
houst_values <- as.numeric(HOUST_monthly$HOUST)
houst_diff <- diff(houst_values)
dspic96_values <- as.numeric(DSPIC96_monthly$DSPIC96)
dspic96_diff <- diff(dspic96_values)
umcsent_values <- as.numeric(UMCSENT_monthly$UMCSENT)
umcsent_diff <- diff(umcsent_values)
common_dates2 <- seq(as.Date("2000-02-01"), as.Date("2025-08-01"), by = "month")
# Create Dataframe For Regressors
regressors_df_2 <- data.frame(
date = common_dates2,
building_permits = permit_diff,
housing_starts = houst_diff,
disposible_inc = dspic96_diff,
consumer_sent = umcsent_diff
)
# Make Original Dataset 1 Unit Shorter To Match Dimensions
df <- df[-308, ]
# Join Datasets Again
df <- df |>
inner_join(regressors_df_2, by = "date")
# Drop 2nd Column For Regression
df <- df[, -2]
# Best Subset Regression
n <- ncol(df) - 2
regfit.full <- regsubsets(df$d_zhvi ~ . - date, data = df, nvmax = n)
reg.summary <- summary(regfit.full)
reg.summary
## Subset selection object
## Call: regsubsets.formula(df$d_zhvi ~ . - date, data = df, nvmax = n)
## 8 Variables (and intercept)
## Forced in Forced out
## mortgage_rate FALSE FALSE
## unemployment_rate FALSE FALSE
## inflation_rate FALSE FALSE
## fed_funds_rate FALSE FALSE
## building_permits FALSE FALSE
## housing_starts FALSE FALSE
## disposible_inc FALSE FALSE
## consumer_sent FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## mortgage_rate unemployment_rate inflation_rate fed_funds_rate
## 1 ( 1 ) " " " " "*" " "
## 2 ( 1 ) "*" "*" " " " "
## 3 ( 1 ) "*" "*" "*" " "
## 4 ( 1 ) "*" "*" "*" " "
## 5 ( 1 ) "*" "*" "*" "*"
## 6 ( 1 ) "*" "*" "*" "*"
## 7 ( 1 ) "*" "*" "*" "*"
## 8 ( 1 ) "*" "*" "*" "*"
## building_permits housing_starts disposible_inc consumer_sent
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " "*"
## 5 ( 1 ) " " " " " " "*"
## 6 ( 1 ) " " " " "*" "*"
## 7 ( 1 ) "*" " " "*" "*"
## 8 ( 1 ) "*" "*" "*" "*"
# Best Model By Adj. R^2
best_adj_index <- which.max(reg.summary$adjr2)
best_adj_index
## [1] 5
coef(regfit.full, best_adj_index)
## (Intercept) mortgage_rate unemployment_rate inflation_rate
## 3913.62903 -311.67494 -279.00114 437.02777
## fed_funds_rate consumer_sent
## -66.40291 -26.32595
# Regression
best_subset_model <- lm(df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate + df$inflation_rate + df$fed_funds_rate + df$consumer_sent, data = df)
summary(best_subset_model)
##
## Call:
## lm(formula = df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate +
## df$inflation_rate + df$fed_funds_rate + df$consumer_sent,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3351.5 -711.7 -128.6 638.0 4591.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3913.63 401.22 9.754 < 2e-16 ***
## df$mortgage_rate -311.67 80.18 -3.887 0.000125 ***
## df$unemployment_rate -279.00 42.47 -6.570 2.21e-10 ***
## df$inflation_rate 437.03 89.62 4.876 1.75e-06 ***
## df$fed_funds_rate -66.40 63.98 -1.038 0.300184
## df$consumer_sent -26.33 14.63 -1.799 0.072974 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1105 on 301 degrees of freedom
## Multiple R-squared: 0.2981, Adjusted R-squared: 0.2865
## F-statistic: 25.57 on 5 and 301 DF, p-value: < 2.2e-16
# VIF
vif(best_subset_model)
## df$mortgage_rate df$unemployment_rate df$inflation_rate
## 3.124940 1.721345 1.053137
## df$fed_funds_rate df$consumer_sent
## 4.212157 1.007587
# Correlation Matrix
numeric_vars <- df[, c("mortgage_rate", "unemployment_rate", "inflation_rate", "fed_funds_rate", "consumer_sent")]
cor_matrix <- cor(numeric_vars)
cor_matrix
## mortgage_rate unemployment_rate inflation_rate fed_funds_rate
## mortgage_rate 1.000000000 -0.368785374 0.005273014 0.81075384
## unemployment_rate -0.368785374 1.000000000 -0.202145527 -0.59974475
## inflation_rate 0.005273014 -0.202145527 1.000000000 0.08061768
## fed_funds_rate 0.810753843 -0.599744750 0.080617681 1.00000000
## consumer_sent -0.033783811 0.006568184 -0.061433720 -0.04884186
## consumer_sent
## mortgage_rate -0.033783811
## unemployment_rate 0.006568184
## inflation_rate -0.061433720
## fed_funds_rate -0.048841864
## consumer_sent 1.000000000
# Scatterplot of Fitted Vs. Actual
ggplot(data = df, mapping = aes(x = best_subset_model$fitted.values, y = d_zhvi)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue4") +
labs(
title = "Scatterplot of Fitted and Actual Values of Differenced ZHVI",
subtitle = "Subset Selection",
x = "Fitted Values",
y = "Actual Values"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# Scatterplot of Residuals and Fitted Values
ggplot(data = df, mapping = aes(x = best_subset_model$fitted.values, y = best_subset_model$residuals)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red4") +
labs(
title = "Scatterplot of Fitted and Residual Values of Differenced ZHVI",
subtitle = "Subset Selection",
x = "Fitted Values",
y = "Residual Values"
) +
theme_bw()
## `geom_smooth()` using formula = 'y ~ x'

# Q-Q Plot
ggplot(mapping = aes(sample = best_subset_model$residuals)) +
stat_qq() +
stat_qq_line(color = "blue4") +
theme_bw() +
labs(
title = "Q-Q Plot of Residuals",
subtitle = "Subset Selection",
x = "Theoretical Quantiles",
y = "Sample Quantiles"
)

# Histogram Of Residuals
ggplot() +
geom_histogram(aes(x = best_subset_model$residuals), bins = 30, fill = "steelblue", color = "white") +
theme_bw() +
labs(
title = "Histogram of Regression Residuals",
subtitle = "Subset Selection",
x = "Residuals",
y = "Count"
)
