BUDT730-Final-Project.R

# Libraries
library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(urca)
library(lubridate)
library(quantmod)

## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(ISLR2)
library(leaps)

# Set Working Directory
setwd("~/BUDT730")

# Load ZHVI Data
load("~/BUDT730/ZHVI.RData")
attach(ZHVI)

## The following object is masked from package:carData:
## 
##     Florida

###### Question 1 ######

# See Datatypes For Each Column
str(ZHVI)

## spc_tbl_ [310 × 52] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1                    : Date[1:310], format: "2000-01-01" "2000-02-01" ...
##  $ Florida                 : num [1:310] 107862 108096 108379 108955 109575 ...
##  $ California              : num [1:310] 188374 189007 189862 191718 193875 ...
##  $ Virginia                : num [1:310] 124133 124454 124795 125438 126059 ...
##  $ Texas                   : num [1:310] 112619 112680 112710 112857 112952 ...
##  $ New York                : num [1:310] 153885 154433 154961 156095 157269 ...
##  $ Colorado                : num [1:310] 176602 177229 177922 179471 181200 ...
##  $ Massachusetts           : num [1:310] 196903 197561 198343 199883 201551 ...
##  $ Pennsylvania            : num [1:310] 98957 99168 99367 99773 100189 ...
##  $ New Jersey              : num [1:310] 170361 170821 171259 172230 173219 ...
##  $ Arizona                 : num [1:310] 135754 136000 136316 136963 137695 ...
##  $ Michigan                : num [1:310] 115957 116021 116255 116721 117407 ...
##  $ Washington              : num [1:310] 174648 175042 175444 176255 177140 ...
##  $ Utah                    : num [1:310] 169263 169356 169502 169867 171191 ...
##  $ the District of Columbia: num [1:310] 161758 162288 163038 164648 166434 ...
##  $ New Hampshire           : num [1:310] 133315 133767 134308 135120 136008 ...
##  $ Illinois                : num [1:310] 128077 128180 128402 128924 129543 ...
##  $ Georgia                 : num [1:310] 126067 126354 126684 127379 128146 ...
##  $ Connecticut             : num [1:310] 162193 162811 163456 164549 165448 ...
##  $ North Carolina          : num [1:310] 128827 129023 129225 129663 130151 ...
##  $ Nevada                  : num [1:310] 154865 154884 155124 155673 156373 ...
##  $ Idaho                   : num [1:310] 124002 123927 123974 124073 124396 ...
##  $ Maine                   : num [1:310] 106572 106926 107243 107940 108597 ...
##  $ Minnesota               : num [1:310] 124611 125030 125420 126336 127341 ...
##  $ Alabama                 : num [1:310] 100026 100200 100317 100578 100870 ...
##  $ Hawaii                  : num [1:310] 194143 194053 194351 194868 195881 ...
##  $ Oregon                  : num [1:310] 151506 151839 152143 152782 153352 ...
##  $ South Carolina          : num [1:310] 114514 114767 114973 115445 115886 ...
##  $ Missouri                : num [1:310] 96716 96829 96999 97387 97888 ...
##  $ Maryland                : num [1:310] 151575 151554 151661 151920 152463 ...
##  $ Vermont                 : num [1:310] 115574 115768 116072 116669 117357 ...
##  $ Tennessee               : num [1:310] 112258 112351 112447 112677 112942 ...
##  $ Montana                 : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
##  $ Iowa                    : num [1:310] 90600 90773 90937 91290 91680 ...
##  $ West Virginia           : num [1:310] 72810 72807 72831 72890 73006 ...
##  $ Louisiana               : num [1:310] 109297 109508 109672 110041 110433 ...
##  $ Rhode Island            : num [1:310] 131633 132231 132860 134102 135340 ...
##  $ Mississippi             : num [1:310] 89088 89039 89037 89101 89414 ...
##  $ Indiana                 : num [1:310] 97563 97429 97278 96820 96599 ...
##  $ Nebraska                : num [1:310] 115309 115493 115664 115929 116064 ...
##  $ Wyoming                 : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
##  $ Ohio                    : num [1:310] 104765 104829 104926 105211 105647 ...
##  $ North Dakota            : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
##  $ Alaska                  : num [1:310] 135910 136064 136218 136399 136672 ...
##  $ Kansas                  : num [1:310] 88644 88662 88764 88917 89148 ...
##  $ Wisconsin               : num [1:310] 120259 120225 120359 120552 120938 ...
##  $ Delaware                : num [1:310] 148207 148438 148725 149381 150130 ...
##  $ New Mexico              : num [1:310] NA NA NA NA NA NA NA NA NA NA ...
##  $ Arkansas                : num [1:310] 89099 89218 89354 89642 89927 ...
##  $ Kentucky                : num [1:310] 88517 88562 88646 88852 89127 ...
##  $ Oklahoma                : num [1:310] 83296 83331 83491 83783 84174 ...
##  $ South Dakota            : num [1:310] 107833 107604 107283 106851 106671 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_date(format = ""),
##   ..   Florida = col_double(),
##   ..   California = col_double(),
##   ..   Virginia = col_double(),
##   ..   Texas = col_double(),
##   ..   `New York` = col_double(),
##   ..   Colorado = col_double(),
##   ..   Massachusetts = col_double(),
##   ..   Pennsylvania = col_double(),
##   ..   `New Jersey` = col_double(),
##   ..   Arizona = col_double(),
##   ..   Michigan = col_double(),
##   ..   Washington = col_double(),
##   ..   Utah = col_double(),
##   ..   `the District of Columbia` = col_double(),
##   ..   `New Hampshire` = col_double(),
##   ..   Illinois = col_double(),
##   ..   Georgia = col_double(),
##   ..   Connecticut = col_double(),
##   ..   `North Carolina` = col_double(),
##   ..   Nevada = col_double(),
##   ..   Idaho = col_double(),
##   ..   Maine = col_double(),
##   ..   Minnesota = col_double(),
##   ..   Alabama = col_double(),
##   ..   Hawaii = col_double(),
##   ..   Oregon = col_double(),
##   ..   `South Carolina` = col_double(),
##   ..   Missouri = col_double(),
##   ..   Maryland = col_double(),
##   ..   Vermont = col_double(),
##   ..   Tennessee = col_double(),
##   ..   Montana = col_double(),
##   ..   Iowa = col_double(),
##   ..   `West Virginia` = col_double(),
##   ..   Louisiana = col_double(),
##   ..   `Rhode Island` = col_double(),
##   ..   Mississippi = col_double(),
##   ..   Indiana = col_double(),
##   ..   Nebraska = col_double(),
##   ..   Wyoming = col_double(),
##   ..   Ohio = col_double(),
##   ..   `North Dakota` = col_double(),
##   ..   Alaska = col_double(),
##   ..   Kansas = col_double(),
##   ..   Wisconsin = col_double(),
##   ..   Delaware = col_double(),
##   ..   `New Mexico` = col_double(),
##   ..   Arkansas = col_double(),
##   ..   Kentucky = col_double(),
##   ..   Oklahoma = col_double(),
##   ..   `South Dakota` = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# All Data Numeric (Except Time Index)

# Convert Columns Names To Snake Case
ZHVI <- ZHVI |> clean_names()

# Change Time Index Name
colnames(ZHVI)[1] <- "date"

# Count Null Values
colSums(is.na(ZHVI))

##                     date                  florida               california 
##                        0                        0                        0 
##                 virginia                    texas                 new_york 
##                        0                        0                        0 
##                 colorado            massachusetts             pennsylvania 
##                        0                        0                        0 
##               new_jersey                  arizona                 michigan 
##                        0                        1                        0 
##               washington                     utah the_district_of_columbia 
##                        0                        0                        0 
##            new_hampshire                 illinois                  georgia 
##                        0                        0                        0 
##              connecticut           north_carolina                   nevada 
##                        0                        0                        0 
##                    idaho                    maine                minnesota 
##                        1                        0                        0 
##                  alabama                   hawaii                   oregon 
##                        0                        0                        0 
##           south_carolina                 missouri                 maryland 
##                        0                        0                        0 
##                  vermont                tennessee                  montana 
##                        0                        0                       61 
##                     iowa            west_virginia                louisiana 
##                        0                        1                        0 
##             rhode_island              mississippi                  indiana 
##                        0                        0                        0 
##                 nebraska                  wyoming                     ohio 
##                        0                       27                        0 
##             north_dakota                   alaska                   kansas 
##                      108                        1                        0 
##                wisconsin                 delaware               new_mexico 
##                        0                        0                       27 
##                 arkansas                 kentucky                 oklahoma 
##                        0                        0                        0 
##             south_dakota 
##                        1

# Some Regions Like North Dakota, New Mexico, and Wyoming Have Significant Null Values. Something To Keep In Mind For Further Analysis.

### Define Regions
regions <- list(
  northeast = c(
    "new_york","new_jersey","massachusetts","pennsylvania",
    "connecticut","rhode_island","vermont","new_hampshire","maine"
  ),
  south = c(
    "florida","texas","virginia","north_carolina","south_carolina","georgia",
    "maryland","delaware","west_virginia","alabama","mississippi","louisiana",
    "tennessee","kentucky","arkansas","oklahoma","the_district_of_columbia"
  ),
  midwest = c(
    "illinois","indiana","michigan","ohio","wisconsin","minnesota",
    "iowa","missouri","kansas","nebraska","south_dakota","north_dakota"
  ),
  west = c(
    "california","colorado","arizona","nevada","utah","washington",
    "oregon","idaho","montana","wyoming","new_mexico","alaska","hawaii"
  )
)

### Pipeline To Create Dataset With Regional Averages
regional_df <- ZHVI |>
  # Convert From Wide To Long Format
  pivot_longer(
    cols = -date,
    names_to = "state",
    values_to = "zhvi"
  ) |>
  # Group States By Region
  mutate(
    region = case_when(
      state %in% regions$northeast ~ "Northeast",
      state %in% regions$south ~ "South",
      state %in% regions$midwest ~ "Midwest",
      state %in% regions$west ~ "West",
      TRUE ~ NA_character_
    )
  ) |>
  # Calculate Avergae Zestimate Per Region For Each Year
  group_by(date, region) |>
  summarise(
    regional_zhvi = mean(zhvi, na.rm = TRUE), # Ignore Null Values
    .groups = "drop"
  ) |>
  # Convert Back To Wide Format
  pivot_wider(
    names_from = region,
    values_from = regional_zhvi
  )

# Detach Old Dataset
detach(ZHVI)

# Inspect New Data
head(regional_df)

## # A tibble: 6 × 5
##   date       Midwest Northeast   South    West
##   <date>       <dbl>     <dbl>   <dbl>   <dbl>
## 1 2000-01-01 108212.   141044. 112938. 160507.
## 2 2000-02-01 108279.   141499. 113098. 160740.
## 3 2000-03-01 108390.   141985. 113293. 161086.
## 4 2000-04-01 108631.   142929. 113721. 161807.
## 5 2000-05-01 108993.   143886. 114217. 162777.
## 6 2000-06-01 109433.   144868. 114757. 163783.

str(regional_df)

## tibble [310 × 5] (S3: tbl_df/tbl/data.frame)
##  $ date     : Date[1:310], format: "2000-01-01" "2000-02-01" ...
##  $ Midwest  : num [1:310] 108212 108279 108390 108631 108993 ...
##  $ Northeast: num [1:310] 141044 141499 141985 142929 143886 ...
##  $ South    : num [1:310] 112938 113098 113293 113721 114217 ...
##  $ West     : num [1:310] 160507 160740 161086 161807 162777 ...

colSums(is.na(regional_df)) # No Null Values

##      date   Midwest Northeast     South      West 
##         0         0         0         0         0

# Convert To Long Format To Plot
regional_df_long <- regional_df |>
  pivot_longer(
    cols = -date,
    names_to = "regions",
    values_to = "zhvi"
  )

### Plot Data
ggplot(data = regional_df_long) +
  geom_line(mapping = aes(x = date, y = zhvi, color = regions)) +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "ZHVI ($)", labels = scales::comma) +
  scale_color_discrete(name = "Regions", guide = guide_legend(position = "bottom")) +
  ggtitle("Zillow Home Value Index (ZHVI) Across U.S. Regions, 2000 - 2025",
          subtitle = "Source: Zillow Home Value Index (ZHVI) – Kaggle (https://www.kaggle.com/datasets/robikscube/zillow-home-value-index)") +
  annotate("rect", xmin = as.Date("2020-03-01"), xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.1, color = "grey76") + # Comment This Line To Make Shaded Area Disappear
  theme_bw()

## Warning in scale_x_date(name = "Date"): A <numeric> value was passed to a Date scale.
## ℹ The value was converted to a <Date> object.

# Examine Mean Values Pre- And Post-COVID
pre_covid <- regional_df |> filter(date < as.Date("2020-03-01"))
post_covid <- regional_df |> filter(date >= as.Date("2020-03-01"))

### Generate Summary Statistics
summary(pre_covid[, 2:5])

##     Midwest         Northeast          South             West       
##  Min.   :108212   Min.   :141044   Min.   :112938   Min.   :160507  
##  1st Qu.:131330   1st Qu.:224410   1st Qu.:156373   1st Qu.:209271  
##  Median :141822   Median :239975   Median :170507   Median :239893  
##  Mean   :143339   Mean   :237055   Mean   :169395   Mean   :244737  
##  3rd Qu.:150219   3rd Qu.:266764   3rd Qu.:188109   3rd Qu.:277938  
##  Max.   :189565   Max.   :301939   Max.   :226418   Max.   :357832

summary(post_covid[, 2:5])

##     Midwest         Northeast          South             West       
##  Min.   :190670   Min.   :303963   Min.   :227843   Min.   :360304  
##  1st Qu.:223091   1st Qu.:367064   1st Qu.:267172   1st Qu.:443934  
##  Median :245927   Median :407316   Median :298972   Median :494271  
##  Mean   :239593   Mean   :400650   Mean   :285119   Mean   :468519  
##  3rd Qu.:260250   3rd Qu.:447695   3rd Qu.:306756   3rd Qu.:504816  
##  Max.   :271580   Max.   :465234   Max.   :310230   Max.   :513018

sd(pre_covid$Midwest)

## [1] 18699.46

sd(pre_covid$Northeast)

## [1] 38485.3

sd(pre_covid$South)

## [1] 28155.61

sd(pre_covid$West)

## [1] 49948.57

sd(post_covid$Midwest)

## [1] 24863.35

sd(post_covid$Northeast)

## [1] 51062.95

sd(post_covid$South)

## [1] 27338.77

sd(post_covid$West)

## [1] 50156.82

### Boxplot
regional_df_long$COVID_19 <- ifelse(regional_df_long$date < as.Date("2020-03-01"), "Pre-COVID", "Post-COVID")

# Convert To Ordered Factor
regional_df_long$COVID_19 <- factor(
  regional_df_long$COVID_19,
  levels = c("Pre-COVID", "Post-COVID")
)

# Midwest
regional_df_long |>
  filter(regions == "Midwest") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "Midwestern Region",
    subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
    x = "Period",
    y = "ZHVI ($)"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Northeast
regional_df_long |>
  filter(regions == "Northeast") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "Northeastern Region",
    subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
    x = "Period",
    y = "ZHVI ($)"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# South
regional_df_long |>
  filter(regions == "South") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "Southern Region",
    subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
    x = "Period",
    y = "ZHVI ($)"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# West
regional_df_long |>
  filter(regions == "West") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "Western Region",
    subtitle = "ZHVI Statistical Summary: Pre- And Post-COVID",
    x = "Period",
    y = "ZHVI ($)"
  ) +
  theme_bw() +
  theme(legend.position = "none")

### Test Stationarity

# Midwest
midwest_df <- regional_df_long[regional_df_long$regions == "Midwest", ]
midwest_series <- midwest_df$zhvi

# Northeast
northeast_df <- regional_df_long[regional_df_long$regions == "Northeast", ]
northeast_series <- northeast_df$zhvi

# South
south_df <- regional_df_long[regional_df_long$regions == "South", ]
south_series <- south_df$zhvi

# West
west_df <- regional_df_long[regional_df_long$regions == "West", ]
west_series <- west_df$zhvi

# ADF Test
adf_midwest <- ur.df(midwest_series, type = "trend", selectlags = "AIC")
summary(adf_midwest)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1688.2  -108.3    -1.1    92.8  3202.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.070e+01  1.043e+02   0.294    0.769    
## z.lag.1     -2.454e-04  9.901e-04  -0.248    0.804    
## tt           5.736e-01  4.856e-01   1.181    0.238    
## z.diff.lag   8.548e-01  3.035e-02  28.165   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 366.2 on 304 degrees of freedom
## Multiple R-squared:  0.7674, Adjusted R-squared:  0.7651 
## F-statistic: 334.3 on 3 and 304 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -0.2479 3.6337 1.7561 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

adf_northeast <- ur.df(northeast_series, type = "trend", selectlags = "AIC")
summary(adf_northeast)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2565.02  -158.37     9.35   136.48  1981.57 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.995e+02  1.123e+02   1.776   0.0767 .  
## z.lag.1     -1.117e-03  6.405e-04  -1.744   0.0823 .  
## tt           9.795e-01  5.581e-01   1.755   0.0802 .  
## z.diff.lag   9.577e-01  1.759e-02  54.437   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 460.9 on 304 degrees of freedom
## Multiple R-squared:  0.9128, Adjusted R-squared:  0.9119 
## F-statistic:  1061 on 3 and 304 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -1.7435 1.9497 1.6633 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

adf_south <- ur.df(south_series, type = "trend", selectlags = "AIC")
summary(adf_south)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8511.4  -173.4     8.1   122.8  4301.4 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 226.069153 186.310714   1.213    0.226    
## z.lag.1      -0.001317   0.001578  -0.835    0.404    
## tt            1.113322   0.974357   1.143    0.254    
## z.diff.lag    0.772630   0.036691  21.058   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 662.5 on 304 degrees of freedom
## Multiple R-squared:  0.6039, Adjusted R-squared:    0.6 
## F-statistic: 154.5 on 3 and 304 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -0.8349 3.8943 0.7453 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

adf_west <- ur.df(west_series, type = "trend", selectlags = "AIC")
summary(adf_west)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12094.5   -351.2     28.5    222.0   7584.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 313.750485 277.781190   1.129    0.260    
## z.lag.1      -0.001707   0.001683  -1.014    0.311    
## tt            2.801416   1.977836   1.416    0.158    
## z.diff.lag    0.779656   0.036067  21.617   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1451 on 304 degrees of freedom
## Multiple R-squared:  0.6197, Adjusted R-squared:  0.616 
## F-statistic: 165.1 on 3 and 304 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -1.0144 3.069 1.1174 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

# All regions are non-stationary, so we difference each
midwest_series    <- regional_df$Midwest
northeast_series  <- regional_df$Northeast
south_series      <- regional_df$South
west_series       <- regional_df$West

# Difference each series (monthly delta)
midwest_diff   <- diff(midwest_series)
northeast_diff <- diff(northeast_series)
south_diff     <- diff(south_series)
west_diff      <- diff(west_series)

# Confirm Differencing Worked
adf_midwest_diff <- ur.df(midwest_diff, type = "trend", selectlags = "AIC")
summary(adf_midwest_diff)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1762.3  -108.4    -2.0    94.5  3261.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.96934   42.14805   0.165   0.8688    
## z.lag.1     -0.15274    0.03107  -4.916 1.45e-06 ***
## tt           0.49393    0.25625   1.928   0.0548 .  
## z.diff.lag   0.04246    0.05740   0.740   0.4600    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 366.5 on 303 degrees of freedom
## Multiple R-squared:  0.07494,    Adjusted R-squared:  0.06578 
## F-statistic: 8.182 on 3 and 303 DF,  p-value: 2.969e-05
## 
## 
## Value of test-statistic is: -4.9161 8.0631 12.0843 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

adf_northeast_diff <- ur.df(northeast_diff, type = "trend", selectlags = "AIC")
summary(adf_northeast_diff)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1344.04  -173.64    -3.24   113.40  1480.72 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.77070   36.26521   1.014    0.311    
## z.lag.1     -0.08262    0.01189  -6.949 2.26e-11 ***
## tt           0.33344    0.20601   1.619    0.107    
## z.diff.lag   0.74154    0.03868  19.170  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 311.9 on 303 degrees of freedom
## Multiple R-squared:  0.5588, Adjusted R-squared:  0.5544 
## F-statistic: 127.9 on 3 and 303 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -6.9489 16.1009 24.1465 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

adf_south_diff <- ur.df(south_diff, type = "trend", selectlags = "AIC")
summary(adf_south_diff)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6930.4  -128.0    21.1   126.4  4423.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 58.79199   71.02938   0.828  0.40848    
## z.lag.1     -0.13697    0.03583  -3.822  0.00016 ***
## tt           0.17359    0.39884   0.435  0.66371    
## z.diff.lag  -0.39865    0.05278  -7.553 5.05e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 609.4 on 303 degrees of freedom
## Multiple R-squared:  0.2546, Adjusted R-squared:  0.2472 
## F-statistic: 34.49 on 3 and 303 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -3.8224 4.891 7.3363 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

adf_west_diff <- ur.df(west_diff, type = "trend", selectlags = "AIC")
summary(adf_west_diff)

## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10585.3   -274.8     23.2    251.4   7301.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  68.36159  158.13705   0.432    0.666    
## z.lag.1      -0.14709    0.03611  -4.073 5.93e-05 ***
## tt            0.63362    0.90165   0.703    0.483    
## z.diff.lag   -0.33671    0.05414  -6.219 1.66e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1371 on 303 degrees of freedom
## Multiple R-squared:  0.2116, Adjusted R-squared:  0.2038 
## F-statistic: 27.11 on 3 and 303 DF,  p-value: 1.472e-15
## 
## 
## Value of test-statistic is: -4.0729 5.5348 8.3019 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

### Confirm if increase in Zestimates Statistically Significant Pre- and Post-COVID Periods

break_date <- as.Date("2020-03-01")

date_diff <- regional_df$date[-1]   # drop first date to match diff length

# Midwest
mid_pre  <- midwest_diff[ date_diff <  break_date ]
mid_post <- midwest_diff[ date_diff >= break_date ]

# Northeast
ne_pre  <- northeast_diff[ date_diff <  break_date ]
ne_post <- northeast_diff[ date_diff >= break_date ]

# South
so_pre  <- south_diff[ date_diff <  break_date ]
so_post <- south_diff[ date_diff >= break_date ]

# West
we_pre  <- west_diff[ date_diff <  break_date ]
we_post <- west_diff[ date_diff >= break_date ]

# T-tests To Compare Pre- and Post-COVID
t.test(mid_pre, mid_post)

## 
##  Welch Two Sample t-test
## 
## data:  mid_pre and mid_post
## t = -6.473, df = 75.06, p-value = 8.846e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1135.829  -601.244
## sample estimates:
## mean of x mean of y 
##  337.5636 1206.1003

t.test(ne_pre, ne_post)

## 
##  Welch Two Sample t-test
## 
## data:  ne_pre and ne_post
## t = -6.587, df = 77.667, p-value = 4.829e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2257.839 -1209.734
## sample estimates:
## mean of x mean of y 
##   667.616  2401.402

t.test(so_pre, so_post)

## 
##  Welch Two Sample t-test
## 
## data:  so_pre and so_post
## t = -3.7076, df = 77.825, p-value = 0.0003901
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1092.4171  -329.0953
## sample estimates:
## mean of x mean of y 
##  470.8712 1181.6274

t.test(we_pre, we_post)

## 
##  Welch Two Sample t-test
## 
## data:  we_pre and we_post
## t = -2.9748, df = 75.55, p-value = 0.003934
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2256.7037  -446.6315
## sample estimates:
## mean of x mean of y 
##  818.7777 2170.4453

# Results confirm at the mean monthly growth rate in home values increased significantly after the onset of COVID-19.

### Plot Differenced Data

# Create Wide Dataframe
differenced_data_df <- data.frame(date_diff, midwest_diff, northeast_diff, south_diff, west_diff)

# Create Long Dataframe
differenced_long <- differenced_data_df |>
  pivot_longer(
    cols = -date_diff,
    names_to = "regions",
    values_to = "diff_zhvi"
  )

# Time-Series Plot
ggplot(data = differenced_long) +
  geom_line(mapping = aes(x = date_diff, y = diff_zhvi, color = regions)) +
  scale_x_date(name = "Date") +
  scale_y_continuous(name = "Monthly ZHVI Change ($)", labels = scales::comma) +
  scale_color_discrete(name = "Regions", guide = guide_legend(position = "bottom")) +
  ggtitle("Differenced ZHVI Across U.S. Regions, 2000 - 2025",
          subtitle = "Source: Zillow Home Value Index (ZHVI) – Kaggle (https://www.kaggle.com/datasets/robikscube/zillow-home-value-index)") +
  annotate("rect", xmin = as.Date("2020-03-01"), xmax = Inf, ymin = -Inf, ymax = Inf, alpha = 0.1, color = "grey76") + # Comment This Line To Make Shaded Area Disappear
  theme_bw()

## Warning in scale_x_date(name = "Date"): A <numeric> value was passed to a Date scale.
## ℹ The value was converted to a <Date> object.

# Boxplots
differenced_long$COVID_19 <- ifelse(differenced_long$date_diff < as.Date("2020-03-01"), "Pre-COVID", "Post-COVID")

# Convert To Ordered Factor
differenced_long$COVID_19 <- factor(
  differenced_long$COVID_19,
  levels = c("Pre-COVID", "Post-COVID")
)

# Midwest
differenced_long |>
  filter(regions == "midwest_diff") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "Midwest: Change in Monthly Home Values",
    subtitle = "Differenced ZHVI (Pre vs Post COVID)",
    x = "Period",
    y = "Monthly ZHVI Change"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Northeast
differenced_long |>
  filter(regions == "northeast_diff") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "Northeast: Change in Monthly Home Values",
    subtitle = "Differenced ZHVI (Pre vs Post COVID)",
    x = "Period",
    y = "Monthly ZHVI Change"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# South
differenced_long |>
  filter(regions == "south_diff") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "South: Change in Monthly Home Values",
    subtitle = "Differenced ZHVI (Pre vs Post COVID)",
    x = "Period",
    y = "Monthly ZHVI Change"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# West
differenced_long |>
  filter(regions == "west_diff") |>
  ggplot() +
  geom_boxplot(mapping = aes(x = COVID_19, y = diff_zhvi, fill = COVID_19)) +
  scale_y_continuous(labels = scales::comma) +
  scale_fill_manual(values = c("Pre-COVID" = "steelblue", "Post-COVID" = "violetred2")) +
  labs(
    title = "West: Change in Monthly Home Values",
    subtitle = "Differenced ZHVI (Pre vs Post COVID)",
    x = "Period",
    y = "Monthly ZHVI Change"
  ) +
  theme_bw() +
  theme(legend.position = "none")

# Statistical Summary
pre_covid_diff <- differenced_data_df |> filter(date_diff < as.Date("2020-03-01"))
post_covid_diff <- differenced_data_df |> filter(date_diff >= as.Date("2020-03-01"))

# Pre-COVID
summary(pre_covid_diff[, 2:5])

##   midwest_diff     northeast_diff       south_diff        west_diff      
##  Min.   :-1113.0   Min.   :-1937.08   Min.   :-5733.4   Min.   :-6987.5  
##  1st Qu.:  132.7   1st Qu.:  -51.33   1st Qu.:  194.9   1st Qu.:  159.6  
##  Median :  506.5   Median :  827.98   Median :  617.0   Median : 1117.5  
##  Mean   :  337.6   Mean   :  667.62   Mean   :  470.9   Mean   :  818.8  
##  3rd Qu.:  661.3   3rd Qu.: 1405.68   3rd Qu.:  867.4   3rd Qu.: 1764.7  
##  Max.   : 2335.6   Max.   : 2681.90   Max.   : 3462.4   Max.   :10441.7

sd(pre_covid_diff$midwest_diff)

## [1] 491.4823

sd(pre_covid_diff$northeast_diff)

## [1] 1096.314

sd(pre_covid_diff$south_diff)

## [1] 803.7902

sd(pre_covid_diff$west_diff)

## [1] 1710.409

# Post-COVID
summary(post_covid_diff[, 2:5])

##   midwest_diff    northeast_diff      south_diff        west_diff      
##  Min.   :-850.3   Min.   :-1856.3   Min.   :-1009.7   Min.   :-4977.0  
##  1st Qu.: 406.3   1st Qu.:  820.4   1st Qu.:   51.7   1st Qu.:  245.9  
##  Median :1029.9   Median : 1851.2   Median :  792.8   Median : 1380.3  
##  Mean   :1206.1   Mean   : 2401.4   Mean   : 1181.6   Mean   : 2170.4  
##  3rd Qu.:1983.6   3rd Qu.: 3811.9   3rd Qu.: 2490.7   3rd Qu.: 4945.2  
##  Max.   :3810.2   Max.   : 7175.5   Max.   : 4736.1   Max.   :10197.3

sd(post_covid_diff$midwest_diff)

## [1] 1075.219

sd(post_covid_diff$northeast_diff)

## [1] 2090.93

sd(post_covid_diff$south_diff)

## [1] 1522.056

sd(post_covid_diff$west_diff)

## [1] 3634.978

###### Question 2 ######

# National Average ZHVI For Each Month
national_df <- ZHVI |>
  mutate(national_zhvi = rowMeans(across(-date), na.rm = TRUE)) |>
  dplyr::select(date, national_zhvi)

# Difference For Stationarity
national_df <- national_df |>
  mutate(d_zhvi = national_zhvi - lag(national_zhvi)) |>
  filter(!is.na(d_zhvi))

# Remove Last Month So All Series Have Same Time Period
national_df <- national_df[-309,]
           
# Pull FRED Data
getSymbols(c("MORTGAGE30US", "UNRATE", "CPIAUCSL", "FEDFUNDS"), src = "FRED")

## [1] "MORTGAGE30US" "UNRATE"       "CPIAUCSL"     "FEDFUNDS"

# Ensure ALl Series Have Same Start/End Date
common_dates <- seq(as.Date("2000-02-01"), as.Date("2025-09-01"), by = "month")

# Common Date For Differenced Series
common_dates_differenced <- seq(as.Date("2000-01-01"), as.Date("2025-09-01"), by = "month")

MORT_monthly <- to.monthly(MORTGAGE30US, indexAt = "firstof", OHLC = FALSE)
MORT_monthly <- MORT_monthly[common_dates]
UNRATE_monthly <- to.monthly(UNRATE, indexAt = "firstof", OHLC = FALSE)
UNRATE_monthly <- UNRATE_monthly[common_dates]
CPI_monthly <- to.monthly(CPIAUCSL, indexAt = "firstof", OHLC = FALSE)
CPI_monthly <- CPI_monthly[common_dates_differenced]
FEDFUNDS_monthly <- to.monthly(FEDFUNDS, indexAt = "firstof", OHLC = FALSE)
FEDFUNDS_monthly <- FEDFUNDS_monthly[common_dates]

# Difference Inflation (Non-Stationary)
cpi_values <- as.numeric(CPI_monthly$CPIAUCSL)
inflation_diff <- diff(cpi_values)

# Create Dataframe For Regressors
regressors_df <- data.frame(
  date = common_dates,
  mortgage_rate = as.numeric(MORT_monthly$MORTGAGE30US),
  unemployment_rate = as.numeric(UNRATE_monthly$UNRATE),
  inflation_rate = inflation_diff,
  fed_funds_rate = as.numeric(FEDFUNDS_monthly$FEDFUNDS)
)

# Merge With national_df Dataset
df <- national_df |>
  inner_join(regressors_df, by = "date")

# Ensure No Null Values
colSums(is.na(df))

##              date     national_zhvi            d_zhvi     mortgage_rate 
##                 0                 0                 0                 0 
## unemployment_rate    inflation_rate    fed_funds_rate 
##                 0                 0                 0

# Create Regression
model <- lm(df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate + df$inflation_rate + df$fed_funds_rate, data = df)
summary_model <- summary(model)
summary_model

## 
## Call:
## lm(formula = df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate + 
##     df$inflation_rate + df$fed_funds_rate, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3365.3  -665.6  -132.2   618.6  4443.5 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3895.12     402.00   9.689  < 2e-16 ***
## df$mortgage_rate      -313.41      80.35  -3.900 0.000118 ***
## df$unemployment_rate  -275.92      42.52  -6.489 3.52e-10 ***
## df$inflation_rate      446.01      89.59   4.978 1.08e-06 ***
## df$fed_funds_rate      -61.97      64.01  -0.968 0.333758    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1107 on 303 degrees of freedom
## Multiple R-squared:  0.2904, Adjusted R-squared:  0.281 
## F-statistic: 30.99 on 4 and 303 DF,  p-value: < 2.2e-16

# Multicolinearlity?
vif(model)

##     df$mortgage_rate df$unemployment_rate    df$inflation_rate 
##             3.130566             1.720394             1.049248 
##    df$fed_funds_rate 
##             4.213044

# Correlation Matrix
numeric_vars <- df[, c("mortgage_rate", "unemployment_rate", "inflation_rate", "fed_funds_rate")]
cor_matrix <- cor(numeric_vars)
cor_matrix

##                   mortgage_rate unemployment_rate inflation_rate fed_funds_rate
## mortgage_rate       1.000000000        -0.3698296    0.007063664     0.81116953
## unemployment_rate  -0.369829597         1.0000000   -0.203309373    -0.60046869
## inflation_rate      0.007063664        -0.2033094    1.000000000     0.08289704
## fed_funds_rate      0.811169530        -0.6004687    0.082897035     1.00000000

# Scatterplot of Fitted Vs. Actual
ggplot(data = df, mapping = aes(x = model$fitted.values, y = d_zhvi)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  labs(
    title = "Scatterplot of Fitted and Actual Values of Differenced ZHVI",
    x = "Fitted Values",
    y = "Actual Values"
  ) + 
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

# Scatterplot of Residuals and Fitted Values
ggplot(data = df, mapping = aes(x = model$fitted.values, y = model$residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red4") +
  labs(
    title = "Scatterplot of Fitted and Residual Values of Differenced ZHVI",
    x = "Fitted Values",
    y = "Residual Values"
  ) +
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

# Q-Q Plot
ggplot(mapping = aes(sample = model$residuals)) +
  stat_qq() +
  stat_qq_line(color = "blue4") +
  theme_bw() +
  labs(
    title = "Q-Q Plot of Residuals",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  )

# Histogram Of Residuals
ggplot() +
  geom_histogram(aes(x = model$residuals), bins = 30, fill = "steelblue", color = "white") +
  theme_bw() +
  labs(
    title = "Histogram of Regression Residuals",
    x = "Residuals",
    y = "Count"
  )

### Model 2
getSymbols(c("PERMIT", "HOUST", "DSPIC96", "UMCSENT"), src = "FRED")

## [1] "PERMIT"  "HOUST"   "DSPIC96" "UMCSENT"

# Test stationarity

# # Building Permits
# permit_monthly <- to.monthly(PERMIT, indexAt = "firstof", OHLC = FALSE)
# permit_monthly <- permit_monthly[common_dates]
# permit_series <- as.numeric(permit_monthly$PERMIT)
# adf_permit <- ur.df(permit_series, type = "trend", selectlags = "AIC")
# summary(adf_permit)
# 
# # Housing Starts
# houst_monthly <- to.monthly(HOUST, indexAt = "firstof", OHLC = FALSE)
# houst_monthly <- houst_monthly[common_dates]
# houst_series <- as.numeric(houst_monthly$HOUST)
# adf_houst <- ur.df(houst_series, type = "trend", selectlags = "AIC")
# summary(adf_houst)
# 
# # Disposable Income
# income_monthly <- to.monthly(DSPIC96, indexAt = "firstof", OHLC = FALSE)
# income_monthly <- income_monthly[common_dates]
# income_series <- as.numeric(income_monthly$DSPIC96)
# adf_income <- ur.df(income_series, type = "trend", selectlags = "AIC")
# summary(adf_income)
# 
# # Consumer Sentiment
# sent_monthly <- to.monthly(UMCSENT, indexAt = "firstof", OHLC = FALSE)
# sent_monthly <- sent_monthly[common_dates]
# sent_series <- as.numeric(sent_monthly$UMCSENT)
# adf_sent <- ur.df(sent_series, type = "trend", selectlags = "AIC")
# summary(adf_sent)

# Ensure Same Timeframe
common_dates_diff_2 <- seq(as.Date("2000-01-01"), as.Date("2025-08-01"), by = "month")

PERMIT_monthly <- to.monthly(PERMIT, indexAt = "firstof", OHLC = FALSE)
PERMIT_monthly <- PERMIT_monthly[common_dates_diff_2]
HOUST_monthly <- to.monthly(HOUST, indexAt = "firstof", OHLC = FALSE)
HOUST_monthly <- HOUST_monthly[common_dates_diff_2]
DSPIC96_monthly <- to.monthly(DSPIC96, indexAt = "firstof", OHLC = FALSE)
DSPIC96_monthly <- DSPIC96_monthly[common_dates_diff_2]
UMCSENT_monthly <- to.monthly(UMCSENT, indexAt = "firstof", OHLC = FALSE)

## Warning in to.period(x, "months", indexAt = indexAt, name = name, ...): missing
## values removed from data

UMCSENT_monthly <- UMCSENT_monthly[common_dates_diff_2]

# Difference Series
permit_values <- as.numeric(PERMIT_monthly$PERMIT)
permit_diff <- diff(permit_values)
houst_values <- as.numeric(HOUST_monthly$HOUST)
houst_diff <- diff(houst_values)
dspic96_values <- as.numeric(DSPIC96_monthly$DSPIC96)
dspic96_diff <- diff(dspic96_values)
umcsent_values <- as.numeric(UMCSENT_monthly$UMCSENT)
umcsent_diff <- diff(umcsent_values)

common_dates2 <- seq(as.Date("2000-02-01"), as.Date("2025-08-01"), by = "month")

# Create Dataframe For Regressors
regressors_df_2 <- data.frame(
  date = common_dates2,
  building_permits = permit_diff,
  housing_starts = houst_diff,
  disposible_inc = dspic96_diff,
  consumer_sent = umcsent_diff
)

# Make Original Dataset 1 Unit Shorter To Match Dimensions
df <- df[-308, ]

# Join Datasets Again
df <- df |>
  inner_join(regressors_df_2, by = "date")

# Drop 2nd Column For Regression
df <- df[, -2]

# Best Subset Regression
n <- ncol(df) - 2
regfit.full <- regsubsets(df$d_zhvi ~ . - date, data = df, nvmax = n)
reg.summary <- summary(regfit.full)
reg.summary

## Subset selection object
## Call: regsubsets.formula(df$d_zhvi ~ . - date, data = df, nvmax = n)
## 8 Variables  (and intercept)
##                   Forced in Forced out
## mortgage_rate         FALSE      FALSE
## unemployment_rate     FALSE      FALSE
## inflation_rate        FALSE      FALSE
## fed_funds_rate        FALSE      FALSE
## building_permits      FALSE      FALSE
## housing_starts        FALSE      FALSE
## disposible_inc        FALSE      FALSE
## consumer_sent         FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          mortgage_rate unemployment_rate inflation_rate fed_funds_rate
## 1  ( 1 ) " "           " "               "*"            " "           
## 2  ( 1 ) "*"           "*"               " "            " "           
## 3  ( 1 ) "*"           "*"               "*"            " "           
## 4  ( 1 ) "*"           "*"               "*"            " "           
## 5  ( 1 ) "*"           "*"               "*"            "*"           
## 6  ( 1 ) "*"           "*"               "*"            "*"           
## 7  ( 1 ) "*"           "*"               "*"            "*"           
## 8  ( 1 ) "*"           "*"               "*"            "*"           
##          building_permits housing_starts disposible_inc consumer_sent
## 1  ( 1 ) " "              " "            " "            " "          
## 2  ( 1 ) " "              " "            " "            " "          
## 3  ( 1 ) " "              " "            " "            " "          
## 4  ( 1 ) " "              " "            " "            "*"          
## 5  ( 1 ) " "              " "            " "            "*"          
## 6  ( 1 ) " "              " "            "*"            "*"          
## 7  ( 1 ) "*"              " "            "*"            "*"          
## 8  ( 1 ) "*"              "*"            "*"            "*"

# Best Model By Adj. R^2
best_adj_index <- which.max(reg.summary$adjr2)
best_adj_index

## [1] 5

coef(regfit.full, best_adj_index)

##       (Intercept)     mortgage_rate unemployment_rate    inflation_rate 
##        3913.62903        -311.67494        -279.00114         437.02777 
##    fed_funds_rate     consumer_sent 
##         -66.40291         -26.32595

# Regression
best_subset_model <- lm(df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate + df$inflation_rate + df$fed_funds_rate + df$consumer_sent, data = df)
summary(best_subset_model)

## 
## Call:
## lm(formula = df$d_zhvi ~ df$mortgage_rate + df$unemployment_rate + 
##     df$inflation_rate + df$fed_funds_rate + df$consumer_sent, 
##     data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3351.5  -711.7  -128.6   638.0  4591.7 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           3913.63     401.22   9.754  < 2e-16 ***
## df$mortgage_rate      -311.67      80.18  -3.887 0.000125 ***
## df$unemployment_rate  -279.00      42.47  -6.570 2.21e-10 ***
## df$inflation_rate      437.03      89.62   4.876 1.75e-06 ***
## df$fed_funds_rate      -66.40      63.98  -1.038 0.300184    
## df$consumer_sent       -26.33      14.63  -1.799 0.072974 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1105 on 301 degrees of freedom
## Multiple R-squared:  0.2981, Adjusted R-squared:  0.2865 
## F-statistic: 25.57 on 5 and 301 DF,  p-value: < 2.2e-16

# VIF
vif(best_subset_model)

##     df$mortgage_rate df$unemployment_rate    df$inflation_rate 
##             3.124940             1.721345             1.053137 
##    df$fed_funds_rate     df$consumer_sent 
##             4.212157             1.007587

# Correlation Matrix
numeric_vars <- df[, c("mortgage_rate", "unemployment_rate", "inflation_rate", "fed_funds_rate", "consumer_sent")]
cor_matrix <- cor(numeric_vars)
cor_matrix

##                   mortgage_rate unemployment_rate inflation_rate fed_funds_rate
## mortgage_rate       1.000000000      -0.368785374    0.005273014     0.81075384
## unemployment_rate  -0.368785374       1.000000000   -0.202145527    -0.59974475
## inflation_rate      0.005273014      -0.202145527    1.000000000     0.08061768
## fed_funds_rate      0.810753843      -0.599744750    0.080617681     1.00000000
## consumer_sent      -0.033783811       0.006568184   -0.061433720    -0.04884186
##                   consumer_sent
## mortgage_rate      -0.033783811
## unemployment_rate   0.006568184
## inflation_rate     -0.061433720
## fed_funds_rate     -0.048841864
## consumer_sent       1.000000000

# Scatterplot of Fitted Vs. Actual
ggplot(data = df, mapping = aes(x = best_subset_model$fitted.values, y = d_zhvi)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue4") +
  labs(
    title = "Scatterplot of Fitted and Actual Values of Differenced ZHVI",
    subtitle = "Subset Selection",
    x = "Fitted Values",
    y = "Actual Values"
  ) + 
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

# Scatterplot of Residuals and Fitted Values
ggplot(data = df, mapping = aes(x = best_subset_model$fitted.values, y = best_subset_model$residuals)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red4") +
  labs(
    title = "Scatterplot of Fitted and Residual Values of Differenced ZHVI",
    subtitle = "Subset Selection",
    x = "Fitted Values",
    y = "Residual Values"
  ) +
  theme_bw()

## `geom_smooth()` using formula = 'y ~ x'

# Q-Q Plot
ggplot(mapping = aes(sample = best_subset_model$residuals)) +
  stat_qq() +
  stat_qq_line(color = "blue4") +
  theme_bw() +
  labs(
    title = "Q-Q Plot of Residuals",
    subtitle = "Subset Selection",
    x = "Theoretical Quantiles",
    y = "Sample Quantiles"
  )

# Histogram Of Residuals
ggplot() +
  geom_histogram(aes(x = best_subset_model$residuals), bins = 30, fill = "steelblue", color = "white") +
  theme_bw() +
  labs(
    title = "Histogram of Regression Residuals",
    subtitle = "Subset Selection",
    x = "Residuals",
    y = "Count"
  )

BUDT730-Final-Project.R

keshavmittu

2025-11-30