The Data

library(tidyverse)   # dplyr, tidyr, ggplot2
library(tsibble)     # temporal data frames
library(feasts)      # time-series graphics and features
library(fable)       # ARIMA and ETS models

metrofour <- read.csv("C:/Users/karol/Downloads/Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

str(metrofour[,c(1:11)])
## 'data.frame':    892 obs. of  11 variables:
##  $ RegionID   : int  102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
##  $ SizeRank   : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ RegionName : chr  "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
##  $ RegionType : chr  "country" "msa" "msa" "msa" ...
##  $ StateName  : chr  "" "NY" "CA" "IL" ...
##  $ X2000.01.31: num  175564 267636 288298 215112 182888 ...
##  $ X2000.02.29: num  176088 269292 289616 215684 182867 ...
##  $ X2000.03.31: num  176634 270702 291170 216449 182957 ...
##  $ X2000.04.30: num  177753 273614 294224 217892 183113 ...
##  $ X2000.05.31: num  178880 276373 297293 219406 183300 ...
##  $ X2000.06.30: num  180022 279412 300221 220844 183349 ...

We have to make this dataset tidy. Tidy data is a way of structuring data so that it can be easily understood by people and analyzed by machines. For this dataset, that means one row per region-month observation instead of one column per month; pivot_longer() below performs that reshaping.

First, I need to remove the X at the beginning of the date columns (X2000.01.31, X2000.02.29, …):

names(metrofour) <- sub("^X", "", names(metrofour))

str(metrofour[,c(1:11)])
## 'data.frame':    892 obs. of  11 variables:
##  $ RegionID  : int  102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
##  $ SizeRank  : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ RegionName: chr  "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
##  $ RegionType: chr  "country" "msa" "msa" "msa" ...
##  $ StateName : chr  "" "NY" "CA" "IL" ...
##  $ 2000.01.31: num  175564 267636 288298 215112 182888 ...
##  $ 2000.02.29: num  176088 269292 289616 215684 182867 ...
##  $ 2000.03.31: num  176634 270702 291170 216449 182957 ...
##  $ 2000.04.30: num  177753 273614 294224 217892 183113 ...
##  $ 2000.05.31: num  178880 276373 297293 219406 183300 ...
##  $ 2000.06.30: num  180022 279412 300221 220844 183349 ...
# Pivot the wide monthly columns into a long (tidy) format
house_price <- metrofour %>%
  pivot_longer(
    -c(RegionID, SizeRank, RegionName, RegionType, StateName),
    names_to = "Monthly",
    values_to = "Price"
  )
# Parse the Monthly strings (e.g., "2000.01.31") into proper Date values

house_clean <- house_price %>%
  mutate(Monthly_parsed = as.Date(Monthly, "%Y.%m.%d"))

# Keep the original Monthly column as plain character text
house_clean[["Monthly"]] <- as.character(house_clean$Monthly)
house_price[["Monthly"]] <- as.character(house_price$Monthly)
summary(house_clean)
##     RegionID         SizeRank      RegionName         RegionType       
##  Min.   :102001   Min.   :  0.0   Length:244408      Length:244408     
##  1st Qu.:394550   1st Qu.:223.8   Class :character   Class :character  
##  Median :394802   Median :446.5   Mode  :character   Mode  :character  
##  Mean   :414787   Mean   :451.4                                        
##  3rd Qu.:395052   3rd Qu.:676.2                                        
##  Max.   :845172   Max.   :927.0                                        
##                                                                        
##   StateName           Monthly              Price         Monthly_parsed      
##  Length:244408      Length:244408      Min.   :  54665   Min.   :2000-01-31  
##  Class :character   Class :character   1st Qu.: 153442   1st Qu.:2005-09-30  
##  Mode  :character   Mode  :character   Median : 197007   Median :2011-06-15  
##                                        Mean   : 233007   Mean   :2011-06-15  
##                                        3rd Qu.: 262890   3rd Qu.:2017-02-28  
##                                        Max.   :2139455   Max.   :2022-10-31  
##                                        NA's   :49994

We see some missing values in the Price variable, but before dealing with those, I will filter the data down to the city I am most interested in.

pdx_data <- house_clean %>%
  dplyr::filter(RegionName == "Portland, OR") %>%
  dplyr::filter(Monthly_parsed >= "2014-01-01")

summary(pdx_data)
##     RegionID         SizeRank   RegionName         RegionType       
##  Min.   :394998   Min.   :25   Length:106         Length:106        
##  1st Qu.:394998   1st Qu.:25   Class :character   Class :character  
##  Median :394998   Median :25   Mode  :character   Mode  :character  
##  Mean   :394998   Mean   :25                                        
##  3rd Qu.:394998   3rd Qu.:25                                        
##  Max.   :394998   Max.   :25                                        
##   StateName           Monthly              Price        Monthly_parsed      
##  Length:106         Length:106         Min.   :348564   Min.   :2014-01-31  
##  Class :character   Class :character   1st Qu.:414945   1st Qu.:2016-04-07  
##  Mode  :character   Mode  :character   Median :478985   Median :2018-06-15  
##                                        Mean   :484321   Mean   :2018-06-15  
##                                        3rd Qu.:511982   3rd Qu.:2020-08-23  
##                                        Max.   :702047   Max.   :2022-10-31

After filtering, the Price variable has no missing values.
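
A quick check makes this explicit (a one-line sketch that simply counts NA values in the filtered series):

sum(is.na(pdx_data$Price))  # expected to return 0, consistent with the summary above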

Coerce to a tsibble with as_tsibble()

A time series can be stored as a tsibble object in R. tsibble objects extend tidy data frames (tibble objects) by introducing temporal structure; to create one, we need to declare a key and an index. In this case, Monthly_parsed, which contains the dates, is the index, and RegionName is the key. The other columns are treated as measured variables.

tsb_pdx <- pdx_data %>%
  select(RegionName, RegionID, Monthly_parsed, Price)

tsb_pref_pdx <- tsb_pdx %>%
  as_tsibble(key = RegionName, index = Monthly_parsed) %>%
  index_by(year_month = ~ yearmonth(.))

tsibble_pdx <- tsb_pref_pdx %>%
  select(-RegionID) %>%
  as_tsibble(key = RegionName, index = year_month) %>%
  mutate(Prices = Price / 1000)  # work in thousands of dollars

Data Visualization

To visualize the data, I could use the autoplot() command, but I would rather create the graph with ggplot2.
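
For reference, the autoplot() version is a one-liner (a minimal sketch; autoplot() picks sensible defaults for a tsibble):

tsibble_pdx %>% autoplot(Prices)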

plot_pdx_house <- tsibble_pdx %>%
  ggplot(aes(x = year_month, y = Prices)) +
  geom_line(size = 1, color = "darkgreen") +
  labs(y = "Price in Thousands of Dollars",
       x = "",
       title = "Four Bedroom House Prices in Portland, OR, 2014-2022",
       caption = "data: https://www.zillow.com/research/data") +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_minimal()


plot_pdx_house 

The data are non-stationary; we can see a trend-cycle component in the graph above.

Determining Stationarity

In our analysis, we use the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test (Kwiatkowski et al., 1992). In this test, the null hypothesis is that the data are stationary, and we look for evidence that the null hypothesis is false. Consequently, small p-values (e.g., less than 0.05) suggest that differencing is required. The test can be computed using the unitroot_kpss() function.

tsibble_pdx %>%
  features(Prices, unitroot_kpss)
## # A tibble: 1 × 3
##   RegionName   kpss_stat kpss_pvalue
##   <chr>            <dbl>       <dbl>
## 1 Portland, OR      1.98        0.01

The p-value is reported as 0.01 if it is less than 0.01, and as 0.1 if it is greater than 0.1. In this case, the test statistic (1.98) is bigger than the 1% critical value, so the p-value is less than 0.01, indicating that the null hypothesis is rejected. That is, the data are not stationary.

tsibble_pdx %>%
  features(Prices, unitroot_ndiffs)
## # A tibble: 1 × 2
##   RegionName   ndiffs
##   <chr>         <int>
## 1 Portland, OR      1

Consistent with the KPSS test above, one first difference (d = 1) is required to make the tsibble_pdx data stationary.
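
Since a seasonal difference will also be taken below, it is worth checking how many seasonal differences an automated test suggests. A sketch using unitroot_nsdiffs from feasts (its estimate is based on seasonal strength and may not match a manual choice):

tsibble_pdx %>%
  features(Prices, unitroot_nsdiffs)  # estimated number of seasonal differences (D)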

Autocorrelation

tsibble_pdx %>%
  gg_tsdisplay(Prices, plot_type = 'partial') +
  labs(y = "Thousands of Dollars",
       x = "")

The ACF does not drop quickly to zero; moreover, the values are large and positive (the first is almost 1). These are signs of a non-stationary time series, so the series should be differenced to obtain a stationary one.

The first PACF value, r1, is also close to 1, while all later values are small. This, too, points to a non-stationary process that should be differenced.
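
This visual reading can be backed with numbers. A sketch using the feat_acf feature set from feasts, which reports the first autocorrelation coefficient along with related summaries:

tsibble_pdx %>%
  features(Prices, feat_acf)  # acf1 near 1 would confirm the impression of non-stationarity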

The data are clearly non-stationary, so we will first take a seasonal difference. The seasonally differenced data are shown below:

tsibble_pdx %>%
  gg_tsdisplay(difference(Prices, 12),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).

These are also clearly non-stationary, so we take a further first difference:

tsibble_pdx %>%
  gg_tsdisplay(difference(Prices, 12) %>% difference(),
               plot_type='partial', lag=36) +
  labs(title = "Double differenced", y="")
## Warning: Removed 13 row(s) containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_point).

Our aim now is to find an appropriate ARIMA model based on the ACF and PACF shown in the double-differenced graph.

Seasonal ARIMA Model

all_fit <- tsibble_pdx %>%
  model(
    arima212012 = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2)),
    arima210011 = ARIMA(Prices ~ pdq(2,1,0) + PDQ(0,1,1)),
    stepwise = ARIMA(Prices),
    search = ARIMA(Prices, stepwise = FALSE)
  )
all_fit %>% pivot_longer(!RegionName,
            names_to = "Model name", 
            values_to = "Orders")
## # A mable: 4 x 3
## # Key:     RegionName, Model name [4]
##   RegionName   `Model name`                             Orders
##   <chr>        <chr>                                   <model>
## 1 Portland, OR arima212012           <ARIMA(2,1,2)(0,1,2)[12]>
## 2 Portland, OR arima210011           <ARIMA(2,1,0)(0,1,1)[12]>
## 3 Portland, OR stepwise     <ARIMA(1,1,2)(0,0,1)[12] w/ drift>
## 4 Portland, OR search       <ARIMA(1,1,2)(0,0,1)[12] w/ drift>
glance(all_fit) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 4 × 6
##   .model      sigma2 log_lik   AIC  AICc   BIC
##   <chr>        <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 stepwise     0.891   -146.  304.  305.  320.
## 2 search       0.891   -146.  304.  305.  320.
## 3 arima212012  1.09    -149.  312.  314.  330.
## 4 arima210011  1.68    -162.  331.  332.  341.

By AICc alone, the automatically selected ARIMA(1,1,2)(0,0,1)[12] models (stepwise and search) score best. However, because the exploratory differencing above pointed to a seasonal difference, I proceed with the ARIMA(2,1,2)(0,1,2)[12] model, which has the smallest AICc among the candidates that include one.

arima212012 <- tsibble_pdx %>%
  model(arima212012 = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2))) %>%
  report()
## Series: Prices 
## Model: ARIMA(2,1,2)(0,1,2)[12] 
## 
## Coefficients:
##          ar1      ar2     ma1     ma2     sma1    sma2
##       0.7422  -0.0097  0.9808  0.9999  -1.0640  0.2030
## s.e.  0.1169   0.1191  0.0641  0.0769   0.2173  0.1673
## 
## sigma^2 estimated as 1.092:  log likelihood=-149.11
## AIC=312.21   AICc=313.53   BIC=329.94
all_fit %>% select(arima212012) %>%
  gg_tsresiduals()

augment(all_fit) %>%
  filter(.model=='arima212012') %>%
  features(.innov, ljung_box, lag = 36, dof = 6)
## # A tibble: 1 × 4
##   RegionName   .model      lb_stat lb_pvalue
##   <chr>        <chr>         <dbl>     <dbl>
## 1 Portland, OR arima212012    36.5     0.193

The Ljung-Box p-value (0.193) is well above 0.05, so we cannot reject the hypothesis that the residuals behave like white noise; the model leaves no significant autocorrelation behind.

tsibble_pdx %>%
  model(ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2))) %>%
  forecast() %>%
  autoplot(tsibble_pdx) +
  labs(y = "Thousands of $US",
       x = "",
       title = "Forecast from the ARIMA(2,1,2)(0,1,2)[12] model\napplied to the Portland House Prices data")
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`

ETS

fit_ets <- tsibble_pdx %>%
  model(ETS(Prices))
report(fit_ets)
## Series: Prices 
## Model: ETS(M,Ad,N) 
##   Smoothing parameters:
##     alpha = 0.9998912 
##     beta  = 0.9998339 
##     phi   = 0.9632019 
## 
##   Initial states:
##      l[0]     b[0]
##  346.6011 1.209576
## 
##   sigma^2:  0
## 
##      AIC     AICc      BIC 
## 545.9062 546.7547 561.8868

The model selected is ETS(M,Ad,N): multiplicative errors, an additive damped trend, and no seasonal component.

components(fit_ets) %>%
  autoplot() +
  labs(title = "ETS(M,Ad,N) components")
## Warning: Removed 1 row(s) containing missing values (geom_path).

Because this model has multiplicative errors, the innovation residuals are not equivalent to the regular residuals: the innovation residuals are relative errors, (y − ŷ)/ŷ, while the regular residuals are the raw differences, y − ŷ.

fit_ets %>%
    augment() %>%
    select(.innov, .resid) %>%
    pivot_longer(c(.innov, .resid)) %>%
    autoplot()
## Plot variable not specified, automatically selected `.vars = value`

fit_ets %>%
  gg_tsresiduals()

fit_ets %>%
  forecast(h = 24) %>%
  autoplot(tsibble_pdx)
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`

bind_rows(
    arima212012 %>% accuracy(),
    fit_ets %>% accuracy()) %>%
  select(-ME, -MPE, -ACF1)
## # A tibble: 2 × 8
##   RegionName   .model      .type     RMSE   MAE  MAPE   MASE  RMSSE
##   <chr>        <chr>       <chr>    <dbl> <dbl> <dbl>  <dbl>  <dbl>
## 1 Portland, OR arima212012 Training 0.947 0.685 0.134 0.0168 0.0186
## 2 Portland, OR ETS(Prices) Training 1.53  1.00  0.188 0.0246 0.0300

In this case, the ARIMA model appears to be the more accurate one based on the training-set RMSE, MAE, MAPE, and MASE. Note that accuracy() applied to a fitted model reports training (in-sample) errors, not test-set errors.
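
To compare genuine forecast accuracy, one could hold out the final year of data, refit both models on the shortened series, and score their forecasts against the held-out observations. A sketch (the train and fits names and the 12-month horizon are illustrative choices, not part of the original analysis):

train <- tsibble_pdx %>%
  filter_index(. ~ "2021 Oct")          # keep everything up to Oct 2021

fits <- train %>%
  model(
    arima = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2)),
    ets   = ETS(Prices)
  )

fits %>%
  forecast(h = 12) %>%                  # forecast the held-out 12 months
  accuracy(tsibble_pdx) %>%             # score against the full series
  select(.model, RMSE, MAE, MAPE, MASE)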