metrofour <- read.csv("C:/Users/karol/Downloads/Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")

str(metrofour[,c(1:11)])
## 'data.frame':    892 obs. of  11 variables:
##  $ RegionID   : int  102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
##  $ SizeRank   : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ RegionName : chr  "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
##  $ RegionType : chr  "country" "msa" "msa" "msa" ...
##  $ StateName  : chr  "" "NY" "CA" "IL" ...
##  $ X2000.01.31: num  175564 267636 288298 215112 182888 ...
##  $ X2000.02.29: num  176088 269292 289616 215684 182867 ...
##  $ X2000.03.31: num  176634 270702 291170 216449 182957 ...
##  $ X2000.04.30: num  177753 273614 294224 217892 183113 ...
##  $ X2000.05.31: num  178880 276373 297293 219406 183300 ...
##  $ X2000.06.30: num  180022 279412 300221 220844 183349 ...

We have to make this dataset tidy. Tidy Data is a way of structuring data so that it can be easily understood by people and analyzed by machines.

I need to remove the X at the beginning of the dates (X2000.01.31,X2000.02.29,…)

names(metrofour) <- sub("^X", "", names(metrofour))

str(metrofour[,c(1:11)])
## 'data.frame':    892 obs. of  11 variables:
##  $ RegionID  : int  102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
##  $ SizeRank  : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ RegionName: chr  "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
##  $ RegionType: chr  "country" "msa" "msa" "msa" ...
##  $ StateName : chr  "" "NY" "CA" "IL" ...
##  $ 2000.01.31: num  175564 267636 288298 215112 182888 ...
##  $ 2000.02.29: num  176088 269292 289616 215684 182867 ...
##  $ 2000.03.31: num  176634 270702 291170 216449 182957 ...
##  $ 2000.04.30: num  177753 273614 294224 217892 183113 ...
##  $ 2000.05.31: num  178880 276373 297293 219406 183300 ...
##  $ 2000.06.30: num  180022 279412 300221 220844 183349 ...
house_price <- metrofour %>% 
  pivot_longer(-c(RegionID, SizeRank, RegionName, RegionType, StateName),
    names_to = "Monthly",
    values_to = "Price"
  ) 
str(metrofour[,c(1:11)])
## 'data.frame':    892 obs. of  11 variables:
##  $ RegionID  : int  102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
##  $ SizeRank  : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ RegionName: chr  "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
##  $ RegionType: chr  "country" "msa" "msa" "msa" ...
##  $ StateName : chr  "" "NY" "CA" "IL" ...
##  $ 2000.01.31: num  175564 267636 288298 215112 182888 ...
##  $ 2000.02.29: num  176088 269292 289616 215684 182867 ...
##  $ 2000.03.31: num  176634 270702 291170 216449 182957 ...
##  $ 2000.04.30: num  177753 273614 294224 217892 183113 ...
##  $ 2000.05.31: num  178880 276373 297293 219406 183300 ...
##  $ 2000.06.30: num  180022 279412 300221 220844 183349 ...
#Converting the Date from factor to character

house_clean <- house_price %>%
            mutate(Monthly_parsed = as.Date(Monthly,"%Y.%m.%d"))


house_clean[["Monthly"]]<- as.character(house_clean$Monthly)

house_price[["Monthly"]]<- as.character(house_price $Monthly)
summary(house_clean)
##     RegionID         SizeRank      RegionName         RegionType       
##  Min.   :102001   Min.   :  0.0   Length:244408      Length:244408     
##  1st Qu.:394550   1st Qu.:223.8   Class :character   Class :character  
##  Median :394802   Median :446.5   Mode  :character   Mode  :character  
##  Mean   :414787   Mean   :451.4                                        
##  3rd Qu.:395052   3rd Qu.:676.2                                        
##  Max.   :845172   Max.   :927.0                                        
##                                                                        
##   StateName           Monthly              Price         Monthly_parsed      
##  Length:244408      Length:244408      Min.   :  54665   Min.   :2000-01-31  
##  Class :character   Class :character   1st Qu.: 153442   1st Qu.:2005-09-30  
##  Mode  :character   Mode  :character   Median : 197007   Median :2011-06-15  
##                                        Mean   : 233007   Mean   :2011-06-15  
##                                        3rd Qu.: 262890   3rd Qu.:2017-02-28  
##                                        Max.   :2139455   Max.   :2022-10-31  
##                                        NA's   :49994

We see some missing values in the Price variable, but before I deal with those values, I will filter my data to the cities that I am interested the most

pdx_data <- house_clean %>%
  dplyr:::filter(RegionName %in% c("Portland, OR"))  %>%
  dplyr:::filter(Monthly_parsed >= "2012-01-01")

summary(pdx_data)
##     RegionID         SizeRank   RegionName         RegionType       
##  Min.   :394998   Min.   :25   Length:130         Length:130        
##  1st Qu.:394998   1st Qu.:25   Class :character   Class :character  
##  Median :394998   Median :25   Mode  :character   Mode  :character  
##  Mean   :394998   Mean   :25                                        
##  3rd Qu.:394998   3rd Qu.:25                                        
##  Max.   :394998   Max.   :25                                        
##   StateName           Monthly              Price        Monthly_parsed      
##  Length:130         Length:130         Min.   :285351   Min.   :2012-01-31  
##  Class :character   Class :character   1st Qu.:361840   1st Qu.:2014-10-07  
##  Mode  :character   Mode  :character   Median :458053   Median :2017-06-15  
##                                        Mean   :452466   Mean   :2017-06-15  
##                                        3rd Qu.:501009   3rd Qu.:2020-02-21  
##                                        Max.   :702047   Max.   :2022-10-31

After filtering the data, we don’t have any missing values

Coerce to a tsibble with as_tsibble()- Three Cities

A time series can be recorded as a tsibble object in R. tsibble objects extend tidy data frames (tibble objects) by introducing temporal structure, and to do it, we need to declare key and index. In this case, the Monthly_parsed containing the data-time is the index and the RegionID is the key. Other columns can be considered as measured variables.

tsb_pdx <- pdx_data %>%
                   select(RegionName,RegionID, Monthly_parsed, Price)

tsb_pref_pdx <-tsb_pdx%>%
  as_tsibble(key= RegionName, index= Monthly_parsed)%>%
                   index_by(year_month = ~ yearmonth(.))

tsibble_pdx <-tsb_pref_pdx%>%
  select(-RegionID)%>%
  as_tsibble(key= RegionName, index= year_month)

To visualize the data, I could use the autoplot() command, but I rather to create my graph with ggplot.

plot_cities_house <- tsibble_pdx %>%
  ggplot(aes(x= year_month, y= Price/1000)) +
  geom_line(size=1, color= "darkgreen")+
   
    labs(y="Price in Thousands of Dollars ", 
       x= " ",
       title=" Four Bedroom House Prices in Portland, OR, 2012-2022 ",
       caption = "data:https://www.zillow.com/research/data")+
  scale_y_continuous(labels=scales::dollar_format())+
  theme_minimal()


plot_cities_house 

tsibble_pdx%>%
gg_season(Price/1000, labels = "both")+
  labs(x= "",
       y= "Price in Thousands of Dollars ", 
       title="Portland's Seasonal Plot")+
  
  scale_y_continuous(labels=scales::dollar_format())+
  theme_minimal()

tsibble_pdx %>%
gg_subseries(Price/1000)+
  labs(y= "Price in Thousands of Dollars",
       x= "Year")+theme_minimal()+
  scale_y_continuous(labels=scales::dollar_format())+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

dcmp <- tsibble_pdx %>%
  model(stl = STL(Price))
components(dcmp)
## # A dable: 130 x 8 [1M]
## # Key:     RegionName, .model [1]
## # :        Price = trend + season_year + remainder
##    RegionName   .model year_month  Price   trend season_year remainder season_…¹
##    <chr>        <chr>       <mth>  <dbl>   <dbl>       <dbl>     <dbl>     <dbl>
##  1 Portland, OR stl      2012 Jan 285373 281952.       162.     3259.    285211.
##  2 Portland, OR stl      2012 Feb 285351 284224.       642.      485.    284709.
##  3 Portland, OR stl      2012 Mar 287016 286497.       905.     -386.    286111.
##  4 Portland, OR stl      2012 Apr 289055 288769.       828.     -542.    288227.
##  5 Portland, OR stl      2012 May 291599 291106.       564.      -71.2   291035.
##  6 Portland, OR stl      2012 Jun 293292 293443.       255.     -406.    293037.
##  7 Portland, OR stl      2012 Jul 294902 295780.       -16.4    -862.    294918.
##  8 Portland, OR stl      2012 Aug 296946 298188.      -398.     -844.    297344.
##  9 Portland, OR stl      2012 Sep 298823 300596.      -702.    -1072.    299525.
## 10 Portland, OR stl      2012 Oct 300955 303004.      -946.    -1103.    301901.
## # … with 120 more rows, and abbreviated variable name ¹​season_adjust

The output above shows the components of an STL decomposition. The original data is shown (as Price), followed by the estimated components. This output forms a “dable” or decomposition table. The header to the table shows that the Price series has been decomposed additively.

components(dcmp) %>%
  as_tsibble() %>%
  autoplot(Price, colour="gray") +
  geom_line(aes(y=trend), colour = "#D55E00")+
  ggpubr::theme_pubclean()

We can plot all of the components in a single figure using autoplot()

components(dcmp) %>% 
  autoplot(colour = "darkgreen") + theme_minimal()+
  labs(x= "")+ theme(panel.spacing = unit(.8, "cm"))

The three components are shown separately in the bottom three panels. These components can be combined to reconstruct the data shown in the top panel.

tsibble_pdx %>%
  model(`Holt's method` = ETS(Price ~ error("A") +
                       trend("A") + season("N")),
    `Damped Holt's method` = ETS(Price ~ error("A") +
                       trend("Ad", phi = 0.9) + season("N"))
  ) %>%
  forecast(h = 15) %>%
  autoplot(tsibble_pdx, level = NULL) +
  labs(title = "Portland, OR, House Prices",
       y = " ") +
  guides(colour = guide_legend(title = "Forecast"))+
  theme_minimal()
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`

We have set the damping parameter to a relatively low number
(ϕ=0.90) to exaggerate the effect of damping for comparison. Usually, we would estimate ϕ along with the other parameters. We have also used a rather large forecast horizon (h=15) to highlight the difference between a damped trend and a linear trend.

We will use time series cross-validation to compare the one-step forecast accuracy of the three methods

tsibble_pdx %>%
  stretch_tsibble(.init = 10) %>%
  model(
    SES = ETS(Price ~ error("A") + trend("N") + season("N")),
    Holt = ETS(Price ~ error("A") + trend("A") + season("N")),
    Damped = ETS(Price ~ error("A") + trend("Ad") +
                   season("N"))
  ) %>%
  forecast(h = 1) %>%
  accuracy(tsibble_pdx)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 1 observation is missing at 2022 Nov
## # A tibble: 3 × 11
##   .model RegionName   .type      ME  RMSE   MAE    MPE  MAPE   MASE  RMSSE  ACF1
##   <chr>  <chr>        <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl>  <dbl>  <dbl> <dbl>
## 1 Damped Portland, OR Test    99.4  1528. 1002. 0.0325 0.203 0.0258 0.0320 0.571
## 2 Holt   Portland, OR Test     2.65 1574. 1001. 0.0108 0.204 0.0257 0.0329 0.552
## 3 SES    Portland, OR Test  3180.   4620. 3564. 0.678  0.738 0.0916 0.0967 0.891

Holt and Damped Holt’s method are best comparing RMSE and MAE values. Sometimes different accuracy measures will suggest different forecasting methods, and then a decision is required as to which forecasting method we prefer to use. So we will proceed with using the damped Holt’s method and apply it to the whole data set to get forecasts for future prices.

fit_damped <- tsibble_pdx %>%
  model(
    Damped = ETS(Price  ~ error("A") + trend("Ad") +
                   season("N"))
  )
# Estimated parameters:
tidy(fit_damped)
## # A tibble: 5 × 4
##   RegionName   .model term    estimate
##   <chr>        <chr>  <chr>      <dbl>
## 1 Portland, OR Damped alpha      1.00 
## 2 Portland, OR Damped beta       1.00 
## 3 Portland, OR Damped phi        0.945
## 4 Portland, OR Damped l[0]  282217.   
## 5 Portland, OR Damped b[0]    2931.

The value of α is very close to one, showing that the level reacts strongly to each new observation.

Comparing forecasting performance of non-seasonal methods.

fit_damped %>%
  forecast(h = 10) %>%
  autoplot(tsibble_pdx)+
  labs(title = "Portland, OR, House Prices",
       x = " ") +
  theme_minimal()
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`

The resulting forecasts look sensible with decreasing trend, which flattens out due to the value of the damping parameter (0.944), and relatively wide prediction intervals reflecting the variation in the historical data.

Arima Model

Unit root tests

tsibble_pdx %>%
  features(Price, unitroot_kpss)
## # A tibble: 1 × 3
##   RegionName   kpss_stat kpss_pvalue
##   <chr>            <dbl>       <dbl>
## 1 Portland, OR      2.50        0.01

The p-value is reported as 0.01 if it is less than 0.01, and as 0.1 if it is greater than 0.1. In this case, the test statistic (2.50) is bigger than the 1% critical value, so the p-value is less than 0.01, indicating that the null hypothesis is rejected. That is, the data are not stationary. We can difference the data, and apply the test again.

tsibble_pdx %>%
   features(Price, unitroot_ndiffs)
## # A tibble: 1 × 2
##   RegionName   ndiffs
##   <chr>         <int>
## 1 Portland, OR      1

As we saw from the KPSS tests above, one difference is required to make the Portland data stationary.

The following R code selects a non-seasonal ARIMA model automatically.

fit_arima <- tsibble_pdx %>%
  model(ARIMA(Price))
report(fit_arima)
## Series: Price 
## Model: ARIMA(2,1,0)(0,0,2)[12] w/ drift 
## 
## Coefficients:
##          ar1      ar2     sma1     sma2  constant
##       1.4839  -0.6566  -0.2862  -0.2478  505.1575
## s.e.  0.0677   0.0699   0.1192   0.1303   51.8211
## 
## sigma^2 estimated as 1156456:  log likelihood=-1084
## AIC=2180   AICc=2180.69   BIC=2197.16
fit_arima %>% forecast(h=10) %>%
  autoplot(tsibble_pdx)+
  theme_minimal()
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`

tsibble_pdx %>%
  gg_tsdisplay(difference(Price, 12) %>% difference(),
               plot_type='partial', lag=36)+
  labs(title = "Double differenced", y="")
## Warning: Removed 13 row(s) containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_point).

fit <- tsibble_pdx %>%
  model(
    arima012011 = ARIMA(Price ~ pdq(1,1,0) + PDQ(0,0,1)),
    arima210002 = ARIMA(Price ~ pdq(2,1,0) + PDQ(0,0,2)),
    auto = ARIMA(Price, stepwise = FALSE, approx = FALSE)
  )

glance(fit) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 3 × 6
##   .model        sigma2 log_lik   AIC  AICc   BIC
##   <chr>          <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 auto         953723.  -1071. 2157. 2159. 2180.
## 2 arima210002 1156456.  -1084. 2180. 2181. 2197.
## 3 arima012011 1951955.  -1117. 2243. 2243. 2254.