metrofour <- read.csv("C:/Users/karol/Downloads/Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
str(metrofour[,c(1:11)])
## 'data.frame': 892 obs. of 11 variables:
## $ RegionID : int 102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
## $ SizeRank : int 0 1 2 3 4 5 6 7 8 9 ...
## $ RegionName : chr "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
## $ RegionType : chr "country" "msa" "msa" "msa" ...
## $ StateName : chr "" "NY" "CA" "IL" ...
## $ X2000.01.31: num 175564 267636 288298 215112 182888 ...
## $ X2000.02.29: num 176088 269292 289616 215684 182867 ...
## $ X2000.03.31: num 176634 270702 291170 216449 182957 ...
## $ X2000.04.30: num 177753 273614 294224 217892 183113 ...
## $ X2000.05.31: num 178880 276373 297293 219406 183300 ...
## $ X2000.06.30: num 180022 279412 300221 220844 183349 ...
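Before tidying the data, I load the packages used in the rest of the analysis. This is a minimal setup sketch, assuming the packages are installed; the fpp3 meta-package would attach most of them in a single call.
library(dplyr)    # filter(), mutate(), select()
library(tidyr)    # pivot_longer()
library(ggplot2)  # plotting
library(tsibble)  # tsibble objects, yearmonth(), difference()
library(feasts)   # gg_tsdisplay(), unitroot_* features
library(fable)    # ARIMA(), ETS() models and forecasting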
We have to make this dataset tidy. Tidy Data is a way of structuring data so that it can be easily understood by people and analyzed by machines.
First, I need to remove the X at the beginning of the date columns (X2000.01.31, X2000.02.29, …):
names(metrofour) <- sub("^X", "", names(metrofour))
str(metrofour[,c(1:11)])
## 'data.frame': 892 obs. of 11 variables:
## $ RegionID : int 102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
## $ SizeRank : int 0 1 2 3 4 5 6 7 8 9 ...
## $ RegionName: chr "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
## $ RegionType: chr "country" "msa" "msa" "msa" ...
## $ StateName : chr "" "NY" "CA" "IL" ...
## $ 2000.01.31: num 175564 267636 288298 215112 182888 ...
## $ 2000.02.29: num 176088 269292 289616 215684 182867 ...
## $ 2000.03.31: num 176634 270702 291170 216449 182957 ...
## $ 2000.04.30: num 177753 273614 294224 217892 183113 ...
## $ 2000.05.31: num 178880 276373 297293 219406 183300 ...
## $ 2000.06.30: num 180022 279412 300221 220844 183349 ...
house_price <- metrofour %>%
pivot_longer(-c(RegionID, SizeRank, RegionName, RegionType, StateName),
names_to = "Monthly",
values_to = "Price"
)
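After reshaping, each row holds one region-month observation. A quick way to inspect the long format (a sketch; output omitted):
glimpse(house_price)
head(house_price[, c("RegionName", "Monthly", "Price")])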
# Parse the date strings into proper Date objects, keeping the original column as character
house_clean <- house_price %>%
  mutate(Monthly_parsed = as.Date(Monthly, "%Y.%m.%d"))
house_clean[["Monthly"]] <- as.character(house_clean$Monthly)
house_price[["Monthly"]] <- as.character(house_price$Monthly)
summary(house_clean)
## RegionID SizeRank RegionName RegionType
## Min. :102001 Min. : 0.0 Length:244408 Length:244408
## 1st Qu.:394550 1st Qu.:223.8 Class :character Class :character
## Median :394802 Median :446.5 Mode :character Mode :character
## Mean :414787 Mean :451.4
## 3rd Qu.:395052 3rd Qu.:676.2
## Max. :845172 Max. :927.0
##
## StateName Monthly Price Monthly_parsed
## Length:244408 Length:244408 Min. : 54665 Min. :2000-01-31
## Class :character Class :character 1st Qu.: 153442 1st Qu.:2005-09-30
## Mode :character Mode :character Median : 197007 Median :2011-06-15
## Mean : 233007 Mean :2011-06-15
## 3rd Qu.: 262890 3rd Qu.:2017-02-28
## Max. :2139455 Max. :2022-10-31
## NA's :49994
We see some missing values in the Price variable, but before dealing with them, I will filter the data down to the city I am most interested in.
pdx_data <- house_clean %>%
  dplyr::filter(RegionName %in% c("Portland, OR")) %>%
  dplyr::filter(Monthly_parsed >= "2014-01-01")
summary(pdx_data)
## RegionID SizeRank RegionName RegionType
## Min. :394998 Min. :25 Length:106 Length:106
## 1st Qu.:394998 1st Qu.:25 Class :character Class :character
## Median :394998 Median :25 Mode :character Mode :character
## Mean :394998 Mean :25
## 3rd Qu.:394998 3rd Qu.:25
## Max. :394998 Max. :25
## StateName Monthly Price Monthly_parsed
## Length:106 Length:106 Min. :348564 Min. :2014-01-31
## Class :character Class :character 1st Qu.:414945 1st Qu.:2016-04-07
## Mode :character Mode :character Median :478985 Median :2018-06-15
## Mean :484321 Mean :2018-06-15
## 3rd Qu.:511982 3rd Qu.:2020-08-23
## Max. :702047 Max. :2022-10-31
After filtering, the data no longer contain any missing values in Price.
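A quick sanity check confirms this (a sketch; output omitted):
sum(is.na(pdx_data$Price))   # expected to be 0, consistent with the summary above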
A time series can be recorded as a tsibble object in R. tsibble objects extend tidy data frames (tibble objects) by introducing temporal structure; to create one, we need to declare a key and an index. In this case, Monthly_parsed, which contains the date, is the index, and RegionName is the key. The other columns can be treated as measured variables.
tsb_pdx <- pdx_data %>%
  select(RegionName, RegionID, Monthly_parsed, Price)
# Convert to a tsibble and switch the index from exact dates to year-months
tsb_pref_pdx <- tsb_pdx %>%
  as_tsibble(key = RegionName, index = Monthly_parsed) %>%
  index_by(year_month = ~ yearmonth(.))
tsibble_pdx <- tsb_pref_pdx %>%
  select(-RegionID) %>%
  as_tsibble(key = RegionName, index = year_month) %>%
  mutate(Prices = Price / 1000)   # work in thousands of dollars
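Before modelling, it is worth confirming that the monthly index has no implicit gaps (a sketch; has_gaps() comes from tsibble, output omitted):
tsibble_pdx %>% has_gaps()   # FALSE means no missing months in the index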
To visualize the data, I could use the autoplot() command, but I would rather build the graph with ggplot().
plot_pdx_house <- tsibble_pdx %>%
  ggplot(aes(x = year_month, y = Prices)) +
  geom_line(size = 1, color = "darkgreen") +
  labs(y = "Price in Thousands of Dollars",
       x = "",
       title = "Four Bedroom House Prices in Portland, OR, 2014-2022",
       caption = "Data: https://www.zillow.com/research/data") +
  scale_y_continuous(labels = scales::dollar_format()) +
  theme_minimal()
plot_pdx_house
The data are non-stationary; a clear trend-cycle component is visible in the graph above.
In our analysis, we use the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test (Kwiatkowski et al., 1992). In this test, the null hypothesis is that the data are stationary, and we look for evidence that the null hypothesis is false. Consequently, small p-values (e.g., less than 0.05) suggest that differencing is required. The test can be computed using the unitroot_kpss() function.
tsibble_pdx %>%
  features(Prices, unitroot_kpss)
## # A tibble: 1 × 3
## RegionName kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Portland, OR 1.98 0.01
The p-value is reported as 0.01 if it is less than 0.01, and as 0.1 if it is greater than 0.1. In this case, the test statistic (1.98) is bigger than the 1% critical value, so the p-value is less than 0.01, indicating that the null hypothesis is rejected. That is, the data are not stationary.
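As a sanity check, the same test applied to the first-differenced series should return a p-value above 0.05 (a sketch; difference() comes from tsibble, output omitted):
tsibble_pdx %>%
  mutate(diff_Prices = difference(Prices)) %>%
  features(diff_Prices, unitroot_kpss)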
tsibble_pdx %>%
  features(Prices, unitroot_ndiffs)
## # A tibble: 1 × 2
## RegionName ndiffs
## <chr> <int>
## 1 Portland, OR 1
As we saw from the KPSS tests above, one difference (d) is required to make the tsibble_pdx data stationary.
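We can also ask whether a seasonal difference is suggested (a sketch; unitroot_nsdiffs() is the seasonal counterpart in feasts, output omitted):
tsibble_pdx %>%
  features(Prices, unitroot_nsdiffs)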
tsibble_pdx %>%
  gg_tsdisplay(Prices, plot_type = 'partial') +
  labs(y = "Thousands of Dollars", x = "")
The ACF does not drop quickly to zero; moreover, the first value is large and positive (almost 1 in this case). These are signs of a non-stationary time series, so it should be differenced to obtain a stationary series.
The first PACF value r1 is also almost 1, while all other values ri, i > 1, are small. This is another sign of a non-stationary process that should be differenced to obtain a stationary series.
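The autocorrelations can also be computed and plotted directly (a sketch using ACF() and PACF() from feasts; output omitted):
tsibble_pdx %>% ACF(Prices, lag_max = 24) %>% autoplot()
tsibble_pdx %>% PACF(Prices, lag_max = 24) %>% autoplot()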
The data are clearly non-stationary, so we will first take a seasonal difference. The seasonally differenced data are shown below:
tsibble_pdx %>%
gg_tsdisplay(difference(Prices, 12),
plot_type='partial', lag=36) +
labs(title="Seasonally differenced", y="")
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
These are also clearly non-stationary, so we take a further first difference:
tsibble_pdx %>%
gg_tsdisplay(difference(Prices, 12) %>% difference(),
plot_type='partial', lag=36) +
labs(title = "Double differenced", y="")
## Warning: Removed 13 row(s) containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_point).
Our aim now is to find an appropriate ARIMA model based on the ACF and PACF shown in the Double Differenced graph.
all_fit <- tsibble_pdx %>%
  model(
    arima212012 = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2)),
    arima210011 = ARIMA(Prices ~ pdq(2,1,0) + PDQ(0,1,1)),
    stepwise = ARIMA(Prices),
    search = ARIMA(Prices, stepwise = FALSE))
all_fit %>% pivot_longer(!RegionName,
names_to = "Model name",
values_to = "Orders")
## # A mable: 4 x 3
## # Key: RegionName, Model name [4]
## RegionName `Model name` Orders
## <chr> <chr> <model>
## 1 Portland, OR arima212012 <ARIMA(2,1,2)(0,1,2)[12]>
## 2 Portland, OR arima210011 <ARIMA(2,1,0)(0,1,1)[12]>
## 3 Portland, OR stepwise <ARIMA(1,1,2)(0,0,1)[12] w/ drift>
## 4 Portland, OR search <ARIMA(1,1,2)(0,0,1)[12] w/ drift>
glance(all_fit) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 4 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 stepwise 0.891 -146. 304. 305. 320.
## 2 search 0.891 -146. 304. 305. 320.
## 3 arima212012 1.09 -149. 312. 314. 330.
## 4 arima210011 1.68 -162. 331. 332. 341.
According to the AICc values, the automatically selected ARIMA(1,1,2)(0,0,1)[12] with drift fits best overall; of the two candidate models that include a seasonal difference, the ARIMA(2,1,2)(0,1,2)[12] model has the smaller AICc, and it is the one we examine further below.
arima212012 <- tsibble_pdx %>%
  model(arima212012 = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2))) %>%
  report()
## Series: Prices
## Model: ARIMA(2,1,2)(0,1,2)[12]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1 sma2
## 0.7422 -0.0097 0.9808 0.9999 -1.0640 0.2030
## s.e. 0.1169 0.1191 0.0641 0.0769 0.2173 0.1673
##
## sigma^2 estimated as 1.092: log likelihood=-149.11
## AIC=312.21 AICc=313.53 BIC=329.94
all_fit %>% select(arima212012) %>%
gg_tsresiduals()
augment(all_fit) %>%
filter(.model=='arima212012') %>%
features(.innov, ljung_box, lag = 36, dof = 6)
## # A tibble: 1 × 4
## RegionName .model lb_stat lb_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 Portland, OR arima212012 36.5 0.193
The large p-value (0.193) means we cannot reject the null hypothesis that the residuals are indistinguishable from white noise, so the model passes this diagnostic check.
tsibble_pdx %>%
  model(ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2))) %>%
  forecast() %>%
  autoplot(tsibble_pdx) +
  labs(y = "Thousands of $US",
       x = "",
       title = "Forecast from the ARIMA(2,1,2)(0,1,2)[12] model\napplied to the Portland House Prices data")
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`
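To read the forecast values and intervals as a table rather than a plot, the forecast object can be summarised with hilo() (a sketch; the two-year horizon h = 24 is an assumption, output omitted):
tsibble_pdx %>%
  model(arima212012 = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2))) %>%
  forecast(h = 24) %>%
  hilo(level = 95) %>%
  head()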
fit_ets <- tsibble_pdx %>%
model(ETS(Prices))
report(fit_ets)
## Series: Prices
## Model: ETS(M,Ad,N)
## Smoothing parameters:
## alpha = 0.9998912
## beta = 0.9998339
## phi = 0.9632019
##
## Initial states:
## l[0] b[0]
## 346.6011 1.209576
##
## sigma^2: 0
##
## AIC AICc BIC
## 545.9062 546.7547 561.8868
The model selected is ETS(M,Ad,N): multiplicative errors, an additive damped trend, and no seasonal component.
components(fit_ets) %>%
autoplot() +
labs(title = "ETS(M,Ad,N) components")
## Warning: Removed 1 row(s) containing missing values (geom_path).
Because this model has multiplicative errors, the innovation residuals are not equivalent to the regular residuals.
fit_ets %>%
augment() %>%
select(.innov, .resid) %>%
pivot_longer(c(.innov, .resid)) %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = value`
fit_ets %>%
  gg_tsresiduals()
fit_ets %>%
forecast(h = 24) %>%
autoplot(tsibble_pdx)
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`
bind_rows(
arima212012 %>% accuracy(),
fit_ets %>% accuracy()) %>%
select(-ME, -MPE, -ACF1)
## # A tibble: 2 × 8
## RegionName .model .type RMSE MAE MAPE MASE RMSSE
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Portland, OR arima212012 Training 0.947 0.685 0.134 0.0168 0.0186
## 2 Portland, OR ETS(Prices) Training 1.53 1.00 0.188 0.0246 0.0300
In this case, the ARIMA model appears to be the more accurate of the two based on the training-set RMSE, MAPE, and MASE.
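Because only training-set accuracy is reported above, a more convincing comparison would hold out the final months as a test set and compare the out-of-sample forecasts (a minimal sketch; the split at October 2021 and the 12-month horizon are assumptions):
# Hold out the final 12 months as a test set
train <- tsibble_pdx %>% filter_index(. ~ "2021-10")
fits <- train %>%
  model(
    arima212012 = ARIMA(Prices ~ pdq(2,1,2) + PDQ(0,1,2)),
    ets = ETS(Prices)
  )
fits %>%
  forecast(h = 12) %>%
  accuracy(tsibble_pdx) %>%
  select(.model, .type, RMSE, MAE, MAPE, MASE)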