metrofour <- read.csv("C:/Users/karol/Downloads/Metro_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
str(metrofour[,c(1:11)])
## 'data.frame': 892 obs. of 11 variables:
## $ RegionID : int 102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
## $ SizeRank : int 0 1 2 3 4 5 6 7 8 9 ...
## $ RegionName : chr "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
## $ RegionType : chr "country" "msa" "msa" "msa" ...
## $ StateName : chr "" "NY" "CA" "IL" ...
## $ X2000.01.31: num 175564 267636 288298 215112 182888 ...
## $ X2000.02.29: num 176088 269292 289616 215684 182867 ...
## $ X2000.03.31: num 176634 270702 291170 216449 182957 ...
## $ X2000.04.30: num 177753 273614 294224 217892 183113 ...
## $ X2000.05.31: num 178880 276373 297293 219406 183300 ...
## $ X2000.06.30: num 180022 279412 300221 220844 183349 ...
We have to make this dataset tidy. Tidy Data is a way of structuring data so that it can be easily understood by people and analyzed by machines.
I need to remove the X at the beginning of the dates (X2000.01.31,X2000.02.29,…)
names(metrofour) <- sub("^X", "", names(metrofour))
str(metrofour[,c(1:11)])
## 'data.frame': 892 obs. of 11 variables:
## $ RegionID : int 102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
## $ SizeRank : int 0 1 2 3 4 5 6 7 8 9 ...
## $ RegionName: chr "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
## $ RegionType: chr "country" "msa" "msa" "msa" ...
## $ StateName : chr "" "NY" "CA" "IL" ...
## $ 2000.01.31: num 175564 267636 288298 215112 182888 ...
## $ 2000.02.29: num 176088 269292 289616 215684 182867 ...
## $ 2000.03.31: num 176634 270702 291170 216449 182957 ...
## $ 2000.04.30: num 177753 273614 294224 217892 183113 ...
## $ 2000.05.31: num 178880 276373 297293 219406 183300 ...
## $ 2000.06.30: num 180022 279412 300221 220844 183349 ...
house_price <- metrofour %>%
pivot_longer(-c(RegionID, SizeRank, RegionName, RegionType, StateName),
names_to = "Monthly",
values_to = "Price"
)
str(metrofour[,c(1:11)])
## 'data.frame': 892 obs. of 11 variables:
## $ RegionID : int 102001 394913 753899 394463 394514 394692 395209 394856 394974 394347 ...
## $ SizeRank : int 0 1 2 3 4 5 6 7 8 9 ...
## $ RegionName: chr "United States" "New York, NY" "Los Angeles, CA" "Chicago, IL" ...
## $ RegionType: chr "country" "msa" "msa" "msa" ...
## $ StateName : chr "" "NY" "CA" "IL" ...
## $ 2000.01.31: num 175564 267636 288298 215112 182888 ...
## $ 2000.02.29: num 176088 269292 289616 215684 182867 ...
## $ 2000.03.31: num 176634 270702 291170 216449 182957 ...
## $ 2000.04.30: num 177753 273614 294224 217892 183113 ...
## $ 2000.05.31: num 178880 276373 297293 219406 183300 ...
## $ 2000.06.30: num 180022 279412 300221 220844 183349 ...
#Converting the Date from factor to character
house_clean <- house_price %>%
mutate(Monthly_parsed = as.Date(Monthly,"%Y.%m.%d"))
house_clean[["Monthly"]]<- as.character(house_clean$Monthly)
house_price[["Monthly"]]<- as.character(house_price $Monthly)
summary(house_clean)
## RegionID SizeRank RegionName RegionType
## Min. :102001 Min. : 0.0 Length:244408 Length:244408
## 1st Qu.:394550 1st Qu.:223.8 Class :character Class :character
## Median :394802 Median :446.5 Mode :character Mode :character
## Mean :414787 Mean :451.4
## 3rd Qu.:395052 3rd Qu.:676.2
## Max. :845172 Max. :927.0
##
## StateName Monthly Price Monthly_parsed
## Length:244408 Length:244408 Min. : 54665 Min. :2000-01-31
## Class :character Class :character 1st Qu.: 153442 1st Qu.:2005-09-30
## Mode :character Mode :character Median : 197007 Median :2011-06-15
## Mean : 233007 Mean :2011-06-15
## 3rd Qu.: 262890 3rd Qu.:2017-02-28
## Max. :2139455 Max. :2022-10-31
## NA's :49994
We see some missing values in the Price variable, but before I deal with those values, I will filter my data to the cities that I am interested the most
pdx_data <- house_clean %>%
dplyr:::filter(RegionName %in% c("Portland, OR")) %>%
dplyr:::filter(Monthly_parsed >= "2012-01-01")
summary(pdx_data)
## RegionID SizeRank RegionName RegionType
## Min. :394998 Min. :25 Length:130 Length:130
## 1st Qu.:394998 1st Qu.:25 Class :character Class :character
## Median :394998 Median :25 Mode :character Mode :character
## Mean :394998 Mean :25
## 3rd Qu.:394998 3rd Qu.:25
## Max. :394998 Max. :25
## StateName Monthly Price Monthly_parsed
## Length:130 Length:130 Min. :285351 Min. :2012-01-31
## Class :character Class :character 1st Qu.:361840 1st Qu.:2014-10-07
## Mode :character Mode :character Median :458053 Median :2017-06-15
## Mean :452466 Mean :2017-06-15
## 3rd Qu.:501009 3rd Qu.:2020-02-21
## Max. :702047 Max. :2022-10-31
After filtering the data, we don’t have any missing values
A time series can be recorded as a tsibble object in R. tsibble objects extend tidy data frames (tibble objects) by introducing temporal structure, and to do it, we need to declare key and index. In this case, the Monthly_parsed containing the data-time is the index and the RegionID is the key. Other columns can be considered as measured variables.
tsb_pdx <- pdx_data %>%
select(RegionName,RegionID, Monthly_parsed, Price)
tsb_pref_pdx <-tsb_pdx%>%
as_tsibble(key= RegionName, index= Monthly_parsed)%>%
index_by(year_month = ~ yearmonth(.))
tsibble_pdx <-tsb_pref_pdx%>%
select(-RegionID)%>%
as_tsibble(key= RegionName, index= year_month)
To visualize the data, I could use the autoplot() command, but I rather to create my graph with ggplot.
plot_cities_house <- tsibble_pdx %>%
ggplot(aes(x= year_month, y= Price/1000)) +
geom_line(size=1, color= "darkgreen")+
labs(y="Price in Thousands of Dollars ",
x= " ",
title=" Four Bedroom House Prices in Portland, OR, 2012-2022 ",
caption = "data:https://www.zillow.com/research/data")+
scale_y_continuous(labels=scales::dollar_format())+
theme_minimal()
plot_cities_house
tsibble_pdx%>%
gg_season(Price/1000, labels = "both")+
labs(x= "",
y= "Price in Thousands of Dollars ",
title="Portland's Seasonal Plot")+
scale_y_continuous(labels=scales::dollar_format())+
theme_minimal()
tsibble_pdx %>%
gg_subseries(Price/1000)+
labs(y= "Price in Thousands of Dollars",
x= "Year")+theme_minimal()+
scale_y_continuous(labels=scales::dollar_format())+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
dcmp <- tsibble_pdx %>%
model(stl = STL(Price))
components(dcmp)
## # A dable: 130 x 8 [1M]
## # Key: RegionName, .model [1]
## # : Price = trend + season_year + remainder
## RegionName .model year_month Price trend season_year remainder season_…¹
## <chr> <chr> <mth> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Portland, OR stl 2012 Jan 285373 281952. 162. 3259. 285211.
## 2 Portland, OR stl 2012 Feb 285351 284224. 642. 485. 284709.
## 3 Portland, OR stl 2012 Mar 287016 286497. 905. -386. 286111.
## 4 Portland, OR stl 2012 Apr 289055 288769. 828. -542. 288227.
## 5 Portland, OR stl 2012 May 291599 291106. 564. -71.2 291035.
## 6 Portland, OR stl 2012 Jun 293292 293443. 255. -406. 293037.
## 7 Portland, OR stl 2012 Jul 294902 295780. -16.4 -862. 294918.
## 8 Portland, OR stl 2012 Aug 296946 298188. -398. -844. 297344.
## 9 Portland, OR stl 2012 Sep 298823 300596. -702. -1072. 299525.
## 10 Portland, OR stl 2012 Oct 300955 303004. -946. -1103. 301901.
## # … with 120 more rows, and abbreviated variable name ¹season_adjust
The output above shows the components of an STL decomposition. The original data is shown (as Price), followed by the estimated components. This output forms a “dable” or decomposition table. The header to the table shows that the Price series has been decomposed additively.
components(dcmp) %>%
as_tsibble() %>%
autoplot(Price, colour="gray") +
geom_line(aes(y=trend), colour = "#D55E00")+
ggpubr::theme_pubclean()
We can plot all of the components in a single figure using autoplot()
components(dcmp) %>%
autoplot(colour = "darkgreen") + theme_minimal()+
labs(x= "")+ theme(panel.spacing = unit(.8, "cm"))
The three components are shown separately in the bottom three panels. These components can be combined to reconstruct the data shown in the top panel.
tsibble_pdx %>%
model(`Holt's method` = ETS(Price ~ error("A") +
trend("A") + season("N")),
`Damped Holt's method` = ETS(Price ~ error("A") +
trend("Ad", phi = 0.9) + season("N"))
) %>%
forecast(h = 15) %>%
autoplot(tsibble_pdx, level = NULL) +
labs(title = "Portland, OR, House Prices",
y = " ") +
guides(colour = guide_legend(title = "Forecast"))+
theme_minimal()
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`
We have set the damping parameter to a relatively low number
(ϕ=0.90) to exaggerate the effect of damping for comparison. Usually, we
would estimate ϕ along with the other parameters. We have also used a
rather large forecast horizon (h=15) to highlight the difference between
a damped trend and a linear trend.
We will use time series cross-validation to compare the one-step forecast accuracy of the three methods
tsibble_pdx %>%
stretch_tsibble(.init = 10) %>%
model(
SES = ETS(Price ~ error("A") + trend("N") + season("N")),
Holt = ETS(Price ~ error("A") + trend("A") + season("N")),
Damped = ETS(Price ~ error("A") + trend("Ad") +
season("N"))
) %>%
forecast(h = 1) %>%
accuracy(tsibble_pdx)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 1 observation is missing at 2022 Nov
## # A tibble: 3 × 11
## .model RegionName .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Damped Portland, OR Test 99.4 1528. 1002. 0.0325 0.203 0.0258 0.0320 0.571
## 2 Holt Portland, OR Test 2.65 1574. 1001. 0.0108 0.204 0.0257 0.0329 0.552
## 3 SES Portland, OR Test 3180. 4620. 3564. 0.678 0.738 0.0916 0.0967 0.891
Holt and Damped Holt’s method are best comparing RMSE and MAE values. Sometimes different accuracy measures will suggest different forecasting methods, and then a decision is required as to which forecasting method we prefer to use. So we will proceed with using the damped Holt’s method and apply it to the whole data set to get forecasts for future prices.
fit_damped <- tsibble_pdx %>%
model(
Damped = ETS(Price ~ error("A") + trend("Ad") +
season("N"))
)
# Estimated parameters:
tidy(fit_damped)
## # A tibble: 5 × 4
## RegionName .model term estimate
## <chr> <chr> <chr> <dbl>
## 1 Portland, OR Damped alpha 1.00
## 2 Portland, OR Damped beta 1.00
## 3 Portland, OR Damped phi 0.945
## 4 Portland, OR Damped l[0] 282217.
## 5 Portland, OR Damped b[0] 2931.
The value of α is very close to one, showing that the level reacts strongly to each new observation.
fit_damped %>%
forecast(h = 10) %>%
autoplot(tsibble_pdx)+
labs(title = "Portland, OR, House Prices",
x = " ") +
theme_minimal()
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`
The resulting forecasts look sensible with decreasing trend, which
flattens out due to the value of the damping parameter (0.944), and
relatively wide prediction intervals reflecting the variation in the
historical data.
tsibble_pdx %>%
features(Price, unitroot_kpss)
## # A tibble: 1 × 3
## RegionName kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Portland, OR 2.50 0.01
The p-value is reported as 0.01 if it is less than 0.01, and as 0.1 if it is greater than 0.1. In this case, the test statistic (2.50) is bigger than the 1% critical value, so the p-value is less than 0.01, indicating that the null hypothesis is rejected. That is, the data are not stationary. We can difference the data, and apply the test again.
tsibble_pdx %>%
features(Price, unitroot_ndiffs)
## # A tibble: 1 × 2
## RegionName ndiffs
## <chr> <int>
## 1 Portland, OR 1
As we saw from the KPSS tests above, one difference is required to make the Portland data stationary.
The following R code selects a non-seasonal ARIMA model automatically.
fit_arima <- tsibble_pdx %>%
model(ARIMA(Price))
report(fit_arima)
## Series: Price
## Model: ARIMA(2,1,0)(0,0,2)[12] w/ drift
##
## Coefficients:
## ar1 ar2 sma1 sma2 constant
## 1.4839 -0.6566 -0.2862 -0.2478 505.1575
## s.e. 0.0677 0.0699 0.1192 0.1303 51.8211
##
## sigma^2 estimated as 1156456: log likelihood=-1084
## AIC=2180 AICc=2180.69 BIC=2197.16
fit_arima %>% forecast(h=10) %>%
autoplot(tsibble_pdx)+
theme_minimal()
## `mutate_if()` ignored the following grouping variables:
## • Column `year_month`
tsibble_pdx %>%
gg_tsdisplay(difference(Price, 12) %>% difference(),
plot_type='partial', lag=36)+
labs(title = "Double differenced", y="")
## Warning: Removed 13 row(s) containing missing values (geom_path).
## Warning: Removed 13 rows containing missing values (geom_point).
fit <- tsibble_pdx %>%
model(
arima012011 = ARIMA(Price ~ pdq(1,1,0) + PDQ(0,0,1)),
arima210002 = ARIMA(Price ~ pdq(2,1,0) + PDQ(0,0,2)),
auto = ARIMA(Price, stepwise = FALSE, approx = FALSE)
)
glance(fit) %>% arrange(AICc) %>% select(.model:BIC)
## # A tibble: 3 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 auto 953723. -1071. 2157. 2159. 2180.
## 2 arima210002 1156456. -1084. 2180. 2181. 2197.
## 3 arima012011 1951955. -1117. 2243. 2243. 2254.