This project has two parts. In the first part, I used time series methods to show the trends and patterns in monthly house inventory and house sales counts in California (Jan. 2008 - Apr. 2021). I compared the two series and examined the relationship between sales counts and inventory using cor.test(), a linear model, and anova(). I also used ARIMA() and ETS() models to forecast sales counts in San Francisco over the next 3 years.
In the second part, I used time series methods to show the trends and patterns in average house values in California (Jan. 1996 - Apr. 2021). I plotted the time series of values for different house types (single family, condo, and one- to five-bedroom homes) in San Francisco over the same period. Finally, I forecast the value of single-family houses in San Francisco over the next 3 years.
The datasets for this project were downloaded from zillow.com.
options(Ncpus = 8)
library(pacman)
p_load(fs, readr, lubridate, tidyverse, janitor, DataExplorer, summarytools, data.table, dtplyr, ggplot2, ggpubr, zoo, fpp3)
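p_load() from the pacman package installs any of these packages that are missing and then loads them all.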
inventory <- read_csv('https://files.zillowstatic.com/research/public_v2/invt_fs/Metro_invt_fs_uc_sfrcondo_smoothed_month.csv?t=1622670174')
inventory <- inventory %>%
separate(RegionName, c('city', 'state'), sep = ',') %>%
filter(StateName == 'CA') %>%
pivot_longer(-c(1:6), names_to = 'date', values_to = 'inventory') %>%
filter(!is.na(inventory))
inventory$month <- as.yearmon(inventory$date)
invent <- inventory %>% group_by(month) %>%
summarize(tot_invent = sum(inventory))
invent %>%
ggplot(aes(x = month, y = tot_invent)) +
geom_line(col = 'dark blue')
house_county <- read.csv('https://files.zillowstatic.com/research/public_v2/sales_count_now/Metro_sales_count_now_uc_sfrcondo_raw_month.csv?t=1622670174')
sale_county <- house_county %>%
pivot_longer(-c(1:5), names_to = 'date', values_to = 'sold') %>%
filter(StateName == 'CA') %>%
separate(RegionName, c('city', 'state'), sep = ',') %>%
filter(!is.na(sold))
sale_county$date <- as.Date(sale_county$date, format = 'X%Y.%m.%d')
sale_county <- sale_county %>% mutate(month = as.yearmon(date))
head(sale_county, n = 3)
sale1 <- sale_county %>%
select(month, sold) %>%
group_by(month) %>%
summarize(tot_sold = sum(sold))
a <- ggplot(sale1, aes(x=month, y=tot_sold)) +
geom_line(col = 'blue4') +
theme(aspect.ratio=0.3)
sale2 <- sale_county %>%
select(month, sold) %>%
mutate(year = year(month)) %>%
group_by(year) %>%
summarize(tot_sold = sum(sold)) %>%
filter(year < 2021)  # drop the incomplete year 2021
b <- ggplot(sale2, aes(x=year, y=tot_sold)) +
geom_line(col = 'red3') +
theme(aspect.ratio=0.3)
ggarrange(a,b, nrow = 2)
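The monthly panel suggests a recurring within-year pattern. A seasonal plot makes this explicit; below is a minimal sketch using gg_season() from the feasts package (loaded via fpp3), assuming the monthly totals are first converted to a tsibble:
# seasonal plot: one line per year, with months on the x-axis
sale1 %>%
  mutate(month = yearmonth(month)) %>%
  as_tsibble(index = month) %>%
  gg_season(tot_sold)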
sale_region_1 <- sale_county %>%
select(month, sold, city) %>%
group_by(month, city) %>%
summarize(tot_sold = sum(sold))
a <- ggplot(sale_region_1, aes(x=month, y=tot_sold, colour = city)) +
geom_line() +
theme(aspect.ratio=0.3,
legend.text = element_text(size = 5),
legend.key.size = unit(0.3, "cm"))
sale_region_2 <- sale_county %>%
select(month, sold, city) %>%
mutate(year = year(month)) %>%
group_by(year, city) %>%
summarize(tot_sold = sum(sold)) %>%
filter(year < 2021)  # drop the incomplete year 2021
b <- ggplot(sale_region_2, aes(x=year, y=tot_sold, col = city)) +
geom_line() +
theme(aspect.ratio=0.3,
legend.text = element_text(size = 5),
legend.key.size = unit(0.3, "cm"))
ggarrange(a,b, nrow = 2)
sale_inventory <- invent %>%
inner_join(sale1, by = 'month') %>%
select(month, tot_invent, tot_sold)
head(sale_inventory, n = 3)
sale_inventory0 <- sale_inventory %>%
pivot_longer(tot_invent:tot_sold, names_to = 'type', values_to = 'count')
ggplot(sale_inventory0,aes(x=month, y=count, col = type)) +
geom_line()
sale_inventory %>%
ggplot(aes(x=tot_invent, y=tot_sold)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE)
cor.test(sale_inventory$tot_invent, sale_inventory$tot_sold)
##
## Pearson's product-moment correlation
##
## data: sale_inventory$tot_invent and sale_inventory$tot_sold
## t = 2.0369, df = 39, p-value = 0.04849
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.00268852 0.56393520
## sample estimates:
## cor
## 0.3100825
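The statewide correlation between monthly inventory and sales counts is positive but weak (r = 0.31) and only marginally significant (p = 0.048).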
mod_lm <- lm(tot_sold ~ tot_invent, data = sale_inventory)
summary(mod_lm)
##
## Call:
## lm(formula = tot_sold ~ tot_invent, data = sale_inventory)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10385 -3943 480 3944 7505
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.884e+04 4.735e+03 3.979 0.000291 ***
## tot_invent 1.218e-01 5.981e-02 2.037 0.048492 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5075 on 39 degrees of freedom
## Multiple R-squared: 0.09615, Adjusted R-squared: 0.07298
## F-statistic: 4.149 on 1 and 39 DF, p-value: 0.04849
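Consistent with the weak correlation, inventory explains only about 10% of the month-to-month variation in statewide sales (multiple R-squared = 0.096). The relationship looks quite different at the city level, examined next.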
sale_invent_ct <- inventory %>%
inner_join(sale_county, by = c('month', 'city'))
sale_invent_ct %>%
pivot_longer(c(inventory,sold), names_to = 'type', values_to = 'count') %>%
ggplot(aes(x=month, y=count, col = type)) +
geom_line() +
facet_wrap(.~ city, nrow = 5, scales = 'free_y')
sale_invent_ct %>%
ggplot(aes(x=inventory, y=sold)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE)
sale_invent_ct %>%
ggplot(aes(x=inventory, y=sold, col=city)) +
geom_point() +
geom_smooth(method = 'lm', se = FALSE)
mod_lm2 <- lm(sold ~ inventory * city, data = sale_invent_ct)
anova(mod_lm2)
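The formula sold ~ inventory * city fits a separate intercept and a separate inventory slope for each city, so anova() tests whether the city effect and the inventory:city interaction (city-specific slopes) add explanatory power beyond inventory alone.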
cor.test(sale_invent_ct$inventory, sale_invent_ct$sold)
##
## Pearson's product-moment correlation
##
## data: sale_invent_ct$inventory and sale_invent_ct$sold
## t = 51.847, df = 398, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9193727 0.9448770
## sample estimates:
## cor
## 0.9332918
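Pooled across city-months, the correlation is far stronger (r = 0.93), though much of this is likely a scale effect: larger markets carry more inventory and record more sales.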
cityname <- 'San Francisco'
tss <- sale_county %>%
filter(city == cityname) %>%
select(month, sold) %>%
mutate(month = yearmonth(month)) %>%
as_tsibble(index = month)
head(tss, n=3)
tss %>% autoplot(col = 'blue4')
tss %>%
features(sold, unitroot_kpss)
The KPSS test takes stationarity as its null hypothesis. The p-value is less than 0.05, so the null hypothesis is rejected; that is, the data are not stationary. We can difference the data and apply the test again.
tss %>%
mutate(diff_sold = difference(sold)) %>%
features(diff_sold, unitroot_kpss)
The appropriate number of first differences can be determined with the unitroot_ndiffs() feature, and the appropriate number of seasonal differences with unitroot_nsdiffs().
tss %>%
features(sold, unitroot_ndiffs)
tss %>%
features(sold, unitroot_nsdiffs)
tss %>%
transmute(
`Sold` = sold,
`Log Sold` = log(sold),
`Annual change in log Sold` = difference(log(sold), 12),
`Doubly differenced log Sold` =
difference(difference(log(sold), 12), 1)) %>%
pivot_longer(-month, names_to="data_type", values_to="data") %>%
mutate(
data_type = as.factor(data_type)) %>%
ggplot(aes(x = month, y = data)) +
geom_line() +
facet_grid(vars(data_type), scales = "free_y")
Split the data from Jan. 2008 through Dec. 2018 into the training set, keeping the remaining observations as the test set.
train <- tss %>%
filter_index(. ~ "2018 Dec")
ARIMA()
fit_arima <- train %>% model(ARIMA(sold))
report(fit_arima)
## Series: sold
## Model: ARIMA(3,0,1)(0,1,2)[12] w/ drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 sma1 sma2 constant
## -0.3504 0.5028 0.3666 0.8538 -0.4497 -0.3006 39.2581
## s.e. 0.1255 0.1078 0.0995 0.1146 0.1229 0.1125 17.8790
##
## sigma^2 estimated as 69539: log likelihood=-833.52
## AIC=1683.03 AICc=1684.34 BIC=1705.26
fit_arima %>% gg_tsresiduals(lag_max = 16)
augment(fit_arima) %>%
features(.innov, ljung_box, lag = 16, dof = 6)
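The Ljung-Box test checks whether the innovation residuals are distinguishable from white noise; a large p-value means the residuals resemble white noise, i.e., the model has captured the autocorrelation in the data.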
ETS()
fit_ets <- train %>% model(ETS(sold))
report(fit_ets)
## Series: sold
## Model: ETS(M,Ad,M)
## Smoothing parameters:
## alpha = 0.4366114
## beta = 0.0001492463
## gamma = 0.0001006311
## phi = 0.9799985
##
## Initial states:
## l b s1 s2 s3 s4 s5 s6
## 2663.647 30.38833 0.657838 0.9789844 0.9593221 1.073763 1.013181 1.125743
## s7 s8 s9 s10 s11 s12
## 1.154598 1.217304 1.141733 1.0354 0.9660794 0.6760525
##
## sigma^2: 0.0053
##
## AIC AICc BIC
## 2111.257 2117.364 2163.010
fit_ets %>%
gg_tsresiduals(lag_max = 16)
augment(fit_ets) %>%
features(.innov, ljung_box, lag = 16, dof = 6)
The output below evaluates the forecasting performance of the two competing models on the training and test sets. In this case the ARIMA model appears slightly more accurate based on the test set RMSE, MAPE, and MASE.
bind_rows(
fit_arima %>% accuracy(),
fit_ets %>% accuracy(),
fit_arima %>% forecast(h = "3 years") %>%
accuracy(tss),
fit_ets %>% forecast(h = "3 years") %>%
accuracy(tss)
) %>%
select(-ME, -MPE, -ACF1)
tss %>%
model(ARIMA(sold)) %>%
forecast(h="3 years") %>%
head(n = 5)
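The sold column above holds a full forecast distribution. To obtain explicit interval bounds, hilo() can unpack it (the same approach is used for house values later); a minimal sketch:
tss %>%
  model(ARIMA(sold)) %>%
  forecast(h = "3 years") %>%
  hilo(level = c(80, 95)) %>%
  head(n = 5)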
tss %>%
model(ARIMA(sold)) %>%
forecast(h="3 years") %>%
autoplot(tss)
house_value <- read.csv('https://files.zillowstatic.com/research/public_v2/zhvi/Metro_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
house_value <- house_value %>%
separate(RegionName, c('city', 'state'), sep = ',') %>%
pivot_longer(-c(1:6), names_to = 'date', values_to = 'value') %>%
filter(StateName == 'CA') %>%
filter(!is.na(value))
house_value$date <- as.Date(house_value$date, format = 'X%Y.%m.%d')
house_value <- house_value %>% mutate(month = as.yearmon(date))
head(house_value, n = 3)
value_a <- house_value %>%
select(city, month, value) %>%
group_by(month) %>%
summarize(mean_value = mean(value))
a <- ggplot(value_a, aes(x=month, y=mean_value)) +
geom_line(col = 'dark blue') +
geom_vline(aes(xintercept = as.numeric(as.yearmon('2018-08'))),
linetype = 'dotted', col = 'green4') +
geom_vline(aes(xintercept = as.numeric(as.yearmon('2020-06'))),
linetype = 'dotted', col = 'green4') +
geom_text(aes(x=as.numeric(as.yearmon('2018-08')), y = 200000),
label = '2018-08-31', size = 2.5, col = 'green4')+
geom_text(aes(x=as.numeric(as.yearmon('2020-06')), y = 300000),
label = '2020-06-30', size = 2.5, col = 'green4')+
theme(aspect.ratio=0.37)
value_b <- house_value %>%
mutate(year = year(month)) %>%
select(city, year, month, value) %>%
group_by(year) %>%
summarize(mean_value = mean(value))
b <- ggplot(value_b,aes(x=year, y=mean_value)) +
geom_line(col = 'orange') +
geom_vline(aes(xintercept = as.numeric(as.yearmon('2006-01'))),
linetype = 'dotted', col = 'green4') +
geom_vline(aes(xintercept = as.numeric(as.yearmon('2012-01'))),
linetype = 'dotted', col = 'green4') +
geom_text(aes(x=as.numeric(as.yearmon('2006-01')), y = 470000),
label = '2006-01-31', size = 2.5, col = 'green4')+
geom_text(aes(x=as.numeric(as.yearmon('2012-01')), y = 200000),
label = '2012-01-31', size = 2.5, col = 'green4')+
theme(aspect.ratio=0.37)
ggarrange(a,b, nrow = 2)
city1 <- c('San Francisco', 'San Jose', 'Sacramento', 'Riverside',
'Napa', 'Santa Cruz')
value_city <- house_value %>%
filter(city %in% city1)
value_city1 <- value_city %>%
select(month, value, city) %>%
group_by(month, city) %>%
summarize(mean_price = mean(value))
a <- value_city1 %>%
ggplot(aes(x=month, y=mean_price, col = city)) +
geom_line()+
theme(aspect.ratio=0.35)
value_city2 <- value_city %>%
mutate(year = year(month)) %>%
select(year, value, city) %>%
group_by(year, city) %>%
summarize(mean_value = mean(value))
b <- value_city2 %>%
ggplot(aes(x=year, y=mean_value, col = city)) +
geom_line() +
theme(aspect.ratio=0.35)
ggarrange(a,b, nrow = 2)
value0sg <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_uc_sfr_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
value0cd <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_uc_condo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
value01 <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_bdrmcnt_1_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
value02 <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_bdrmcnt_2_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
value03 <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_bdrmcnt_3_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
value04 <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_bdrmcnt_4_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
value05 <- read_csv('https://files.zillowstatic.com/research/public_v2/zhvi/City_zhvi_bdrmcnt_5_uc_sfrcondo_tier_0.33_0.67_sm_sa_mon.csv?t=1622670174')
# helper: keep CA rows, pivot the wide date columns to long format,
# drop missing values, and add a yearmon column
tidy_value <- function(df, col) {
  df %>%
    filter(StateName == 'CA') %>%
    pivot_longer(-c(1:8), names_to = 'date', values_to = col) %>%
    filter(!is.na(.data[[col]])) %>%
    mutate(month = as.yearmon(date))
}
value_1 <- tidy_value(value01, 'room1')
value_2 <- tidy_value(value02, 'room2')
value_3 <- tidy_value(value03, 'room3')
value_4 <- tidy_value(value04, 'room4')
value_5 <- tidy_value(value05, 'room5')
value_s <- tidy_value(value0sg, 'values')
value_c <- tidy_value(value0cd, 'valuec')
head(value_1, n = 2)
# helper: keep the identifying columns plus the value column, and add a year column
add_year <- function(df, col) {
  df %>%
    mutate(year = year(month)) %>%
    select(RegionName, date, month, year, all_of(col))
}
value_s_city <- add_year(value_s, 'values')
value_c_city <- add_year(value_c, 'valuec')
value_1_city <- add_year(value_1, 'room1')
value_2_city <- add_year(value_2, 'room2')
value_3_city <- add_year(value_3, 'room3')
value_4_city <- add_year(value_4, 'room4')
value_5_city <- add_year(value_5, 'room5')
head(value_s_city, n = 2)
value_city <- value_s_city %>%
inner_join(value_c_city, by = c('RegionName', 'month', 'date')) %>%
inner_join(value_1_city, by = c('RegionName', 'month', 'date')) %>%
inner_join(value_2_city, by = c('RegionName', 'month', 'date')) %>%
inner_join(value_3_city, by = c('RegionName', 'month', 'date')) %>%
inner_join(value_4_city, by = c('RegionName', 'month', 'date')) %>%
inner_join(value_5_city, by = c('RegionName', 'month', 'date'))
value_city0 <-
value_city %>%
select(month, values, valuec, room1, room2, room3, room4, room5) %>%
group_by(month) %>%
summarize(single_family=mean(values), condo=mean(valuec),
one_room = mean(room1), two_room = mean(room2),
three_room = mean(room3), four_room = mean(room4),
five_room = mean(room5))
value_city0 %>%
pivot_longer(-1, names_to = 'type', values_to = 'house_value') %>%
ggplot(aes(x = month, y = house_value, col = type)) +
geom_line()
city2 <- c('San Francisco', 'San Jose', 'Redwood City', 'Fremont')
value_city %>%
filter(RegionName %in% city2) %>%
select(RegionName, month, values, valuec, room1, room2, room3, room4, room5) %>%
pivot_longer(-c(1:2), names_to = 'type', values_to = 'house_value') %>%
ggplot(aes(x = month, y = house_value, col = type)) +
geom_line() +
facet_wrap(.~ RegionName, nrow = 2) +
theme(axis.text.x = element_text(angle = 30, size = 7))
house_type <- 'values'   # column name for single-family values, stored as a string
city <- 'San Francisco'
ts <- value_city %>%
  filter(RegionName == city) %>%
  select(month, all_of(house_type)) %>%   # all_of() selects a column named by a string
  mutate(month = yearmonth(month)) %>%
  as_tsibble(index = month)
head(ts, n=3)
ts %>% autoplot(col = 'blue4')
ts %>%
features(values, unitroot_kpss)
As before, the p-value is less than 0.05, so the KPSS null hypothesis of stationarity is rejected; the data are not stationary. We can difference the data and apply the test again.
ts %>%
mutate(diff_value = difference(values)) %>%
features(diff_value, unitroot_kpss)
Again, the appropriate number of first differences is determined with the unitroot_ndiffs() feature.
ts %>%
features(values, unitroot_ndiffs)
ts %>%
mutate(log_value = log(values)) %>%
features(log_value, unitroot_nsdiffs)
ts %>%
transmute(
`Value` = values,
`Log Value` = log(values),
`Monthly change in log value` = difference(log(values), 1),
`Doubly differenced log value` =
difference(difference(log(values), 1), 1)) %>%
pivot_longer(-month, names_to="data_type", values_to="data") %>%
mutate(
data_type = as.factor(data_type)) %>%
ggplot(aes(x = month, y = data)) +
geom_line() +
facet_grid(vars(data_type), scales = "free_y")
As before, split the series into a training set (Jan. 1996 through Dec. 2016) and keep the rest as the test set.
train <- ts %>%
  filter_index(. ~ "2016 Dec")
ARIMA()
fit_arima <- train %>% model(ARIMA(log(values)))
report(fit_arima)
## Series: values
## Model: ARIMA(3,2,0)(2,0,0)[12]
## Transformation: log(values)
##
## Coefficients:
## ar1 ar2 ar3 sar1 sar2
## -0.0046 0.1281 -0.3824 -0.7613 -0.4774
## s.e. 0.0590 0.0593 0.0589 0.0577 0.0581
##
## sigma^2 estimated as 1.055e-05: log likelihood=1095.85
## AIC=-2179.71 AICc=-2179.37 BIC=-2158.56
fit_arima %>% gg_tsresiduals(lag_max = 16)
augment(fit_arima) %>%
features(.innov, ljung_box, lag = 16, dof = 6)
ETS()
fit_ets <- train %>% model(ETS(log(values)))
report(fit_ets)
## Series: values
## Model: ETS(M,A,N)
## Transformation: log(values)
## Smoothing parameters:
## alpha = 0.9998986
## beta = 0.8721333
##
## Initial states:
## l b
## 12.62794 -0.003258111
##
## sigma^2: 0
##
## AIC AICc BIC
## -1355.511 -1355.268 -1337.844
fit_ets %>%
gg_tsresiduals(lag_max = 16)
augment(fit_ets) %>%
features(.innov, ljung_box, lag = 16, dof = 6)
The output below evaluates the forecasting performance of the two competing models on the training and test sets. The ARIMA model again appears slightly more accurate based on the test set RMSE, MAPE, and MASE.
bind_rows(
fit_arima %>% accuracy(),
fit_ets %>% accuracy(),
fit_arima %>% forecast(h = "3 years") %>%
accuracy(ts),
fit_ets %>% forecast(h = "3 years") %>%
accuracy(ts)
) %>%
select(-ME, -MPE, -ACF1)
value_fc <- ts %>%
model(ARIMA(values)) %>%
forecast(h="3 years") %>%
hilo(level = c(80, 95))
value_fc %>% head(n = 5)
ts %>%
model(ARIMA(values)) %>%
forecast(h="3 years") %>%
autoplot(ts)