#loading the required libraries
library(tidyverse)
library(ggplot2)
library(forecast)
library(tseries)
library(lubridate)
library(prophet)

Data Loading + Preparation + Initial TS plot

df <- read.csv("D:/Downloads/PRSA2017_Data_20130301-20170228/PRSA_Data_20130301-20170228/PRSA_Data_Aotizhongxin_20130301-20170228.csv")

#looking at the first 5 observations
head(df, 5)

##   No year month day hour PM2.5 PM10 SO2 NO2  CO O3 TEMP   PRES  DEWP RAIN  wd
## 1  1 2013     3   1    0     4    4   4   7 300 77 -0.7 1023.0 -18.8    0 NNW
## 2  2 2013     3   1    1     8    8   4   7 300 77 -1.1 1023.2 -18.2    0   N
## 3  3 2013     3   1    2     7    7   5  10 300 73 -1.1 1023.5 -18.2    0 NNW
## 4  4 2013     3   1    3     6    6  11  11 300 72 -1.4 1024.5 -19.4    0  NW
## 5  5 2013     3   1    4     3    3  12  12 300 72 -2.0 1025.2 -19.5    0   N
##   WSPM      station
## 1  4.4 Aotizhongxin
## 2  4.7 Aotizhongxin
## 3  5.6 Aotizhongxin
## 4  3.1 Aotizhongxin
## 5  2.0 Aotizhongxin

# uniting the date, month and year into a single column - date
df <- df %>%
  unite('date',year:day, sep = "-", remove = FALSE)

#converting degree Celsius to kelvin
df <- df %>% dplyr::mutate(df, TEMP_abs = TEMP + 273.15)

# converting to datetime object
df[['date']] <- as.POSIXct(df[['date']],format = "%Y-%m-%d")

# aggregating data at daily level 
df <- df %>% 
  group_by(date) %>%
  summarise(daily_mean_TEMP_abs = (mean(TEMP_abs, na.rm = TRUE)))

# plotting the time-series
(plot1 <- df %>%
    ggplot() +
    geom_line(aes (date, daily_mean_TEMP_abs)) + 
    ggtitle("TEMP vs day-month-year") +
    ylab("Temperature") +
    xlab("Date"))

From the past assignment, we know that the TS is variance stationary but mean non-stationary. However, fitting a prophet model can be done irrespective of the stationarity state of the TS.

Fitting a Prophet model + plotting

#fitting a prophet model
prophet_data = df %>% 
  rename(ds = date, # Have to name our date variable "ds"
         y = daily_mean_TEMP_abs)  # Have to name our time series "y"
train = prophet_data %>% 
  filter(ds <= ymd("2017-01-28"))
test = prophet_data %>%
  filter(ds > ymd("2017-01-28"))
model = prophet(train, daily.seasonality = TRUE)
future = make_future_dataframe(model,periods = 30)
forecast = predict(model,future)

A prophet model has been fitted with forecast period as 30 days. This means that beyond the date:2017-01-28, we will make predictions out 30 days. The following plots will help us visualize the forecast.

#plotting the prophet model
plot(model,forecast)+
  ylab("daily_mean_TEMP_abs")+xlab("Date") + ggtitle("Prophet model plot") + 
  theme_bw()

#plotting the prophet model: interactive charts
dyplot.prophet(model,forecast)

The above plots indicate that prophet model has been able to capture the seasonality really well. Also, upon close inspection, it seems that the prophet model has been able to capture the trend too.(Very low but increasing trend)

#visualizing the components
prophet_plot_components(model,forecast)

Trend: The trend seems to have a cycle of ascent and descent over a period of 1.5 years. However, beyond June 2016, the trend has shown a plateau till early months of 2017. This maybe due to factors like global warming, that are not allowing the gradual descent that was observed for the past years.

Weekly: Observing seasonality is difficult at weekly level. However, there is gradual ascent and descent with Tuesday having the lowest value.

Yearly: Seasonal and cyclical nature can be clearly observed here.

Daily: Seasonal and cyclical nature can be seen here. It logically fits with the fact that temperature rises as sun shines and decreased as the sun sets. Other factors may be human intervention like pollution from cars and factories, etc., limited to the area from where data was recorded.

Changepoints

#plotting the change points
plot(model,forecast)+
  add_changepoints_to_plot(model)+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs")+ ggtitle("Default model")

# number of change points: Manual number of change points
model = prophet(train,n.changepoints=50,daily.seasonality=TRUE)
forecast = predict(model,future)
plot(model,forecast)+
  add_changepoints_to_plot(model)+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs") +
  ggtitle("New model with 'n.changepoints = 50' ")

#observing the components again
prophet_plot_components(model,forecast)

With respect to the previous trend (refer to the previous component figure), the one with added change points to 50 has shown to smooth out the trend a bit. However, upon visualizing the two component plots, we can say that not much change is observed in weekly, yearly and daily components. Hence, incorporation of additional change points (to 50) has not added much value to the initially fitted prophet model, which has by default 25 changepoints.

Forecast

Let us forecast out 30 days using our train data. Also, let us visually compare how our forecast varies from the observed values of test data.

#forecast
forecast_plot_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.POSIXct(ds),format = "%Y-%m-%d") %>% 
  filter(ds > ymd("2017-01-28"))

forecast_not_scaled = ggplot()+
  geom_line(aes(test$ds,test$y))+
  geom_line(aes(forecast_plot_data$ds,forecast_plot_data$yhat),color='red')

forecast_scaled = forecast_not_scaled + 
  ylim(250,315)

forecast_scaled

Saturation Points

Ideally saturation points are not required for this TS. Since, we are tracking the absolute value of temperature, ideally the lowest could be 0 Kelvin. But, we all know that it is impossible to achieve 0 Kelvin.The highest temperature recorded in China is around 50.5 degree Celsius. Hence, setting this as the cap for saturation points. For convenience, the saturation points are set to: Lowest: 0 Kelvin, Highest: 323.15 Kelvin.

#saturation points
one_year_future = make_future_dataframe(model,periods = 365)

one_year_forecast = predict(model,one_year_future)

plot(model,one_year_forecast)+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs") + 
  ggtitle("one year future forecast without saturation points")

#specifying saturation points

# Set "floor" in training data
train$floor = 0 # no
train$cap = 323.15
future$floor = 0
future$cap = 323.15
# Set floor in forecast data
one_year_future$floor = 0
one_year_future$cap = 323.15
sat_model = prophet(train,growth='logistic',daily.seasonality=TRUE)
sat_one_year_forecast = predict(sat_model,one_year_future)

#plotting with saturation points
plot(sat_model, sat_one_year_forecast) +
  ylim(0,330) +
  theme_bw() +
  xlab("Date") +
  ylab("daily_mean_TEMP_abs") +
  ggtitle("one year future forecast with saturation points")

Seasonality

# creating new identical test and train data having no saturation points to see trends properly
train_iden = prophet_data %>% 
  filter(ds <= ymd("2017-01-28"))
test_iden = prophet_data %>%
  filter(ds > ymd("2017-01-28"))
model_iden = prophet(train_iden, daily.seasonality = TRUE)
future_iden = make_future_dataframe(model_iden,periods = 30)
forecast_iden = predict(model_iden,future)

#additive seasonality
additive = prophet(train_iden, daily.seasonality = TRUE)
add_fcst = predict(additive,future_iden)
plot(additive,add_fcst) +
  xlab("Date") +
  ylab("daily_mean_TEMP_abs") +
  ggtitle("additive seasonality")+
  ylim(250,323.15)

#multiplicative seasonality
multi = prophet(train_iden,seasonality.mode = 'multiplicative',daily.seasonality = TRUE)
multi_fcst = predict(multi,future_iden)
plot(multi,multi_fcst) + 
  xlab("Date") +
  ylab("daily_mean_TEMP_abs") +
  ggtitle("multiplicative seasonality")+
  ylim(250,323.15)

#additive seasonality: plotting components
prophet_plot_components(additive,add_fcst)

#multiplicative seasonality: plotting components
prophet_plot_components(multi, multi_fcst)

The component plots for additive and multiplicative seasonality incporporated models do not have much difference in the components except the trend part. There are minute changes in the trend before and after 2016. After 2016, as we head towards 2017, the additive model shows increasing (slope is very low but positive) trend whereas the multiplicative one shows constant(slope is almost zero) trend.

Holidays

Let us involve and model holidays to learn if some association can be understood. Since the data was recorded in the province of China, setting the country name to China.

#impact of holidays on TEMP
model = prophet(train_iden,fit = FALSE, daily.seasonality = TRUE)
model = add_country_holidays(model,country_name = 'CN')
model = fit.prophet(model,train_iden)
forecast = predict(model,future_iden)
prophet_plot_components(model,forecast)

Upon comparing with the previous component plots, there is hardly any difference in the components except the trend components between the model involving holidays and additive model. The trend components of the model involving holidays and multiplicative model are identical in nature.

#holiday effects
forecast %>%
  filter(holidays != 0) %>%
  dplyr::select(-ds:-additive_terms_upper, -holidays:-holidays_upper, -weekly:-yhat, -contains("upper"), -contains("lower")) %>%
  mutate_all(~ if_else(. == 0, as.numeric(NA), .)) %>%
  summarize_if(is.numeric, ~ max(., na.rm = T)) %>%
  pivot_longer(
    cols = model$train.holiday.names,
    names_to = 'holiday', 
    values_to = 'effect'
  ) %>%
  ggplot() +
  geom_col(aes(effect,holiday)) +
  theme_bw()

Here, we can see the relative impacts of the major holidays that are celebrated in China on the mean absolute Temperature.

Overall, inclusion of holidays does seem to create a positive impact by involving more information in the model and we can logically add them as the grnularity of this data is daily level.

Performance of prophet

In-sample performance:

# performance on in-sample data
forecast_insample = predict(model,train_iden)
forecast_metric_data = forecast_insample %>% 
  as_tibble() %>% 
  mutate(ds = as.POSIXct(ds),format = "%Y-%m-%d") %>% 
  filter(ds <= ymd("2017-01-28"))
RMSE = sqrt(mean((train_iden$y - forecast_metric_data$yhat)^2))

print(paste("In-sample RMSE:",round(RMSE,2)))

## [1] "In-sample RMSE: 2.44"

#visualization
ggplot()+
  geom_line(aes(train_iden$ds ,train_iden$y), color = 'green')+
  geom_line(aes(forecast_metric_data$ds,forecast_metric_data$yhat),color='red',
            alpha = 1)+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs") +
  ggtitle("In sample performance visualization")

Out-of-sample performance:

# performance on in-sample data
forecast_metric_data = forecast %>% 
  as_tibble() %>% 
  mutate(ds = as.POSIXct(ds),format = "%Y-%m-%d") %>% 
  filter(ds > ymd("2017-01-28"))
RMSE = sqrt(mean((test$y - forecast_metric_data$yhat)^2))

print(paste("Out-of-sample RMSE:",round(RMSE,2)))

## [1] "Out-of-sample RMSE: 3.22"

#visualization
ggplot()+
  geom_line(aes(test$ds ,test$y), color = 'green')+
  geom_line(aes(forecast_metric_data$ds,forecast_metric_data$yhat),color='red',
            alpha = 1)+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs")+
  ggtitle("Out of sample performance visualization")

It seems that our additive model with holiday inclusion does well both in-sample and out-of-sample in capturing the overall trend. When it comes to out of sample performance, the overall spikes are not effectively achieved by our fitted model, but the trend is captured.

Cross Validation

Cross Validation: Additive model

#cross-validation with prophet
df.cv <- cross_validation(model, initial = 365*3, period = 15, horizon = 30, units = 'days')

## Making 21 forecasts with cutoffs between 2016-03-04 and 2016-12-29

head(df.cv)

## # A tibble: 6 x 6
##       y ds                   yhat yhat_lower yhat_upper cutoff             
##   <dbl> <dttm>              <dbl>      <dbl>      <dbl> <dttm>             
## 1  282. 2016-03-05 00:00:00  277.       274.       280. 2016-03-04 00:00:00
## 2  278. 2016-03-06 00:00:00  278.       274.       281. 2016-03-04 00:00:00
## 3  279. 2016-03-07 00:00:00  278.       275.       281. 2016-03-04 00:00:00
## 4  275. 2016-03-08 00:00:00  278.       275.       281. 2016-03-04 00:00:00
## 5  274. 2016-03-09 00:00:00  279.       276.       282. 2016-03-04 00:00:00
## 6  276. 2016-03-10 00:00:00  279.       276.       282. 2016-03-04 00:00:00

unique(df.cv$cutoff)

##  [1] "2016-03-04 GMT" "2016-03-19 GMT" "2016-04-03 GMT" "2016-04-18 GMT"
##  [5] "2016-05-03 GMT" "2016-05-18 GMT" "2016-06-02 GMT" "2016-06-17 GMT"
##  [9] "2016-07-02 GMT" "2016-07-17 GMT" "2016-08-01 GMT" "2016-08-16 GMT"
## [13] "2016-08-31 GMT" "2016-09-15 GMT" "2016-09-30 GMT" "2016-10-15 GMT"
## [17] "2016-10-30 GMT" "2016-11-14 GMT" "2016-11-29 GMT" "2016-12-14 GMT"
## [21] "2016-12-29 GMT"

#plotting Cross-Val Actual vs Pred
df.cv %>% 
  ggplot()+
  geom_point(aes(ds,y)) +
  geom_point(aes(ds,yhat,color=factor(cutoff)))+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs") +
  scale_color_discrete(name = 'Cutoff')

head(performance_metrics(df.cv))

##   horizon      mse     rmse      mae        mape       mdape       smape
## 1  3 days 7.190088 2.681434 2.064839 0.007164684 0.005858644 0.007158430
## 2  4 days 6.371609 2.524205 1.970512 0.006816354 0.005198514 0.006811795
## 3  5 days 5.671169 2.381422 1.906400 0.006604059 0.005470322 0.006604561
## 4  6 days 4.662794 2.159350 1.754430 0.006079898 0.005641507 0.006079027
## 5  7 days 4.821051 2.195689 1.764002 0.006125983 0.005752609 0.006122947
## 6  8 days 6.198930 2.489765 1.946218 0.006779055 0.005775724 0.006771863
##    coverage
## 1 0.7619048
## 2 0.7619048
## 3 0.7936508
## 4 0.8571429
## 5 0.8412698
## 6 0.7777778

plot_cross_validation_metric(df.cv, metric = 'rmse',rolling_window = 0.1)

plot_cross_validation_metric(df.cv, metric = 'mae',rolling_window = 0.1)

plot_cross_validation_metric(df.cv, metric = 'mape',rolling_window = 0.1)

With respect to MAPE for additive model(with holiday inclusion), the interpretation is that our fitted model is able to forecast with around 0.8% error out 10 days, with around 0.65% error out 20 days and with around 0.65% error out 30 days into the future.

Cross Validation: Multiplicative model

#cross-validation with prophet
model = prophet(train_iden,fit = FALSE, daily.seasonality = TRUE, seasonality.mode = 'multiplicative')
model = add_country_holidays(model,country_name = 'CN')
model = fit.prophet(model,train_iden)
df.cv <- cross_validation(model, initial = 365*3, period = 15, horizon = 30, units = 'days')

## Making 21 forecasts with cutoffs between 2016-03-04 and 2016-12-29

head(df.cv)

## # A tibble: 6 x 6
##       y ds                   yhat yhat_lower yhat_upper cutoff             
##   <dbl> <dttm>              <dbl>      <dbl>      <dbl> <dttm>             
## 1  282. 2016-03-05 00:00:00  277.       274.       280. 2016-03-04 00:00:00
## 2  278. 2016-03-06 00:00:00  278.       274.       281. 2016-03-04 00:00:00
## 3  279. 2016-03-07 00:00:00  278.       275.       281. 2016-03-04 00:00:00
## 4  275. 2016-03-08 00:00:00  278.       275.       281. 2016-03-04 00:00:00
## 5  274. 2016-03-09 00:00:00  279.       275.       282. 2016-03-04 00:00:00
## 6  276. 2016-03-10 00:00:00  279.       276.       282. 2016-03-04 00:00:00

unique(df.cv$cutoff)

##  [1] "2016-03-04 GMT" "2016-03-19 GMT" "2016-04-03 GMT" "2016-04-18 GMT"
##  [5] "2016-05-03 GMT" "2016-05-18 GMT" "2016-06-02 GMT" "2016-06-17 GMT"
##  [9] "2016-07-02 GMT" "2016-07-17 GMT" "2016-08-01 GMT" "2016-08-16 GMT"
## [13] "2016-08-31 GMT" "2016-09-15 GMT" "2016-09-30 GMT" "2016-10-15 GMT"
## [17] "2016-10-30 GMT" "2016-11-14 GMT" "2016-11-29 GMT" "2016-12-14 GMT"
## [21] "2016-12-29 GMT"

#plotting Cross-Val Actual vs Pred
df.cv %>% 
  ggplot()+
  geom_point(aes(ds,y)) +
  geom_point(aes(ds,yhat,color=factor(cutoff)))+
  theme_bw()+
  xlab("Date")+
  ylab("daily_mean_TEMP_abs") +
  scale_color_discrete(name = 'Cutoff')

head(performance_metrics(df.cv))

##   horizon      mse     rmse      mae        mape       mdape       smape
## 1  3 days 7.760667 2.785797 2.176506 0.007542685 0.006686238 0.007541059
## 2  4 days 6.899188 2.626631 2.063986 0.007134789 0.005743357 0.007134458
## 3  5 days 5.953085 2.439894 1.948233 0.006748648 0.005743357 0.006753302
## 4  6 days 4.856094 2.203655 1.800220 0.006238106 0.005443915 0.006241576
## 5  7 days 5.042136 2.245470 1.796104 0.006235641 0.005443915 0.006237943
## 6  8 days 6.633942 2.575644 2.045129 0.007113078 0.005469569 0.007111581
##    coverage
## 1 0.7619048
## 2 0.7619048
## 3 0.7777778
## 4 0.8253968
## 5 0.8253968
## 6 0.7460317

plot_cross_validation_metric(df.cv, metric = 'rmse',rolling_window = 0.1)

plot_cross_validation_metric(df.cv, metric = 'mae',rolling_window = 0.1)

plot_cross_validation_metric(df.cv, metric = 'mape',rolling_window = 0.1)

With respect to MAPE, for multiplicative model(with holiday inclusion), the interpretation is that our fitted model is able to forecast with around 0.85% error out 10 days, with around 0.7% error out 20 days and with around 0.65% error out 30 days into the future.

For both the models(with inclusion of holidays), the metrics give almost the same error values with the ones obtained from multiplicative to be slightly higher. Thus, for model selection, Additive model with holiday inclusion is the chosen one.

Forecasting Assignment 3

Mihir Patel

2/17/2022