# import the dataset
library(readr)
data <- read_csv('train.csv')
# change the variable names
colnames(data) <- c('date','max','min','range','precipitation','humidity','avgwindspeed','v1','v2','v3','avgtemp')
knitr::kable(head(data))
| date | max | min | range | precipitation | humidity | avgwindspeed | v1 | v2 | v3 | avgtemp |
|---|---|---|---|---|---|---|---|---|---|---|
| 1960-01-01 | 2.2 | -5.2 | 7.4 | NA | 68.3 | 1.7 | 6.7 | NA | NA | -1.6 |
| 1960-01-02 | 1.2 | -5.6 | 6.8 | 0.4 | 87.7 | 1.3 | 0.0 | NA | NA | -1.9 |
| 1960-01-03 | 8.7 | -2.1 | 10.8 | 0.0 | 81.3 | 3.0 | 0.0 | NA | NA | 4.0 |
| 1960-01-04 | 10.8 | 1.2 | 9.6 | 0.0 | 79.7 | 4.4 | 2.6 | NA | NA | 7.5 |
| 1960-01-05 | 1.3 | -8.2 | 9.5 | NA | 44.0 | 5.1 | 8.2 | NA | NA | -4.6 |
| 1960-01-06 | -1.2 | -9.5 | 8.3 | 0.0 | 51.3 | 1.8 | 7.7 | NA | NA | -5.2 |
Hmisc::describe(data)
## data
##
## 11 Variables 23011 Observations
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 23011 0 23011 1 1991-07-02 7671 1963-02-24
## .10 .25 .50 .75 .90 .95
## 1966-04-20 1975-10-01 1991-07-02 2007-04-01 2016-09-12 2019-11-06
##
## lowest : 1960-01-01 1960-01-02 1960-01-03 1960-01-04 1960-01-05
## highest: 2022-12-27 2022-12-28 2022-12-29 2022-12-30 2022-12-31
## --------------------------------------------------------------------------------
## max
## n missing distinct Info Mean Gmd .05 .10
## 23008 3 490 1 17.07 12.26 -0.965 1.800
## .25 .50 .75 .90 .95
## 7.800 18.900 26.400 29.900 31.500
##
## lowest : -13.6 -12.4 -12.1 -11.8 -11.3, highest: 38 38.2 38.3 38.4 39.6
## --------------------------------------------------------------------------------
## min
## n missing distinct Info Mean Gmd .05 .10
## 23008 3 473 1 8.452 12.17 -9.00 -6.13
## .25 .50 .75 .90 .95
## -0.30 9.20 17.90 22.00 23.70
##
## lowest : -20.2 -19.2 -18.6 -18.5 -18.4, highest: 28.7 28.8 29.2 30 30.3
## --------------------------------------------------------------------------------
## range
## n missing distinct Info Mean Gmd .05 .10
## 23007 4 174 1 8.619 3.299 3.8 4.8
## .25 .50 .75 .90 .95
## 6.6 8.6 10.6 12.4 13.4
##
## lowest : 1 1.1 1.2 1.3 1.4 , highest: 17.9 18.5 18.6 19.1 19.6
## --------------------------------------------------------------------------------
## precipitation
## n missing distinct Info Mean Gmd .05 .10
## 9150 13861 771 0.985 9.594 15.12 0.0 0.0
## .25 .50 .75 .90 .95
## 0.1 1.4 8.5 27.3 47.5
##
## lowest : 0 0.1 0.2 0.3 0.4 , highest: 273.2 273.4 294.6 301.5 332.8
## --------------------------------------------------------------------------------
## humidity
## n missing distinct Info Mean Gmd .05 .10
## 23011 0 648 1 65.2 16.61 40.6 45.8
## .25 .50 .75 .90 .95
## 54.9 65.5 75.8 84.3 89.0
##
## lowest : 17.9 19 19.9 20.1 20.8, highest: 98.5 98.8 99 99.3 99.8
## --------------------------------------------------------------------------------
## avgwindspeed
## n missing distinct Info Mean Gmd .05 .10
## 23007 4 76 0.999 2.381 1.02 1.2 1.4
## .25 .50 .75 .90 .95
## 1.7 2.2 2.9 3.7 4.2
##
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 7.2 7.3 7.4 7.5 7.8
## --------------------------------------------------------------------------------
## v1
## n missing distinct Info Mean Gmd .05 .10
## 22893 118 138 0.998 5.859 4.365 0.0 0.0
## .25 .50 .75 .90 .95
## 2.2 6.6 9.0 10.5 11.3
##
## lowest : 0 0.1 0.2 0.3 0.4 , highest: 13.3 13.4 13.5 13.6 13.7
## --------------------------------------------------------------------------------
## v2
## n missing distinct Info Mean Gmd .05 .10
## 18149 4862 2728 1 11.93 7.331 2.33 3.57
## .25 .50 .75 .90 .95
## 7.00 11.22 16.62 21.03 23.26
##
## lowest : 0 0.01 0.02 0.05 0.06 , highest: 30.44 30.79 31.11 31.16 33.48
## --------------------------------------------------------------------------------
## v3
## n missing distinct Info Mean Gmd .05 .10
## 22645 366 934 0.998 48.65 36.04 0.0 0.0
## .25 .50 .75 .90 .95
## 17.8 55.7 78.0 85.7 89.4
##
## lowest : 0 0.7 0.8 0.9 1 , highest: 95.5 95.7 95.8 95.9 96.9
## --------------------------------------------------------------------------------
## avgtemp
## n missing distinct Info Mean Gmd .05 .10
## 23011 0 473 1 12.42 12.03 -5.2 -2.2
## .25 .50 .75 .90 .95
## 3.4 13.8 21.8 25.2 26.7
##
## lowest : -16.4 -15.5 -15.3 -15.2 -14.9, highest: 31.9 32.6 33.1 33.6 33.7
## --------------------------------------------------------------------------------
# Creating a custom theme for visualization
library(tidyverse)
my_theme <- function(base_size = 8, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
    theme(
      axis.text = element_text(size = 8),
      axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
      axis.title = element_text(size = 8),
      panel.grid.major = element_line(color = "gray"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "white"),
      strip.background = element_rect(fill = "#2c024c", color = "#290029", size = 0.5),
      strip.text = element_text(face = "bold", size = 8, color = "white"),
      legend.position = "bottom",
      legend.justification = "center",
      legend.background = element_blank(),
      panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
    )
}
theme_set(my_theme())
mycolors <- c("#f32440", "#2185ef", "#d421ef")
# Creating function to see the missing values
bar_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%
    melt %>%
    ggplot(data = ., aes(x = Var2)) +
    geom_bar(aes(y = after_stat(count), fill = value), alpha = 0.7) +
    scale_fill_manual(values = c(mycolors[2], mycolors[1]), name = "",
                      labels = c("Available", "Missing")) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
    labs(x = "Variables in Dataset",
         y = "Observations",
         title = 'Missing Variables') +
    coord_flip()
}
matrix_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%
    melt %>%
    ggplot(data = ., aes(x = Var1, y = Var2)) +
    geom_tile(aes(fill = value), alpha = 0.6) +
    scale_fill_manual(values = c(mycolors[2], mycolors[1]), name = "",
                      labels = c("Available", "Missing")) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
    labs(x = "Variables in Dataset",
         y = "Total observations") +
    coord_flip()
}
library(gridExtra)
m1 <- data %>% bar_missing()
m2 <- data %>% matrix_missing()
grid.arrange(m1,m2,nrow=2)
library(tidyr)
data %>%
pivot_longer(cols=-date,
names_to='variable',
values_to='value') %>% # changing the data to long format for visualization purpose
ggplot(aes(x=date,y=value))+
geom_line(color=mycolors[2],alpha=0.8)+
facet_wrap(~variable,scales = 'free_y',nrow=4)+
labs(title='Time-Series Trend for Variables',x='Date',y='Value')
# box plot
data %>%
pivot_longer(cols=-date,
names_to='variable',
values_to='value') %>%
ggplot(aes(x=variable,y=value))+
geom_boxplot(alpha=0.6,fill=mycolors[2])+
facet_wrap(~variable,scales = 'free',nrow=4)+
coord_flip()+
labs(title='Boxplot for Variables')
# distribution plot
data %>%
pivot_longer(cols=-date,
names_to='variable',
values_to='value') %>%
ggplot(aes(x=value,y=after_stat(count)))+
geom_density(alpha=0.6,fill=mycolors[2])+
facet_wrap(~variable,scales = 'free',nrow=4)+
labs(title='Distribution for Variables')
library(stats)
library(forecast)
# Convert into ts format to conduct trend decomposition
ts_avgtemp <- ts(data$avgtemp,start=1960,frequency = 365.25)
ts_avgtemp %>% mstl() %>%
autoplot(color=mycolors[2])+
labs(title='Trend Decomposition')
# extract data from 2019 onward
df <- data[data$date>'2018-12-31',]
Hmisc::describe(df)
## df
##
## 11 Variables 1461 Observations
## --------------------------------------------------------------------------------
## date
## n missing distinct Info Mean Gmd .05
## 1461 0 1461 1 2020-12-31 487.3 2019-03-15
## .10 .25 .50 .75 .90 .95
## 2019-05-27 2020-01-01 2020-12-31 2021-12-31 2022-08-07 2022-10-19
##
## lowest : 2019-01-01 2019-01-02 2019-01-03 2019-01-04 2019-01-05
## highest: 2022-12-27 2022-12-28 2022-12-29 2022-12-30 2022-12-31
## --------------------------------------------------------------------------------
## max
## n missing distinct Info Mean Gmd .05 .10
## 1461 0 385 1 18.26 11.79 0.3 3.2
## .25 .50 .75 .90 .95
## 9.7 19.5 27.3 30.3 32.1
##
## lowest : -10.7 -8.6 -8.4 -7.5 -7.3 , highest: 35.9 36 36.1 36.5 36.8
## --------------------------------------------------------------------------------
## min
## n missing distinct Info Mean Gmd .05 .10
## 1460 1 394 1 9.385 12.23 -8.40 -5.51
## .25 .50 .75 .90 .95
## 0.80 9.45 19.00 23.11 24.70
##
## lowest : -18.6 -16.6 -16.5 -15.5 -14.4, highest: 27.2 27.3 27.4 27.8 27.9
## --------------------------------------------------------------------------------
## range
## n missing distinct Info Mean Gmd .05 .10
## 1460 1 151 1 8.867 3.364 3.7 4.8
## .25 .50 .75 .90 .95
## 6.8 9.0 10.9 12.7 13.6
##
## lowest : 1.5 1.7 1.8 1.9 2 , highest: 16.6 17 17.2 17.5 17.9
## --------------------------------------------------------------------------------
## precipitation
## n missing distinct Info Mean Gmd .05 .10
## 597 864 195 0.976 9.22 14.72 0.00 0.00
## .25 .50 .75 .90 .95
## 0.00 1.20 8.00 27.02 50.46
##
## lowest : 0 0.1 0.2 0.3 0.4 , highest: 114.5 120 123.1 129.6 176.2
## --------------------------------------------------------------------------------
## humidity
## n missing distinct Info Mean Gmd .05 .10
## 1461 0 467 1 62.53 17.09 37.8 42.9
## .25 .50 .75 .90 .95
## 51.6 63.0 73.0 82.1 87.4
##
## lowest : 17.9 24.3 25.5 25.6 26.3, highest: 96.8 97.6 98.1 98.5 99.3
## --------------------------------------------------------------------------------
## avgwindspeed
## n missing distinct Info Mean Gmd .05 .10
## 1461 0 42 0.997 2.249 0.7343 1.4 1.5
## .25 .50 .75 .90 .95
## 1.8 2.1 2.6 3.2 3.5
##
## lowest : 0.6 0.9 1 1.1 1.2, highest: 4.6 4.8 5 5.8 6
## --------------------------------------------------------------------------------
## v1
## n missing distinct Info Mean Gmd .05 .10
## 1458 3 137 0.999 6.393 4.551 0.0 0.1
## .25 .50 .75 .90 .95
## 2.6 7.5 9.5 11.1 12.1
##
## lowest : 0 0.1 0.2 0.3 0.4 , highest: 13.2 13.3 13.4 13.5 13.6
## --------------------------------------------------------------------------------
## v2
## n missing distinct Info Mean Gmd .05 .10
## 1457 4 1098 1 14.15 8.12 3.196 5.026
## .25 .50 .75 .90 .95
## 8.910 13.150 19.410 24.718 26.616
##
## lowest : 0.54 0.75 0.84 0.96 1.06 , highest: 30.01 30.06 30.35 30.44 30.79
## --------------------------------------------------------------------------------
## v3
## n missing distinct Info Mean Gmd .05 .10
## 1461 0 644 0.999 53.51 37.83 0.0 0.7
## .25 .50 .75 .90 .95
## 21.3 62.2 85.2 91.9 93.0
##
## lowest : 0 0.7 0.8 0.9 1 , highest: 94.8 94.9 95.1 95.3 95.9
## --------------------------------------------------------------------------------
## avgtemp
## n missing distinct Info Mean Gmd .05 .10
## 1461 0 386 1 13.48 11.84 -4.1 -1.2
## .25 .50 .75 .90 .95
## 5.3 14.0 22.8 26.1 27.5
##
## lowest : -14.9 -14.5 -12.2 -12.1 -11.8, highest: 31.1 31.2 31.5 31.6 31.7
## --------------------------------------------------------------------------------
nm1 <- df %>% bar_missing()
nm2 <- df %>% matrix_missing()
grid.arrange(nm1,nm2,nrow=2)
# fill 0 for precipitation NA values
df$precipitation <- ifelse(is.na(df$precipitation),0,df$precipitation)
# fill previous value for rest of data with missing values
library(zoo)
df <- na.locf(df)
df %>%
pivot_longer(cols=-date,
names_to='variable',
values_to='value') %>%
ggplot(aes(x=date,y=value))+
geom_line(color=mycolors[1],alpha=0.8)+
facet_wrap(~variable,scales = 'free_y',nrow=4)+
labs(title='Time-Series Trend for Variables',x='Date',y='Value')
df %>%
pivot_longer(cols=-date,
names_to='variable',
values_to='value') %>%
ggplot(aes(x=variable,y=value))+
geom_boxplot(alpha=0.6,fill=mycolors[1])+
facet_wrap(~variable,scales = 'free',nrow=4)+
coord_flip()+
labs(title='Boxplot for Variables')
df %>%
pivot_longer(cols=-date,
names_to='variable',
values_to='value') %>%
ggplot(aes(x=value,y=after_stat(count)))+
geom_density(alpha=0.6,fill=mycolors[1])+
facet_wrap(~variable,scales = 'free',nrow=4)+
labs(title='Distribution for Variables')
ts_avgtemp_new <- ts(df$avgtemp,start=2019,frequency = 365)
ts_avgtemp_new %>% mstl() %>%
autoplot(color=mycolors[1])+
labs(title='Trend Decomposition')
Here, I will use data from 2019–2021 to predict the 2022 daily average temperature.
Let's split the data into training and test sets.
# split the data sets into train and test
train <- df[df$date<'2022-01-01',]
train <- train$avgtemp
test <- df[df$date>='2022-01-01',]
test <- test$avgtemp
# convert the data into ts format
ts_train <- ts(train,frequency = 365)
ts_test <- ts(test,start=c(4,2),frequency = 365)
For time-series data, it is important to check stationarity before building a model. If the series is not stationary, future values cannot be forecast reliably because the mean, variance, or autocovariance changes over time.
Let's explore stationarity using the ACF and PACF plots. The ACF plot helps determine the MA order q of an ARIMA model, while the PACF plot helps determine the AR order p.
acf <- ggAcf(ts_train,color=mycolors[1],alpha=0.7)+labs(title='ACF')
pacf <- ggPacf(ts_train,color=mycolors[1],alpha=0.7)+labs(title='PACF')
grid.arrange(acf,pacf)
library(tseries)
adf.test(ts_train)
##
## Augmented Dickey-Fuller Test
##
## data: ts_train
## Dickey-Fuller = -1.4244, Lag order = 10, p-value = 0.822
## alternative hypothesis: stationary
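As a cross-check on the ADF result, the number of differences required for stationarity can also be estimated directly; a minimal sketch, not part of the original workflow, using forecast::ndiffs (which applies a KPSS-based test by default):
# estimate how many first differences are needed for stationarity
ndiffs(ts_train)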
# difference the series
lag_ts_train <- diff(ts_train)
# plot ACF and PACF for the differenced data
lag_acf <- ggAcf(lag_ts_train,color=mycolors[1],alpha=0.7)+labs(title='ACF')
lag_pacf <- ggPacf(lag_ts_train,color=mycolors[1],alpha=0.7)+labs(title='PACF')
grid.arrange(lag_acf,lag_pacf)
After differencing, there is a clear change in the ACF plot, so we can reasonably assume stationarity has been achieved. Since significant spikes still persist across many lags, I will use p = 1 and q = 1 when building a custom ARIMA model (see the sketch after the ADF test below).
Let's conduct the ADF test on the differenced series.
adf.test(lag_ts_train)
##
## Augmented Dickey-Fuller Test
##
## data: lag_ts_train
## Dickey-Fuller = -13.35, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
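Before turning to automatic model selection, the custom ARIMA(1,1,1) suggested by the ACF/PACF inspection could be fit directly; a minimal sketch, assuming the forecast package is attached (this fit is not used in the comparison below):
# reference fit for the custom ARIMA(1,1,1) discussed above (not used in the model comparison)
fit_custom <- Arima(ts_train, order = c(1, 1, 1))
summary(fit_custom)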
# fit auto ARIMA model
fit_arima <- ts_train %>% auto.arima()
fit_arima
## Series: .
## ARIMA(3,0,0)(0,1,0)[365]
##
## Coefficients:
## ar1 ar2 ar3
## 1.0067 -0.4327 0.1248
## s.e. 0.0367 0.0501 0.0369
##
## sigma^2 = 8.525: log likelihood = -1819.54
## AIC=3647.08 AICc=3647.13 BIC=3665.46
fit_arima_fore <- fit_arima %>% forecast(364)
# fit TBATS model
fit_tbats <- ts_train %>% tbats()
fit_tbats_fore <- fit_tbats %>% forecast(h=364)
set.seed(0)
# fit neural network autoregression model
fit_nn <- nnetar(ts_train,p=1)
fit_nn_fore <- fit_nn %>% forecast(h=364)
# fit STL forecast
# stlf() forecasts based on an STL decomposition of the data
fit_stl_fore <- stlf(ts_train, h=364)
# create an ensemble model
combination <- (fit_arima_fore[['mean']]+fit_tbats_fore[['mean']]+fit_nn_fore[['mean']]+fit_stl_fore[['mean']])/4
# visualize results
autoplot(ts_train) +
autolayer(fit_arima_fore,series = 'ARIMA',PI=F)+
autolayer(fit_tbats_fore,series = 'TBATS',PI=F)+
autolayer(fit_nn_fore,series = 'NNAR',PI=F)+
autolayer(fit_stl_fore,series = 'STL',PI=F)+
autolayer(combination,series = 'Ensemble')+
theme(legend.position=c(0.001, 0.99999),
legend.justification=c('left', 'top'))+
labs(x='Time', y='Avg Temperature',title='Average Temperature Forecast')
# visualize actual vs forecast
p1 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
autolayer(fit_arima_fore,size=0.7,PI=F,series = 'SARIMA')+
scale_color_manual(values=mycolors[1])+
theme(legend.position=c(0.001, 0.99999),
legend.justification=c('left', 'top'))+
labs(x='Time', y='Avg Temperature',title='Auto SARIMA Forecast')
p2 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
autolayer(fit_tbats_fore,size=0.8,PI=F,series = 'TBATS')+
scale_color_manual(values=mycolors[1])+
theme(legend.position=c(0.001, 0.99999),
legend.justification=c('left', 'top'))+
labs(x='Time', y='Avg Temperature',title='TBATS Forecast')
p3 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
autolayer(fit_nn_fore,size=0.8,PI=F,series = 'Neural Network')+
scale_color_manual(values=mycolors[1])+
theme(legend.position=c(0.001, 0.99999),
legend.justification=c('left', 'top'))+
labs(x='Time', y='Avg Temperature',title='Neural Network Forecast')
p4 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
autolayer(fit_stl_fore,size=0.7,PI=F,series = 'STL')+
scale_color_manual(values=mycolors[1])+
theme(legend.position=c(0.001, 0.99999),
legend.justification=c('left', 'top'))+
labs(x='Time', y='Avg Temperature',title='STL Forecast')
p5 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
autolayer(combination,size=0.7,series = 'Ensemble')+
scale_color_manual(values=mycolors[1])+
theme(legend.position=c(0.001, 0.99999),
legend.justification=c('left', 'top'))+
labs(x='Time', y='Avg Temperature',title='Ensemble Forecast')
grid.arrange(p1,p2,p3,p4,p5,nrow=3)
# create dataframe for each metric
rmse <- c(SARIMA=accuracy(fit_arima_fore,ts_test)['Test set','RMSE'] ,
TBATS=accuracy(fit_tbats_fore,ts_test)['Test set','RMSE'] ,
NNAR=accuracy(fit_nn_fore,ts_test)['Test set','RMSE'],
STL=accuracy(fit_stl_fore,ts_test)['Test set','RMSE'],
Ensemble=accuracy(combination,ts_test)['Test set','RMSE'])
rmse_df <- rmse %>% data.frame()
colnames(rmse_df) <- 'RMSE'
rmse_df %>% knitr::kable()
| Model | RMSE |
|---|---|
| SARIMA | 4.483757 |
| TBATS | 4.424768 |
| NNAR | 3.882513 |
| STL | 3.579389 |
| Ensemble | 3.536691 |
Interestingly, the ensemble method produced the best forecast.
Although the data provider set the goal of forecasting one year of daily average temperature, this is a challenging task: it is difficult to capture seasonality in daily data (it becomes much easier with monthly or quarterly data).
Also note that ARIMA models are better suited to short-term forecasts than to long-term ones.
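To illustrate that point, the daily series could be aggregated to monthly means and modeled with a seasonal frequency of 12; a minimal sketch, not part of the analysis above, assuming the tidyverse, lubridate, and forecast packages are installed:
# aggregate daily average temperature to monthly means (illustrative only)
monthly <- df %>%
  mutate(month = lubridate::floor_date(date, 'month')) %>%
  group_by(month) %>%
  summarise(avgtemp = mean(avgtemp, na.rm = TRUE))
# with frequency 12, the seasonal pattern is far easier for ARIMA-type models to capture
ts_monthly <- ts(monthly$avgtemp, start = c(2019, 1), frequency = 12)
fit_monthly <- auto.arima(ts_monthly)
forecast(fit_monthly, h = 12) %>% autoplot()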