0. Introduction

1-a. Raw Data Exploration

# import the dataset
library(readr)
data <- read_csv('train.csv')

# change the variable names
colnames(data) <- c('date','max','min','range','precipitation','humidity','avgwindspeed','v1','v2','v3','avgtemp')
knitr::kable(head(data))
date max min range precipitation humidity avgwindspeed v1 v2 v3 avgtemp
1960-01-01 2.2 -5.2 7.4 NA 68.3 1.7 6.7 NA NA -1.6
1960-01-02 1.2 -5.6 6.8 0.4 87.7 1.3 0.0 NA NA -1.9
1960-01-03 8.7 -2.1 10.8 0.0 81.3 3.0 0.0 NA NA 4.0
1960-01-04 10.8 1.2 9.6 0.0 79.7 4.4 2.6 NA NA 7.5
1960-01-05 1.3 -8.2 9.5 NA 44.0 5.1 8.2 NA NA -4.6
1960-01-06 -1.2 -9.5 8.3 0.0 51.3 1.8 7.7 NA NA -5.2
Hmisc::describe(data)
## data 
## 
##  11  Variables      23011  Observations
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##      23011          0      23011          1 1991-07-02       7671 1963-02-24 
##        .10        .25        .50        .75        .90        .95 
## 1966-04-20 1975-10-01 1991-07-02 2007-04-01 2016-09-12 2019-11-06 
## 
## lowest : 1960-01-01 1960-01-02 1960-01-03 1960-01-04 1960-01-05
## highest: 2022-12-27 2022-12-28 2022-12-29 2022-12-30 2022-12-31
## --------------------------------------------------------------------------------
## max 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    23008        3      490        1    17.07    12.26   -0.965    1.800 
##      .25      .50      .75      .90      .95 
##    7.800   18.900   26.400   29.900   31.500 
## 
## lowest : -13.6 -12.4 -12.1 -11.8 -11.3, highest: 38    38.2  38.3  38.4  39.6 
## --------------------------------------------------------------------------------
## min 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    23008        3      473        1    8.452    12.17    -9.00    -6.13 
##      .25      .50      .75      .90      .95 
##    -0.30     9.20    17.90    22.00    23.70 
## 
## lowest : -20.2 -19.2 -18.6 -18.5 -18.4, highest: 28.7  28.8  29.2  30    30.3 
## --------------------------------------------------------------------------------
## range 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    23007        4      174        1    8.619    3.299      3.8      4.8 
##      .25      .50      .75      .90      .95 
##      6.6      8.6     10.6     12.4     13.4 
## 
## lowest : 1    1.1  1.2  1.3  1.4 , highest: 17.9 18.5 18.6 19.1 19.6
## --------------------------------------------------------------------------------
## precipitation 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     9150    13861      771    0.985    9.594    15.12      0.0      0.0 
##      .25      .50      .75      .90      .95 
##      0.1      1.4      8.5     27.3     47.5 
## 
## lowest : 0     0.1   0.2   0.3   0.4  , highest: 273.2 273.4 294.6 301.5 332.8
## --------------------------------------------------------------------------------
## humidity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    23011        0      648        1     65.2    16.61     40.6     45.8 
##      .25      .50      .75      .90      .95 
##     54.9     65.5     75.8     84.3     89.0 
## 
## lowest : 17.9 19   19.9 20.1 20.8, highest: 98.5 98.8 99   99.3 99.8
## --------------------------------------------------------------------------------
## avgwindspeed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    23007        4       76    0.999    2.381     1.02      1.2      1.4 
##      .25      .50      .75      .90      .95 
##      1.7      2.2      2.9      3.7      4.2 
## 
## lowest : 0.1 0.2 0.3 0.4 0.5, highest: 7.2 7.3 7.4 7.5 7.8
## --------------------------------------------------------------------------------
## v1 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    22893      118      138    0.998    5.859    4.365      0.0      0.0 
##      .25      .50      .75      .90      .95 
##      2.2      6.6      9.0     10.5     11.3 
## 
## lowest : 0    0.1  0.2  0.3  0.4 , highest: 13.3 13.4 13.5 13.6 13.7
## --------------------------------------------------------------------------------
## v2 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    18149     4862     2728        1    11.93    7.331     2.33     3.57 
##      .25      .50      .75      .90      .95 
##     7.00    11.22    16.62    21.03    23.26 
## 
## lowest : 0     0.01  0.02  0.05  0.06 , highest: 30.44 30.79 31.11 31.16 33.48
## --------------------------------------------------------------------------------
## v3 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    22645      366      934    0.998    48.65    36.04      0.0      0.0 
##      .25      .50      .75      .90      .95 
##     17.8     55.7     78.0     85.7     89.4 
## 
## lowest : 0    0.7  0.8  0.9  1   , highest: 95.5 95.7 95.8 95.9 96.9
## --------------------------------------------------------------------------------
## avgtemp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##    23011        0      473        1    12.42    12.03     -5.2     -2.2 
##      .25      .50      .75      .90      .95 
##      3.4     13.8     21.8     25.2     26.7 
## 
## lowest : -16.4 -15.5 -15.3 -15.2 -14.9, highest: 31.9  32.6  33.1  33.6  33.7 
## --------------------------------------------------------------------------------
# Creating a custom theme for visualization
library(tidyverse)

my_theme <- function(base_size =8, base_family = "sans"){
  theme_minimal(base_size = base_size, base_family = base_family) +
    theme(
      axis.text = element_text(size =8),
      axis.text.x = element_text(angle = 0, vjust = 0.5, hjust = 0.5),
      axis.title = element_text(size =8),
      panel.grid.major = element_line(color = "gray"),
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill = "white"),
      strip.background = element_rect(fill = "#2c024c", color = "#290029", size =0.5),
      strip.text = element_text(face = "bold", size = 8, color = "white"),
      legend.position = "bottom",
      legend.justification = "center",
      legend.background = element_blank(),
      panel.border = element_rect(color = "grey5", fill = NA, size = 0.5)
    )
}

theme_set(my_theme())

mycolors=c("#f32440","#2185ef","#d421ef")
# Creating function to see the missing values
bar_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%
    melt %>%
    ggplot(data = .,
           aes(x = Var2)) +
    geom_bar(aes(y=after_stat(count),fill=value),alpha=0.7) +
    scale_fill_manual(values=c(mycolors[2],mycolors[1]),name = "", labels = c("Available","Missing"))+
    theme_minimal()+
    theme(axis.text.x = element_text(angle=45, vjust=0.5)) +
    labs(x = "Variables in Dataset",
         y = "Observations",
         title='Missing Variables')+coord_flip()
}


matrix_missing <- function(x){
  library(dplyr)
  library(reshape2)
  library(ggplot2)
  x %>%
    is.na %>%
    melt %>%
    ggplot(data = .,
           aes(x = Var1,
               y = Var2)) +
    geom_tile(aes(fill = value),alpha=0.6) +
    scale_fill_manual(values=c(mycolors[2],mycolors[1]),name = "",
                      labels = c("Available","Missing")) +
    theme_minimal()+
    theme(axis.text.x = element_text(angle=45, vjust=0.5)) +
    labs(x = "Variables in Dataset",
         y = "Total observations")+coord_flip()
}
library(gridExtra)
m1 <- data %>% bar_missing()
m2 <- data %>% matrix_missing()

grid.arrange(m1,m2,nrow=2)

library(tidyr)


data %>% 
  pivot_longer(cols=-date,
                      names_to='variable',
                      values_to='value') %>% # changing the data to long format for visualization purpose
  ggplot(aes(x=date,y=value))+
  geom_line(color=mycolors[2],alpha=0.8)+
  facet_wrap(~variable,scales = 'free_y',nrow=4)+
  labs(title='Time-Series Trend for Variables',xlab='Date',ylab='Value')

# box plot
data %>% 
  pivot_longer(cols=-date,
                      names_to='variable',
                      values_to='value') %>% 
  ggplot(aes(x=variable,y=value))+
  geom_boxplot(alpha=0.6,fill=mycolors[2])+
  facet_wrap(~variable,scales = 'free',nrow=4)+
  coord_flip()+
  labs(title='Boxplot for Variables')

# distribution plot
data %>% 
  pivot_longer(cols=-date,
                      names_to='variable',
                      values_to='value') %>% 
  ggplot(aes(x=value,y=..count..))+
  geom_density(alpha=0.6,fill=mycolors[2])+
  facet_wrap(~variable,scales = 'free',nrow=4)+
  labs(title='Distribution for Variables')

1-b Raw Data Trend Decomposition

library(stats)
library(forecast)

# Convert into ts format to conduct trend decomposition
ts_avgtemp <- ts(data$avgtemp,start=1960,frequency = 365.25)
ts_avgtemp %>% mstl() %>%
  autoplot(color=mycolors[2])+
  labs(title='Trend Decomposition')

2. Revised Data Exploration

# extract from 2000 
df <- data[data$date>'2018-12-31',]
Hmisc::describe(df)
## df 
## 
##  11  Variables      1461  Observations
## --------------------------------------------------------------------------------
## date 
##          n    missing   distinct       Info       Mean        Gmd        .05 
##       1461          0       1461          1 2020-12-31      487.3 2019-03-15 
##        .10        .25        .50        .75        .90        .95 
## 2019-05-27 2020-01-01 2020-12-31 2021-12-31 2022-08-07 2022-10-19 
## 
## lowest : 2019-01-01 2019-01-02 2019-01-03 2019-01-04 2019-01-05
## highest: 2022-12-27 2022-12-28 2022-12-29 2022-12-30 2022-12-31
## --------------------------------------------------------------------------------
## max 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1461        0      385        1    18.26    11.79      0.3      3.2 
##      .25      .50      .75      .90      .95 
##      9.7     19.5     27.3     30.3     32.1 
## 
## lowest : -10.7 -8.6  -8.4  -7.5  -7.3 , highest: 35.9  36    36.1  36.5  36.8 
## --------------------------------------------------------------------------------
## min 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        1      394        1    9.385    12.23    -8.40    -5.51 
##      .25      .50      .75      .90      .95 
##     0.80     9.45    19.00    23.11    24.70 
## 
## lowest : -18.6 -16.6 -16.5 -15.5 -14.4, highest: 27.2  27.3  27.4  27.8  27.9 
## --------------------------------------------------------------------------------
## range 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        1      151        1    8.867    3.364      3.7      4.8 
##      .25      .50      .75      .90      .95 
##      6.8      9.0     10.9     12.7     13.6 
## 
## lowest : 1.5  1.7  1.8  1.9  2   , highest: 16.6 17   17.2 17.5 17.9
## --------------------------------------------------------------------------------
## precipitation 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      597      864      195    0.976     9.22    14.72     0.00     0.00 
##      .25      .50      .75      .90      .95 
##     0.00     1.20     8.00    27.02    50.46 
## 
## lowest : 0     0.1   0.2   0.3   0.4  , highest: 114.5 120   123.1 129.6 176.2
## --------------------------------------------------------------------------------
## humidity 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1461        0      467        1    62.53    17.09     37.8     42.9 
##      .25      .50      .75      .90      .95 
##     51.6     63.0     73.0     82.1     87.4 
## 
## lowest : 17.9 24.3 25.5 25.6 26.3, highest: 96.8 97.6 98.1 98.5 99.3
## --------------------------------------------------------------------------------
## avgwindspeed 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1461        0       42    0.997    2.249   0.7343      1.4      1.5 
##      .25      .50      .75      .90      .95 
##      1.8      2.1      2.6      3.2      3.5 
## 
## lowest : 0.6 0.9 1   1.1 1.2, highest: 4.6 4.8 5   5.8 6  
## --------------------------------------------------------------------------------
## v1 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1458        3      137    0.999    6.393    4.551      0.0      0.1 
##      .25      .50      .75      .90      .95 
##      2.6      7.5      9.5     11.1     12.1 
## 
## lowest : 0    0.1  0.2  0.3  0.4 , highest: 13.2 13.3 13.4 13.5 13.6
## --------------------------------------------------------------------------------
## v2 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1457        4     1098        1    14.15     8.12    3.196    5.026 
##      .25      .50      .75      .90      .95 
##    8.910   13.150   19.410   24.718   26.616 
## 
## lowest : 0.54  0.75  0.84  0.96  1.06 , highest: 30.01 30.06 30.35 30.44 30.79
## --------------------------------------------------------------------------------
## v3 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1461        0      644    0.999    53.51    37.83      0.0      0.7 
##      .25      .50      .75      .90      .95 
##     21.3     62.2     85.2     91.9     93.0 
## 
## lowest : 0    0.7  0.8  0.9  1   , highest: 94.8 94.9 95.1 95.3 95.9
## --------------------------------------------------------------------------------
## avgtemp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1461        0      386        1    13.48    11.84     -4.1     -1.2 
##      .25      .50      .75      .90      .95 
##      5.3     14.0     22.8     26.1     27.5 
## 
## lowest : -14.9 -14.5 -12.2 -12.1 -11.8, highest: 31.1  31.2  31.5  31.6  31.7 
## --------------------------------------------------------------------------------
nm1 <- df %>% bar_missing()
nm2 <- df %>% matrix_missing()

grid.arrange(nm1,nm2,nrow=2)

# fill 0 for precipitation NA values
df$precipitation <- ifelse(is.na(df$precipitation),0,df$precipitation)

# fill previous value for rest of data with missing values
library(zoo)
df <- na.locf(df)
df %>% 
  pivot_longer(cols=-date,
                      names_to='variable',
                      values_to='value') %>%
  ggplot(aes(x=date,y=value))+
  geom_line(color=mycolors[1],alpha=0.8)+
  facet_wrap(~variable,scales = 'free_y',nrow=4)+
  labs(title='Time-Series Trend for Variables',xlab='Date',ylab='Value')

df %>% 
  pivot_longer(cols=-date,
                      names_to='variable',
                      values_to='value') %>% 
  ggplot(aes(x=variable,y=value))+
  geom_boxplot(alpha=0.6,fill=mycolors[1])+
  facet_wrap(~variable,scales = 'free',nrow=4)+
  coord_flip()+
  labs(title='Boxplot for Variables')

df %>% 
  pivot_longer(cols=-date,
                      names_to='variable',
                      values_to='value') %>% 
  ggplot(aes(x=value,y=..count..))+
  geom_density(alpha=0.6,fill=mycolors[1])+
  facet_wrap(~variable,scales = 'free',nrow=4)+
  labs(title='Distribution for Variables')

ts_avgtemp_new <- ts(df$avgtemp,start=2019,frequency = 365)
ts_avgtemp_new %>% mstl() %>%
  autoplot(color=mycolors[1])+
  labs(title='Trend Decomposition')

3. Modeling

# split the data sets into train and test
train <- df[df$date<'2022-01-01',]
train <- train$avgtemp
test <- df[df$date>='2022-01-01',]
test <- test$avgtemp
# convert the data into ts format
ts_train <- ts(train,frequency = 365)
ts_test <- ts(test,start=c(4,2),frequency = 365)

Stationarity

  • For time-series data, it is always important to check the stationarity before building a model. If the data is not stationary, you can’t predict the future values since either the mean or variance (covariance as well) is diverging.

  • Let’s explore the stationarity using the ACF and PACF plot. ACF plot is used to determine the order of q for ARIMA model while PACF is used to determine the order of p.

acf <- ggAcf(ts_train,color=mycolors[1],alpha=0.7)+labs(title='ACF')
pacf <- ggPacf(ts_train,color=mycolors[1],alpha=0.7)+labs(title='PACF')

grid.arrange(acf,pacf)

  • The ACF and PACF plot both tells that the data is not stationary. You see that the ACF plot Let’s conduct adf test to see the result as well.
library(tseries)
adf.test(ts_train)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_train
## Dickey-Fuller = -1.4244, Lag order = 10, p-value = 0.822
## alternative hypothesis: stationary
  • The p-value of the test is 0.8 which suggest that the data is not stationary. Thus, we will difference the data. We will be predicting the change in temperature to predict the average temperature.
# difference the value 
lag_ts_train <- diff(ts_train)
# plot acf and pacf for lagged data
lag_acf <- ggAcf(lag_ts_train,color=mycolors[1],alpha=0.7)+labs(title='ACF')
lag_pacf <- ggPacf(lag_ts_train,color=mycolors[1],alpha=0.7)+labs(title='PACF')

grid.arrange(lag_acf,lag_pacf)

  • By lagging the value, it seems that the there was a s significant change in ACF plot. We can kind of assume that stationarity was achieved. Since we see the significant spikes infinitely, I will be assigning p=1 and q=1 when building custom ARIMA model

  • Let’s conduct adf test

adf.test(lag_ts_train)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  lag_ts_train
## Dickey-Fuller = -13.35, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
  • The adf test results says that the lagged value is stationary.

Buidling Models

# fit auto ARIMA model
fit_arima <- ts_train %>% auto.arima()
fit_arima
## Series: . 
## ARIMA(3,0,0)(0,1,0)[365] 
## 
## Coefficients:
##          ar1      ar2     ar3
##       1.0067  -0.4327  0.1248
## s.e.  0.0367   0.0501  0.0369
## 
## sigma^2 = 8.525:  log likelihood = -1819.54
## AIC=3647.08   AICc=3647.13   BIC=3665.46
fit_arima_fore <- fit_arima %>% forecast(364)
  • Above is the order of what Auto ARIMA function delivered.
# fit TBATS model
fit_tbats <- ts_train %>% tbats()
fit_tbats_fore <- ts_train %>% tbats() %>% forecast(364)
set.seed(0)
# fit neural network autoregression model
fit_nn <- nnetar(ts_train,p=1)
fit_nn_fore <- fit_nn %>% forecast(h=364)
# fit stl forecast
# stlf is forecasting based on decomposition of the data
fit_stl <- stlf(ts_train)
fit_stl_fore <- fit_stl %>% forecast(h=364)
# create an ensemble model
combination <- (fit_arima_fore[['mean']]+fit_tbats_fore[['mean']]+fit_nn_fore[['mean']]+fit_stl_fore[['mean']])/4
# visualize results

autoplot(ts_train) +
  autolayer(fit_arima_fore,series = 'ARIMA',PI=F)+
  autolayer(fit_tbats_fore,series = 'TBATS',PI=F)+
  autolayer(fit_nn_fore,series = 'NNAR',PI=F)+
  autolayer(fit_stl_fore,series = 'STL',PI=F)+
  autolayer(combination,series = 'Ensemble')+
  theme(legend.position=c(0.001, 0.99999),
        legend.justification=c('left', 'top'))+
   labs(x='Time', y='Avg Temperature',title='Average Temperature Forecast')

# visualize actual vs forecast

p1 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
  autolayer(fit_arima_fore,size=0.7,PI=F,series = 'SARIMA')+
  scale_color_manual(values=mycolors[1])+
  theme(legend.position=c(0.001, 0.99999),
        legend.justification=c('left', 'top'))+
   labs(x='Time', y='Avg Temperature',title='Auto SARIMA Forecast')

p2 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
  autolayer(fit_tbats_fore,size=0.8,PI=F,series = 'TBATS')+
  scale_color_manual(values=mycolors[1])+
  theme(legend.position=c(0.001, 0.99999),
        legend.justification=c('left', 'top'))+
   labs(x='Time', y='Avg Temperature',title='TBATS Forecast')

p3 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
  autolayer(fit_nn_fore,size=0.8,PI=F,series = 'Neural Network')+
  scale_color_manual(values=mycolors[1])+
  theme(legend.position=c(0.001, 0.99999),
        legend.justification=c('left', 'top'))+
   labs(x='Time', y='Avg Temperature',title='Neural Network Forecast')

p4 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
  autolayer(fit_stl_fore,size=0.7,PI=F,series = 'STL')+
  scale_color_manual(values=mycolors[1])+
  theme(legend.position=c(0.001, 0.99999),
        legend.justification=c('left', 'top'))+
   labs(x='Time', y='Avg Temperature',title='STL Forecast')

p5 <- autoplot(ts_test,size=0.7,color=mycolors[2],alpha=1)+
  autolayer(combination,size=0.7,series = 'Ensemble')+
  scale_color_manual(values=mycolors[1])+
  theme(legend.position=c(0.001, 0.99999),
        legend.justification=c('left', 'top'))+
   labs(x='Time', y='Avg Temperature',title='Ensemble Forecast')

grid.arrange(p1,p2,p3,p4,p5,nrow=3)

4. Result Summary

# create dataframe for each metric

rmse <- c(SARIMA=accuracy(fit_arima_fore,ts_test)['Test set','RMSE'] ,
TBATS=accuracy(fit_tbats_fore,ts_test)['Test set','RMSE'] ,
NNAR=accuracy(fit_nn_fore,ts_test)['Test set','RMSE'],
STL=accuracy(fit_stl_fore,ts_test)['Test set','RMSE'],
Ensemble=accuracy(combination,ts_test)['Test set','RMSE'])

rmse_df <- rmse %>% data.frame() 
colnames(rmse_df) <- 'RMSE'
rmse_df %>% knitr::kable()
RMSE
SARIMA 4.483757
TBATS 4.424768
NNAR 3.882513
STL 3.579389
Ensemble 3.536691