Forecasting wind speed in England by using ARIMA model

That algorithm contains the ARIMA models for forecasting wind Speed in England

Makarovskikh Tatyana Anatolyevna “Макаровских Татьяна Анатольевна”

Abotaleb mostafa “Аботалеб Мостафа”

Department of Electrical Engineering and Computer Science

South ural state university, Chelyabinsk, Russian federation

Dr Pradeep Mishra

Department of Mathematics & Statistics

Jawaharlal Nehru Krishi Vishwavidyalaya, India

# Imports
library(fpp2)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3
## 
library(forecast)
library(ggplot2)
library("readxl")
library(moments)
library(forecast)
library(here)
## here() starts at F:/Phd/Wind Speed
require(forecast)  
require(tseries)
## Loading required package: tseries
require(markovchain)
## Loading required package: markovchain
## Package:  markovchain
## Version:  0.8.5-3
## Date:     2020-12-03
## BugReport: https://github.com/spedygiorgio/markovchain/issues
require(data.table)
## Loading required package: data.table
#Global variable
Full_original_data <- read.csv("F:/Phd/Wind Speed/data/WindSpeed.csv")
y_lab<- "Wind Speed in England"   # input name of data
Actual_date_interval <- c("1994/07/07","2015/12/31")
Forecast_date_interval <- c("2016/01/01","2016/01/31")
validation_data_days <-1551 #(20% of dataset for testing )
frequency<-"day"
# Data Preparation & calculate some of statistics measures
original_data<-as.numeric(Full_original_data$Wind_speed)

summary(original_data)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.479   2.575   3.101   4.179  16.421
sd(original_data)  # calculate standard deviation
## [1] 2.167933
skewness(original_data)  # calculate Cofficient of skewness
## [1] 1.30852
kurtosis(original_data)   # calculate Cofficient of kurtosis
## [1] 5.058105
rows <- NROW(original_data)
training_data<-original_data[1:(rows-validation_data_days)]
testing_data<-original_data[(rows-validation_data_days+1):rows]
AD<-fulldate<-seq(as.Date(Actual_date_interval[1]),as.Date(Actual_date_interval[2]), frequency)  #input range for actual date
FD<-seq(as.Date(Forecast_date_interval[1]),as.Date(Forecast_date_interval[2]), frequency)  #input range forecasting date
N_forecasting_days<-nrow(data.frame(FD)) 
validation_dates<-tail(AD,validation_data_days)
validation_data_by_name<-weekdays(validation_dates)
forecasting_data_by_name<-weekdays(FD)
data_series<-ts(training_data)
#Auto arima model
##################
require(tseries) # need to install tseries tj test Stationarity in time series 
paste ("tests For Check Stationarity in series  ==> ",y_lab, sep=" ")
## [1] "tests For Check Stationarity in series  ==>  Wind Speed in England"
kpss.test(data_series) # applay kpss test
## Warning in kpss.test(data_series): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  data_series
## KPSS Level = 22.084, Truncation lag parameter = 11, p-value = 0.01
pp.test(data_series)   # applay pp test
## Warning in pp.test(data_series): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_series
## Dickey-Fuller Z(alpha) = -3370.5, Truncation lag parameter = 11,
## p-value = 0.01
## alternative hypothesis: stationary
adf.test(data_series)  # applay adf test
## Warning in adf.test(data_series): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_series
## Dickey-Fuller = -9.8023, Lag order = 18, p-value = 0.01
## alternative hypothesis: stationary
ndiffs(data_series)    # Doing first diffrencing on data
## [1] 1
#Taking the first difference
diff1_x1<-diff(data_series)
autoplot(diff1_x1, xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="black", col.sub="black",  ylab=y_lab,main = "1nd differenced series")
## Warning: Ignoring unknown parameters: col.main, col.lab, col.sub

##Testing the stationary of the first differenced series
paste ("tests For Check Stationarity in series after taking first differences in  ==> ",y_lab, sep=" ")
## [1] "tests For Check Stationarity in series after taking first differences in  ==>  Wind Speed in England"
kpss.test(diff1_x1)   # applay kpss test after taking first differences
## Warning in kpss.test(diff1_x1): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff1_x1
## KPSS Level = 0.0016108, Truncation lag parameter = 11, p-value = 0.1
pp.test(diff1_x1)     # applay pp test after taking first differences
## Warning in pp.test(diff1_x1): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff1_x1
## Dickey-Fuller Z(alpha) = -5409, Truncation lag parameter = 11, p-value
## = 0.01
## alternative hypothesis: stationary
adf.test(diff1_x1)    # applay adf test after taking first differences
## Warning in adf.test(diff1_x1): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff1_x1
## Dickey-Fuller = -29.839, Lag order = 18, p-value = 0.01
## alternative hypothesis: stationary
#Taking the second difference
diff2_x1=diff(diff1_x1)
autoplot(diff2_x1, xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="black", col.sub="black", ylab=y_lab ,main = "2nd differenced series")
## Warning: Ignoring unknown parameters: col.main, col.lab, col.sub

##Testing the stationary of the first differenced series
paste ("tests For Check Stationarity in series after taking Second differences in",y_lab, sep=" ")
## [1] "tests For Check Stationarity in series after taking Second differences in Wind Speed in England"
kpss.test(diff2_x1)   # applay kpss test after taking Second differences
## Warning in kpss.test(diff2_x1): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  diff2_x1
## KPSS Level = 0.00094204, Truncation lag parameter = 11, p-value = 0.1
pp.test(diff2_x1)     # applay pp test after taking Second differences
## Warning in pp.test(diff2_x1): p-value smaller than printed p-value
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff2_x1
## Dickey-Fuller Z(alpha) = -7424.2, Truncation lag parameter = 11,
## p-value = 0.01
## alternative hypothesis: stationary
adf.test(diff2_x1)    # applay adf test after taking Second differences
## Warning in adf.test(diff2_x1): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff2_x1
## Dickey-Fuller = -39.362, Lag order = 18, p-value = 0.01
## alternative hypothesis: stationary
####Fitting an ARIMA Model
#1. Using auto arima function
model1 <- auto.arima(data_series,stepwise=FALSE, approximation=FALSE, trace=T, test = c("kpss", "adf", "pp"))  #applaying auto arima
## 
##  ARIMA(0,1,0)                    : 24176.48
##  ARIMA(0,1,0) with drift         : 24178.49
##  ARIMA(0,1,1)                    : 23271.97
##  ARIMA(0,1,1) with drift         : 23273.97
##  ARIMA(0,1,2)                    : 22466.57
##  ARIMA(0,1,2) with drift         : 22468.51
##  ARIMA(0,1,3)                    : 22372.72
##  ARIMA(0,1,3) with drift         : 22374.6
##  ARIMA(0,1,4)                    : 22356.46
##  ARIMA(0,1,4) with drift         : 22358.31
##  ARIMA(0,1,5)                    : 22352.47
##  ARIMA(0,1,5) with drift         : 22354.31
##  ARIMA(1,1,0)                    : 23798.76
##  ARIMA(1,1,0) with drift         : 23800.76
##  ARIMA(1,1,1)                    : 22357.8
##  ARIMA(1,1,1) with drift         : 22359.61
##  ARIMA(1,1,2)                    : 22347.88
##  ARIMA(1,1,2) with drift         : 22349.71
##  ARIMA(1,1,3)                    : 22342.96
##  ARIMA(1,1,3) with drift         : 22344.77
##  ARIMA(1,1,4)                    : 22343.14
##  ARIMA(1,1,4) with drift         : 22344.92
##  ARIMA(2,1,0)                    : 23377.9
##  ARIMA(2,1,0) with drift         : 23379.9
##  ARIMA(2,1,1)                    : 22349.66
##  ARIMA(2,1,1) with drift         : 22351.49
##  ARIMA(2,1,2)                    : 22337.75
##  ARIMA(2,1,2) with drift         : 22348.56
##  ARIMA(2,1,3)                    : Inf
##  ARIMA(2,1,3) with drift         : 22339.1
##  ARIMA(3,1,0)                    : 23158.48
##  ARIMA(3,1,0) with drift         : 23160.48
##  ARIMA(3,1,1)                    : 22343.21
##  ARIMA(3,1,1) with drift         : 22345.02
##  ARIMA(3,1,2)                    : Inf
##  ARIMA(3,1,2) with drift         : Inf
##  ARIMA(4,1,0)                    : 23020.23
##  ARIMA(4,1,0) with drift         : 23022.23
##  ARIMA(4,1,1)                    : 22343.34
##  ARIMA(4,1,1) with drift         : 22345.14
##  ARIMA(5,1,0)                    : 22949.39
##  ARIMA(5,1,0) with drift         : 22951.4
## 
## 
## 
##  Best model: ARIMA(2,1,2)
model1 # show the result of autoarima 
## Series: data_series 
## ARIMA(2,1,2) 
## 
## Coefficients:
##           ar1     ar2      ma1      ma2
##       -0.4591  0.3958  -0.0475  -0.9042
## s.e.   0.0281  0.0193   0.0238   0.0235
## 
## sigma^2 estimated as 2.141:  log likelihood=-11163.87
## AIC=22337.74   AICc=22337.75   BIC=22371.4
#Make changes in the source of auto arima to run the best model
arima.string <- function (object, padding = FALSE) 
{
  order <- object$arma[c(1, 6, 2, 3, 7, 4, 5)]
  m <- order[7]
  result <- paste("ARIMA(", order[1], ",", order[2], ",", 
                  order[3], ")", sep = "")
  if (m > 1 && sum(order[4:6]) > 0) {
    result <- paste(result, "(", order[4], ",", order[5], 
                    ",", order[6], ")[", m, "]", sep = "")
  }
  if (padding && m > 1 && sum(order[4:6]) == 0) {
    result <- paste(result, "         ", sep = "")
    if (m <= 9) {
      result <- paste(result, " ", sep = "")
    }
    else if (m <= 99) {
      result <- paste(result, "  ", sep = "")
    }
    else {
      result <- paste(result, "   ", sep = "")
    }
  }
  if (!is.null(object$xreg)) {
    if (NCOL(object$xreg) == 1 && is.element("drift", names(object$coef))) {
      result <- paste(result, "with drift        ")
    }
    else {
      result <- paste("Regression with", result, "errors")
    }
  }
  else {
    if (is.element("constant", names(object$coef)) || is.element("intercept", 
                                                                 names(object$coef))) {
      result <- paste(result, "with non-zero mean")
    }
    else if (order[2] == 0 && order[5] == 0) {
      result <- paste(result, "with zero mean    ")
    }
    else {
      result <- paste(result, "                  ")
    }
  }
  if (!padding) {
    result <- gsub("[ ]*$", "", result)
  }
  return(result)
}


bestmodel <- arima.string(model1, padding = TRUE)
bestmodel <- substring(bestmodel,7,11)
bestmodel <- gsub(" ", "", bestmodel)
bestmodel <- gsub(")", "", bestmodel)
bestmodel <- strsplit(bestmodel, ",")[[1]]
bestmodel <- c(strtoi(bestmodel[1]),strtoi(bestmodel[2]),strtoi(bestmodel[3]))
bestmodel
## [1] 2 1 2
strtoi(bestmodel[3])
## [1] 2
#2. Using ACF and PACF Function
#par(mfrow=c(1,2))  # Code for making two plot in one graph 
acf(diff2_x1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="black", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab, main=paste("ACF-2nd differenced series ",y_lab, sep=" ",lag.max=20))    # plot ACF "auto correlation function after taking second diffrences

pacf(diff2_x1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="black", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab,main=paste("PACF-2nd differenced series ",y_lab, sep=" ",lag.max=20))   # plot PACF " Partial auto correlation function after taking second diffrences

library(forecast)   # install library forecast             
x1_model1= arima(data_series, order=c(bestmodel)) # Run Best model of auto arima  for forecasting
x1_model1  # Show result of best model of auto arima 
## 
## Call:
## arima(x = data_series, order = c(bestmodel))
## 
## Coefficients:
##           ar1     ar2      ma1      ma2
##       -0.4591  0.3958  -0.0475  -0.9042
## s.e.   0.0281  0.0193   0.0238   0.0235
## 
## sigma^2 estimated as 2.14:  log likelihood = -11163.87,  aic = 22337.74
paste ("accuracy of autoarima Model For  ==> ",y_lab, sep=" ")
## [1] "accuracy of autoarima Model For  ==>  Wind Speed in England"
accuracy(x1_model1)  # aacuracy of best model from auto arima
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set 0.00872683 1.462684 1.071619 -Inf  Inf 0.8956295 0.00681155
x1_model1$x          # show result of best model from auto arima 
## NULL
checkresiduals(x1_model1,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="black", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)  # checkresiduals from best model from using auto arima 

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)
## Q* = 28.495, df = 6, p-value = 7.578e-05
## 
## Model df: 4.   Total lags used: 10
paste("Box-Ljung test , Ljung-Box test For Modelling for   ==> ",y_lab, sep=" ")
## [1] "Box-Ljung test , Ljung-Box test For Modelling for   ==>  Wind Speed in England"
Box.test(x1_model1$residuals^2, lag=20, type="Ljung-Box")   # Do test for resdulas by using Box-Ljung test , Ljung-Box test For Modelling
## 
##  Box-Ljung test
## 
## data:  x1_model1$residuals^2
## X-squared = 1219.2, df = 20, p-value < 2.2e-16
library(tseries)
jarque.bera.test(x1_model1$residuals)  # Do test jarque.bera.test 
## 
##  Jarque Bera Test
## 
## data:  x1_model1$residuals
## X-squared = 3147.8, df = 2, p-value < 2.2e-16
#Actual Vs Fitted
plot(data_series, col='red',lwd=2, main="Actual vs Fitted Plot", xlab='Time in (days)', ylab=y_lab) # plot actual and Fitted model 
lines(fitted(x1_model1), col='black')

#Test data
x1_test <- ts(testing_data, start =(rows-validation_data_days+1) ) # make testing data in time series and start from rows-6
forecasting_auto_arima <- forecast(x1_model1, h=N_forecasting_days+validation_data_days)
validation_forecast<-head(forecasting_auto_arima$mean,validation_data_days)
MAPE_Per_Day<-round(abs(((testing_data-validation_forecast)/testing_data)*100)  ,3)
paste ("MAPE % For ",validation_data_days,frequency,"by using bats Model for  ==> ",y_lab, sep=" ")
## [1] "MAPE % For  1551 day by using bats Model for  ==>  Wind Speed in England"
MAPE_Mean_All.ARIMA_Model<-round(mean(MAPE_Per_Day),3)
MAPE_Mean_All.ARIMA<-paste(round(mean(MAPE_Per_Day),3),"% MAPE ",validation_data_days,frequency,y_lab,sep=" ")
MAPE_auto_arima<-paste(round(MAPE_Per_Day,3),"%")
MAPE_auto.arima_Model<-paste(MAPE_Per_Day ,"%")
paste (" MAPE that's Error of Forecasting for ",validation_data_days," days in bats Model for  ==> ",y_lab, sep=" ")
## [1] " MAPE that's Error of Forecasting for  1551  days in bats Model for  ==>  Wind Speed in England"
paste(MAPE_Mean_All.ARIMA,"%")
## [1] "Inf % MAPE  1551 day Wind Speed in England %"
paste ("MAPE that's Error of Forecasting day by day for ",validation_data_days," days in bats Model for  ==> ",y_lab, sep=" ")
## [1] "MAPE that's Error of Forecasting day by day for  1551  days in bats Model for  ==>  Wind Speed in England"
#data.frame(date_auto.arima=validation_dates,validation_data_by_name,actual_data=testing_data,forecasting_auto.arima=validation_forecast,MAPE_auto.arima_Model)
#data.frame(FD,forecating_date=forecasting_data_by_name,forecasting_by_auto.arima=tail(forecasting_auto_arima$mean,N_forecasting_days))
plot(forecasting_auto_arima)
x1_test <- ts(testing_data, start =(rows-validation_data_days+1) )
lines(x1_test, col='red',lwd=2)

graph4<-autoplot(forecasting_auto_arima,xlab = paste ("Time in  ", frequency ,y_lab , sep=" "),  col.main="black", col.lab="black", col.sub="black", cex.main=1, cex.lab=1, cex.sub=1,font.main=4, font.lab=4, ylab=y_lab)
graph4

MAPE_Mean_All.ARIMA
## [1] "Inf % MAPE  1551 day Wind Speed in England"
## Error of forecasting
Error_auto.arima<-abs(testing_data-validation_forecast)  # Absolute error of forecast (AEOF)
REOF_A_auto.arima<-abs(((testing_data-validation_forecast)/testing_data)*100)  #Relative error of forecast (divided by actual)(REOF_A)
REOF_F_auto.arima<-abs(((testing_data-validation_forecast)/validation_forecast)*100)  #Relative error of forecast (divided by forecast)(REOF_F)
correlation_auto.arima<-cor(testing_data,validation_forecast, method = c("pearson"))     # correlation coefficient between predicted and actual values 
RMSE_auto.arima<-sqrt(sum((Error_auto.arima^2))/validation_data_days)   #  Root mean square forecast error
MSE_auto.arima<-(sum((Error_auto.arima^2))/validation_data_days)   #  Root mean square forecast error
MAD_auto.arima<-abs((sum(testing_data-validation_forecast))/validation_data_days)   # average forecast accuracy
AEOF_auto.arima<-c(Error_auto.arima)
REOF_auto.arima1<-c(paste(round(REOF_A_auto.arima,3),"%"))
REOF_auto.arima2<-c(paste(round(REOF_F_auto.arima,3),"%"))
#data.frame(correlation_auto.arima,MSE_auto.arima,RMSE_auto.arima,MAPE_Mean_All.ARIMA_Model,MAD_auto.arima) # analysis of Error  by using Auto ARIMAA model shows result of correlation ,MSE ,MPER
#data.frame(validation_dates,Validation_day_name=validation_data_by_name,AEOF_auto.arima,REOF_A_auto.arima=REOF_auto.arima1,REOF_F_auto.arima=REOF_auto.arima2)   # Analysis of error shows result AEOF,REOF_A,REOF_F
## Error of forecasting
Error_auto.arima<-abs(testing_data-validation_forecast)  # Absolute error of forecast (AEOF)
sqrt(sum(Error_auto.arima^2/validation_data_days))
## [1] 2.302737
REOF_A_auto.arima<-abs(((testing_data-validation_forecast)/testing_data)*100)  #Relative error of forecast (divided by actual)(REOF_A)
REOF_F_auto.arima<-abs(((testing_data-validation_forecast)/validation_forecast)*100)  #Relative error of forecast (divided by forecast)(REOF_F)
correlation_auto.arima<-cor(testing_data,validation_forecast, method = c("pearson"))     # correlation coefficient between predicted and actual values 
RMSE_auto.arima<-sqrt(sum((Error_auto.arima^2))/validation_data_days)   #  Root mean square forecast error
MAD_auto.arima<-abs((sum(testing_data-validation_forecast))/validation_data_days)   # average forecast accuracy
AEOF_auto.arima<-c(Error_auto.arima)
REOF_auto.arima1<-c(paste(round(REOF_A_auto.arima,3),"%"))
REOF_auto.arima2<-c(paste(round(REOF_F_auto.arima,3),"%"))
#data.frame(correlation_auto.arima,RMSE_auto.arima,MAPE_Mean_All,MAD_auto.arima) # analysis of Error  by using Holt's linear model shows result of correlation ,MSE ,MPER
#data.frame(validation_dates,Validation_day_name=validation_data_by_name,AEOF_auto.arima,REOF_A_auto.arima=REOF_auto.arima1,REOF_F_auto.arima=REOF_auto.arima2)   # Analysis of error shows result AEOF,REOF_A,REOF_F