#Create an empty data frame from Jan 1 1979 to Jun 7 2017 in format YYYYMMDD HH MM SS
time<-data.frame(seq(ISOdate(1979,1,1), ISOdate(2017,6,7), by='day'))
#Rename the variable from x to time
colnames(time)[1]<-'time'
#Create a new variable name "date"only with the first 10 characters of the variable time
time['date']<-substr(time$time,1,10)
#Rename variable "TIME" to "DS"
colnames(time)[2]<-'ds'
str(time)
## 'data.frame': 14038 obs. of 2 variables:
## $ time: POSIXct, format: "1979-01-01 12:00:00" "1979-01-02 12:00:00" ...
## $ ds : chr "1979-01-01" "1979-01-02" "1979-01-03" "1979-01-04" ...
library(readr)
seaice_north<- read_csv('seaice_north_28.csv')
## Parsed with column specification:
## cols(
## Year = col_integer(),
## Month = col_integer(),
## Day = col_integer(),
## dsold = col_character(),
## Extent = col_double(),
## ds_fail = col_character(),
## month2 = col_character(),
## day2 = col_character(),
## ds = col_date(format = "")
## )
#Eliminate information from 1978
seaice_north<-seaice_north[-c(1:34),]
#Eliminate extra columns
seaice_north<-seaice_north[,c(5,9)]
#Convert variable "ds" from date to character
seaice_north$ds<-as.character(seaice_north$ds)
str(seaice_north)
## Classes 'tbl_df', 'tbl' and 'data.frame': 12420 obs. of 2 variables:
## $ Extent: num 15 14.9 14.9 15 15.2 ...
## $ ds : chr "1979-01-02" "1979-01-04" "1979-01-06" "1979-01-08" ...
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#The "new" data set will be our main and on the right we'll bring the information from the original data set that has data from every other day, we will expect rows with n/a values.
daily.seaice.north<-time %>% left_join(seaice_north, by = "ds")
#Eliminate the 1st column and rename from extent to y
daily.seaice.north<-daily.seaice.north[,-c(1)]
colnames(daily.seaice.north)[2]<-'y'
str(daily.seaice.north)
## 'data.frame': 14038 obs. of 2 variables:
## $ ds: chr "1979-01-01" "1979-01-02" "1979-01-03" "1979-01-04" ...
## $ y : num NA 15 NA 14.9 NA ...
library(imputeTS)
plotNA.distribution(daily.seaice.north$y)
#Impute data in n/a rows
imputed.seaice.north<-na.interpolation(daily.seaice.north)
head(imputed.seaice.north);tail(imputed.seaice.north)
## ds y
## 1 1979-01-01 14.9970
## 2 1979-01-02 14.9970
## 3 1979-01-03 14.9595
## 4 1979-01-04 14.9220
## 5 1979-01-05 14.9255
## 6 1979-01-06 14.9290
## ds y
## 14033 2017-06-02 11.765
## 14034 2017-06-03 11.681
## 14035 2017-06-04 11.601
## 14036 2017-06-05 11.545
## 14037 2017-06-06 11.510
## 14038 2017-06-07 11.455
Plot imputed values:
plotNA.imputations(daily.seaice.north$y, imputed.seaice.north$y)
train.new<-imputed.seaice.north[c(1:12053),] #From Jan 1st 1979 to December 31st 2011 (12053 days)
#valid.new<-imputed.seaice.north[c(12054:13149),] #From Jan 1st 2012 to December 31st 2014 (1096 days)
#test.new<-imputed.seaice.north[c(13150:14038),] #From Jan 1st 2015 to June 7th 2017 (889 days)
head(train.new);tail(train.new)
## ds y
## 1 1979-01-01 14.9970
## 2 1979-01-02 14.9970
## 3 1979-01-03 14.9595
## 4 1979-01-04 14.9220
## 5 1979-01-05 14.9255
## 6 1979-01-06 14.9290
## ds y
## 12048 2011-12-26 12.682
## 12049 2011-12-27 12.755
## 12050 2011-12-28 12.854
## 12051 2011-12-29 13.097
## 12052 2011-12-30 13.115
## 12053 2011-12-31 13.180
library(prophet)
## Loading required package: Rcpp
m<-prophet(train.new, yearly.seasonality = TRUE, weekly.seasonality = TRUE, daily.seasonality = TRUE)
## Initial log joint probability = -343.871
## Optimization terminated normally:
## Convergence detected: relative gradient magnitude is below tolerance
We need to construct a dataframe for prediction. The make_future_dataframe function takes the model object and a number of periods to forecast:
future <- make_future_dataframe(m, periods = 0)
head(future);tail(future);str(future)
## ds
## 1 1979-01-01
## 2 1979-01-02
## 3 1979-01-03
## 4 1979-01-04
## 5 1979-01-05
## 6 1979-01-06
## ds
## 12050 2011-12-28
## 12051 2011-12-29
## 12052 2011-12-30
## 12053 2011-12-31
## 12054 <NA>
## 12055 2011-12-31
## 'data.frame': 12055 obs. of 1 variable:
## $ ds: POSIXct, format: "1979-01-01" "1979-01-02" ...
As with most modeling procedures in R, we use the generic predict function to get our forecast:
forecast <- predict(m)
head(forecast);tail(forecast)
## ds trend daily daily_lower daily_upper seasonal
## 1 1979-01-01 14.47520 -2.004815 -2.004815 -2.004815 0.1126136
## 2 1979-01-02 14.47462 -2.004815 -2.004815 -2.004815 0.1584334
## 3 1979-01-03 14.47403 -2.004815 -2.004815 -2.004815 0.2036588
## 4 1979-01-04 14.47345 -2.004815 -2.004815 -2.004815 0.2508084
## 5 1979-01-05 14.47287 -2.004815 -2.004815 -2.004815 0.2975026
## 6 1979-01-06 14.47228 -2.004815 -2.004815 -2.004815 0.3444948
## seasonal_lower seasonal_upper seasonalities seasonalities_lower
## 1 0.1126136 0.1126136 0.1126136 0.1126136
## 2 0.1584334 0.1584334 0.1584334 0.1584334
## 3 0.2036588 0.2036588 0.2036588 0.2036588
## 4 0.2508084 0.2508084 0.2508084 0.2508084
## 5 0.2975026 0.2975026 0.2975026 0.2975026
## 6 0.3444948 0.3444948 0.3444948 0.3444948
## seasonalities_upper weekly weekly_lower weekly_upper yearly
## 1 0.1126136 6.172632e-04 6.172632e-04 6.172632e-04 2.116812
## 2 0.1584334 4.913515e-05 4.913515e-05 4.913515e-05 2.163199
## 3 0.2036588 -1.071270e-03 -1.071270e-03 -1.071270e-03 2.209545
## 4 0.2508084 -2.425398e-04 -2.425398e-04 -2.425398e-04 2.255866
## 5 0.2975026 1.543358e-04 1.543358e-04 1.543358e-04 2.302163
## 6 0.3444948 8.875370e-04 8.875370e-04 8.875370e-04 2.348422
## yearly_lower yearly_upper yhat_lower yhat_upper trend_lower trend_upper
## 1 2.116812 2.116812 14.18218 15.00413 14.47520 14.47520
## 2 2.163199 2.163199 14.22140 15.04913 14.47462 14.47462
## 3 2.209545 2.209545 14.23505 15.11111 14.47403 14.47403
## 4 2.255866 2.255866 14.29972 15.15704 14.47345 14.47345
## 5 2.302163 2.302163 14.36159 15.21680 14.47287 14.47287
## 6 2.348422 2.348422 14.41372 15.23303 14.47228 14.47228
## yhat
## 1 14.58781
## 2 14.63305
## 3 14.67769
## 4 14.72426
## 5 14.77037
## 6 14.81678
## ds trend daily daily_lower daily_upper seasonal
## 12048 2011-12-26 12.58571 -2.004815 -2.004815 -2.004815 -0.181632114
## 12049 2011-12-27 12.58557 -2.004815 -2.004815 -2.004815 -0.134178999
## 12050 2011-12-28 12.58543 -2.004815 -2.004815 -2.004815 -0.087775849
## 12051 2011-12-29 12.58529 -2.004815 -2.004815 -2.004815 -0.039813541
## 12052 2011-12-30 12.58515 -2.004815 -2.004815 -2.004815 0.007423479
## 12053 2011-12-31 12.58501 -2.004815 -2.004815 -2.004815 0.054787004
## seasonal_lower seasonal_upper seasonalities seasonalities_lower
## 12048 -0.181632114 -0.181632114 -0.181632114 -0.181632114
## 12049 -0.134178999 -0.134178999 -0.134178999 -0.134178999
## 12050 -0.087775849 -0.087775849 -0.087775849 -0.087775849
## 12051 -0.039813541 -0.039813541 -0.039813541 -0.039813541
## 12052 0.007423479 0.007423479 0.007423479 0.007423479
## 12053 0.054787004 0.054787004 0.054787004 0.054787004
## seasonalities_upper weekly weekly_lower weekly_upper
## 12048 -0.181632114 6.172632e-04 6.172632e-04 6.172632e-04
## 12049 -0.134178999 4.913515e-05 4.913515e-05 4.913515e-05
## 12050 -0.087775849 -1.071270e-03 -1.071270e-03 -1.071270e-03
## 12051 -0.039813541 -2.425398e-04 -2.425398e-04 -2.425398e-04
## 12052 0.007423479 1.543358e-04 1.543358e-04 1.543358e-04
## 12053 0.054787004 8.875370e-04 8.875370e-04 8.875370e-04
## yearly yearly_lower yearly_upper yhat_lower yhat_upper trend_lower
## 12048 1.822566 1.822566 1.822566 11.98866 12.82949 12.58571
## 12049 1.870587 1.870587 1.870587 12.03926 12.89430 12.58557
## 12050 1.918111 1.918111 1.918111 12.07021 12.94840 12.58543
## 12051 1.965244 1.965244 1.965244 12.10484 12.98149 12.58529
## 12052 2.012084 2.012084 2.012084 12.13066 13.02043 12.58515
## 12053 2.058715 2.058715 2.058715 12.17763 13.06269 12.58501
## trend_upper yhat
## 12048 12.58571 12.40408
## 12049 12.58557 12.45139
## 12050 12.58543 12.49766
## 12051 12.58529 12.54548
## 12052 12.58515 12.59258
## 12053 12.58501 12.63980
You can use the generic plot function to plot the forecast, but you must also pass the model in to be plotted:
plot(m, forecast)
Just as in Python, you can plot the components of the forecast. In R, you use the prophet_plot_components function instead of an instance method:
#prophet_plot_components(m, forecast)
Extract the data from our prediction data frame to convert daily predictions into monthly predictions.
#Get ds and yhat values from the forecast data frame
dataframe<-forecast[,c('ds','yhat')]
#Extract Month and Year from our DS variable:
dataframe['Month']<-substr(dataframe$ds,6,7)
dataframe['Year']<-substr(dataframe$ds,1,4)
#Convert Month and Year from characters into numeric
dataframe$Month<-as.numeric(dataframe$Month)
dataframe$Year<-as.numeric(dataframe$Year)
#Create a new data frame in which we'll calculate the monthly mean of yhat by Month and Year.
newframe<-aggregate(dataframe$yhat,list(by=dataframe$Year,dataframe$Month),FUN=mean)
#Rename the new frame from by to Year, Month and yhat
colnames(newframe)<-c('Year','Month','yhat')
#Sort by Year and Month
newframe<-newframe[order(newframe$Year),]
head(newframe);tail(newframe);str(newframe)
## Year Month yhat
## 1 1979 1 15.23012
## 34 1979 2 16.08859
## 67 1979 3 16.21382
## 100 1979 4 15.47618
## 133 1979 5 14.05808
## 166 1979 6 12.51549
## Year Month yhat
## 231 2011 7 8.459748
## 264 2011 8 6.176259
## 297 2011 9 5.378490
## 330 2011 10 7.260112
## 363 2011 11 9.639488
## 396 2011 12 11.810423
## 'data.frame': 396 obs. of 3 variables:
## $ Year : num 1979 1979 1979 1979 1979 ...
## $ Month: num 1 2 3 4 5 6 7 8 9 10 ...
## $ yhat : num 15.2 16.1 16.2 15.5 14.1 ...
Import file “North.csv” with actual monthly values
actual_values<- read_csv('North.csv')
## Parsed with column specification:
## cols(
## Date = col_character(),
## Extent = col_double()
## )
actual_values_training<-actual_values[c(1:396),]
str(actual_values_training)
## Classes 'tbl_df', 'tbl' and 'data.frame': 396 obs. of 2 variables:
## $ Date : chr "Jan-79" "Feb-79" "Mar-79" "Apr-79" ...
## $ Extent: num 15.4 16.2 16.3 15.4 13.9 ...
actualvsprophet<-data.frame(newframe, actual_values_training$Extent)
#rename columns
colnames(actualvsprophet)[4]<-'Actual'
colnames(actualvsprophet)[3]<-'Prophet'
#calculate errors
actualvsprophet['Error']<-actualvsprophet$Actual-actualvsprophet$Prophet
Plot errors:
plot(actualvsprophet$Error, type='l')
RMSE
sqrt(mean((actualvsprophet$Error)^2))
## [1] 0.3061204
Box.test(actualvsprophet$Error,lag=1,type="Ljung-Box")
##
## Box-Ljung test
##
## data: actualvsprophet$Error
## X-squared = 158.08, df = 1, p-value < 2.2e-16
#write.csv(actualvsprophet,file = "actualvsprophet_training.csv")