setwd("C:/Users/charl/OneDrive/Desktop/DSCI 607 Data Analytics for Environmental Sciences/LAB SOLUTIONS")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(padr)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(knitr)
library(tseries)
library(skimr)
NDVI_df_11 <- readRDS("C:/Users/charl/OneDrive/Desktop/DSCI 607 Data Analytics for Environmental Sciences/LAB SOLUTIONS/NDVI_df_11.rds")
head(NDVI_df_11)
## x y NDVI Date
## 1: -86.09258 40.51259 0.2773395 2013.01.01
## 2: -86.08443 40.51259 0.2749555 2013.01.01
## 3: -86.07629 40.51259 0.2625337 2013.01.01
## 4: -86.06814 40.51259 0.2893912 2013.01.01
## 5: -86.06000 40.51259 0.3065163 2013.01.01
## 6: -86.05185 40.51259 0.3026700 2013.01.01
dates <- unique(NDVI_df_11$Date)
######################### Spatial map
NDVI_df_11 %>% filter(Date==dates[1]) %>%
ggplot() +
geom_point(aes(x = x, y = y, col = NDVI)) # fill is ignored by the default point shape, so map colour only
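Since the NDVI values sit on a regular longitude/latitude grid, a tile-based map is an alternative way to draw the same snapshot. A minimal optional sketch (not part of the required solution), assuming ggplot2's geom_tile() and viridis fill scale:
# Optional alternative: draw the first date as filled grid cells rather than points
NDVI_df_11 %>%
filter(Date == dates[1]) %>%
ggplot(aes(x = x, y = y, fill = NDVI)) +
geom_tile() + # one tile per pixel centre
scale_fill_viridis_c(limits = c(-1, 1)) + # NDVI is bounded in [-1, 1]
coord_equal() +
labs(x = "Longitude", y = "Latitude", title = paste("NDVI on", dates[1]))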
# 2. Check for missing values
missing_values <- sum(is.na(NDVI_df_11))
if (missing_values > 0) {
print(paste("Number of missing values:", missing_values))
} else {
print("No missing values found.")
}
## [1] "No missing values found."
invalid_ndvi <- sum(NDVI_df_11$NDVI < -1 | NDVI_df_11$NDVI > 1)
if (invalid_ndvi > 0) {
print(paste("Number of invalid NDVI values:", invalid_ndvi))
# You can take appropriate actions, such as removing or imputing invalid values
} else {
print("No invalid NDVI values found.")
}
## [1] "No invalid NDVI values found."
library(dplyr)
station1 <- NDVI_df_11 %>%
group_by(Date) %>%
summarise(NDVI = mean(NDVI))
kable(head(station1,20),caption= "The first 20 rows of the NDVI data")
| Date | NDVI |
|---|---|
| 2013.01.01 | 0.0907526 |
| 2013.01.09 | 0.3549894 |
| 2013.01.17 | 0.3550440 |
| 2013.01.25 | 0.3268520 |
| 2013.02.02 | 0.3317690 |
| 2013.02.10 | 0.3287541 |
| 2013.02.18 | 0.3284527 |
| 2013.02.26 | 0.1497188 |
| 2013.03.06 | 0.2544321 |
| 2013.03.14 | 0.2933670 |
| 2013.03.22 | 0.2912882 |
| 2013.03.30 | 0.3262823 |
| 2013.04.07 | 0.4405373 |
| 2013.04.15 | 0.4349048 |
| 2013.04.23 | 0.4301113 |
| 2013.05.01 | 0.4707331 |
| 2013.05.09 | 0.4810666 |
| 2013.05.17 | 0.4794964 |
| 2013.05.25 | 0.4985180 |
| 2013.06.02 | 0.5322240 |
# Convert the Date field from character to a POSIXct date-time
station1$Date<-as_datetime(station1$Date)
### Select the ten-year window 2013-01-01 to 2022-12-31
SiteNDVI_10yrs<-station1 %>%
select(Date,NDVI) %>% # keep only the variables we need
filter(Date>="2013-01-01" & Date< "2022-12-31")
# Check the start of the time frame
head(SiteNDVI_10yrs,5)
## # A tibble: 5 × 2
## Date NDVI
## <dttm> <dbl>
## 1 2013-01-09 00:00:00 0.355
## 2 2013-01-17 00:00:00 0.355
## 3 2013-01-25 00:00:00 0.327
## 4 2013-02-02 00:00:00 0.332
## 5 2013-02-10 00:00:00 0.329
# Check the end of the time frame
tail(SiteNDVI_10yrs,5)
## # A tibble: 5 × 2
## Date NDVI
## <dttm> <dbl>
## 1 2022-11-25 00:00:00 0.315
## 2 2022-12-03 00:00:00 0.314
## 3 2022-12-11 00:00:00 0.266
## 4 2022-12-19 00:00:00 0.215
## 5 2022-12-27 00:00:00 0.300
# find invalid NDVI
InvalidNDVI <- SiteNDVI_10yrs%>% filter(NDVI<0)
print(InvalidNDVI)
## # A tibble: 4 × 2
## Date NDVI
## <dttm> <dbl>
## 1 2014-01-25 00:00:00 -0.0112
## 2 2014-02-02 00:00:00 -0.0118
## 3 2021-02-02 00:00:00 -0.00435
## 4 2022-02-02 00:00:00 -0.0122
SiteNDVI_10yrs_fill<- SiteNDVI_10yrs
# find invalid NDVI values and replace them with the mean of the valid values
validvalues <- SiteNDVI_10yrs_fill[SiteNDVI_10yrs_fill$NDVI>0,2]
SiteNDVI_10yrs_fill[SiteNDVI_10yrs_fill$NDVI<0,2]$NDVI <- mean(validvalues$NDVI)
#### Recheck
InvalidNDVI <- SiteNDVI_10yrs_fill %>% filter(NDVI<0)
print(InvalidNDVI)
## # A tibble: 0 × 2
## # ℹ 2 variables: Date <dttm>, NDVI <dbl>
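The base-R replacement above works; an equivalent dplyr version applies the same rule (negative NDVI replaced by the mean of the positive values) in a single mutate(). A sketch; SiteNDVI_10yrs_fill2 is just an illustrative name:
# Equivalent tidyverse version of the fill step (illustrative only)
SiteNDVI_10yrs_fill2 <- SiteNDVI_10yrs %>%
mutate(NDVI = if_else(NDVI < 0, mean(NDVI[NDVI > 0]), NDVI))
sum(SiteNDVI_10yrs_fill2$NDVI < 0) # sanity check: should be 0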
#### Original 11-year data
ggplot(station1,aes(x=Date,y=NDVI))+geom_line()
#### Use ggplot() to visualize the filled series over the 10-year range 2013-01-01 to 2022-12-31
ggplot(SiteNDVI_10yrs_fill,aes(x=Date,y=NDVI))+geom_line()
## Split the data from 2013-01-01 to 2023-12-31 (11-year data) into two data sets
Please split the data from 2013-01-01 to 2023-12-31 (11-year data) into two data sets.
Training data: 2013-01-01 to 2021-01-01 (8 years). Test data: 2021-01-01 to 2023-12-31 (3 years).
We split the data frame into eight years of training data and three years of test data, build the model on the training data, forecast three years ahead, and validate the forecast against the test data.
### 8 years
SiteNDVI_8yrs_train<-SiteNDVI_10yrs_fill %>%
filter(Date>="2013-01-01" & Date < "2021-01-01")
### 3 years
SiteNDVI_3yrs_test<-SiteNDVI_10yrs_fill %>%
filter(Date>="2021-01-01" & Date < "2023-12-31")
###Time series visualization using ggplot()
NDVI_plot <- SiteNDVI_8yrs_train %>%
ggplot(aes(x = Date, y = NDVI)) +
geom_line(aes(color = "NDVI")) +
scale_x_datetime(name = "Date", date_breaks = "2 year") +
scale_y_continuous(breaks = seq(0, 1, 0.2)) +
theme_minimal() +
labs(title = "Muncie NDIV over 2021", subtitle = "2020-1-1 - 2022-1-1")
ggplotly(NDVI_plot)
We aggregate the training and test data to monthly means, create a seasonal time series (msts), decompose it, visualize the decomposed multiple seasonal series, and keep the results in data frame objects.
SiteNDVI_8yrs_train_monthly <- SiteNDVI_8yrs_train %>%
mutate(month = month(as.Date(Date)), year = year(as.Date(Date))) %>%
group_by(year, month) %>%
summarise(mean_NDVI = mean(NDVI, na.rm = T), .groups = "keep") %>%
as.data.frame()
SiteNDVI_8yrs_train_monthly <- SiteNDVI_8yrs_train_monthly[1:96,]
SiteNDVI_3yrs_test_monthly <- SiteNDVI_3yrs_test %>%
mutate(month = month(as.Date(Date)), year = year(as.Date(Date))) %>%
group_by(year, month) %>%
summarise(mean_NDVI = mean(NDVI, na.rm = T), .groups = "keep") %>%
as.data.frame()
#Step 3.1: Form the time series object
##Create the time series object using ts()
# Note: with 96 monthly training values, setting end = c(2023,12) makes ts() recycle the data to fill 2013-2023;
# end = c(2020,12) would match the 8-year training window exactly
NDVI_ts_train<-ts(SiteNDVI_8yrs_train_monthly$mean_NDVI,start = c(2013,1), end = c(2023,12),frequency = 12)
plot(NDVI_ts_train)
##Create the time series object using msts()
NDVI_multi0 <-msts(SiteNDVI_8yrs_train_monthly$mean_NDVI,seasonal.periods = c(12)) #monthly
plot(NDVI_multi0)
## Decompose the time series data
#Method 1: decompose(): classical seasonal decomposition by moving averages; works on a ts object
NDVI_ts_train_dec <- decompose(NDVI_ts_train)
plot(NDVI_ts_train_dec)
The plot above shows the original time series (top), the estimated trend component (second from top), the estimated seasonal component (third from top), and the estimated random component (bottom). The estimated trend shows a small decrease in 2014-15, followed by a steady increase from 2016-17 and a further decline from 2022 towards 2023.
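The stored decomposition can also be inspected numerically rather than only through the plot; a short sketch using the NDVI_ts_train_dec object created above:
# Inspect the estimated components of the classical decomposition
head(NDVI_ts_train_dec$trend) # centred moving-average trend (NA at the series ends)
head(NDVI_ts_train_dec$seasonal) # repeating monthly seasonal component
head(NDVI_ts_train_dec$random) # remainder after removing trend and seasonality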
NDVI_ts_train_stl1 <- stlm(NDVI_ts_train, s.window = 'periodic')
# NDVI_ts_train_stl1 <- stlm(NDVI_ts_train, s.window = c(12))
# plot(NDVI_ts_train_stl1) # cannot use plot() here
#Method 2: mstl(): multiple seasonal decomposition (handles daily, weekly, and monthly seasonal periods)
# Using the raw monthly mean values
NDVI_multi0 <-msts(SiteNDVI_8yrs_train_monthly$mean_NDVI,seasonal.periods = c(12)) #monthly
NDVI_multi0%>%
mstl() %>%
autoplot()
# use time series (ts) object
NDVI_multi <-msts(NDVI_ts_train,seasonal.periods = c(12)) #monthly
NDVI_multi%>%
mstl() %>%
autoplot()
From the output we can clearly see the seasonal component and the separated upward trend of the data.
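The mstl() output is itself a multivariate time series whose columns hold the separated pieces, so the trend can be pulled out directly. A sketch; NDVI_mstl_fit is an illustrative name, and the column names shown are what mstl() typically produces for a single 12-period season:
# Extract the separated components from the mstl() decomposition
NDVI_mstl_fit <- mstl(NDVI_multi)
colnames(NDVI_mstl_fit) # e.g. "Data" "Trend" "Seasonal12" "Remainder"
plot(NDVI_mstl_fit[, "Trend"], ylab = "Trend") # the upward trend discussed above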
#Method 1: moving-average smoothing
par(mfrow=c(1,1))
ts.plot(NDVI_ts_train)
sm <- ma(NDVI_ts_train, order=12) # 12-month moving average
lines(sm, col="blue") # overlay the smoothed series
#Method 2: Simple exponential smoothing (level only); ses() fits this directly
# (in hw(), beta = NULL and gamma = NULL mean "estimate the parameter", not "drop the component")
model <- ses(NDVI_ts_train, initial = "optimal", h = 12) # h is the number of periods to forecast
#Method 3: Double exponential smoothing (level and trend); holt() fits this directly
model <- holt(NDVI_ts_train, initial = "optimal", h = 12)
#Method 4: Holt-Winters: level, trend and seasonality
model <- hw(NDVI_ts_train, initial = "optimal", h = 12) # 12 defines the forecast horizon in months
plot(model)
accuracy(model) # calculate accuracy measures
## ME RMSE MAE MPE MAPE MASE
## Training set -0.004091496 0.04084827 0.03173575 -2.464346 8.706748 0.6681917
## ACF1
## Training set 0.3017186
If the ACF and PACF drop inside the significance bounds after only a few lags and decay quickly, the time series is likely stationary. If the ACF decays slowly and/or the PACF shows significant correlations at many lags, the series is likely non-stationary. A sinusoidal pattern or gradual periodic decay in the ACF and PACF points to seasonality, which also implies non-stationarity.
# x11()
acf(NDVI_ts_train,lag.max=60,xaxt="n", xlab="Lag (months)")
# x11()
pacf(NDVI_ts_train,lag.max=60,xaxt="n", xlab="Lag (months)")
Step 6.2: Check stationarity with the adf and kpss tests.
adf.test(): Augmented Dickey-Fuller test; computes the test for the null hypothesis that x has a unit root (i.e., is non-stationary).
kpss.test(): KPSS test for stationarity; computes the Kwiatkowski-Phillips-Schmidt-Shin test for the null hypothesis that x is level or trend stationary.
library(tseries)
adf.test(NDVI_ts_train)
## Warning in adf.test(NDVI_ts_train): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: NDVI_ts_train
## Dickey-Fuller = -9.5942, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
# p-value < 0.05: the null hypothesis (x has a unit root, i.e., is non-stationary) is rejected, so the series is stationary
kpss.test(NDVI_ts_train,null = "Trend") # Trend: a linear trend
## Warning in kpss.test(NDVI_ts_train, null = "Trend"): p-value greater than
## printed p-value
##
## KPSS Test for Trend Stationarity
##
## data: NDVI_ts_train
## KPSS Trend = 0.011609, Truncation lag parameter = 4, p-value = 0.1
# p-value > 0.05: the KPSS null of trend stationarity is not rejected, so the series is trend stationary
kpss.test(NDVI_ts_train,null = "Level") # Level: stationarity around a constant level (no trend)
## Warning in kpss.test(NDVI_ts_train, null = "Level"): p-value greater than
## printed p-value
##
## KPSS Test for Level Stationarity
##
## data: NDVI_ts_train
## KPSS Level = 0.01217, Truncation lag parameter = 4, p-value = 0.1
ndiffs(NDVI_ts_train, test = "adf") ## number of differences need to make it stationary
## [1] 0
ndiffs(NDVI_ts_train, test = "kpss") ## number of differences need to make it stationary
## [1] 0
# Both stationarity tests report 0 required differences; we difference once anyway for illustration
ndvi_ts_stationary<- diff(NDVI_ts_train, differences= 1)
#
# adf.test(ndvi_ts_stationary)
# kpss.test(ndvi_ts_stationary)
#
acf(ndvi_ts_stationary)
pacf(ndvi_ts_stationary)
Please fit five models for forecasting an extra 3 years: 1) auto.arima(); 2) stlm() with ARIMA; 3) stlm() with ETS; 4) mstl(); 5) seasonal Arima(), for example Arima(NDVI_ts_train, order = c(2,0,2), seasonal = c(1,1,1)). When calling forecast() for the next 3 years, use h = 12*3.
######################################## 1. Non-seasonal ARIMA - ignores the seasonal structure
#### Not appropriate here, since the time series clearly has seasonality
NDVI_ts_train0<-ts(NDVI_ts_train,frequency = 12)
AR0 <- Arima(NDVI_ts_train0, order = c(1,0,0)) #AR(1)
######################################## 2. Seasonal ARIMA - models the seasonal structure explicitly
AR <- Arima(NDVI_ts_train, order = c(2,0,2),seasonal=c(1,1,1))
#AR <- Arima(NDVI_ts_train, order = c(1,0,3),seasonal=c(0,1,1))
#AR <- Arima(NDVI_ts_train, order = c(2,0,2),seasonal=c(0,0,0)) #non-season arima
#plotting the NDVI_ts_train series plus the forecast and 95% prediction intervals
ts.plot(NDVI_ts_train,xlim=c(2013,2025))
AR_forecast <- predict(AR, n.ahead = 24)$pred
AR_forecast_se <- predict(AR, n.ahead = 24)$se
points(AR_forecast, type = "l", col = 2,lwd = 3)
points(AR_forecast - 2*AR_forecast_se, type = "l", col = 2, lty = 2,lwd=2)
points(AR_forecast + 2*AR_forecast_se, type = "l", col= 2, lty = 2,lwd = 2)
checkresiduals(AR)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(1,1,1)[12]
## Q* = 29.192, df = 18, p-value = 0.04609
##
## Model df: 6. Total lags used: 24
######################################## 3. Auto ARIMA: let auto.arima() choose the model orders
AutoArimaModel=auto.arima(NDVI_ts_train)
AutoArimaModel # selects ARIMA(2,0,0)(2,1,0)[12], a different model from the manually specified seasonal ARIMA above
## Series: NDVI_ts_train
## ARIMA(2,0,0)(2,1,0)[12]
##
## Coefficients:
## ar1 ar2 sar1 sar2
## 0.3973 -0.2368 -0.6210 -0.3864
## s.e. 0.0923 0.0939 0.0928 0.0902
##
## sigma^2 = 0.002346: log likelihood = 191.67
## AIC=-373.34 AICc=-372.81 BIC=-359.4
############predict
ts.plot(NDVI_ts_train,xlim=c(2013,2025))
AR_prediction <- predict(AutoArimaModel, n.ahead = 24)$pred
AR_prediction_se <- predict(AutoArimaModel, n.ahead = 24)$se
points(AR_prediction, type = "l", col = 2,lwd = 3)
points(AR_prediction - 2*AR_prediction_se, type = "l", col = 2, lty = 2,lwd=2)
points(AR_prediction + 2*AR_prediction_se, type = "l", col= 2, lty = 2,lwd = 2)
############forecast
AR_forecast<-forecast(AutoArimaModel, h=12*3)
NDVI_ts_train%>%
autoplot() +
autolayer(AR_forecast,series = "Multi-Seasonal ARIMA",color = "purple")
### mstl object
NDVI_multi <-msts(NDVI_ts_train,seasonal.periods = c(12)) #monthly
NDVI_multi_ts <- NDVI_multi%>%
mstl()
autoplot(NDVI_multi_ts)
f_arima1<-forecast(NDVI_multi_ts, h=12*3) #ETS(M,N,N)
NDVI_ts_train%>%
autoplot() +
autolayer(f_arima1,series = "Multi-Seasonal Holt_Winter",color = "purple")
# stlm object: STL decomposition with an ARIMA model (here ARIMA(0,0,3)) fitted to the seasonally adjusted series
NDVI_stlm<-stlm(NDVI_ts_train, method = "arima")
f_arima2<-forecast(NDVI_stlm, h=12*3) #ARIMA(0,0,3)
NDVI_ts_train%>%
autoplot() +
autolayer(f_arima2,series = "Multi-Seasonal ARIMA",color = "purple")
NDVI_stl <- stlm(NDVI_ts_train, method = "ets")
f_arima3<-forecast(NDVI_stl, h=12*3) #ETS(M,N,N)
NDVI_ts_train%>%
autoplot() +
autolayer(f_arima3,series = "Multi-Seasonal Holt_Winter",color = "purple")
7. Compare models and validate models
Step 7.1: Compare AIC and BIC values and error measures
The measures calculated are:
ME: Mean Error
RMSE: Root Mean Squared Error
MAE: Mean Absolute Error
MPE: Mean Percentage Error
MAPE: Mean Absolute Percentage Error
MASE: Mean Absolute Scaled Error
ACF1: Autocorrelation of errors at lag 1.
By default, the MASE calculation is scaled using MAE of training set naive forecasts for non-seasonal time series, training set seasonal naive forecasts for seasonal time series and training set mean forecasts for non-time series data. If f is a numerical vector rather than a forecast object, the MASE will not be returned as the training data will not be available.
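For this monthly series the scaling denominator is therefore the in-sample MAE of the seasonal naive (lag-12) forecast. A hand computation along those lines, shown here for the seasonal ARIMA fit AR, illustrates the idea (a sketch of the definition, not the exact internals of accuracy()):
# Hand-rolled MASE for the seasonal ARIMA model, scaled by the seasonal naive in-sample MAE
resid_mae <- mean(abs(residuals(AR)), na.rm = TRUE) # model's in-sample mean absolute error
snaive_err <- diff(NDVI_ts_train, lag = 12) # seasonal naive errors: x_t minus x_{t-12}
resid_mae / mean(abs(snaive_err)) # should be close to the MASE reported by accuracy(AR)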
AIC_BIC <- rbind(c(f_arima1$model$aic,f_arima1$model$bic),c(f_arima2$model$aic,f_arima2$model$bic),c(f_arima3$model$aic,f_arima3$model$bic),c(AutoArimaModel$aic,AutoArimaModel$bic),c(AR$aic,AR$bic))
colnames(AIC_BIC) <- c("AIC","BIC")
rownames(AIC_BIC) <- c("mstl","stlm_arima","stlm_ets","auto_arima","AR")
accuracy(f_arima1)
## ME RMSE MAE MPE MAPE MASE
## Training set -5.648852e-06 0.03989459 0.0305241 -1.465707 8.244585 0.6426807
## ACF1
## Training set 0.3055432
#accuracy(AR)
Accuracy<-rbind(accuracy(f_arima1),accuracy(f_arima2),accuracy(f_arima3),accuracy(AutoArimaModel),accuracy(AR))
rownames(Accuracy) <- c("mstl","stlm_arima","stlm_ets","auto_arima","AR")
kable(Accuracy,caption= "Errors (Accuracy) comparison of five models")
| | ME | RMSE | MAE | MPE | MAPE | MASE | ACF1 |
|---|---|---|---|---|---|---|---|
| mstl | -0.0000056 | 0.0398946 | 0.0305241 | -1.465707 | 8.244585 | 0.6426807 | 0.3055432 |
| stlm_arima | -0.0002396 | 0.0371465 | 0.0276311 | -1.348926 | 7.676455 | 0.5817685 | -0.0258320 |
| stlm_ets | -0.0000056 | 0.0398946 | 0.0305241 | -1.465707 | 8.244585 | 0.6426807 | 0.3055432 |
| auto_arima | -0.0025032 | 0.0454053 | 0.0324264 | -2.066469 | 8.988291 | 0.6827326 | -0.0174843 |
| AR | -0.0015703 | 0.0363836 | 0.0256189 | -1.591239 | 7.293075 | 0.5394024 | -0.0377460 |
kable(AIC_BIC,caption= "AIC and BIC values comparison of five models")
| | AIC | BIC |
|---|---|---|
| mstl | -199.9502 | -191.3018 |
| stlm_arima | -488.5390 | -479.8906 |
| stlm_ets | -199.9502 | -191.3018 |
| auto_arima | -373.3362 | -359.3987 |
| AR | -397.4109 | -377.8985 |
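To read the comparison programmatically rather than by eye, the model with the smallest criterion can be picked straight out of the AIC_BIC matrix; a one-line sketch for each column:
# Identify the models with the lowest AIC and the lowest BIC
rownames(AIC_BIC)[which.min(AIC_BIC[, "AIC"])]
rownames(AIC_BIC)[which.min(AIC_BIC[, "BIC"])]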
NDVI_prediction <- as.data.frame(f_arima1$mean)
predicted_NDVI <- as.numeric(NDVI_prediction$x)
observed_NDVI <- SiteNDVI_3yrs_test_monthly$mean_NDVI
max1<-max(observed_NDVI)
max2<-max(predicted_NDVI)
plot(observed_NDVI,predicted_NDVI[1:length(observed_NDVI)],xlim=c(0,max(max1,max2)),ylim=c(0,max(max1, max2)), xlab="Observed NDVI" , ylab="Predicted NDVI" , col="red", pch=5)
lines(c(0,max(max1, max2)),c(0,max(max1, max2)),col="black",lwd=3,lty=1)
legend("topleft", legend = "1:1 line", lty = 1, lwd = 3, bg = "white", box.lty = 0)
lmMod <- lm(observed_NDVI ~ predicted_NDVI[1:length(observed_NDVI)])
summary (lmMod)
##
## Call:
## lm(formula = observed_NDVI ~ predicted_NDVI[1:length(observed_NDVI)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.062490 -0.012383 -0.000901 0.023613 0.053589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0007913 0.0177361 -0.045 0.965
## predicted_NDVI[1:length(observed_NDVI)] 1.0207974 0.0345849 29.516 <2e-16
##
## (Intercept)
## predicted_NDVI[1:length(observed_NDVI)] ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03295 on 22 degrees of freedom
## Multiple R-squared: 0.9754, Adjusted R-squared: 0.9742
## F-statistic: 871.2 on 1 and 22 DF, p-value: < 2.2e-16
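Beyond the regression of observed on predicted values, the usual point-error metrics of the forecast against the monthly test means can be computed directly from the two vectors built above; a short sketch:
# Point-error metrics of the 3-year forecast against the monthly test means
pred_test <- predicted_NDVI[1:length(observed_NDVI)]
err <- observed_NDVI - pred_test
c(ME = mean(err),
RMSE = sqrt(mean(err^2)),
MAE = mean(abs(err)),
MAPE = mean(abs(err / observed_NDVI)) * 100)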