set the working directory

setwd("C:/Users/charl/OneDrive/Desktop/DSCI 607 Data Analytics for Environmental Sciences/LAB SOLUTIONS")

Load Required Packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(padr)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(knitr)
library(tseries)
library(skimr)

Import Data

NDVI_df_11 <- readRDS("C:/Users/charl/OneDrive/Desktop/DSCI 607 Data Analytics for Environmental Sciences/LAB SOLUTIONS/NDVI_df_11.rds")

head(NDVI_df_11)
##            x        y      NDVI       Date
## 1: -86.09258 40.51259 0.2773395 2013.01.01
## 2: -86.08443 40.51259 0.2749555 2013.01.01
## 3: -86.07629 40.51259 0.2625337 2013.01.01
## 4: -86.06814 40.51259 0.2893912 2013.01.01
## 5: -86.06000 40.51259 0.3065163 2013.01.01
## 6: -86.05185 40.51259 0.3026700 2013.01.01
dates <- unique(NDVI_df_11$Date)

Visualize data in space and time

######################### "Spatial"map"
NDVI_df_11 %>% filter(Date==dates[1]) %>%
  ggplot() +
  geom_point(aes(x = x, y = y, fill = NDVI, col = NDVI))

Check for missing values

# 2. Check for missing values
missing_values <- sum(is.na(NDVI_df_11))
if (missing_values > 0) {
  print(paste("Number of missing values:", missing_values))
} else {
  print("No missing values found.")
}
## [1] "No missing values found."

Check for invalid NDVI values

invalid_ndvi <- sum(NDVI_df_11$NDVI < -1 | NDVI_df_11$NDVI > 1)
if (invalid_ndvi > 0) {
  print(paste("Number of invalid NDVI values:", invalid_ndvi))
  # You can take appropriate actions, such as removing or imputing invalid values
} else {
  print("No invalid NDVI values found.")
}
## [1] "No invalid NDVI values found."

Average all Points

library(dplyr)
station1 <- NDVI_df_11 %>%
  group_by(Date) %>%
  summarise(NDVI = mean(NDVI))

Create a table for the first 20 rows of the subset data you selected using kable().

kable(head(station1,20),caption= "The first 20 rows of the  NDVI data")
The first 20 rows of the NDVI data
Date NDVI
2013.01.01 0.0907526
2013.01.09 0.3549894
2013.01.17 0.3550440
2013.01.25 0.3268520
2013.02.02 0.3317690
2013.02.10 0.3287541
2013.02.18 0.3284527
2013.02.26 0.1497188
2013.03.06 0.2544321
2013.03.14 0.2933670
2013.03.22 0.2912882
2013.03.30 0.3262823
2013.04.07 0.4405373
2013.04.15 0.4349048
2013.04.23 0.4301113
2013.05.01 0.4707331
2013.05.09 0.4810666
2013.05.17 0.4794964
2013.05.25 0.4985180
2013.06.02 0.5322240

Use ggplot() for time series data visualization to any two-year data in the 10-year time range of 2013-01-01 to 2022-12-31.

# Convert Date field from character to Date in date time format
station1$Date<-as_datetime(station1$Date)
###Choose three-year data
SiteNDVI_10yrs<-station1 %>% 
  select(Date,NDVI) %>%  #only select variable we want
  filter(Date>="2013-01-01" & Date< "2022-12-31")
# Check the first of the time frame
head(SiteNDVI_10yrs,5)
## # A tibble: 5 × 2
##   Date                 NDVI
##   <dttm>              <dbl>
## 1 2013-01-09 00:00:00 0.355
## 2 2013-01-17 00:00:00 0.355
## 3 2013-01-25 00:00:00 0.327
## 4 2013-02-02 00:00:00 0.332
## 5 2013-02-10 00:00:00 0.329
# Check The End of the Time Frame
tail(SiteNDVI_10yrs,5)
## # A tibble: 5 × 2
##   Date                 NDVI
##   <dttm>              <dbl>
## 1 2022-11-25 00:00:00 0.315
## 2 2022-12-03 00:00:00 0.314
## 3 2022-12-11 00:00:00 0.266
## 4 2022-12-19 00:00:00 0.215
## 5 2022-12-27 00:00:00 0.300
# find invalid NDVI
InvalidNDVI <- SiteNDVI_10yrs%>% filter(NDVI<0)
print(InvalidNDVI)
## # A tibble: 4 × 2
##   Date                    NDVI
##   <dttm>                 <dbl>
## 1 2014-01-25 00:00:00 -0.0112 
## 2 2014-02-02 00:00:00 -0.0118 
## 3 2021-02-02 00:00:00 -0.00435
## 4 2022-02-02 00:00:00 -0.0122
SiteNDVI_10yrs_fill<- SiteNDVI_10yrs
#find invalid NDVI and fill the mean value for them
 validvalues <- SiteNDVI_10yrs_fill[SiteNDVI_10yrs_fill$NDVI>0,2]
 SiteNDVI_10yrs_fill[SiteNDVI_10yrs_fill$NDVI<0,2]$NDVI <- mean(validvalues$NDVI)
####Recheck again
InvalidNDVI <- SiteNDVI_10yrs_fill %>% filter(NDVI<0)
print(InvalidNDVI)
## # A tibble: 0 × 2
## # ℹ 2 variables: Date <dttm>, NDVI <dbl>
#### Original 11-year data
ggplot(station1,aes(x=Date,y=NDVI))+geom_line()

#### Use ggplot() for time series data visualization to any two-year data in the 10-year time range of 2013-01-01 to 2022-12-31.After filling
ggplot(SiteNDVI_10yrs_fill,aes(x=Date,y=NDVI))+geom_line()

## Split the data from 2013-01-01 to 2023-12-31(11-year data) into two data sets.

Please split the data from 2013-01-01 to 2023-12-31(11-year data) into two data sets.

Training data: 2013-01-01 to 2021-01-01. (8 years) Test data: 2021-01-01 to 2023-12-31. (3 years)

We split the data frame into one year of training data and one year of test data. We build the model on the one year of training data, then forecast the model for one year. Finally, we will validate the forecast using the test data.

### 8 years
SiteNDVI_8yrs_train<-SiteNDVI_10yrs_fill %>%
  filter(Date>="2013-01-01" & Date < "2021-01-01") 
### 3 years
SiteNDVI_3yrs_test<-SiteNDVI_10yrs_fill %>%
  filter(Date>="2021-01-01" & Date < "2023-12-31") 

###Time series visualization using ggplot()
NDVI_plot <- SiteNDVI_8yrs_train %>%
  ggplot(aes(x = Date, y = NDVI)) +
  geom_line(aes(color = "NDVI")) +
  scale_x_datetime(name = "Date", date_breaks = "2 year") +
  scale_y_continuous(breaks = seq(0, 1, 0.2)) +
  theme_minimal() +
  labs(title = "Muncie NDIV over 2021", subtitle = "2020-1-1 - 2022-1-1")

ggplotly(NDVI_plot)

Convert training data and test data at 8-day scale to monthly scale

We create a seasonal time series (msts), decompose the msts, visualize the decomposed multi-time series and then save the msts to a data frame object.

SiteNDVI_8yrs_train_monthly <- SiteNDVI_8yrs_train %>% 
  mutate(month = month(as.Date(Date)), year = year(as.Date(Date))) %>% 
  group_by(year, month) %>%
  summarise(mean_NDVI = mean(NDVI, na.rm = T), .groups = "keep") %>%
  as.data.frame()

SiteNDVI_8yrs_train_monthly <- SiteNDVI_8yrs_train_monthly[1:96,]



SiteNDVI_3yrs_test_monthly <- SiteNDVI_3yrs_test %>% 
  mutate(month = month(as.Date(Date)), year = year(as.Date(Date))) %>% 
  group_by(year, month) %>%
  summarise(mean_NDVI = mean(NDVI, na.rm = T), .groups = "keep") %>%
  as.data.frame()

Decompose the monthly training data you obtain from step(9) using both of stlm() and mstl() functions.

#Step 3.1: Form the time series object
##Create the time series object using ts()
NDVI_ts_train<-ts(SiteNDVI_8yrs_train_monthly$mean_NDVI,start = c(2013,1), end = c(2023,12),frequency = 12)
plot(NDVI_ts_train)

##Create the time series object using msts()
NDVI_multi0 <-msts(SiteNDVI_8yrs_train_monthly$mean_NDVI,seasonal.periods = c(12)) #monthly
plot(NDVI_multi0)

## Decompose the time series data

#Method1: decompose(): Classical Seasonal Decomposition by Moving Averages;based on ts object
NDVI_ts_train_dec<-NDVI_ts_train%>%
  decompose() %>%
  plot()

The plot above shows the original time series (top), the estimated trend component (second from top), the estimated seasonal component (third from top), and the estimated random component (bottom). From the above it can be observed that the estimated trend shows a small decrease in 2014-15, which is followed by a steady increase from 2016-17. There is a further decline in the trend as seen in the 2022 towards 2023.

NDVI_ts_train_stl1 <- stlm(NDVI_ts_train, s.window = 'periodic')
# NDVI_ts_train_stl1 <- stlm(NDVI_ts_train, s.window = c(12))
# plot(NDVI_ts_train_stl1)  # cannot use plot() here


#Method2: mstl(): Multiple seasonal decomposition: daily, weekly , monthly. 
# Use x value
NDVI_multi0 <-msts(SiteNDVI_8yrs_train_monthly$mean_NDVI,seasonal.periods = c(12)) #monthly
NDVI_multi0%>%
  mstl() %>% 
  autoplot()

# use time series (ts) object
NDVI_multi <-msts(NDVI_ts_train,seasonal.periods = c(12)) #monthly
NDVI_multi%>%
  mstl() %>% 
  autoplot()

Interpret the results of your Seasonality Analysis on the training data. For example, how NDVI changes along with time.

From the output we can clearly see the seasonal component and separated upward trend of the data.

Smoothing with time series data: model and forecast

#Method 1: moving average,Moving-average smoothing
par(mfrow=c(1,1))
ts.plot(NDVI_ts_train)
sm <- ma(NDVI_ts_train, order=12) # 12 month, moving average,Moving-average smoothing
lines(sm, col="blue") # plot

#Method 2: Simple exponential smoothing: Level Only
model <- hw(NDVI_ts_train, initial = "optimal", h=(12), beta=NULL, gamma=NULL) # h is the no. periods to forecast

#Method 3:Double Exponential smoothing: Level and Trend components
model <- hw(NDVI_ts_train, initial = "optimal", h=(12), gamma=NULL)

#Method 4: Holt Winters: Level, Trend and Seasonality
model <- hw(NDVI_ts_train, initial = "optimal", h=(12))#12 define the forecast Period Length
plot(model)

accuracy(model) # calculate accuracy measures
##                        ME       RMSE        MAE       MPE     MAPE      MASE
## Training set -0.004091496 0.04084827 0.03173575 -2.464346 8.706748 0.6681917
##                   ACF1
## Training set 0.3017186

Check the stationarity

If both ACF and PACF plots show significant autocorrelation only at lag 0 (i.e., the first value) and decay quickly afterward, it suggests the time series is likely stationary. If the ACF plot shows a slow decay and/or the PACF plot has significant correlations at multiple lags, it might indicate non-stationary. If there is a sinusoidal pattern or gradual decay in the ACF and PACF plots, it might indicate seasonality, suggesting non-stationary.

# x11()
acf(NDVI_ts_train,lag.max=60,xaxt="n", xlab="Lag (months)")

# x11()
pacf(NDVI_ts_train,lag.max=60,xaxt="n", xlab="Lag (months)")

Step 6.2: Check the stationarity by adf and kpss tests.

adf.test():Augmented Dickey–Fuller Test,Computes the Augmented Dickey-Fuller test for the null that x has a unit root.

kpss.test(): KPSS Test for Stationarity,omputes the Kwiatkowski-Phillips-Schmidt-Shin (KPSS) test for the null hypothesis that x is level or trend stationary.

library(tseries)
adf.test(NDVI_ts_train) 
## Warning in adf.test(NDVI_ts_train): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  NDVI_ts_train
## Dickey-Fuller = -9.5942, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
# p-value < 0.05 indicates h0 ( a unit root is not stationary) is rejected and then the TS is stationary

kpss.test(NDVI_ts_train,null = "Trend") #  Trend: a linear trend 
## Warning in kpss.test(NDVI_ts_train, null = "Trend"): p-value greater than
## printed p-value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  NDVI_ts_train
## KPSS Trend = 0.011609, Truncation lag parameter = 4, p-value = 0.1
# p-value< 0.05. The KPSS test has the null of  stationarity, which is rejected here, meaning TS is non-stationary, with trend

kpss.test(NDVI_ts_train,null = "Level")  # level: a fix trend
## Warning in kpss.test(NDVI_ts_train, null = "Level"): p-value greater than
## printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  NDVI_ts_train
## KPSS Level = 0.01217, Truncation lag parameter = 4, p-value = 0.1

Use ndiffs() to determine d in ARIMA; Use acf() and pacf() to determine the orders of p,q;

ndiffs(NDVI_ts_train, test = "adf")  ## number of differences need to make it stationary
## [1] 0
ndiffs(NDVI_ts_train, test = "kpss") ## number of differences need to make it stationary
## [1] 0
 ndvi_ts_stationary<- diff(NDVI_ts_train, differences= 1)
# 
# adf.test(ndvi_ts_stationary)
# kpss.test(ndvi_ts_stationary)
# 
 acf(ndvi_ts_stationary)

 pacf(ndvi_ts_stationary)

Please have five models for forecasting extra 3 years: 1) auto.arima(); 2) stlm() with arima; 3) stlm() with est; 4) mstl(); 5) Seasonal Arima(), for example: Arima(NDVI_ts_train, order = c(2,0,2),seasonal=c(1,1,1)). When forecast() for next 3 years, use h=12*3

Arima: Models and Forecasting

Step 6.1: Non-Seasonal ARIMA

########################################1. None seasonal ARIMA-remove seasonality and then focus on the data removed seasonality

####This is wrong since there exists seasonality in the time series
NDVI_ts_train0<-ts(NDVI_ts_train,frequency = 12)
AR0 <- Arima(NDVI_ts_train0, order = c(1,0,0)) #AR(1)

Step 6.2: Seasonal ARIMA

########################################2. Seasonal ARIMA-with seasonality
AR <- Arima(NDVI_ts_train, order = c(2,0,2),seasonal=c(1,1,1)) 
#AR <- Arima(NDVI_ts_train, order = c(1,0,3),seasonal=c(0,1,1)) 
#AR <- Arima(NDVI_ts_train, order = c(2,0,2),seasonal=c(0,0,0)) #non-season arima

#plotting the NDVI_ts_train series plus the forecast and 95% prediction intervals
ts.plot(NDVI_ts_train,xlim=c(2013,2025))
AR_forecast <- predict(AR, n.ahead = 24)$pred
AR_forecast_se <- predict(AR, n.ahead = 24)$se
points(AR_forecast, type = "l", col = 2,lwd = 3)
points(AR_forecast - 2*AR_forecast_se, type = "l", col = 2, lty = 2,lwd=2)
points(AR_forecast + 2*AR_forecast_se, type = "l", col= 2, lty = 2,lwd = 2)

checkresiduals(AR)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2)(1,1,1)[12]
## Q* = 29.192, df = 18, p-value = 0.04609
## 
## Model df: 6.   Total lags used: 24

Step 6.3: Auto ARIMA

########################################3. Auto ARIMA:produce the same result as the method2
AutoArimaModel=auto.arima(NDVI_ts_train)
AutoArimaModel   #ARIMA(2,0,2)(1,1,1)[12]
## Series: NDVI_ts_train 
## ARIMA(2,0,0)(2,1,0)[12] 
## 
## Coefficients:
##          ar1      ar2     sar1     sar2
##       0.3973  -0.2368  -0.6210  -0.3864
## s.e.  0.0923   0.0939   0.0928   0.0902
## 
## sigma^2 = 0.002346:  log likelihood = 191.67
## AIC=-373.34   AICc=-372.81   BIC=-359.4
############predict
ts.plot(NDVI_ts_train,xlim=c(2013,2025))
AR_prediction <- predict(AutoArimaModel, n.ahead = 24)$pred
AR_prediction_se <- predict(AutoArimaModel, n.ahead = 24)$se
points(AR_prediction, type = "l", col = 2,lwd = 3)
points(AR_prediction - 2*AR_prediction_se, type = "l", col = 2, lty = 2,lwd=2)
points(AR_prediction + 2*AR_prediction_se, type = "l", col= 2, lty = 2,lwd = 2)

############forecast
AR_forecast<-forecast(AutoArimaModel, h=12*3)
NDVI_ts_train%>%
  autoplot() +
  autolayer(AR_forecast,series = "Multi-Seasonal ARIMA",color = "purple")

Step 6.4:A model with mstl() object

###mlst object
NDVI_multi <-msts(NDVI_ts_train,seasonal.periods = c(12)) #monthly
NDVI_multi_ts <- NDVI_multi%>%
  mstl()

autoplot(NDVI_multi_ts)

f_arima1<-forecast(NDVI_multi_ts, h=12*3)  #ETS(M,N,N) 
NDVI_ts_train%>%
  autoplot() +
  autolayer(f_arima1,series = "Multi-Seasonal Holt_Winter",color = "purple")

Step 6.5: Models with stlm() object

# stlm object: multi seasonal ARIMA model, using arima model,ARIMA(0,0,3)
NDVI_stlm<-stlm(NDVI_ts_train, method = "arima")
f_arima2<-forecast(NDVI_stlm, h=12*3) #ARIMA(0,0,3)
NDVI_ts_train%>%
  autoplot() +
  autolayer(f_arima2,series = "Multi-Seasonal ARIMA",color = "purple")

NDVI_stl <- stlm(NDVI_ts_train, method = "ets")
f_arima3<-forecast(NDVI_stl, h=12*3)  #ETS(M,N,N) 
NDVI_ts_train%>%
  autoplot() +
  autolayer(f_arima3,series = "Multi-Seasonal Holt_Winter",color = "purple")

7. Compare models and validate models

Step 7.1: Compare AIC and BIC values and different errors

The measures calculated are:

By default, the MASE calculation is scaled using MAE of training set naive forecasts for non-seasonal time series, training set seasonal naive forecasts for seasonal time series and training set mean forecasts for non-time series data. If f is a numerical vector rather than a forecast object, the MASE will not be returned as the training data will not be available.

AIC_BIC <- rbind(c(f_arima1$model$aic,f_arima1$model$bic),c(f_arima2$model$aic,f_arima2$model$bic),c(f_arima3$model$aic,f_arima3$model$bic),c(AutoArimaModel$aic,AutoArimaModel$bic),c(AR$aic,AR$bic))
colnames(AIC_BIC) <- c("AIC","BIC")
rownames(AIC_BIC) <- c("mlst","stlm_arima","stlm_est","auto_arima","AR")

accuracy(f_arima1)
##                         ME       RMSE       MAE       MPE     MAPE      MASE
## Training set -5.648852e-06 0.03989459 0.0305241 -1.465707 8.244585 0.6426807
##                   ACF1
## Training set 0.3055432
#accuracy(AR)
Accuracy<-rbind(accuracy(f_arima1),accuracy(f_arima2),accuracy(f_arima3),accuracy(AutoArimaModel),accuracy(AR))

rownames(Accuracy) <- c("mlst","stlm_arima","stlm_est","auto_arima","AR")

kable(Accuracy,caption= "Errors (Accuracy) comparison of five models")
Errors (Accuracy) comparison of five models
ME RMSE MAE MPE MAPE MASE ACF1
mlst -0.0000056 0.0398946 0.0305241 -1.465707 8.244585 0.6426807 0.3055432
stlm_arima -0.0002396 0.0371465 0.0276311 -1.348926 7.676455 0.5817685 -0.0258320
stlm_est -0.0000056 0.0398946 0.0305241 -1.465707 8.244585 0.6426807 0.3055432
auto_arima -0.0025032 0.0454053 0.0324264 -2.066469 8.988291 0.6827326 -0.0174843
AR -0.0015703 0.0363836 0.0256189 -1.591239 7.293075 0.5394024 -0.0377460
kable(AIC_BIC,caption= "AIC and BIC values comparision of four models")
AIC and BIC values comparision of four models
AIC BIC
mlst -199.9502 -191.3018
stlm_arima -488.5390 -479.8906
stlm_est -199.9502 -191.3018
auto_arima -373.3362 -359.3987
AR -397.4109 -377.8985

Valid the model(s) with test data

NDVI_prediction <- as.data.frame(f_arima1$mean)
predicted_NDVI <- as.numeric(NDVI_prediction$x)

observed_NDVI <- SiteNDVI_3yrs_test_monthly$mean_NDVI
max1<-max(observed_NDVI)
max2<-max(predicted_NDVI)
plot(observed_NDVI,predicted_NDVI[1:length(observed_NDVI)],xlim=c(0,max(max1,max2)),ylim=c(0,max(max1, max2)), xlab="Observed NDVI" , ylab="Predicted NDVI" , col="red", pch=5)
lines(c(0,max(max1, max2)),c(0,max(max1, max2)),col="black",lwd=3,lty=1)
legend(95,40,c('','1:1Line'),lty=c(NA,1),pch=c(NA,NA),lwd=c(NA,3),bg='white',ncol=1,box.lty=0)

lmMod <- lm(observed_NDVI ~ predicted_NDVI[1:length(observed_NDVI)]) 
summary (lmMod)
## 
## Call:
## lm(formula = observed_NDVI ~ predicted_NDVI[1:length(observed_NDVI)])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.062490 -0.012383 -0.000901  0.023613  0.053589 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             -0.0007913  0.0177361  -0.045    0.965
## predicted_NDVI[1:length(observed_NDVI)]  1.0207974  0.0345849  29.516   <2e-16
##                                            
## (Intercept)                                
## predicted_NDVI[1:length(observed_NDVI)] ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03295 on 22 degrees of freedom
## Multiple R-squared:  0.9754, Adjusted R-squared:  0.9742 
## F-statistic: 871.2 on 1 and 22 DF,  p-value: < 2.2e-16