Supplied : The dataset supplied represents yearly changes in the thickness of Ozone layer from 1927 to 2016 in Dobson Units.

Objective : With the supplied dataset, create a time series analysis model or models. The ultimate objective being to have the model predict for the next five (5) years (in either case) the thickness of the Ozone Layer in Dobson Units.

        Viewable at :  http://rpubs.com/fast_Eddie/479213
        
        
#####################
##
##   Create model1 with the dataset data1   
##
#######################

library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0       v purrr   0.3.2  
## v tibble  2.1.1       v dplyr   0.8.0.1
## v tidyr   0.8.3       v stringr 1.4.0  
## v readr   1.3.1       v forcats 0.4.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(TSA)
## 
## Attaching package: 'TSA'
## The following object is masked from 'package:readr':
## 
##     spec
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
###################  Original dataset

#library(readr)
data1 <- read_csv("C:/Users/dan/Desktop/RMIT 2019/Time Series Analysis/data1.csv")
## Parsed with column specification:
## cols(
##   `1.351184356` = col_double()
## )
#View(data1a)

data1 <- as.ts(data1, start = 1927, end = 2016, frequency = 1)
#View(data1)

#is.ts(data1)  ## check validity of ts
par(mfrow=c(1, 1)) 

y = data1
shapiro.test(y)
## 
##  Shapiro-Wilk normality test
## 
## data:  y
## W = 0.95681, p-value = 0.004816
model1=lm(data1~time(data1))  ## Create model1 here 
#summary(model1)
library(RColorBrewer)
# model1=lm(data1~time(data1))
# summary(model1)

plot(data1,type='o',ylab='y', main = "Original Dataset")
abline(model1, col = "#E31A1C") # add the fitted least squares line from model1

########  getting predictions for next 8 yrs for Model1  --  DO NOT REMOVE --

t = time(data1) # Create time points for model fitting
model1 = lm(data1~t) # label the model as model1
h = 8 # 5 steps ahed forecasts
# Now we will implement the two-step algorithm
new = data.frame(t = seq((length(t)+1), (length(t)+h), 1)) # Step 1
# Notice here that I'm using the same variable name "t" as in the 
# fitted model above, where the name of the variable showing time 
# is also "t". To run the predict() function properly,
# the names of variables in fitted model and "new" data frame
# must be the same!!!
forecasts = predict(model1, new, interval = "prediction")
# Here interval argument shows the prediction interval
#print(forecasts)  
###################    

n = nrow(data1)
plot(data1, xlim = c(1,n), ylim = c(-14, 12), ylab = "Ozone Thickness")
# We need to convert forecasts to time series object starting from the first 
# time steps-ahead to be able to use plot function. 
# We do this for all columns of forecasts
lines(ts(as.vector(forecasts[,1]), start = 76), col="red", type="l")
lines(ts(as.vector(forecasts[,2]), start = 76), col="blue", type="l")
lines(ts(as.vector(forecasts[,3]), start = 76), col="blue", type="l")
legend("topleft", lty=1, pch=1, col=c("black","blue","red"), text.width = 22,
       c("Data","5% forecast limits", "Forecasts"))

###################        Attempting to build a better model , Model 2

###################        Original dataset with 3 lag columns added in

# library(tidyverse)
# library(TSA)

data1a <-  read_csv("C:/Users/dan/Desktop/RMIT 2019/Time Series Analysis/data1a.csv")
## Warning: Missing column names filled in: 'X3' [3], 'X4' [4], 'X5' [5]
## Parsed with column specification:
## cols(
##   `Years (1927-2016)` = col_double(),
##   `Ozone Thickness (Dobson Units)` = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double()
## )
names(data1a)[1] <- 'Years'
names(data1a)[2] <- 'Ozone Thickness'
names(data1a)[3] <- 'lag1'              ## data1a has 3 lag columns added in <<-------- ****
names(data1a)[4] <- 'lag2'
names(data1a)[5] <- 'lag3'

inputData <- as.ts(data1a, start = 1927, end = 2016, frequency = 1)
#View(inputData)
#is.ts(inputData)
response_df <- inputData['Ozone Thickness']  # Y variable
predictors_df <- inputData[, !names(inputData) %in% "Years" ]  # X variables
#head(inputData)

model2 <- lm(Years ~ . , data = inputData)
model2 <- step(model2)
## Start:  AIC=422.98
## Years ~ `Ozone Thickness` + lag1 + lag2 + lag3
## 
##                     Df Sum of Sq   RSS    AIC
## <none>                           10024 422.98
## - lag3               1      3117 13141 444.53
## - lag2               1      5600 15624 459.59
## - lag1               1      8313 18338 473.52
## - `Ozone Thickness`  1     44507 54531 568.33
#summary(model2)

##########################  

par(mfrow=c(1, 2))    ##   Comparing both models 

acf(model1$residuals, main = "Model 1")   # this one appears better

acf(model2$residuals, main = "Model 2") 

###########################

# Q-Q plots

par(mfrow=c(1, 2))

y = data1
qqnorm(y, main = "Model 1 " )
qqline(y, col = 2, lwd = 1, lty = 2)

qqnorm(model2$residuals, main = "Model 2 " )
qqline(model2$residuals, col = 2, lwd = 1, lty = 2)

par(mfrow=c(1, 1))

par(mfrow=c(2, 2))  # divide graph into 2 rows and 2 columns
plot(model1)#lmMod

par(mfrow=c(2, 2))  # divide graph into 2 rows and 2 columns
plot(model2)#lmMod

# Predictions : Model 1 for the Years 2017 - 2021

a1 = round(forecasts[1],3)
a2 = round(forecasts[2],3)
a3 = round(forecasts[3],3)
a4 = round(forecasts[4],3)
a5 = round(forecasts[5],3)

cat(paste0("The Year 2017 estimated Ozone Thickness will be : " ,
           round(a1,3), " Dobson Units for model 1 \n"))
## The Year 2017 estimated Ozone Thickness will be : -8.217 Dobson Units for model 1
cat(paste0("\nThe Year 2018 estimated Ozone Thickness will be : " ,
           round(a2,3), " Dobson Units for model 1 \n"))
## 
## The Year 2018 estimated Ozone Thickness will be : -8.327 Dobson Units for model 1
cat(paste0("\nThe Year 2019 estimated Ozone Thickness will be : " ,
           round(a3,3), " Dobson Units for model 1 \n"))
## 
## The Year 2019 estimated Ozone Thickness will be : -8.437 Dobson Units for model 1
cat(paste0("\nThe Year 2020 estimated Ozone Thickness will be : " ,
           round(a4,3), " Dobson Units for model 1 \n"))
## 
## The Year 2020 estimated Ozone Thickness will be : -8.547 Dobson Units for model 1
cat(paste0("\nThe Year 2021 estimated Ozone Thickness will be : " ,
           round(a5,3), " Dobson Units for model 1 \n"))
## 
## The Year 2021 estimated Ozone Thickness will be : -8.658 Dobson Units for model 1

# Predictions : Model 2 for the Years 2017 - 2021

a1 = (2017 - round(model2$coefficients[1],0 )) / round(model2$coefficients[2],3 )
a2 = (2018 - round(model2$coefficients[1],0 )) / round(model2$coefficients[2],3 )
a3 = (2019 - round(model2$coefficients[1],0 )) / round(model2$coefficients[2],3 )
a4 = (2020 - round(model2$coefficients[1],0 )) / round(model2$coefficients[2],3 )
a5 = (2021 - round(model2$coefficients[1],0 )) / round(model2$coefficients[2],3 )

cat(paste0("The Year 2017 estimated Ozone Thickness will be : " ,
           round(a1,3), " Dobson Units for model 2 \n"))
## The Year 2017 estimated Ozone Thickness will be : -9.026 Dobson Units for model 2
cat(paste0("\nThe Year 2018 estimated Ozone Thickness will be : " ,
           round(a2,3), " Dobson Units for model 2 \n"))
## 
## The Year 2018 estimated Ozone Thickness will be : -9.158 Dobson Units for model 2
cat(paste0("\nThe Year 2019 estimated Ozone Thickness will be : " ,
           round(a3,3), " Dobson Units for model 2 \n"))
## 
## The Year 2019 estimated Ozone Thickness will be : -9.291 Dobson Units for model 2
cat(paste0("\nThe Year 2020 estimated Ozone Thickness will be : " ,
           round(a4,3), " Dobson Units for model 2 \n"))
## 
## The Year 2020 estimated Ozone Thickness will be : -9.424 Dobson Units for model 2
cat(paste0("\nThe Year 2021 estimated Ozone Thickness will be : " ,
           round(a5,3), " Dobson Units for model 2 \n"))
## 
## The Year 2021 estimated Ozone Thickness will be : -9.557 Dobson Units for model 2
summary(model1)
## 
## Call:
## lm(formula = data1 ~ t)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7151 -1.6671  0.0454  1.4702  4.7856 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.709703   0.436888   3.913  0.00018 ***
## t           -0.110292   0.008431 -13.081  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.043 on 87 degrees of freedom
## Multiple R-squared:  0.6629, Adjusted R-squared:  0.6591 
## F-statistic: 171.1 on 1 and 87 DF,  p-value: < 2.2e-16
summary(model2)
## 
## Call:
## lm(formula = Years ~ `Ozone Thickness` + lag1 + lag2 + lag3, 
##     data = inputData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25.9669  -5.7752   0.1809   6.5748  23.0788 
## 
## Coefficients:
##                    Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)       1949.3604     1.7120 1138.677  < 2e-16 ***
## `Ozone Thickness`   -7.5343     0.3949  -19.081  < 2e-16 ***
## lag1                11.1760     1.3552    8.247 2.27e-12 ***
## lag2               -10.9663     1.6203   -6.768 1.80e-09 ***
## lag3                 3.5864     0.7102    5.050 2.63e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.06 on 82 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.8173, Adjusted R-squared:  0.8084 
## F-statistic: 91.71 on 4 and 82 DF,  p-value: < 2.2e-16
par(mfrow=c(1,2))
d <- density(model1$residuals) # returns the density data
e <- density(model2$residuals) # returns the density data 
plot(d, main = "Model 1 residuals", col="red")
plot(e, main = "Model 2 residuals", col="red")

# Conclusions

A comparison of both models indicates the possibility that perhaps model 2 is the better model. My conclusion in this case is based on primarily the aligning of points along the diagonal line in the Q-Q plots. Essentially I’ve tried to close up a majority of these points as compared to model 1, hopefully producing a better fit.

The closing up of these points as in model 2 is an attempt to reduce the size of the residuals for that particular model. Also of note is the Multiple R-squared and the Adjusted R-squared of each model, model 2 indicates a better fit from the figures shown.

Finally, from above a look at a density plot for either, although either could be considered as approaching a normal distribution, I prefer model 2, as I think that this of the two is slightly closer to a normal distribution.

Model 2 is my choice, with the accompanying model 2 predictions.