# ************************* Install packages and load data ************************************
# Install packages (only needed once; commented out here)

# install.packages("readxl")
# install.packages("ggplot2") # I already have ggplot2 installed; if you do not have these packages yet, run these lines after removing the hash at the beginning

library(ggplot2)
library(readxl)

# Set directory for data file

# load data Table B.4 for Exercise 2.2
AppendixB_4 <- read_excel("C:/Users/nasri/OneDrive/Desktop/TIME SERIES/HomeWork/DATA SET/AppendixB_datafile.xls", 
    sheet = "B.4-BLUE", skip = 3)

# Convert to a data frame
my.data1 <- as.data.frame(AppendixB_4)
colnames(my.data1) # Check the column names before replacing them with shorter ones
## [1] "Year"                     "Production, thousand lbs"
colnames(my.data1) = c("Year", "Production") # Rename the columns with short names that are easier to type in code

# load data Table B.10 for Exercises 2.10-2.12
AppendixB_10 <- read_excel("C:/Users/nasri/OneDrive/Desktop/TIME SERIES/HomeWork/DATA SET/AppendixB_datafile.xls", 
    sheet = "B.10-FLOWN", skip = 3)
# Convert to a data frame
my.data2 <- as.data.frame(AppendixB_10)
colnames(my.data2) # Check the column names before replacing them with shorter ones
## [1] "Month"              "Miles, in Millions"
colnames(my.data2) = c("Month", "Miles") # Rename the columns with short names that are easier to type in code
colnames(my.data2)
## [1] "Month" "Miles"
# *********************************************************************************************
# Function : variogram
# Purpose  : Estimate the sample variogram of a time series
# Input    : x (time series as a numeric vector) and lag (maximum lag)
# Output   : Variogram (lag values and the corresponding variogram values)
# ********************************************************************************************
variogram <- function(x, lag) {
  Lag <- NULL
  vark <- NULL
  vario <- NULL
  for (k in 1:lag) {
    Lag[k] <- k
    vark[k] <- sd(diff(x, k))^2     # sample variance of the lag-k differences
    vario[k] <- vark[k] / vark[1]   # standardize by the lag-1 variance
  }
  return(as.data.frame(cbind(Lag, vario)))
}
#*********************************************************************************************
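The function computes the standardized variogram G_k = Var(y_{t+k} - y_t) / Var(y_{t+1} - y_t) for k = 1, ..., lag, using sd(diff(x, k))^2 as the sample variance of the lag-k differences. As a quick sanity check (a minimal sketch on simulated data, not part of the exercise), the function can be applied to a random walk, whose variogram should keep increasing with the lag:

# Illustration only: variogram of a simulated random walk (a non-stationary series)
set.seed(123)                # arbitrary seed chosen for reproducibility
rw <- cumsum(rnorm(100))     # simulated random walk of length 100
variogram(rw, lag = 10)      # variogram values for lags 1 to 10 should grow with the lag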

Removal of Trend using First Difference

Consider the data on US production of blue and gorgonzola cheeses in Table B.4. b. Take the first difference of the time series, then find the sample autocorrelation function and the variogram. What conclusions can you draw about the structure and behavior of the time series?

Answer: The time series of US production of blue and gorgonzola cheeses appears to be non-stationary, with a clear upward trend.

p <- ggplot (my.data1, aes(x= Year, y=Production))
p + geom_point()+
  geom_line()+
  geom_smooth(method = "lm", se= FALSE, col="red", linetype = 2)+
  labs(x = "Year", y = "Production, thousand lbs", title = "Variation of US cheese production from 1950-1997")+
  theme_classic()+
  theme(plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        legend.position = "bottom")

Figure 1: Variation of US blue and gorgonzola cheese production from 1950-1997. The red dashed line shows the fitted regression line.

# Prepare a new data set of first differences (d = 1) to remove the trend
nrc<-dim(my.data1)[1] # Number of rows; dim(my.data1)[1] gives rows and dim(my.data1)[2] gives columns

my.data1[2:nrc,1]     # Years from the second row to the last row (rows 2 to 48), i.e. 47 values from column 1
##  [1] 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965
## [16] 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980
## [31] 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
## [46] 1996 1997


diff(my.data1[,2])   # First differences of column 2 (Production): consecutive differences of 48 values give 47 differences
##  [1] -2206  5432 -1329   -35   528   616   201   583  1263  2459  1036 -1698
## [13]   893  1400  2200  1198 -1625   802  1657  2218  1969  3330  1210 -1497
## [25]   244  5379   891   571  -719 -1585 -2829   799   483  2619  -682   765
## [37]  1665  1926 -3228  1873 -2063 -1064   -12  3219    79  1718  4462

# Combine the years (1951 onward) and the first differences into two columns of a new data frame: dmy.data1
dmy.data1<-cbind(my.data1[2:nrc,1],diff(my.data1[,2]))

dmy.data1 <- as.data.frame(dmy.data1)

# Fit a linear regression of the differenced series on time to check for any remaining relation with time. We expect the fitted line to be nearly horizontal (no trend) and the resulting residuals to be approximately normally distributed.
dmy.data1_fit <- lm(V2 ~ V1, data = dmy.data1)  # Fit the model
summary(dmy.data1_fit)  # Report the results
## 
## Call:
## lm(formula = V2 ~ V1, data = dmy.data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3985.1 -1104.1    46.5  1035.7  4699.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -568.0902 41276.2885  -0.014    0.989
## V1              0.6663    20.9095   0.032    0.975
## 
## Residual standard error: 1944 on 45 degrees of freedom
## Multiple R-squared:  2.256e-05,  Adjusted R-squared:  -0.0222 
## F-statistic: 0.001015 on 1 and 45 DF,  p-value: 0.9747
# Plot the differenced series (d = 1) with its fitted trend line
p <- ggplot (dmy.data1, aes(x= V1, y=V2))
p + geom_point()+
  geom_line()+
  geom_smooth(method = "lm", se= FALSE, col="red", linetype = 2)+
  labs(x = " ", y = "Cheese production, d = 1", title = "Variation of US cheese production from 1950-1997 for d = 1")+
  theme_classic()+
  theme(plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        legend.position = "bottom")

Figure 2: Variation of US cheese production from 1950-1997 after taking first differences (d = 1). The red dashed line shows the fitted regression line.

# Autocorrelation function code
acf(dmy.data1[,2], lag.max=47,type="correlation", main=" ", ylab= "Autocorrelation")

Figure 3: Sample autocorrelation function (ACF) for the US production of blue and gorgonzola cheeses, 1950-1997, after taking first differences (d = 1).

# Call the variogram function on column 2 of dmy.data1 (the differenced series), using half the series length as the maximum lag

variogram_production <- as.data.frame(variogram(dmy.data1[,2], length (dmy.data1[ ,2])/2))
dim(variogram_production)
## [1] 23  2
plot(variogram_production)

ggplot(variogram_production, aes(x=Lag, y=vario))+
  geom_col() +
  coord_flip()+
  labs(x = "Lag", y = "Variogram", title = "Variogram of US cheeses production for d=1")+
  theme_classic()+
  theme(plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        legend.position = "bottom")

Figure 4: Variogram for the US production of blue and gorgonzola cheeses, 1950-1997, after taking first differences (d = 1).

### Diagnostic plots

We can check whether a model fits the data well in many different ways. We usually pay close attention to regression summaries, such as slope coefficients, p-values, or the R-squared value, which tell us how well a model represents the given data, but these alone are not enough. Residuals show how poorly a model represents the data: they are what is left of the outcome variable after fitting the model, and they can reveal patterns in the data that the fitted model does not explain. Using the information in the diagnostic plots, we can check whether the linear regression assumptions are met and improve the model in an exploratory way.
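As a quick illustration with the model fitted above (dmy.data1_fit), the residuals and fitted values that these diagnostic plots are built from can be extracted with base R accessors; a minimal sketch:

res  <- residuals(dmy.data1_fit)   # leftover variation in the outcome after the linear fit
fits <- fitted(dmy.data1_fit)      # values predicted by the fitted model
head(cbind(fits, res))             # first few fitted-value/residual pairs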

Residuals vs Fitted

This plot shows if residuals have non-linear patterns. There could be a non-linear relationship between predictor variables and an outcome variable and the pattern could show up in this plot if the model doesn’t capture the non-linear relationship. If we find equally spread residuals around a horizontal line without distinct patterns, that is a good indication we don’t have non-linear relationships.

Normal Q-Q

This plot shows if residuals are normally distributed. Do residuals follow a straight line well or do they deviate severely? It’s good if residuals are lined well on the straight dashed line.

Scale-Location

It’s also called Spread-Location plot. This plot shows if residuals are spread equally along the ranges of predictors. This is how we can check the assumption of equal variance (homoscedasticity). It’s good if we see a horizontal line with equally (randomly) spread points.

Residuals vs Leverage

This plot helps us find influential cases, if any. Not all outliers are influential in linear regression analysis. Even though an observation has extreme values, it might not be influential in determining the regression line.
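For reference, these four plots can also be drawn directly from the fitted lm object with R's built-in plot() method; a minimal sketch using the model fitted above:

# which = 1, 2, 3, 5 correspond to Residuals vs Fitted, Normal Q-Q,
# Scale-Location, and Residuals vs Leverage, respectively
par(mfrow = c(2, 2))
plot(dmy.data1_fit, which = c(1, 2, 3, 5))
par(mfrow = c(1, 1))   # reset the plotting layout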

The residual diagnostic plots for the fitted linear model are constructed below:

par(mfrow=c(2,2))   # 2 x 2 panel of residual plots

# Normal probability plot of the residuals
qqnorm(dmy.data1_fit$res, datax=TRUE, pch=16, xlab='Residual', main='')
qqline(dmy.data1_fit$res, datax=TRUE)

# Residuals versus fitted values
plot(dmy.data1_fit$fit, dmy.data1_fit$res, pch=16, xlab='Fitted Value', ylab='Residual')
abline(h=0)

# Histogram of the residuals
hist(dmy.data1_fit$res, col="gray", xlab='Residual', main='')

# Residuals in observation order
plot(dmy.data1_fit$res, type="l", xlab='Observation Order', ylab='Residual', col="red")
points(dmy.data1_fit$res, pch=16, cex=.5)
abline(h=0)

Figure 5: Residual plots for the linear model fitted to US cheese production after taking first differences (d = 1).

Exercise 2.11

Reconsider the data on the number of airline miles flown in the United Kingdom from Exercise 2.10. Take the natural logarithm of the data and plot this new time series. a. What impact has the log transformation had on the time series? b. Find the autocorrelation function for this time series. c. Interpret the sample autocorrelation function.

Answer: The time series of the number of airline miles flown in the United Kingdom appears to be non-stationary, with an upward trend and pronounced seasonal variation.

p <- ggplot (my.data2, aes(x= Month, y=Miles))
p + geom_point()+
  geom_line()+
  geom_smooth(method = "lm", se= FALSE, col="red", linetype = 2)+
  labs(x = "Time", y = "Miles Flown, in Millions", title = "Variation of monthly number of airline miles flown in the United Kingdom")+
  theme_classic()+
  theme(plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        legend.position = "bottom")

Figure 6: Variation of the number of airline miles flown in the United Kingdom. The red dashed line shows the fitted regression line.

# Transform the data by taking the natural log *************************************************
my.data2[,2]
##  [1]  7.269  6.775  7.819  8.371  9.069 10.248 11.030 10.882 10.333  9.109
## [11]  7.685  7.682  8.350  7.829  8.829  9.948 10.638 11.253 11.424 11.391
## [21] 10.665  9.396  7.775  7.933  8.186  7.444  8.484  9.864 10.252 12.282
## [31] 11.637 11.577 12.417  9.637  8.094  9.280  8.334  7.899  9.994 10.078
## [41] 10.801 12.953 12.222 12.246 13.281 10.366  8.730  9.614  8.639  8.772
## [51] 10.894 10.455 11.179 10.588 10.794 12.770 13.812 10.857  9.290 10.925
## [61]  9.491  8.919 11.607  8.852 12.537 14.759 13.667 13.731 15.110 12.185
## [71] 10.645 12.161 10.840 10.436 13.589 13.402 13.103 14.933 14.147 14.057
## [81] 16.234 12.389 11.594 12.772
my.data2[,3] <- logb(my.data2[,2], base = exp(1)) # natural log; equivalent to log(my.data2[,2])
my.data2[,3]
##  [1] 1.983619 1.913239 2.056557 2.124773 2.204862 2.327083 2.400619 2.387110
##  [9] 2.335343 2.209263 2.039270 2.038880 2.122262 2.057835 2.178042 2.297372
## [17] 2.364432 2.420635 2.435716 2.432824 2.366967 2.240284 2.050913 2.071031
## [25] 2.102425 2.007408 2.138182 2.288892 2.327473 2.508135 2.454190 2.449020
## [33] 2.519067 2.265610 2.091123 2.227862 2.120344 2.066736 2.301985 2.310355
## [41] 2.379639 2.561327 2.503238 2.505199 2.586334 2.338531 2.166765 2.263220
## [49] 2.156287 2.171565 2.388212 2.347080 2.414037 2.359721 2.378990 2.547099
## [57] 2.625538 2.384810 2.228939 2.391054 2.250344 2.188184 2.451608 2.180643
## [65] 2.528684 2.691853 2.614984 2.619656 2.715357 2.500206 2.365090 2.498234
## [73] 2.383243 2.345261 2.609261 2.595404 2.572841 2.703574 2.649503 2.643120
## [81] 2.787108 2.516809 2.450488 2.547255
colnames(my.data2) = c("Month", "Miles", "ln_Miles") 
p <- ggplot (my.data2, aes(x= Month, y=ln_Miles))
p + geom_point()+
  geom_line()+
  geom_smooth(method = "lm", se= FALSE, col="red", linetype = 2)+
  labs(x = "Time", y = "ln(Miles in Millions)", title = "Variation of monthly number of airline miles flown in the United Kingdom")+
  theme_classic()+
  theme(plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        legend.position = "bottom")

Figure 7: Variation of the number of airline miles flown in the United Kingdom after the natural log transform. The red dashed line shows the fitted regression line.

# Autocorrelation function code
acf(my.data2[,3], lag.max=42,type="correlation", main=" ", ylab= "Autocorrelation")

Figure 8: Sample autocorrelation function (ACF) for the number of airline miles flown in the United Kingdom after the natural log transform.

# Column 3 is used for the ACF because it stores the log-transformed data

Exercise 2.12

Reconsider the data on the number of airline miles flown in the United Kingdom from Exercises 2.10 and 2.11. Take the first difference of the natural logarithm of the data and plot this new time series. a. What impact has differencing had on the time series? b. Find the autocorrelation function for this time series. c. Interpret the sample autocorrelation function.
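The step-by-step construction below can also be collapsed into a single call; a minimal sketch, assuming my.data2 as loaded above:

d_ln_miles <- diff(log(my.data2$Miles))   # first difference of the natural log of the miles series
length(d_ln_miles)                        # 84 observations give 83 differences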

# Prepare a new data set of first differences (d = 1)
nrc<-dim(my.data2)[1] # Number of rows; dim(my.data2)[1] gives rows and dim(my.data2)[2] gives columns

Time <- my.data2[2:nrc,1]     # Months from the second row to the last row (rows 2 to 84), i.e. 83 values from column 1


Differences <- diff(my.data2[,3])   # First differences of the log-transformed series (column 3): 84 observations give 83 differences

# Combine Time and Differences into two columns of a new data frame: dmy.data2
dmy.data2<-cbind.data.frame(Time, Differences)


# Fit a linear regression of the differenced series on time to check for any remaining relation with time. We expect the fitted line to be nearly horizontal (no trend) and the resulting residuals to be approximately normally distributed.
dmy.data2_fit <- lm(Differences ~ Time, data = dmy.data2)  # Fit the model
summary(dmy.data2_fit)  # Report the results
## 
## Call:
## lm(formula = Differences ~ Time, data = dmy.data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27461 -0.07800  0.00113  0.09447  0.34455 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  2.287e-03  2.365e-02   0.097    0.923
## Time        -5.704e-11  2.341e-10  -0.244    0.808
## 
## Residual standard error: 0.1344 on 81 degrees of freedom
## Multiple R-squared:  0.0007325,  Adjusted R-squared:  -0.0116 
## F-statistic: 0.05938 on 1 and 81 DF,  p-value: 0.8081
# Plot the differenced series (d = 1) with its fitted trend line
p <- ggplot (dmy.data2, aes(x= Time, y=Differences))
p + geom_point()+
  geom_line()+
  geom_smooth(method = "lm", se= FALSE, col="red", linetype = 2)+
  labs(x = " ", y = "ln (Miles Flown, in millions), d =1", title = " ")+
  theme_classic()+
  theme(plot.title = element_text(size = rel(1.2), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        legend.position = "bottom")

Figure 9: Variation of the number of airline miles flown in the United Kingdom after the natural log transform and first differencing (d = 1). The red dashed line shows the fitted regression line.

# Autocorrelation function code
acf(dmy.data2[,2], lag.max=42,type="correlation", main=" ", ylab= "Autocorrelation")

Figure 10: Sample autocorrelation function (ACF) for the number of airline miles flown in the United Kingdom after the log transform and first differencing (d = 1).

# Column 2 of dmy.data2 is used for the ACF because it stores the first differences (d = 1) of the log-transformed data