library(zoo)
## Warning: package 'zoo' was built under R version 3.5.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.3.1
## v tibble 2.0.1 v dplyr 0.8.0.1
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'tidyr' was built under R version 3.5.2
## Warning: package 'readr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
## Warning: package 'readxl' was built under R version 3.5.2
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.5.2
library(tseries)
## Warning: package 'tseries' was built under R version 3.5.2
library(urca)
## Warning: package 'urca' was built under R version 3.5.2
library(forecast)
## Warning: package 'forecast' was built under R version 3.5.2
library(ggplot2)
Import the File
TimeSeries<-read_excel("C:/Users/bachh/OneDrive/Desktop/Textbooks/TBANLT 540 (Regression)/Salary_perc.xlsx")
#TimeSeries$Year <- ts(TimeSeries$Year)
summary(TimeSeries)
## Year Total WhiteEarnings
## Length:39 Min. :62.30 Min. :61.70
## Class :character 1st Qu.:70.15 1st Qu.:68.85
## Mode :character Median :76.40 Median :75.80
## Mean :75.34 Mean :74.58
## 3rd Qu.:80.60 3rd Qu.:79.90
## Max. :82.50 Max. :82.10
## AfricanAmericanEarnings HispanicLatinoEarnings
## Min. :74.40 Min. :71.70
## 1st Qu.:83.95 1st Qu.:84.90
## Median :86.80 Median :87.70
## Mean :86.22 Mean :85.70
## 3rd Qu.:89.35 3rd Qu.:89.05
## Max. :93.70 Max. :91.10
head(TimeSeries)
## # A tibble: 6 x 5
## Year Total WhiteEarnings AfricanAmericanEarnings HispanicLatinoEarnings
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1979 62.3 61.7 74.4 71.7
## 2 1980 64.2 63.4 75.8 73.5
## 3 1981 64.4 63.1 76.9 75.7
## 4 1982 65.7 64.5 78.1 75.5
## 5 1983 66.5 65.6 78.9 78.5
## 6 1984 67.6 66.8 79.5 77.7
str(TimeSeries)
## Classes 'tbl_df', 'tbl' and 'data.frame': 39 obs. of 5 variables:
## $ Year : chr "1979" "1980" "1981" "1982" ...
## $ Total : num 62.3 64.2 64.4 65.7 66.5 67.6 68.1 69.5 69.8 70.2 ...
## $ WhiteEarnings : num 61.7 63.4 63.1 64.5 65.6 66.8 67.2 67.9 68.2 68.4 ...
## $ AfricanAmericanEarnings: num 74.4 75.8 76.9 78.1 78.9 79.5 82.6 82.8 84.4 82.8 ...
## $ HispanicLatinoEarnings : num 71.7 73.5 75.7 75.5 78.5 77.7 77.7 80.6 82 84.4 ...
Change data type to Timeseries and create three vectors we will work on.
#tell R data is time series data
TSWhiteEarnings<-zoo(TimeSeries$WhiteEarnings, order.by = TimeSeries$Year)
TSHispanicLatinoEarnings<-zoo(TimeSeries$HispanicLatinoEarnings, order.by = TimeSeries$Year)
TSAfricanAmericanEarnings<-zoo(TimeSeries$AfricanAmericanEarnings, order.by = TimeSeries$Year)
Understand the characteristics of each key attributes by creating line plots.
## plots for timeseries data
ggplot(data = TimeSeries, aes(x = TimeSeries$Year, y =TimeSeries$WhiteEarnings))+ geom_line()
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
ggplot(data = TimeSeries, aes(x = TimeSeries$Year, y =TimeSeries$HispanicLatinoEarnings)) + geom_line()
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
ggplot(data = TimeSeries, aes(x = TimeSeries$Year, y =TimeSeries$AfricanAmericanEarnings)) + geom_line()
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
Conduct ADF test and KPSS test to make sure stationality for tswhite, tshisp, and tsblack.
Here are the tests for TSWhiteEarnings:
#test for stationarity for TSWhiteEarnings
adf.test(TSWhiteEarnings)
##
## Augmented Dickey-Fuller Test
##
## data: TSWhiteEarnings
## Dickey-Fuller = -1.7449, Lag order = 3, p-value = 0.6733
## alternative hypothesis: stationary
The p-value of adf test is 0.673, which is larger than 0.05. Therefore, we fail to reject the null hypothesis. Thus, there is no stationarity.
#test for trend stationarity for TSWhiteEarnings
kpss.test(TSWhiteEarnings, null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: TSWhiteEarnings
## KPSS Trend = 0.21539, Truncation lag parameter = 3, p-value =
## 0.01023
The p-value of KPSS test is 0.01023, which is lower than 0.05. Therefore, we reject the null hypothesis. Thus, there is no trend stationarity.
Based on the ADF and KPSS tests, exponential smoothing will not be applied. But, we can apply AUTO ARIMA model.
Here are the tests for TSHispanicLatinoEarnings:
#test for stationarity for TSHispanicLatinoEarnings
adf.test(TSHispanicLatinoEarnings)
##
## Augmented Dickey-Fuller Test
##
## data: TSHispanicLatinoEarnings
## Dickey-Fuller = -2.0185, Lag order = 3, p-value = 0.5661
## alternative hypothesis: stationary
The p-value of adf test is 0.5661, which is larger than 0.05. Therefore, we fail to reject the null hypothesis. Thus, there is no stationarity.
#test for stationarity for TSHispanicLatinoEarnings
kpss.test(TSHispanicLatinoEarnings, null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: TSHispanicLatinoEarnings
## KPSS Trend = 0.21018, Truncation lag parameter = 3, p-value =
## 0.01218
The p-value of KPSS test is 0.01218, which is lower than 0.05. Therefore, we reject the null hypothesis. Thus, there is no trend stationarity.
Based on the ADF and KPSS tests, exponential smoothing will not be applied. But, we can apply AUTO ARIMA model.
Here are the tests for TSAfricanAmericanEarnings:
#test for stationarity for TSAfricanAmericanEarnings
adf.test(TSAfricanAmericanEarnings)
##
## Augmented Dickey-Fuller Test
##
## data: TSAfricanAmericanEarnings
## Dickey-Fuller = -2.5312, Lag order = 3, p-value = 0.3654
## alternative hypothesis: stationary
The p-value of adf test is 0.3654, which is larger than 0.05. Therefore, we fail to reject the null hypothesis. Thus, there is no stationarity.
#test for stationarity for tsblack
kpss.test(TSAfricanAmericanEarnings, null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: TSAfricanAmericanEarnings
## KPSS Trend = 0.14905, Truncation lag parameter = 3, p-value =
## 0.04746
The p-value of KPSS test is 0.04746, which is lower than 0.05. Therefore, we reject the null hypothesis. Thus, there is no trend stationarity.
Based on the ADF and KPSS tests, exponential smoothing will not be applied. But, we can apply AUTO ARIMA model.
#ACF for TSWhiteEarnings
par(mfrow=c(1,2))
acf(TSWhiteEarnings)
pacf(TSWhiteEarnings)
Based on the correlograms above, autoregressive model (p=1, d= 0, q = 0) seems to be appropriate. This is because ACF tails off, and PACF cuts off after 1 lag. However, at the same time, there may be some seasonal trend because there is slightly sign curve on the PACF. Therefore, AUTO ARIMA will be used.
#ACF for TSHispanicLatinoEarnings
par(mfrow=c(1,2))
acf(TSHispanicLatinoEarnings)
pacf(TSHispanicLatinoEarnings)
Based on the correlograms above, SARIMA seems to be appropriate. This is because ACF tails off, and PACF cuts off after 1 lag. In addition, there may exist some seasonal trends because there is a slight sign curve on the PACF. Therefore, AUTO ARIMA will be used.
#ACF for TSAfricanAmericanEarnings
par(mfrow=c(1,2))
acf(TSAfricanAmericanEarnings)
pacf(TSAfricanAmericanEarnings)
Based on the correlograms above, SARIMA seems to be appropriate. This is because ACF tails off, and PACF cuts off after 1 lag. In addition, there may exist some seasonal trends because there is a slight sign curve on the PACF. Therefore, AUTO ARIMA will be used.
Based on the analysis we have conducted above, we will apply AUTO ARIMA to build models.
Here is the model for TSWhiteEarnings
#Build model for tswhite
WhiteArima <- auto.arima(TSWhiteEarnings)
WhiteArima
## Series: TSWhiteEarnings
## ARIMA(0,1,1) with drift
##
## Coefficients:
## ma1 drift
## 0.2753 0.5418
## s.e. 0.1653 0.1943
##
## sigma^2 estimated as 0.9407: log likelihood=-51.77
## AIC=109.54 AICc=110.24 BIC=114.45
Here is the forecast of the WhiteArima model.
#Forecasting using AUTO ARIMA - TSWhiteEarnings
WhiteArimaForecast <- forecast(WhiteArima, h = 10)
plot(WhiteArimaForecast)
Here is the model for TSHispanicLatinoEarnings
#Build model for TSHispanicLatinoEarnings
HispanicLatinoArima <- auto.arima(TSHispanicLatinoEarnings)
HispanicLatinoArima
## Series: TSHispanicLatinoEarnings
## ARIMA(1,2,1)
##
## Coefficients:
## ar1 ma1
## -0.4495 -0.7310
## s.e. 0.1735 0.1817
##
## sigma^2 estimated as 2.839: log likelihood=-71.56
## AIC=149.11 AICc=149.84 BIC=153.95
Here is the forecast of the HispanicLatinoArima model.
#Forecasting using AUTO ARIMA - TSHispanicLatinoEarnings
HispanicLatinoArimaForecast <- forecast(HispanicLatinoArima, h = 10)
plot(HispanicLatinoArimaForecast)
Here is the model for TSAfricanAmericanEarnings
#Build model for TSAfricanAmericanEarnings
AfricanAmericanArima <- auto.arima(TSAfricanAmericanEarnings)
AfricanAmericanArima
## Series: TSAfricanAmericanEarnings
## ARIMA(0,1,0) with drift
##
## Coefficients:
## drift
## 0.4763
## s.e. 0.2840
##
## sigma^2 estimated as 3.147: log likelihood=-75.2
## AIC=154.39 AICc=154.74 BIC=157.67
Here is the forecast of the WhiteArima model.
#Forecasting using AUTO ARIMA - TSAfricanAmericanEarnings
AfricanAmericanArimaForecast <- forecast(AfricanAmericanArima, h = 10)
plot(AfricanAmericanArimaForecast)
Here are the residual plots for TSWhiteEarnings, TSHispanicLatinoEarnings, and TSAfricanAmericanEarnings
par(mfrow=c(1,2))
#Residual plots for Rain
acf(ts(WhiteArima$residuals), main='ACF Residual - Full')
pacf(ts(WhiteArima$residuals), main='PACF Residual - Full')
Interpretation of the Residual Plot for WhiteARIMA: In both ACF and PACF residual plots, there is only one line that goes over blue lines. This means that, these residuals are not significantly different from 0. In orther words, the model predicts accurately.
Considering all the information above, we can consider this model tends to be reliable.
To be sure, based on the residual plots, some non-linearlities seem to exist. We can consider this phenomenon due to the characteristics of the data iteself, not seasonal pattern. Therefore, as for the next step, if this characteristics need to be clarified, we need to ask labor economists questions about labor theroies.
par(mfrow=c(1,2))
#Residual plots for Rain
acf(ts(HispanicLatinoArima$residuals), main='ACF Residual - Full')
pacf(ts(HispanicLatinoArima$residuals), main='PACF Residual - Full')
Interpretation of the Residual Plot for HispanicLatinoArima: In both ACF and PACF residual plots, there is only one line that goes over blue lines. This means that, these residuals are not significantly different from 0. In orther words, the model predicts accurately.
Considering this, we can consider this model tends to be reliable.
To be sure, based on the residual plots, some non-linearlities seem to exist. We can consider this phenomenon due to the characteristics of the data iteself, not seasonal pattern. Therefore, as for the next step, if this characteristics need to be clarified, we need to ask labor economists questions about labor theroies.
par(mfrow=c(1,2))
#Residual plots for Rain
acf(ts(AfricanAmericanArima$residuals), main='ACF Residual - Full')
pacf(ts(AfricanAmericanArima$residuals), main='PACF Residual - Full')
Interpretation of the Residual Plot for AfricanAmericanArima: In both ACF and PACF residual plots, there is only one line that goes over blue lines. This means that, these residuals are not significantly different from 0. In orther words, the model predicts accurately.
Considering all the information above, we can consider this model tends to be reliable.
To be sure, based on the residual plots above, some non-linearlities seem to exist. We can consider this phenomenon due to the characteristics of the data iteself, not seasonal pattern. Therefore, as for the next step, if this characteristics need to be clarified, we need to ask labor economists questions about labor theroies.