library(zoo)

## Warning: package 'zoo' was built under R version 3.5.2

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(tidyverse)

## -- Attaching packages ----------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.0.0       v purrr   0.3.1  
## v tibble  2.0.1       v dplyr   0.8.0.1
## v tidyr   0.8.2       v stringr 1.3.1  
## v readr   1.3.1       v forcats 0.3.0

## Warning: package 'tibble' was built under R version 3.5.2

## Warning: package 'tidyr' was built under R version 3.5.2

## Warning: package 'readr' was built under R version 3.5.2

## Warning: package 'purrr' was built under R version 3.5.3

## Warning: package 'dplyr' was built under R version 3.5.3

## Warning: package 'stringr' was built under R version 3.5.2

## Warning: package 'forcats' was built under R version 3.5.2

## -- Conflicts -------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readxl)

## Warning: package 'readxl' was built under R version 3.5.2

library(corrgram)

## Warning: package 'corrgram' was built under R version 3.5.2

library(tseries)

## Warning: package 'tseries' was built under R version 3.5.2

library(urca)

## Warning: package 'urca' was built under R version 3.5.2

library(forecast)

## Warning: package 'forecast' was built under R version 3.5.2

library(ggplot2)

Import the File

TimeSeries<-read_excel("C:/Users/bachh/OneDrive/Desktop/Textbooks/TBANLT 540 (Regression)/Salary_perc.xlsx")

#TimeSeries$Year <- ts(TimeSeries$Year)
summary(TimeSeries)

##      Year               Total       WhiteEarnings  
##  Length:39          Min.   :62.30   Min.   :61.70  
##  Class :character   1st Qu.:70.15   1st Qu.:68.85  
##  Mode  :character   Median :76.40   Median :75.80  
##                     Mean   :75.34   Mean   :74.58  
##                     3rd Qu.:80.60   3rd Qu.:79.90  
##                     Max.   :82.50   Max.   :82.10  
##  AfricanAmericanEarnings HispanicLatinoEarnings
##  Min.   :74.40           Min.   :71.70         
##  1st Qu.:83.95           1st Qu.:84.90         
##  Median :86.80           Median :87.70         
##  Mean   :86.22           Mean   :85.70         
##  3rd Qu.:89.35           3rd Qu.:89.05         
##  Max.   :93.70           Max.   :91.10

head(TimeSeries)

## # A tibble: 6 x 5
##   Year  Total WhiteEarnings AfricanAmericanEarnings HispanicLatinoEarnings
##   <chr> <dbl>         <dbl>                   <dbl>                  <dbl>
## 1 1979   62.3          61.7                    74.4                   71.7
## 2 1980   64.2          63.4                    75.8                   73.5
## 3 1981   64.4          63.1                    76.9                   75.7
## 4 1982   65.7          64.5                    78.1                   75.5
## 5 1983   66.5          65.6                    78.9                   78.5
## 6 1984   67.6          66.8                    79.5                   77.7

str(TimeSeries)

## Classes 'tbl_df', 'tbl' and 'data.frame':    39 obs. of  5 variables:
##  $ Year                   : chr  "1979" "1980" "1981" "1982" ...
##  $ Total                  : num  62.3 64.2 64.4 65.7 66.5 67.6 68.1 69.5 69.8 70.2 ...
##  $ WhiteEarnings          : num  61.7 63.4 63.1 64.5 65.6 66.8 67.2 67.9 68.2 68.4 ...
##  $ AfricanAmericanEarnings: num  74.4 75.8 76.9 78.1 78.9 79.5 82.6 82.8 84.4 82.8 ...
##  $ HispanicLatinoEarnings : num  71.7 73.5 75.7 75.5 78.5 77.7 77.7 80.6 82 84.4 ...

Change data type to Timeseries and create three vectors we will work on.

#tell R data is time series data
TSWhiteEarnings<-zoo(TimeSeries$WhiteEarnings, order.by = TimeSeries$Year)
TSHispanicLatinoEarnings<-zoo(TimeSeries$HispanicLatinoEarnings, order.by = TimeSeries$Year)
TSAfricanAmericanEarnings<-zoo(TimeSeries$AfricanAmericanEarnings, order.by = TimeSeries$Year)

Understand the characteristics of each key attributes by creating line plots.

## plots for timeseries data
ggplot(data = TimeSeries, aes(x = TimeSeries$Year, y =TimeSeries$WhiteEarnings))+ geom_line()

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

ggplot(data = TimeSeries, aes(x = TimeSeries$Year, y =TimeSeries$HispanicLatinoEarnings)) + geom_line()

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

ggplot(data = TimeSeries, aes(x = TimeSeries$Year, y =TimeSeries$AfricanAmericanEarnings)) + geom_line()

## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?

Test for Stationality

Conduct ADF test and KPSS test to make sure stationality for tswhite, tshisp, and tsblack.

TSWhiteEarnings

Here are the tests for TSWhiteEarnings:

#test for stationarity for TSWhiteEarnings
adf.test(TSWhiteEarnings)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  TSWhiteEarnings
## Dickey-Fuller = -1.7449, Lag order = 3, p-value = 0.6733
## alternative hypothesis: stationary

The p-value of adf test is 0.673, which is larger than 0.05. Therefore, we fail to reject the null hypothesis. Thus, there is no stationarity.

#test for trend stationarity for TSWhiteEarnings
kpss.test(TSWhiteEarnings, null = "Trend")

## 
##  KPSS Test for Trend Stationarity
## 
## data:  TSWhiteEarnings
## KPSS Trend = 0.21539, Truncation lag parameter = 3, p-value =
## 0.01023

The p-value of KPSS test is 0.01023, which is lower than 0.05. Therefore, we reject the null hypothesis. Thus, there is no trend stationarity.

Based on the ADF and KPSS tests, exponential smoothing will not be applied. But, we can apply AUTO ARIMA model.

TSHispanicLatinoEarnings

Here are the tests for TSHispanicLatinoEarnings:

#test for stationarity for TSHispanicLatinoEarnings
adf.test(TSHispanicLatinoEarnings)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  TSHispanicLatinoEarnings
## Dickey-Fuller = -2.0185, Lag order = 3, p-value = 0.5661
## alternative hypothesis: stationary

The p-value of adf test is 0.5661, which is larger than 0.05. Therefore, we fail to reject the null hypothesis. Thus, there is no stationarity.

#test for stationarity for TSHispanicLatinoEarnings
kpss.test(TSHispanicLatinoEarnings, null = "Trend")

## 
##  KPSS Test for Trend Stationarity
## 
## data:  TSHispanicLatinoEarnings
## KPSS Trend = 0.21018, Truncation lag parameter = 3, p-value =
## 0.01218

The p-value of KPSS test is 0.01218, which is lower than 0.05. Therefore, we reject the null hypothesis. Thus, there is no trend stationarity.

Based on the ADF and KPSS tests, exponential smoothing will not be applied. But, we can apply AUTO ARIMA model.

TSAfricanAmericanEarnings

Here are the tests for TSAfricanAmericanEarnings:

#test for stationarity for TSAfricanAmericanEarnings
adf.test(TSAfricanAmericanEarnings)

## 
##  Augmented Dickey-Fuller Test
## 
## data:  TSAfricanAmericanEarnings
## Dickey-Fuller = -2.5312, Lag order = 3, p-value = 0.3654
## alternative hypothesis: stationary

The p-value of adf test is 0.3654, which is larger than 0.05. Therefore, we fail to reject the null hypothesis. Thus, there is no stationarity.

#test for stationarity for tsblack
kpss.test(TSAfricanAmericanEarnings, null = "Trend")

## 
##  KPSS Test for Trend Stationarity
## 
## data:  TSAfricanAmericanEarnings
## KPSS Trend = 0.14905, Truncation lag parameter = 3, p-value =
## 0.04746

The p-value of KPSS test is 0.04746, which is lower than 0.05. Therefore, we reject the null hypothesis. Thus, there is no trend stationarity.

Based on the ADF and KPSS tests, exponential smoothing will not be applied. But, we can apply AUTO ARIMA model.

Create the Correlograms

TSWhiteEarnings

#ACF for TSWhiteEarnings
par(mfrow=c(1,2))
acf(TSWhiteEarnings)
pacf(TSWhiteEarnings)

Based on the correlograms above, autoregressive model (p=1, d= 0, q = 0) seems to be appropriate. This is because ACF tails off, and PACF cuts off after 1 lag. However, at the same time, there may be some seasonal trend because there is slightly sign curve on the PACF. Therefore, AUTO ARIMA will be used.

TSHispanicLatinoEarnings

#ACF for TSHispanicLatinoEarnings
par(mfrow=c(1,2))
acf(TSHispanicLatinoEarnings)
pacf(TSHispanicLatinoEarnings)

Based on the correlograms above, SARIMA seems to be appropriate. This is because ACF tails off, and PACF cuts off after 1 lag. In addition, there may exist some seasonal trends because there is a slight sign curve on the PACF. Therefore, AUTO ARIMA will be used.

TSAfricanAmericanEarnings

#ACF for TSAfricanAmericanEarnings
par(mfrow=c(1,2))
acf(TSAfricanAmericanEarnings)
pacf(TSAfricanAmericanEarnings)

Model Building and Forecasting

Based on the analysis we have conducted above, we will apply AUTO ARIMA to build models.

TSWhiteEarnings

Here is the model for TSWhiteEarnings

#Build model for tswhite
WhiteArima <- auto.arima(TSWhiteEarnings)
WhiteArima

## Series: TSWhiteEarnings 
## ARIMA(0,1,1) with drift 
## 
## Coefficients:
##          ma1   drift
##       0.2753  0.5418
## s.e.  0.1653  0.1943
## 
## sigma^2 estimated as 0.9407:  log likelihood=-51.77
## AIC=109.54   AICc=110.24   BIC=114.45

Here is the forecast of the WhiteArima model.

#Forecasting using AUTO ARIMA - TSWhiteEarnings
WhiteArimaForecast <- forecast(WhiteArima, h = 10)
plot(WhiteArimaForecast)

TSHispanicLatinoEarnings

Here is the model for TSHispanicLatinoEarnings

#Build model for TSHispanicLatinoEarnings
HispanicLatinoArima <- auto.arima(TSHispanicLatinoEarnings)
HispanicLatinoArima

## Series: TSHispanicLatinoEarnings 
## ARIMA(1,2,1) 
## 
## Coefficients:
##           ar1      ma1
##       -0.4495  -0.7310
## s.e.   0.1735   0.1817
## 
## sigma^2 estimated as 2.839:  log likelihood=-71.56
## AIC=149.11   AICc=149.84   BIC=153.95

Here is the forecast of the HispanicLatinoArima model.

#Forecasting using AUTO ARIMA - TSHispanicLatinoEarnings
HispanicLatinoArimaForecast <- forecast(HispanicLatinoArima, h = 10)
plot(HispanicLatinoArimaForecast)

TSAfricanAmericanEarnings

Here is the model for TSAfricanAmericanEarnings

#Build model for TSAfricanAmericanEarnings
AfricanAmericanArima <- auto.arima(TSAfricanAmericanEarnings)
AfricanAmericanArima

## Series: TSAfricanAmericanEarnings 
## ARIMA(0,1,0) with drift 
## 
## Coefficients:
##        drift
##       0.4763
## s.e.  0.2840
## 
## sigma^2 estimated as 3.147:  log likelihood=-75.2
## AIC=154.39   AICc=154.74   BIC=157.67

Here is the forecast of the WhiteArima model.

#Forecasting using AUTO ARIMA - TSAfricanAmericanEarnings
AfricanAmericanArimaForecast <- forecast(AfricanAmericanArima, h = 10)
plot(AfricanAmericanArimaForecast)

Resisual Plot

Here are the residual plots for TSWhiteEarnings, TSHispanicLatinoEarnings, and TSAfricanAmericanEarnings

TSWhiteEarnings

par(mfrow=c(1,2))
#Residual plots for Rain
acf(ts(WhiteArima$residuals), main='ACF Residual - Full')
pacf(ts(WhiteArima$residuals), main='PACF Residual - Full')

Interpretation of the Residual Plot for WhiteARIMA: In both ACF and PACF residual plots, there is only one line that goes over blue lines. This means that, these residuals are not significantly different from 0. In orther words, the model predicts accurately.
Considering all the information above, we can consider this model tends to be reliable.

To be sure, based on the residual plots, some non-linearlities seem to exist. We can consider this phenomenon due to the characteristics of the data iteself, not seasonal pattern. Therefore, as for the next step, if this characteristics need to be clarified, we need to ask labor economists questions about labor theroies.

TSHispanicLatinoEarnings

par(mfrow=c(1,2))
#Residual plots for Rain
acf(ts(HispanicLatinoArima$residuals), main='ACF Residual - Full')
pacf(ts(HispanicLatinoArima$residuals), main='PACF Residual - Full')

Interpretation of the Residual Plot for HispanicLatinoArima: In both ACF and PACF residual plots, there is only one line that goes over blue lines. This means that, these residuals are not significantly different from 0. In orther words, the model predicts accurately.
Considering this, we can consider this model tends to be reliable.

TSAfricanAmericanEarnings

par(mfrow=c(1,2))
#Residual plots for Rain
acf(ts(AfricanAmericanArima$residuals), main='ACF Residual - Full')
pacf(ts(AfricanAmericanArima$residuals), main='PACF Residual - Full')

Interpretation of the Residual Plot for AfricanAmericanArima: In both ACF and PACF residual plots, there is only one line that goes over blue lines. This means that, these residuals are not significantly different from 0. In orther words, the model predicts accurately.
Considering all the information above, we can consider this model tends to be reliable.

To be sure, based on the residual plots above, some non-linearlities seem to exist. We can consider this phenomenon due to the characteristics of the data iteself, not seasonal pattern. Therefore, as for the next step, if this characteristics need to be clarified, we need to ask labor economists questions about labor theroies.

MenWomenSalary

Test for Stationality

TSWhiteEarnings

TSHispanicLatinoEarnings

TSAfricanAmericanEarnings

Create the Correlograms

TSWhiteEarnings

TSHispanicLatinoEarnings

TSAfricanAmericanEarnings

Model Building and Forecasting

TSWhiteEarnings

TSHispanicLatinoEarnings

TSAfricanAmericanEarnings

Resisual Plot

TSWhiteEarnings

TSHispanicLatinoEarnings

TSAfricanAmericanEarnings