# Import the jobs market CSV (columns: DATE, WILL5000INDFC, IHLCHGUS)
df_fred <- read_csv(file = "jobs_market.csv")
## Rows: 523 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): WILL5000INDFC
## dbl (1): IHLCHGUS
## date (1): DATE
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# WILL5000INDFC was parsed as character, so coerce it to numeric;
# entries that are not valid numbers become NA (hence the coercion warning)
df_fred[["WILL5000INDFC"]] <- as.numeric(df_fred[["WILL5000INDFC"]])
## Warning: NAs introduced by coercion
# Inspect the column types and the first few values of each column
df_fred %>% glimpse()
## Rows: 523
## Columns: 3
## $ DATE <date> 2020-03-02, 2020-03-03, 2020-03-04, 2020-03-05, 2020-03…
## $ WILL5000INDFC <dbl> 146.51, 142.57, 148.23, 143.30, 140.68, 129.58, 135.71, …
## $ IHLCHGUS <dbl> -1.4, -1.4, -1.6, -1.7, -1.8, -1.9, -2.1, -2.2, -2.3, -2…
# Count the missing values in WILL5000INDFC
df_fred$WILL5000INDFC %>%
  is.na() %>%
  sum()
## [1] 16
# Remove every row that has a missing value in any column
df_fred <- drop_na(df_fred)
# Preview relationship between the independent variable (WILL5000INDFC, x-axis)
# and the dependent variable (IHLCHGUS, y-axis). The formula is the same one
# passed to lm(), so the plot shows the relationship that is being modelled.
plot(IHLCHGUS ~ WILL5000INDFC, data = df_fred)
The scatterplot suggests the linearity assumption is not met, but we may be able to address this later with a seasonal dummy variable, so let’s proceed.
# Fit a simple linear regression of IHLCHGUS on WILL5000INDFC,
# then print the coefficient table and fit statistics
model <- lm(formula = IHLCHGUS ~ WILL5000INDFC, data = df_fred)
summary(model)
##
## Call:
## lm(formula = IHLCHGUS ~ WILL5000INDFC, data = df_fred)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.029 -8.444 -5.301 5.806 44.184
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -104.66377 3.25205 -32.18 <2e-16 ***
## WILL5000INDFC 0.53842 0.01698 31.70 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.64 on 505 degrees of freedom
## Multiple R-squared: 0.6656, Adjusted R-squared: 0.6649
## F-statistic: 1005 on 1 and 505 DF, p-value: < 2.2e-16
The coefficient on the Wilshire 5000 Total Market Full Cap Index is statistically significant (p < 2e-16), and the adjusted R-squared (0.6649) is moderately high.
# Draw the diagnostic plots for the fitted model
plot(model)
Reviewing the Residuals vs Fitted and Normal Q-Q plots, we can conclude that the model violates the homoscedasticity and normality assumptions, respectively.
# Durbin-Watson test on the fitted model. The alternative is spelled out
# (it is the default) to make the one-sided test for positive
# autocorrelation explicit.
dwtest(model, alternative = "greater")
##
## Durbin-Watson test
##
## data: model
## DW = 0.012492, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# Plot the autocorrelation function of the model residuals
acf(x = model$residuals, type = "correlation")
The Durbin-Watson statistic (DW = 0.012, p < 2.2e-16) indicates positive autocorrelation in the residuals, so we can conclude the model violates the independence assumption.