# Exercise 6
# load data
library("faraway")
data("uswages")
library("car")
## Warning: package 'car' was built under R version 3.3.2
##
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
##
## logit, vif
library("ggplot2")
library("gridExtra")
# manipulating data
# we see that exper has negative values; recode them as missing
uswages$exper[uswages$exper < 0] <- NA
# convert race, smsa, and pt to factor variables
uswages$race <- factor(uswages$race)
levels(uswages$race) <- c("White","Black")
uswages$smsa <- factor(uswages$smsa)
levels(uswages$smsa) <- c("No","Yes")
uswages$pt <- factor(uswages$pt)
levels(uswages$pt) <- c("No","Yes")
# create region, a factor variable based on the four regions ne, mw, so, we
uswages <- data.frame(uswages,
                      region = 1*uswages$ne + 2*uswages$mw +
                               3*uswages$so + 4*uswages$we)
uswages$region <- factor(uswages$region)
levels(uswages$region) <- c("ne","mw","so","we")
# delete the four regions ne, mw, so, we
uswages <- subset(uswages,select=-c(ne:we))
# Take care of NAs
uswages <- na.omit(uswages)
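# As an optional sanity check (a sketch, not part of the original exercise),
# confirm that the cleaning worked: no negative exper values remain, no NAs
# are left, and the new factor variables have the expected levels.
stopifnot(all(uswages$exper >= 0), !anyNA(uswages))
str(uswages)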
# 1. Nonconstant variance
# 1.a. Using the uswages data, fit the model (m):
# wage ~ educ + exper + race + smsa + pt + region
g <- lm(wage~educ+exper+race+smsa+pt+region, data = uswages)
# 1.b. Produce the Residuals vs Fitted plot, and discuss if there may be heteroskedasticity in the error variance.
mod <- fortify(g)
p1 <- qplot(.fitted, .resid, data = mod) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals") +
  geom_smooth(color = "red", se = FALSE)
p1

## Here we use the Residuals vs Fitted plot to detect patterns in the residuals that would indicate nonconstant error variance.
## We see some evidence of heteroskedasticity, in other words nonconstant error variance, as the red smoother is not flat but slightly curved.
# 1.c. Produce the Scale-Location plot, and discuss if there may be heteroskedasticity in the error variance.
p2 <- qplot(.fitted, abs(.resid), data = mod) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Scale-Location", x = "Fitted", y = "|Residuals|") +
  geom_smooth(method = "gam", color = "red", se = FALSE)
p2

## The second plot is the Scale-Location plot, which sharpens the pattern in the residuals by plotting their absolute values.
## We see some evidence of heteroskedasticity, in other words nonconstant error variance.
## We used the gam smoothing method for the Scale-Location plot (appropriate since n > 1000). The slight upward slope of the red line strengthens our suspicion of nonconstant error variance.
grid.arrange(p1, p2, nrow = 2)

# 1.d. Perform the approximate test of nonconstant error variance.
summary(lm(abs(residuals(g)) ~ fitted(g)))
##
## Call:
## lm(formula = abs(residuals(g)) ~ fitted(g))
##
## Residuals:
## Min 1Q Median 3Q Max
## -370.9 -152.6 -51.9 71.8 7367.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.43904 23.36854 3.442 0.000589 ***
## fitted(g) 0.27303 0.03615 7.552 6.53e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 323.9 on 1965 degrees of freedom
## Multiple R-squared: 0.02821, Adjusted R-squared: 0.02771
## F-statistic: 57.03 on 1 and 1965 DF, p-value: 6.528e-14
## We look at the t-test for the slope coefficient, with the null hypothesis that the slope is zero. At the 10% level of significance we conclude that the slope is not zero, since the p-value, 6.528e-14, is far below 0.10; this indicates nonconstant error variance.
## The test is only approximate: the absolute residuals are not independent, so the t-distribution with 1965 degrees of freedom is only a rough reference.
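# As a cross-check on the approximate test (a sketch; this swaps in the score
# test for nonconstant variance from the car package, already loaded above):
ncvTest(g)
# A small p-value here would again indicate nonconstant error variance.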
# 2. Non-normal errors
# 2.a. Plot the Normal Q-Q Plot and Histogram of the residuals from model m in Exercise 1. Do they indicate non-normal errors?
p3 <- qplot(sample = scale(.resid), data = mod) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Untransformed y", y = "Residuals")
p4 <- qplot(scale(.resid), data = mod, geom = "blank") +
  geom_line(aes(y = ..density.., colour = "Empirical"), stat = "density") +
  stat_function(fun = dnorm, aes(colour = "Normal")) +
  geom_histogram(aes(y = ..density..), alpha = 0.4) +
  scale_colour_manual(name = "Density", values = c("red", "blue")) +
  theme(legend.position = c(0.85, 0.85)) +
  labs(title = "Untransformed y", y = "Residuals")
grid.arrange(p3, p4, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## We check the residuals for normality using the normal QQ-plot and see that they do indicate non-normal errors.
## The histogram of the residuals alone is not well suited for detecting nonnormality.
## However, the kernel density estimate compared with the normal density indicates that the residuals could be nonnormal.
## Clearly the residuals of the untransformed model are not normal.
# 2.b. Perform the Shapiro-Wilk test of normality for the residuals of model m. What is the p-value and what does it say about normality?
shapiro.test(residuals(g))
##
## Shapiro-Wilk normality test
##
## data: residuals(g)
## W = 0.71236, p-value < 2.2e-16
## We reject the null hypothesis of normality for the residuals of the untransformed model at the 10% level of significance, since the p-value is far below 0.10.
# 2.c. Find the optimal Box-Cox power transform and apply it to wage, refit model m, replot Normal Q-Q Plot and perform the Shapiro-Wilk test of normality again. Did the Box-Cox Power Transform work?
require(car)
lambda <- powerTransform(g)
lambda
## Estimated transformation parameters
## Y1
## 0.1034019
lam <- lambda$lambda
glam <- lm(wage^lam ~ educ + exper + race + smsa + pt + region, uswages)
modlam <- fortify(glam)
p5 <- qplot(sample = scale(.resid), data = modlam) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Normal QQ-Plot", y = "Residuals Box-Cox-Transform")
p5
shapiro.test(residuals(glam))
##
## Shapiro-Wilk normality test
##
## data: residuals(glam)
## W = 0.96461, p-value < 2.2e-16
## The Box-Cox power transform didn't work: although W improved from 0.71 to 0.96, the Shapiro-Wilk test still concludes that the errors of the transformed model are not normal at the 10% level of significance, since the p-value is far below 0.10.
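# Since the estimated lambda, 0.103, is close to zero, the log transform is a
# natural alternative; a minimal sketch (an assumption, not part of the
# original exercise):
glog <- lm(log(wage) ~ educ + exper + race + smsa + pt + region, data = uswages)
shapiro.test(residuals(glog))
# With n close to 2000, the Shapiro-Wilk test has power to detect even small
# departures from normality, so a rejection here would not by itself make the
# transform useless.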
# 3. Exercise - Influential outliers
# 3.a. Produce the influence plot for model m. Are there any really large CookD values?
influencePlot(g)

## StudRes Hat CookD
## 6591 0.1096679 0.02061630 0.0000281445
## 15387 20.1499155 0.01621022 0.6159365616
## As we can see from the plot, there is at least one really large CookD value: observation 15387 has CookD of about 0.62.
# 3.b. Produce the half-normal plot of the leverage values. Are there any high leverage data points?
wages <- row.names(uswages)
halfnorm(lm.influence(g)$hat, labs = wages, ylab = "Leverages")

## As we can see from the plot, there are high-leverage points present.
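# To flag the high-leverage points numerically (a sketch using the common
# 2p/n rule of thumb, which the exercise itself does not prescribe):
hatv <- lm.influence(g)$hat
p <- length(coef(g)); n <- nrow(uswages)
head(sort(hatv[hatv > 2 * p / n], decreasing = TRUE))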
# 3.c. Produce the half-normal plot of the Cook's distance. Are there any high Cook's distance points?
cook <- cooks.distance(g)
halfnorm(cook, 3, labs = wages, ylab = "Cook's distance")

## As we can see from the plot, there are points with very high Cook's distance present.
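# To identify the flagged points numerically (a sketch), list the largest
# Cook's distances; values standing well clear of the rest merit a closer look:
head(sort(cook, decreasing = TRUE), 3)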
# 3.d. Fit the model excluding the observation with the largest Cook's distance. Do the coefficients change? Are there any coefficients with notable changes?
glam1 <- lm(wage^lam ~ educ + exper + race + smsa + pt + region, uswages, subset = (cook<max(cook)))
# Comparison of the fitted coefficients with and without the worst influential observation.
compareCoefs(glam, glam1)
##
## Call:
## 1: lm(formula = wage^lam ~ educ + exper + race + smsa + pt + region,
## data = uswages)
## 2: lm(formula = wage^lam ~ educ + exper + race + smsa + pt + region,
## data = uswages, subset = (cook < max(cook)))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 1.61e+00 1.48e-02 1.61e+00 1.46e-02
## educ 1.72e-02 8.76e-04 1.76e-02 8.65e-04
## exper 3.03e-03 1.98e-04 2.96e-03 1.95e-04
## raceBlack -4.15e-02 9.50e-03 -3.99e-02 9.36e-03
## smsaYes 3.52e-02 5.88e-03 3.42e-02 5.80e-03
## ptYes -1.97e-01 9.04e-03 -2.03e-01 8.93e-03
## regionmw 1.48e-03 7.31e-03 1.39e-03 7.20e-03
## regionso 1.53e-03 7.01e-03 4.54e-05 6.91e-03
## regionwe 9.63e-03 7.59e-03 9.54e-03 7.48e-03
## The coefficients show some changes, but no coefficient changes notably relative to its standard error.
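# To back the "no notable changes" judgment with numbers (a sketch), compute
# the percentage change in each coefficient after dropping the observation:
round(100 * (coef(glam1) - coef(glam)) / coef(glam), 1)
# Relative changes can look large for coefficients near zero (e.g. regionso),
# so they should be judged against the standard errors shown above.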
# 3.e. Produce the omnibus diagnostic plot for model m. Which observation consistently stands out as an outlier-influential point in all four plots?
oldpar <- par(mfrow = c(2, 2))
plot(glam, main = "US Wages Data")

par(oldpar)
## Observations 25276, 2780, and 15387 consistently stand out as outlier-influential points in all four plots.
# 4. Model structure
# 4.a. Produce the CERES plots for model m. Do the factor variables stop the plots from printing?
ceresPlots(g, terms = ~.)
## Warning in ceresPlots(g, terms = ~.): Factors skipped in drawing CERES
## plots.

## Yes, the factor variables stop the plots from printing.
# 4.b. How many plots are there? Why these?
## Since the factor variables are skipped, there are just two plots, one each for the numeric predictors educ and exper.
# 4.c. Do the plots indicate a polynomial model should be considered?
## Yes, the curvature in the plots suggests that a polynomial model should be considered (see the sketch below).
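# A minimal sketch of such a polynomial model (quadratic terms for the two
# numeric predictors; the degree 2 is an assumption, not given by the plots):
g_poly <- lm(wage ~ poly(educ, 2) + poly(exper, 2) + race + smsa + pt + region,
             data = uswages)
anova(g, g_poly)
# A small p-value in this F-test would support adding the quadratic terms.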
# 5. Interaction model
# 5.a. Fit an interaction model using the region and the two numeric variables. Is the model useful?
# We will fit this model and use it later for comparing it to a larger model.
g_main_effects <- lm(wage ~ educ + exper + region, uswages)
# We will create a factor for the two groups educ < 15 and educ >= 15, call it lt15 (meaning "less than 15"), and add it to the uswages data frame.
uswages$lt15 <- factor(uswages$educ < 15)
levels(uswages$lt15) <- c(0, 1)
g_interaction <- lm(wage ~ lt15 * educ + exper + region, uswages)
anova(g_main_effects, g_interaction)
## Analysis of Variance Table
##
## Model 1: wage ~ educ + exper + region
## Model 2: wage ~ lt15 * educ + exper + region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1961 356452814
## 2 1959 348836000 2 7616813 21.387 6.48e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Yes, the model is useful: the F-test rejects the main-effects model with a p-value (6.48e-10) far below alpha = 0.05.
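# To see the interaction itself (a sketch), inspect the coefficient table of
# the interaction model; the lt15-by-educ interaction row estimates how the
# educ slope differs between the two education groups:
coef(summary(g_interaction))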
# 5.b. Test the interaction model versus model m. What is the p-value and which model does it indicate?
anova(g, g_interaction)
## Analysis of Variance Table
##
## Model 1: wage ~ educ + exper + race + smsa + pt + region
## Model 2: wage ~ lt15 * educ + exper + region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1958 333222495
## 2 1959 348836000 -1 -15613506 91.744 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## The p-value is < 2.2e-16. Model m has the smaller RSS, so the test indicates model m over the interaction model; note, however, that the two models are not nested (Df = -1 above), so this F-test is only a rough guide.
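# Because the two models are not nested, the F-test above is only indicative;
# an information-criterion comparison (a sketch) is a safer way to choose:
AIC(g, g_interaction)
# The model with the smaller AIC is preferred.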
# 6. Collinearity
# 6.a. Find the variance inflation factors for model m.
vif(g)
## GVIF Df GVIF^(1/(2*Df))
## educ 1.114749 1 1.055817
## exper 1.100628 1 1.049108
## race 1.048380 1 1.023904
## smsa 1.026374 1 1.013101
## pt 1.004126 1 1.002061
## region 1.061190 3 1.009948
# 6.b. Do they indicate collinearity in the predictors?
summary(uswages)
## wage educ exper race smsa
## Min. : 50.39 Min. : 0.00 Min. : 0.00 White:1812 No : 483
## 1st Qu.: 314.69 1st Qu.:12.00 1st Qu.: 8.00 Black: 155 Yes:1484
## Median : 522.32 Median :12.00 Median :16.00
## Mean : 613.99 Mean :13.08 Mean :18.74
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00
## Max. :7716.05 Max. :18.00 Max. :59.00
## pt region lt15
## No :1802 ne:448 0: 587
## Yes: 165 mw:488 1:1380
## so:616
## we:415
##
##
x.df <- data.frame(uswages)
new_data <- subset(x.df, select = c(wage, educ, exper))
summary(new_data)
## wage educ exper
## Min. : 50.39 Min. : 0.00 Min. : 0.00
## 1st Qu.: 314.69 1st Qu.:12.00 1st Qu.: 8.00
## Median : 522.32 Median :12.00 Median :16.00
## Mean : 613.99 Mean :13.08 Mean :18.74
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00
## Max. :7716.05 Max. :18.00 Max. :59.00
round(cor(new_data),1)
## wage educ exper
## wage 1.0 0.3 0.2
## educ 0.3 1.0 -0.3
## exper 0.2 -0.3 1.0
## The correlation matrix only detects pairwise collinearity; here the correlations are small (at most 0.3 in absolute value) and the VIFs above are all close to 1, so there is no indication of collinearity in the predictors.
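# Beyond VIFs and pairwise correlations, the condition number of the scaled
# model matrix gives an overall check for collinearity (a sketch; the usual
# rule of thumb flags values above roughly 30):
X <- model.matrix(g)[, -1]
kappa(scale(X), exact = TRUE)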