# Exercise 6

# load data
library("faraway")
data("uswages")

library("car")
## Warning: package 'car' was built under R version 3.3.2
## 
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
## 
##     logit, vif
library("ggplot2")
library("gridExtra")


# manipulating data
# exper has some negative values, which are invalid; recode them as NA
uswages$exper[uswages$exper < 0] <- NA

# convert race, smsa, and pt to factor variables
uswages$race <- factor(uswages$race)
levels(uswages$race) <- c("White","Black")
uswages$smsa <- factor(uswages$smsa)
levels(uswages$smsa) <- c("No","Yes")
uswages$pt <- factor(uswages$pt)
levels(uswages$pt) <- c("No","Yes")

# create region, a factor variable based on the four regions ne, mw, so, we
uswages <- data.frame(uswages,
                      region =
                        1*uswages$ne +
                        2*uswages$mw +
                        3*uswages$so +
                        4*uswages$we)
uswages$region <- factor(uswages$region)
levels(uswages$region) <- c("ne","mw","so","we")

# drop the four region indicator columns ne, mw, so, we
uswages <- subset(uswages,select=-c(ne:we))

# remove rows containing NAs
uswages <- na.omit(uswages)
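
# A quick sanity check (a sketch, not part of the exercise): confirm that no
# negative experience values or missing cases remain after the cleanup above.
stopifnot(all(uswages$exper >= 0), !anyNA(uswages))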

# 1. Nonconstant variance

# 1.a. Using the uswages data, fit the model (m):
# wage ~ educ + exper + race + smsa + pt + region

g <- lm(wage~educ+exper+race+smsa+pt+region, data = uswages)

# 1.b. Produce the Residuals vs Fitted plot, and discuss whether there may be heteroskedasticity in the error variance.

mod <- fortify(g)

p1 <- qplot(.fitted, .resid, data = mod) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residuals vs Fitted", x = "Fitted", y = "Residuals") +
  geom_smooth(color = "red", se = FALSE)

p1

## Here we use the Residuals vs Fitted plot to detect patterns in the residuals that would indicate nonconstant error variance.
## We see some evidence of heteroskedasticity, in other words nonconstant error variance, since the red smoothing line is not completely straight but slightly curved.

# 1.c. Produce the Scale-Location plot, and discuss whether there may be heteroskedasticity in the error variance.

p2 <- qplot(.fitted, abs(.resid), data = mod) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Scale-Location", x = "Fitted", y = "|Residuals|") +
  geom_smooth(method = "gam", color = "red", se = FALSE)

p2

## The second plot is the Scale-Location plot, which strengthens any pattern in the residuals by plotting their absolute values.
## We see some evidence of heteroskedasticity, in other words nonconstant error variance.
## We used the gam smoothing method for the Scale-Location plot (since n > 1000). The slightly upward-sloping line strengthens our suspicion of nonconstant error variance.

grid.arrange(p1, p2, nrow = 2)

# 1.d. Perform the approximate test of nonconstant error variance.

summary(lm(abs(residuals(g)) ~ fitted(g)))
## 
## Call:
## lm(formula = abs(residuals(g)) ~ fitted(g))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -370.9 -152.6  -51.9   71.8 7367.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 80.43904   23.36854   3.442 0.000589 ***
## fitted(g)    0.27303    0.03615   7.552 6.53e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 323.9 on 1965 degrees of freedom
## Multiple R-squared:  0.02821,    Adjusted R-squared:  0.02771 
## F-statistic: 57.03 on 1 and 1965 DF,  p-value: 6.528e-14
## We look at the t-test for the slope coefficient, with the null hypothesis that the slope is zero. At the 10% level of significance we conclude that the slope is not zero, since the p-value, 6.528e-14, is far less than 0.10.
## This test is only approximate, as the residuals are not independent, so the degrees of freedom for the t-distribution, 1965, is theoretically too large.
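
# As a cross-check (a sketch, not part of the original exercise), car's
# ncvTest() performs a score test of constant error variance against the
# alternative that the variance changes with the level of the fitted values.
ncvTest(g)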

# 2. Non-normal errors

# 2.a. Plot the Normal Q-Q Plot and Histogram of the residuals from model m Exercise 1. Do they indicate non-normal errors?

p3 <- qplot(sample = scale(.resid), data = mod) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Untransformed y", y = "Residuals")

p4 <- qplot(scale(.resid), data = mod, geom = "blank") +
  geom_line(aes(y = ..density.., colour = "Empirical"), stat = "density") +
  stat_function(fun = dnorm, aes(colour = "Normal")) +
  geom_histogram(aes(y = ..density..), alpha = 0.4) +
  scale_colour_manual(name = "Density", values = c("red", "blue")) +
  theme(legend.position = c(0.85, 0.85)) +
  labs(title = "Untransformed y", y = "Residuals")
grid.arrange(p3, p4, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## We check the residuals for normality using the Normal QQ-plot and see that they do indicate non-normal errors.
## The histogram of the residuals alone is not suitable for detecting nonnormality.
## However, the kernel density estimate compared to the normal density indicates that the residuals could be non-normal.
## Clearly the residuals of the untransformed model are not normal.

# 2.b. Perform the Shapiro-Wilk test of normality for the residuals of model m. What is the p-value and what does it say about normality?

shapiro.test(residuals(g))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(g)
## W = 0.71236, p-value < 2.2e-16
## We reject the null hypothesis of normality for the residuals of the untransformed model at the 10% level of significance, since the p-value is much less than 0.10.

# 2.c. Find the optimal Box-Cox power transform and apply it to wage, refit model m, replot Normal Q-Q Plot and perform the Shapiro-Wilk test of normality again. Did the Box-Cox Power Transform work?

require(car)
lambda <- powerTransform(g)
lambda
## Estimated transformation parameters 
##        Y1 
## 0.1034019
lam <- lambda$lambda
glam <- lm(wage^lam ~ educ + exper + race + smsa + pt + region, uswages)
modlam <- fortify(glam)
p5 <- qplot(sample = scale(.resid), data = modlam) +
  geom_abline(intercept = 0, slope = 1, color = "red") +
  labs(title = "Normal QQ-Plot", y = "Residuals Box-Cox-Transform")
p5
shapiro.test(residuals(glam))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(glam)
## W = 0.96461, p-value < 2.2e-16
## The Box-Cox power transform didn't fully work: although W improved from 0.71 to 0.96,
## the Shapiro-Wilk test still concludes that the errors are non-normal for the Box-Cox transform of wage at the 10% level of significance, since the p-value is much less than 0.10.
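
# Since the estimated lambda (about 0.10) is close to zero, a log transform is
# a natural alternative to try; this is a sketch, not part of the exercise.
glog <- lm(log(wage) ~ educ + exper + race + smsa + pt + region, data = uswages)
shapiro.test(residuals(glog))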

# 3. Exercise - Influential outliers

# 3.a. Produce the influence plot for model m. Are there any really large CookD values?

influencePlot(g)

##          StudRes        Hat        CookD
## 6591   0.1096679 0.02061630 0.0000281445
## 15387 20.1499155 0.01621022 0.6159365616
## As we can see from the plot, there are really large CookD values present; observation 15387 stands out with a CookD of about 0.62.
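
# As a further check (a sketch, not required by the exercise), car's
# outlierTest() reports the Bonferroni-adjusted p-value for the largest
# studentized residual.
outlierTest(g)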

# 3.b. Produce the half-normal plot of the leverage values. Are there any high leverage data points?

wages <- row.names(uswages)
halfnorm(lm.influence(g)$hat, labs = wages, ylab = "Leverages")

## As we can see from the plot, there are really large leverage values present.

# 3.c. Produce the half-normal plot of the Cook's distances. Are there any high Cook's distance points?

cook <- cooks.distance(g)
halfnorm(cook, 3, labs = wages, ylab = "Cook's distance")

## As we can see from the plot, there are really high Cook's distance points present.
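
# The observation with the largest Cook's distance, used in 3.d below, can be
# identified directly (a quick sketch):
which.max(cook)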

# 3.d. Fit the model excluding the observation with the largest Cook's distance. Do the coefficients change? Are there any coefficients with notable changes?

glam1 <- lm(wage^lam ~ educ + exper + race + smsa + pt + region, uswages, subset = (cook<max(cook)))

# Comparison of the fitted model coefficients with and without the most influential observation.

compareCoefs(glam, glam1)
## 
## Call:
## 1: lm(formula = wage^lam ~ educ + exper + race + smsa + pt + region, 
##   data = uswages)
## 2: lm(formula = wage^lam ~ educ + exper + race + smsa + pt + region, 
##   data = uswages, subset = (cook < max(cook)))
##                Est. 1      SE 1    Est. 2      SE 2
## (Intercept)  1.61e+00  1.48e-02  1.61e+00  1.46e-02
## educ         1.72e-02  8.76e-04  1.76e-02  8.65e-04
## exper        3.03e-03  1.98e-04  2.96e-03  1.95e-04
## raceBlack   -4.15e-02  9.50e-03 -3.99e-02  9.36e-03
## smsaYes      3.52e-02  5.88e-03  3.42e-02  5.80e-03
## ptYes       -1.97e-01  9.04e-03 -2.03e-01  8.93e-03
## regionmw     1.48e-03  7.31e-03  1.39e-03  7.20e-03
## regionso     1.53e-03  7.01e-03  4.54e-05  6.91e-03
## regionwe     9.63e-03  7.59e-03  9.54e-03  7.48e-03
## The coefficients show some changes, but no coefficient changes notably: every shift is small relative to its standard error.
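
# A quick numeric check (a sketch, not part of the exercise): the change in
# each coefficient expressed in units of its original standard error.
round((coef(glam1) - coef(glam)) / sqrt(diag(vcov(glam))), 2)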

# 3.e. Produce the omnibus diagnostic plot for model m. Which observation consistently stands out as an outlier-influential point in all four plots?

oldpar <- par(mfrow = c(2, 2))

plot(glam, main = "US Wages Data")

par(oldpar)

## Observations 25276, 2780, and 15387 consistently stand out as outlier-influential points in all four plots.

# 4. Model structure

# 4.a. Produce the CERES plots for model m. Do the factor variables stop the plots from printing?

ceresPlots(g, terms = ~.)
## Warning in ceresPlots(g, terms = ~.): Factors skipped in drawing CERES
## plots.

## Yes, the factor variables are skipped, so no CERES plots are printed for them.

# 4.b. How many plots are there? Why these?

## As the factor variables are not plotted, there are just two plots, for educ and exper, the only numeric predictors.

# 4.c. Do the plots indicate a polynomial model should be considered?

## Yes, the plots suggest curvature, so a polynomial model should be considered.
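
# A sketch of one such model (assuming a quadratic in exper; not part of the
# original exercise), compared against model m with an F-test:
g_poly <- lm(wage ~ educ + poly(exper, 2) + race + smsa + pt + region, data = uswages)
anova(g, g_poly)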

# 5. Interaction model

# 5.a. Fit an interaction model using the region and the two numeric variables. Is the model useful?

# We will fit this model and use it later for comparing it to a larger model.
g_main_effects <- lm(wage ~ educ + exper + region, uswages)

# We create a factor for the two groups educ < 15 and educ >= 15, call it lt15 (meaning "less than 15"), and add it to the uswages data frame.
uswages$lt15 <- factor(uswages$educ < 15)

levels(uswages$lt15) <- c(0, 1)

g_interaction <- lm(wage ~ lt15 * educ + exper + region, uswages)

anova(g_main_effects, g_interaction)
## Analysis of Variance Table
## 
## Model 1: wage ~ educ + exper + region
## Model 2: wage ~ lt15 * educ + exper + region
##   Res.Df       RSS Df Sum of Sq      F   Pr(>F)    
## 1   1961 356452814                                 
## 2   1959 348836000  2   7616813 21.387 6.48e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Yes, the model is useful: the F-test rejects the main-effects model in favor of the interaction model, with a p-value (6.48e-10) far below alpha = 0.05.

# 5.b. Test the interaction model versus model m. What is the p-value and which model does it indicate?

anova(g, g_interaction)
## Analysis of Variance Table
## 
## Model 1: wage ~ educ + exper + race + smsa + pt + region
## Model 2: wage ~ lt15 * educ + exper + region
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1   1958 333222495                                  
## 2   1959 348836000 -1 -15613506 91.744 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## The p-value is < 2.2e-16. The interaction model has the larger RSS (note the negative Df: the two models are not nested), so the F-test indicates model m.
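
# Because the two models are not nested, an information criterion is a safer
# comparison (a sketch, not part of the exercise); the lower AIC is preferred.
AIC(g, g_interaction)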

# 6. Collinearity

# 6.a. Find the variance inflation factors for model m.

vif(g)
##            GVIF Df GVIF^(1/(2*Df))
## educ   1.114749  1        1.055817
## exper  1.100628  1        1.049108
## race   1.048380  1        1.023904
## smsa   1.026374  1        1.013101
## pt     1.004126  1        1.002061
## region 1.061190  3        1.009948
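
# A complementary check (a sketch, not part of the exercise): the condition
# number of the scaled model matrix; values above roughly 30 are a common
# warning sign of collinearity.
X <- model.matrix(g)[, -1]  # drop the intercept column
kappa(scale(X), exact = TRUE)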

# 6.b. Do they indicate collinearity in the predictors?

## No: all the (G)VIF values are close to 1, far below the usual thresholds for concern, so the VIFs do not indicate collinearity in the predictors.

summary(uswages)
##       wage              educ           exper          race       smsa     
##  Min.   :  50.39   Min.   : 0.00   Min.   : 0.00   White:1812   No : 483  
##  1st Qu.: 314.69   1st Qu.:12.00   1st Qu.: 8.00   Black: 155   Yes:1484  
##  Median : 522.32   Median :12.00   Median :16.00                          
##  Mean   : 613.99   Mean   :13.08   Mean   :18.74                          
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00                          
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00                          
##    pt       region   lt15    
##  No :1802   ne:448   0: 587  
##  Yes: 165   mw:488   1:1380  
##             so:616           
##             we:415           
##                              
## 
x.df <- data.frame(uswages)
new_data <- subset(x.df, select = c(wage, educ, exper))
summary(new_data)
##       wage              educ           exper      
##  Min.   :  50.39   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 314.69   1st Qu.:12.00   1st Qu.: 8.00  
##  Median : 522.32   Median :12.00   Median :16.00  
##  Mean   : 613.99   Mean   :13.08   Mean   :18.74  
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00  
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00
round(cor(new_data),1)
##       wage educ exper
## wage   1.0  0.3   0.2
## educ   0.3  1.0  -0.3
## exper  0.2 -0.3   1.0
## The correlation matrix only detects pairwise collinearity; here the pairwise correlations are small (at most 0.3 in absolute value), consistent with the VIFs: no collinearity is indicated.