#Assignment 7
#Exercise 1
#Create factors, remove NAs
#Process the data to make factors with named levels for race, smsa, and pt.
#Replace the indicator variables ne, mw, so, and we with one factor called region with levels named: ne, mw, so, and we*.
#Remove all NAs.
#load data
#1(a)
library("faraway")
require(car)
## Loading required package: car
##
## Attaching package: 'car'
## The following objects are masked from 'package:faraway':
##
## logit, vif
data("uswages")
summary(uswages)
## wage educ exper race
## Min. : 50.39 Min. : 0.00 Min. :-2.00 Min. :0.000
## 1st Qu.: 308.64 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.:0.000
## Median : 522.32 Median :12.00 Median :15.00 Median :0.000
## Mean : 608.12 Mean :13.11 Mean :18.41 Mean :0.078
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00 3rd Qu.:0.000
## Max. :7716.05 Max. :18.00 Max. :59.00 Max. :1.000
## smsa ne mw so
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.756 Mean :0.229 Mean :0.2485 Mean :0.3125
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000
## we pt
## Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00 Median :0.0000
## Mean :0.21 Mean :0.0925
## 3rd Qu.:0.00 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000
uswages$exper[uswages$exper <0] <-NA
#1(b)
# convert race, smsa, and pt to factor variables
uswages$race <- factor(uswages$race)
levels(uswages$race) <- c("White","Black")
uswages$smsa <- factor(uswages$smsa)
levels(uswages$smsa) <- c("No","Yes")
uswages$pt <- factor(uswages$pt)
levels(uswages$pt) <- c("No","Yes")
#1(c)
summary(uswages)
## wage educ exper race smsa
## Min. : 50.39 Min. : 0.00 Min. : 0.00 White:1844 No : 488
## 1st Qu.: 308.64 1st Qu.:12.00 1st Qu.: 8.00 Black: 156 Yes:1512
## Median : 522.32 Median :12.00 Median :16.00
## Mean : 608.12 Mean :13.11 Mean :18.74
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00
## Max. :7716.05 Max. :18.00 Max. :59.00
## NA's :33
## ne mw so we
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.00
## 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00
## Median :0.000 Median :0.0000 Median :0.0000 Median :0.00
## Mean :0.229 Mean :0.2485 Mean :0.3125 Mean :0.21
## 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.00
##
## pt
## No :1815
## Yes: 185
##
##
##
##
##
# create region, a factor variable based on the four regions ne, mw, so, we
uswages <- data.frame(uswages,
region =
1*uswages$ne +
2*uswages$mw +
3*uswages$so +
4*uswages$we)
uswages$region <- factor(uswages$region)
levels(uswages$region) <- c("ne","mw","so","we")
# delete the four regions ne, mw, so, we
uswages <- subset(uswages,select=-c(ne:we))
#1(d)
# Take care of NAs
uswages <- na.omit(uswages)
#Exercise 2
summary(uswages)
## wage educ exper race smsa
## Min. : 50.39 Min. : 0.00 Min. : 0.00 White:1812 No : 483
## 1st Qu.: 314.69 1st Qu.:12.00 1st Qu.: 8.00 Black: 155 Yes:1484
## Median : 522.32 Median :12.00 Median :16.00
## Mean : 613.99 Mean :13.08 Mean :18.74
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00
## Max. :7716.05 Max. :18.00 Max. :59.00
## pt region
## No :1802 ne:448
## Yes: 165 mw:488
## so:616
## we:415
##
##
#Exercise 3
#Compute OLS fit to model log(wage)~.
#Perform the Shapiro-Wilk Test of Normality for the residuals, what is the conclusion?
g1 = lm(log(wage) ~ ., data = uswages)
summary(g1)
##
## Call:
## lm(formula = log(wage) ~ ., data = uswages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5158 -0.3309 0.0504 0.3520 3.9446
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.710039 0.076052 61.932 < 2e-16 ***
## educ 0.087515 0.004494 19.472 < 2e-16 ***
## exper 0.015483 0.001016 15.243 < 2e-16 ***
## raceBlack -0.215273 0.048723 -4.418 1.05e-05 ***
## smsaYes 0.177741 0.030177 5.890 4.54e-09 ***
## ptYes -1.067418 0.046344 -23.032 < 2e-16 ***
## regionmw 0.008320 0.037489 0.222 0.824
## regionso 0.010080 0.035945 0.280 0.779
## regionwe 0.046846 0.038922 1.204 0.229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5686 on 1958 degrees of freedom
## Multiple R-squared: 0.368, Adjusted R-squared: 0.3654
## F-statistic: 142.5 on 8 and 1958 DF, p-value: < 2.2e-16
shapiro.test(residuals(g1))
##
## Shapiro-Wilk normality test
##
## data: residuals(g1)
## W = 0.96353, p-value < 2.2e-16
#The Shapiro-Wilk test rejects normality of errors.
#Exercise 4
#Compute WLS fit to model log(wage)~. and weights = 1/(1+educ)
#Perform the Shapiro-Wilk Test of Normality for the residuals, what is the conclusion?
summary(uswages)
## wage educ exper race smsa
## Min. : 50.39 Min. : 0.00 Min. : 0.00 White:1812 No : 483
## 1st Qu.: 314.69 1st Qu.:12.00 1st Qu.: 8.00 Black: 155 Yes:1484
## Median : 522.32 Median :12.00 Median :16.00
## Mean : 613.99 Mean :13.08 Mean :18.74
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00
## Max. :7716.05 Max. :18.00 Max. :59.00
## pt region
## No :1802 ne:448
## Yes: 165 mw:488
## so:616
## we:415
##
##
g2 = lm(log(wage) ~ ., data = uswages, weight = 1/(1+ educ))
summary(g2)
##
## Call:
## lm(formula = log(wage) ~ ., data = uswages, weights = 1/(1 +
## educ))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.69235 -0.09294 0.01296 0.09763 2.99856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.091092 0.072223 70.491 < 2e-16 ***
## educ 0.064515 0.003791 17.019 < 2e-16 ***
## exper 0.013344 0.001078 12.375 < 2e-16 ***
## raceBlack -0.231167 0.051227 -4.513 6.78e-06 ***
## smsaYes 0.215742 0.032822 6.573 6.30e-11 ***
## ptYes -1.000953 0.051389 -19.478 < 2e-16 ***
## regionmw -0.072096 0.042154 -1.710 0.0874 .
## regionso -0.068740 0.039576 -1.737 0.0826 .
## regionwe -0.088407 0.042706 -2.070 0.0386 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1744 on 1958 degrees of freedom
## Multiple R-squared: 0.2896, Adjusted R-squared: 0.2867
## F-statistic: 99.8 on 8 and 1958 DF, p-value: < 2.2e-16
compareCoefs(g1,g2)
##
## Call:
## 1: lm(formula = log(wage) ~ ., data = uswages)
## 2: lm(formula = log(wage) ~ ., data = uswages, weights = 1/(1 + educ))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 4.71004 0.07605 5.09109 0.07222
## educ 0.08751 0.00449 0.06452 0.00379
## exper 0.01548 0.00102 0.01334 0.00108
## raceBlack -0.21527 0.04872 -0.23117 0.05123
## smsaYes 0.17774 0.03018 0.21574 0.03282
## ptYes -1.06742 0.04634 -1.00095 0.05139
## regionmw 0.00832 0.03749 -0.07210 0.04215
## regionso 0.01008 0.03595 -0.06874 0.03958
## regionwe 0.04685 0.03892 -0.08841 0.04271
shapiro.test(residuals(g2))
##
## Shapiro-Wilk normality test
##
## data: residuals(g2)
## W = 0.972, p-value < 2.2e-16
#The Shapiro-Wilk test rejects normality of errors.
#Exercise 5
#Compute Robust fit to model log(wage)~. using Huber, Hampel, Biquare, LTS, and LAD
#Compare coefficients of the above fits using OLS, WLS, Huber, Hampel, Biquare, LTS, and LAD
#Which would you recommend?
#Why?
ols= lm(log(wage)~ . , data = uswages )
wls = lm(log(wage) ~ ., data = uswages, weight = 1/(1+ educ))
#Huber M - estimation
library(MASS)
huber = rlm(log(wage) ~ .,psi= psi.huber, data = uswages )
#Tukey Bisquare M-estimation
bisquare = rlm(log(wage) ~ .,psi= psi.bisquare,init="lts",maxit=100 ,data = uswages )
#Hample M-estimation
hample = rlm(log(wage) ~ .,psi= psi.hampel,init="lts",maxit=100 ,data = uswages )
#Least Trimmed Squares(LTS)
#install.packages("robustbase")
library(robustbase)
## Warning: package 'robustbase' was built under R version 3.4.2
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:faraway':
##
## epilepsy
lts = ltsReg(log(wage) ~ .,data = uswages )
## Warning in covMcd(X, alpha = alpha, use.correction = use.correction): The 988-th order statistic of the absolute deviation of variable 3
## is zero.
## There are 1812 observations (in the entire dataset of 1967 obs.)
## lying on the hyperplane with equation a_1*(x_i1 - m_1) + ... +
## a_p*(x_ip - m_p) = 0 with (m_1, ..., m_p) the mean of these
## observations and coefficients a_i from the vector a <- c(0, 0, 1,
## 0, 0, 0, 0, 0)
lts_exact =ltsReg(log(wage) ~ .,data = uswages ,
nsamp = "exact")
## Warning in .fastlts(x, y, h, nsamp, intercept, adjust, trace = as.integer(trace)): 'nsamp' options 'best' and 'exact' not allowed for n greater than 599. Will use default.
## Warning in covMcd(X, alpha = alpha, use.correction = use.correction): The 988-th order statistic of the absolute deviation of variable 3
## is zero.
## There are 1812 observations (in the entire dataset of 1967 obs.)
## lying on the hyperplane with equation a_1*(x_i1 - m_1) + ... +
## a_p*(x_ip - m_p) = 0 with (m_1, ..., m_p) the mean of these
## observations and coefficients a_i from the vector a <- c(0, 0, 1,
## 0, 0, 0, 0, 0)
#Least Absolute Deviation
library(quantreg)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
lad = rq(log(wage) ~ .,data = uswages )
coefs = compareCoefs(ols,wls,huber,bisquare,hample,lts,lts_exact,lad,se = FALSE)
## Warning in compareCoefs(ols, wls, huber, bisquare, hample, lts,
## lts_exact, : models to be compared are of different classes
##
## Call:
## 1: lm(formula = log(wage) ~ ., data = uswages)
## 2: lm(formula = log(wage) ~ ., data = uswages, weights = 1/(1 + educ))
## 3: rlm(formula = log(wage) ~ ., data = uswages, psi = psi.huber)
## 4: rlm(formula = log(wage) ~ ., data = uswages, psi = psi.bisquare,
## init = "lts", maxit = 100)
## 5: rlm(formula = log(wage) ~ ., data = uswages, psi = psi.hampel, init
## = "lts", maxit = 100)
## 6: ltsReg.formula(formula = log(wage) ~ ., data = uswages)
## 7: ltsReg.formula(formula = log(wage) ~ ., data = uswages, nsamp =
## "exact")
## 8: rq(formula = log(wage) ~ ., data = uswages)
## Est. 1 Est. 2 Est. 3 Est. 4 Est. 5 Est. 6
## (Intercept) 4.710039 5.091092 4.621799 4.579708 4.605699
## educ 0.087515 0.064515 0.096150 0.099596 0.096683 0.100869
## exper 0.015483 0.013344 0.016229 0.016741 0.016109 0.017604
## raceBlack -0.215273 -0.231167 -0.196352 -0.200370 -0.201430 -0.213834
## smsaYes 0.177741 0.215742 0.161001 0.158679 0.163157 0.160903
## ptYes -1.067418 -1.000953 -1.152926 -1.172663 -1.141632 -1.240930
## regionmw 0.008320 -0.072096 0.024339 0.022086 0.013778 0.018559
## regionso 0.010080 -0.068740 -0.009966 -0.017928 -0.005755 -0.036881
## regionwe 0.046846 -0.088407 0.038043 0.030596 0.036834 0.048242
## Intercept 4.568491
## Est. 7 Est. 8
## (Intercept) 4.663960
## educ 0.101826 0.094944
## exper 0.017784 0.016968
## raceBlack -0.200400 -0.251185
## smsaYes 0.152231 0.180681
## ptYes -1.231272 -1.197686
## regionmw 0.026702 0.000201
## regionso -0.035224 -0.046384
## regionwe 0.053190 0.013072
## Intercept 4.555933
colnames(coefs) = c("OLS","WLS","Huber", "Bisquare", "Hample", "LTS", "LTS-exact", "LAD")
coefs
## OLS WLS Huber Bisquare Hample
## (Intercept) 4.71003927 5.09109161 4.621799237 4.57970795 4.60569936
## educ 0.08751484 0.06451525 0.096150108 0.09959648 0.09668271
## exper 0.01548319 0.01334415 0.016228613 0.01674113 0.01610866
## raceBlack -0.21527343 -0.23116674 -0.196351924 -0.20037038 -0.20143047
## smsaYes 0.17774071 0.21574205 0.161001139 0.15867881 0.16315655
## ptYes -1.06741757 -1.00095287 -1.152926040 -1.17266284 -1.14163240
## regionmw 0.00831990 -0.07209561 0.024339415 0.02208550 0.01377789
## regionso 0.01008044 -0.06874001 -0.009966393 -0.01792789 -0.00575486
## regionwe 0.04684583 -0.08840729 0.038042793 0.03059615 0.03683405
## Intercept NA NA NA NA NA
## LTS LTS-exact LAD
## (Intercept) NA NA 4.663959542
## educ 0.10086887 0.10182550 0.094943531
## exper 0.01760387 0.01778414 0.016967663
## raceBlack -0.21383395 -0.20040017 -0.251184921
## smsaYes 0.16090299 0.15223141 0.180681034
## ptYes -1.24093034 -1.23127227 -1.197686089
## regionmw 0.01855923 0.02670195 0.000200561
## regionso -0.03688079 -0.03522418 -0.046384126
## regionwe 0.04824153 0.05318957 0.013072260
## Intercept 4.56849100 4.55593333 NA
#We see that LTS and LTS-exact appear to agree with each other and both are very different different from OLS.
#All three M-estimation methods, Huber, Bisquare, and Hample are different from each other, and different from OLS and both LTS's.
#LAD is similar to OLS.
#LTS is recommended since it has the best breakdown.