#LabExercise_2
library("faraway")
data(uswages)
head(uswages)
## wage educ exper race smsa ne mw so we pt
## 6085 771.60 18 18 0 1 1 0 0 0 0
## 23701 617.28 15 20 0 1 0 0 0 1 0
## 16208 957.83 16 9 0 1 0 0 1 0 0
## 2720 617.28 12 24 0 1 1 0 0 0 0
## 9723 902.18 14 12 0 1 0 1 0 0 0
## 22239 299.15 12 33 0 1 0 0 0 1 0
summary(uswages)
## wage educ exper race
## Min. : 50.39 Min. : 0.00 Min. :-2.00 Min. :0.000
## 1st Qu.: 308.64 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.:0.000
## Median : 522.32 Median :12.00 Median :15.00 Median :0.000
## Mean : 608.12 Mean :13.11 Mean :18.41 Mean :0.078
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00 3rd Qu.:0.000
## Max. :7716.05 Max. :18.00 Max. :59.00 Max. :1.000
## smsa ne mw so
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.756 Mean :0.229 Mean :0.2485 Mean :0.3125
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000
## we pt
## Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00 Median :0.0000
## Mean :0.21 Mean :0.0925
## 3rd Qu.:0.00 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000
#Manipulate data. We see that experience has negative values.
uswages$exper[uswages$exper < 0] = NA
#Convert race, smsa, and pt to factor variables
uswages$race = factor(uswages$race)
levels(uswages$race) = c("White","Black")
uswages$smsa = factor(uswages$smsa)
levels(uswages$smsa) = c("No","Yes")
uswages$pt = factor(uswages$pt)
levels(uswages$pt) = c("No","Yes")
#Create region, a factor variable based on the four regions ne, mw, so, we
uswages = data.frame(uswages,
region =
1*uswages$ne +
2*uswages$mw +
3*uswages$so +
4*uswages$we)
uswages$region = factor(uswages$region)
levels(uswages$region) = c("ne","mw","so","we")
#Take care of NAs
uswages <- na.omit(uswages)
#Column names
names(uswages)
## [1] "wage" "educ" "exper" "race" "smsa" "ne" "mw"
## [8] "so" "we" "pt" "region"
#Exercise 1
g = lm(log(wage) ~ educ + exper + race + smsa + pt + region, data = uswages)
confint(g, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4.56088834 4.85919020
## educ 0.07870075 0.09632893
## exper 0.01349117 0.01747522
## raceBlack -0.31082788 -0.11971899
## smsaYes 0.11855745 0.23692397
## ptYes -1.15830653 -0.97652860
## regionmw -0.06520300 0.08184280
## regionso -0.06041461 0.08057549
## regionwe -0.02948699 0.12317864
#Exercise 2
g1 = lm(log(wage) ~ educ + exper + race + smsa + pt, data = uswages)
confint(g1)
## 2.5 % 97.5 %
## (Intercept) 4.59090124 4.86663286
## educ 0.07860326 0.09620455
## exper 0.01343477 0.01741146
## raceBlack -0.31310266 -0.12537317
## smsaYes 0.11803316 0.23542167
## ptYes -1.15785402 -0.97626013
tab1 = anova(g1,g)
tab1
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper + race + smsa + pt
## Model 2: log(wage) ~ educ + exper + race + smsa + pt + region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1961 633.62
## 2 1958 633.06 3 0.55445 0.5716 0.6337
#Exercise 3
g.bg = lm(log(wage) ~ educ + exper + race + smsa + pt + region, data = uswages)
g.sm = lm(log(wage) ~ educ + exper + race + smsa + pt, data = uswages)
sse.sm = deviance(g.sm)
df.sm = df.residual(g.sm)
sse.bg = deviance(g.bg)
df.bg = df.residual(g.bg)
mse.prt = (sse.sm-sse.bg)/(df.sm-df.bg)
mse.bg = sse.bg/df.bg
f.ratio = mse.prt/mse.bg
f.ratio
## [1] 0.5716147
p.value = pf(f.ratio, df.sm-df.bg, df.bg, lower.tail=FALSE)
p.value
## [1] 0.6337081
#Analysis:
#The P-value is 0.634 which is greater than 0.05.
#Therefore, model 2 is better.
#Exercise 4
install.packages("ellipse", repos = "http://cran.us.r-project.org", type = "source")
## Installing package into 'C:/Users/aksha/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
library(ellipse)
plot(ellipse(g1, c("educ", "exper")), type = "l", main = "Joint Confidence Region")
points(0,0)
points(coef(g1)["educ"], coef(g)["exper"])

#Exercise 5
g2 <- lm(log(wage) ~ race + smsa + pt, data = uswages)
plot(ellipse(g1, c("educ", "exper")), type = "l", main = "Joint Confidence Region")
points(0,0)
points(coef(g)["educ"], coef(g)["exper"], pch=18)
abline(v=confint(g)["educ",], lty=2)
abline(h=confint(g)["exper",], lty=2)

compareg2g1 <- anova(g2, g1)
compareg2g1
## Analysis of Variance Table
##
## Model 1: log(wage) ~ race + smsa + pt
## Model 2: log(wage) ~ educ + exper + race + smsa + pt
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1963 788.00
## 2 1961 633.62 2 154.38 238.9 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Analysis:
#If the p-value is less than or equal to the alpha (p < .05), then we reject the null hypothesis, and we say the result is statistically significant.
#If the p-value is greater than alpha (p > .05), then we fail to reject the null hypothesis, and we say that the result is statistically nonsignificant (n.s.).
#The F-Ratio 238.9 is big and since the p-value 2.2e-16 is much less than 0.05, we reject the null hypothesis H0 : ??educ = ??exper = 0.
#Exercise 6
g1 = lm(log(wage) ~ educ + exper + race + smsa + pt, data = uswages)
x0 = data.frame(educ = 12, exper = 5, race = "White", smsa = "Yes", pt = "No", stringsAsFactors = FALSE)
predict(g1, x0, level = 0.95, interval = "confidence")
## fit lwr upr
## 1 6.031457 5.986455 6.076459
#Exercise 7
x0 <- rbind(x0, data.frame(educ = 12, exper = 5, race = "Black", smsa = "Yes", pt = "No"))
predict(g1, x0, level = 0.95, interval = "confidence")
## fit lwr upr
## 1 6.031457 5.986455 6.076459
## 2 5.812219 5.716405 5.908033
33
## [1] 33