library("faraway")
# load data
data("uswages")
# manipulating data
# we see that exper has neg. values
uswages$exper[uswages$exper <0] <-NA
# convert race, smsa, and pt to factor variables
uswages$race <- factor(uswages$race)
levels(uswages$race) <- c("White","Black")
uswages$smsa <- factor(uswages$smsa)
levels(uswages$smsa) <- c("No","Yes")
uswages$pt <- factor(uswages$pt)
levels(uswages$pt) <- c("No","Yes")
# create region, a factor variable based on the four regions ne, mw, so, we
uswages <- data.frame(uswages,
region =
1*uswages$ne +
2*uswages$mw +
3*uswages$so +
4*uswages$we)
uswages$region <- factor(uswages$region)
levels(uswages$region) <- c("ne","mw","so","we")
# delete the four regions ne, mw, so, we
uswages <- subset(uswages,select=-c(ne:we))
# Take care of NAs
uswages <- na.omit(uswages)
# Variable names
names(uswages)
## [1] "wage" "educ" "exper" "race" "smsa" "pt" "region"
# 1. Exercise
g <- lm(wage ~ region, data = uswages)
coef(g)
## (Intercept) regionmw regionso regionwe
## 641.717813 -48.027300 -56.902861 9.514236
## Thus, we can see that the number of coefficients associated with region is 3.
# 2. Exercise
aggregate(wage ~ region, data = uswages, mean)
## region wage
## 1 ne 641.7178
## 2 mw 593.6905
## 3 so 584.8150
## 4 we 651.2320
coef(g)
## (Intercept) regionmw regionso regionwe
## 641.717813 -48.027300 -56.902861 9.514236
## the average wage in the northeast is b0 :
## b0 = 641.7178
## the average wage in the midwest is b0+b11 dollars.
## b0 + b1 = 641.717813 - 48.027300 = 593.6905
## the average wage in the south is b0+b2 dollars.
## b0 + b2 = 641.717813 - 56.902861 = 584.8150
## the average wage in the west is b0+b3 dollars.
## b0 + b3 = 641.717813 + 9.514236 = 651.2320
# 3. Exercise
model1 <- lm(wage ~ region, data = uswages)
(tab1 <- anova(model1))
## Analysis of Variance Table
##
## Response: wage
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 1645448 548483 2.6142 0.04968 *
## Residuals 1963 411855385 209809
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
deviance(model1)
## [1] 411855385
model2 <- lm(wage ~ region + educ + exper, data = uswages)
(tab2 <- anova(model2))
## Analysis of Variance Table
##
## Response: wage
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 1645448 548483 3.0174 0.02885 *
## educ 1 27912238 27912238 153.5572 < 2e-16 ***
## exper 1 27490333 27490333 151.2361 < 2e-16 ***
## Residuals 1961 356452814 181771
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
deviance(model2)
## [1] 356452814
## Partial F-stat of the Big Model vs Small Model
sse.model1 <- deviance(model1)
df.model1 <- df.residual(model1)
sse.model2 <- deviance(model2)
df.model2 <- df.residual(model2)
mse.prt <- (sse.model1-sse.model2)/(df.model1-df.model2)
mse.model2 <- sse.model2/df.model2
(f.ratio <- mse.prt/mse.model2)
## [1] 152.3967
## P-Value of the Partial F-Ratio
## Compute the p-value of the Partial F-Ratio.
(p.value <- 1-pf(f.ratio, df.model1-df.model2, df.model2))
## [1] 0
# OR
(p.value <- pf(f.ratio, df.model1-df.model2, df.model2, lower.tail=FALSE))
## [1] 3.025386e-62
## Conclusion:
## The F-Ratio 152.3967 is big, therefore we can conclude that the big model (model2) fits the data better than the small model (model1) since the p-value is very less than 0.05
## Using the anova() function:
## While all the extractions and computations above are correct, they are not the best way to use R.
## The anova() function is preferred
(tab3 <- anova(model1, model2))
## Analysis of Variance Table
##
## Model 1: wage ~ region
## Model 2: wage ~ region + educ + exper
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1963 411855385
## 2 1961 356452814 2 55402572 152.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## So, The F-Ratio is big, therefore take Model 2 (the big model) since p-value much less than 0.05
## Model 2 is better.
## And hence, education and experience matter and prove significant.
# 4. Exercise
mm1 <- lm(wage ~ educ + exper, data = uswages)
mm2 <- lm(wage ~ region + educ + exper, data = uswages)
## Partial F-stat of the Big Model vs Small Model
sse.mm1 <- deviance(mm1)
df.mm1 <- df.residual(mm1)
sse.mm2 <- deviance(mm2)
df.mm2 <- df.residual(mm2)
mse.prt <- (sse.mm1-sse.mm2)/(df.mm1-df.mm2)
mse.mm2 <- sse.mm2/df.mm2
(f.ratio <- mse.prt/mse.mm2)
## [1] 2.404111
## P-Value of the Partial F-Ratio
## Compute the p-value of the Partial F-Ratio.
(p.value <- 1-pf(f.ratio, df.mm1-df.mm2, df.mm2))
## [1] 0.06576161
# OR
(p.value <- pf(f.ratio, df.mm1-df.mm2, df.mm2, lower.tail=FALSE))
## [1] 0.06576161
## Using the anova() function:
## While all the extractions and computations above are correct, they are not the best way to use R.
## The anova() function is preferred
(tab3 <- anova(mm1, mm2))
## Analysis of Variance Table
##
## Model 1: wage ~ educ + exper
## Model 2: wage ~ region + educ + exper
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1964 357763806
## 2 1961 356452814 3 1310993 2.4041 0.06576 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Using level of significance ??=0.05 ??=0.05, The F-Ratio 2.4041 is small, therefore reject the big model (Model 2; mm2) since the p-value 0.06 is greater than 0.05.
## Model 1 is better
## Hence, education and experience determine wage regardless of the region of the United States you live in, i.e. region doesn't matter
# 5. Exercise
m1 <- lm(log(wage) ~ educ + exper, data = uswages)
m2 <- lm(log(wage) ~ region + educ + exper, data = uswages)
## Partial F-stat of the Big Model vs Small Model
sse.m1 <- deviance(m1)
df.m1 <- df.residual(m1)
sse.m2 <- deviance(m2)
df.m2 <- df.residual(m2)
mse.prt <- (sse.m1-sse.m2)/(df.m1-df.m2)
mse.m2 <- sse.m2/df.m2
(f.ratio <- mse.prt/mse.m2)
## [1] 1.289134
## P-Value of the Partial F-Ratio
## Compute the p-value of the Partial F-Ratio.
(p.value <- 1-pf(f.ratio, df.m1-df.m2, df.m2))
## [1] 0.2764635
# OR
(p.value <- pf(f.ratio, df.m1-df.m2, df.m2, lower.tail=FALSE))
## [1] 0.2764635
## Using the anova() function:
## While all the extractions and computations above are correct, they are not the best way to use R.
## The anova() function is preferred
(tab3 <- anova(m1, m2))
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper
## Model 2: log(wage) ~ region + educ + exper
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1964 826.66
## 2 1961 825.03 3 1.6271 1.2891 0.2765
## Using level of significance ??=0.05 ??=0.05, The F-Ratio 1.2891 is small,
## therefore reject the big model (Model 2; mm2)
## since the p-value 0.2765 is greater than 0.05.
## Model 1 is better.
## Again, education and experience determine wage regardless of the region of the United States you live in, i.e. region doesn't matter