# Load the faraway package, which provides the `uswages` data frame used below.
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less helpful message.
library(faraway)
# Preview the first rows of the data. The unquoted name is required here:
# head("uswages") receives a character string and merely echoes it back.
head(uswages)
# Per-column summary of the raw data, before any of the recoding done below.
summary(uswages)
## wage educ exper race
## Min. : 50.39 Min. : 0.00 Min. :-2.00 Min. :0.000
## 1st Qu.: 308.64 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.:0.000
## Median : 522.32 Median :12.00 Median :15.00 Median :0.000
## Mean : 608.12 Mean :13.11 Mean :18.41 Mean :0.078
## 3rd Qu.: 783.48 3rd Qu.:16.00 3rd Qu.:27.00 3rd Qu.:0.000
## Max. :7716.05 Max. :18.00 Max. :59.00 Max. :1.000
## smsa ne mw so
## Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.756 Mean :0.229 Mean :0.2485 Mean :0.3125
## 3rd Qu.:1.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000
## we pt
## Min. :0.00 Min. :0.0000
## 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00 Median :0.0000
## Mean :0.21 Mean :0.0925
## 3rd Qu.:0.00 3rd Qu.:0.0000
## Max. :1.00 Max. :1.0000
# Negative experience values are treated as invalid and set to missing.
uswages$exper[uswages$exper < 0] <- NA

# Convert race, smsa, and pt from 0/1 indicators to labelled factors.
# Giving levels and labels together ties each label to its code; relabelling
# with `levels<-` afterwards is positional and would silently attach the wrong
# label if one of the codes were absent from the data.
uswages$race <- factor(uswages$race, levels = c(0, 1),
                       labels = c("White", "Black"))
uswages$smsa <- factor(uswages$smsa, levels = c(0, 1),
                       labels = c("No", "Yes"))
uswages$pt <- factor(uswages$pt, levels = c(0, 1),
                     labels = c("No", "Yes"))

# Create region, a factor variable based on the four indicators ne, mw, so, we.
region_cols <- c("ne", "mw", "so", "we")
# The 1..4 coding below is only unambiguous when every row belongs to exactly
# one region (e.g. ne = 1 and mw = 1 would sum to 3 and be read as "so").
stopifnot(rowSums(uswages[region_cols]) == 1)
region_code <- 1 * uswages$ne +
  2 * uswages$mw +
  3 * uswages$so +
  4 * uswages$we
uswages$region <- factor(region_code, levels = 1:4, labels = region_cols)

# Delete the four indicator columns ne, mw, so, we now that region replaces them.
uswages <- subset(uswages, select = -c(ne:we))

# Drop rows containing NAs (introduced above for negative experience).
uswages <- na.omit(uswages)

# Variable names
names(uswages)
## [1] "wage" "educ" "exper" "race" "smsa" "pt" "region"
#Question 1
# Run a model with region as predictor of wages. Show that the number of coefficients associated with region is 3.
# Regress wage on the four-level region factor alone and list the estimated
# coefficients: one intercept plus one coefficient per non-reference region.
region_fit <- lm(wage ~ region, data = uswages)
coef(region_fit)
## (Intercept) regionmw regionso regionwe
## 641.717813 -48.027300 -56.902861 9.514236
# We see that there are 3 coefficients associated with region: regionmw (Midwest) = -48.027300, regionso (South) = -56.902861, and regionwe (West) = 9.514236. The fourth region, ne (Northeast), is the reference level and is absorbed into the intercept.
#Question 2
#Apply the aggregate(wage ~ region, data = uswages, mean) function in R to obtain the mean wages by region
#Show that the average wage in the northeast is b0.
#Show that the average wage in the midwest is b0+b1 dollars.
#Show that the average wage in the south is b0+b2 dollars.
#Show that the average wage in the west is b0+b3 dollars.
# Refit the region-only model so that its coefficients can be set side by side
# with the observed group means.
g <- lm(wage ~ region, data = uswages)
coef(g)
## (Intercept) regionmw regionso regionwe
## 641.717813 -48.027300 -56.902861 9.514236
# Mean wage within each region, for comparison with b0, b0+b1, b0+b2, b0+b3.
aggregate(wage ~ region, data = uswages, FUN = mean)
## region wage
## 1 ne 641.7178
## 2 mw 593.6905
## 3 so 584.8150
## 4 we 651.2320
#Average wage in northeast(ne) = 641.7178(b0)
#Average wage in midwest(mw) = b0 + b1 = 641.7178 - 48.027300 = 593.6905
#Average wage in south(so) = b0 + b2= 641.7178 - 56.902861 =584.8150
#Average wage in west(we)= b0+b3=641.7178 + 9.514236= 651.2320
#Question 3
#Compare the two models:
# Model 1: wage ~ region
# Model 2: wage ~ region + educ + exper
# Show that the F-Ratio is 152.397 with p-value 3.025 x 10^{-62}.
# What is the conclusion - Model 1 or Model 2 is better?
# So does education and experience matter?
# Model 1 (reduced): wage explained by region only.
m1 <- lm(wage ~ region, data = uswages)
# Model 2 (full): region plus education and experience.
m2 <- lm(wage ~ region + educ + exper, data = uswages)
# Coefficient table and fit statistics of the full model.
summary(m2)
##
## Call:
## lm(formula = wage ~ region + educ + exper, data = uswages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -991.6 -235.5 -49.5 149.3 7264.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -228.3596 54.6719 -4.177 3.08e-05 ***
## regionmw -32.5997 27.9236 -1.167 0.243
## regionso -27.6755 26.5297 -1.043 0.297
## regionwe 34.6088 29.0959 1.189 0.234
## educ 51.7280 3.3466 15.457 < 2e-16 ***
## exper 9.3551 0.7607 12.298 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 426.3 on 1961 degrees of freedom
## Multiple R-squared: 0.138, Adjusted R-squared: 0.1358
## F-statistic: 62.77 on 5 and 1961 DF, p-value: < 2.2e-16
# Partial F-test of Model 1 (m1, reduced) against Model 2 (m2, full), computed
# by hand from the residual sums of squares and residual degrees of freedom.
sse.sm <- deviance(m1)
df.sm <- df.residual(m1)
sse.bg <- deviance(m2)
df.bg <- df.residual(m2)
# Number of extra parameters in the full model (numerator degrees of freedom).
df.prt <- df.sm - df.bg
mse.prt <- (sse.sm - sse.bg) / df.prt
mse.bg <- sse.bg / df.bg
f.ratio_model2 <- mse.prt / mse.bg
f.ratio_model2
## [1] 152.3967
# The F-ratio equals 152.397.
# Take the upper tail directly. Writing 1 - pf(...) returns exactly 0 here,
# because pf() is indistinguishable from 1 in double precision and the
# subtraction cancels away the whole p-value.
p.value <- pf(f.ratio_model2, df.prt, df.bg, lower.tail = FALSE)
p.value
## [1] 3.025386e-62
# The p-value, 3.025 x 10^{-62}, matches the one in the question.
# It is far smaller than 0.05, so we reject the null hypothesis that the educ
# and exper coefficients are both zero: Model 2 is better.
# Education and experience do matter.
#Question 4
#Compare the two models:
# Model 1: wage ~ educ + exper
#Model 2: wage ~ region + educ + exper
#Show that the F-ratio is 2.404 with p-value equal to 0.066.
#Using level of significance alpha = 0.05, what is the conclusion: Model 1 or Model 2 is better?
#So does education and experience determine wage regardless of the region of the United States you live in, or does region still matter?
# Reduced model (education and experience only) versus the full model that
# additionally includes region.
fit_reduced <- lm(wage ~ educ + exper, data = uswages)
fit_full <- lm(wage ~ region + educ + exper, data = uswages)

# Residual sums of squares and residual degrees of freedom of both fits.
rss_reduced <- deviance(fit_reduced)
dof_reduced <- df.residual(fit_reduced)
rss_full <- deviance(fit_full)
dof_full <- df.residual(fit_full)

# Partial F statistic: drop in residual SS per extra parameter, divided by the
# residual mean square of the full model.
dof_extra <- dof_reduced - dof_full
f_ratio <- ((rss_reduced - rss_full) / dof_extra) / (rss_full / dof_full)
f_ratio
## [1] 2.404111
# The F-ratio equals 2.404.
# p-value as the complement of the lower tail ...
p_complement <- 1 - pf(f_ratio, dof_extra, dof_full)
p_complement
## [1] 0.06576161
# ... and the same p-value read directly from the upper tail.
p_upper <- pf(f_ratio, dof_extra, dof_full, lower.tail = FALSE)
p_upper
## [1] 0.06576161
# The p-value equals 0.066.
# The F-ratio of 2.404 is not large enough to be significant: its p-value of
# 0.066 is greater than 0.05, so we fail to reject the null hypothesis that the
# region coefficients are zero. Therefore Model 1 is better.
# Education and experience determine wage regardless of the region of the
# United States you live in.
#Question 5
#Repeat exercise #4 using log(wage) for the outcome variable.
#Compare the two models:
# Model 1: log(wage) ~ educ + exper
#Model 2: log(wage) ~ region + educ + exper
#Show that the F-ratio is 1.289 with p-value equal to 0.276.
#Using level of significance alpha = 0.05, what is the conclusion: Model 1 or Model 2 is better?
#So does education and experience determine wage regardless of the region of the United States you live in, or does region still matter?
# Same model comparison as in Question 4, with log(wage) as the response.
m1 <- lm(log(wage) ~ educ + exper, data = uswages)
m2 <- lm(log(wage) ~ region + educ + exper, data = uswages)

# Residual sum of squares and residual degrees of freedom: reduced model (sm)
# and full model (bg).
sse.sm <- deviance(m1)
df.sm <- df.residual(m1)
sse.bg <- deviance(m2)
df.bg <- df.residual(m2)

# Partial F statistic: mean square explained by the extra region terms over
# the residual mean square of the full model.
mse.prt <- (sse.sm - sse.bg) / (df.sm - df.bg)
mse.bg <- sse.bg / df.bg
f.ratio_model2 <- mse.prt / mse.bg
f.ratio_model2
## [1] 1.289134
# The F-ratio equals 1.289.
# p-value as the complement of the lower tail ...
p.value <- 1 - pf(f.ratio_model2, df.sm - df.bg, df.bg)
p.value
## [1] 0.2764635
# ... and the same p-value read directly from the upper tail.
p.value1 <- pf(f.ratio_model2, df.sm - df.bg, df.bg, lower.tail = FALSE)
p.value1
## [1] 0.2764635
# The p-value is 0.276, which is greater than 0.05, so we fail to reject the
# null hypothesis that the region coefficients are zero: Model 1 is better.
# Education and experience determine wage regardless of the region of the
# United States you live in.