# Load the faraway package, which provides the `uswages` data set.
# library() is used rather than require() so that a missing package stops the
# script here instead of returning FALSE and failing later on.
library(faraway)

# Preview the first rows of the data. The data frame itself is passed; passing
# the string "uswages" would only echo that string back.
head(uswages)

# Per-column summary statistics (min, quartiles, mean, max).
summary(uswages)
##       wage              educ           exper            race      
##  Min.   :  50.39   Min.   : 0.00   Min.   :-2.00   Min.   :0.000  
##  1st Qu.: 308.64   1st Qu.:12.00   1st Qu.: 8.00   1st Qu.:0.000  
##  Median : 522.32   Median :12.00   Median :15.00   Median :0.000  
##  Mean   : 608.12   Mean   :13.11   Mean   :18.41   Mean   :0.078  
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00   3rd Qu.:0.000  
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00   Max.   :1.000  
##       smsa             ne              mw               so        
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.756   Mean   :0.229   Mean   :0.2485   Mean   :0.3125  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##        we             pt        
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000  
##  Mean   :0.21   Mean   :0.0925  
##  3rd Qu.:0.00   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000
# The summary reports a minimum `exper` of -2. A negative number of years of
# experience is not usable, so those entries are marked as missing.
uswages$exper[uswages$exper < 0] <- NA

# Convert race, smsa, and pt to factor variables.
# Each 0/1 code is tied to its label explicitly through levels/labels, so the
# labelling does not depend on which codes happen to be present in the data.
uswages$race <- factor(uswages$race,
                       levels = c(0, 1),
                       labels = c("White", "Black"))
uswages$smsa <- factor(uswages$smsa,
                       levels = c(0, 1),
                       labels = c("No", "Yes"))
uswages$pt <- factor(uswages$pt,
                     levels = c(0, 1),
                     labels = c("No", "Yes"))

# Create region, a factor variable based on the four regional dummies
# ne, mw, so, we.
region_dummies <- c("ne", "mw", "so", "we")

# The coding below is only meaningful when every row has exactly one dummy
# set; otherwise a code such as 0, or 1 + 2 = 3, would be mislabelled.
stopifnot(rowSums(uswages[region_dummies]) == 1)

# Weight dummy k by k, so each row's code is the position of its region.
region_code <- 1 * uswages$ne +
  2 * uswages$mw +
  3 * uswages$so +
  4 * uswages$we

# Tie each code to its label explicitly; "ne" is the first level and therefore
# the reference category in the regressions that follow.
uswages$region <- factor(region_code, levels = 1:4, labels = region_dummies)

# Delete the four regional dummies now that region replaces them. They are
# listed by name so the result does not depend on their column positions.
uswages <- subset(uswages, select = -c(ne, mw, so, we))


# Take care of NAs: drop every row that contains at least one missing value.
# The negative `exper` values recoded to NA earlier in this script are removed
# by this step.
uswages <- na.omit(uswages)

# Variable names after the clean-up: the four regional dummies are gone and
# `region` is the last column.
names(uswages)
## [1] "wage"   "educ"   "exper"  "race"   "smsa"   "pt"     "region"
# Question 1
# Run a model with region as the only predictor of wage and show that the
# number of coefficients associated with region is 3.
g <- lm(wage ~ region, data = uswages)
coef(g)
## (Intercept)    regionmw    regionso    regionwe 
##  641.717813  -48.027300  -56.902861    9.514236
# Besides the intercept there are 3 region coefficients:
#   regionmw (midwest) =  -48.027300
#   regionso (south)   =  -56.902861
#   regionwe (west)    =    9.514236
# The northeast (ne) has no coefficient of its own; it is the reference level.

# Question 2
# Use aggregate(wage ~ region, data = uswages, mean) to obtain the mean wage
# in each region, then show that
#   - the average wage in the northeast is b0,
#   - the average wage in the midwest is b0 + b1 dollars,
#   - the average wage in the south is b0 + b2 dollars,
#   - the average wage in the west is b0 + b3 dollars.

g <- lm(wage ~ region, data = uswages)

coef(g)
## (Intercept)    regionmw    regionso    regionwe 
##  641.717813  -48.027300  -56.902861    9.514236
aggregate(wage ~ region, data = uswages, FUN = mean)
##   region     wage
## 1     ne 641.7178
## 2     mw 593.6905
## 3     so 584.8150
## 4     we 651.2320
# Comparing the coefficients with the regional means:
#   northeast (ne): b0      = 641.7178
#   midwest   (mw): b0 + b1 = 641.7178 - 48.027300 = 593.6905
#   south     (so): b0 + b2 = 641.7178 - 56.902861 = 584.8150
#   west      (we): b0 + b3 = 641.7178 +  9.514236 = 651.2320

# Question 3
#
# Compare the two nested models:
#   Model 1: wage ~ region
#   Model 2: wage ~ region + educ + exper
# Show that the F-ratio is 152.397 with p-value 3.025 x 10^(-62).
# What is the conclusion - is Model 1 or Model 2 better?
# So do education and experience matter?

m1 <- lm(wage ~ region, data = uswages)

m2 <- lm(wage ~ region + educ + exper, data = uswages)
summary(m2)
## 
## Call:
## lm(formula = wage ~ region + educ + exper, data = uswages)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -991.6 -235.5  -49.5  149.3 7264.9 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -228.3596    54.6719  -4.177 3.08e-05 ***
## regionmw     -32.5997    27.9236  -1.167    0.243    
## regionso     -27.6755    26.5297  -1.043    0.297    
## regionwe      34.6088    29.0959   1.189    0.234    
## educ          51.7280     3.3466  15.457  < 2e-16 ***
## exper          9.3551     0.7607  12.298  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 426.3 on 1961 degrees of freedom
## Multiple R-squared:  0.138,  Adjusted R-squared:  0.1358 
## F-statistic: 62.77 on 5 and 1961 DF,  p-value: < 2.2e-16

# Partial F-test of the small model (m1) against the big model (m2).
# For an lm fit, deviance() is the residual sum of squares.
sse.sm <- deviance(m1)
df.sm <- df.residual(m1)

sse.bg <- deviance(m2)
df.bg <- df.residual(m2)

# Numerator: reduction in residual sum of squares per extra parameter in m2.
# Denominator: residual mean square of the big model.
mse.prt <- (sse.sm - sse.bg) / (df.sm - df.bg)
mse.bg <- sse.bg / df.bg

f.ratio_model2 <- mse.prt / mse.bg
f.ratio_model2
## [1] 152.3967
# The F-ratio equals 152.397.

# Upper-tail p-value. It is requested directly with lower.tail = FALSE because
# 1 - pf(...) evaluates to exactly 0 here: pf() is so close to 1 that the
# subtraction loses the entire result.
p.value <- pf(f.ratio_model2, df.sm - df.bg, df.bg, lower.tail = FALSE)
p.value
## [1] 3.025386e-62
# Alias kept so that anything still referring to the old name keeps working.
p.value1 <- p.value
# The p-value, 3.025 x 10^(-62), matches the one given in the question.
# It is far below 0.05, so we reject the null hypothesis that the educ and
# exper coefficients are both zero: Model 2 is better.

# Education and experience do matter.



# Question 4
# Compare the two nested models:
#   Model 1: wage ~ educ + exper
#   Model 2: wage ~ region + educ + exper
# Show that the F-ratio is 2.404 with p-value equal to 0.066.
# Using level of significance alpha = 0.05, what is the conclusion: is Model 1
# or Model 2 better?
# So do education and experience determine wage regardless of the region of
# the United States you live in, or does region still matter?

m1 <- lm(wage ~ educ + exper, data = uswages)

m2 <- lm(wage ~ region + educ + exper, data = uswages)

# Partial F-test of the small model (m1) against the big model (m2).
# For an lm fit, deviance() is the residual sum of squares.
sse.sm <- deviance(m1)
df.sm <- df.residual(m1)

sse.bg <- deviance(m2)
df.bg <- df.residual(m2)

# Numerator: reduction in residual sum of squares per extra parameter in m2.
# Denominator: residual mean square of the big model.
mse.prt <- (sse.sm - sse.bg) / (df.sm - df.bg)
mse.bg <- sse.bg / df.bg

f.ratio_model2 <- mse.prt / mse.bg
f.ratio_model2
## [1] 2.404111
# The F-ratio equals 2.404.

# Upper-tail p-value, requested directly with lower.tail = FALSE rather than
# as 1 - pf(...), which loses precision when pf() is close to 1.
p.value <- pf(f.ratio_model2, df.sm - df.bg, df.bg, lower.tail = FALSE)
p.value
## [1] 0.06576161
# Alias kept so that anything still referring to the old name keeps working.
p.value1 <- p.value
# The p-value equals 0.066.
# The F-ratio of 2.404 is small: its p-value of 0.066 is greater than 0.05, so
# we fail to reject the null hypothesis that the region coefficients are all
# zero. Based on this, Model 1 is better.

# At the 5% level, education and experience determine wage regardless of the
# region of the United States you live in.

# Question 5
# Repeat exercise #4 using log(wage) for the outcome variable.
#
# Compare the two nested models:
#   Model 1: log(wage) ~ educ + exper
#   Model 2: log(wage) ~ region + educ + exper
# Show that the F-ratio is 1.289 with p-value equal to 0.276.
# Using level of significance alpha = 0.05, what is the conclusion: is Model 1
# or Model 2 better?
# So do education and experience determine wage regardless of the region of
# the United States you live in, or does region still matter?

m1 <- lm(log(wage) ~ educ + exper, data = uswages)

m2 <- lm(log(wage) ~ region + educ + exper, data = uswages)

# Partial F-test of the small model (m1) against the big model (m2).
# For an lm fit, deviance() is the residual sum of squares.
sse.sm <- deviance(m1)
df.sm <- df.residual(m1)

sse.bg <- deviance(m2)
df.bg <- df.residual(m2)

# Numerator: reduction in residual sum of squares per extra parameter in m2.
# Denominator: residual mean square of the big model.
mse.prt <- (sse.sm - sse.bg) / (df.sm - df.bg)
mse.bg <- sse.bg / df.bg

f.ratio_model2 <- mse.prt / mse.bg
f.ratio_model2
## [1] 1.289134
# The F-ratio equals 1.289.

# Upper-tail p-value, requested directly with lower.tail = FALSE rather than
# as 1 - pf(...), which loses precision when pf() is close to 1.
p.value <- pf(f.ratio_model2, df.sm - df.bg, df.bg, lower.tail = FALSE)
p.value
## [1] 0.2764635
# Alias kept so that anything still referring to the old name keeps working.
p.value1 <- p.value
# The p-value is 0.276, which is greater than 0.05, so we fail to reject the
# null hypothesis that the region coefficients are all zero. Based on this,
# Model 1 is better.

# At the 5% level, education and experience determine (log) wage regardless of
# the region of the United States you live in.