library("faraway")

# load data
data("uswages")

# manipulating data
# we see that exper has neg. values
uswages$exper[uswages$exper <0] <-NA

# convert race, smsa, and pt to factor variables
uswages$race <- factor(uswages$race)
levels(uswages$race) <- c("White","Black")
uswages$smsa <- factor(uswages$smsa)
levels(uswages$smsa) <- c("No","Yes")
uswages$pt <- factor(uswages$pt)
levels(uswages$pt) <- c("No","Yes")

# create region, a factor variable based on the four regions ne, mw, so, we
uswages <- data.frame(uswages,
                      region =
                        1*uswages$ne +
                        2*uswages$mw +
                        3*uswages$so +
                        4*uswages$we)
uswages$region <- factor(uswages$region)
levels(uswages$region) <- c("ne","mw","so","we")

# delete the four regions ne, mw, so, we
uswages <- subset(uswages,select=-c(ne:we))

# Take care of NAs
uswages <- na.omit(uswages)

# Variable names
names(uswages)
## [1] "wage"   "educ"   "exper"  "race"   "smsa"   "pt"     "region"
# 1. Exercise

g <- lm(wage ~ region, data = uswages)
coef(g)
## (Intercept)    regionmw    regionso    regionwe 
##  641.717813  -48.027300  -56.902861    9.514236
## Thus, we can see that the number of coefficients associated with region is 3.

# 2. Exercise

aggregate(wage ~ region, data = uswages, mean)
##   region     wage
## 1     ne 641.7178
## 2     mw 593.6905
## 3     so 584.8150
## 4     we 651.2320
coef(g)
## (Intercept)    regionmw    regionso    regionwe 
##  641.717813  -48.027300  -56.902861    9.514236
## the average wage in the northeast is b0 :

## b0 = 641.7178

## the average wage in the midwest is b0+b11 dollars.

## b0 + b1 = 641.717813 - 48.027300 = 593.6905

## the average wage in the south is b0+b2 dollars.

## b0 + b2 = 641.717813 - 56.902861  = 584.8150

## the average wage in the west is b0+b3 dollars.

## b0 + b3 = 641.717813 + 9.514236 = 651.2320

# 3. Exercise

model1 <- lm(wage ~ region, data = uswages)

(tab1 <- anova(model1))
## Analysis of Variance Table
## 
## Response: wage
##             Df    Sum Sq Mean Sq F value  Pr(>F)  
## region       3   1645448  548483  2.6142 0.04968 *
## Residuals 1963 411855385  209809                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
deviance(model1)
## [1] 411855385
model2 <- lm(wage ~ region + educ + exper, data = uswages)

(tab2 <- anova(model2))
## Analysis of Variance Table
## 
## Response: wage
##             Df    Sum Sq  Mean Sq  F value  Pr(>F)    
## region       3   1645448   548483   3.0174 0.02885 *  
## educ         1  27912238 27912238 153.5572 < 2e-16 ***
## exper        1  27490333 27490333 151.2361 < 2e-16 ***
## Residuals 1961 356452814   181771                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
deviance(model2)
## [1] 356452814
## Partial F-stat of the Big Model vs Small Model

sse.model1 <- deviance(model1)
df.model1 <- df.residual(model1)
sse.model2 <- deviance(model2)
df.model2 <- df.residual(model2)
mse.prt <- (sse.model1-sse.model2)/(df.model1-df.model2)
mse.model2 <- sse.model2/df.model2
(f.ratio <- mse.prt/mse.model2)
## [1] 152.3967
## P-Value of the Partial F-Ratio
## Compute the p-value of the Partial F-Ratio.

(p.value <- 1-pf(f.ratio, df.model1-df.model2, df.model2))
## [1] 0
# OR

(p.value <- pf(f.ratio, df.model1-df.model2, df.model2, lower.tail=FALSE))
## [1] 3.025386e-62
## Conclusion: 

## The F-Ratio 152.3967 is big, therefore we can conclude that the big model (model2) fits the data better than the small model (model1) since the p-value is very less than 0.05

## Using the anova() function: 

## While all the extractions and computations above are correct, they are not the best way to use R.

## The anova() function is preferred

(tab3 <- anova(model1, model2))
## Analysis of Variance Table
## 
## Model 1: wage ~ region
## Model 2: wage ~ region + educ + exper
##   Res.Df       RSS Df Sum of Sq     F    Pr(>F)    
## 1   1963 411855385                                 
## 2   1961 356452814  2  55402572 152.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## So, The F-Ratio is big, therefore take Model 2 (the big model) since p-value much less than 0.05

## Model 2 is better.

## And hence, education and experience matter and prove significant.


# 4. Exercise

mm1 <- lm(wage ~ educ + exper, data = uswages)
mm2 <- lm(wage ~ region + educ + exper, data = uswages)


## Partial F-stat of the Big Model vs Small Model

sse.mm1 <- deviance(mm1)
df.mm1 <- df.residual(mm1)
sse.mm2 <- deviance(mm2)
df.mm2 <- df.residual(mm2)
mse.prt <- (sse.mm1-sse.mm2)/(df.mm1-df.mm2)
mse.mm2 <- sse.mm2/df.mm2
(f.ratio <- mse.prt/mse.mm2)
## [1] 2.404111
## P-Value of the Partial F-Ratio
## Compute the p-value of the Partial F-Ratio.

(p.value <- 1-pf(f.ratio, df.mm1-df.mm2, df.mm2))
## [1] 0.06576161
# OR

(p.value <- pf(f.ratio, df.mm1-df.mm2, df.mm2, lower.tail=FALSE))
## [1] 0.06576161
## Using the anova() function: 

## While all the extractions and computations above are correct, they are not the best way to use R.

## The anova() function is preferred

(tab3 <- anova(mm1, mm2))
## Analysis of Variance Table
## 
## Model 1: wage ~ educ + exper
## Model 2: wage ~ region + educ + exper
##   Res.Df       RSS Df Sum of Sq      F  Pr(>F)  
## 1   1964 357763806                              
## 2   1961 356452814  3   1310993 2.4041 0.06576 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##  Using level of significance ??=0.05 ??=0.05, The F-Ratio 2.4041 is small, therefore reject the big model (Model 2; mm2) since the p-value 0.06 is greater than 0.05.

## Model 1 is better

## Hence, education and experience determine wage regardless of the region of the United States you live in, i.e. region doesn't matter

# 5. Exercise

m1 <- lm(log(wage) ~ educ + exper, data = uswages)
m2 <- lm(log(wage) ~ region + educ + exper, data = uswages)

## Partial F-stat of the Big Model vs Small Model

sse.m1 <- deviance(m1)
df.m1 <- df.residual(m1)
sse.m2 <- deviance(m2)
df.m2 <- df.residual(m2)
mse.prt <- (sse.m1-sse.m2)/(df.m1-df.m2)
mse.m2 <- sse.m2/df.m2
(f.ratio <- mse.prt/mse.m2)
## [1] 1.289134
## P-Value of the Partial F-Ratio
## Compute the p-value of the Partial F-Ratio.

(p.value <- 1-pf(f.ratio, df.m1-df.m2, df.m2))
## [1] 0.2764635
# OR

(p.value <- pf(f.ratio, df.m1-df.m2, df.m2, lower.tail=FALSE))
## [1] 0.2764635
## Using the anova() function: 

## While all the extractions and computations above are correct, they are not the best way to use R.

## The anova() function is preferred

(tab3 <- anova(m1, m2))
## Analysis of Variance Table
## 
## Model 1: log(wage) ~ educ + exper
## Model 2: log(wage) ~ region + educ + exper
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1964 826.66                           
## 2   1961 825.03  3    1.6271 1.2891 0.2765
## Using level of significance ??=0.05 ??=0.05, The F-Ratio 1.2891 is small, 
## therefore reject the big model (Model 2; mm2) 
## since the p-value 0.2765 is greater than 0.05.
## Model 1 is better.

## Again, education and experience determine wage regardless of the region of the United States you live in, i.e. region doesn't matter