#LabExercise_2
library("faraway")
data(uswages)
head(uswages)
##         wage educ exper race smsa ne mw so we pt
## 6085  771.60   18    18    0    1  1  0  0  0  0
## 23701 617.28   15    20    0    1  0  0  0  1  0
## 16208 957.83   16     9    0    1  0  0  1  0  0
## 2720  617.28   12    24    0    1  1  0  0  0  0
## 9723  902.18   14    12    0    1  0  1  0  0  0
## 22239 299.15   12    33    0    1  0  0  0  1  0
summary(uswages)
##       wage              educ           exper            race      
##  Min.   :  50.39   Min.   : 0.00   Min.   :-2.00   Min.   :0.000  
##  1st Qu.: 308.64   1st Qu.:12.00   1st Qu.: 8.00   1st Qu.:0.000  
##  Median : 522.32   Median :12.00   Median :15.00   Median :0.000  
##  Mean   : 608.12   Mean   :13.11   Mean   :18.41   Mean   :0.078  
##  3rd Qu.: 783.48   3rd Qu.:16.00   3rd Qu.:27.00   3rd Qu.:0.000  
##  Max.   :7716.05   Max.   :18.00   Max.   :59.00   Max.   :1.000  
##       smsa             ne              mw               so        
##  Min.   :0.000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.756   Mean   :0.229   Mean   :0.2485   Mean   :0.3125  
##  3rd Qu.:1.000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :1.000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##        we             pt        
##  Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00   Median :0.0000  
##  Mean   :0.21   Mean   :0.0925  
##  3rd Qu.:0.00   3rd Qu.:0.0000  
##  Max.   :1.00   Max.   :1.0000
#Manipulate data. We see that experience has negative values.
uswages$exper[uswages$exper < 0] = NA

#Convert race, smsa, and pt to factor variables
uswages$race = factor(uswages$race)
levels(uswages$race) = c("White","Black")
uswages$smsa = factor(uswages$smsa)
levels(uswages$smsa) = c("No","Yes")
uswages$pt = factor(uswages$pt)
levels(uswages$pt) = c("No","Yes")

#Create region, a factor variable based on the four regions ne, mw, so, we
uswages = data.frame(uswages,
                     region =
                       1*uswages$ne +
                       2*uswages$mw +
                       3*uswages$so +
                       4*uswages$we)
uswages$region = factor(uswages$region)
levels(uswages$region) = c("ne","mw","so","we")

#Take care of NAs
uswages <- na.omit(uswages)

#Column names
names(uswages)
##  [1] "wage"   "educ"   "exper"  "race"   "smsa"   "ne"     "mw"    
##  [8] "so"     "we"     "pt"     "region"
#Exercise 1
g = lm(log(wage) ~ educ + exper + race + smsa + pt + region, data = uswages)
confint(g, level = 0.95)
##                   2.5 %      97.5 %
## (Intercept)  4.56088834  4.85919020
## educ         0.07870075  0.09632893
## exper        0.01349117  0.01747522
## raceBlack   -0.31082788 -0.11971899
## smsaYes      0.11855745  0.23692397
## ptYes       -1.15830653 -0.97652860
## regionmw    -0.06520300  0.08184280
## regionso    -0.06041461  0.08057549
## regionwe    -0.02948699  0.12317864
#Exercise 2
g1 = lm(log(wage) ~ educ + exper + race + smsa + pt, data = uswages)
confint(g1)
##                   2.5 %      97.5 %
## (Intercept)  4.59090124  4.86663286
## educ         0.07860326  0.09620455
## exper        0.01343477  0.01741146
## raceBlack   -0.31310266 -0.12537317
## smsaYes      0.11803316  0.23542167
## ptYes       -1.15785402 -0.97626013
tab1 = anova(g1,g)
tab1
## Analysis of Variance Table
## 
## Model 1: log(wage) ~ educ + exper + race + smsa + pt
## Model 2: log(wage) ~ educ + exper + race + smsa + pt + region
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   1961 633.62                           
## 2   1958 633.06  3   0.55445 0.5716 0.6337
#Exercise 3
g.bg = lm(log(wage) ~ educ + exper + race + smsa + pt + region, data = uswages)
g.sm = lm(log(wage) ~ educ + exper + race + smsa + pt, data = uswages)
sse.sm = deviance(g.sm)
df.sm = df.residual(g.sm)
sse.bg = deviance(g.bg)
df.bg = df.residual(g.bg)
mse.prt = (sse.sm-sse.bg)/(df.sm-df.bg)
mse.bg = sse.bg/df.bg
f.ratio = mse.prt/mse.bg
f.ratio
## [1] 0.5716147
p.value = pf(f.ratio, df.sm-df.bg, df.bg, lower.tail=FALSE)
p.value
## [1] 0.6337081
#Analysis:
#The P-value is 0.634 which is greater than 0.05. 
#Therefore, model 2 is better.


#Exercise 4
install.packages("ellipse", repos = "http://cran.us.r-project.org", type = "source")
## Installing package into 'C:/Users/aksha/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
library(ellipse)
plot(ellipse(g1, c("educ", "exper")), type = "l", main = "Joint Confidence Region")
points(0,0)
points(coef(g1)["educ"], coef(g)["exper"])

#Exercise 5
g2 <- lm(log(wage) ~ race + smsa + pt, data = uswages)
plot(ellipse(g1, c("educ", "exper")), type = "l", main = "Joint Confidence Region")
points(0,0)
points(coef(g)["educ"], coef(g)["exper"], pch=18)
abline(v=confint(g)["educ",], lty=2)
abline(h=confint(g)["exper",], lty=2)

compareg2g1 <- anova(g2, g1)
compareg2g1
## Analysis of Variance Table
## 
## Model 1: log(wage) ~ race + smsa + pt
## Model 2: log(wage) ~ educ + exper + race + smsa + pt
##   Res.Df    RSS Df Sum of Sq     F    Pr(>F)    
## 1   1963 788.00                                 
## 2   1961 633.62  2    154.38 238.9 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Analysis:
#If the p-value is less than or equal to the alpha (p < .05), then we reject the null hypothesis, and we say the result is statistically significant. 
#If the p-value is greater than alpha (p > .05), then we fail to reject the null hypothesis, and we say that the result is statistically nonsignificant (n.s.).
#The F-Ratio 238.9 is big and since the p-value 2.2e-16 is much less than 0.05, we reject the null hypothesis H0 : ??educ = ??exper = 0.


#Exercise 6
g1 = lm(log(wage) ~ educ + exper + race + smsa + pt, data = uswages)
x0 = data.frame(educ = 12, exper = 5, race = "White", smsa = "Yes", pt = "No", stringsAsFactors = FALSE)
predict(g1, x0, level = 0.95, interval = "confidence")
##        fit      lwr      upr
## 1 6.031457 5.986455 6.076459
#Exercise 7
x0 <- rbind(x0, data.frame(educ = 12, exper = 5, race = "Black", smsa = "Yes", pt = "No"))
predict(g1, x0, level = 0.95, interval = "confidence")
##        fit      lwr      upr
## 1 6.031457 5.986455 6.076459
## 2 5.812219 5.716405 5.908033
33
## [1] 33