library(car)
Pres <- read.table("http://socserv.socsci.mcmaster.ca/jfox/books/Companion/data/Prestige.txt", header=TRUE)
head(Pres)
##                     education income women prestige census type
## GOV.ADMINISTRATORS      13.11  12351 11.16     68.8   1113 prof
## GENERAL.MANAGERS        12.26  25879  4.02     69.1   1130 prof
## ACCOUNTANTS             12.77   9271 15.70     63.4   1171 prof
## PURCHASING.OFFICERS     11.42   8865  9.11     56.8   1175 prof
## CHEMISTS                14.62   8403 11.68     73.5   2111 prof
## PHYSICISTS              15.64  11030  5.13     77.6   2113 prof
summary(Pres)
##    education          income          women           prestige    
##  Min.   : 6.380   Min.   :  611   Min.   : 0.000   Min.   :14.80  
##  1st Qu.: 8.445   1st Qu.: 4106   1st Qu.: 3.592   1st Qu.:35.23  
##  Median :10.540   Median : 5930   Median :13.600   Median :43.60  
##  Mean   :10.738   Mean   : 6798   Mean   :28.979   Mean   :46.83  
##  3rd Qu.:12.648   3rd Qu.: 8187   3rd Qu.:52.203   3rd Qu.:59.27  
##  Max.   :15.970   Max.   :25879   Max.   :97.510   Max.   :87.20  
##      census       type   
##  Min.   :1113   bc  :44  
##  1st Qu.:3120   prof:31  
##  Median :5135   wc  :23  
##  Mean   :5402   NA's: 4  
##  3rd Qu.:8312            
##  Max.   :9517
library(ggplot2)

p1 <- ggplot(Pres) + geom_point(aes(x=income, y=prestige), size=2, colour="#993399") + 
  xlab("Income") + ylab("Prestige")
p2 <- ggplot(Pres) + geom_point(aes(x=education, y=prestige), size=2, colour="#993399") + 
  xlab("Education") + ylab("Prestige")
p3 <- ggplot(Pres) + geom_point(aes(x=census, y=prestige), size=2, colour="#993399") + 
  xlab("Census") + ylab("Prestige")
p4 <- ggplot(Pres) + geom_point(aes(x=women, y=prestige), size=2, colour="#993399") + 
  xlab("Women") + ylab("Prestige")
# par(mfrow=c(2,2)) only affects base graphics and is ignored by ggplot;
# arrange the four ggplot objects in a 2x2 grid with gridExtra instead
library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow=2)

Build models

library(MASS)
lm_null <- lm(prestige ~ 1,data=Pres)
lm_full <- lm(prestige ~ ., data=Pres)
#summary(lm_null)

step <- stepAIC(lm_full,trace=F)
#step$anova
#summary((step))
lm1 <- lm(formula(step), data=Pres)  # refit the model selected by stepAIC
#summary(lm1)

#forward <- stepAIC(lm_full,direction='forward',trace=F)
#forward$anova
#backward <- stepAIC(lm_full,direction='backward',trace=F)
#backward$anova

#(bic<-stepAIC(lm_full,trace=F,k=log(nrow(Pres))))

# build a model with degree-2 polynomial terms; poly() defaults to raw=FALSE,
# which uses orthogonal polynomials
lm2.ortho <- lm(prestige ~ poly(education, degree=2) + poly(income, degree=2) + poly(census, degree=2) + type, data=Pres)

#summary(lm2.ortho)
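
As a side note (a quick sketch, not part of the original analysis), the orthogonal and raw parameterizations describe the same model, so their fitted values agree; only the coefficients and their interpretation differ:

# raw polynomials span the same column space as orthogonal ones,
# so the fitted values are identical even though the coefficients differ
lm2.raw <- lm(prestige ~ poly(education, degree=2, raw=TRUE) + poly(income, degree=2, raw=TRUE) + poly(census, degree=2, raw=TRUE) + type, data=Pres)
all.equal(fitted(lm2.ortho), fitted(lm2.raw))  # should be TRUE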

Model comparison

First, we can use test statistics to compare the goodness of fit of different models. The t-test, the F-statistic of an ANOVA, and the likelihood ratio test (LRT) are the most commonly used methods.

1. t-tests in subset selection

\(t = \hat{\beta}/\mathrm{SE}(\hat{\beta})\), i.e. each coefficient estimate divided by its standard error.
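
As a quick sanity check (a small sketch using the coefficient table printed below), the t value for education in lm1 can be reproduced by hand:

cf <- summary(lm1)$coefficients
cf["education", "Estimate"] / cf["education", "Std. Error"]  # about 6.07, matching the t value column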

summary(lm1)$coefficients
##                  Estimate  Std. Error    t value     Pr(>|t|)
## (Intercept) -1.143676e+01 7.822993454 -1.4619413 1.471648e-01
## education    3.947385e+00 0.649775874  6.0749949 2.762787e-08
## income       9.365407e-04 0.000222084  4.2170552 5.785442e-05
## census       1.124464e-03 0.000611256  1.8395963 6.905247e-02
## typeprof     1.090528e+01 4.645053620  2.3477194 2.103224e-02
## typewc       5.605413e-01 3.061947847  0.1830669 8.551484e-01
summary(lm2.ortho)$coefficients
##                                 Estimate Std. Error    t value
## (Intercept)                   44.4759530   2.180394 20.3981241
## poly(education, degree = 2)1 100.5463144  18.697973  5.3773910
## poly(education, degree = 2)2  -1.8954692   8.502187 -0.2229390
## poly(income, degree = 2)1     45.0111573   9.943029  4.5269059
## poly(income, degree = 2)2    -17.9970776   8.119913 -2.2164127
## poly(census, degree = 2)1     12.3619575  16.917145  0.7307355
## poly(census, degree = 2)2     17.6367374  10.075151  1.7505184
## typeprof                       6.7488408   4.966802  1.3587901
## typewc                         0.9253886   3.085571  0.2999083
##                                  Pr(>|t|)
## (Intercept)                  2.601935e-35
## poly(education, degree = 2)1 6.031383e-07
## poly(education, degree = 2)2 8.240938e-01
## poly(income, degree = 2)1    1.846576e-05
## poly(income, degree = 2)2    2.921273e-02
## poly(census, degree = 2)1    4.668601e-01
## poly(census, degree = 2)2    8.347490e-02
## typeprof                     1.776469e-01
## typewc                       7.649464e-01

For model lm1, the coefficients for census and for the typewc level of type are not significant, so these terms could be dropped. For model lm2.ortho, the quadratic term of education, both the linear and quadratic terms of census, and both levels of type are not significant and could likewise be dropped.

2. Analysis of Variance (anova)

The F-statistic shown in a model's summary tests the fitted model against the null (intercept-only) model.
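
For reference, this overall F statistic can be pulled straight from the summary object (for lm1 it is the 97.117 on 5 and 92 df reported in the stargazer table later on):

f <- summary(lm1)$fstatistic  # named vector: value, numdf, dendf
pf(f["value"], f["numdf"], f["dendf"], lower.tail=FALSE)  # p-value of the test against the null model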

The F-statistic from an ANOVA can also be used to compare two nested models:

anova(lm1, lm2.ortho)
## Analysis of Variance Table
## 
## Model 1: prestige ~ education + income + census + type
## Model 2: prestige ~ poly(education, degree = 2) + poly(income, degree = 2) + 
##     poly(census, degree = 2) + type
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     92 4515.2                              
## 2     89 4080.2  3    434.97 3.1626 0.02843 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

With p = 0.028 < 0.05, the polynomial model lm2.ortho is preferred to model lm1, which has only linear components.
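
For completeness, the F statistic in the table can be reproduced from the residual sums of squares (values copied from the output above):

rss1 <- 4515.2; rss2 <- 4080.2  # RSS of lm1 and lm2.ortho from the ANOVA table
((rss1 - rss2) / 3) / (rss2 / 89)  # about 3.16, matching the reported F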

3. Likelihood Ratio Test (LRT)

The likelihood ratio test can be used to compare two nested models: twice the difference in log-likelihoods is asymptotically chi-squared, with degrees of freedom equal to the difference in the number of estimated parameters.

logLik(lm_null)
## 'log Lik.' -434.4366 (df=2)
logLik(lm1)
## 'log Lik.' -326.7375 (df=7)
logLik(lm2.ortho)
## 'log Lik.' -321.7739 (df=10)
dl <- 2*as.numeric(logLik(lm1) - logLik(lm_null))
1-pchisq(dl,5)
## [1] 0
dl <- 2*as.numeric(logLik(lm2.ortho) - logLik(lm1))
1-pchisq(dl,3)
## [1] 0.01919575
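
The same test is packaged as the lrtest() function in the lmtest package (an extra dependency, assuming it is installed):

library(lmtest)
lrtest(lm1, lm2.ortho)  # same chi-squared statistic and p-value as computed by hand above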

The ANOVA and the LRT give very similar results and lead to the same conclusions: (a) the null model is rejected; (b) the polynomial model is preferred to the model with only linear components. Note, however, that lm_null is fitted to all 102 rows while lm1 and lm2.ortho drop the 4 rows with a missing type, so the comparison against the null model is only approximate (the AIC warning below flags the same issue).

4. The stargazer function makes it easy to compare the coefficients and statistics of two models side by side.

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
stargazer(lm1,lm2.ortho,title="Comparison of 2 Regression outputs",type="text",align=TRUE)
## 
## Comparison of 2 Regression outputs
## ==========================================================================
##                                           Dependent variable:             
##                              ---------------------------------------------
##                                    prestige ~              prestige       
##                                       (1)                    (2)          
## --------------------------------------------------------------------------
## education                           3.947***                              
##                                     (0.650)                               
##                                                                           
## income                              0.001***                              
##                                     (0.0002)                              
##                                                                           
## census                               0.001*                               
##                                     (0.001)                               
##                                                                           
## poly(education, degree = 2)1                              100.546***      
##                                                            (18.698)       
##                                                                           
## poly(education, degree = 2)2                                -1.895        
##                                                            (8.502)        
##                                                                           
## poly(income, degree = 2)1                                 45.011***       
##                                                            (9.943)        
##                                                                           
## poly(income, degree = 2)2                                 -17.997**       
##                                                            (8.120)        
##                                                                           
## poly(census, degree = 2)1                                   12.362        
##                                                            (16.917)       
##                                                                           
## poly(census, degree = 2)2                                  17.637*        
##                                                            (10.075)       
##                                                                           
## typeprof                            10.905**                6.749         
##                                     (4.645)                (4.967)        
##                                                                           
## typewc                               0.561                  0.925         
##                                     (3.062)                (3.086)        
##                                                                           
## Constant                            -11.437               44.476***       
##                                     (7.823)                (2.180)        
##                                                                           
## --------------------------------------------------------------------------
## Observations                           98                     98          
## R2                                   0.841                  0.856         
## Adjusted R2                          0.832                  0.843         
## Residual Std. Error             7.006 (df = 92)        6.771 (df = 89)    
## F Statistic                  97.117*** (df = 5; 92) 66.165*** (df = 8; 89)
## ==========================================================================
## Note:                                          *p<0.1; **p<0.05; ***p<0.01

5. Akaike information criterion (AIC) and Bayesian information criterion (BIC)

AIC(lm_null,lm1,lm2.ortho)
## Warning in AIC.default(lm_null, lm1, lm2.ortho): models are not all fitted
## to the same number of observations
##           df      AIC
## lm_null    2 872.8732
## lm1        7 667.4750
## lm2.ortho 10 663.5479
BIC(lm_null,lm1,lm2.ortho)
## Warning in BIC.default(lm_null, lm1, lm2.ortho): models are not all fitted
## to the same number of observations
##           df      BIC
## lm_null    2 878.1232
## lm1        7 685.5698
## lm2.ortho 10 689.3975
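
The warning arises because lm_null is fitted to all 102 rows while lm1 and lm2.ortho silently drop the 4 rows with a missing type. A minimal fix (a sketch, refitting the null model on the same complete cases) makes the criteria directly comparable:

# refit the null model on the 98 complete cases used by the other models
Pres_cc <- Pres[complete.cases(Pres), ]
lm_null_cc <- lm(prestige ~ 1, data=Pres_cc)
AIC(lm_null_cc, lm1, lm2.ortho)  # no warning: all three models now use the same rows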

a). The null model has much higher AIC and BIC than the other two models.

b). AIC slightly prefers the higher-degree polynomial model (lm2.ortho).

c). BIC penalizes the extra parameters more heavily and prefers the model with only linear components (lm1).

d). The differences are not large enough to decide which model is definitely better.

6. k-fold cross-validation

library(DAAG)
## Warning: package 'DAAG' was built under R version 3.4.4
## Loading required package: lattice
## 
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
## 
##     hills
## The following object is masked from 'package:car':
## 
##     vif
cvResults_1 <- suppressWarnings(CVlm(data=Pres[complete.cases(Pres), ], form.lm=prestige ~ education+income+census+type, m=5, dots=FALSE, seed=10, legend.pos="topleft",  printit=FALSE))  

cvResults_2 <- suppressWarnings(CVlm(data=Pres[complete.cases(Pres), ], form.lm=prestige ~ poly(education, degree=2)+poly(income, degree=2)+poly(census, degree=2)+type, m=5, dots=FALSE, seed=10, legend.pos="topleft",  printit=FALSE)) 

attr(cvResults_1, 'ms')
## [1] 48.85781
attr(cvResults_2, 'ms')
## [1] 45.40219

Read the plot

When the lines of best fit for each validation fold are parallel and close to each other, the model's prediction accuracy does not vary much from one fold to the next.

Sometimes, however, the plot is hard to read because the lines overlap.

Mean squared error

We can use the mean squared error from the k-fold cross-validation to compare different linear models. Comparing the values above (45.40 vs. 48.86), the model with higher-degree terms performs slightly better in cross-validation.
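
As a quick check (a sketch, assuming the cvpred column of out-of-fold predictions that CVlm adds to its returned data frame), the 'ms' attribute is simply the mean of the squared cross-validation residuals:

# mean of squared out-of-fold residuals; should match attr(cvResults_1, 'ms')
mean((cvResults_1$prestige - cvResults_1$cvpred)^2)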