library(car)
Pres <- read.table("http://socserv.socsci.mcmaster.ca/jfox/books/Companion/data/Prestige.txt", header=TRUE)
head(Pres)
## education income women prestige census type
## GOV.ADMINISTRATORS 13.11 12351 11.16 68.8 1113 prof
## GENERAL.MANAGERS 12.26 25879 4.02 69.1 1130 prof
## ACCOUNTANTS 12.77 9271 15.70 63.4 1171 prof
## PURCHASING.OFFICERS 11.42 8865 9.11 56.8 1175 prof
## CHEMISTS 14.62 8403 11.68 73.5 2111 prof
## PHYSICISTS 15.64 11030 5.13 77.6 2113 prof
summary(Pres)
## education income women prestige
## Min. : 6.380 Min. : 611 Min. : 0.000 Min. :14.80
## 1st Qu.: 8.445 1st Qu.: 4106 1st Qu.: 3.592 1st Qu.:35.23
## Median :10.540 Median : 5930 Median :13.600 Median :43.60
## Mean :10.738 Mean : 6798 Mean :28.979 Mean :46.83
## 3rd Qu.:12.648 3rd Qu.: 8187 3rd Qu.:52.203 3rd Qu.:59.27
## Max. :15.970 Max. :25879 Max. :97.510 Max. :87.20
## census type
## Min. :1113 bc :44
## 1st Qu.:3120 prof:31
## Median :5135 wc :23
## Mean :5402 NA's: 4
## 3rd Qu.:8312
## Max. :9517
library(ggplot2)
p1 <- ggplot(Pres) + geom_point(aes(x=income, y=prestige), size=2, colour="#993399") +
  xlab("Income") + ylab("Prestige")
p2 <- ggplot(Pres) + geom_point(aes(x=education, y=prestige), size=2, colour="#993399") +
  xlab("Education") + ylab("Prestige")
p3 <- ggplot(Pres) + geom_point(aes(x=census, y=prestige), size=2, colour="#993399") +
  xlab("Census") + ylab("Prestige")
p4 <- ggplot(Pres) + geom_point(aes(x=women, y=prestige), size=2, colour="#993399") +
  xlab("Women") + ylab("Prestige")
library(gridExtra)  # par(mfrow=...) affects only base graphics, not ggplot2 objects
grid.arrange(p1, p2, p3, p4, nrow=2)
library(MASS)
lm_null <- lm(prestige ~ 1, data=Pres)  # intercept-only baseline
lm_full <- lm(prestige ~ ., data=Pres)  # all predictors; rows with missing type are dropped
#summary(lm_null)
step <- stepAIC(lm_full, trace=FALSE)  # stepwise selection by AIC (both directions by default)
#step$anova
#summary((step))
lm1 <- lm(paste("prestige ~", paste(attr(terms(step), "term.labels"), collapse="+")), data=Pres)  # refit the selected model; equivalently lm(formula(step), data=Pres)
#summary(lm1)
#forward <- stepAIC(lm_full,direction='forward',trace=F)
#forward$anova
#backward <- stepAIC(lm_full,direction='backward',trace=F)
#backward$anova
#(bic<-stepAIC(lm_full,trace=F,k=log(nrow(Pres))))
# build the model with polynomial degree 2
lm2.ortho <- lm(prestige ~ poly(education, degree=2) + poly(income, degree=2) + poly(census, degree=2) + type, data=Pres)  # the default raw=FALSE uses orthogonal polynomials
#summary(lm2.ortho)
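Why orthogonal polynomials? With the default raw=FALSE, poly() returns columns that are mutually orthogonal, which avoids the strong collinearity between a variable and its square. A minimal sketch (the name lm2.raw is ours) showing that the fit itself is unchanged:
lm2.raw <- lm(prestige ~ poly(education, degree=2, raw=TRUE) + poly(income, degree=2, raw=TRUE) + poly(census, degree=2, raw=TRUE) + type, data=Pres)
all.equal(fitted(lm2.ortho), fitted(lm2.raw))  # TRUE: same fitted values, different coefficients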
First, we can use test statistics to compare the goodness of fit of different models. The t-test, the F-test from an ANOVA, and the likelihood ratio test (LRT) are the most commonly used methods.
\(t = \hat{\beta} / \mathrm{SE}(\hat{\beta})\)
summary(lm1)$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.143676e+01 7.822993454 -1.4619413 1.471648e-01
## education 3.947385e+00 0.649775874 6.0749949 2.762787e-08
## income 9.365407e-04 0.000222084 4.2170552 5.785442e-05
## census 1.124464e-03 0.000611256 1.8395963 6.905247e-02
## typeprof 1.090528e+01 4.645053620 2.3477194 2.103224e-02
## typewc 5.605413e-01 3.061947847 0.1830669 8.551484e-01
summary(lm2.ortho)$coefficients
## Estimate Std. Error t value
## (Intercept) 44.4759530 2.180394 20.3981241
## poly(education, degree = 2)1 100.5463144 18.697973 5.3773910
## poly(education, degree = 2)2 -1.8954692 8.502187 -0.2229390
## poly(income, degree = 2)1 45.0111573 9.943029 4.5269059
## poly(income, degree = 2)2 -17.9970776 8.119913 -2.2164127
## poly(census, degree = 2)1 12.3619575 16.917145 0.7307355
## poly(census, degree = 2)2 17.6367374 10.075151 1.7505184
## typeprof 6.7488408 4.966802 1.3587901
## typewc 0.9253886 3.085571 0.2999083
## Pr(>|t|)
## (Intercept) 2.601935e-35
## poly(education, degree = 2)1 6.031383e-07
## poly(education, degree = 2)2 8.240938e-01
## poly(income, degree = 2)1 1.846576e-05
## poly(income, degree = 2)2 2.921273e-02
## poly(census, degree = 2)1 4.668601e-01
## poly(census, degree = 2)2 8.347490e-02
## typeprof 1.776469e-01
## typewc 7.649464e-01
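As a quick sanity check, the t statistics and two-sided p-values in these tables can be recomputed by hand from the estimates and standard errors; here is a minimal sketch for lm1:
coefs <- summary(lm1)$coefficients
t_manual <- coefs[, "Estimate"] / coefs[, "Std. Error"]
p_manual <- 2 * pt(abs(t_manual), df=lm1$df.residual, lower.tail=FALSE)
cbind(t_manual, p_manual)  # matches the t value and Pr(>|t|) columns above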
For model lm1, census and the typewc contrast are not significant at the 5% level and could arguably be dropped. For model lm2.ortho, the quadratic term for education, both the linear and quadratic terms for census, and the type contrasts are not significant.
The F-statistic shown in the model summary tests the fitted model against the intercept-only null model. An ANOVA F-test can also be used to compare two nested models:
anova(lm1, lm2.ortho)
## Analysis of Variance Table
##
## Model 1: prestige ~ education + income + census + type
## Model 2: prestige ~ poly(education, degree = 2) + poly(income, degree = 2) +
## poly(census, degree = 2) + type
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 92 4515.2
## 2 89 4080.2 3 434.97 3.1626 0.02843 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
At the 5% level, the polynomial model lm2.ortho is preferred to model lm1, which contains only linear terms.
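To make the F test concrete, the statistic in the table above can be reproduced from the two residual sums of squares:
rss1 <- sum(residuals(lm1)^2)        # 4515.2, on 92 residual df
rss2 <- sum(residuals(lm2.ortho)^2)  # 4080.2, on 89 residual df
df1 <- lm1$df.residual
df2 <- lm2.ortho$df.residual
F_stat <- ((rss1 - rss2) / (df1 - df2)) / (rss2 / df2)
c(F_stat, 1 - pf(F_stat, df1 - df2, df2))  # ~3.16 and ~0.028, as reported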
The likelihood ratio test can also be used to compare two nested models.
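Under the null hypothesis that the smaller model is adequate, the statistic \(2(\ell_{full} - \ell_{reduced})\) is asymptotically \(\chi^2\)-distributed with degrees of freedom equal to the difference in the number of estimated parameters: 5 for lm1 versus lm_null and 3 for lm2.ortho versus lm1, which are the values passed to pchisq below.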
logLik(lm_null)
## 'log Lik.' -434.4366 (df=2)
logLik(lm1)
## 'log Lik.' -326.7375 (df=7)
logLik(lm2.ortho)
## 'log Lik.' -321.7739 (df=10)
dl <- 2 * as.numeric(logLik(lm1) - logLik(lm_null))  # LRT statistic: lm1 vs. lm_null
1 - pchisq(dl, 5)  # 5 = difference in number of parameters
## [1] 0
dl <- 2 * as.numeric(logLik(lm2.ortho) - logLik(lm1))  # LRT statistic: lm2.ortho vs. lm1
1 - pchisq(dl, 3)  # 3 = difference in number of parameters
## [1] 0.01919575
The ANOVA and the LRT give very similar results and lead to the same conclusions: a). The null model can be rejected. b). The polynomial model is preferred to the model with only linear components in this case. (A caveat for the first comparison: lm_null is fitted to all 102 rows, while lm1 and lm2.ortho drop the 4 occupations with a missing type, so strictly the null model should be refitted on the same observations.)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.1. https://CRAN.R-project.org/package=stargazer
stargazer(lm1,lm2.ortho,title="Comparison of 2 Regression outputs",type="text",align=TRUE)
##
## Comparison of 2 Regression outputs
## ==========================================================================
## Dependent variable:
## ---------------------------------------------
## prestige ~ prestige
## (1) (2)
## --------------------------------------------------------------------------
## education 3.947***
## (0.650)
##
## income 0.001***
## (0.0002)
##
## census 0.001*
## (0.001)
##
## poly(education, degree = 2)1 100.546***
## (18.698)
##
## poly(education, degree = 2)2 -1.895
## (8.502)
##
## poly(income, degree = 2)1 45.011***
## (9.943)
##
## poly(income, degree = 2)2 -17.997**
## (8.120)
##
## poly(census, degree = 2)1 12.362
## (16.917)
##
## poly(census, degree = 2)2 17.637*
## (10.075)
##
## typeprof 10.905** 6.749
## (4.645) (4.967)
##
## typewc 0.561 0.925
## (3.062) (3.086)
##
## Constant -11.437 44.476***
## (7.823) (2.180)
##
## --------------------------------------------------------------------------
## Observations 98 98
## R2 0.841 0.856
## Adjusted R2 0.832 0.843
## Residual Std. Error 7.006 (df = 92) 6.771 (df = 89)
## F Statistic 97.117*** (df = 5; 92) 66.165*** (df = 8; 89)
## ==========================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
AIC(lm_null,lm1,lm2.ortho)
## Warning in AIC.default(lm_null, lm1, lm2.ortho): models are not all fitted
## to the same number of observations
## df AIC
## lm_null 2 872.8732
## lm1 7 667.4750
## lm2.ortho 10 663.5479
BIC(lm_null,lm1,lm2.ortho)
## Warning in BIC.default(lm_null, lm1, lm2.ortho): models are not all fitted
## to the same number of observations
## df BIC
## lm_null 2 878.1232
## lm1 7 685.5698
## lm2.ortho 10 689.3975
a). The null model has a much higher AIC and BIC than the other two models.
b). AIC slightly prefers the higher-degree model.
c). BIC penalizes the extra parameters of the higher-degree model more heavily and slightly prefers the model with only linear components.
d). The differences are not large enough to decide which model is definitely better.
The warnings arise because lm_null is fitted to all 102 observations while lm1 and lm2.ortho use only the 98 complete cases, so the criteria for the null model are not strictly comparable; refitting it on the complete cases fixes this, as shown below.
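A minimal sketch of that refit (assuming it is acceptable to drop the 4 rows with missing type, as the other two models do implicitly; the names Pres_cc and lm_null_cc are ours):
Pres_cc <- Pres[complete.cases(Pres), ]  # the 98 complete cases used by lm1 and lm2.ortho
lm_null_cc <- lm(prestige ~ 1, data=Pres_cc)
AIC(lm_null_cc, lm1, lm2.ortho)  # no warning: all models now share the same observations
BIC(lm_null_cc, lm1, lm2.ortho)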
library(DAAG)
## Warning: package 'DAAG' was built under R version 3.4.4
## Loading required package: lattice
##
## Attaching package: 'DAAG'
## The following object is masked from 'package:MASS':
##
## hills
## The following object is masked from 'package:car':
##
## vif
cvResults_1 <- suppressWarnings(CVlm(data=Pres[complete.cases(Pres), ],
  form.lm=prestige ~ education+income+census+type,
  m=5, dots=FALSE, seed=10, legend.pos="topleft", printit=FALSE))
cvResults_2 <- suppressWarnings(CVlm(data=Pres[complete.cases(Pres), ],
  form.lm=prestige ~ poly(education, degree=2)+poly(income, degree=2)+poly(census, degree=2)+type,
  m=5, dots=FALSE, seed=10, legend.pos="topleft", printit=FALSE))
attr(cvResults_1, 'ms')
## [1] 48.85781
attr(cvResults_2, 'ms')
## [1] 45.40219
Reading the plot: when the lines of best fit for each fold are parallel and close to each other, the model's prediction accuracy does not vary much from one sample to another. Sometimes, however, the plot is hard to read because the lines stack on top of each other.
Mean squared error: we can use the mean squared error \(\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2\) from k-fold cross-validation to compare different linear models. Comparing the two values above (48.86 versus 45.40), the higher-degree model performs slightly better under 5-fold cross-validation.
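As a cross-check, the same mean squared errors can be recomputed directly from the out-of-fold predictions that CVlm stores in the cvpred column of the returned data frame (a minimal sketch, assuming that column layout):
mean((cvResults_1$prestige - cvResults_1$cvpred)^2)  # ~48.86
mean((cvResults_2$prestige - cvResults_2$cvpred)^2)  # ~45.40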