Note that GENDER = 1 corresponds to Male and GENDER = 0 corresponds to Female.
# Recode the 0/1 GENDER indicator as a labelled factor. The levels are listed
# explicitly so that the 0 -> "Female", 1 -> "Male" mapping is stated in the
# code rather than implied by the sort order of the values present in the data.
ES$GENDER <- factor(
  ES$GENDER,
  levels = c(0, 1),
  labels = c("Female", "Male")
)
# Show the recoded data as an interactive table.
DT::datatable(ES)
Consider the model fit from Example 4.20.
# Regress LNSAL on every remaining column of ES; `[, -c(1)]` drops the first
# column so that it is not picked up as a predictor by the `.` shorthand.
mod420 <- lm(LNSAL ~ ., data = ES[, -c(1)])
# Print the coefficient table and overall fit statistics.
summary(mod420)
Call:
lm(formula = LNSAL ~ ., data = ES[, -c(1)])
Residuals:
Min 1Q Median 3Q Max
-0.163466 -0.048971 -0.001111 0.041345 0.124534
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.862e+00 9.703e-02 101.634 < 2e-16 ***
EXP 4.364e-02 3.761e-03 11.604 < 2e-16 ***
EDUC 3.094e-02 2.950e-03 10.487 < 2e-16 ***
GENDERMale 1.166e-01 3.696e-02 3.155 0.00217 **
NUMSUP 3.259e-04 7.850e-05 4.152 7.36e-05 ***
ASSETS 2.391e-03 4.439e-04 5.386 5.49e-07 ***
EXPSQ -6.347e-04 1.384e-04 -4.588 1.41e-05 ***
GEN_SUP 3.020e-04 9.239e-05 3.269 0.00152 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.06596 on 92 degrees of freedom
Multiple R-squared: 0.9401, Adjusted R-squared: 0.9355
F-statistic: 206.3 on 7 and 92 DF, p-value: < 2.2e-16
# This is the same as mod420, but with every predictor named explicitly in the
# formula instead of using `.` on a column-subsetted data frame.
mod420B <- lm(LNSAL ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + EXPSQ + GEN_SUP, data = ES)
# Print the summary so it can be compared with the summary of mod420.
summary(mod420B)
Call:
lm(formula = LNSAL ~ EXP + EDUC + GENDER + NUMSUP + ASSETS +
EXPSQ + GEN_SUP, data = ES)
Residuals:
Min 1Q Median 3Q Max
-0.163466 -0.048971 -0.001111 0.041345 0.124534
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.862e+00 9.703e-02 101.634 < 2e-16 ***
EXP 4.364e-02 3.761e-03 11.604 < 2e-16 ***
EDUC 3.094e-02 2.950e-03 10.487 < 2e-16 ***
GENDERMale 1.166e-01 3.696e-02 3.155 0.00217 **
NUMSUP 3.259e-04 7.850e-05 4.152 7.36e-05 ***
ASSETS 2.391e-03 4.439e-04 5.386 5.49e-07 ***
EXPSQ -6.347e-04 1.384e-04 -4.588 1.41e-05 ***
GEN_SUP 3.020e-04 9.239e-05 3.269 0.00152 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.06596 on 92 degrees of freedom
Multiple R-squared: 0.9401, Adjusted R-squared: 0.9355
F-statistic: 206.3 on 7 and 92 DF, p-value: < 2.2e-16
Slightly augmented data
# Load the augmented data set and show it as read, before any renaming.
ES2 <- read.table("EXECSAL2.txt", header = TRUE)
DT::datatable(ES2)
# Replace the column names with descriptive ones.
names(ES2) <- c(
  "ID", "LogSal", "EXP", "EDUC", "GENDER", "NUMSUP",
  "ASSETS", "BOARD", "AGE", "PROFITS", "IEXP", "TSALES"
)
# Turn the indicator columns into labelled factors.
ES2[["GENDER"]] <- factor(ES2[["GENDER"]], labels = c("Female", "Male"))
ES2[c("BOARD", "IEXP")] <- lapply(
  ES2[c("BOARD", "IEXP")],
  factor,
  labels = c("No", "Yes")
)
# Show the cleaned data without row names.
DT::datatable(ES2, rownames = FALSE)
# Intercept-only model (ID column dropped); used as the start of forward
# selection.
FM <- lm(LogSal ~ 1, data = ES2[, -c(1)])
summary(FM)
Call:
lm(formula = LogSal ~ 1, data = ES2[, -c(1)])
Residuals:
Min 1Q Median 3Q Max
-0.79072 -0.17304 0.00768 0.15298 0.60838
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.45502 0.02598 440.9 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.2598 on 99 degrees of freedom
# Forward selection by AIC: begin at the intercept-only fit FM and let step()
# add, one at a time, terms taken from the scope formula.
step(
  FM,
  scope = LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + BOARD + AGE +
    PROFITS + IEXP + TSALES,
  direction = "forward"
)
Start: AIC=-268.57
LogSal ~ 1
Df Sum of Sq RSS AIC
+ EXP 1 4.1364 2.5462 -363.06
+ AGE 1 2.6488 4.0338 -317.05
+ GENDER 1 1.0492 5.6335 -283.64
+ EDUC 1 0.3264 6.3563 -271.57
+ NUMSUP 1 0.2897 6.3930 -271.00
+ ASSETS 1 0.2774 6.4052 -270.81
<none> 6.6827 -268.57
+ TSALES 1 0.0201 6.6625 -266.87
+ PROFITS 1 0.0181 6.6646 -266.84
+ BOARD 1 0.0169 6.6657 -266.82
+ IEXP 1 0.0002 6.6824 -266.57
Step: AIC=-363.06
LogSal ~ EXP
Df Sum of Sq RSS AIC
+ GENDER 1 0.87027 1.6760 -402.88
+ EDUC 1 0.32522 2.2210 -374.72
+ NUMSUP 1 0.31253 2.2337 -374.15
+ ASSETS 1 0.26811 2.2781 -372.18
<none> 2.5462 -363.06
+ BOARD 1 0.04591 2.5003 -362.87
+ TSALES 1 0.04132 2.5049 -362.69
+ PROFITS 1 0.01466 2.5316 -361.63
+ AGE 1 0.00843 2.5378 -361.39
+ IEXP 1 0.00381 2.5424 -361.21
Step: AIC=-402.88
LogSal ~ EXP + GENDER
Df Sum of Sq RSS AIC
+ NUMSUP 1 0.60068 1.0753 -445.26
+ EDUC 1 0.28150 1.3945 -419.27
+ ASSETS 1 0.19195 1.4840 -413.04
+ BOARD 1 0.10205 1.5739 -407.16
<none> 1.6760 -402.88
+ PROFITS 1 0.00735 1.6686 -401.32
+ TSALES 1 0.00137 1.6746 -400.96
+ IEXP 1 0.00022 1.6757 -400.89
+ AGE 1 0.00000 1.6760 -400.88
Step: AIC=-445.26
LogSal ~ EXP + GENDER + NUMSUP
Df Sum of Sq RSS AIC
+ EDUC 1 0.45697 0.61832 -498.59
+ ASSETS 1 0.11593 0.95936 -454.67
+ BOARD 1 0.02841 1.04688 -445.94
<none> 1.07529 -445.26
+ AGE 1 0.00623 1.06906 -443.84
+ PROFITS 1 0.00622 1.06907 -443.84
+ TSALES 1 0.00044 1.07485 -443.30
+ IEXP 1 0.00003 1.07526 -443.26
Step: AIC=-498.59
LogSal ~ EXP + GENDER + NUMSUP + EDUC
Df Sum of Sq RSS AIC
+ ASSETS 1 0.087902 0.53041 -511.93
<none> 0.61832 -498.59
+ BOARD 1 0.009688 0.60863 -498.17
+ IEXP 1 0.002451 0.61587 -496.99
+ PROFITS 1 0.001376 0.61694 -496.82
+ AGE 1 0.000343 0.61797 -496.65
+ TSALES 1 0.000000 0.61832 -496.59
Step: AIC=-511.93
LogSal ~ EXP + GENDER + NUMSUP + EDUC + ASSETS
Df Sum of Sq RSS AIC
<none> 0.53041 -511.93
+ IEXP 1 0.0092875 0.52113 -511.69
+ BOARD 1 0.0037568 0.52666 -510.64
+ TSALES 1 0.0003588 0.53006 -509.99
+ PROFITS 1 0.0002463 0.53017 -509.97
+ AGE 1 0.0000122 0.53040 -509.93
Call:
lm(formula = LogSal ~ EXP + GENDER + NUMSUP + EDUC + ASSETS,
data = ES2[, -c(1)])
Coefficients:
(Intercept) EXP GENDERMale NUMSUP EDUC
9.9619345 0.0272762 0.2246932 0.0005244 0.0290921
ASSETS
0.0019623
# Reassign FM (previously the intercept-only model) to the full model: LogSal
# regressed on every column of ES2 except the first one (ID).
FM <- lm(LogSal ~ ., data = ES2[, -c(1)])
# Print the coefficient table and overall fit statistics of the full model.
summary(FM)
Call:
lm(formula = LogSal ~ ., data = ES2[, -c(1)])
Residuals:
Min 1Q Median 3Q Max
-0.201770 -0.050464 0.004435 0.046826 0.185952
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.002e+01 1.481e-01 67.692 < 2e-16 ***
EXP 2.792e-02 1.773e-03 15.745 < 2e-16 ***
EDUC 2.903e-02 3.426e-03 8.475 4.57e-13 ***
GENDERMale 2.243e-01 1.708e-02 13.135 < 2e-16 ***
NUMSUP 5.140e-04 4.922e-05 10.443 < 2e-16 ***
ASSETS 2.048e-03 5.250e-04 3.901 0.000186 ***
BOARDYes -1.538e-02 1.686e-02 -0.912 0.364124
AGE -5.097e-04 1.438e-03 -0.355 0.723795
PROFITS -2.633e-03 5.128e-03 -0.513 0.608896
IEXPYes -2.656e-02 2.037e-02 -1.304 0.195613
TSALES -9.774e-04 2.959e-03 -0.330 0.741955
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.07608 on 89 degrees of freedom
Multiple R-squared: 0.9229, Adjusted R-squared: 0.9142
F-statistic: 106.5 on 10 and 89 DF, p-value: < 2.2e-16
# Backward elimination by AIC, starting from the full model currently stored
# in FM.
step(FM, scope = LogSal ~ ., direction = "backward")
Start: AIC=-504.84
LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + BOARD + AGE +
PROFITS + IEXP + TSALES
Df Sum of Sq RSS AIC
- TSALES 1 0.00063 0.51583 -506.71
- AGE 1 0.00073 0.51593 -506.70
- PROFITS 1 0.00153 0.51673 -506.54
- BOARD 1 0.00482 0.52002 -505.91
- IEXP 1 0.00984 0.52504 -504.94
<none> 0.51520 -504.84
- ASSETS 1 0.08810 0.60330 -491.05
- EDUC 1 0.41581 0.93102 -447.66
- NUMSUP 1 0.63133 1.14653 -426.84
- GENDER 1 0.99872 1.51393 -399.05
- EXP 1 1.43512 1.95032 -373.72
Step: AIC=-506.71
LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + BOARD + AGE +
PROFITS + IEXP
Df Sum of Sq RSS AIC
- AGE 1 0.00050 0.51633 -508.62
- PROFITS 1 0.00149 0.51732 -508.43
- BOARD 1 0.00448 0.52031 -507.85
- IEXP 1 0.00992 0.52575 -506.81
<none> 0.51583 -506.71
- ASSETS 1 0.08769 0.60352 -493.01
- EDUC 1 0.41593 0.93176 -449.59
- NUMSUP 1 0.63878 1.15461 -428.14
- GENDER 1 1.03375 1.54959 -398.72
- EXP 1 1.52826 2.04409 -371.02
Step: AIC=-508.62
LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + BOARD + PROFITS +
IEXP
Df Sum of Sq RSS AIC
- PROFITS 1 0.0015 0.5178 -510.33
- BOARD 1 0.0040 0.5203 -509.85
- IEXP 1 0.0096 0.5260 -508.77
<none> 0.5163 -508.62
- ASSETS 1 0.0898 0.6061 -494.58
- EDUC 1 0.4243 0.9406 -450.64
- NUMSUP 1 0.6384 1.1547 -430.13
- GENDER 1 1.0503 1.5666 -399.62
- EXP 1 3.9764 4.4927 -294.27
Step: AIC=-510.33
LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + BOARD + IEXP
Df Sum of Sq RSS AIC
- BOARD 1 0.0033 0.5211 -511.69
- IEXP 1 0.0089 0.5267 -510.64
<none> 0.5178 -510.33
- ASSETS 1 0.0885 0.6064 -496.55
- EDUC 1 0.4230 0.9408 -452.62
- NUMSUP 1 0.6420 1.1598 -431.69
- GENDER 1 1.0490 1.5668 -401.61
- EXP 1 3.9749 4.4927 -296.27
Step: AIC=-511.69
LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS + IEXP
Df Sum of Sq RSS AIC
- IEXP 1 0.0093 0.5304 -511.93
<none> 0.5211 -511.69
- ASSETS 1 0.0947 0.6159 -496.99
- EDUC 1 0.4347 0.9558 -453.04
- NUMSUP 1 0.6868 1.2079 -429.63
- GENDER 1 1.0466 1.5677 -403.55
- EXP 1 3.9718 4.4929 -298.27
Step: AIC=-511.93
LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS
Df Sum of Sq RSS AIC
<none> 0.5304 -511.93
- ASSETS 1 0.0879 0.6183 -498.59
- EDUC 1 0.4289 0.9594 -454.67
- NUMSUP 1 0.6908 1.2212 -430.53
- GENDER 1 1.0656 1.5961 -403.76
- EXP 1 3.9627 4.4932 -300.26
Call:
lm(formula = LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS,
data = ES2[, -c(1)])
Coefficients:
(Intercept) EXP EDUC GENDERMale NUMSUP
9.9619345 0.0272762 0.0290921 0.2246932 0.0005244
ASSETS
0.0019623
leaps
library(leaps)
# Subset selection for LogSal over all columns of ES2 except the first (ID);
# nvmax = 10 allows subsets of up to 10 terms.
ans <- regsubsets(LogSal ~ ., data = ES2[, -c(1)], nvmax = 10)
# Print which terms are included in the selected subset of each size.
summary(ans)
Subset selection object
Call: regsubsets.formula(LogSal ~ ., data = ES2[, -c(1)], nvmax = 10)
10 Variables (and intercept)
Forced in Forced out
EXP FALSE FALSE
EDUC FALSE FALSE
GENDERMale FALSE FALSE
NUMSUP FALSE FALSE
ASSETS FALSE FALSE
BOARDYes FALSE FALSE
AGE FALSE FALSE
PROFITS FALSE FALSE
IEXPYes FALSE FALSE
TSALES FALSE FALSE
1 subsets of each size up to 10
Selection Algorithm: exhaustive
EXP EDUC GENDERMale NUMSUP ASSETS BOARDYes AGE PROFITS IEXPYes
1 ( 1 ) "*" " " " " " " " " " " " " " " " "
2 ( 1 ) "*" " " "*" " " " " " " " " " " " "
3 ( 1 ) "*" " " "*" "*" " " " " " " " " " "
4 ( 1 ) "*" "*" "*" "*" " " " " " " " " " "
5 ( 1 ) "*" "*" "*" "*" "*" " " " " " " " "
6 ( 1 ) "*" "*" "*" "*" "*" " " " " " " "*"
7 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " "*"
8 ( 1 ) "*" "*" "*" "*" "*" "*" " " "*" "*"
9 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
10 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*"
TSALES
1 ( 1 ) " "
2 ( 1 ) " "
3 ( 1 ) " "
4 ( 1 ) " "
5 ( 1 ) " "
6 ( 1 ) " "
7 ( 1 ) " "
8 ( 1 ) " "
9 ( 1 ) " "
10 ( 1 ) "*"
# Draw the regsubsets plot for each of the four criteria in a 2 x 2 grid.
# par() returns the previous settings, so they are saved here and restored
# afterwards instead of assuming the earlier layout was a single panel.
old_par <- par(mfrow = c(2, 2))
plot(ans, scale = "Cp")
plot(ans, scale = "adjr2")
plot(ans, scale = "r2")
plot(ans, scale = "bic")
par(old_par)
All together now.
# Collect the selection criteria from the regsubsets summary into one data
# frame; summary(ans) is evaluated once and its components are read by name.
DF <- with(
  summary(ans),
  data.frame(R2 = rsq, Cp = cp, BIC = bic, R2adj = adjr2)
)
# Show the criteria rounded to four decimal places.
DT::datatable(round(DF, 4))
See http://robjhyndman.com/hyndsight/crossvalidation/ and http://stat-ata-asu.github.io/STT3851ClassRepo/Rmarkdown/Cross-ValidationInClassHO.pdf
library(car)
Warning in stats::runif(1L): '.Random.seed' is not an integer vector but of
type 'NULL', so ignored
# Refit the five-predictor model that both the forward and the backward step()
# runs ended on, this time on the full ES2 data frame.
finmod <- lm(LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS, data = ES2)
# Print the coefficient table and overall fit statistics.
summary(finmod)
Call:
lm(formula = LogSal ~ EXP + EDUC + GENDER + NUMSUP + ASSETS,
data = ES2)
Residuals:
Min 1Q Median 3Q Max
-0.201219 -0.056016 -0.003581 0.053656 0.187251
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.9619345 0.1010567 98.578 < 2e-16 ***
EXP 0.0272762 0.0010293 26.501 < 2e-16 ***
EDUC 0.0290921 0.0033367 8.719 9.71e-14 ***
GENDERMale 0.2246932 0.0163503 13.742 < 2e-16 ***
NUMSUP 0.0005244 0.0000474 11.064 < 2e-16 ***
ASSETS 0.0019623 0.0004972 3.947 0.000153 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.07512 on 94 degrees of freedom
Multiple R-squared: 0.9206, Adjusted R-squared: 0.9164
F-statistic: 218.1 on 5 and 94 DF, p-value: < 2.2e-16
# Residual plots for finmod; the call also prints a table with one test per
# term plus a Tukey test.
# NOTE(review): the EXP row of that table is significant, which may indicate
# that a squared EXP term (cf. EXPSQ in mod420) is missing -- confirm.
residualPlots(finmod)
Test stat Pr(>|t|)
EXP -4.173 0.000
EDUC -1.115 0.268
GENDER NA NA
NUMSUP 0.227 0.821
ASSETS -0.384 0.702
Tukey test -1.622 0.105
# Index plots of influence diagnostics for finmod.
influenceIndexPlot(finmod)
# Influence plot for finmod; the call also prints StudRes, Hat and CookD for
# the observations it flags.
influencePlot(finmod)
StudRes Hat CookD
40 -2.932805 0.09830851 0.3802668
71 -1.004007 0.12818992 0.1571660