## Load data
library(faraway)
data(seatpos)
head(seatpos)
Age Weight HtShoes Ht Seated Arm Thigh Leg hipcenter
1 46 180 187.2 184.9 95.2 36.1 45.3 41.3 -206.30
2 31 175 167.5 165.5 83.8 32.9 36.5 35.9 -178.21
3 23 100 153.6 152.2 82.9 26.0 36.6 31.0 -71.67
4 19 185 190.3 187.4 97.3 37.4 44.1 41.0 -257.72
5 23 159 178.0 174.1 93.9 29.5 40.1 36.9 -173.23
6 47 170 178.7 177.0 92.4 36.0 43.2 37.4 -185.15
Car seat position depending driver size
Description:
Car drivers like to adjust the seat position for their own
comfort. Car designers would find it helpful to know where
different drivers will position the seat depending on their size
and age. Researchers at the HuMoSim laboratory at the University
of Michigan collected data on 38 drivers.
Format:
The dataset contains the following variables
‘Age’ Age in years
‘Weight’ Weight in lbs
‘HtShoes’ Height in shoes in cm
‘Ht’ Height bare foot in cm
‘Seated’ Seated height in cm
‘Arm’ lower arm length in cm
‘Thigh’ Thigh length in cm
‘Leg’ Lower leg length in cm
‘hipcenter’ horizontal distance of the midpoint of the hips from a
fixed location in the car in mm
Source:
"Linear Models in R" by Julian Faraway, CRC Press, 2004
The model is overall highly significant, but none of the predictors are significant, indicating presence of collinearity issue.
## Fit model with all predictors
g <- lm(hipcenter ~ ., data = seatpos)
## Show
summary(g)
Call:
lm(formula = hipcenter ~ ., data = seatpos)
Residuals:
Min 1Q Median 3Q Max
-73.83 -22.83 -3.68 25.02 62.34
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 436.4321 166.5716 2.62 0.014 *
Age 0.7757 0.5703 1.36 0.184
Weight 0.0263 0.3310 0.08 0.937
HtShoes -2.6924 9.7530 -0.28 0.784
Ht 0.6013 10.1299 0.06 0.953
Seated 0.5338 3.7619 0.14 0.888
Arm -1.3281 3.9002 -0.34 0.736
Thigh -1.1431 2.6600 -0.43 0.671
Leg -6.4390 4.7139 -1.37 0.182
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 37.7 on 29 degrees of freedom
Multiple R-squared: 0.687, Adjusted R-squared: 0.6
F-statistic: 7.94 on 8 and 29 DF, p-value: 0.0000131
## Create pairwise correlation matrix. Show only ones > 0.7
raw.var.cor <- round(cor(seatpos), 3)
raw.var.cor[abs(raw.var.cor) < 0.7] <- NA
raw.var.cor
Age Weight HtShoes Ht Seated Arm Thigh Leg hipcenter
Age 1 NA NA NA NA NA NA NA NA
Weight NA 1.000 0.828 0.829 0.776 NA NA 0.784 NA
HtShoes NA 0.828 1.000 0.998 0.930 0.752 0.725 0.908 -0.797
Ht NA 0.829 0.998 1.000 0.928 0.752 0.735 0.910 -0.799
Seated NA 0.776 0.930 0.928 1.000 NA NA 0.812 -0.731
Arm NA NA 0.752 0.752 NA 1.000 NA 0.754 NA
Thigh NA NA 0.725 0.735 NA NA 1.000 NA NA
Leg NA 0.784 0.908 0.910 0.812 0.754 NA 1.000 -0.787
hipcenter NA NA -0.797 -0.799 -0.731 NA NA -0.787 1.000
## Use faraway::vif()
vif(g)
Age Weight HtShoes Ht Seated Arm Thigh Leg
1.998 3.647 307.429 333.138 8.951 4.496 2.763 6.694
## sqrt(vif)
sqrt(vif(g))
Age Weight HtShoes Ht Seated Arm Thigh Leg
1.413 1.910 17.534 18.252 2.992 2.120 1.662 2.587
sqrt(VIF) 17.5 for the HtShoes variable means the SE of this variable is 17.5 times higher than it would be have been without collinearity.
## Fit model with randomly perturbated outcomes
g2 <- lm(hipcenter + 10*rnorm(38) ~ ., data = seatpos)
## Show
summary(g2)$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 479.0004 183.6303 2.60850 0.01423
Age 0.7596 0.6287 1.20808 0.23678
Weight 0.1577 0.3649 0.43235 0.66869
HtShoes -3.6828 10.7518 -0.34253 0.73442
Ht 0.5797 11.1673 0.05191 0.95895
Seated 1.0594 4.1472 0.25546 0.80017
Arm -0.6137 4.2996 -0.14273 0.88749
Thigh -1.9316 2.9324 -0.65872 0.51527
Leg -4.4631 5.1966 -0.85885 0.39747
## Original
summary(g)$coef
Estimate Std. Error t value Pr(>|t|)
(Intercept) 436.43213 166.5716 2.62009 0.01384
Age 0.77572 0.5703 1.36012 0.18427
Weight 0.02631 0.3310 0.07950 0.93718
HtShoes -2.69241 9.7530 -0.27606 0.78446
Ht 0.60134 10.1299 0.05936 0.95307
Seated 0.53375 3.7619 0.14188 0.88815
Arm -1.32807 3.9002 -0.34051 0.73592
Thigh -1.14312 2.6600 -0.42974 0.67056
Leg -6.43905 4.7139 -1.36598 0.18245
The R-squared is almost as good as the full model.
## Check pairwise correlation of length variables
round(cor(seatpos[,3:8]), 3)
HtShoes Ht Seated Arm Thigh Leg
HtShoes 1.000 0.998 0.930 0.752 0.725 0.908
Ht 0.998 1.000 0.928 0.752 0.735 0.910
Seated 0.930 0.928 1.000 0.625 0.607 0.812
Arm 0.752 0.752 0.625 1.000 0.671 0.754
Thigh 0.725 0.735 0.607 0.671 1.000 0.650
Leg 0.908 0.910 0.812 0.754 0.650 1.000
## Fit model with only one length variable
g3 <- lm(hipcenter ~ Age + Weight + Ht, data = seatpos)
## Show
summary(g3)
Call:
lm(formula = hipcenter ~ Age + Weight + Ht, data = seatpos)
Residuals:
Min 1Q Median 3Q Max
-91.53 -23.00 2.16 24.95 53.98
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 528.29773 135.31295 3.90 0.00043 ***
Age 0.51950 0.40804 1.27 0.21159
Weight 0.00427 0.31172 0.01 0.98915
Ht -4.21190 0.99906 -4.22 0.00017 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 36.5 on 34 degrees of freedom
Multiple R-squared: 0.656, Adjusted R-squared: 0.626
F-statistic: 21.6 on 3 and 34 DF, p-value: 0.0000000513
## Load glmpath (L1 Regularization Path for Generalized Linear Models and Cox Proportional Hazards Model)
library(glmpath)
## Fit model with all predictors using rms::ols()
library(rms)
g2 <- ols(hipcenter ~ Age + Weight + HtShoes + Ht + Seated + Arm + Thigh + Leg, data = seatpos,
x = TRUE, y = TRUE)
## model matrix x, and outcome vector y
dat.glmpath <- list(x = g2$x, y = g2$y)
## Fit logistic models over a range of L1
gpath <- glmpath(data = dat.glmpath, family = gaussian(link = "identity"))
## Plot results
par(mar = c(5.1, 4.1, 4.1, 5.1))
plot(gpath, type = c("coefficients", "aic", "bic")[1]) # coefficients
plot(gpath, type = c("coefficients", "aic", "bic")[2]) # AIC
plot(gpath, type = c("coefficients", "aic", "bic")[3]) # BIC
## List b.predictor (matrix of coefficient estimates from the predictor steps) with aic at each L1 bound
b.pred.aic <- cbind(gpath$b.predictor, aic = gpath$aic)
b.pred.aic
Intercept Intercept Age Weight HtShoes Ht Seated Arm Thigh Leg aic
1 -164.88 0 0.0000 0.000000 0.000 0.0000 0.0000 0.0000 0.0000 0.000 421.5
2 -47.28 0 0.0000 0.000000 0.000 -0.6956 0.0000 0.0000 0.0000 0.000 415.5
3 363.98 0 0.0000 0.000000 0.000 -2.1229 0.0000 0.0000 0.0000 -4.686 387.4
4 406.67 0 0.2810 0.000000 0.000 -2.2209 0.0000 0.0000 0.0000 -5.679 385.6
5 413.42 0 0.3491 0.000000 0.000 -2.1669 0.0000 0.0000 -0.2418 -5.925 386.9
6 430.41 0 0.5098 0.000000 -1.212 -0.8155 0.0000 0.0000 -0.8501 -6.475 388.0
7 433.54 0 0.6428 0.000000 -1.882 0.0000 0.0000 -0.6786 -1.0546 -6.506 387.6
8 435.90 0 0.7401 0.000000 -1.795 0.0000 0.0000 -1.1973 -1.1674 -6.494 387.5
9 437.78 0 0.7428 0.005847 -1.806 0.0000 0.0000 -1.2245 -1.1666 -6.502 389.5
10 436.90 0 0.7644 0.027583 -2.114 0.0000 0.5166 -1.2963 -1.0950 -6.412 391.5
11 436.43 0 0.7757 0.026314 -2.692 0.6011 0.5337 -1.3281 -1.1431 -6.439 393.5
## Show coefficients with the smallest AIC
b.pred.aic[which.min(gpath$aic), , drop = FALSE]
Intercept Intercept Age Weight HtShoes Ht Seated Arm Thigh Leg aic
4 406.7 0 0.281 0 0 -2.221 0 0 0 -5.679 385.6
## Extract coefficients for which the AIC is the smallest
gpath$b.predictor[which.min(gpath$aic), -1]
Intercept Age Weight HtShoes Ht Seated Arm Thigh Leg
0.000 0.281 0.000 0.000 -2.221 0.000 0.000 0.000 -5.679