## Load foreign package
library(foreign)
## Load dataset converted to Stata format
nhdat <- read.dta("./lab1.dta")
## Recode
nhdat <- within(nhdat, {
race <- factor(race, 1:3, c("White","Black","Other"))
height[height %in% 999] <- NA
})
## Restrict to complete cases
restricted <- nhdat[complete.cases(nhdat[,c("fvc","ageyrs","height","wt","sex","race")]),]
restricted <- restricted[5 <= restricted$ageyrs & restricted$ageyrs <= 18,]
## Univariable
res.lm1 <- lm(fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex, restricted)
summary(res.lm1)
Call:
lm(formula = fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex,
data = restricted)
Residuals:
Min 1Q Median 3Q Max
-372.0 -26.0 0.0 26.7 443.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -306.654 14.288 -21.46 < 2e-16 ***
ageyrs 2.908 0.564 5.15 2.7e-07 ***
height 3.087 0.134 23.04 < 2e-16 ***
wt 1.679 0.105 16.01 < 2e-16 ***
raceBlack -46.104 2.456 -18.77 < 2e-16 ***
raceOther -12.501 5.762 -2.17 0.03 *
sex -52.764 6.513 -8.10 7.8e-16 ***
ageyrs:sex 7.011 0.513 13.67 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 48.7 on 3010 degrees of freedom
Multiple R-squared: 0.84, Adjusted R-squared: 0.84
F-statistic: 2.26e+03 on 7 and 3010 DF, p-value: <2e-16
## Load car package
library(car)
## Normality
qqPlot(res.lm1)
## Linearlity
residualPlots(model = res.lm1, terms = ~ height, fitted = F)
Test stat Pr(>|t|)
height 17.53 0
## Update the model with height^2
res.lm2 <- update(res.lm1, ~ . + I(height^2))
## Linearity with squared height
residualPlots(model = res.lm2, terms = ~ height, fitted = F)
Test stat Pr(>|t|)
height 1.045 0.296
## Independence
durbinWatsonTest(res.lm2)
lag Autocorrelation D-W Statistic p-value
1 0.04099 1.918 0.022
Alternative hypothesis: rho != 0
## Constant variance
residualPlot(res.lm2)
## Influential points
car::influencePlot(res.lm2, id.n = 5)
StudRes Hat CookD
1270 7.85274 0.001542 0.101844
1339 4.99888 0.008782 0.156220
1356 -7.12575 0.010944 0.247818
2048 0.21857 0.020655 0.010583
4047 -9.29975 0.006775 0.252457
5866 8.76009 0.005392 0.212341
13760 0.68546 0.023305 0.035297
13769 0.03419 0.028065 0.001937
14322 7.34142 0.001445 0.092278
17402 -1.97384 0.022129 0.098930
22051 -0.83827 0.032438 0.051165
22714 7.99853 0.004331 0.174041
title("Y: jackknifed residuals, X: leverage, Size: Cook D")
## Load quantile regression package
library(quantreg)
## Fit
res.rq1 <- rq(fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex + I(height^2), data = restricted)
sum.res.rq1 <- summary(res.rq1, se = "boot")
sum.res.rq1
Call: rq(formula = fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex +
I(height^2), data = restricted)
tau: [1] 0.5
Coefficients:
Value Std. Error t value Pr(>|t|)
(Intercept) 642.10693 68.54141 9.36816 0.00000
ageyrs 4.14749 0.58021 7.14828 0.00000
height -10.22399 0.98088 -10.42324 0.00000
wt 1.34796 0.12080 11.15886 0.00000
raceBlack -40.21105 2.25978 -17.79427 0.00000
raceOther -12.31840 6.16181 -1.99915 0.04568
sex -26.85090 5.12586 -5.23832 0.00000
I(height^2) 0.04601 0.00353 13.04591 0.00000
ageyrs:sex 4.31349 0.52467 8.22133 0.00000
cbind(coef(sum.res.rq1)[,1:2],
coef(summary(res.lm2))[,1:2]
)
Value Std. Error Estimate Std. Error
(Intercept) 642.10693 68.541414 677.81183 57.787543
ageyrs 4.14749 0.580208 4.76766 0.548157
height -10.22399 0.980884 -10.60566 0.791494
wt 1.34796 0.120798 1.29591 0.102292
raceBlack -40.21105 2.259776 -44.19068 2.342643
raceOther -12.31840 6.161814 -12.96797 5.489200
sex -26.85090 5.125859 -26.90883 6.377865
I(height^2) 0.04601 0.003527 0.04673 0.002666
ageyrs:sex 4.31349 0.524670 4.31170 0.512220
## Median height
ht.median <- median(restricted$height)
## Create a basis function
restricted <- within(restricted, {
ht_spline <- (height-ht.median)*(height>ht.median)
})
## Check
head(restricted[,c("height","ht_spline")], 50)
height ht_spline
447 140.4 0.0
449 138.0 0.0
451 132.9 0.0
456 143.9 0.0
465 128.3 0.0
466 137.1 0.0
477 126.5 0.0
482 145.5 0.0
484 130.1 0.0
486 132.3 0.0
493 131.6 0.0
497 149.9 0.0
500 119.0 0.0
502 130.5 0.0
503 145.7 0.0
504 130.1 0.0
505 133.0 0.0
507 133.6 0.0
508 128.7 0.0
512 150.1 0.0
514 110.3 0.0
516 122.9 0.0
517 117.4 0.0
528 175.2 19.9
530 158.8 3.5
532 166.3 11.0
534 172.9 17.6
535 161.5 6.2
537 149.7 0.0
538 159.9 4.6
539 167.9 12.6
541 166.8 11.5
542 163.8 8.5
543 162.9 7.6
544 172.4 17.1
546 179.6 24.3
548 168.4 13.1
549 174.7 19.4
551 159.3 4.0
557 162.6 7.3
559 165.7 10.4
560 155.1 0.0
769 154.4 0.0
770 138.2 0.0
789 158.9 3.6
791 137.6 0.0
794 127.0 0.0
797 152.1 0.0
804 143.9 0.0
806 119.9 0.0
## Fit spline model
res.lm3 <- lm(fvc ~ ageyrs + height + ht_spline + wt + race + sex + ageyrs:sex, restricted)
summary(res.lm3)
Call:
lm(formula = fvc ~ ageyrs + height + ht_spline + wt + race +
sex + ageyrs:sex, data = restricted)
Residuals:
Min 1Q Median 3Q Max
-411.8 -24.2 0.0 25.8 410.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -170.336 15.935 -10.69 < 2e-16 ***
ageyrs 5.156 0.557 9.26 < 2e-16 ***
height 1.998 0.144 13.88 < 2e-16 ***
ht_spline 3.372 0.202 16.66 < 2e-16 ***
wt 1.362 0.102 13.33 < 2e-16 ***
raceBlack -44.080 2.354 -18.73 < 2e-16 ***
raceOther -11.746 5.514 -2.13 0.033 *
sex -27.789 6.411 -4.33 0.000015 ***
ageyrs:sex 4.291 0.517 8.30 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 46.6 on 3009 degrees of freedom
Multiple R-squared: 0.854, Adjusted R-squared: 0.853
F-statistic: 2.2e+03 on 8 and 3009 DF, p-value: <2e-16
## qqPlot
qqPlot(res.lm3)
## residuals vs height
residualPlots(res.lm3, ~ height, fitted = F)
Test stat Pr(>|t|)
height 5.314 0
## residuals vs fitted
residualPlot(res.lm3)
## AIC
list(AIC(res.lm2),
AIC(res.lm3))
[[1]]
[1] 31740
[[2]]
[1] 31768
## Load ggplot2
library(ggplot2)
## Create a function
CoefHt <- coef(res.lm3)["height"]
CoefHtSpline <- coef(res.lm3)["ht_spline"]
basis.fun <- function(x) CoefHt*x + CoefHtSpline*(x - ht.median)*(x > ht.median)
## Plot
ggplot(data = data.frame(ht = 0), aes(x = ht)) +
layer(geom = "line", stat = "function", fun = basis.fun) +
scale_x_continuous(limit = c(120,179)) +
scale_y_continuous(name = "fvc predicted")