## Load SAS dataset
library(sas7bdat)
nh2fs.orig <- read.sas7bdat("./nh2fs2013.sas7bdat")
## Changed to lower case variable names
names(nh2fs.orig) <- tolower(names(nh2fs.orig))
## Restrict to complete cases only
variable.used <- c("pulse","ageyrs","height","wt","sex","race","smokever","booze")
nh2fs <- nh2fs.orig[,variable.used]
nh2fs <- nh2fs[complete.cases(nh2fs), ]
## Recode
nh2fs <- within(nh2fs, {
bmi <- wt / (height/100)^2
sex <- as.numeric(sex == 1)
alcohol <- as.numeric(booze > 0)
race <- factor(race)
})
res.q2 <- lm(pulse ~ ageyrs + bmi + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q2)
Call:
lm(formula = pulse ~ ageyrs + bmi + race + sex + alcohol + smokever,
data = nh2fs)
Residuals:
Min 1Q Median 3Q Max
-46.21 -8.79 -1.18 5.94 82.34
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 76.52290 0.99511 76.90 < 2e-16 ***
ageyrs -0.00883 0.01064 -0.83 0.407
bmi 0.17285 0.02828 6.11 0.000000001 ***
race2 -0.83274 0.45106 -1.85 0.065 .
race3 -0.75982 1.04653 -0.73 0.468
sex -3.33587 0.29696 -11.23 < 2e-16 ***
alcohol -0.71779 0.29558 -2.43 0.015 *
smokever 1.43266 0.30435 4.71 0.000002550 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.9 on 8547 degrees of freedom
(695 observations deleted due to missingness)
Multiple R-squared: 0.0213, Adjusted R-squared: 0.0205
F-statistic: 26.6 on 7 and 8547 DF, p-value: <2e-16
res.q3 <- lm(pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q3)
Call:
lm(formula = pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol +
smokever, data = nh2fs)
Residuals:
Min 1Q Median 3Q Max
-46.31 -8.74 -1.24 5.90 82.44
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 85.56862 2.57754 33.20 < 2e-16 ***
ageyrs -0.00657 0.01065 -0.62 0.53768
bmi -0.48953 0.17641 -2.77 0.00553 **
I(bmi^2) 0.01141 0.00300 3.80 0.00014 ***
race2 -0.87723 0.45085 -1.95 0.05172 .
race3 -0.85909 1.04603 -0.82 0.41151
sex -3.15339 0.30058 -10.49 < 2e-16 ***
alcohol -0.68244 0.29549 -2.31 0.02094 *
smokever 1.37055 0.30455 4.50 0.0000069 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.9 on 8546 degrees of freedom
Multiple R-squared: 0.023, Adjusted R-squared: 0.022
F-statistic: 25.1 on 8 and 8546 DF, p-value: <2e-16
anova(res.q2, res.q3, test = "LRT")
Analysis of Variance Table
Model 1: pulse ~ ageyrs + bmi + race + sex + alcohol + smokever
Model 2: pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever
Res.Df RSS Df Sum of Sq Pr(>Chi)
1 8547 1415734
2 8546 1413341 1 2393 0.00014 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Obtain median
bmi_median <- median(nh2fs$bmi)
bmi_median
[1] 25.38
nh2fs <- within(nh2fs, {
## subtracted term and indicator term
bmi_spline <- (bmi - bmi_median)
bmi_spline_indicator <- as.numeric(bmi >= bmi_median)
bmi_spline <- bmi_spline * bmi_spline_indicator
})
head(nh2fs[,c("height","wt","bmi","bmi_spline")], 30)
height wt bmi bmi_spline
1 174.6 62.48 20.50 0.0000
3 169.0 66.34 23.23 0.0000
4 162.6 94.46 35.73 10.3459
5 163.1 74.28 27.92 2.5412
7 153.9 54.55 23.03 0.0000
8 160.0 58.97 23.04 0.0000
9 164.0 68.95 25.64 0.2538
10 176.6 65.43 20.98 0.0000
11 156.2 76.77 31.47 6.0832
13 151.8 65.77 28.54 3.1600
14 167.6 74.84 26.64 1.2612
15 171.1 73.03 24.95 0.0000
16 169.3 68.72 23.98 0.0000
18 162.0 68.04 25.93 0.5440
21 153.3 68.38 29.10 3.7148
22 166.2 76.77 27.79 2.4107
23 179.4 124.51 38.69 13.3045
24 173.8 66.23 21.93 0.0000
25 155.2 82.44 34.23 8.8439
27 173.0 69.06 23.07 0.0000
28 140.4 47.97 24.34 0.0000
30 152.8 76.09 32.59 7.2078
31 178.8 82.33 25.75 0.3708
32 164.9 81.87 30.11 4.7262
33 154.7 66.23 27.67 2.2922
34 161.5 58.51 22.43 0.0000
36 144.4 55.34 26.54 1.1583
37 149.2 48.65 21.85 0.0000
38 154.2 64.07 26.95 1.5635
39 160.6 62.94 24.40 0.0000
## Fit
res.q4 <- lm(pulse ~ ageyrs + bmi + bmi_spline + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q4)
Call:
lm(formula = pulse ~ ageyrs + bmi + bmi_spline + race + sex +
alcohol + smokever, data = nh2fs)
Residuals:
Min 1Q Median 3Q Max
-46.39 -8.74 -1.22 5.98 83.06
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 85.04628 1.80587 47.09 < 2e-16 ***
ageyrs -0.00505 0.01065 -0.47 0.6351
bmi -0.21409 0.07406 -2.89 0.0039 **
bmi_spline 0.56348 0.09970 5.65 0.000000016 ***
race2 -0.93084 0.45058 -2.07 0.0389 *
race3 -0.93578 1.04511 -0.90 0.3706
sex -3.00986 0.30199 -9.97 < 2e-16 ***
alcohol -0.63311 0.29542 -2.14 0.0321 *
smokever 1.32206 0.30443 4.34 0.000014235 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.8 on 8546 degrees of freedom
Multiple R-squared: 0.0249, Adjusted R-squared: 0.024
F-statistic: 27.3 on 8 and 8546 DF, p-value: <2e-16
nh2fs <- within(nh2fs, {
bmi.cat <- cut(bmi,
breaks = c(-Inf,(18.5-10^-7),25,30,Inf),
labels = c("Underweight","Normal","Overweight","Obese"))
bmi.cat <- relevel(bmi.cat, ref = "Normal")
})
res.q6 <- lm(pulse ~ ageyrs + bmi.cat + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q6)
Call:
lm(formula = pulse ~ ageyrs + bmi.cat + race + sex + alcohol +
smokever, data = nh2fs)
Residuals:
Min 1Q Median 3Q Max
-46.12 -8.75 -1.11 6.09 82.31
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 80.2237 0.6806 117.87 < 2e-16 ***
ageyrs -0.0089 0.0107 -0.84 0.4035
bmi.catUnderweight 2.6170 0.9000 2.91 0.0036 **
bmi.catOverweight 0.6412 0.3155 2.03 0.0421 *
bmi.catObese 2.7669 0.3986 6.94 4.1e-12 ***
race2 -0.8510 0.4506 -1.89 0.0590 .
race3 -0.8814 1.0458 -0.84 0.3994
sex -3.2439 0.2994 -10.84 < 2e-16 ***
alcohol -0.6579 0.2958 -2.22 0.0261 *
smokever 1.3548 0.3044 4.45 8.7e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.9 on 8545 degrees of freedom
(695 observations deleted due to missingness)
Multiple R-squared: 0.0231, Adjusted R-squared: 0.022
F-statistic: 22.4 on 9 and 8545 DF, p-value: <2e-16
library(car)
residualPlots(res.q3, ~ 1)
Test stat Pr(>|t|)
Tukey test -4.46 0
durbinWatsonTest(res.q4)
lag Autocorrelation D-W Statistic p-value
1 0.02022 1.959 0.06
Alternative hypothesis: rho != 0
q10 <- data.frame(rstudent(res.q4),
dfbetas(res.q4)
)
sapply(q10, summary)[c("Max.","Min."),]
rstudent.res.q4. X.Intercept. ageyrs bmi bmi_spline race2 race3 sex alcohol smokever
Max. 6.48 0.115 0.096 0.0815 0.104 0.1520 0.234 0.0681 0.0624 0.0833
Min. -3.61 -0.091 -0.113 -0.1200 -0.154 -0.0915 -0.195 -0.0732 -0.0918 -0.0654
## Load quantreg for quantile regression
library(quantreg)
res.q11 <- rq(pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q11, se = "boot")
Call: rq(formula = pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol +
smokever, data = nh2fs)
tau: [1] 0.5
Coefficients:
Value Std. Error t value Pr(>|t|)
(Intercept) 83.98651 2.32388 36.14061 0.00000
ageyrs -0.00394 0.00728 -0.54038 0.58895
bmi -0.34627 0.19104 -1.81255 0.06994
I(bmi^2) 0.00724 0.00381 1.89937 0.05755
race2 -0.25154 0.51395 -0.48943 0.62455
race3 -1.68619 1.60167 -1.05277 0.29247
sex -3.85650 0.30407 -12.68294 0.00000
alcohol -0.06394 0.22055 -0.28989 0.77191
smokever 0.31603 0.45327 0.69722 0.48568
## AGEYR and Alcohol interaction
res.q12 <- lm(pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever + ageyrs:alcohol, data = nh2fs)
summary(res.q12)
Call:
lm(formula = pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol +
smokever + ageyrs:alcohol, data = nh2fs)
Residuals:
Min 1Q Median 3Q Max
-46.30 -8.71 -1.22 5.98 82.30
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 86.1969 2.6792 32.17 < 2e-16 ***
ageyrs -0.0174 0.0165 -1.05 0.29174
bmi -0.4897 0.1764 -2.78 0.00552 **
I(bmi^2) 0.0114 0.0030 3.80 0.00014 ***
race2 -0.8685 0.4510 -1.93 0.05417 .
race3 -0.8484 1.0461 -0.81 0.41742
sex -3.1479 0.3007 -10.47 < 2e-16 ***
alcohol -1.7010 1.2211 -1.39 0.16365
smokever 1.3642 0.3046 4.48 0.0000076 ***
ageyrs:alcohol 0.0185 0.0215 0.86 0.38999
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 12.9 on 8545 degrees of freedom
(695 observations deleted due to missingness)
Multiple R-squared: 0.023, Adjusted R-squared: 0.022
F-statistic: 22.4 on 9 and 8545 DF, p-value: <2e-16