EPI 204 Homework 2 in R

References

Question 1: Prepare dataset

## Load SAS dataset
library(sas7bdat)
nh2fs.orig <- read.sas7bdat("./nh2fs2013.sas7bdat")

## Changed to lower case variable names
names(nh2fs.orig) <- tolower(names(nh2fs.orig))

## Restrict to complete cases only
variable.used <- c("pulse","ageyrs","height","wt","sex","race","smokever","booze")
nh2fs <- nh2fs.orig[,variable.used]
nh2fs <- nh2fs[complete.cases(nh2fs), ]

## Recode
nh2fs <- within(nh2fs, {

    bmi     <- wt / (height/100)^2
    sex     <- as.numeric(sex == 1)
    alcohol <- as.numeric(booze > 0)

    race    <- factor(race)
 })

Question 2:

res.q2 <- lm(pulse ~ ageyrs + bmi + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q2)

Call:
lm(formula = pulse ~ ageyrs + bmi + race + sex + alcohol + smokever, 
    data = nh2fs)

Residuals:
   Min     1Q Median     3Q    Max 
-46.21  -8.79  -1.18   5.94  82.34 

Coefficients:
            Estimate Std. Error t value    Pr(>|t|)    
(Intercept) 76.52290    0.99511   76.90     < 2e-16 ***
ageyrs      -0.00883    0.01064   -0.83       0.407    
bmi          0.17285    0.02828    6.11 0.000000001 ***
race2       -0.83274    0.45106   -1.85       0.065 .  
race3       -0.75982    1.04653   -0.73       0.468    
sex         -3.33587    0.29696  -11.23     < 2e-16 ***
alcohol     -0.71779    0.29558   -2.43       0.015 *  
smokever     1.43266    0.30435    4.71 0.000002550 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.9 on 8547 degrees of freedom
  (695 observations deleted due to missingness)
Multiple R-squared:  0.0213,    Adjusted R-squared:  0.0205 
F-statistic: 26.6 on 7 and 8547 DF,  p-value: <2e-16

Question 3:

res.q3 <- lm(pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q3)

Call:
lm(formula = pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + 
    smokever, data = nh2fs)

Residuals:
   Min     1Q Median     3Q    Max 
-46.31  -8.74  -1.24   5.90  82.44 

Coefficients:
            Estimate Std. Error t value  Pr(>|t|)    
(Intercept) 85.56862    2.57754   33.20   < 2e-16 ***
ageyrs      -0.00657    0.01065   -0.62   0.53768    
bmi         -0.48953    0.17641   -2.77   0.00553 ** 
I(bmi^2)     0.01141    0.00300    3.80   0.00014 ***
race2       -0.87723    0.45085   -1.95   0.05172 .  
race3       -0.85909    1.04603   -0.82   0.41151    
sex         -3.15339    0.30058  -10.49   < 2e-16 ***
alcohol     -0.68244    0.29549   -2.31   0.02094 *  
smokever     1.37055    0.30455    4.50 0.0000069 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.9 on 8546 degrees of freedom
Multiple R-squared:  0.023, Adjusted R-squared:  0.022 
F-statistic: 25.1 on 8 and 8546 DF,  p-value: <2e-16

anova(res.q2, res.q3, test = "LRT")
Analysis of Variance Table

Model 1: pulse ~ ageyrs + bmi + race + sex + alcohol + smokever
Model 2: pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever
  Res.Df     RSS Df Sum of Sq Pr(>Chi)    
1   8547 1415734                          
2   8546 1413341  1      2393  0.00014 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Question 4:

## Obtain median
bmi_median <- median(nh2fs$bmi)
bmi_median
[1] 25.38

nh2fs <- within(nh2fs, {

    ## subtracted term and indicator term
    bmi_spline <- (bmi - bmi_median)
    bmi_spline_indicator <- as.numeric(bmi >= bmi_median)

    bmi_spline <- bmi_spline * bmi_spline_indicator
 })

head(nh2fs[,c("height","wt","bmi","bmi_spline")], 30)
   height     wt   bmi bmi_spline
1   174.6  62.48 20.50     0.0000
3   169.0  66.34 23.23     0.0000
4   162.6  94.46 35.73    10.3459
5   163.1  74.28 27.92     2.5412
7   153.9  54.55 23.03     0.0000
8   160.0  58.97 23.04     0.0000
9   164.0  68.95 25.64     0.2538
10  176.6  65.43 20.98     0.0000
11  156.2  76.77 31.47     6.0832
13  151.8  65.77 28.54     3.1600
14  167.6  74.84 26.64     1.2612
15  171.1  73.03 24.95     0.0000
16  169.3  68.72 23.98     0.0000
18  162.0  68.04 25.93     0.5440
21  153.3  68.38 29.10     3.7148
22  166.2  76.77 27.79     2.4107
23  179.4 124.51 38.69    13.3045
24  173.8  66.23 21.93     0.0000
25  155.2  82.44 34.23     8.8439
27  173.0  69.06 23.07     0.0000
28  140.4  47.97 24.34     0.0000
30  152.8  76.09 32.59     7.2078
31  178.8  82.33 25.75     0.3708
32  164.9  81.87 30.11     4.7262
33  154.7  66.23 27.67     2.2922
34  161.5  58.51 22.43     0.0000
36  144.4  55.34 26.54     1.1583
37  149.2  48.65 21.85     0.0000
38  154.2  64.07 26.95     1.5635
39  160.6  62.94 24.40     0.0000

## Fit 
res.q4 <- lm(pulse ~ ageyrs + bmi + bmi_spline + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q4)

Call:
lm(formula = pulse ~ ageyrs + bmi + bmi_spline + race + sex + 
    alcohol + smokever, data = nh2fs)

Residuals:
   Min     1Q Median     3Q    Max 
-46.39  -8.74  -1.22   5.98  83.06 

Coefficients:
            Estimate Std. Error t value    Pr(>|t|)    
(Intercept) 85.04628    1.80587   47.09     < 2e-16 ***
ageyrs      -0.00505    0.01065   -0.47      0.6351    
bmi         -0.21409    0.07406   -2.89      0.0039 ** 
bmi_spline   0.56348    0.09970    5.65 0.000000016 ***
race2       -0.93084    0.45058   -2.07      0.0389 *  
race3       -0.93578    1.04511   -0.90      0.3706    
sex         -3.00986    0.30199   -9.97     < 2e-16 ***
alcohol     -0.63311    0.29542   -2.14      0.0321 *  
smokever     1.32206    0.30443    4.34 0.000014235 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.8 on 8546 degrees of freedom
Multiple R-squared:  0.0249,    Adjusted R-squared:  0.024 
F-statistic: 27.3 on 8 and 8546 DF,  p-value: <2e-16

Question 6:

nh2fs <- within(nh2fs, {
    bmi.cat <- cut(bmi,
                   breaks = c(-Inf,(18.5-10^-7),25,30,Inf),
                   labels = c("Underweight","Normal","Overweight","Obese"))

    bmi.cat <- relevel(bmi.cat, ref = "Normal")
})

res.q6 <- lm(pulse ~ ageyrs + bmi.cat + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q6)

Call:
lm(formula = pulse ~ ageyrs + bmi.cat + race + sex + alcohol + 
    smokever, data = nh2fs)

Residuals:
   Min     1Q Median     3Q    Max 
-46.12  -8.75  -1.11   6.09  82.31 

Coefficients:
                   Estimate Std. Error t value Pr(>|t|)    
(Intercept)         80.2237     0.6806  117.87  < 2e-16 ***
ageyrs              -0.0089     0.0107   -0.84   0.4035    
bmi.catUnderweight   2.6170     0.9000    2.91   0.0036 ** 
bmi.catOverweight    0.6412     0.3155    2.03   0.0421 *  
bmi.catObese         2.7669     0.3986    6.94  4.1e-12 ***
race2               -0.8510     0.4506   -1.89   0.0590 .  
race3               -0.8814     1.0458   -0.84   0.3994    
sex                 -3.2439     0.2994  -10.84  < 2e-16 ***
alcohol             -0.6579     0.2958   -2.22   0.0261 *  
smokever             1.3548     0.3044    4.45  8.7e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.9 on 8545 degrees of freedom
  (695 observations deleted due to missingness)
Multiple R-squared:  0.0231,    Adjusted R-squared:  0.022 
F-statistic: 22.4 on 9 and 8545 DF,  p-value: <2e-16

Question 8:

library(car)
residualPlots(res.q3,  ~ 1)

plot of chunk unnamed-chunk-7

           Test stat Pr(>|t|)
Tukey test     -4.46        0

Question 9:

durbinWatsonTest(res.q4)
 lag Autocorrelation D-W Statistic p-value
   1         0.02022         1.959    0.06
 Alternative hypothesis: rho != 0

Question 10

q10 <- data.frame(rstudent(res.q4),
              dfbetas(res.q4)
              )

sapply(q10, summary)[c("Max.","Min."),]
     rstudent.res.q4. X.Intercept. ageyrs     bmi bmi_spline   race2  race3     sex alcohol smokever
Max.             6.48        0.115  0.096  0.0815      0.104  0.1520  0.234  0.0681  0.0624   0.0833
Min.            -3.61       -0.091 -0.113 -0.1200     -0.154 -0.0915 -0.195 -0.0732 -0.0918  -0.0654

Question 11

## Load quantreg for quantile regression
library(quantreg)
res.q11 <- rq(pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever, data = nh2fs)
summary(res.q11, se = "boot")

Call: rq(formula = pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + 
    smokever, data = nh2fs)

tau: [1] 0.5

Coefficients:
            Value     Std. Error t value   Pr(>|t|) 
(Intercept)  83.98651   2.32388   36.14061   0.00000
ageyrs       -0.00394   0.00728   -0.54038   0.58895
bmi          -0.34627   0.19104   -1.81255   0.06994
I(bmi^2)      0.00724   0.00381    1.89937   0.05755
race2        -0.25154   0.51395   -0.48943   0.62455
race3        -1.68619   1.60167   -1.05277   0.29247
sex          -3.85650   0.30407  -12.68294   0.00000
alcohol      -0.06394   0.22055   -0.28989   0.77191
smokever      0.31603   0.45327    0.69722   0.48568

Question 12:

## AGEYR and Alcohol interaction
res.q12 <- lm(pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + smokever + ageyrs:alcohol, data = nh2fs)
summary(res.q12)

Call:
lm(formula = pulse ~ ageyrs + bmi + I(bmi^2) + race + sex + alcohol + 
    smokever + ageyrs:alcohol, data = nh2fs)

Residuals:
   Min     1Q Median     3Q    Max 
-46.30  -8.71  -1.22   5.98  82.30 

Coefficients:
               Estimate Std. Error t value  Pr(>|t|)    
(Intercept)     86.1969     2.6792   32.17   < 2e-16 ***
ageyrs          -0.0174     0.0165   -1.05   0.29174    
bmi             -0.4897     0.1764   -2.78   0.00552 ** 
I(bmi^2)         0.0114     0.0030    3.80   0.00014 ***
race2           -0.8685     0.4510   -1.93   0.05417 .  
race3           -0.8484     1.0461   -0.81   0.41742    
sex             -3.1479     0.3007  -10.47   < 2e-16 ***
alcohol         -1.7010     1.2211   -1.39   0.16365    
smokever         1.3642     0.3046    4.48 0.0000076 ***
ageyrs:alcohol   0.0185     0.0215    0.86   0.38999    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 12.9 on 8545 degrees of freedom
  (695 observations deleted due to missingness)
Multiple R-squared:  0.023, Adjusted R-squared:  0.022 
F-statistic: 22.4 on 9 and 8545 DF,  p-value: <2e-16