EPI204 Lab 2 in R

References

Load data

## Load foreign package
library(foreign)

## Load dataset converted to Stata format
nhdat <- read.dta("./lab1.dta")

Variable manipulation

## Recode
nhdat <- within(nhdat, {
    race <- factor(race, 1:3, c("White","Black","Other"))
    height[height %in% 999] <- NA
})

## Restrict to complete cases
restricted <- nhdat[complete.cases(nhdat[,c("fvc","ageyrs","height","wt","sex","race")]),]
restricted <- restricted[5 <= restricted$ageyrs & restricted$ageyrs <= 18,]

Linear regression model and checking assumptions

## Univariable
res.lm1 <- lm(fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex, restricted)
summary(res.lm1)

Call:
lm(formula = fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex, 
    data = restricted)

Residuals:
   Min     1Q Median     3Q    Max 
-372.0  -26.0    0.0   26.7  443.9 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -306.654     14.288  -21.46  < 2e-16 ***
ageyrs         2.908      0.564    5.15  2.7e-07 ***
height         3.087      0.134   23.04  < 2e-16 ***
wt             1.679      0.105   16.01  < 2e-16 ***
raceBlack    -46.104      2.456  -18.77  < 2e-16 ***
raceOther    -12.501      5.762   -2.17     0.03 *  
sex          -52.764      6.513   -8.10  7.8e-16 ***
ageyrs:sex     7.011      0.513   13.67  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 48.7 on 3010 degrees of freedom
Multiple R-squared:  0.84,  Adjusted R-squared:  0.84 
F-statistic: 2.26e+03 on 7 and 3010 DF,  p-value: <2e-16

## Load car package
library(car)

## Normality
qqPlot(res.lm1)

plot of chunk unnamed-chunk-4


## Linearlity
residualPlots(model = res.lm1, terms = ~ height, fitted = F)

plot of chunk unnamed-chunk-4

       Test stat Pr(>|t|)
height     17.53        0

## Update the model with height^2
res.lm2 <- update(res.lm1,  ~ . + I(height^2))

## Linearity with squared height
residualPlots(model = res.lm2, terms = ~ height, fitted = F)

plot of chunk unnamed-chunk-4

       Test stat Pr(>|t|)
height     1.045    0.296

## Independence
durbinWatsonTest(res.lm2)
 lag Autocorrelation D-W Statistic p-value
   1         0.04099         1.918   0.022
 Alternative hypothesis: rho != 0

## Constant variance
residualPlot(res.lm2)

plot of chunk unnamed-chunk-4


## Influential points
car::influencePlot(res.lm2, id.n = 5)
       StudRes      Hat    CookD
1270   7.85274 0.001542 0.101844
1339   4.99888 0.008782 0.156220
1356  -7.12575 0.010944 0.247818
2048   0.21857 0.020655 0.010583
4047  -9.29975 0.006775 0.252457
5866   8.76009 0.005392 0.212341
13760  0.68546 0.023305 0.035297
13769  0.03419 0.028065 0.001937
14322  7.34142 0.001445 0.092278
17402 -1.97384 0.022129 0.098930
22051 -0.83827 0.032438 0.051165
22714  7.99853 0.004331 0.174041
title("Y: jackknifed residuals, X: leverage, Size: Cook D")

plot of chunk unnamed-chunk-4

Quantile regression

## Load quantile regression package
library(quantreg)

## Fit
res.rq1 <- rq(fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex + I(height^2), data = restricted)
sum.res.rq1 <- summary(res.rq1, se = "boot")
sum.res.rq1

Call: rq(formula = fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex + 
    I(height^2), data = restricted)

tau: [1] 0.5

Coefficients:
            Value     Std. Error t value   Pr(>|t|) 
(Intercept) 642.10693  68.54141    9.36816   0.00000
ageyrs        4.14749   0.58021    7.14828   0.00000
height      -10.22399   0.98088  -10.42324   0.00000
wt            1.34796   0.12080   11.15886   0.00000
raceBlack   -40.21105   2.25978  -17.79427   0.00000
raceOther   -12.31840   6.16181   -1.99915   0.04568
sex         -26.85090   5.12586   -5.23832   0.00000
I(height^2)   0.04601   0.00353   13.04591   0.00000
ageyrs:sex    4.31349   0.52467    8.22133   0.00000


cbind(coef(sum.res.rq1)[,1:2],
      coef(summary(res.lm2))[,1:2]
      )
                Value Std. Error  Estimate Std. Error
(Intercept) 642.10693  68.541414 677.81183  57.787543
ageyrs        4.14749   0.580208   4.76766   0.548157
height      -10.22399   0.980884 -10.60566   0.791494
wt            1.34796   0.120798   1.29591   0.102292
raceBlack   -40.21105   2.259776 -44.19068   2.342643
raceOther   -12.31840   6.161814 -12.96797   5.489200
sex         -26.85090   5.125859 -26.90883   6.377865
I(height^2)   0.04601   0.003527   0.04673   0.002666
ageyrs:sex    4.31349   0.524670   4.31170   0.512220

Linear spline

## Median height
ht.median <- median(restricted$height)

## Create a basis function
restricted <- within(restricted, {
    ht_spline <- (height-ht.median)*(height>ht.median)
})

## Check
head(restricted[,c("height","ht_spline")], 50)
    height ht_spline
447  140.4       0.0
449  138.0       0.0
451  132.9       0.0
456  143.9       0.0
465  128.3       0.0
466  137.1       0.0
477  126.5       0.0
482  145.5       0.0
484  130.1       0.0
486  132.3       0.0
493  131.6       0.0
497  149.9       0.0
500  119.0       0.0
502  130.5       0.0
503  145.7       0.0
504  130.1       0.0
505  133.0       0.0
507  133.6       0.0
508  128.7       0.0
512  150.1       0.0
514  110.3       0.0
516  122.9       0.0
517  117.4       0.0
528  175.2      19.9
530  158.8       3.5
532  166.3      11.0
534  172.9      17.6
535  161.5       6.2
537  149.7       0.0
538  159.9       4.6
539  167.9      12.6
541  166.8      11.5
542  163.8       8.5
543  162.9       7.6
544  172.4      17.1
546  179.6      24.3
548  168.4      13.1
549  174.7      19.4
551  159.3       4.0
557  162.6       7.3
559  165.7      10.4
560  155.1       0.0
769  154.4       0.0
770  138.2       0.0
789  158.9       3.6
791  137.6       0.0
794  127.0       0.0
797  152.1       0.0
804  143.9       0.0
806  119.9       0.0

## Fit spline model
res.lm3 <- lm(fvc ~ ageyrs + height + ht_spline + wt + race + sex + ageyrs:sex, restricted)
summary(res.lm3)

Call:
lm(formula = fvc ~ ageyrs + height + ht_spline + wt + race + 
    sex + ageyrs:sex, data = restricted)

Residuals:
   Min     1Q Median     3Q    Max 
-411.8  -24.2    0.0   25.8  410.5 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -170.336     15.935  -10.69  < 2e-16 ***
ageyrs         5.156      0.557    9.26  < 2e-16 ***
height         1.998      0.144   13.88  < 2e-16 ***
ht_spline      3.372      0.202   16.66  < 2e-16 ***
wt             1.362      0.102   13.33  < 2e-16 ***
raceBlack    -44.080      2.354  -18.73  < 2e-16 ***
raceOther    -11.746      5.514   -2.13    0.033 *  
sex          -27.789      6.411   -4.33 0.000015 ***
ageyrs:sex     4.291      0.517    8.30  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 46.6 on 3009 degrees of freedom
Multiple R-squared:  0.854, Adjusted R-squared:  0.853 
F-statistic: 2.2e+03 on 8 and 3009 DF,  p-value: <2e-16

## qqPlot
qqPlot(res.lm3)

plot of chunk unnamed-chunk-6


## residuals vs height
residualPlots(res.lm3,  ~ height, fitted = F)

plot of chunk unnamed-chunk-6

       Test stat Pr(>|t|)
height     5.314        0

## residuals vs fitted
residualPlot(res.lm3)

plot of chunk unnamed-chunk-6


## AIC
list(AIC(res.lm2),
     AIC(res.lm3))
[[1]]
[1] 31740

[[2]]
[1] 31768

## Load ggplot2
library(ggplot2)

## Create a function
CoefHt <- coef(res.lm3)["height"]
CoefHtSpline <- coef(res.lm3)["ht_spline"]
basis.fun <- function(x) CoefHt*x + CoefHtSpline*(x - ht.median)*(x > ht.median)

## Plot
ggplot(data = data.frame(ht = 0), aes(x = ht)) +
    layer(geom = "line", stat = "function", fun = basis.fun) +
    scale_x_continuous(limit = c(120,179)) +
    scale_y_continuous(name = "fvc predicted")

plot of chunk unnamed-chunk-6