EPI204 Lab 1 in R

References

Load data

## Load foreign package
library(foreign)

## Load dataset converted to Stata format
nhdat <- read.dta("./lab1.dta")

Variable manipulation

nhdat <- within(nhdat, {
    race <- factor(race, 1:3, c("White","Black","Other"))
})

Summary statistics

summary(nhdat[,c("ageyrs","height")])
     ageyrs         height   
 Min.   : 0     Min.   : 50  
 1st Qu.: 9     1st Qu.:149  
 Median :26     Median :165  
 Mean   :31     Mean   :289  
 3rd Qu.:55     3rd Qu.:178  
 Max.   :74     Max.   :999  
 NA's   :3968                

nhdat <- within(nhdat, {
     height[height %in% 999] <- NA
 })


## Restrict to complete cases
restricted <- nhdat[complete.cases(nhdat[,c("fvc","ageyrs","height","wt","sex","race")]),]
restricted <- restricted[5 <= restricted$ageyrs & restricted$ageyrs <= 18,]

## summary
summary(restricted$fvc)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
     62     190     280     290     372     975 

## Histogram
hist(restricted$fvc)

plot of chunk unnamed-chunk-4

hist(restricted$height)

plot of chunk unnamed-chunk-4


## Scatter plot
plot(fvc ~ height, restricted)

plot of chunk unnamed-chunk-4

Linear regression model

## Univariable
res.lm1 <- lm(fvc ~ ageyrs, restricted)
summary(res.lm1)

Call:
lm(formula = fvc ~ ageyrs, data = restricted)

Residuals:
   Min     1Q Median     3Q    Max 
-290.4  -41.3   -3.9   36.0  560.8 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -49.716      4.706   -10.6   <2e-16 ***
ageyrs        27.287      0.363    75.2   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 71.9 on 3016 degrees of freedom
Multiple R-squared:  0.652, Adjusted R-squared:  0.652 
F-statistic: 5.65e+03 on 1 and 3016 DF,  p-value: <2e-16

## Multivariable
res.lm2 <- lm(fvc ~ ageyrs + height + sex + wt, restricted)
summary(res.lm2)

Call:
lm(formula = fvc ~ ageyrs + height + sex + wt, data = restricted)

Residuals:
   Min     1Q Median     3Q    Max 
-350.3  -29.8   -0.3   29.3  461.4 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -396.562     14.042  -28.24   <2e-16 ***
ageyrs         4.897      0.596    8.22    3e-16 ***
height         3.482      0.142   24.51   <2e-16 ***
sex           33.139      1.996   16.61   <2e-16 ***
wt             1.649      0.114   14.48   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 53 on 3013 degrees of freedom
Multiple R-squared:  0.811, Adjusted R-squared:  0.811 
F-statistic: 3.24e+03 on 4 and 3013 DF,  p-value: <2e-16

res.lm3 <- lm(fvc ~ ageyrs + height + sex + wt + race, restricted)
summary(res.lm3)

Call:
lm(formula = fvc ~ ageyrs + height + sex + wt + race, data = restricted)

Residuals:
   Min     1Q Median     3Q    Max 
-358.0  -28.7   -1.2   27.4  452.8 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -389.519     13.333  -29.22   <2e-16 ***
ageyrs         4.700      0.566    8.31   <2e-16 ***
height         3.485      0.135   25.86   <2e-16 ***
sex           32.664      1.894   17.24   <2e-16 ***
wt             1.707      0.108   15.80   <2e-16 ***
raceBlack    -46.652      2.531  -18.43   <2e-16 ***
raceOther    -11.967      5.937   -2.02    0.044 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 50.2 on 3011 degrees of freedom
Multiple R-squared:  0.83,  Adjusted R-squared:  0.83 
F-statistic: 2.46e+03 on 6 and 3011 DF,  p-value: <2e-16

res.lm6 <- lm(fvc ~ ageyrs + sex + ageyrs:sex, restricted)
summary(res.lm6)

Call:
lm(formula = fvc ~ ageyrs + sex + ageyrs:sex, data = restricted)

Residuals:
   Min     1Q Median     3Q    Max 
-346.8  -37.6   -2.5   34.5  510.3 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)    6.987      5.954    1.17     0.24    
ageyrs        20.450      0.463   44.20   <2e-16 ***
sex         -104.298      8.255  -12.63   <2e-16 ***
ageyrs:sex    12.610      0.637   19.79   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 63 on 3014 degrees of freedom
Multiple R-squared:  0.733, Adjusted R-squared:  0.733 
F-statistic: 2.76e+03 on 3 and 3014 DF,  p-value: <2e-16