## Load foreign package
library(foreign)
## Load dataset converted to Stata format
nhdat <- read.dta("./lab1.dta")
nhdat <- within(nhdat, {
race <- factor(race, 1:3, c("White","Black","Other"))
})
summary(nhdat[,c("ageyrs","height")])
ageyrs height
Min. : 0 Min. : 50
1st Qu.: 9 1st Qu.:149
Median :26 Median :165
Mean :31 Mean :289
3rd Qu.:55 3rd Qu.:178
Max. :74 Max. :999
NA's :3968
nhdat <- within(nhdat, {
height[height %in% 999] <- NA
})
## Restrict to complete cases
restricted <- nhdat[complete.cases(nhdat[,c("fvc","ageyrs","height","wt","sex","race")]),]
restricted <- restricted[5 <= restricted$ageyrs & restricted$ageyrs <= 18,]
## summary
summary(restricted$fvc)
Min. 1st Qu. Median Mean 3rd Qu. Max.
62 190 280 290 372 975
## Histogram
hist(restricted$fvc)
hist(restricted$height)
## Scatter plot
plot(fvc ~ height, restricted)
## Univariable
res.lm1 <- lm(fvc ~ ageyrs, restricted)
summary(res.lm1)
Call:
lm(formula = fvc ~ ageyrs, data = restricted)
Residuals:
Min 1Q Median 3Q Max
-290.4 -41.3 -3.9 36.0 560.8
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -49.716 4.706 -10.6 <2e-16 ***
ageyrs 27.287 0.363 75.2 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 71.9 on 3016 degrees of freedom
Multiple R-squared: 0.652, Adjusted R-squared: 0.652
F-statistic: 5.65e+03 on 1 and 3016 DF, p-value: <2e-16
## Multivariable
res.lm2 <- lm(fvc ~ ageyrs + height + sex + wt, restricted)
summary(res.lm2)
Call:
lm(formula = fvc ~ ageyrs + height + sex + wt, data = restricted)
Residuals:
Min 1Q Median 3Q Max
-350.3 -29.8 -0.3 29.3 461.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -396.562 14.042 -28.24 <2e-16 ***
ageyrs 4.897 0.596 8.22 3e-16 ***
height 3.482 0.142 24.51 <2e-16 ***
sex 33.139 1.996 16.61 <2e-16 ***
wt 1.649 0.114 14.48 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 53 on 3013 degrees of freedom
Multiple R-squared: 0.811, Adjusted R-squared: 0.811
F-statistic: 3.24e+03 on 4 and 3013 DF, p-value: <2e-16
res.lm3 <- lm(fvc ~ ageyrs + height + sex + wt + race, restricted)
summary(res.lm3)
Call:
lm(formula = fvc ~ ageyrs + height + sex + wt + race, data = restricted)
Residuals:
Min 1Q Median 3Q Max
-358.0 -28.7 -1.2 27.4 452.8
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -389.519 13.333 -29.22 <2e-16 ***
ageyrs 4.700 0.566 8.31 <2e-16 ***
height 3.485 0.135 25.86 <2e-16 ***
sex 32.664 1.894 17.24 <2e-16 ***
wt 1.707 0.108 15.80 <2e-16 ***
raceBlack -46.652 2.531 -18.43 <2e-16 ***
raceOther -11.967 5.937 -2.02 0.044 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 50.2 on 3011 degrees of freedom
Multiple R-squared: 0.83, Adjusted R-squared: 0.83
F-statistic: 2.46e+03 on 6 and 3011 DF, p-value: <2e-16
res.lm6 <- lm(fvc ~ ageyrs + sex + ageyrs:sex, restricted)
summary(res.lm6)
Call:
lm(formula = fvc ~ ageyrs + sex + ageyrs:sex, data = restricted)
Residuals:
Min 1Q Median 3Q Max
-346.8 -37.6 -2.5 34.5 510.3
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.987 5.954 1.17 0.24
ageyrs 20.450 0.463 44.20 <2e-16 ***
sex -104.298 8.255 -12.63 <2e-16 ***
ageyrs:sex 12.610 0.637 19.79 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 63 on 3014 degrees of freedom
Multiple R-squared: 0.733, Adjusted R-squared: 0.733
F-statistic: 2.76e+03 on 3 and 3014 DF, p-value: <2e-16