http://www.umass.edu/statdata/statdata/data/lowbwt.txt
SOURCE: Hosmer and Lemeshow (2000) Applied Logistic Regression: Second Edition. These data are copyrighted by John Wiley & Sons Inc. and must be acknowledged and used accordingly. Data were collected at Baystate Medical Center, Springfield, Massachusetts during 1986.
## Load dataset directly from the internet
lbw <- read.table("http://www.umass.edu/statdata/statdata/data/lowbwt.dat", head = T, skip = 4)
## "Fix" cases 10 and 39 to make them identical to BIO 213 dataset
lbw[c(10,39),"BWT"] <- c(2655,3035)
## Change variable names to lower cases
names(lbw) <- tolower(names(lbw))
## Check first 6 observations
head(lbw)
id low age lwt race smoke ptl ht ui ftv bwt
1 85 0 19 182 2 0 0 0 1 0 2523
2 86 0 33 155 3 0 0 0 0 3 2551
3 87 0 20 105 1 1 0 0 0 1 2557
4 88 0 21 108 1 1 0 0 1 2 2594
5 89 0 18 107 1 1 0 0 1 0 2600
6 91 0 21 124 3 0 0 0 0 0 2622
library(gpairs)
gpairs(lbw)
## Relabel race: 1, 2, 3 -> White, Black, Other
lbw$race.cat <- factor(lbw$race, levels = 1:3, labels = c("White","Black","Other"))
## Dichotomize ptl
table(lbw$ptl)
0 1 2 3
159 24 5 1
lbw$preterm <- factor(ifelse(lbw$ptl >= 1, "1+", "0"))
## Change 0,1 binary to No,Yes binary
lbw$smoke <- factor(ifelse(lbw$smoke == 1, "Yes", "No"))
lbw$ht <- factor(ifelse(lbw$ht == 1, "Yes", "No"))
lbw$ui <- factor(ifelse(lbw$ui == 1, "Yes", "No"))
lbw$low <- factor(ifelse(lbw$low == 1, "Yes", "No"))
## Alternative way
## lbw$low <- factor(lbw$low, levels = 0:1, labels = c("No", "Yes")))
## Categorize ftv (frequency of visit): 0 | 1 2 | 3 4 6 -> None(0) | Normal(1-2) | Many (>= 3)
table(lbw$ftv)
0 1 2 3 4 6
100 47 30 7 4 1
lbw$ftv.cat <- cut(lbw$ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
levels(lbw$ftv.cat)
[1] "None" "Normal" "Many"
## Make Normal the reference level
lbw$ftv.cat <- relevel(lbw$ftv.cat, ref = "Normal")
levels(lbw$ftv.cat)
[1] "Normal" "None" "Many"
## Check first 6 observations
head(lbw)
id low age lwt race smoke ptl ht ui ftv bwt race.cat preterm ftv.cat
1 85 No 19 182 2 No 0 No Yes 0 2523 Black 0 None
2 86 No 33 155 3 No 0 No No 3 2551 Other 0 Many
3 87 No 20 105 1 Yes 0 No No 1 2557 White 0 Normal
4 88 No 21 108 1 Yes 0 No Yes 2 2594 White 0 Normal
5 89 No 18 107 1 Yes 0 No Yes 0 2600 White 0 None
6 91 No 21 124 3 No 0 No No 0 2622 Other 0 None
## Summarize
summary(lbw)
id low age lwt race smoke ptl ht ui
Min. : 4 No :130 Min. :14.0 Min. : 80 Min. :1.00 No :115 Min. :0.000 No :177 No :161
1st Qu.: 68 Yes: 59 1st Qu.:19.0 1st Qu.:110 1st Qu.:1.00 Yes: 74 1st Qu.:0.000 Yes: 12 Yes: 28
Median :123 Median :23.0 Median :121 Median :1.00 Median :0.000
Mean :121 Mean :23.2 Mean :130 Mean :1.85 Mean :0.196
3rd Qu.:176 3rd Qu.:26.0 3rd Qu.:140 3rd Qu.:3.00 3rd Qu.:0.000
Max. :226 Max. :45.0 Max. :250 Max. :3.00 Max. :3.000
ftv bwt race.cat preterm ftv.cat
Min. :0.000 Min. : 709 White:96 0 :159 Normal: 77
1st Qu.:0.000 1st Qu.:2414 Black:26 1+: 30 None :100
Median :0.000 Median :2977 Other:67 Many : 12
Mean :0.794 Mean :2945
3rd Qu.:1.000 3rd Qu.:3475
Max. :6.000 Max. :4990
## Recoding using within function
lbw <- within(lbw, {
## Relabel race: 1, 2, 3 -> White, Black, Other
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## Dichotomize ptl
preterm <- factor(ifelse(ptl >= 1, "1+", "0"))
## Change 0,1 binary to No,Yes binary
smoke <- factor(ifelse(smoke == 1, "Yes", "No"))
ht <- factor(ifelse(ht == 1, "Yes", "No"))
ui <- factor(ifelse(ui == 1, "Yes", "No"))
low <- factor(ifelse(low == 1, "Yes", "No"))
## Categorize ftv (frequency of visit): 0 1 2 3 4 6 -> None(0), Normal(1-2), Many (>= 3)
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
## Make Normal the reference level
ftv.cat <- relevel(ftv.cat, ref = "Normal")
})
Alternative to the “## Categorize smoke ht ui low” part.
## Categorize smoke ht ui
lbw[,c("smoke","ht","ui")] <- lapply(lbw[,c("smoke","ht","ui")],
function(var) {
var <- factor(ifelse(var == 1, "Yes", "No"))
})
## Recoding using within function
lbw <- within(lbw, {
## Relabel race: 1, 2, 3 -> White, Black, Other
race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))
## Dichotomize ptl
preterm <- factor(ptl >= 1, levels = c(FALSE,TRUE), labels = c("0","1+"))
## You can alse use cut
## preterm <- cut(ptl, breaks = c(-Inf, 0, Inf), labels = c("0","1+"))
## Change 0,1 binary to No,Yes binary
smoke <- factor(smoke, levels = c(0,1), labels = c("No","Yes"))
ht <- factor(ht , levels = c(0,1), labels = c("No","Yes"))
ui <- factor(ui , levels = c(0,1), labels = c("No","Yes"))
low <- factor(low , levels = c(0,1), labels = c("No","Yes"))
## Categorize ftv (frequency of visit): 0 1 2 3 4 6 -> None(0), Normal(1-2), Many (>= 3)
ftv.cat <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
## Make Normal the reference level
ftv.cat <- relevel(ftv.cat, ref = "Normal")
})
lm.full <- lm(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw)
lm.null <- lm(bwt ~ 1, data = lbw)
lm.full
Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat +
preterm, data = lbw)
Coefficients:
(Intercept) age lwt smokeYes htYes uiYes ftv.catNone
2947.32 -2.91 4.22 -307.34 -568.63 -494.11 -56.50
ftv.catMany race.catBlack race.catOther preterm1+
-185.86 -467.30 -322.81 -207.91
summary(lm.full)
Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat +
preterm, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-1896.7 -443.3 53.2 466.1 1654.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2947.32 320.48 9.20 < 2e-16 ***
age -2.91 9.67 -0.30 0.76354
lwt 4.22 1.72 2.46 0.01488 *
smokeYes -307.34 109.13 -2.82 0.00541 **
htYes -568.63 200.88 -2.83 0.00518 **
uiYes -494.11 137.23 -3.60 0.00041 ***
ftv.catNone -56.50 105.36 -0.54 0.59245
ftv.catMany -185.86 203.19 -0.91 0.36158
race.catBlack -467.30 149.78 -3.12 0.00211 **
race.catOther -322.81 117.40 -2.75 0.00658 **
preterm1+ -207.91 136.35 -1.52 0.12907
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 647 on 178 degrees of freedom
Multiple R-squared: 0.255, Adjusted R-squared: 0.213
F-statistic: 6.08 on 10 and 178 DF, p-value: 0.0000000609
confint(lm.full)
2.5 % 97.5 %
(Intercept) 2314.9019 3579.741
age -22.0027 16.174
lwt 0.8344 7.612
smokeYes -522.7049 -91.979
htYes -965.0356 -172.218
uiYes -764.9083 -223.303
ftv.catNone -264.4122 151.414
ftv.catMany -586.8257 215.111
race.catBlack -762.8701 -171.736
race.catOther -554.4737 -91.143
preterm1+ -476.9680 61.155