# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 5 Day 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Bronchopulmonary Dysplasia Data Example:
# BPD is a chronic lung disease that can develop among infants who receive prolonged
# mechanical ventilation. As such, infants who require intubation early in life are
# at high risk of developing BPD. Researhers wanted to determine if there was an
# an association between the probability of developing BPD and an infants birthweight.
# To study this relationship, they selected a random sample of 223 infants who were
# confined to the NICU for at least the first 30 days of life, who required intubation
# in the first 12 hours of life, and who survived for at least 30 day.
# Data Dictionary:
# 1. birthweight birthweight of baby in grams
# 2. BPD binary indicator of presence vs. absence of bronchopulmonary
# dysplasia (BPD): 0=absent, 1=present
# Download and load the dataset used in lecture:
download.file("http://www.duke.edu/~sgrambow/crp241data/bpd_data.RData",
destfile = "bpd_data.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("bpd_data.RData")
# Visualize the relationship between BPD and birthweight
# - Create a scatter plot of Y (BPD) vs. X (birthweight)
plot(bpd$birthweight,bpd$BPD)
# Linear Regression Model:
# - Can always get R to do it!
# - But does it make sense for this data?
lin.fit <- lm(BPD ~ birthweight, data=bpd)
summary(lin.fit)
##
## Call:
## lm(formula = BPD ~ birthweight, data = bpd)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82821 -0.33201 -0.07459 0.38445 1.02988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.216e+00 1.057e-01 11.505 < 2e-16 ***
## birthweight -7.462e-04 8.697e-05 -8.579 1.7e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4124 on 221 degrees of freedom
## Multiple R-squared: 0.2498, Adjusted R-squared: 0.2464
## F-statistic: 73.6 on 1 and 221 DF, p-value: 1.698e-15
confint.default(lin.fit)
## 2.5 % 97.5 %
## (Intercept) 1.0090191728 1.4233971753
## birthweight -0.0009166261 -0.0005756906
# - Add the fitted regression line to the scatter plot
plot(bpd$birthweight,bpd$BPD)
abline(lin.fit,col='red')

# Logistic Regression Model:
# - Note: Default estimates are on the log-odds scale
log.fit <- glm(BPD ~ birthweight, data=bpd, family='binomial')
summary(log.fit)
##
## Call:
## glm(formula = BPD ~ birthweight, family = "binomial", data = bpd)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9916 -0.7993 -0.4096 0.9242 2.4802
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.0342913 0.6957121 5.799 6.68e-09 ***
## birthweight -0.0042291 0.0006408 -6.600 4.11e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 286.14 on 222 degrees of freedom
## Residual deviance: 223.72 on 221 degrees of freedom
## AIC: 227.72
##
## Number of Fisher Scoring iterations: 4
confint.default(log.fit)
## 2.5 % 97.5 %
## (Intercept) 2.670720695 5.397861859
## birthweight -0.005485022 -0.002973258
# - To get estimates on odds scale, must exponentiate
exp(log.fit$coef)
## (Intercept) birthweight
## 56.5028612 0.9957798
exp(confint.default(log.fit))
## 2.5 % 97.5 %
## (Intercept) 14.45038 220.9335239
## birthweight 0.99453 0.9970312
# - To get estimates on odds scale per 100grams, must multiply by 100 and exponentiate
exp(100*log.fit$coef)
## (Intercept) birthweight
## 1.610809e+175 6.551350e-01
exp(100*confint.default(log.fit))
## 2.5 % 97.5 %
## (Intercept) 9.725816e+115 2.667853e+234
## birthweight 5.778146e-01 7.428020e-01
# Getting predicted probabilities from a Logistic Regression Model:
# - What is predicted probability of BPD for an infant whose
# birthweight is 750 grams?
xbeta <- 4.0342913 + (-0.0042291*750)
p.x750 <- exp(xbeta)/(1+exp(xbeta))
p.x750
## [1] 0.7031757
# FYI:
# - Plot predicted probabilites for all observed data points
p.hats <- predict(log.fit,type='response')
plot(bpd$birthweight,bpd$BPD)
points(bpd$birthweight,p.hats,col='blue')

# - Get smooth line of predicted probabilities as function of X
x <- seq(400,1800,by=1)
xb <- 4.0342913 + (-0.0042291*x)
p <- exp(xb)/(1+exp(xb))
plot(bpd$birthweight,bpd$BPD)
lines(x,p,col='blue',lwd=2)

# End of Program