# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 5 Day 4 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Bronchopulmonary Dysplasia Data Example: 
# BPD is a chronic lung disease that can develop among infants who receive prolonged 
# mechanical ventilation. As such, infants who require intubation early in life are 
# at high risk of developing BPD. Researhers wanted to determine if there was an 
# an association between the probability of developing BPD and an infants birthweight. 
# To study this relationship, they selected a random sample of 223 infants who were 
# confined to the NICU for at least the first 30 days of life, who required intubation
# in the first 12 hours of life, and who survived for at least 30 day. 

# Data Dictionary: 
# 1. birthweight    birthweight of baby in grams
# 2. BPD            binary indicator of presence vs. absence of bronchopulmonary 
#                   dysplasia (BPD): 0=absent, 1=present

# Download and load the dataset used in lecture: 
download.file("http://www.duke.edu/~sgrambow/crp241data/bpd_data.RData",
              destfile = "bpd_data.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("bpd_data.RData")


# Visualize the relationship between BPD and birthweight
# - Create a scatter plot of Y (BPD) vs. X (birthweight)
plot(bpd$birthweight,bpd$BPD)

# Linear Regression Model: 
# - Can always get R to do it! 
# - But does it make sense for this data?
lin.fit <- lm(BPD ~ birthweight, data=bpd)
summary(lin.fit)
## 
## Call:
## lm(formula = BPD ~ birthweight, data = bpd)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82821 -0.33201 -0.07459  0.38445  1.02988 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.216e+00  1.057e-01  11.505  < 2e-16 ***
## birthweight -7.462e-04  8.697e-05  -8.579  1.7e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4124 on 221 degrees of freedom
## Multiple R-squared:  0.2498, Adjusted R-squared:  0.2464 
## F-statistic:  73.6 on 1 and 221 DF,  p-value: 1.698e-15
confint.default(lin.fit)
##                     2.5 %        97.5 %
## (Intercept)  1.0090191728  1.4233971753
## birthweight -0.0009166261 -0.0005756906
# - Add the fitted regression line to the scatter plot 
plot(bpd$birthweight,bpd$BPD)
abline(lin.fit,col='red')

# Logistic Regression Model: 
# - Note: Default estimates are on the log-odds scale
log.fit <- glm(BPD ~ birthweight, data=bpd, family='binomial')
summary(log.fit)
## 
## Call:
## glm(formula = BPD ~ birthweight, family = "binomial", data = bpd)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9916  -0.7993  -0.4096   0.9242   2.4802  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.0342913  0.6957121   5.799 6.68e-09 ***
## birthweight -0.0042291  0.0006408  -6.600 4.11e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 286.14  on 222  degrees of freedom
## Residual deviance: 223.72  on 221  degrees of freedom
## AIC: 227.72
## 
## Number of Fisher Scoring iterations: 4
confint.default(log.fit)
##                    2.5 %       97.5 %
## (Intercept)  2.670720695  5.397861859
## birthweight -0.005485022 -0.002973258
# - To get estimates on odds scale, must exponentiate
exp(log.fit$coef)
## (Intercept) birthweight 
##  56.5028612   0.9957798
exp(confint.default(log.fit))
##                2.5 %      97.5 %
## (Intercept) 14.45038 220.9335239
## birthweight  0.99453   0.9970312
# - To get estimates on odds scale per 100grams, must multiply by 100 and exponentiate
exp(100*log.fit$coef)
##   (Intercept)   birthweight 
## 1.610809e+175  6.551350e-01
exp(100*confint.default(log.fit))
##                     2.5 %        97.5 %
## (Intercept) 9.725816e+115 2.667853e+234
## birthweight  5.778146e-01  7.428020e-01
# Getting predicted probabilities from a Logistic Regression Model: 
# - What is predicted probability of BPD for an infant whose 
#   birthweight is 750 grams?
xbeta <- 4.0342913 + (-0.0042291*750)
p.x750 <- exp(xbeta)/(1+exp(xbeta))
p.x750
## [1] 0.7031757
# FYI: 
# - Plot predicted probabilites for all observed data points
p.hats <- predict(log.fit,type='response')
plot(bpd$birthweight,bpd$BPD)
points(bpd$birthweight,p.hats,col='blue')

# - Get smooth line of predicted probabilities as function of X
x  <- seq(400,1800,by=1)
xb <- 4.0342913 + (-0.0042291*x)
p  <-  exp(xb)/(1+exp(xb))
plot(bpd$birthweight,bpd$BPD)
lines(x,p,col='blue',lwd=2)

# End of Program