# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 9 ~
# ~     Interaction        ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Example 1

# FEV1 and Smoking Status: 
# A study was conducted to determine if there was an association between smoking 
# status and forced expiratory volume in one second (FEV1) in patients with COPD. 
# FEV1 was measured in liters. Smoking groups of interest were smokers (current, 
# recent, or former smokers) vs. non- smokers. Whether or not a patient was 65
# years or older was also recorded. 200 patients were randomly selected from the 
# researchers' clinical practice. 

# Data Dictionary: 
# 1.  FEV1      forced expiratory volume in one second (liters)
# 2.  SMOKING   patient smoking status (1 = Current/recent/former smoker; 0 = Non-smoker)
# 3.    AGE65     patient age (1 = Age >= 65 years; 0 = Age < 65 years)

# Download and load the lead dataset used in lecture: 
download.file("http://www.duke.edu/~sgrambow/crp241data/fev1_smoking.RData",
              destfile = "fev1_smoking.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("fev1_smoking.RData")

# let's look at some quick summary stats
# FEV1 by AGE65
by(fsdata$FEV1,fsdata$AGE65,summary) 
## fsdata$AGE65: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.479   2.984   3.729   3.783   4.682   6.280 
## -------------------------------------------------------- 
## fsdata$AGE65: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.704   3.444   3.371   4.011   5.547
# FEV1 by SMOKING
by(fsdata$FEV1,fsdata$SMOKING,summary) 
## fsdata$SMOKING: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.745   3.323   3.860   3.921   4.633   6.280 
## -------------------------------------------------------- 
## fsdata$SMOKING: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.442   3.170   3.233   3.923   5.573
# (1) Two Sample T-test Analysis in Entire Cohort 
#     - Assuming popluation variances are equal 

t.test(fsdata$FEV1~fsdata$SMOKING,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  fsdata$FEV1 by fsdata$SMOKING
## t = 4.7934, df = 198, p-value = 3.215e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4045320 0.9700351
## sample estimates:
## mean in group 0 mean in group 1 
##        3.920575        3.233291
3.233291 - 3.920575 
## [1] -0.687284
# (2) Two Sample T-test Analysis Stratified by Age 
old   <- subset(fsdata,AGE65==1)
young <- subset(fsdata,AGE65==0)

# - (2a) Among Age < 65 years
by(young$FEV1,young$SMOKING,summary) 
## young$SMOKING: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.745   3.298   3.859   3.962   4.704   6.280 
## -------------------------------------------------------- 
## young$SMOKING: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.479   2.680   3.510   3.604   4.642   5.573
summary(young$FEV1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.479   2.984   3.729   3.783   4.682   6.280
t.test(young$FEV1~young$SMOKING,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  young$FEV1 by young$SMOKING
## t = 1.64, df = 98, p-value = 0.1042
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.07517284  0.79089874
## sample estimates:
## mean in group 0 mean in group 1 
##        3.961962        3.604099
3.604099 - 3.961962
## [1] -0.357863
# - (2b) Among Age >= 65 years
by(old$FEV1,old$SMOKING,summary) 
## old$SMOKING: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.883   3.348   3.860   3.879   4.485   5.547 
## -------------------------------------------------------- 
## old$SMOKING: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.180   2.803   2.862   3.644   4.243
summary(old$FEV1)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.704   3.444   3.371   4.011   5.547
t.test(old$FEV1~old$SMOKING,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  old$FEV1 by old$SMOKING
## t = 5.8922, df = 98, p-value = 5.388e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.674283 1.359125
## sample estimates:
## mean in group 0 mean in group 1 
##        3.879188        2.862483
2.862483 - 3.879188
## [1] -1.016705
# (3) Linear Regression Analysis with Interaction Term
ifit <- lm(FEV1~SMOKING + AGE65 + SMOKING*AGE65,data=fsdata)
summary(ifit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING + AGE65 + SMOKING * AGE65, data = fsdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21667 -0.66888 -0.06022  0.77311  2.31807 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.96196    0.13910  28.484   <2e-16 ***
## SMOKING       -0.35786    0.19671  -1.819   0.0704 .  
## AGE65         -0.08277    0.19671  -0.421   0.6744    
## SMOKING:AGE65 -0.65884    0.27819  -2.368   0.0188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9836 on 196 degrees of freedom
## Multiple R-squared:  0.1653, Adjusted R-squared:  0.1525 
## F-statistic: 12.93 on 3 and 196 DF,  p-value: 9.579e-08
# (4) Linear Regression Analysis Stratified by Age 
old   <- subset(fsdata,AGE65==1)
young <- subset(fsdata,AGE65==0)

# - (4a) Among Age < 65 years
yfit <- lm(FEV1~SMOKING,data=young)
summary(yfit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING, data = young)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21667 -0.73090 -0.09361  0.85583  2.31807 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9620     0.1543   25.68   <2e-16 ***
## SMOKING      -0.3579     0.2182   -1.64    0.104    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.091 on 98 degrees of freedom
## Multiple R-squared:  0.02671,    Adjusted R-squared:  0.01678 
## F-statistic:  2.69 on 1 and 98 DF,  p-value: 0.1042
confint(yfit)
##                  2.5 %     97.5 %
## (Intercept)  3.6557595 4.26816455
## SMOKING     -0.7908987 0.07517284
# - (4b) Among Age >= 65 years
ofit <- lm(FEV1~SMOKING,data=old)
summary(ofit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING, data = old)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99650 -0.60340 -0.04252  0.66335  1.66817 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.8792     0.1220  31.794  < 2e-16 ***
## SMOKING      -1.0167     0.1726  -5.892 5.39e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8628 on 98 degrees of freedom
## Multiple R-squared:  0.2616, Adjusted R-squared:  0.2541 
## F-statistic: 34.72 on 1 and 98 DF,  p-value: 5.388e-08
confint(ofit)
##                 2.5 %    97.5 %
## (Intercept)  3.637059  4.121316
## SMOKING     -1.359125 -0.674283
# (5) Linear Regression Analysis Ignoring Age
ufit <- lm(FEV1~SMOKING,data=fsdata)
summary(ufit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING, data = fsdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17528 -0.67376 -0.06018  0.69897  2.35946 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9206     0.1014  38.670  < 2e-16 ***
## SMOKING      -0.6873     0.1434  -4.793 3.21e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.014 on 198 degrees of freedom
## Multiple R-squared:  0.104,  Adjusted R-squared:  0.09945 
## F-statistic: 22.98 on 1 and 198 DF,  p-value: 3.215e-06
confint(ufit)
##                  2.5 %    97.5 %
## (Intercept)  3.7206393  4.120510
## SMOKING     -0.9700351 -0.404532
# Example 2

# Low Birth Weight Data: 
# A study was performed at Baystate Medical Center, Springfield, Massachusetts, to 
# understand the variables that are related to  the likelihood of a mother giving 
# birth to a baby with low-birth weight (defined as a baby weighing less than 2500g). 
# 189 mothers were randomly selected to partcipate in the study. 

# Data Dictionary: 
# There are numerous variables in the lead dataset but those of interest for the 
# current exercise include:
# 1.  bwt     birth weight of baby (grams)
# 2.  age     age of mother during pregnancy (years)
# 3.    smoke   smoking status of mother during pregnancy (1 = smoker; 0 = non-smoker)

# Download and load the lead dataset used in lecture: 
download.file("http://www.duke.edu/~sgrambow/crp241data/bwdata.RData",
              destfile = "bwdata.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("bwdata.RData")

# Question 1: 
# Is there evidence of an association between a mother's age during pregancy 
# and a baby's birth weight? Perform a hypothesis test. 
fit1 <- lm(bwt ~ age, data=bwdata)
summary(fit1)
## 
## Call:
## lm(formula = bwt ~ age, data = bwdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2294.78  -517.63    10.51   530.80  1774.92 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2655.74     238.86   11.12   <2e-16 ***
## age            12.43      10.02    1.24    0.216    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 728.2 on 187 degrees of freedom
## Multiple R-squared:  0.008157,   Adjusted R-squared:  0.002853 
## F-statistic: 1.538 on 1 and 187 DF,  p-value: 0.2165
# Question 2: 
# Create a figure to describe the relationship examined in Question 1. 
plot(bwdata$age,bwdata$bwt,
     xlab='Mothers Age During Pregnancy (age)',
     ylab='Babys Birth Weight (grams)',
     cex=2)
abline(fit1,lwd=3)

# Question 3: 
# Is there evidence that the association between a mother's age during pregancy 
# and a baby's birth weight depends on the mother's smoking status during 
# pregnancy? Perform a hypothesis test. 
fit2 <- lm(bwt ~ age + smoke + age*smoke, data=bwdata)
summary(fit2)
## 
## Call:
## lm(formula = bwt ~ age + smoke + age * smoke, data = bwdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2189.27  -458.46    51.46   527.26  1521.39 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2406.06     292.19   8.235 3.18e-14 ***
## age            27.73      12.15   2.283   0.0236 *  
## smoke         798.17     484.34   1.648   0.1011    
## age:smoke     -46.57      20.45  -2.278   0.0239 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 709.3 on 185 degrees of freedom
## Multiple R-squared:  0.06909,    Adjusted R-squared:  0.054 
## F-statistic: 4.577 on 3 and 185 DF,  p-value: 0.004068
# Question 4: 
# Modify the figure created in Question 2 to describe the relationship examined 
# in Question 3. Using this figure, describe the observed effect modification, 
# if one exists. 
smoker    <- subset(bwdata,smoke==1)
nonsmoker <- subset(bwdata,smoke==0)

fit3 <- lm(bwt ~ age, data=smoker)
summary(fit3)
## 
## Call:
## lm(formula = bwt ~ age, data = smoker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1967.7  -464.1    13.6   498.8  1391.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3204.23     357.96   8.951 2.57e-13 ***
## age           -18.84      15.24  -1.236     0.22    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 657.3 on 72 degrees of freedom
## Multiple R-squared:  0.02078,    Adjusted R-squared:  0.007183 
## F-statistic: 1.528 on 1 and 72 DF,  p-value: 0.2204
confint(fit3)
##                  2.5 %     97.5 %
## (Intercept) 2490.64656 3917.81914
## age          -49.22246   11.54138
fit4 <- lm(bwt ~ age, data=nonsmoker)
summary(fit4)
## 
## Call:
## lm(formula = bwt ~ age, data = nonsmoker)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2189.27  -449.75    73.58   542.25  1521.39 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2406.06     305.05   7.887 2.14e-12 ***
## age            27.73      12.68   2.186   0.0309 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 740.5 on 113 degrees of freedom
## Multiple R-squared:  0.04058,    Adjusted R-squared:  0.03209 
## F-statistic:  4.78 on 1 and 113 DF,  p-value: 0.03085
confint(fit4)
##                   2.5 %     97.5 %
## (Intercept) 1801.691173 3010.42478
## age            2.602105   52.86065
plot(nonsmoker$age,nonsmoker$bwt,
     xlab='Mothers Age During Pregnancy (age)',
     ylab='Babys Birth Weight (grams)',
     col='royalblue',cex=2)
points(smoker$age,smoker$bwt,
       col='sienna3',pch=19,cex=2)
abline(fit4,col='royalblue',lty=2,lwd=3)
abline(fit3,col='sienna3',lwd=3)
legend('topleft',c('Smoker','Non-Smoker'),
       col=c('sienna3','royalblue'),
       lty=c(1,2),lwd=2,horiz=T)

# End of Program