#
# CRP 245 Part 2 Day 6-7 Interaction
#
# including interactions in models

# FEV1 and Smoking Status: 
# A study was conducted to determine if there was an association 
# between smoking status and forced expiratory volume in one 
# second (FEV1) in patients with COPD.FEV1 was measured in liters. 
# Smoking groups of interest were smokers (current,recent, or 
# former smokers) vs. non- smokers. Whether or not a patient was 65
# years or older was also recorded. 200 patients were randomly 
# selected from the researchers' clinical practice. 

# Data Dictionary: 
# 1.  FEV1      forced expiratory volume in one second (liters)
# 2.  SMOKING   patient smoking status (1 = Current/recent/former 
#                                      smoker; 0 = Non-smoker)
# 3.    AGE65     patient age (1 = Age >= 65 years; 0 = Age < 65 years)

# Download and load the lead dataset used in lecture: 
load(url("http://www.duke.edu/~sgrambow/crp241data/fev1_smoking.RData"))

# let's look at some quick summary stats
# FEV1 by AGE65
by(fsdata$FEV1,fsdata$AGE65,summary) 
## fsdata$AGE65: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.479   2.984   3.729   3.783   4.682   6.280 
## ------------------------------------------------------------ 
## fsdata$AGE65: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.704   3.444   3.371   4.011   5.547
# FEV1 by SMOKING
by(fsdata$FEV1,fsdata$SMOKING,summary) 
## fsdata$SMOKING: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.745   3.323   3.860   3.921   4.633   6.280 
## ------------------------------------------------------------ 
## fsdata$SMOKING: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.442   3.170   3.233   3.923   5.573
# Regression Model WITHOUT Interaction Term: 
# - This model ignores age

ufit <- lm(FEV1~SMOKING,data=fsdata)
summary(ufit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING, data = fsdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.17528 -0.67376 -0.06018  0.69897  2.35946 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9206     0.1014  38.670  < 2e-16 ***
## SMOKING      -0.6873     0.1434  -4.793 3.21e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.014 on 198 degrees of freedom
## Multiple R-squared:  0.104,  Adjusted R-squared:  0.09945 
## F-statistic: 22.98 on 1 and 198 DF,  p-value: 3.215e-06
confint(ufit)
##                  2.5 %    97.5 %
## (Intercept)  3.7206393  4.120510
## SMOKING     -0.9700351 -0.404532
#
# Regression Model WITH Interaction Term: 
# - This model accounts for effect modificaiton due to age

ifit <- lm(FEV1~SMOKING + AGE65 + SMOKING*AGE65,data=fsdata)
summary(ifit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING + AGE65 + SMOKING * AGE65, data = fsdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21667 -0.66888 -0.06022  0.77311  2.31807 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.96196    0.13910  28.484   <2e-16 ***
## SMOKING       -0.35786    0.19671  -1.819   0.0704 .  
## AGE65         -0.08277    0.19671  -0.421   0.6744    
## SMOKING:AGE65 -0.65884    0.27819  -2.368   0.0188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9836 on 196 degrees of freedom
## Multiple R-squared:  0.1653, Adjusted R-squared:  0.1525 
## F-statistic: 12.93 on 3 and 196 DF,  p-value: 9.579e-08
#
# Get age-specific effects of smoking
# - Do because interaction is statistically significant 

# Approach 1: Subset data and fit new models 
# - Subset based on levels of effect modifier (Z)
old   <- subset(fsdata,AGE65==1)
young <- subset(fsdata,AGE65==0)

# - Fit model for FEV1 (Y) ~ SMOKING (X) in each subset 
yfit <- lm(FEV1~SMOKING,data=young)
summary(yfit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING, data = young)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21667 -0.73090 -0.09361  0.85583  2.31807 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9620     0.1543   25.68   <2e-16 ***
## SMOKING      -0.3579     0.2182   -1.64    0.104    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.091 on 98 degrees of freedom
## Multiple R-squared:  0.02671,    Adjusted R-squared:  0.01678 
## F-statistic:  2.69 on 1 and 98 DF,  p-value: 0.1042
confint(yfit)
##                  2.5 %     97.5 %
## (Intercept)  3.6557595 4.26816455
## SMOKING     -0.7908987 0.07517284
# -- Among Age >= 65 years
ofit <- lm(FEV1~SMOKING,data=old)
summary(ofit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING, data = old)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99650 -0.60340 -0.04252  0.66335  1.66817 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.8792     0.1220  31.794  < 2e-16 ***
## SMOKING      -1.0167     0.1726  -5.892 5.39e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8628 on 98 degrees of freedom
## Multiple R-squared:  0.2616, Adjusted R-squared:  0.2541 
## F-statistic: 34.72 on 1 and 98 DF,  p-value: 5.388e-08
confint(ofit)
##                 2.5 %    97.5 %
## (Intercept)  3.637059  4.121316
## SMOKING     -1.359125 -0.674283
# Approach 2: Combination of betas from interaction model 
# - More statistically appropriate, but can be tricky to execute 
# install.packages('multcomp')
library(multcomp)
## Loading required package: mvtnorm
## Loading required package: survival
## Loading required package: TH.data
## Loading required package: MASS
## 
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
## 
##     geyser
summary(ifit)
## 
## Call:
## lm(formula = FEV1 ~ SMOKING + AGE65 + SMOKING * AGE65, data = fsdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21667 -0.66888 -0.06022  0.77311  2.31807 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.96196    0.13910  28.484   <2e-16 ***
## SMOKING       -0.35786    0.19671  -1.819   0.0704 .  
## AGE65         -0.08277    0.19671  -0.421   0.6744    
## SMOKING:AGE65 -0.65884    0.27819  -2.368   0.0188 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9836 on 196 degrees of freedom
## Multiple R-squared:  0.1653, Adjusted R-squared:  0.1525 
## F-statistic: 12.93 on 3 and 196 DF,  p-value: 9.579e-08
# - 0's and 1's indicate the regression coefficients that should 
#   added/subtracted to estimate the effect of interest 
# there are four betas in model

# smoke diff among old is beta 1 + beta 3
smokediff.old <- matrix(c(0, 1, 0, 1), 1)

# smoke diff among young is beta 1 
smokediff.young <- matrix(c(0, 1, 0, 0), 1)
# combine hypotheses together
K <- rbind(smokediff.old,smokediff.young)
# conduct tests of both hypotheses and get resulting CIs
t <- glht(ifit, linfct = K)
summary(t)
## 
##   Simultaneous Tests for General Linear Hypotheses
## 
## Fit: lm(formula = FEV1 ~ SMOKING + AGE65 + SMOKING * AGE65, data = fsdata)
## 
## Linear Hypotheses:
##        Estimate Std. Error t value Pr(>|t|)    
## 1 == 0  -1.0167     0.1967  -5.169 1.16e-06 ***
## 2 == 0  -0.3579     0.1967  -1.819    0.136    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## (Adjusted p values reported -- single-step method)
confint(t)
## 
##   Simultaneous Confidence Intervals
## 
## Fit: lm(formula = FEV1 ~ SMOKING + AGE65 + SMOKING * AGE65, data = fsdata)
## 
## Quantile = 2.2532
## 95% family-wise confidence level
##  
## 
## Linear Hypotheses:
##        Estimate lwr      upr     
## 1 == 0 -1.01670 -1.45994 -0.57347
## 2 == 0 -0.35786 -0.80110  0.08537
# End of Program