# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 9 ~
# ~ Interaction ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Example 1
# FEV1 and Smoking Status:
# A study was conducted to determine if there was an association
# between smoking status and forced expiratory volume in one
# second (FEV1) in patients with COPD.FEV1 was measured in liters.
# Smoking groups of interest were smokers (current,recent, or
# former smokers) vs. non- smokers. Whether or not a patient was 65
# years or older was also recorded. 200 patients were randomly
# selected from the researchers' clinical practice.
# Data Dictionary:
# 1. FEV1 forced expiratory volume in one second (liters)
# 2. SMOKING patient smoking status (1 = Current/recent/former
# smoker; 0 = Non-smoker)
# 3. AGE65 patient age (1 = Age >= 65 years; 0 = Age < 65 years)
# Download and load the lead dataset used in lecture:
load(url("http://www.duke.edu/~sgrambow/crp241data/fev1_smoking.RData"))
# let's look at some quick summary stats
# FEV1 by AGE65
by(fsdata$FEV1,fsdata$AGE65,summary)
## fsdata$AGE65: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.479 2.984 3.729 3.783 4.682 6.280
## ------------------------------------------------------------
## fsdata$AGE65: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.704 3.444 3.371 4.011 5.547
# FEV1 by SMOKING
by(fsdata$FEV1,fsdata$SMOKING,summary)
## fsdata$SMOKING: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.745 3.323 3.860 3.921 4.633 6.280
## ------------------------------------------------------------
## fsdata$SMOKING: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.442 3.170 3.233 3.923 5.573
# (1) Two Sample T-test Analysis in Entire Cohort
# - Assuming population variances are equal
t.test(fsdata$FEV1~fsdata$SMOKING,var.equal=T)
##
## Two Sample t-test
##
## data: fsdata$FEV1 by fsdata$SMOKING
## t = 4.7934, df = 198, p-value = 3.215e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.4045320 0.9700351
## sample estimates:
## mean in group 0 mean in group 1
## 3.920575 3.233291
3.233291 - 3.920575
## [1] -0.687284
# (2) Two Sample T-test Analysis Stratified by Age
old <- subset(fsdata,AGE65==1)
young <- subset(fsdata,AGE65==0)
# - (2a) Among Age < 65 years
by(young$FEV1,young$SMOKING,summary)
## young$SMOKING: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.745 3.298 3.859 3.962 4.704 6.280
## ------------------------------------------------------------
## young$SMOKING: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.479 2.680 3.510 3.604 4.642 5.573
summary(young$FEV1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.479 2.984 3.729 3.783 4.682 6.280
t.test(young$FEV1~young$SMOKING,var.equal=T)
##
## Two Sample t-test
##
## data: young$FEV1 by young$SMOKING
## t = 1.64, df = 98, p-value = 0.1042
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.07517284 0.79089874
## sample estimates:
## mean in group 0 mean in group 1
## 3.961962 3.604099
3.604099 - 3.961962
## [1] -0.357863
# - (2b) Among Age >= 65 years
by(old$FEV1,old$SMOKING,summary)
## old$SMOKING: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.883 3.348 3.860 3.879 4.485 5.547
## ------------------------------------------------------------
## old$SMOKING: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.180 2.803 2.862 3.644 4.243
summary(old$FEV1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.704 3.444 3.371 4.011 5.547
t.test(old$FEV1~old$SMOKING,var.equal=T)
##
## Two Sample t-test
##
## data: old$FEV1 by old$SMOKING
## t = 5.8922, df = 98, p-value = 5.388e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.674283 1.359125
## sample estimates:
## mean in group 0 mean in group 1
## 3.879188 2.862483
2.862483 - 3.879188
## [1] -1.016705
# (3) Linear Regression Analysis with Interaction Term
ifit <- lm(FEV1~SMOKING + AGE65 + SMOKING*AGE65,data=fsdata)
summary(ifit)
##
## Call:
## lm(formula = FEV1 ~ SMOKING + AGE65 + SMOKING * AGE65, data = fsdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21667 -0.66888 -0.06022 0.77311 2.31807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.96196 0.13910 28.484 <2e-16 ***
## SMOKING -0.35786 0.19671 -1.819 0.0704 .
## AGE65 -0.08277 0.19671 -0.421 0.6744
## SMOKING:AGE65 -0.65884 0.27819 -2.368 0.0188 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9836 on 196 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1525
## F-statistic: 12.93 on 3 and 196 DF, p-value: 9.579e-08
# (4) Linear Regression Analysis Stratified by Age
old <- subset(fsdata,AGE65==1)
young <- subset(fsdata,AGE65==0)
# - (4a) Among Age < 65 years
yfit <- lm(FEV1~SMOKING,data=young)
summary(yfit)
##
## Call:
## lm(formula = FEV1 ~ SMOKING, data = young)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21667 -0.73090 -0.09361 0.85583 2.31807
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9620 0.1543 25.68 <2e-16 ***
## SMOKING -0.3579 0.2182 -1.64 0.104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.091 on 98 degrees of freedom
## Multiple R-squared: 0.02671, Adjusted R-squared: 0.01678
## F-statistic: 2.69 on 1 and 98 DF, p-value: 0.1042
confint(yfit)
## 2.5 % 97.5 %
## (Intercept) 3.6557595 4.26816455
## SMOKING -0.7908987 0.07517284
# - (4b) Among Age >= 65 years
ofit <- lm(FEV1~SMOKING,data=old)
summary(ofit)
##
## Call:
## lm(formula = FEV1 ~ SMOKING, data = old)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.99650 -0.60340 -0.04252 0.66335 1.66817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.8792 0.1220 31.794 < 2e-16 ***
## SMOKING -1.0167 0.1726 -5.892 5.39e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8628 on 98 degrees of freedom
## Multiple R-squared: 0.2616, Adjusted R-squared: 0.2541
## F-statistic: 34.72 on 1 and 98 DF, p-value: 5.388e-08
confint(ofit)
## 2.5 % 97.5 %
## (Intercept) 3.637059 4.121316
## SMOKING -1.359125 -0.674283
# (5) Linear Regression Analysis Ignoring Age
ufit <- lm(FEV1~SMOKING,data=fsdata)
summary(ufit)
##
## Call:
## lm(formula = FEV1 ~ SMOKING, data = fsdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.17528 -0.67376 -0.06018 0.69897 2.35946
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9206 0.1014 38.670 < 2e-16 ***
## SMOKING -0.6873 0.1434 -4.793 3.21e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.014 on 198 degrees of freedom
## Multiple R-squared: 0.104, Adjusted R-squared: 0.09945
## F-statistic: 22.98 on 1 and 198 DF, p-value: 3.215e-06
confint(ufit)
## 2.5 % 97.5 %
## (Intercept) 3.7206393 4.120510
## SMOKING -0.9700351 -0.404532
# Example 2
# Low Birth Weight Data:
# A study was performed at Baystate Medical Center,
# Springfield, Massachusetts, to understand the variables
# that are related to the likelihood of a mother giving
# birth to a baby with low-birth weight (defined as a baby
# weighing less than 2500g). 189 mothers were randomly
# selected to partcipate in the study.
# Data Dictionary:
# There are numerous variables in the lead dataset but those of interest for the
# current exercise include:
# 1. bwt birth weight of baby (grams)
# 2. age age of mother during pregnancy (years)
# 3. smoke smoking status of mother during pregnancy
# (1 = smoker; 0 = non-smoker)
# Download and load the lead dataset used in lecture:
load(url("http://www.duke.edu/~sgrambow/crp241data/bwdata.RData"))
# Question 1:
# Is there evidence of an association between a mother's age
# during pregnancy and a baby's birth weight? Perform a
# hypothesis test.
fit1 <- lm(bwt ~ age, data=bwdata)
summary(fit1)
##
## Call:
## lm(formula = bwt ~ age, data = bwdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2294.78 -517.63 10.51 530.80 1774.92
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2655.74 238.86 11.12 <2e-16 ***
## age 12.43 10.02 1.24 0.216
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 728.2 on 187 degrees of freedom
## Multiple R-squared: 0.008157, Adjusted R-squared: 0.002853
## F-statistic: 1.538 on 1 and 187 DF, p-value: 0.2165
# Question 2:
# Create a figure to describe the relationship examined in
# Question 1.
plot(bwdata$age,bwdata$bwt,
xlab='Mothers Age During Pregnancy (age)',
ylab='Babys Birth Weight (grams)',
cex=2)
abline(fit1,lwd=3)

# Question 3:
# Is there evidence that the association between a mother's
# age during pregnancy and a baby's birth weight depends on
# the mother's smoking status during
# pregnancy? Perform a hypothesis test.
fit2 <- lm(bwt ~ age + smoke + age*smoke, data=bwdata)
summary(fit2)
##
## Call:
## lm(formula = bwt ~ age + smoke + age * smoke, data = bwdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2189.27 -458.46 51.46 527.26 1521.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2406.06 292.19 8.235 3.18e-14 ***
## age 27.73 12.15 2.283 0.0236 *
## smoke 798.17 484.34 1.648 0.1011
## age:smoke -46.57 20.45 -2.278 0.0239 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 709.3 on 185 degrees of freedom
## Multiple R-squared: 0.06909, Adjusted R-squared: 0.054
## F-statistic: 4.577 on 3 and 185 DF, p-value: 0.004068
# Question 4:
# Modify the figure created in Question 2 to describe the
# relationship examined in Question 3. Using this figure,
# describe the observed effect modification,
# if one exists.
smoker <- subset(bwdata,smoke==1)
nonsmoker <- subset(bwdata,smoke==0)
fit3 <- lm(bwt ~ age, data=smoker)
summary(fit3)
##
## Call:
## lm(formula = bwt ~ age, data = smoker)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1967.7 -464.1 13.6 498.8 1391.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3204.23 357.96 8.951 2.57e-13 ***
## age -18.84 15.24 -1.236 0.22
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 657.3 on 72 degrees of freedom
## Multiple R-squared: 0.02078, Adjusted R-squared: 0.007183
## F-statistic: 1.528 on 1 and 72 DF, p-value: 0.2204
confint(fit3)
## 2.5 % 97.5 %
## (Intercept) 2490.64656 3917.81914
## age -49.22246 11.54138
fit4 <- lm(bwt ~ age, data=nonsmoker)
summary(fit4)
##
## Call:
## lm(formula = bwt ~ age, data = nonsmoker)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2189.27 -449.75 73.58 542.25 1521.39
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2406.06 305.05 7.887 2.14e-12 ***
## age 27.73 12.68 2.186 0.0309 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 740.5 on 113 degrees of freedom
## Multiple R-squared: 0.04058, Adjusted R-squared: 0.03209
## F-statistic: 4.78 on 1 and 113 DF, p-value: 0.03085
confint(fit4)
## 2.5 % 97.5 %
## (Intercept) 1801.691173 3010.42478
## age 2.602105 52.86065
plot(nonsmoker$age,nonsmoker$bwt,
xlab='Mothers Age During Pregnancy (age)',
ylab='Babys Birth Weight (grams)',
col='royalblue',cex=2)
points(smoker$age,smoker$bwt,
col='sienna3',pch=19,cex=2)
abline(fit4,col='royalblue',lty=2,lwd=3)
abline(fit3,col='sienna3',lwd=3)
legend('topleft',c('Smoker','Non-Smoker'),
col=c('sienna3','royalblue'),
lty=c(1,2),lwd=2,horiz=T)

# End of Program