# ~ CRP 245 Module 2 Day 1 ~
# Example 1 -- Intutition about "Adjusted" Regression
# FEV1 and Genetic Variation:
# A study was conducted to determine if there was an
# association between a genetic variant and forced
# expiratory volume in one second (FEV1) in patients with COPD.
# FEV1 was measured in liters. Genotypes of interest were wild
# type vs. mutant (i.e. heterozygous or homozygous for risk
# allele). Sex was also recorded. 100 patients were randomly
# selected from the researchers clinical practice.
# Data Dictionary:
# 1. FEV1 forced expiratory volume in one second (liters)
# 2. GENO patient genotyps (1 = Mutant; 0 = Wild Type)
# 3. SEX patient sex (1 = Female; 0 = Male)
# Download and load the FEV1 dataset used in lecture:
download.file("http://www.duke.edu/~sgrambow/crp241data/fev1_geno.RData",
destfile = "fev1_geno.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("fev1_geno.RData")
# Let's start with a standard unadjusted
# analysis:
# (1) Two Sample T-test Analysis
# - Assuming population variances are equal
t.test(fgdata$FEV1~fgdata$GENO,var.equal=T)
##
## Two Sample t-test
##
## data: fgdata$FEV1 by fgdata$GENO
## t = 2.7962, df = 198, p-value = 0.005681
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1228584 0.7108077
## sample estimates:
## mean in group 0 mean in group 1
## 3.641340 3.224507
3.641340 -3.224507
## [1] 0.416833
# Note that the difference in the means is
# [1] 0.416833
# which correponds to the change in Y for
# a 1-unit change in X, which is simply
# the slope for a simple linear
# regression of Y on X.
# Is sex a confounder of the relationship between
# FEV1 and genotype? Add labels to sex and genotype
# variables to make ouptut easier to interpret
fgdata$fSEX <- factor(fgdata$SEX,labels=c('Male','Female'))
fgdata$fGENO <- factor(fgdata$GENO,labels=c('Wild Type','Mutant'))
# - Check distribution of sex by each genotype
table(fgdata$fSEX,fgdata$fGENO) # Counts
##
## Wild Type Mutant
## Male 70 30
## Female 42 58
prop.table(table(fgdata$fSEX,fgdata$fGENO),2) # Proportions among genotype
##
## Wild Type Mutant
## Male 0.6250000 0.3409091
## Female 0.3750000 0.6590909
# We see that the sex distribution by group differs
# Wild Type is 63% male
# Mutant Type is 38% male
# - Check distribution of FEV1 by each genotype
boxplot(fgdata$FEV1~fgdata$fSEX,
ylab='FEV1 Level (liters)')

# We see that FEV differs by Sex
# with males having higher FEV1 levels
# (2) Unadjusted Linear Regression Analysis
ufit <- lm(FEV1 ~ GENO, data=fgdata)
summary(ufit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = fgdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35865 -0.68975 -0.02921 0.59003 2.63869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.64134 0.09888 36.824 < 2e-16 ***
## GENO -0.41683 0.14907 -2.796 0.00568 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.046 on 198 degrees of freedom
## Multiple R-squared: 0.03799, Adjusted R-squared: 0.03313
## F-statistic: 7.819 on 1 and 198 DF, p-value: 0.005681
# Yields slope estimate for GENO of -0.41683
# which is difference in means we saw with
# t-test analysis
# (3) Adjusted Linear Regression Analysis
afit <- lm(FEV1 ~ GENO + SEX, data=fgdata)
summary(afit)
##
## Call:
## lm(formula = FEV1 ~ GENO + SEX, data = fgdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23691 -0.69417 -0.01816 0.77985 2.36431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9157 0.1081 36.213 < 2e-16 ***
## GENO -0.2090 0.1467 -1.425 0.156
## SEX -0.7317 0.1456 -5.025 1.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9877 on 197 degrees of freedom
## Multiple R-squared: 0.1473, Adjusted R-squared: 0.1386
## F-statistic: 17.02 on 2 and 197 DF, p-value: 1.526e-07
# Yields slope estimate for GENO of -0.2090
# which is difference in means 'adjusted'
# for sex. What does this mean?
# Let's stratify to understand...
# (4) Stratified Analysis
females <- subset(fgdata,SEX==1)
males <- subset(fgdata,SEX==0)
# - How does "adjusting" work
# Calculate mean difference among females
by(females$FEV1,females$GENO,summary)
## females$GENO: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.283 2.420 3.152 3.141 3.982 4.947
## ------------------------------------------------------------
## females$GENO: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.648 3.030 3.006 3.644 4.716
3.006 - 3.141
## [1] -0.135
# yields
# [1] -0.135 which is
# slope (or difference in means) for females
# Calculate mean difference among males
by(males$FEV1,males$GENO,summary)
## males$GENO: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.679 3.170 3.898 3.942 4.838 6.280
## ------------------------------------------------------------
## males$GENO: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.589 3.011 3.431 3.646 4.545 5.673
3.646 - 3.942
## [1] -0.296
# yields
# [1] -0.296 which is
# slope (or difference in means) for males
# now we calculate
# Average of strata-differences
(3.006 - 3.141 + 3.646 - 3.942)/2
## [1] -0.2155
# yields
# [1] -0.2155
# which is approximately equal to GENO slope
# in adjusted model (-.2090) -- this is
# what adjustment is doing!
# We can get same result by
# examining unadjusted simple linear
# regression models by SEX
# - (4a) Unadjusted Linear Regression Analysis Among Females
ffit <- lm(FEV1 ~ GENO, data=females)
summary(ffit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = females)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85813 -0.63437 0.02404 0.72909 1.80653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1408 0.1354 23.202 <2e-16 ***
## GENO -0.1345 0.1777 -0.756 0.451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8773 on 98 degrees of freedom
## Multiple R-squared: 0.005805, Adjusted R-squared: -0.004339
## F-statistic: 0.5723 on 1 and 98 DF, p-value: 0.4512
# yields slope of -0.1345 as above
# - (4a) Unadjusted Linear Regression Analysis Among Males
mfit <- lm(FEV1 ~ GENO, data=males)
summary(mfit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = males)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.26284 -0.70681 -0.09617 0.91070 2.33838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9416 0.1303 30.249 <2e-16 ***
## GENO -0.2954 0.2379 -1.242 0.217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.09 on 98 degrees of freedom
## Multiple R-squared: 0.01549, Adjusted R-squared: 0.005442
## F-statistic: 1.542 on 1 and 98 DF, p-value: 0.2173
# yields slope of -0.2954 as above
# Muscle Contraction in Alcoholics:
# This dataset contains maximum voluntary contraction (MVC) of quadriceps muscle, age,
# and height, of 41 male alcoholics. Researchers were interested in investigating the
# effect of both age and height on mean MVC.
# Data Set: mve
# Data Dictionary:
# (1) age age (years)
# (2) height height (cm)
# (3) mvc Max voluntary contraction, quadriceps muscle (newtons)
# Download and load the data files
download.file("http://www.duke.edu/~sgrambow/crp245data/mvc.RData",
destfile = "mvc.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("mvc.RData")
# Summary and Visualization of Muscle Data
# summary for Y -- mvc
summary(muscle$mvc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 74.0 270.0 343.0 322.1 402.0 540.0
sd(muscle$mvc)
## [1] 112.1767
hist(muscle$mvc)

# summary for X1 -- age -- in years
summary(muscle$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.00 34.00 41.00 43.54 53.00 65.00
sd(muscle$age)
## [1] 11.32717
hist(muscle$age)

plot(muscle$age,muscle$mvc)

# summary for X2 -- height -- height in cm
summary(muscle$height)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 155.0 167.0 172.0 170.7 175.0 180.0
sd(muscle$height)
## [1] 6.53079
hist(muscle$height)

plot(muscle$height,muscle$mvc)

# Simple Linear Regression
# Model mvc (Y) ~ age (X1)
ufit1 <- lm(muscle$mvc ~ muscle$age)
summary(ufit1)
##
## Call:
## lm(formula = muscle$mvc ~ muscle$age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.88 -67.05 10.41 64.31 207.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 501.858 64.794 7.745 2.08e-09 ***
## muscle$age -4.128 1.441 -2.864 0.0067 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.3 on 39 degrees of freedom
## Multiple R-squared: 0.1738, Adjusted R-squared: 0.1526
## F-statistic: 8.203 on 1 and 39 DF, p-value: 0.006701
# Simple Linear Regression
# Model mvc (Y) ~ height (X2)
ufit2 <- lm(muscle$mvc ~ muscle$height)
summary(ufit2)
##
## Call:
## lm(formula = muscle$mvc ~ muscle$height)
##
## Residuals:
## Min 1Q Median 3Q Max
## -228.446 -43.229 0.729 49.134 195.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -907.626 426.612 -2.128 0.03975 *
## muscle$height 7.203 2.497 2.885 0.00635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103.1 on 39 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1547
## F-statistic: 8.321 on 1 and 39 DF, p-value: 0.006351
# Multiple Linear Regression
# Model: mvc (Y) ~ age (X1) + height (X2)
mfit <- lm(mvc ~ age + height, data=muscle)
summary(mfit)
##
## Call:
## lm(formula = mvc ~ age + height, data = muscle)
##
## Residuals:
## Min 1Q Median 3Q Max
## -220.523 -33.483 5.665 50.917 170.841
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -465.626 460.333 -1.011 0.3182
## age -3.075 1.467 -2.096 0.0428 *
## height 5.398 2.545 2.121 0.0405 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 98.92 on 38 degrees of freedom
## Multiple R-squared: 0.2612, Adjusted R-squared: 0.2224
## F-statistic: 6.719 on 2 and 38 DF, p-value: 0.003173
# The ANOVA Table for Multiple Regression
# ANOVA Table can provide overall summary
# of a multiple regression analysis
# Want to load the CAR package
# to use its Anova function
# which calculates Type III sums of squares
# which are generally best
# nice ANOVA tutorial here:
# http://www.understandingdata.net/2017/05/11/anova-tables-in-r/#APAtable
#
library(car)
## Loading required package: carData
# generate ANOVA table for our model
Anova(mfit,type = "III")
## Anova Table (Type III tests)
##
## Response: mvc
## Sum Sq Df F value Pr(>F)
## (Intercept) 10012 1 1.0231 0.31818
## age 42985 1 4.3927 0.04281 *
## height 44024 1 4.4989 0.04050 *
## Residuals 371849 38
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Other Useful Regression functions/quantities
coefficients(mfit) # model coefficients
## (Intercept) age height
## -465.626228 -3.075405 5.398182
confint(mfit, level=0.95) # CIs for model parameters
## 2.5 % 97.5 %
## (Intercept) -1397.5226142 466.2701576
## age -6.0459158 -0.1048933
## height 0.2460489 10.5503147
fitted(mfit) # predicted values
## 1 2 3 4 5 6 7 8
## 356.6622 396.0197 382.1479 392.9443 367.5235 367.5235 299.6699 364.4481
## 9 10 11 12 13 14 15 16
## 402.2354 391.4390 374.4918 401.4827 328.2310 365.2656 345.9957 342.9203
## 17 18 19 20 21 22 23 24
## 283.5403 348.3184 345.2430 315.1767 369.1585 366.0831 312.9188 264.3352
## 25 26 27 28 29 30 31 32
## 342.2325 339.1571 344.5553 282.0999 327.6081 229.6882 305.2628 316.0591
## 33 34 35 36 37 38 39 40
## 299.8646 282.9174 326.1029 201.9447 219.7094 221.2795 202.0096 241.3670
## 41
## 241.3670
residuals(mfit) # residuals
## 1 2 3 4 5 6
## 109.337761 -92.019661 -39.147893 11.055743 -220.523498 -73.523498
## 7 8 9 10 11 12
## 92.330088 -217.448093 -132.235365 20.560998 27.508171 -33.482738
## 13 14 15 16 17 18
## 162.769030 -169.265616 -2.995666 -23.920261 103.459739 92.681557
## 19 20 21 22 23 24
## 95.756962 27.823275 170.841457 50.916862 -18.918843 5.664793
## 25 26 27 28 29 30
## 25.767471 101.842876 47.444694 11.900098 40.391867 -13.688234
## 31 32 33 34 35 36
## -11.262779 75.940858 166.135403 21.082576 -2.102879 -5.944698
## 37 38 39 40 41
## -121.709393 -5.279543 -6.009593 -104.367015 -167.367015
# Logistic Regresssion Example:
# Affairs Data
# Fairs Affair Data Example:
# A cross-sectional study was performed
# to understand the factors that correlate
# with an individuals decision to engage in
# extramarital affairs. 601 individuals
# participated in the study. The response
# of interest was whether or not the subject
# engaged in any extramarital sexual relations
# at least once in the last 12 months. Researchers
# wanted to understand the relationship between
# a subjects decision to engage in an affair and
# the length of their marriage.
# Data Set: affairs
# Data Dictionary:
# (1) id Subject identifier
# (2) anyaffair Subject engaged in extramarital sexual relations at least once
# in the last 12 months (1 = Yes; 0 = No)
# (3) yrsmarried Number of years subject has been married at time of study (in years)
# (4) gender Subject gender (female or male)
# (5) children Does the subject have children (no or yes)
## Download and load the data file = bpdata
download.file("http://www.duke.edu/~sgrambow/crp245data/affairs.RData",
destfile = "affairs.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("affairs.RData")
# Summary of Affairs Data
# X1 -- yrsmarried
summary(affairs$yrsmarried)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.125 4.000 7.000 8.178 15.000 15.000
sd(affairs$yrsmarried)
## [1] 5.571303
# X2 -- gender
table(affairs$gender)
##
## female male
## 315 286
# multiple logistic model with yrsmarried and gender
fit.mlog <- glm(anyaffair ~ yrsmarried+gender, data=affairs, family='binomial')
summary(fit.mlog)
##
## Call:
## glm(formula = anyaffair ~ yrsmarried + gender, family = "binomial",
## data = affairs)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9294 -0.8232 -0.6625 -0.5775 1.9197
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.71395 0.20633 -8.307 < 2e-16 ***
## yrsmarried 0.05843 0.01727 3.383 0.000718 ***
## gendermale 0.22157 0.19064 1.162 0.245139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.38 on 600 degrees of freedom
## Residual deviance: 662.15 on 598 degrees of freedom
## AIC: 668.15
##
## Number of Fisher Scoring iterations: 4
#confidence limits on log odds scale
confint(fit.mlog)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -2.12990424 -1.31985336
## yrsmarried 0.02479755 0.09260135
## gendermale -0.15199429 0.59625678
# converting to odds ratio scale
exp(fit.mlog$coefficients)
## (Intercept) yrsmarried gendermale
## 0.180152 1.060170 1.248036
exp(confint(fit.mlog))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1188487 0.2671745
## yrsmarried 1.0251076 1.0970243
## gendermale 0.8589932 1.8153110
# per decade of marriage
exp(10*fit.mlog$coefficients)
## (Intercept) yrsmarried gendermale
## 3.600734e-08 1.793728e+00 9.167926e+00
exp(10*confint(fit.mlog))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 5.622681e-10 1.853317e-06
## yrsmarried 1.281429e+00 2.524425e+00
## gendermale 2.187244e-01 3.886067e+02
# Thus, the OR for having an affair per decade of marriage is 1.79 with
# 95% CI from 1.28 to 2.52
# Survival Regression Example -- CGD Data
# Data are from a placebo controlled trial
# of gamma interferon in chronic granulotomous
# disease (CGD). Outcome of interest is time to
# first serious infection observed through end
# of study for each patient.
# Data Set: cgd
# Data Dictionary:
# (1) id: subject identifiction number
# (2) treat: placebo or gamma interferon (1 = gamma; 0 = placebo)
# (3) sex: sex (1 = Male; 0 = Female)
# (4) age: age in years, at study entry
# (5) height: height in cm at study entry
# (6) weight: weight in kg at study entry
# (7) inherit: pattern of inheritance (1 = X-linked; 0 = autosomal)
# (8) propylac: use of prophylactic antibiotics at study entry (1 = yes; 0 = no)
# (9) tstop: time to fist serious infection or censoring
# (10) status: binary indicator of event or censoring (1 = event, 0 = censor)
## Download and load the data file = bpdata
download.file("http://www.duke.edu/~sgrambow/crp241data/cgd.RData",
destfile = "cgd.RData",quiet=TRUE,mode="wb",cacheOK=FALSE)
load("cgd.RData")
# Install and load package for Survival Analysis
#install.packages('survival')
library(survival)
##
## Attaching package: 'survival'
## The following object is masked _by_ '.GlobalEnv':
##
## cgd
# x1 -- treatment
table(cgd$treat)
##
## 0 1
## 65 63
# x2 -- age
summary(cgd$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 14.64 22.00 44.00
sd(cgd$age)
## [1] 9.844247
# Y -- time to first infection or censor (tstop)
# Obtain survival estimates using survfit() function
fit.km <- survfit(Surv(tstop, status)~ 1, data=cgd, conf.type="none")
fit.km
## Call: survfit(formula = Surv(tstop, status) ~ 1, data = cgd, conf.type = "none")
##
## n events median
## 128 44 373
# Create Kaplan Meier plot of survival curve by treatment arm
fit.kmtrt <- survfit(Surv(tstop, status) ~ treat, data=cgd)
# install.packages('surminer')
# we need this package for the ggsurvplot
# command used below to create the KM plot
library(survminer)
## Loading required package: ggplot2
## Loading required package: ggpubr
## Loading required package: magrittr
ggsurvplot(fit.kmtrt, data=cgd, ggtheme=theme_minimal(),
risk.table = TRUE)

# Compare survival curves by treatment arm using log-rank test
survdiff(Surv(tstop, status) ~ treat, data=cgd)
## Call:
## survdiff(formula = Surv(tstop, status) ~ treat, data = cgd)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## treat=0 65 30 18.9 6.48 11.7
## treat=1 63 14 25.1 4.89 11.7
##
## Chisq= 11.7 on 1 degrees of freedom, p= 6e-04
# Compare survival curves by treatment arm using Cox PH regression
fit.mcox <- coxph(Surv(tstop, status) ~ treat+age, data=cgd)
summary(fit.mcox)
## Call:
## coxph(formula = Surv(tstop, status) ~ treat + age, data = cgd)
##
## n= 128, number of events= 44
##
## coef exp(coef) se(coef) z Pr(>|z|)
## treat -1.15715 0.31438 0.33740 -3.430 0.000604 ***
## age -0.02832 0.97208 0.01714 -1.652 0.098485 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## treat 0.3144 3.181 0.1623 0.609
## age 0.9721 1.029 0.9400 1.005
##
## Concordance= 0.656 (se = 0.045 )
## Likelihood ratio test= 14.71 on 2 df, p=6e-04
## Wald test = 13.41 on 2 df, p=0.001
## Score (logrank) test = 14.39 on 2 df, p=8e-04
# change hazard ratio to reflect 5-year change instead of 1-year
exp(5*fit.mcox$coefficients)
## treat age
## 0.003071051 0.867986783
# change confidence intervals to match 5-year change
exp(5*confint(fit.mcox))
## 2.5 % 97.5 %
## treat 0.0001125455 0.0838004
## age 0.7337948122 1.0267190
# Thus, the HR for 5-year increase in age is 0.87 with
# 95% CI from 0.73 to 1.03
# change HR's so they > 1
exp(-(fit.mcox$coefficients))
## treat age
## 3.180846 1.028720
# do same to CI's
exp(-confint(fit.mcox))
## 2.5 % 97.5 %
## treat 6.162180 1.6419154
## age 1.063861 0.9947402
# The HR for Placebo to Treat is 3.18 with 95% CI from 1.64 to 6.16
# The HR for each 1 year decrease in age is 1.03 with 95% CI from 0.99 to 1.06
# End of Porgram