# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 8 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Example 1
# FEV1 and Genetic Variation:
# A study was conducted to determine if there was an
# association between a genetic variant and forced
# expiratory volume in one second (FEV1) in patients
# with COPD. FEV1 was measured in liters. Genotypes
# of interest were wild type vs. mutant (i.e. heterozygous
# or homozygous for risk allele). Sex at birth was also recorded.
# 100 patients were randomly selected from the researchers
# clinical practice.
# Data Dictionary:
# 1. FEV1 forced expiratory volume in one second (liters)
# 2. GENO patient genotypes (1 = Mutant; 0 = Wild Type)
# 3. SEX patient sex (1 = Female; 0 = Male)
# Download and load the lead dataset used in lecture:
load(url("http://www.duke.edu/~sgrambow/crp241data/fev1_geno.RData"))
# examine the data
View(fgdata) # shows the raw data
str(fgdata) # shows structure of variables in data frame
## 'data.frame': 200 obs. of 3 variables:
## $ FEV1: num 2.66 3.62 4.8 1.61 3.19 ...
## $ GENO: int 0 0 0 0 1 1 0 0 0 1 ...
## $ SEX : int 0 1 0 1 0 1 0 1 0 1 ...
summary(fgdata) # provides descriptive summaries by variable
## FEV1 GENO SEX
## Min. :1.283 Min. :0.00 Min. :0.0
## 1st Qu.:2.706 1st Qu.:0.00 1st Qu.:0.0
## Median :3.387 Median :0.00 Median :0.5
## Mean :3.458 Mean :0.44 Mean :0.5
## 3rd Qu.:4.137 3rd Qu.:1.00 3rd Qu.:1.0
## Max. :6.280 Max. :1.00 Max. :1.0
# Add labels to sex and genotype variables
# to make ouptut easier to interpret
fgdata$fSEX <- factor(fgdata$SEX,labels=c('Male','Female'))
fgdata$fGENO <- factor(fgdata$GENO,labels=c('Wild Type','Mutant'))
# Is sex a confounder of the relationship between FEV1
# and genotype?
# Checking confounder criteria
# Is sex associated with covariate (genotype)
# - Check distribution of sex by each genotype
table(fgdata$fSEX,fgdata$fGENO) # Counts
##
## Wild Type Mutant
## Male 70 30
## Female 42 58
prop.table(table(fgdata$fSEX,fgdata$fGENO),2) # Proportions among genotype
##
## Wild Type Mutant
## Male 0.6250000 0.3409091
## Female 0.3750000 0.6590909
# ANSWER: 38% vs. 66% females among patients with
# Wild Type vs. Mutant genotype
# Is sex associated with outcome?
# - Check distribution of FEV1 by each genotype
boxplot(fgdata$FEV1~fgdata$fSEX,
ylab='FEV1 Level (liters)')

# boxplot suggests there are differences
# in distribution of FEV1 by Sex
# (1) Two Sample T-test Analysis
# - Assuming popluation variances are equal
t.test(fgdata$FEV1~fgdata$GENO,var.equal=T)
##
## Two Sample t-test
##
## data: fgdata$FEV1 by fgdata$GENO
## t = 2.7962, df = 198, p-value = 0.005681
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1228584 0.7108077
## sample estimates:
## mean in group 0 mean in group 1
## 3.641340 3.224507
3.641340 -3.224507
## [1] 0.416833
# (2) Unadjusted Linear Regression Analysis
ufit <- lm(FEV1 ~ GENO, data=fgdata)
summary(ufit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = fgdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35865 -0.68975 -0.02921 0.59003 2.63869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.64134 0.09888 36.824 < 2e-16 ***
## GENO -0.41683 0.14907 -2.796 0.00568 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.046 on 198 degrees of freedom
## Multiple R-squared: 0.03799, Adjusted R-squared: 0.03313
## F-statistic: 7.819 on 1 and 198 DF, p-value: 0.005681
confint(ufit)
## 2.5 % 97.5 %
## (Intercept) 3.4463389 3.8363403
## GENO -0.7108077 -0.1228584
# ANOVA Table for ufit
summary(aov(ufit))
## Df Sum Sq Mean Sq F value Pr(>F)
## GENO 1 8.56 8.562 7.819 0.00568 **
## Residuals 198 216.84 1.095
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# note that the slope coefficient is -0.41683
# which is equal to the difference in the means
# that was found with the t-test.
#
# (3) Adjusted Linear Regression Analysis
afit <- lm(FEV1 ~ GENO + SEX, data=fgdata)
summary(afit)
##
## Call:
## lm(formula = FEV1 ~ GENO + SEX, data = fgdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.23691 -0.69417 -0.01816 0.77985 2.36431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9157 0.1081 36.213 < 2e-16 ***
## GENO -0.2090 0.1467 -1.425 0.156
## SEX -0.7317 0.1456 -5.025 1.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9877 on 197 degrees of freedom
## Multiple R-squared: 0.1473, Adjusted R-squared: 0.1386
## F-statistic: 17.02 on 2 and 197 DF, p-value: 1.526e-07
confint(afit)
## 2.5 % 97.5 %
## (Intercept) 3.7024806 4.12896143
## GENO -0.4981893 0.08025266
## SEX -1.0188149 -0.44455282
# ANOVA Table for afit
summary(aov(afit))
## Df Sum Sq Mean Sq F value Pr(>F)
## GENO 1 8.56 8.562 8.776 0.00343 **
## SEX 1 24.64 24.639 25.254 1.12e-06 ***
## Residuals 197 192.20 0.976
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (4) Stratified Analysis
females <- subset(fgdata,SEX==1)
males <- subset(fgdata,SEX==0)
# - How does "adjusting" work
by(females$FEV1,females$GENO,summary) # Mean difference among females
## females$GENO: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.283 2.420 3.152 3.141 3.982 4.947
## ------------------------------------------------------------
## females$GENO: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.335 2.648 3.030 3.006 3.644 4.716
3.006 - 3.141
## [1] -0.135
by(males$FEV1,males$GENO,summary) # Mean difference among males
## males$GENO: 0
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.679 3.170 3.898 3.942 4.838 6.280
## ------------------------------------------------------------
## males$GENO: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.589 3.011 3.431 3.646 4.545 5.673
3.646 - 3.942
## [1] -0.296
(3.006 - 3.141 + 3.646 - 3.942)/2 # Average of strata-differences
## [1] -0.2155
# - (4a) Unadjusted Linear Regression Analysis Among Females
ffit <- lm(FEV1 ~ GENO, data=females)
summary(ffit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = females)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85813 -0.63437 0.02404 0.72909 1.80653
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1408 0.1354 23.202 <2e-16 ***
## GENO -0.1345 0.1777 -0.756 0.451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8773 on 98 degrees of freedom
## Multiple R-squared: 0.005805, Adjusted R-squared: -0.004339
## F-statistic: 0.5723 on 1 and 98 DF, p-value: 0.4512
confint(ffit)
## 2.5 % 97.5 %
## (Intercept) 2.8721889 3.4094579
## GENO -0.4871961 0.2182726
# - (4a) Unadjusted Linear Regression Analysis Among Males
mfit <- lm(FEV1 ~ GENO, data=males)
summary(mfit)
##
## Call:
## lm(formula = FEV1 ~ GENO, data = males)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.26284 -0.70681 -0.09617 0.91070 2.33838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9416 0.1303 30.249 <2e-16 ***
## GENO -0.2954 0.2379 -1.242 0.217
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.09 on 98 degrees of freedom
## Multiple R-squared: 0.01549, Adjusted R-squared: 0.005442
## F-statistic: 1.542 on 1 and 98 DF, p-value: 0.2173
confint(mfit)
## 2.5 % 97.5 %
## (Intercept) 3.6830593 4.2002394
## GENO -0.7675146 0.1767227
# Example 2
# Lead Study Data:
# A study was performed of the effects of exposure to lead on the
# psychological and neurological well-being of children. The data
# for this study are provided in the lead Dataset, posted on the
# course web site. In summary, a group of children living near a
# lead smelter in El Paso, Texas, were identified and their blood
# levels of lead were measured.
# An exposed group of 36 children were identified who had blood-lead
# levels >= 40 ug/ml. This group is defined by the variable
# Group = 2. A control group of 66 children were also identified
# who had blood-lead levels <40 ug/ml. This group is identified
# by the variable GROUP = 1. All children lived in close proximity
# to the lead smelter.
# One of the key outcome variables studied was the number of finger-wrist
# taps in the dominant hand (#taps in one ten second trial), a measure of
# neurological function (MAXFWT).
# Data Dictionary:
# There are numerous variables in the lead dataset but those of
# interest for the current exercise include:
# 1. ageyrs age of child in years xx.xx
# 2. Group exposure group (1= Control, 2= Exposed)
# 3. maxfwt finger-wrist tapping test in dominant hand
# (max of right and left hands)
# Download and load the lead dataset used in lecture:
load(url("http://www.duke.edu/~sgrambow/crp241data/lead.RData"))
# Note: Missing values in maxfwt are coded as a 99.
# - Recode as an NA so that R treats them correctly as missing values.
lead$maxfwt[lead$maxfwt==99] <- NA
# Question 1:
# Is age a confounder of the relationship between lead exposure and the
# score of the finger-wrist tapping test?
#ANSWERS:
# examining association between AGE and MAXFWT
# using scatterplot and pearson correlation coefficient
plot(lead$ageyrs,lead$maxfwt)

cor.test(lead$ageyrs,lead$maxfwt)
##
## Pearson's product-moment correlation
##
## data: lead$ageyrs and lead$maxfwt
## t = 8.3903, df = 97, p-value = 3.947e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5173061 0.7499017
## sample estimates:
## cor
## 0.6484923
# examining association between AGE and GROUP
# using boxplot, comparison of summary statistics and t-test
# equivalently, one could use SLR to compare AGE and GROUP as well
# since we now know it is equivalent to t-test
boxplot(lead$ageyrs~lead$Group,
main="age dist by exposure 1= Control, 2= Exposed")

by(lead$ageyrs,lead$Group,summary)
## lead$Group: 1
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.750 6.500 9.420 9.327 12.560 15.920
## ------------------------------------------------------------
## lead$Group: 2
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.750 5.272 7.835 8.270 10.335 15.250
t.test(lead$ageyrs~lead$Group,var.equal=T)
##
## Two Sample t-test
##
## data: lead$ageyrs by lead$Group
## t = 1.6188, df = 122, p-value = 0.1081
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2356665 2.3507167
## sample estimates:
## mean in group 1 mean in group 2
## 9.327308 8.269783
9.327-8.270
## [1] 1.057
summary(lm(lead$ageyrs~lead$Group))
##
## Call:
## lm(formula = lead$ageyrs ~ lead$Group)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5773 -2.8698 -0.0485 2.9202 6.9802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.3848 0.9496 10.936 <2e-16 ***
## lead$Group -1.0575 0.6533 -1.619 0.108
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.514 on 122 degrees of freedom
## Multiple R-squared: 0.02103, Adjusted R-squared: 0.013
## F-statistic: 2.621 on 1 and 122 DF, p-value: 0.1081
# Question 2:
# Estimate the unadjusted association between lead exposure and the
# score of the finger-wrist tapping test. Is it statistically significant?
#ANSWER
# we can use SLR as in example 1 to examine
# the unadjusted relationship between group and maxfwt
ufit <- lm(maxfwt ~ Group, data=lead)
summary(ufit)
##
## Call:
## lm(formula = maxfwt ~ Group, data = lead)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.438 -5.933 0.562 7.067 35.571
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.446 3.758 16.351 < 2e-16 ***
## Group -7.009 2.618 -2.677 0.00872 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.45 on 97 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.06881, Adjusted R-squared: 0.05921
## F-statistic: 7.168 on 1 and 97 DF, p-value: 0.008718
# Question 3:
# Estimate the association between lead exposure and the score of the
# finger-wrist tapping test adjusted for age. Is it statistically significant?
#ANSWER
# we can use adjusted linear regression as in example 1 to examine
# the relationship between maxfwt and group, adjusted for age
afit <- lm(maxfwt ~ Group + ageyrs, data=lead)
summary(afit)
##
## Call:
## lm(formula = maxfwt ~ Group + ageyrs, data = lead)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.380 -4.301 0.977 5.495 36.150
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.7367 4.6345 6.848 7.10e-10 ***
## Group -4.8489 2.0342 -2.384 0.0191 *
## ageyrs 2.6592 0.3239 8.210 1.02e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.595 on 96 degrees of freedom
## (25 observations deleted due to missingness)
## Multiple R-squared: 0.4529, Adjusted R-squared: 0.4415
## F-statistic: 39.74 on 2 and 96 DF, p-value: 2.669e-13
# Question 4:
# What is the impact of adjusting for age on the conclusions of the analysis?
#ANSWER
# as discussed, the parameter estimate for Group is attenuated when age is
# included in the model, going from -7.009 in the unadjusted model to -4.85
# in the adjusted model. Given that the controls were on
# average about 1 year older than the exposed group, and the
# developmental advantage this might convey, the attenuating effect
# observed makes sense.
# Question 5:
# How were the missing values handled in the regression analysis?
# ANSWER
# After converting the missing values from 99 to NA so R correctly recognizes
# them as missing, the lm function simply 'kicks out' or excludes any
# observation that is missing for any variable included in the regression
# model. This is the usual default approach in most software packages
# Note that this can result in bias if the missingness is 'informative'
# or nonrandom.
# End of Program