# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~ CRP 241 Module 4 Day 8 ~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# Example 1

# FEV1 and Genetic Variation: 
# A study was conducted to determine if there was an 
# association between a genetic variant and forced 
# expiratory volume in one second (FEV1) in patients 
# with COPD. FEV1 was measured in liters. Genotypes 
# of interest were wild type vs. mutant (i.e. heterozygous 
# or homozygous for the risk allele). Sex at birth was also recorded. 
# 200 patients were randomly selected from the researchers'
# clinical practice. 

# Data Dictionary: 
# 1.  FEV1      forced expiratory volume in one second (liters)
# 2.  GENO      patient genotypes (1 = Mutant; 0 = Wild Type)
# 3.  SEX       patient sex (1 = Female; 0 = Male)

# Download and load the FEV1/genotype dataset used in lecture: 
load(url("http://www.duke.edu/~sgrambow/crp241data/fev1_geno.RData"))

# examine the data
View(fgdata) # shows the raw data
str(fgdata) # shows structure of variables in data frame
## 'data.frame':    200 obs. of  3 variables:
##  $ FEV1: num  2.66 3.62 4.8 1.61 3.19 ...
##  $ GENO: int  0 0 0 0 1 1 0 0 0 1 ...
##  $ SEX : int  0 1 0 1 0 1 0 1 0 1 ...
summary(fgdata) # provides descriptive summaries by variable
##       FEV1            GENO           SEX     
##  Min.   :1.283   Min.   :0.00   Min.   :0.0  
##  1st Qu.:2.706   1st Qu.:0.00   1st Qu.:0.0  
##  Median :3.387   Median :0.00   Median :0.5  
##  Mean   :3.458   Mean   :0.44   Mean   :0.5  
##  3rd Qu.:4.137   3rd Qu.:1.00   3rd Qu.:1.0  
##  Max.   :6.280   Max.   :1.00   Max.   :1.0
# Add labels to sex and genotype variables 
# to make output easier to interpret
fgdata$fSEX <- factor(fgdata$SEX,labels=c('Male','Female'))
fgdata$fGENO <- factor(fgdata$GENO,labels=c('Wild Type','Mutant'))
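# Optional check (just a sketch): cross-tabulating the numeric codes 
# against the new factors confirms the labels line up with the coding 
# in the data dictionary (levels are assigned in sorted order, 0 then 1).
table(fgdata$SEX, fgdata$fSEX)
table(fgdata$GENO, fgdata$fGENO)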


# Is sex a confounder of the relationship between FEV1 
# and genotype? 
# Checking confounder criteria
# Is sex associated with the covariate (genotype)?
# - Check distribution of sex by each genotype
table(fgdata$fSEX,fgdata$fGENO)                  # Counts
##         
##          Wild Type Mutant
##   Male          70     30
##   Female        42     58
prop.table(table(fgdata$fSEX,fgdata$fGENO),2)    # Column proportions (within each genotype)
##         
##          Wild Type    Mutant
##   Male   0.6250000 0.3409091
##   Female 0.3750000 0.6590909
# ANSWER: 38% vs. 66% females among patients with 
#         Wild Type vs. Mutant genotype
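# Optional sketch: a formal test of the sex-genotype association could be 
# added with a chi-square test on the 2x2 table (chisq.test() applies a 
# continuity correction by default for 2x2 tables).
chisq.test(table(fgdata$fSEX, fgdata$fGENO))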

# Is sex associated with the outcome (FEV1)?
# - Check distribution of FEV1 by sex
boxplot(fgdata$FEV1~fgdata$fSEX,
        ylab='FEV1 Level (liters)')

# boxplot suggests there are differences
# in distribution of FEV1 by Sex
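# Optional sketch: a two-sample t-test (assuming equal variances, as in 
# the genotype comparison below) could be used to quantify the FEV1 
# difference by sex suggested by the boxplot.
t.test(fgdata$FEV1 ~ fgdata$fSEX, var.equal = TRUE)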


# (1) Two Sample T-test Analysis
#     - Assuming population variances are equal 
t.test(fgdata$FEV1~fgdata$GENO,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  fgdata$FEV1 by fgdata$GENO
## t = 2.7962, df = 198, p-value = 0.005681
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1228584 0.7108077
## sample estimates:
## mean in group 0 mean in group 1 
##        3.641340        3.224507
3.641340 - 3.224507   # difference in means (Wild Type minus Mutant)
## [1] 0.416833
# (2) Unadjusted Linear Regression Analysis
ufit <- lm(FEV1 ~ GENO, data=fgdata)
summary(ufit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = fgdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35865 -0.68975 -0.02921  0.59003  2.63869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.64134    0.09888  36.824  < 2e-16 ***
## GENO        -0.41683    0.14907  -2.796  0.00568 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.046 on 198 degrees of freedom
## Multiple R-squared:  0.03799,    Adjusted R-squared:  0.03313 
## F-statistic: 7.819 on 1 and 198 DF,  p-value: 0.005681
confint(ufit)
##                  2.5 %     97.5 %
## (Intercept)  3.4463389  3.8363403
## GENO        -0.7108077 -0.1228584
# ANOVA Table for ufit
summary(aov(ufit))
##              Df Sum Sq Mean Sq F value  Pr(>F)   
## GENO          1   8.56   8.562   7.819 0.00568 **
## Residuals   198 216.84   1.095                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# note that the slope coefficient is -0.41683, which matches in
# magnitude the difference in means found with the t-test; the sign
# is negative because the slope estimates mean(GENO = 1) minus
# mean(GENO = 0), i.e. Mutant minus Wild Type.
#
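# Optional sketch: verify the equivalence numerically; the GENO slope 
# from ufit should equal the group 1 mean minus the group 0 mean.
coef(ufit)["GENO"]
diff(tapply(fgdata$FEV1, fgdata$GENO, mean))   # group 1 mean minus group 0 mean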

# (3) Adjusted Linear Regression Analysis
afit <- lm(FEV1 ~ GENO + SEX, data=fgdata)
summary(afit)
## 
## Call:
## lm(formula = FEV1 ~ GENO + SEX, data = fgdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23691 -0.69417 -0.01816  0.77985  2.36431 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9157     0.1081  36.213  < 2e-16 ***
## GENO         -0.2090     0.1467  -1.425    0.156    
## SEX          -0.7317     0.1456  -5.025 1.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9877 on 197 degrees of freedom
## Multiple R-squared:  0.1473, Adjusted R-squared:  0.1386 
## F-statistic: 17.02 on 2 and 197 DF,  p-value: 1.526e-07
confint(afit)
##                  2.5 %      97.5 %
## (Intercept)  3.7024806  4.12896143
## GENO        -0.4981893  0.08025266
## SEX         -1.0188149 -0.44455282
# ANOVA Table for afit
summary(aov(afit))
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## GENO          1   8.56   8.562   8.776  0.00343 ** 
## SEX           1  24.64  24.639  25.254 1.12e-06 ***
## Residuals   197 192.20   0.976                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (4) Stratified Analysis 
females <- subset(fgdata,SEX==1)
males <- subset(fgdata,SEX==0)

# - How does "adjusting" work?
by(females$FEV1,females$GENO,summary)   # Mean difference among females
## females$GENO: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.283   2.420   3.152   3.141   3.982   4.947 
## ------------------------------------------------------------ 
## females$GENO: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.648   3.030   3.006   3.644   4.716
3.006 - 3.141
## [1] -0.135
by(males$FEV1,males$GENO,summary)       # Mean difference among males
## males$GENO: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.679   3.170   3.898   3.942   4.838   6.280 
## ------------------------------------------------------------ 
## males$GENO: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.589   3.011   3.431   3.646   4.545   5.673
3.646 - 3.942
## [1] -0.296
(3.006 - 3.141 + 3.646 - 3.942)/2             # Average of strata-differences 
## [1] -0.2155
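# Optional sketch: compare this simple average of the stratum-specific 
# differences with the sex-adjusted GENO coefficient from afit; they are 
# close but not identical because the regression pools information across 
# strata rather than averaging the two stratum estimates equally.
coef(afit)["GENO"]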
# - (4a) Unadjusted Linear Regression Analysis Among Females
ffit <- lm(FEV1 ~ GENO, data=females)
summary(ffit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = females)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85813 -0.63437  0.02404  0.72909  1.80653 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1408     0.1354  23.202   <2e-16 ***
## GENO         -0.1345     0.1777  -0.756    0.451    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8773 on 98 degrees of freedom
## Multiple R-squared:  0.005805,   Adjusted R-squared:  -0.004339 
## F-statistic: 0.5723 on 1 and 98 DF,  p-value: 0.4512
confint(ffit)
##                  2.5 %    97.5 %
## (Intercept)  2.8721889 3.4094579
## GENO        -0.4871961 0.2182726
# - (4b) Unadjusted Linear Regression Analysis Among Males
mfit <- lm(FEV1 ~ GENO, data=males)
summary(mfit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = males)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26284 -0.70681 -0.09617  0.91070  2.33838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9416     0.1303  30.249   <2e-16 ***
## GENO         -0.2954     0.2379  -1.242    0.217    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.09 on 98 degrees of freedom
## Multiple R-squared:  0.01549,    Adjusted R-squared:  0.005442 
## F-statistic: 1.542 on 1 and 98 DF,  p-value: 0.2173
confint(mfit)
##                  2.5 %    97.5 %
## (Intercept)  3.6830593 4.2002394
## GENO        -0.7675146 0.1767227
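# Optional sketch: place the stratum-specific slopes next to the 
# sex-adjusted slope from the combined model for a side-by-side look.
coef(ffit)["GENO"]   # slope among females
coef(mfit)["GENO"]   # slope among males
coef(afit)["GENO"]   # sex-adjusted slope from the combined model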
# Example 2

# Lead Study Data: 
# A study was performed of the effects of exposure to lead on the 
# psychological and neurological well-being of children.  The data
# for this study are provided in the lead Dataset, posted on the 
# course web site.  In summary, a group of children living near a 
# lead smelter in El Paso, Texas, were identified and their blood 
# levels of lead were measured.  

# An exposed group of 36 children was identified who had blood-lead 
# levels >= 40 ug/ml.  This group is defined by the variable 
# Group = 2.  A control group of 66 children was also identified 
# who had blood-lead levels < 40 ug/ml.  This group is defined 
# by the variable Group = 1.  All children lived in close proximity 
# to the lead smelter.  

# One of the key outcome variables studied was the number of finger-wrist 
# taps in the dominant hand (number of taps in one ten-second trial), a 
# measure of neurological function (MAXFWT).

# Data Dictionary: 
# There are numerous variables in the lead dataset but those of 
# interest for the current exercise include:
# 1.  ageyrs    age of child in years xx.xx
# 2.  Group     exposure group (1= Control, 2= Exposed)
# 3.  maxfwt    finger-wrist tapping test in dominant hand 
#               (max of right and left hands)

# Download and load the lead dataset used in lecture: 
load(url("http://www.duke.edu/~sgrambow/crp241data/lead.RData"))

# Note: Missing values in maxfwt are coded as a 99. 
#       - Recode as an NA so that R treats them correctly as missing values.
lead$maxfwt[lead$maxfwt==99] <- NA
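# Optional check (a sketch): confirm the recode took effect; summary() 
# should now report NA's for maxfwt rather than a maximum of 99.
summary(lead$maxfwt)
sum(is.na(lead$maxfwt))   # number of children with a missing tapping score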

# Question 1: 
# Is age a confounder of the relationship between lead exposure and the 
# score of the finger-wrist tapping test?

# ANSWERS:
# examining the association between AGE and MAXFWT 
# using a scatterplot and the Pearson correlation coefficient
plot(lead$ageyrs,lead$maxfwt)
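# Optional sketch: overlay a least-squares line on the scatterplot to make 
# the age trend easier to see (assumes the plot above is still the active 
# graphics device).
abline(lm(maxfwt ~ ageyrs, data = lead))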

cor.test(lead$ageyrs,lead$maxfwt)
## 
##  Pearson's product-moment correlation
## 
## data:  lead$ageyrs and lead$maxfwt
## t = 8.3903, df = 97, p-value = 3.947e-13
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5173061 0.7499017
## sample estimates:
##       cor 
## 0.6484923
# examining the association between AGE and GROUP 
# using a boxplot, a comparison of summary statistics, and a t-test;
# equivalently, one could use SLR to compare AGE across GROUP,
# since we now know SLR is equivalent to the two-sample t-test
boxplot(lead$ageyrs~lead$Group,
        main="Age distribution by exposure group (1 = Control, 2 = Exposed)")

by(lead$ageyrs,lead$Group,summary)
## lead$Group: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.750   6.500   9.420   9.327  12.560  15.920 
## ------------------------------------------------------------ 
## lead$Group: 2
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.750   5.272   7.835   8.270  10.335  15.250
t.test(lead$ageyrs~lead$Group,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  lead$ageyrs by lead$Group
## t = 1.6188, df = 122, p-value = 0.1081
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2356665  2.3507167
## sample estimates:
## mean in group 1 mean in group 2 
##        9.327308        8.269783
9.327 - 8.270   # difference in mean age (Control minus Exposed)
## [1] 1.057
summary(lm(lead$ageyrs~lead$Group))
## 
## Call:
## lm(formula = lead$ageyrs ~ lead$Group)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5773 -2.8698 -0.0485  2.9202  6.9802 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.3848     0.9496  10.936   <2e-16 ***
## lead$Group   -1.0575     0.6533  -1.619    0.108    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.514 on 122 degrees of freedom
## Multiple R-squared:  0.02103,    Adjusted R-squared:  0.013 
## F-statistic: 2.621 on 1 and 122 DF,  p-value: 0.1081
# Question 2: 
# Estimate the unadjusted association between lead exposure and the 
# score of the finger-wrist tapping test. Is it statistically significant?

# ANSWER
# we can use SLR as in Example 1 to examine
# the unadjusted relationship between Group and maxfwt
ufit <- lm(maxfwt ~ Group, data=lead)
summary(ufit)
## 
## Call:
## lm(formula = maxfwt ~ Group, data = lead)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.438  -5.933   0.562   7.067  35.571 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   61.446      3.758  16.351  < 2e-16 ***
## Group         -7.009      2.618  -2.677  0.00872 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.45 on 97 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.06881,    Adjusted R-squared:  0.05921 
## F-statistic: 7.168 on 1 and 97 DF,  p-value: 0.008718
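# Optional sketch: as in Example 1, a 95% confidence interval for the 
# unadjusted Group effect can be obtained with confint().
confint(ufit)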
# Question 3: 
# Estimate the association between lead exposure and the score of the 
# finger-wrist tapping test adjusted for age. Is it statistically significant?

# ANSWER
# we can use adjusted linear regression as in Example 1 to examine
# the relationship between maxfwt and Group, adjusted for age
afit <- lm(maxfwt ~ Group + ageyrs, data=lead)
summary(afit)
## 
## Call:
## lm(formula = maxfwt ~ Group + ageyrs, data = lead)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.380  -4.301   0.977   5.495  36.150 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  31.7367     4.6345   6.848 7.10e-10 ***
## Group        -4.8489     2.0342  -2.384   0.0191 *  
## ageyrs        2.6592     0.3239   8.210 1.02e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.595 on 96 degrees of freedom
##   (25 observations deleted due to missingness)
## Multiple R-squared:  0.4529, Adjusted R-squared:  0.4415 
## F-statistic: 39.74 on 2 and 96 DF,  p-value: 2.669e-13
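# Optional sketch: the corresponding 95% confidence interval for the 
# age-adjusted Group effect.
confint(afit)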
# Question 4: 
# What is the impact of adjusting for age on the conclusions of the analysis?

# ANSWER
# as discussed, the parameter estimate for Group is attenuated when age is 
# included in the model, going from -7.009 in the unadjusted model to -4.85 
# in the adjusted model. Given that the controls were on average about 
# one year older than the exposed children, and the developmental 
# advantage this might convey, the observed attenuation makes sense.
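# Optional sketch: extracting the two Group coefficients side by side 
# shows the attenuation directly.
coef(ufit)["Group"]   # unadjusted estimate
coef(afit)["Group"]   # age-adjusted estimate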

# Question 5: 
# How were the missing values handled in the regression analysis?

# ANSWER
# After converting the missing values from 99 to NA so that R correctly 
# recognizes them as missing, the lm() function simply 'kicks out' (excludes) 
# any observation that is missing a value for any variable included in the 
# regression model.  This is the usual default approach in most software 
# packages.  Note that this can result in bias if the missingness is 
# 'informative' or nonrandom.
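# Optional sketch: one way to see how many observations lm() actually 
# used is to compare the model's complete-case count with the full data.
nobs(afit)                 # observations used in the adjusted model
nrow(lead)                 # total rows in the dataset
sum(is.na(lead$maxfwt))    # rows dropped because maxfwt is missing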
# End of Program