# ~ CRP 245 Part 2 Day 1 ~
# Example 1 -- Intuition about "Adjusted" Regression
# FEV1 and Genetic Variation: 
# A study was conducted to determine if there was an 
# association between a genetic variant and forced 
# expiratory volume in one second (FEV1) in patients with COPD. 
# FEV1 was measured in liters. Genotypes of interest were wild 
# type vs. mutant (i.e. heterozygous or homozygous for risk 
# allele). Sex was also recorded. 100 patients were randomly 
# selected from the researchers clinical practice. 

# Data Dictionary: 
# 1.  FEV1      forced expiratory volume in one second (liters)
# 2.  GENO      patient genotypes (1 = Mutant; 0 = Wild Type)
# 3.    SEX       patient sex (1 = Female; 0 = Male)

# Download and load the FEV1 dataset used in lecture: 
load(url("http://www.duke.edu/~sgrambow/crp241data/fev1_geno.RData"))

wild <- subset(fgdata,GENO==0)
mutant <- subset(fgdata,GENO==1)

# Let's start with a standard unadjusted analysis:
# (1) Two Sample T-test Analysis
#     - Assuming population variances are equal 
t.test(fgdata$FEV1~fgdata$GENO,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  fgdata$FEV1 by fgdata$GENO
## t = 2.7962, df = 198, p-value = 0.005681
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1228584 0.7108077
## sample estimates:
## mean in group 0 mean in group 1 
##        3.641340        3.224507
3.641340 -3.224507
## [1] 0.416833
# Note that the difference in the means is
# [1] 0.416833
# which corresponds to the change in Y for a 1-unit 
# change in X, which is simply the slope for a simple linear 
# regression of Y on X.

# Is sex a confounder of the relationship between 
# FEV1 and genotype? Add labels to sex and genotype 
# variables to make ouptut easier to interpret

fgdata$fSEX <- factor(fgdata$SEX,labels=c('Male','Female'))
fgdata$fGENO <- factor(fgdata$GENO,labels=c('Wild Type','Mutant'))

# - Check distribution of sex by each genotype

table(fgdata$fSEX,fgdata$fGENO)                 # Counts
##         
##          Wild Type Mutant
##   Male          70     30
##   Female        42     58
prop.table(table(fgdata$fSEX,fgdata$fGENO),2)   # Proportions among genotype
##         
##          Wild Type    Mutant
##   Male   0.6250000 0.3409091
##   Female 0.3750000 0.6590909
# We see that the sex distribution by group differs
# Wild Type is 63% male
# Mutant  Type is 38% male

# - Check distribution of FEV1 by each genotype
boxplot(fgdata$FEV1~fgdata$fSEX,
        ylab='FEV1 Level (liters)')

# We see that FEV differs by Sex
# with males having higher FEV1 levels

# (2) Unadjusted Linear Regression Analysis
ufit <- lm(FEV1 ~ GENO, data=fgdata)
summary(ufit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = fgdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35865 -0.68975 -0.02921  0.59003  2.63869 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.64134    0.09888  36.824  < 2e-16 ***
## GENO        -0.41683    0.14907  -2.796  0.00568 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.046 on 198 degrees of freedom
## Multiple R-squared:  0.03799,    Adjusted R-squared:  0.03313 
## F-statistic: 7.819 on 1 and 198 DF,  p-value: 0.005681
# Yields slope estimate for GENO of -0.41683
# which is difference in means we saw with
# t-test analysis

# (3) Adjusted Linear Regression Analysis
afit <- lm(FEV1 ~ GENO + SEX, data=fgdata)
summary(afit)
## 
## Call:
## lm(formula = FEV1 ~ GENO + SEX, data = fgdata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23691 -0.69417 -0.01816  0.77985  2.36431 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9157     0.1081  36.213  < 2e-16 ***
## GENO         -0.2090     0.1467  -1.425    0.156    
## SEX          -0.7317     0.1456  -5.025 1.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9877 on 197 degrees of freedom
## Multiple R-squared:  0.1473, Adjusted R-squared:  0.1386 
## F-statistic: 17.02 on 2 and 197 DF,  p-value: 1.526e-07
# Yields slope estimate for GENO of -0.2090
# which is difference in means 'adjusted'
# for sex.  What does this mean?
# Let's stratify to understand...

# (4) Stratified Analysis 
females <- subset(fgdata,SEX==1)
males <- subset(fgdata,SEX==0)

# - How does "adjusting" work
# Calculate mean difference among females
by(females$FEV1,females$GENO,summary)   
## females$GENO: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.283   2.420   3.152   3.141   3.982   4.947 
## ------------------------------------------------------------ 
## females$GENO: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.335   2.648   3.030   3.006   3.644   4.716
3.006 - 3.141
## [1] -0.135
# yields 
# [1] -0.135 which is 
# slope (or difference in means) for females

# Calculate mean difference among males
by(males$FEV1,males$GENO,summary)       
## males$GENO: 0
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.679   3.170   3.898   3.942   4.838   6.280 
## ------------------------------------------------------------ 
## males$GENO: 1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.589   3.011   3.431   3.646   4.545   5.673
3.646 - 3.942
## [1] -0.296
# yields 
# [1] -0.296 which is 
# slope (or difference in means) for males

# now we calculate 
# Average of strata-differences 
(3.006 - 3.141 + 3.646 - 3.942)/2           
## [1] -0.2155
# yields 
# [1] -0.2155
# which is approximately equal to GENO slope
# in adjusted model (-.2090) -- this is 
# what adjustment is doing!

# We can get same result by
# examining unadjusted simple linear
# regression models by SEX

# - (4a) Unadjusted Linear Regression Analysis Among Females
ffit <- lm(FEV1 ~ GENO, data=females)
summary(ffit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = females)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85813 -0.63437  0.02404  0.72909  1.80653 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1408     0.1354  23.202   <2e-16 ***
## GENO         -0.1345     0.1777  -0.756    0.451    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8773 on 98 degrees of freedom
## Multiple R-squared:  0.005805,   Adjusted R-squared:  -0.004339 
## F-statistic: 0.5723 on 1 and 98 DF,  p-value: 0.4512
# yields slope of -0.1345 as above

# - (4a) Unadjusted Linear Regression Analysis Among Males
mfit <- lm(FEV1 ~ GENO, data=males)
summary(mfit)
## 
## Call:
## lm(formula = FEV1 ~ GENO, data = males)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26284 -0.70681 -0.09617  0.91070  2.33838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9416     0.1303  30.249   <2e-16 ***
## GENO         -0.2954     0.2379  -1.242    0.217    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.09 on 98 degrees of freedom
## Multiple R-squared:  0.01549,    Adjusted R-squared:  0.005442 
## F-statistic: 1.542 on 1 and 98 DF,  p-value: 0.2173
# yields slope of -0.2954 as above
# END PROGRAM