library(ggplot2)

sample.size <- 1000
dat <- data.frame(subject.id=1:sample.size)

dat$ages <- round(runif(sample.size, min=18, max=60))
hist(dat$ages)

# Simulate random baseline values for the participants, where the baseline variables
# are dependent on age.
dat$var1.baseline <- dat$ages + rnorm(sample.size, mean=10, sd=10)
dat$var2.baseline <- dat$ages + rnorm(sample.size, mean=20, sd=20)
dat$var3.baseline <- dat$ages + rnorm(sample.size, mean=30, sd=30)

# Simulate some random movement from baseline to placebo
dat$var1.placebo <- dat$var1.baseline + rnorm(sample.size, mean=0, sd=1)
dat$var2.placebo <- dat$var2.baseline + rnorm(sample.size, mean=0, sd=2)
dat$var3.placebo <- dat$var3.baseline + rnorm(sample.size, mean=0, sd=3)

# Try simulating a response variable that depends only on the values of the original 
# analogous variables and, indirectly, age (because the baseline variables depended on age). 
dat$var1.drug <- dat$var1.placebo - 0.5*dat$var1.baseline + rnorm(sample.size, sd=0.5)
dat$var2.drug <- dat$var2.placebo - 1.0*dat$var2.baseline + rnorm(sample.size, sd=0.5)
dat$var3.drug <- dat$var3.placebo - 1.5*dat$var3.baseline + rnorm(sample.size, sd=0.5)

dat$var1.diff <- dat$var1.drug - dat$var1.placebo
dat$var2.diff <- dat$var2.drug - dat$var2.placebo
dat$var3.diff <- dat$var3.drug - dat$var3.placebo

# How does age relate to the uncontrolled outcome variable?
ggplot(dat, aes(x=ages, y=var1.diff)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

                           #  (by default includes 95% confidence region)

# How does the relevant predictor variable relate to the uncontrolled outcome variable?
ggplot(dat, aes(x=var1.baseline, y=var1.diff)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

# Typical example from multiple regression, excluding the relevant input variable
var1.diff.model <- lm(formula = var1.diff ~ ages + var2.baseline + var3.baseline, data=dat)
summary(var1.diff.model)
## 
## Call:
## lm(formula = var1.diff ~ ages + var2.baseline + var3.baseline, 
##     data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3061  -3.4566   0.1039   3.4344  16.2689 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4.792082   0.564353  -8.491   <2e-16 ***
## ages          -0.496261   0.016410 -30.242   <2e-16 ***
## var2.baseline -0.004540   0.008143  -0.558    0.577    
## var3.baseline  0.002265   0.005090   0.445    0.656    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.948 on 996 degrees of freedom
## Multiple R-squared:  0.602,  Adjusted R-squared:  0.6008 
## F-statistic: 502.2 on 3 and 996 DF,  p-value: < 2.2e-16
# How we would typically control in the one-dimensional case
var1.diff.model.with.age <- lm(formula = var1.diff ~ ages + var1.baseline + var2.baseline + var3.baseline, 
                          data=dat)
summary(var1.diff.model.with.age)
## 
## Call:
## lm(formula = var1.diff ~ ages + var1.baseline + var2.baseline + 
##     var3.baseline, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.78210 -0.31877  0.00184  0.33861  1.40408 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -4.392e-02  6.019e-02   -0.730    0.466    
## ages          -2.083e-03  2.342e-03   -0.889    0.374    
## var1.baseline -4.986e-01  1.636e-03 -304.738   <2e-16 ***
## var2.baseline  1.286e-03  8.390e-04    1.533    0.126    
## var3.baseline -4.436e-05  5.244e-04   -0.085    0.933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5097 on 995 degrees of freedom
## Multiple R-squared:  0.9958, Adjusted R-squared:  0.9958 
## F-statistic: 5.871e+04 on 4 and 995 DF,  p-value: < 2.2e-16
# Let's control for age in all of the outcome variables
age.var1diff.model <- lm(formula = var1.diff ~ ages, data=dat)
age.var2diff.model <- lm(formula = var2.diff ~ ages, data=dat)
age.var3diff.model <- lm(formula = var3.diff ~ ages, data=dat)

dat$var1.diff.controlled <- dat$var1.diff - predict(age.var1diff.model)
dat$var2.diff.controlled <- dat$var2.diff - predict(age.var2diff.model)
dat$var3.diff.controlled <- dat$var3.diff - predict(age.var3diff.model)

# And check to make sure that the age-related portion of the outcome variable
# has been taken away.
ggplot(dat, aes(x=ages, y=var1.diff.controlled)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

# We still get some signal, presumably the part that we want, but it's still noisy...
ggplot(dat, aes(x=var1.baseline, y=var1.diff.controlled)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

# What if we now try and predict the outcome from all the uncontrolled input variables?
partially.controlled.var1.diff.model <- lm(var1.diff.controlled ~ var1.baseline + var2.baseline + var3.baseline, 
                                 data=dat)
summary(partially.controlled.var1.diff.model)
## 
## Call:
## lm(formula = var1.diff.controlled ~ var1.baseline + var2.baseline + 
##     var3.baseline, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7711 -2.4889  0.1182  2.4681  9.3094 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.749369   0.403667  16.720   <2e-16 ***
## var1.baseline -0.258563   0.008015 -32.260   <2e-16 ***
## var2.baseline  0.065174   0.005315  12.261   <2e-16 ***
## var3.baseline  0.029017   0.003436   8.445   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.46 on 996 degrees of freedom
## Multiple R-squared:  0.5112, Adjusted R-squared:  0.5097 
## F-statistic: 347.2 on 3 and 996 DF,  p-value: < 2.2e-16
# Not quite what we wanted.... Let's try and control for age in the input variables as well...
# How does that affect our predictions?
age.var1baseline.model <- lm(formula = var1.baseline ~ ages, data=dat)
age.var2baseline.model <- lm(formula = var2.baseline ~ ages, data=dat)
age.var3baseline.model <- lm(formula = var3.baseline ~ ages, data=dat)

dat$var1.baseline.controlled <- dat$var1.baseline - predict(age.var1baseline.model)
dat$var2.baseline.controlled <- dat$var2.baseline - predict(age.var2baseline.model)
dat$var3.baseline.controlled <- dat$var3.baseline - predict(age.var3baseline.model)

# How well does age predict our original var1?
ggplot(dat, aes(x=ages, y=var1.baseline)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

# How well does age predict our controlled var1? Looks good!
ggplot(dat, aes(x=ages, y=var1.baseline.controlled)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

# How well does our controlled var1 predict the controlled outcome variable?
ggplot(dat, aes(x=var1.baseline.controlled, y=var1.diff.controlled)) +
  geom_point(shape=1) +    # Use hollow circles
  geom_smooth(method=lm)   # Add linear regression line 

# Finally, let's check numerically our results...
fully.controlled.var1.diff.model <- lm(var1.diff.controlled ~ var1.baseline.controlled + 
                                             var2.baseline.controlled + 
                                             var3.baseline.controlled, 
                                           data=dat)
summary(fully.controlled.var1.diff.model)
## 
## Call:
## lm(formula = var1.diff.controlled ~ var1.baseline.controlled + 
##     var2.baseline.controlled + var3.baseline.controlled, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.78210 -0.31877  0.00184  0.33861  1.40408 
## 
## Coefficients:
##                            Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)               9.535e-15  1.611e-02    0.000    1.000    
## var1.baseline.controlled -4.986e-01  1.635e-03 -304.892   <2e-16 ***
## var2.baseline.controlled  1.286e-03  8.386e-04    1.534    0.125    
## var3.baseline.controlled -4.436e-05  5.241e-04   -0.085    0.933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5094 on 996 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9894 
## F-statistic: 3.1e+04 on 3 and 996 DF,  p-value: < 2.2e-16