library(ggplot2)
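# Fixing the RNG seed makes the simulation reproducible from run to run. Note that the
# summary output shown below came from one particular run, so your exact numbers will differ.
set.seed(1)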
sample.size <- 1000
dat <- data.frame(subject.id=1:sample.size)
dat$ages <- round(runif(sample.size, min=18, max=60))
hist(dat$ages)
# Simulate random baseline values for the participants, where the baseline variables
# are dependent on age.
dat$var1.baseline <- dat$ages + rnorm(sample.size, mean=10, sd=10)
dat$var2.baseline <- dat$ages + rnorm(sample.size, mean=20, sd=20)
dat$var3.baseline <- dat$ages + rnorm(sample.size, mean=30, sd=30)
# Simulate some random movement from baseline to placebo
dat$var1.placebo <- dat$var1.baseline + rnorm(sample.size, mean=0, sd=1)
dat$var2.placebo <- dat$var2.baseline + rnorm(sample.size, mean=0, sd=2)
dat$var3.placebo <- dat$var3.baseline + rnorm(sample.size, mean=0, sd=3)
# Try simulating drug-response variables that depend only on their analogous placebo and
# baseline values and, therefore, only indirectly on age (because the baseline variables
# depend on age).
dat$var1.drug <- dat$var1.placebo - 0.5*dat$var1.baseline + rnorm(sample.size, sd=0.5)
dat$var2.drug <- dat$var2.placebo - 1.0*dat$var2.baseline + rnorm(sample.size, sd=0.5)
dat$var3.drug <- dat$var3.placebo - 1.5*dat$var3.baseline + rnorm(sample.size, sd=0.5)
dat$var1.diff <- dat$var1.drug - dat$var1.placebo
dat$var2.diff <- dat$var2.drug - dat$var2.placebo
dat$var3.diff <- dat$var3.drug - dat$var3.placebo
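# Quick sanity check (optional): by construction each .diff variable is roughly -0.5, -1.0,
# or -1.5 times its own baseline plus a little noise, so these correlations should all be
# strongly negative.
cor(dat$var1.baseline, dat$var1.diff)
cor(dat$var2.baseline, dat$var2.diff)
cor(dat$var3.baseline, dat$var3.diff)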
# How does age relate to the uncontrolled outcome variable?
ggplot(dat, aes(x=ages, y=var1.diff)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
# (by default includes 95% confidence region)
# How does the relevant predictor variable relate to the uncontrolled outcome variable?
ggplot(dat, aes(x=var1.baseline, y=var1.diff)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
# A typical multiple regression, but excluding the relevant input variable (var1.baseline)
var1.diff.model <- lm(formula = var1.diff ~ ages + var2.baseline + var3.baseline, data=dat)
summary(var1.diff.model)
##
## Call:
## lm(formula = var1.diff ~ ages + var2.baseline + var3.baseline,
##     data = dat)
##
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3061  -3.4566   0.1039   3.4344  16.2689 
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -4.792082   0.564353  -8.491   <2e-16 ***
## ages          -0.496261   0.016410 -30.242   <2e-16 ***
## var2.baseline -0.004540   0.008143  -0.558    0.577    
## var3.baseline  0.002265   0.005090   0.445    0.656    
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.948 on 996 degrees of freedom
## Multiple R-squared: 0.602, Adjusted R-squared: 0.6008
## F-statistic: 502.2 on 3 and 996 DF, p-value: < 2.2e-16
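# Age shows up as a strong "effect" here only because it proxies for the omitted
# var1.baseline (by construction, var1.diff is roughly -0.5 * var1.baseline plus noise).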
# How we would typically control for age in the one-dimensional case: include it
# directly as a covariate (along with the relevant baseline variable this time)
var1.diff.model.with.age <- lm(formula = var1.diff ~ ages + var1.baseline + var2.baseline + var3.baseline,
                               data=dat)
summary(var1.diff.model.with.age)
##
## Call:
## lm(formula = var1.diff ~ ages + var1.baseline + var2.baseline +
##     var3.baseline, data = dat)
##
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.78210 -0.31877  0.00184  0.33861  1.40408 
##
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -4.392e-02  6.019e-02   -0.730    0.466    
## ages          -2.083e-03  2.342e-03   -0.889    0.374    
## var1.baseline -4.986e-01  1.636e-03 -304.738   <2e-16 ***
## var2.baseline  1.286e-03  8.390e-04    1.533    0.126    
## var3.baseline -4.436e-05  5.244e-04   -0.085    0.933    
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5097 on 995 degrees of freedom
## Multiple R-squared: 0.9958, Adjusted R-squared: 0.9958
## F-statistic: 5.871e+04 on 4 and 995 DF, p-value: < 2.2e-16
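# With var1.baseline included, age adds essentially nothing and the estimated slope on
# var1.baseline recovers the true value of roughly -0.5.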
# Let's control for age in all of the outcome variables by regressing each one on age
# and subtracting the fitted values
age.var1diff.model <- lm(formula = var1.diff ~ ages, data=dat)
age.var2diff.model <- lm(formula = var2.diff ~ ages, data=dat)
age.var3diff.model <- lm(formula = var3.diff ~ ages, data=dat)
dat$var1.diff.controlled <- dat$var1.diff - predict(age.var1diff.model)
dat$var2.diff.controlled <- dat$var2.diff - predict(age.var2diff.model)
dat$var3.diff.controlled <- dat$var3.diff - predict(age.var3diff.model)
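# Equivalently, the "controlled" values are just the residuals from the age regressions
# (observed minus fitted), so this should print TRUE:
all.equal(dat$var1.diff.controlled, unname(resid(age.var1diff.model)))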
# And check to make sure that the age-related portion of the outcome variable
# has been taken away.
ggplot(dat, aes(x=ages, y=var1.diff.controlled)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
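# Numeric version of the same check: least-squares residuals are uncorrelated with the
# regressor by construction, so this correlation should be essentially zero.
cor(dat$ages, dat$var1.diff.controlled)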
# We still get some signal, presumably the part that we want, but it's still noisy...
ggplot(dat, aes(x=var1.baseline, y=var1.diff.controlled)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
# What if we now try to predict the outcome from all the uncontrolled input variables?
partially.controlled.var1.diff.model <- lm(var1.diff.controlled ~ var1.baseline + var2.baseline + var3.baseline,
                                           data=dat)
summary(partially.controlled.var1.diff.model)
##
## Call:
## lm(formula = var1.diff.controlled ~ var1.baseline + var2.baseline +
##     var3.baseline, data = dat)
##
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7711 -2.4889  0.1182  2.4681  9.3094 
##
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.749369   0.403667  16.720   <2e-16 ***
## var1.baseline -0.258563   0.008015 -32.260   <2e-16 ***
## var2.baseline  0.065174   0.005315  12.261   <2e-16 ***
## var3.baseline  0.029017   0.003436   8.445   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.46 on 996 degrees of freedom
## Multiple R-squared: 0.5112, Adjusted R-squared: 0.5097
## F-statistic: 347.2 on 3 and 996 DF, p-value: < 2.2e-16
# Not quite what we wanted: the uncontrolled baselines still carry the age signal that
# we removed from the outcome, so var2 and var3 now look spuriously significant.
# Let's try to control for age in the input variables as well...
# How does that affect our predictions?
age.var1baseline.model <- lm(formula = var1.baseline ~ ages, data=dat)
age.var2baseline.model <- lm(formula = var2.baseline ~ ages, data=dat)
age.var3baseline.model <- lm(formula = var3.baseline ~ ages, data=dat)
dat$var1.baseline.controlled <- dat$var1.baseline - predict(age.var1baseline.model)
dat$var2.baseline.controlled <- dat$var2.baseline - predict(age.var2baseline.model)
dat$var3.baseline.controlled <- dat$var3.baseline - predict(age.var3baseline.model)
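# As before, the controlled baselines should now be essentially uncorrelated with age.
round(cor(dat$ages, dat[, c("var1.baseline.controlled",
                            "var2.baseline.controlled",
                            "var3.baseline.controlled")]), 3)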
# How well does age predict our original var1?
ggplot(dat, aes(x=ages, y=var1.baseline)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
# How well does age predict our controlled var1? Looks good!
ggplot(dat, aes(x=ages, y=var1.baseline.controlled)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
# How well does our controlled var1 predict the controlled outcome variable?
ggplot(dat, aes(x=var1.baseline.controlled, y=var1.diff.controlled)) +
  geom_point(shape=1) + # Use hollow circles
  geom_smooth(method=lm) # Add linear regression line
# Finally, let's check our results numerically...
fully.controlled.var1.diff.model <- lm(var1.diff.controlled ~ var1.baseline.controlled +
                                       var2.baseline.controlled +
                                       var3.baseline.controlled,
                                       data=dat)
summary(fully.controlled.var1.diff.model)
##
## Call:
## lm(formula = var1.diff.controlled ~ var1.baseline.controlled +
##     var2.baseline.controlled + var3.baseline.controlled, data = dat)
##
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.78210 -0.31877  0.00184  0.33861  1.40408 
##
## Coefficients:
##                            Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)               9.535e-15  1.611e-02    0.000    1.000    
## var1.baseline.controlled -4.986e-01  1.635e-03 -304.892   <2e-16 ***
## var2.baseline.controlled  1.286e-03  8.386e-04    1.534    0.125    
## var3.baseline.controlled -4.436e-05  5.241e-04   -0.085    0.933    
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5094 on 996 degrees of freedom
## Multiple R-squared: 0.9894, Adjusted R-squared: 0.9894
## F-statistic: 3.1e+04 on 3 and 996 DF, p-value: < 2.2e-16
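# Note that the slope estimates here match those from var1.diff.model.with.age, the model
# that simply included age as a covariate: residualizing both the outcome and the predictors
# on age gives the same coefficients as including age directly (the Frisch-Waugh-Lovell theorem).
cbind(with.age   = coef(var1.diff.model.with.age)[c("var1.baseline", "var2.baseline", "var3.baseline")],
      controlled = coef(fully.controlled.var1.diff.model)[-1])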