Use of formula

The “formula” specification is how the model structure is described in R. Conceptually, it is similar to the MODEL statement in SAS.

## Null (intercept-only) model: the right-hand side is just 1, which stands
## for the intercept, so the only coefficient estimated is (Intercept).
lmResult1 <- lm(len ~ 1, data = ToothGrowth)
summary(lmResult1)
## 
## Call:
## lm(formula = len ~ 1, data = ToothGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.613  -5.738   0.437   6.462  15.087 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   18.813      0.988    19.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.65 on 59 degrees of freedom

## For every other model the intercept is included implicitly, so
## len ~ supp fits an intercept plus a coefficient for supp.
lmResult2 <- lm(len ~ supp, data = ToothGrowth)
summary(lmResult2)
## 
## Call:
## lm(formula = len ~ supp, data = ToothGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.763  -5.763   0.437   5.587  16.937 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    20.66       1.37   15.13   <2e-16 ***
## suppVC         -3.70       1.93   -1.92     0.06 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.48 on 58 degrees of freedom
## Multiple R-squared:  0.0595, Adjusted R-squared:  0.0433 
## F-statistic: 3.67 on 1 and 58 DF,  p-value: 0.0604

## To remove the intercept, add -1 to the formula. Without an intercept each
## level of supp gets its own coefficient (suppOJ and suppVC in the output).
## NOTE(review): this reuses the name lmResult2, so the model with an
## intercept fitted earlier is overwritten -- confirm this is intended.
lmResult2 <- lm(len ~ -1 + supp, data = ToothGrowth)
summary(lmResult2)
## 
## Call:
## lm(formula = len ~ -1 + supp, data = ToothGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.763  -5.763   0.437   5.587  16.937 
## 
## Coefficients:
##        Estimate Std. Error t value Pr(>|t|)    
## suppOJ    20.66       1.37    15.1   <2e-16 ***
## suppVC    16.96       1.37    12.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.48 on 58 degrees of freedom
## Multiple R-squared:  0.868,  Adjusted R-squared:  0.864 
## F-statistic:  192 on 2 and 58 DF,  p-value: <2e-16

## To include several predictors, list their names separated by +.
lmResult3 <- lm(len ~ supp + dose, data = ToothGrowth)
summary(lmResult3)
## 
## Call:
## lm(formula = len ~ supp + dose, data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.600 -3.700  0.373  2.116  8.800 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.272      1.282    7.23  1.3e-09 ***
## suppVC        -3.700      1.094   -3.38   0.0013 ** 
## dose           9.764      0.877   11.14  6.3e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 57 degrees of freedom
## Multiple R-squared:  0.704,  Adjusted R-squared:  0.693 
## F-statistic: 67.7 on 2 and 57 DF,  p-value: 8.72e-16

## An interaction between A and B is written A:B. Here the main effects
## (supp, dose) are listed explicitly alongside the interaction supp:dose.
lmResult4 <- lm(len ~ supp + dose + supp:dose, data = ToothGrowth)
summary(lmResult4)
## 
## Call:
## lm(formula = len ~ supp + dose + supp:dose, data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.23  -2.85   0.05   2.29   7.94 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)    11.55       1.58    7.30 0.0000000011 ***
## suppVC         -8.26       2.24   -3.69      0.00051 ***
## dose            7.81       1.20    6.53 0.0000000203 ***
## suppVC:dose     3.90       1.69    2.31      0.02463 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.08 on 56 degrees of freedom
## Multiple R-squared:  0.73,   Adjusted R-squared:  0.715 
## F-statistic: 50.4 on 3 and 56 DF,  p-value: 6.52e-16

## supp * dose is shorthand for supp + dose + supp:dose, so this fits the
## same model as the explicit main-effects-plus-interaction formula.
lmResult5 <- lm(len ~ supp * dose, data = ToothGrowth)
summary(lmResult5)
## 
## Call:
## lm(formula = len ~ supp * dose, data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8.23  -2.85   0.05   2.29   7.94 
## 
## Coefficients:
##             Estimate Std. Error t value     Pr(>|t|)    
## (Intercept)    11.55       1.58    7.30 0.0000000011 ***
## suppVC         -8.26       2.24   -3.69      0.00051 ***
## dose            7.81       1.20    6.53 0.0000000203 ***
## suppVC:dose     3.90       1.69    2.31      0.02463 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.08 on 56 degrees of freedom
## Multiple R-squared:  0.73,   Adjusted R-squared:  0.715 
## F-statistic: 50.4 on 3 and 56 DF,  p-value: 6.52e-16

## Variables can be transformed inside the formula. Here the outcome is
## log-transformed by writing log(len) on the left-hand side.
lmResult6 <- lm(log(len) ~ supp + dose + supp:dose, data = ToothGrowth)
summary(lmResult6)
## 
## Call:
## lm(formula = log(len) ~ supp + dose + supp:dose, data = ToothGrowth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7487 -0.1928  0.0497  0.2321  0.5478 
## 
## Coefficients:
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)   2.4583     0.1121   21.93   < 2e-16 ***
## suppVC       -0.6564     0.1585   -4.14   0.00012 ***
## dose          0.4345     0.0847    5.13 0.0000038 ***
## suppVC:dose   0.3294     0.1198    2.75   0.00804 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.289 on 56 degrees of freedom
## Multiple R-squared:  0.683,  Adjusted R-squared:  0.666 
## F-statistic: 40.3 on 3 and 56 DF,  p-value: 5.28e-14

## Changing the unit of a predictor inside the formula. The arithmetic is
## wrapped in I() so that * is evaluated as ordinary multiplication rather
## than being interpreted as the formula interaction operator.
lmResult7 <- lm(len ~ supp + I(dose * 1000), data = ToothGrowth)
summary(lmResult7)
## 
## Call:
## lm(formula = len ~ supp + I(dose * 1000), data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.600 -3.700  0.373  2.116  8.800 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     9.272500   1.282365    7.23  1.3e-09 ***
## suppVC         -3.700000   1.093604   -3.38   0.0013 ** 
## I(dose * 1000)  0.009764   0.000877   11.14  6.3e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 57 degrees of freedom
## Multiple R-squared:  0.704,  Adjusted R-squared:  0.693 
## F-statistic: 67.7 on 2 and 57 DF,  p-value: 8.72e-16

## For categorical variables, change the reference category in the dataset
## itself. It can be done inside the formula, but that gets messy.
## Print supp first to see its current level order (OJ is listed first).
ToothGrowth$supp
##  [1] VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
## [24] VC VC VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## [47] OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## Levels: OJ VC
## relevel() makes VC the first (reference) level. The result is stored in a
## new column, suppRev, so the original supp column is left unchanged.
ToothGrowth$suppRev <- relevel(ToothGrowth$supp, ref = "VC")
ToothGrowth$suppRev
##  [1] VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
## [24] VC VC VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## [47] OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## Levels: VC OJ
## Refit the model using the releveled factor suppRev in place of supp.
## NOTE(review): this reuses the name lmResult7, overwriting the model fitted
## with supp earlier -- confirm this is intended.
lmResult7 <- lm(len ~ suppRev + I(dose * 1000), data = ToothGrowth)
summary(lmResult7)
## 
## Call:
## lm(formula = len ~ suppRev + I(dose * 1000), data = ToothGrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.600 -3.700  0.373  2.116  8.800 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.572500   1.282365    4.35  5.8e-05 ***
## suppRevOJ      3.700000   1.093604    3.38   0.0013 ** 
## I(dose * 1000) 0.009764   0.000877   11.14  6.3e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.24 on 57 degrees of freedom
## Multiple R-squared:  0.704,  Adjusted R-squared:  0.693 
## F-statistic: 67.7 on 2 and 57 DF,  p-value: 8.72e-16