The “formula” specification is how you describe the model structure in R. Conceptually, it is similar to the MODEL statement in SAS.
## Null (intercept-only) model: write 1 on the right-hand side to stand for
## the intercept.
lmResult1 <- lm(formula = len ~ 1, data = ToothGrowth)
summary(lmResult1)
##
## Call:
## lm(formula = len ~ 1, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.613 -5.738 0.437 6.462 15.087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.813 0.988 19.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.65 on 59 degrees of freedom
## In all other models the intercept is included implicitly.
lmResult2 <- lm(formula = len ~ supp, data = ToothGrowth)
summary(lmResult2)
##
## Call:
## lm(formula = len ~ supp, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.763 -5.763 0.437 5.587 16.937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.66 1.37 15.13 <2e-16 ***
## suppVC -3.70 1.93 -1.92 0.06 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.48 on 58 degrees of freedom
## Multiple R-squared: 0.0595, Adjusted R-squared: 0.0433
## F-statistic: 3.67 on 1 and 58 DF, p-value: 0.0604
## To drop the intercept, add -1 to the right-hand side. Each level of supp
## then gets its own coefficient instead of a difference from the reference.
## R-squared for a no-intercept fit is computed against zero rather than the
## mean of len, so it is not comparable with that of an intercept model.
## NOTE(review): this reuses the name lmResult2 and so overwrites the
## len ~ supp fit stored under that name earlier in this file.
lmResult2 <- lm(formula = len ~ -1 + supp, data = ToothGrowth)
summary(lmResult2)
##
## Call:
## lm(formula = len ~ -1 + supp, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.763 -5.763 0.437 5.587 16.937
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## suppOJ 20.66 1.37 15.1 <2e-16 ***
## suppVC 16.96 1.37 12.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.48 on 58 degrees of freedom
## Multiple R-squared: 0.868, Adjusted R-squared: 0.864
## F-statistic: 192 on 2 and 58 DF, p-value: <2e-16
## Add further predictors by joining their names with +.
lmResult3 <- lm(formula = len ~ supp + dose, data = ToothGrowth)
summary(lmResult3)
##
## Call:
## lm(formula = len ~ supp + dose, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.600 -3.700 0.373 2.116 8.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.272 1.282 7.23 1.3e-09 ***
## suppVC -3.700 1.094 -3.38 0.0013 **
## dose 9.764 0.877 11.14 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 57 degrees of freedom
## Multiple R-squared: 0.704, Adjusted R-squared: 0.693
## F-statistic: 67.7 on 2 and 57 DF, p-value: 8.72e-16
## A:B denotes the interaction between A and B.
lmResult4 <- lm(
  formula = len ~ supp + dose + supp:dose,
  data = ToothGrowth
)
summary(lmResult4)
##
## Call:
## lm(formula = len ~ supp + dose + supp:dose, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.23 -2.85 0.05 2.29 7.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.55 1.58 7.30 0.0000000011 ***
## suppVC -8.26 2.24 -3.69 0.00051 ***
## dose 7.81 1.20 6.53 0.0000000203 ***
## suppVC:dose 3.90 1.69 2.31 0.02463 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.08 on 56 degrees of freedom
## Multiple R-squared: 0.73, Adjusted R-squared: 0.715
## F-statistic: 50.4 on 3 and 56 DF, p-value: 6.52e-16
## A * B is shorthand for A + B + A:B, so this fits the same model as
## len ~ supp + dose + supp:dose.
lmResult5 <- lm(formula = len ~ supp * dose, data = ToothGrowth)
summary(lmResult5)
##
## Call:
## lm(formula = len ~ supp * dose, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.23 -2.85 0.05 2.29 7.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.55 1.58 7.30 0.0000000011 ***
## suppVC -8.26 2.24 -3.69 0.00051 ***
## dose 7.81 1.20 6.53 0.0000000203 ***
## suppVC:dose 3.90 1.69 2.31 0.02463 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.08 on 56 degrees of freedom
## Multiple R-squared: 0.73, Adjusted R-squared: 0.715
## F-statistic: 50.4 on 3 and 56 DF, p-value: 6.52e-16
## Variables can be transformed inside the formula; here the outcome is
## log-transformed.
lmResult6 <- lm(
  formula = log(len) ~ supp + dose + supp:dose,
  data = ToothGrowth
)
summary(lmResult6)
##
## Call:
## lm(formula = log(len) ~ supp + dose + supp:dose, data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7487 -0.1928 0.0497 0.2321 0.5478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.4583 0.1121 21.93 < 2e-16 ***
## suppVC -0.6564 0.1585 -4.14 0.00012 ***
## dose 0.4345 0.0847 5.13 0.0000038 ***
## suppVC:dose 0.3294 0.1198 2.75 0.00804 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.289 on 56 degrees of freedom
## Multiple R-squared: 0.683, Adjusted R-squared: 0.666
## F-statistic: 40.3 on 3 and 56 DF, p-value: 5.28e-14
## Arithmetic on a predictor must be wrapped in I() so that * is treated as
## multiplication rather than as a formula operator. Here dose is multiplied
## by 1000, which changes its unit.
lmResult7 <- lm(formula = len ~ supp + I(dose * 1000), data = ToothGrowth)
summary(lmResult7)
##
## Call:
## lm(formula = len ~ supp + I(dose * 1000), data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.600 -3.700 0.373 2.116 8.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.272500 1.282365 7.23 1.3e-09 ***
## suppVC -3.700000 1.093604 -3.38 0.0013 **
## I(dose * 1000) 0.009764 0.000877 11.14 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 57 degrees of freedom
## Multiple R-squared: 0.704, Adjusted R-squared: 0.693
## F-statistic: 67.7 on 2 and 57 DF, p-value: 8.72e-16
## For a categorical predictor, change the reference category in the data set
## itself. It can also be done inside the formula, but that gets messy.
ToothGrowth[["supp"]]
## [1] VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
## [24] VC VC VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## [47] OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## Levels: OJ VC
## Store a copy of supp whose reference (first) level is VC in a new column.
ToothGrowth[["suppRev"]] <- relevel(ToothGrowth[["supp"]], ref = "VC")
ToothGrowth[["suppRev"]]
## [1] VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC VC
## [24] VC VC VC VC VC VC VC OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## [47] OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ OJ
## Levels: VC OJ
## Refit using the releveled factor suppRev in place of supp.
## NOTE(review): this reuses the name lmResult7 and so overwrites the
## len ~ supp + I(dose * 1000) fit stored under that name earlier in this file.
lmResult7 <- lm(formula = len ~ suppRev + I(dose * 1000), data = ToothGrowth)
summary(lmResult7)
##
## Call:
## lm(formula = len ~ suppRev + I(dose * 1000), data = ToothGrowth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.600 -3.700 0.373 2.116 8.800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.572500 1.282365 4.35 5.8e-05 ***
## suppRevOJ 3.700000 1.093604 3.38 0.0013 **
## I(dose * 1000) 0.009764 0.000877 11.14 6.3e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.24 on 57 degrees of freedom
## Multiple R-squared: 0.704, Adjusted R-squared: 0.693
## F-statistic: 67.7 on 2 and 57 DF, p-value: 8.72e-16