Learning objectives:
Run and interpret a variety of regression models in R
You might start by checking your working directory and listing the files in it:
getwd() # where am I?
## [1] "C:/Users/maria/Dropbox (Personal)/SpringBoard Fund/Rprojects/linear_regression"
list.files("dataSets") # files in the dataSets folder
## [1] "Exam.rds" "states.dta" "states.rds"
# read the states data
states.data <- readRDS("dataSets/states.rds")
#get labels
states.info <- data.frame(attributes(states.data)[c("names", "var.labels")])
#look at last few labels
tail(states.info, 8)
## names var.labels
## 14 csat Mean composite SAT score
## 15 vsat Mean verbal SAT score
## 16 msat Mean math SAT score
## 17 percent % HS graduates taking SAT
## 18 expense Per pupil expenditures prim&sec
## 19 income Median household income, $1,000
## 20 high % adults HS diploma
## 21 college % adults college degree
Examine the data before fitting models
Start by examining the data to check for problems.
# summary of expense and csat columns, all rows
sts.ex.sat <- subset(states.data, select = c("expense", "csat"))
summary(sts.ex.sat)
## expense csat
## Min. :2960 Min. : 832.0
## 1st Qu.:4352 1st Qu.: 888.0
## Median :5000 Median : 926.0
## Mean :5236 Mean : 944.1
## 3rd Qu.:5794 3rd Qu.: 997.0
## Max. :9259 Max. :1093.0
# correlation between expense and csat
cor(sts.ex.sat)
## expense csat
## expense 1.0000000 -0.4662978
## csat -0.4662978 1.0000000
Plot the data before fitting models
# scatter plot of expense vs csat
plot(sts.ex.sat)
Fit our regression model
sat.mod <- lm(csat ~ expense, # regression formula
data=states.data) # data set
# Summarize and print the results
summary(sat.mod) # show regression coefficients table
##
## Call:
## lm(formula = csat ~ expense, data = states.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -131.811 -38.085 5.607 37.852 136.495
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.061e+03 3.270e+01 32.44 < 2e-16 ***
## expense -2.228e-02 6.037e-03 -3.69 0.000563 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.81 on 49 degrees of freedom
## Multiple R-squared: 0.2174, Adjusted R-squared: 0.2015
## F-statistic: 13.61 on 1 and 49 DF, p-value: 0.0005631
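As a quick visual check (a sketch, not part of the original output), we can redraw the scatter plot and overlay the fitted line; abline() accepts an lm object and draws the line implied by its coefficients.
# scatter plot with the fitted regression line overlaid
plot(sts.ex.sat)
abline(sat.mod, col = "red") # line from the model's intercept and slope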
Why is the association between expense and SAT scores /negative/?
summary(lm(csat ~ expense + percent, data = states.data))
##
## Call:
## lm(formula = csat ~ expense + percent, data = states.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.921 -24.318 1.741 15.502 75.623
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 989.807403 18.395770 53.806 < 2e-16 ***
## expense 0.008604 0.004204 2.046 0.0462 *
## percent -2.537700 0.224912 -11.283 4.21e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.62 on 48 degrees of freedom
## Multiple R-squared: 0.7857, Adjusted R-squared: 0.7768
## F-statistic: 88.01 on 2 and 48 DF, p-value: < 2.2e-16
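One likely explanation is selection: in states where only a small, self-selected group of students takes the SAT, average scores tend to be high, and those states also tend to spend less per pupil. Controlling for percent therefore flips the sign on expense. As a rough check (a sketch, not part of the original output), we can look at the pairwise correlations among the three variables:
# pairwise correlations among expense, percent taking the SAT, and csat
cor(subset(states.data, select = c("expense", "percent", "csat")),
    use = "pairwise.complete.obs")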
OK, we fit our model. Now what?
Examine the model object:
class(sat.mod)
## [1] "lm"
names(sat.mod)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
methods(class = class(sat.mod))[1:9]
## [1] "add1.lm" "alias.lm"
## [3] "anova.lm" "case.names.lm"
## [5] "coerce,oldClass,S3-method" "confint.lm"
## [7] "cooks.distance.lm" "deviance.lm"
## [9] "dfbeta.lm"
# Use function methods to get more information about the fit
confint(sat.mod)
## 2.5 % 97.5 %
## (Intercept) 995.01753164 1126.44735626
## expense -0.03440768 -0.01014361
# hist(residuals(sat.mod))
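Other extractor functions work the same way. For example (a sketch; the expense values below are just illustrative points within the observed range), predict() returns fitted csat values, optionally with confidence intervals:
# predicted csat at two illustrative expense levels, with 95% confidence intervals
predict(sat.mod, newdata = data.frame(expense = c(4000, 7000)),
        interval = "confidence")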
Ordinary least squares regression relies on several assumptions: the residuals are normally distributed and homoscedastic, the errors are independent, and the relationship between the predictors and the outcome is linear.
# Investigate these assumptions visually by plotting your model:
par(mar = c(4, 4, 2, 2), mfrow = c(1, 2)) #optional
plot(sat.mod, which = c(1, 2)) # "which" argument optional
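The diagnostic plots are usually enough, but formal tests of these assumptions are also available. A minimal sketch, assuming the lmtest package is installed (it is not used elsewhere in this lesson):
# formal checks of the residual assumptions (optional)
shapiro.test(residuals(sat.mod)) # Shapiro-Wilk test of normality
lmtest::bptest(sat.mod) # Breusch-Pagan test of constant error variance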
Do congressional voting patterns predict SAT scores over and above expense? Fit two models and compare them:
# fit another model, adding house and senate as predictors
sat.voting.mod <- lm(csat ~ expense + house + senate,
data = na.omit(states.data))
sat.mod <- update(sat.mod, data=na.omit(states.data))
# compare using the anova() function
anova(sat.mod, sat.voting.mod)
## Analysis of Variance Table
##
## Model 1: csat ~ expense
## Model 2: csat ~ expense + house + senate
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 46 169050
## 2 44 149284 2 19766 2.9128 0.06486 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(summary(sat.voting.mod))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1082.93438041 38.633812740 28.0307405 1.067795e-29
## expense -0.01870832 0.009691494 -1.9303852 6.001998e-02
## house -1.44243754 0.600478382 -2.4021473 2.058666e-02
## senate 0.49817861 0.513561356 0.9700469 3.373256e-01
Use the /states.rds/ data set. Fit a model predicting energy consumed per capita (energy) from the percentage of residents living in metropolitan areas (metro). Be sure to
- Examine/plot the data before fitting the model
- Print and interpret the model summary
- plot the model to look for deviations from modeling assumptions
Select one or more additional predictors to add to your model and repeat steps 1-3. Is this model significantly better than the model with /metro/ as the only predictor?
# summary of metro and energy columns
sts.ex.sat <- subset(states.data, select = c("metro", "energy"))
summary(sts.ex.sat)
## metro energy
## Min. : 20.40 Min. :200.0
## 1st Qu.: 46.98 1st Qu.:285.0
## Median : 67.55 Median :320.0
## Mean : 64.07 Mean :354.5
## 3rd Qu.: 81.58 3rd Qu.:371.5
## Max. :100.00 Max. :991.0
## NA's :1 NA's :1
# correlation between metro and energy
cor(sts.ex.sat)
## metro energy
## metro 1 NA
## energy NA 1
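The correlations are NA because cor() propagates missing values by default and at least one state is missing metro and/or energy. A small sketch of how to compute the correlation from the complete pairs instead:
# correlation using only complete pairs of observations
cor(sts.ex.sat, use = "pairwise.complete.obs")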
# scatter plot before fitting the model
plot(sts.ex.sat)
# Fit our regression model
sat.mod <- lm(energy ~ metro, # regression formula
data=states.data) # data set
# Summarize and print the results
summary(sat.mod) # show regression coefficients table
##
## Call:
## lm(formula = energy ~ metro, data = states.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -215.51 -64.54 -30.87 18.71 583.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 501.0292 61.8136 8.105 1.53e-10 ***
## metro -2.2871 0.9139 -2.503 0.0158 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 140.2 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1154, Adjusted R-squared: 0.097
## F-statistic: 6.263 on 1 and 48 DF, p-value: 0.01578
par(mar = c(4, 4, 2, 2), mfrow = c(1, 2)) #optional
plot(sat.mod, which = c(1, 2)) # "which" argument optional
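The residuals are strongly right-skewed (a few states use far more energy per capita than the model predicts). One option worth trying (a sketch, not part of the exercise solution shown here; energy.log.mod is just an illustrative name) is to model energy on the log scale and re-check the diagnostics:
# refit with log-transformed energy to tame the right-skewed residuals
energy.log.mod <- lm(log(energy) ~ metro, data = states.data)
summary(energy.log.mod)
plot(energy.log.mod, which = c(1, 2))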
# fit another model, adding income as a predictor
sat.income.mod <- lm(energy ~ metro + income,
data = na.omit(states.data))
sat.mod <- update(sat.mod, data=na.omit(states.data))
# compare using the anova() function
anova(sat.mod, sat.income.mod)
## Analysis of Variance Table
##
## Model 1: energy ~ metro
## Model 2: energy ~ metro + income
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 46 580411
## 2 45 513544 1 66867 5.8593 0.01959 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
coef(summary(sat.income.mod))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 632.24148275 89.3311635 7.07750193 7.821667e-09
## metro -0.07652075 0.9607449 -0.07964731 9.368709e-01
## income -8.49958871 3.5113499 -2.42060428 1.959285e-02
Modeling interactions
Interactions allow us to assess the extent to which the association between one predictor and the outcome depends on a second predictor. For example: does the association between expense and SAT scores depend on the median income in the state?
# Add the interaction to the model
sat.expense.by.income <- lm(csat ~ expense*income,
data=states.data)
#Show the results
coef(summary(sat.expense.by.income)) # show regression coefficients table
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.380364e+03 1.720863e+02 8.021351 2.367069e-10
## expense -6.384067e-02 3.270087e-02 -1.952262 5.687837e-02
## income -1.049785e+01 4.991463e+00 -2.103161 4.083253e-02
## expense:income 1.384647e-03 8.635529e-04 1.603431 1.155395e-01
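The interaction coefficient is hard to read on its own. One way to interpret it (a sketch; newdat and the income values are just illustrative, chosen near the low and high end of median income in $1,000s) is to compute predicted csat over a range of expense at a low and a high income level:
# predicted csat across expense, at two illustrative income levels
newdat <- expand.grid(expense = seq(3000, 9000, by = 1000),
                      income = c(27, 40))
newdat$predicted.csat <- predict(sat.expense.by.income, newdata = newdat)
newdat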
Let’s try to predict SAT scores from region, a categorical variable. Note that you must make sure R does not think your categorical variable is numeric.
# make sure R knows region is categorical
str(states.data$region)
## Factor w/ 4 levels "West","N. East",..: 3 1 1 3 1 1 2 3 NA 3 ...
states.data$region <- factor(states.data$region)
#Add region to the model
sat.region <- lm(csat ~ region,
data=states.data)
#Show the results
coef(summary(sat.region)) # show regression coefficients table
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 946.30769 14.79582 63.9577807 1.352577e-46
## regionN. East -56.75214 23.13285 -2.4533141 1.800383e-02
## regionSouth -16.30769 19.91948 -0.8186806 4.171898e-01
## regionMidwest 63.77564 21.35592 2.9863209 4.514152e-03
anova(sat.region) # show ANOVA table
## Analysis of Variance Table
##
## Response: csat
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 82049 27349.8 9.6102 4.859e-05 ***
## Residuals 46 130912 2845.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Again, make sure to tell R which variables are categorical by converting them to factors!
In the previous example we used the default contrasts for region. The default in R is treatment (dummy) contrasts, with the first level serving as the reference group. We can change the reference group or use another coding scheme with the C() function.
# print default contrasts
contrasts(states.data$region)
## N. East South Midwest
## West 0 0 0
## N. East 1 0 0
## South 0 1 0
## Midwest 0 0 1
# change the reference group
coef(summary(lm(csat ~ C(region, base=4),
data=states.data)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1010.08333 15.39998 65.589930 4.296307e-47
## C(region, base = 4)1 -63.77564 21.35592 -2.986321 4.514152e-03
## C(region, base = 4)2 -120.52778 23.52385 -5.123641 5.798399e-06
## C(region, base = 4)3 -80.08333 20.37225 -3.931000 2.826007e-04
# change the coding scheme
coef(summary(lm(csat ~ C(region, contr.helmert),
data=states.data)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 943.986645 7.706155 122.4977451 1.689670e-59
## C(region, contr.helmert)1 -28.376068 11.566423 -2.4533141 1.800383e-02
## C(region, contr.helmert)2 4.022792 5.884552 0.6836191 4.976450e-01
## C(region, contr.helmert)3 22.032229 4.446777 4.9546509 1.023364e-05
See also ?contrasts, ?contr.treatment, and ?relevel.
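A simpler alternative for changing the reference group is relevel(). A minimal sketch (region2 is just a temporary copy, so the original factor is left untouched for the exercise below):
# make Midwest the reference level without modifying states.data
region2 <- relevel(states.data$region, ref = "Midwest")
coef(summary(lm(csat ~ region2, data = states.data)))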
Use the states data set again. Add an interaction term to the energy model and test it, then add region to the model and check whether energy use differs across the four regions.
#Add the interaction to the model
sat.income.by.percent <- lm(energy ~ metro*income,
data=states.data)
coef(summary(sat.income.by.percent))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -330.3633291 298.5448346 -1.106579 0.274229651
## metro 9.8421058 4.5542877 2.161064 0.035931648
## income 26.0870944 9.3476847 2.790755 0.007630012
## metro:income -0.3685103 0.1312168 -2.808408 0.007282700
sat.region2 <- lm(energy ~ region,
data=states.data)
#Show the results
coef(summary(sat.region2)) # show regression coefficients table
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 405.61538 39.23196 10.3389019 1.395176e-13
## regionN. East -156.50427 61.33807 -2.5515032 1.411486e-02
## regionSouth -25.49038 52.81764 -0.4826112 6.316606e-01
## regionMidwest -61.61538 56.62646 -1.0881024 2.822182e-01
anova(sat.region2) # show ANOVA table
## Analysis of Variance Table
##
## Response: energy
## Df Sum Sq Mean Sq F value Pr(>F)
## region 3 145757 48586 2.4282 0.07737 .
## Residuals 46 920410 20009
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# print default contrasts
contrasts(states.data$region)
## N. East South Midwest
## West 0 0 0
## N. East 1 0 0
## South 0 1 0
## Midwest 0 0 1
# change the reference group
coef(summary(lm(energy ~ C(region, base=4),
data=states.data)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 344.00000 40.83392 8.4243691 7.058813e-11
## C(region, base = 4)1 61.61538 56.62646 1.0881024 2.822182e-01
## C(region, base = 4)2 -94.88889 62.37484 -1.5212686 1.350374e-01
## C(region, base = 4)3 36.12500 54.01820 0.6687561 5.069935e-01
# change the coding scheme
coef(summary(lm(energy ~ C(region, contr.helmert),
data=states.data)))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 344.7128739 20.43331 16.87014156 2.504367e-21
## C(region, contr.helmert)1 -78.2521368 30.66903 -2.55150316 1.411486e-02
## C(region, contr.helmert)2 17.5872507 15.60323 1.12715468 2.655225e-01
## C(region, contr.helmert)3 -0.2376246 11.79088 -0.02015325 9.840083e-01