Linear Regression with Interaction Effects

Anna Shirokanova, Olesya Volchenko

5/13/2020

This seminar:

Interaction effects:

Exercise 1. Categorical by continuous

Exercise 2. Categorical by categorical

Exercise 3. Continuous by continuous

Example from the European Social Survey data, Wave 7

Variables involved:

Our research question is: “What predicts weekday alcohol consumption in Europe?”

This is an exercise to practise fitting and interpreting interaction effect models, not a real research model with all of its concerns.

Get the data:

library(foreign)
ess7 <-  read.spss("ESS7e02_2.sav", to.data.frame = T, use.value.labels = T)
ess7$alcwkdy <- as.numeric(as.character(ess7$alcwkdy))
ess7$eduyrs <- as.numeric(as.character(ess7$eduyrs))
ess7$agea <- as.numeric(as.character(ess7$agea))

Descriptive statistics:

library(dplyr)
ess <- dplyr::select(ess7, agea, alcwkdy, dshltgp, eduyrs, gndr)
library(psych)
describe(ess)
##          vars     n  mean    sd median trimmed   mad min  max range  skew
## agea        1 40086 49.28 18.74     49   49.15 22.24  14  114   100  0.04
## alcwkdy     2 30064 27.39 44.31     15   19.06 22.24   0 2189  2189  8.95
## dshltgp*    3 40185  1.75  0.44      2    1.81  0.00   1    2     1 -1.13
## eduyrs      4 39828 12.90  3.94     12   12.86  2.97   0   50    50  0.27
## gndr*       5 40163  1.53  0.50      2    1.54  0.00   1    2     1 -0.12
##          kurtosis   se
## agea        -0.93 0.09
## alcwkdy    246.20 0.26
## dshltgp*    -0.73 0.00
## eduyrs       1.95 0.02
## gndr*       -1.99 0.00
ess <- na.omit(ess) # 29879 full obs.
library(ggplot2)
ggplot(ess, aes(x = alcwkdy)) +
  geom_density()

ggplot(ess, aes(x = log(alcwkdy))) +
  geom_density() 
## Warning: Removed 7478 rows containing non-finite values (stat_density).

The variable is log-normal, but this does not affect the models in the exercise; we will use the variable “as is”.

Exercise 1. Gender and education

m1_1 <- lm(alcwkdy ~ gndr, data = ess)
m1_2 <- lm(alcwkdy ~ gndr + eduyrs, ess)
m1_3 <- lm(alcwkdy ~ eduyrs*gndr, ess)
anova(m1_1, m1_2, m1_3)
## Analysis of Variance Table
## 
## Model 1: alcwkdy ~ gndr
## Model 2: alcwkdy ~ gndr + eduyrs
## Model 3: alcwkdy ~ eduyrs * gndr
##   Res.Df      RSS Df Sum of Sq       F    Pr(>F)    
## 1  29830 57030774                                   
## 2  29829 57028278  1    2495.6  1.3059 0.2531412    
## 3  29828 57001073  1   27205.6 14.2364 0.0001615 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m1_1, m1_3)
## Analysis of Variance Table
## 
## Model 1: alcwkdy ~ gndr
## Model 2: alcwkdy ~ eduyrs * gndr
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1  29830 57030774                                  
## 2  29828 57001073  2     29701 7.7712 0.0004226 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Model 1_3 (interaction model) is significantly better than Model 1_2 (additive).

Conclusion: Let’s interpret and plot Model 1_3.

library(sjPlot)
tab_model(m1_1, m1_2, m1_3)
  alcwkdy alcwkdy alcwkdy
Predictors Estimates CI p Estimates CI p Estimates CI p
(Intercept) 34.39 33.70 – 35.09 <0.001 35.39 33.54 – 37.23 <0.001 38.54 36.07 – 41.00 <0.001
gndr [Female] -14.38 -15.37 – -13.39 <0.001 -14.35 -15.35 – -13.36 <0.001 -21.02 -24.62 – -17.42 <0.001
eduyrs -0.08 -0.21 – 0.05 0.253 -0.32 -0.50 – -0.14 0.001
eduyrs * gndr [Female] 0.50 0.24 – 0.76 <0.001
Observations 29832 29832 29832
R2 / R2 adjusted 0.026 / 0.026 0.026 / 0.026 0.027 / 0.027
plot_model(m1_3, type = "int")

Where to look:

  1. Do more educated males score higher than less educated males? Do more educated females score higher than less educated females?
  2. Do the confidence intervales of the two lines overlap? If yes, at what number of education years?

What changes if you swap their places in the model?

m1_4 <- lm(alcwkdy ~ gndr * eduyrs, ess)
tab_model(m1_4, 
          dv.labels = "Weekday Alcohol, grams")
  Weekday Alcohol, grams
Predictors Estimates CI p
(Intercept) 38.54 36.07 – 41.00 <0.001
gndr [Female] -21.02 -24.62 – -17.42 <0.001
eduyrs -0.32 -0.50 – -0.14 0.001
gndr [Female] * eduyrs 0.50 0.24 – 0.76 <0.001
Observations 29832
R2 / R2 adjusted 0.027 / 0.027
plot_model(m1_4, type = "int", 
           title = "Predicted values of Weekday Alcohol, grams", 
           legend.title = "Years of education", 
           axis.title = c("Gender", "Grams of alcohol on weekdays"))

Where to look:

  1. What are all the data points in the plot?
  2. Who scores higher, males or females, when a) education years = 0 and b) = 50?
  3. Do confidence intervals for points of the same colour overlap?
library(interplot)
## Loading required package: abind
## Loading required package: arm
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## Loading required package: lme4
## 
## arm (Version 1.10-1, built: 2018-4-12)
## Working directory is C:/Users/lssi5/YandexDisk/teaching/DA BA 2020
## 
## Attaching package: 'arm'
## The following objects are masked from 'package:psych':
## 
##     logit, rescale, sim
summary(m1_4)
## 
## Call:
## lm(formula = alcwkdy ~ gndr * eduyrs, data = ess)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -38.54  -20.18   -9.55    5.08 2155.19 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        38.53780    1.25751  30.646  < 2e-16 ***
## gndrFemale        -21.01817    1.83725 -11.440  < 2e-16 ***
## eduyrs             -0.31528    0.09175  -3.436 0.000591 ***
## gndrFemale:eduyrs   0.50019    0.13257   3.773 0.000162 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.71 on 29828 degrees of freedom
## Multiple R-squared:  0.02682,    Adjusted R-squared:  0.02673 
## F-statistic: 274.1 on 3 and 29828 DF,  p-value: < 2.2e-16
interplot(m1_4, var1 = "gndr", var2 = "eduyrs") + 
  xlab('Education') +
  ylab('Estimated Coefficient for Gender') 

Exercise 2. Gender and Visit to General Practitioner

m2_1 <- lm(alcwkdy ~ gndr, ess)
m2_2 <- lm(alcwkdy ~ gndr + dshltgp, ess)
m2_3 <- lm(alcwkdy ~ gndr * dshltgp, ess)
anova(m2_1, m2_2, m2_3)
## Analysis of Variance Table
## 
## Model 1: alcwkdy ~ gndr
## Model 2: alcwkdy ~ gndr + dshltgp
## Model 3: alcwkdy ~ gndr * dshltgp
##   Res.Df      RSS Df Sum of Sq       F    Pr(>F)    
## 1  29830 57030774                                   
## 2  29829 56961833  1     68941 36.1071 1.889e-09 ***
## 3  29828 56951855  1      9978  5.2259   0.02226 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Conclusion: …

summary(m2_3)
## 
## Call:
## lm(formula = alcwkdy ~ gndr * dshltgp, data = ess)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -37.68  -19.57   -9.57    4.99 2151.32 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               37.6761     0.6512  57.853  < 2e-16 ***
## gndrFemale               -16.1094     1.0090 -15.966  < 2e-16 ***
## dshltgpMarked             -4.6632     0.7760  -6.009 1.88e-09 ***
## gndrFemale:dshltgpMarked   2.6695     1.1678   2.286   0.0223 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.7 on 29828 degrees of freedom
## Multiple R-squared:  0.02766,    Adjusted R-squared:  0.02757 
## F-statistic: 282.9 on 3 and 29828 DF,  p-value: < 2.2e-16

Interpretation: …

plot_model(m2_3, type = "int")

Where to look:

  1. What are all the data points in the plot?
  2. Who scores higher, males or females, when a) contacted a GP and b) did not ?
  3. Do confidence intervals for points of the same colour overlap?

Exercise 3. Education and Age

m3_1 <- lm(alcwkdy ~ eduyrs, ess)
m3_2 <- lm(alcwkdy ~ eduyrs + agea, ess) 
m3_3 <- lm(alcwkdy ~ eduyrs * agea, ess)
anova(m3_1, m3_2, m3_3)
## Analysis of Variance Table
## 
## Model 1: alcwkdy ~ eduyrs
## Model 2: alcwkdy ~ eduyrs + agea
## Model 3: alcwkdy ~ eduyrs * agea
##   Res.Df      RSS Df Sum of Sq      F  Pr(>F)    
## 1  29830 58561625                                
## 2  29829 58406177  1    155448 79.404 < 2e-16 ***
## 3  29828 58393830  1     12347  6.307 0.01203 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(m3_1, m3_2, m3_3)
  alcwkdy alcwkdy alcwkdy
Predictors Estimates CI p Estimates CI p Estimates CI p
(Intercept) 29.43 27.61 – 31.25 <0.001 37.39 34.87 – 39.92 <0.001 43.89 38.23 – 49.56 <0.001
eduyrs -0.16 -0.29 – -0.02 0.020 -0.28 -0.41 – -0.15 <0.001 -0.78 -1.20 – -0.37 <0.001
agea -0.13 -0.16 – -0.10 <0.001 -0.25 -0.35 – -0.15 <0.001
eduyrs * agea 0.01 0.00 – 0.02 0.012
Observations 29832 29832 29832
R2 / R2 adjusted 0.000 / 0.000 0.003 / 0.003 0.003 / 0.003
plot_model(m3_3, type = "int")

summary(ess$agea)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.00   34.00   49.00   48.55   63.00  114.00
plot_model(m3_3, type = "int", mdrt.values = "meansd")

library(interplot)
interplot(m3_3, var1 = "eduyrs", var2 = "agea") + 
  xlab('age') +
  ylab('Estimated Coefficient for Years of Education') 

The end.