Lab on Regression

HSE SPb Sociology and Social Informatics, April 23, 2018

My First Regression

Data from J.Miles and M.Shevlin’s book (2000)

library(foreign)

df <- read.csv("data1_1.csv")

Start modelling

model1 <- lm(grade ~ books, data = df)
summary(model1)
## 
## Call:
## lm(formula = grade ~ books, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.025 -12.616  -1.181  10.425  33.450 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   52.075      4.035  12.905 1.83e-15 ***
## books          5.738      1.647   3.483  0.00127 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.73 on 38 degrees of freedom
## Multiple R-squared:  0.242,  Adjusted R-squared:  0.222 
## F-statistic: 12.13 on 1 and 38 DF,  p-value: 0.001265

F-statistic is significant, so …

The model explains … per cent of the variance of “grade”.

For each extra book read by the student, the estimated grade changes by …

When books = 0, the estimated grade = …

model2 <- lm(grade ~ attend, data = df)
summary(model2)
## 
## Call:
## lm(formula = grade ~ attend, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.777 -10.904   2.021  12.430  31.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   36.998      8.169   4.529 5.71e-05 ***
## attend         1.883      0.555   3.393  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared:  0.2325, Adjusted R-squared:  0.2123 
## F-statistic: 11.51 on 1 and 38 DF,  p-value: 0.001629
model3 <- lm(grade ~ books + attend, data = df)
summary(model3)
## 
## Call:
## lm(formula = grade ~ books + attend, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.802 -13.374   0.060   9.173  32.295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.379      7.745   4.827 2.41e-05 ***
## books          4.037      1.753   2.303   0.0270 *  
## attend         1.284      0.587   2.187   0.0352 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.05 on 37 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.2924 
## F-statistic: 9.059 on 2 and 37 DF,  p-value: 0.0006278

To identify a better model, compare nested models, i.e. those where one model can be obtained from another by deleting a predictor

anova(model1, model3)
## Analysis of Variance Table
## 
## Model 1: grade ~ books
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8250.4                              
## 2     37 7306.2  1    944.16 4.7814 0.03517 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model2, model3)
## Analysis of Variance Table
## 
## Model 1: grade ~ attend
## Model 2: grade ~ books + attend
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8353.4                              
## 2     37 7306.2  1    1047.1 5.3028 0.02701 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

There is a problem with attend : its minimal value is 6, not 0. The intercept is meaningless. There is a solution - centering the attend variable

df$attend_cent <- df$attend - mean(df$attend)
#df$attend_cent <- scale(df$attend, center = T, scale = F) 
model4 <- lm(grade ~ attend_cent, data = df)
summary(model2)
## 
## Call:
## lm(formula = grade ~ attend, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.777 -10.904   2.021  12.430  31.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   36.998      8.169   4.529 5.71e-05 ***
## attend         1.883      0.555   3.393  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared:  0.2325, Adjusted R-squared:  0.2123 
## F-statistic: 11.51 on 1 and 38 DF,  p-value: 0.001629
summary(model4)
## 
## Call:
## lm(formula = grade ~ attend_cent, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.777 -10.904   2.021  12.430  31.755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   63.550      2.344  27.109  < 2e-16 ***
## attend_cent    1.883      0.555   3.393  0.00163 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.83 on 38 degrees of freedom
## Multiple R-squared:  0.2325, Adjusted R-squared:  0.2123 
## F-statistic: 11.51 on 1 and 38 DF,  p-value: 0.001629

The model quality is the same; but model4 has an interpretable intercept that fits the observed data.

model5 <- lm(grade ~ attend_cent + books, data = df)
summary(model5)
## 
## Call:
## lm(formula = grade ~ attend_cent + books, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.802 -13.374   0.060   9.173  32.295 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   55.476      4.151  13.365 9.81e-16 ***
## attend_cent    1.284      0.587   2.187   0.0352 *  
## books          4.037      1.753   2.303   0.0270 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.05 on 37 degrees of freedom
## Multiple R-squared:  0.3287, Adjusted R-squared:  0.2924 
## F-statistic: 9.059 on 2 and 37 DF,  p-value: 0.0006278
anova(model4, model5)
## Analysis of Variance Table
## 
## Model 1: grade ~ attend_cent
## Model 2: grade ~ attend_cent + books
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     38 8353.4                              
## 2     37 7306.2  1    1047.1 5.3028 0.02701 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

These are the same models

Is books or attend more important? Use beta coefficients to compare the effects within one model.

library(QuantPsyc)
lm.beta(model5)
## attend_cent       books 
##   0.3286422   0.3460987
library(sjPlot)
plot_model(model5, type = "std")

Books matter slightly more

Compare models in a table (1)

library(stargazer)
stargazer(model1, model4, model5, type = "text")
## 
## =======================================================================================
##                                             Dependent variable:                        
##                     -------------------------------------------------------------------
##                                                    grade                               
##                              (1)                    (2)                    (3)         
## ---------------------------------------------------------------------------------------
## books                      5.738***                                      4.037**       
##                            (1.647)                                       (1.753)       
##                                                                                        
## attend_cent                                       1.883***               1.283**       
##                                                   (0.555)                (0.587)       
##                                                                                        
## Constant                  52.075***              63.550***              55.476***      
##                            (4.035)                (2.344)                (4.151)       
##                                                                                        
## ---------------------------------------------------------------------------------------
## Observations                  40                     40                    40          
## R2                          0.242                  0.233                  0.329        
## Adjusted R2                 0.222                  0.212                  0.292        
## Residual Std. Error    14.735 (df = 38)       14.826 (df = 38)      14.052 (df = 37)   
## F Statistic         12.130*** (df = 1; 38) 11.512*** (df = 1; 38) 9.059*** (df = 2; 37)
## =======================================================================================
## Note:                                                       *p<0.1; **p<0.05; ***p<0.01

Compare models in a table (2)

library(sjPlot)
sjt.lm(model5, show.ci = F, p.numeric = F)
    grade
    B
(Intercept)   55.48 ***
attend_cent   1.28 *
books   4.04 *
Observations   40
R2 / adj. R2   .329 / .292
Notes * p<.05   ** p<.01   *** p<.001

My Second Regression

5th wave of ESS

Modelling: ideal retirement age.

Descriptive statistics first

library(foreign)
data <- read.spss("ESS5e03.sav", to.data.frame = TRUE)
savevars <- c("eduyrs", "agea", "agertr", "gndr", "cntry", "icmnact", "hinctnta")
ESS1 <- data[savevars]
rm(data, savevars)
head(ESS1)
##   eduyrs agea agertr   gndr   cntry      icmnact hinctnta
## 1     15   22   <NA>   Male Belgium   All others     <NA>
## 2     15   43   <NA>   Male Belgium In paid work        F
## 3     13   19   <NA> Female Belgium   All others        J
## 4     15   23   <NA> Female Belgium In paid work        H
## 5     15   58     60 Female Belgium   All others        S
## 6     13   62     60 Female Belgium      Retired        F

Start modelling

ESS2 <- na.omit(subset(ESS1, icmnact=="In paid work")) #na.omit - not the best idea, bit it will do for now
median(ESS2$income)
## [1] 6
median(ESS2$eduyrs)
## [1] 13
mean(ESS2$agea)
## [1] 54.07378
ESS2$agea_c <- ESS2$agea - mean(ESS2$agea) #center age

m1 <- lm(agertr ~ agea_c, data = ESS2)
summary(m1)
## 
## Call:
## lm(formula = agertr ~ agea_c, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.250  -2.506   0.195   2.505  58.728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.55648    0.05998 1009.56   <2e-16 ***
## agea_c       0.24455    0.01035   23.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.169 on 7425 degrees of freedom
## Multiple R-squared:  0.06996,    Adjusted R-squared:  0.06983 
## F-statistic: 558.5 on 1 and 7425 DF,  p-value: < 2.2e-16
m2 <- lm(agertr ~ agea_c + gndr, data = ESS2)
summary(m2)
## 
## Call:
## lm(formula = agertr ~ agea_c + gndr, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -57.962  -2.723  -0.002   2.384  57.993 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 61.30832    0.08332  735.86   <2e-16 ***
## agea_c       0.23869    0.01025   23.30   <2e-16 ***
## gndrFemale  -1.52691    0.11879  -12.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.113 on 7424 degrees of freedom
## Multiple R-squared:  0.09021,    Adjusted R-squared:  0.08996 
## F-statistic:   368 on 2 and 7424 DF,  p-value: < 2.2e-16
anova(m1, m2)
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea_c
## Model 2: agertr ~ agea_c + gndr
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7425 198412                                  
## 2   7424 194093  1    4319.4 165.21 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m3 <- lm(agertr ~ agea_c + gndr + income, data = ESS2)
summary(m3)
## 
## Call:
## lm(formula = agertr ~ agea_c + gndr + income, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.321  -2.667   0.033   2.424  57.785 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 60.44109    0.17162 352.182  < 2e-16 ***
## agea_c       0.24357    0.01026  23.745  < 2e-16 ***
## gndrFemale  -1.44434    0.11939 -12.097  < 2e-16 ***
## income       0.13259    0.02295   5.776 7.95e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.102 on 7423 degrees of freedom
## Multiple R-squared:  0.09428,    Adjusted R-squared:  0.09391 
## F-statistic: 257.6 on 3 and 7423 DF,  p-value: < 2.2e-16
anova(m2, m3)
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea_c + gndr
## Model 2: agertr ~ agea_c + gndr + income
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7424 194093                                  
## 2   7423 193224  1     868.5 33.365 7.949e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m4 <- lm(agertr ~ agea_c + gndr + income + eduyrs, data = ESS2)
summary(m4)
## 
## Call:
## lm(formula = agertr ~ agea_c + gndr + income + eduyrs, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.964  -2.634   0.073   2.459  57.756 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 59.09525    0.24139 244.817  < 2e-16 ***
## agea_c       0.24686    0.01022  24.145  < 2e-16 ***
## gndrFemale  -1.52126    0.11930 -12.751  < 2e-16 ***
## income       0.07234    0.02410   3.002   0.0027 ** 
## eduyrs       0.13201    0.01672   7.896 3.31e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.081 on 7422 degrees of freedom
## Multiple R-squared:  0.1018, Adjusted R-squared:  0.1013 
## F-statistic: 210.3 on 4 and 7422 DF,  p-value: < 2.2e-16
anova(m3, m4)
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea_c + gndr + income
## Model 2: agertr ~ agea_c + gndr + income + eduyrs
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1   7423 193224                                  
## 2   7422 191615  1    1609.4 62.339 3.307e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Compare models and interpret the best model

sjt.lm(m3, m4, show.ci = F, p.numeric = F)
    agertr   agertr
    B   B
(Intercept)   60.44 ***   59.10 ***
agea_c   0.24 ***   0.25 ***
gndr (Female)   -1.44 ***   -1.52 ***
income   0.13 ***   0.07 **
eduyrs       0.13 ***
Observations   7427   7427
R2 / adj. R2   .094 / .094   .102 / .101
Notes * p<.05   ** p<.01   *** p<.001

Try out interaction effects:

m2i <- lm(agertr ~ agea_c * gndr, data = ESS2)
summary(m2i)
## 
## Call:
## lm(formula = agertr ~ agea_c * gndr, data = ESS2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58.07  -2.76   0.02   2.44  57.95 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       61.30421    0.08334 735.593   <2e-16 ***
## agea_c             0.25488    0.01396  18.257   <2e-16 ***
## gndrFemale        -1.52774    0.11878 -12.862   <2e-16 ***
## agea_c:gndrFemale -0.03507    0.02055  -1.707   0.0879 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.112 on 7423 degrees of freedom
## Multiple R-squared:  0.09056,    Adjusted R-squared:  0.0902 
## F-statistic: 246.4 on 3 and 7423 DF,  p-value: < 2.2e-16
anova(m2, m2i)  #additive model is no worse than the interaction
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea_c + gndr
## Model 2: agertr ~ agea_c * gndr
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7424 194093                              
## 2   7423 194016  1    76.151 2.9135 0.08788 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m3i <- lm(agertr ~ agea_c + gndr * income, data = ESS2)
summary(m3i)
## 
## Call:
## lm(formula = agertr ~ agea_c + gndr * income, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.287  -2.664   0.029   2.418  57.806 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       60.53488    0.22898 264.372  < 2e-16 ***
## agea_c             0.24375    0.01026  23.752  < 2e-16 ***
## gndrFemale        -1.62083    0.30921  -5.242 1.63e-07 ***
## income             0.11822    0.03265   3.621 0.000296 ***
## gndrFemale:income  0.02833    0.04578   0.619 0.536087    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.102 on 7422 degrees of freedom
## Multiple R-squared:  0.09432,    Adjusted R-squared:  0.09384 
## F-statistic: 193.2 on 4 and 7422 DF,  p-value: < 2.2e-16
anova(m3, m3i)  #additive model is no worse than the interaction
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea_c + gndr + income
## Model 2: agertr ~ agea_c + gndr * income
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   7423 193224                           
## 2   7422 193214  1    9.9673 0.3829 0.5361
m4i <- lm(agertr ~ agea_c + gndr + income * eduyrs, data = ESS2)
m4ii <- lm(agertr ~ agea_c + income + gndr * eduyrs, data = ESS2)
anova(m4, m4i, m4ii)  # interaction model with income * eduyrs is better
## Analysis of Variance Table
## 
## Model 1: agertr ~ agea_c + gndr + income + eduyrs
## Model 2: agertr ~ agea_c + gndr + income * eduyrs
## Model 3: agertr ~ agea_c + income + gndr * eduyrs
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   7422 191615                              
## 2   7421 191464  1    150.45 5.8312 0.01577 *
## 3   7421 191576  0   -111.94                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(m4i)
## 
## Call:
## lm(formula = agertr ~ agea_c + gndr + income * eduyrs, data = ESS2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -59.113  -2.646   0.080   2.437  57.794 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   60.286944   0.549337 109.745   <2e-16 ***
## agea_c         0.245519   0.010236  23.986   <2e-16 ***
## gndrFemale    -1.503284   0.119495 -12.580   <2e-16 ***
## income        -0.118673   0.082688  -1.435   0.1513    
## eduyrs         0.037553   0.042536   0.883   0.3773    
## income:eduyrs  0.014503   0.006006   2.415   0.0158 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.079 on 7421 degrees of freedom
## Multiple R-squared:  0.1025, Adjusted R-squared:  0.1019 
## F-statistic: 169.6 on 5 and 7421 DF,  p-value: < 2.2e-16
sjPlot::plot_model(m4i, type = "pred", terms = "eduyrs")

sjPlot::plot_model(m4i, type = "int", terms = "eduyrs", mdrt.values = "minmax")

sjPlot::plot_model(m4i, type = "int", terms = "eduyrs", mdrt.values = "meansd")

sjPlot::plot_model(m4i, type = "int", terms = "eduyrs", mdrt.values = "quart")

That’s all for now!

In the next episode…

sjPlot::plot_model(m4i, type = "diag")
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]