Олеся Волченко
4 декабря 2019
age | salary | sex |
---|---|---|
35 | 259.6909 | M |
31 | 221.9241 | M |
11 | 79.3862 | M |
37 | 258.0699 | M |
36 | 253.9673 | M |
35 | 249.1487 | M |
33 | 232.1910 | M |
22 | 145.4084 | M |
29 | 203.6306 | M |
31 | 223.9631 | M |
39 | 154.1865 | F |
24 | 126.4985 | F |
31 | 141.8414 | F |
37 | 132.8600 | F |
31 | 130.1427 | F |
29 | 133.8811 | F |
32 | 134.9844 | F |
36 | 135.4844 | F |
30 | 136.8037 | F |
31 | 127.5136 | F |
model1 <- lm(salary ~ sex, data = genderdata2)
model2 <- lm(salary ~ age , data = genderdata2)
model3 <- lm(salary ~ age + sex, data = genderdata2)
tab_model(model1, model2, model3, show.ci = F)
salary | salary | salary | ||||
---|---|---|---|---|---|---|
Predictors | Estimates | p | Estimates | p | Estimates | p |
(Intercept) | 135.42 | <0.001 | 28.95 | 0.615 | -51.77 | 0.030 |
M | 77.32 | 0.001 | 89.02 | <0.001 | ||
age | 4.68 | 0.018 | 5.85 | <0.001 | ||
Observations | 20 | 20 | 20 | |||
R2 / adjusted R2 | 0.492 / 0.464 | 0.275 / 0.235 | 0.910 / 0.900 |
Но так ли это?
plot(genderdata2$age[1:10], genderdata2$salary[1:10], pch = 16, xlab = "age", ylab = "salary", col = "blue", xlim = c(0, 40), ylim = c(100, 300))
points(genderdata2$age[11:20], genderdata2$salary[11:20], pch = 16, col = "red")
abline(model3$coefficients[1], model3$coefficients[2], col = "red")
abline(model3$coefficients[1] + model3$coefficients[3] , model3$coefficients[2], col = "blue")
text(10, 250, paste("y = ", round(model3$coefficients[1], 1), "+", round(model3$coefficients[3], 1), "* Male +", round(model3$coefficients[2], 1), "* Age"))
model4 <- lm(salary ~ age * sex, data = genderdata2)
model4 <- lm(salary ~ age + sex + age:sex, data = genderdata2)
plot(genderdata2$age[1:10], genderdata2$salary[1:10], pch = 16, xlab = "age", ylab = "salary", col = "blue", xlim = c(0, 40), ylim = c(100, 300))
points(genderdata2$age[11:20], genderdata2$salary[11:20], pch = 16, col = "red")
abline(model4$coefficients[1], model4$coefficients[2], col = "red")
abline(model4$coefficients[1] + model4$coefficients[3] , model4$coefficients[2] + model4$coefficients[4] , col = "blue")
text(20, 250, paste("y = ", round(model4$coefficients[1], 1), "+", round(model4$coefficients[3], 1), "* Male +",
round(model4$coefficients[2], 1), "* Age +",
round(model4$coefficients[4], 1), "* Age * Male"))
model4 <- lm(salary ~ age * sex, data = genderdata2)
summary(model4)
##
## Call:
## lm(formula = salary ~ age * sex, data = genderdata2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.4035 -4.2339 -0.1142 3.7883 10.7490
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 97.9971 15.4736 6.333 9.94e-06 ***
## age 1.1695 0.4796 2.438 0.0268 *
## sexM -102.4819 17.4603 -5.869 2.37e-05 ***
## age:sexM 6.0713 0.5462 11.115 6.19e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.253 on 16 degrees of freedom
## Multiple R-squared: 0.9897, Adjusted R-squared: 0.9878
## F-statistic: 512.5 on 3 and 16 DF, p-value: 4.21e-16
tab_model(model3, model4, show.ci = F)
salary | salary | |||
---|---|---|---|---|
Predictors | Estimates | p | Estimates | p |
(Intercept) | -51.77 | 0.030 | 98.00 | <0.001 |
age | 5.85 | <0.001 | 1.17 | 0.027 |
M | 89.02 | <0.001 | -102.48 | <0.001 |
age:sexM | 6.07 | <0.001 | ||
Observations | 20 | 20 | ||
R2 / adjusted R2 | 0.910 / 0.900 | 0.990 / 0.988 |
anova(model3, model4)
## Analysis of Variance Table
##
## Model 1: salary ~ age + sex
## Model 2: salary ~ age * sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 17 5456.3
## 2 16 625.6 1 4830.6 123.55 6.19e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Зависимая переменная
- индекс доверия
Независимые переменные
Данные 7 раунда ESS для Великобритании
library(foreign)
data <- read.spss("C:/Users/lssi5/YandexDisk/datafiles/ESS/ESS7e02_2.sav", to.data.frame = T, use.value.labels = T)
## re-encoding from UTF-8
data1 <- data[which(data$cntry == "United Kingdom"), ]
data1$trust <- as.numeric(data1$ppltrst) + as.numeric(data1$pplfair) + as.numeric(data1$pplhlp)
data1$agea <- as.numeric(as.character(data1$agea))
data1$eduyrs <- as.numeric(as.character(data1$eduyrs))
data1$hinctnta <- as.numeric(data1$hinctnta)
model1 <- lm(trust ~ gndr + agea + eduyrs + hinctnta, data = data1)
summary(model1)
##
## Call:
## lm(formula = trust ~ gndr + agea + eduyrs + hinctnta, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.0957 -3.0291 0.6789 3.5609 14.6338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.787387 0.658154 20.949 < 2e-16 ***
## gndrFemale 0.260595 0.235306 1.107 0.268232
## agea 0.053535 0.006842 7.824 8.44e-15 ***
## eduyrs 0.192335 0.033452 5.750 1.04e-08 ***
## hinctnta 0.155551 0.042447 3.665 0.000255 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.045 on 1878 degrees of freedom
## (381 observations deleted due to missingness)
## Multiple R-squared: 0.05182, Adjusted R-squared: 0.0498
## F-statistic: 25.66 on 4 and 1878 DF, p-value: < 2.2e-16
model1scaled <- lm(trust ~ gndr + scale(as.numeric(agea)) + scale(as.numeric(eduyrs)) +
scale(as.numeric(hinctnta)), data = data1)
summary(model1scaled)
##
## Call:
## lm(formula = trust ~ gndr + scale(as.numeric(agea)) + scale(as.numeric(eduyrs)) +
## scale(as.numeric(hinctnta)), data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.0957 -3.0291 0.6789 3.5609 14.6338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.9611 0.1718 116.180 < 2e-16 ***
## gndrFemale 0.2606 0.2353 1.107 0.268232
## scale(as.numeric(agea)) 0.9845 0.1258 7.824 8.44e-15 ***
## scale(as.numeric(eduyrs)) 0.7213 0.1255 5.750 1.04e-08 ***
## scale(as.numeric(hinctnta)) 0.4659 0.1271 3.665 0.000255 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.045 on 1878 degrees of freedom
## (381 observations deleted due to missingness)
## Multiple R-squared: 0.05182, Adjusted R-squared: 0.0498
## F-statistic: 25.66 on 4 and 1878 DF, p-value: < 2.2e-16
(это модель с нестандартизованными коэффициентами)
plot_model(model1)
(а это модель со стандартизированными)
plot_model(model1scaled)
model3 <- lm(trust ~ gndr * agea + eduyrs + hinctnta, data = data1)
summary(model3)
##
## Call:
## lm(formula = trust ~ gndr * agea + eduyrs + hinctnta, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.9156 -3.0017 0.6007 3.5088 15.0508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.846628 0.747470 19.863 < 2e-16 ***
## gndrFemale -1.793443 0.730738 -2.454 0.014207 *
## agea 0.033024 0.009714 3.400 0.000689 ***
## eduyrs 0.196375 0.033410 5.878 4.91e-09 ***
## hinctnta 0.151563 0.042381 3.576 0.000357 ***
## gndrFemale:agea 0.039129 0.013182 2.968 0.003032 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.034 on 1877 degrees of freedom
## (381 observations deleted due to missingness)
## Multiple R-squared: 0.05625, Adjusted R-squared: 0.05374
## F-statistic: 22.37 on 5 and 1877 DF, p-value: < 2.2e-16
plot_model(model3, type = "pred", terms = c("gndr", "agea"))
plot_model(model3, type = "pred", terms = c("agea", "gndr"))
model4 <- lm(trust ~ gndr + eduyrs + hinctnta * agea, data = data1)
summary(model4)
##
## Call:
## lm(formula = trust ~ gndr + eduyrs + hinctnta * agea, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.9518 -3.0212 0.6555 3.5648 14.7800
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.835093 0.869047 14.769 < 2e-16 ***
## gndrFemale 0.259734 0.235193 1.104 0.2696
## eduyrs 0.196538 0.033529 5.862 5.40e-09 ***
## hinctnta 0.352387 0.124812 2.823 0.0048 **
## agea 0.070688 0.012304 5.745 1.07e-08 ***
## hinctnta:agea -0.003932 0.002345 -1.677 0.0937 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.042 on 1877 degrees of freedom
## (381 observations deleted due to missingness)
## Multiple R-squared: 0.05324, Adjusted R-squared: 0.05072
## F-statistic: 21.11 on 5 and 1877 DF, p-value: < 2.2e-16
plot_model(model4, type = "pred", terms = c("hinctnta", "agea"))
plot_model(model4, type = "pred", terms = c("agea", "hinctnta"))
library(interplot)
interplot(m = model4, var1 = "agea", var2 = "hinctnta") +
# Add labels for X and Y axes
xlab("age") +
ylab("decile income")
model5 <- lm(trust ~ gndr + agea + I(agea^2) + eduyrs + hinctnta, data = data1)
summary(model5)
##
## Call:
## lm(formula = trust ~ gndr + agea + I(agea^2) + eduyrs + hinctnta,
## data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.9760 -3.0302 0.6261 3.5896 14.6789
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.7719748 1.0411476 15.149 < 2e-16 ***
## gndrFemale 0.2846908 0.2351953 1.210 0.2263
## agea -0.0397687 0.0385719 -1.031 0.3027
## I(agea^2) 0.0008952 0.0003642 2.458 0.0141 *
## eduyrs 0.1976162 0.0334760 5.903 4.22e-09 ***
## hinctnta 0.1720188 0.0429168 4.008 6.36e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.038 on 1877 degrees of freedom
## (381 observations deleted due to missingness)
## Multiple R-squared: 0.05486, Adjusted R-squared: 0.05234
## F-statistic: 21.79 on 5 and 1877 DF, p-value: < 2.2e-16
plot_model(model5, type = "pred", terms = c("agea"))
anova(model1, model5)
## Analysis of Variance Table
##
## Model 1: trust ~ gndr + agea + eduyrs + hinctnta
## Model 2: trust ~ gndr + agea + I(agea^2) + eduyrs + hinctnta
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 1878 47796
## 2 1877 47643 1 153.33 6.041 0.01407 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tab_model(model1, model3, model4, model5)
trust | trust | trust | trust | |||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
Predictors | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p | Estimates | CI | p |
(Intercept) | 13.79 | 12.50 – 15.08 | <0.001 | 14.85 | 13.38 – 16.31 | <0.001 | 12.84 | 11.13 – 14.54 | <0.001 | 15.77 | 13.73 – 17.81 | <0.001 |
Female | 0.26 | -0.20 – 0.72 | 0.268 | -1.79 | -3.23 – -0.36 | 0.014 | 0.26 | -0.20 – 0.72 | 0.270 | 0.28 | -0.18 – 0.75 | 0.226 |
agea | 0.05 | 0.04 – 0.07 | <0.001 | 0.03 | 0.01 – 0.05 | 0.001 | 0.07 | 0.05 – 0.09 | <0.001 | -0.04 | -0.12 – 0.04 | 0.303 |
eduyrs | 0.19 | 0.13 – 0.26 | <0.001 | 0.20 | 0.13 – 0.26 | <0.001 | 0.20 | 0.13 – 0.26 | <0.001 | 0.20 | 0.13 – 0.26 | <0.001 |
hinctnta | 0.16 | 0.07 – 0.24 | <0.001 | 0.15 | 0.07 – 0.23 | <0.001 | 0.35 | 0.11 – 0.60 | 0.005 | 0.17 | 0.09 – 0.26 | <0.001 |
gndrFemale:agea | 0.04 | 0.01 – 0.06 | 0.003 | |||||||||
hinctnta:agea | -0.00 | -0.01 – 0.00 | 0.094 | |||||||||
I(agea^2) | 0.00 | 0.00 – 0.00 | 0.014 | |||||||||
Observations | 1883 | 1883 | 1883 | 1883 | ||||||||
R2 / adjusted R2 | 0.052 / 0.050 | 0.056 / 0.054 | 0.053 / 0.051 | 0.055 / 0.052 |
не путайте зависимую и независимые переменные.
интерпретируйте результаты.
проверяйте, в чем измеряются переменные.
помните, что регрессия не устанавливает причинно-следственных связей.
график предельных эффектов показывает не взаимосвязь между независимыми переменными, а то, как эффект одной из переменных (той, которая расположена по оси y) меняется в зависимости от значений другой. Такой график нужно строить, когда в интерактивном эффекте участвуют две непрерывные переменные.
сравнение моделей ановой показывает не значим ли добавленный коэффициент, а улучшает ли добавленный коэффициент модель (так, коэффициент может быть значимым, но модель не улучшать, потому что он тратит слишком много степеней свободы)
dummy-переменные интерпретируются не как непрерывные. Задумывайтесь, нужно ли переменную включать в модель как dummy или как числовую. Переменная может быть закодирована числами, но при этом её нужно включить в модель как dummy.