Настройки чанков
Библиотеки, настройки чисел
Загрузка данных
# данные окт2019-янв2020
df_dirty = read_spss('C:/Users/ASUS/Downloads/r28i_os_42.sav') # Настя
# df_dirty = read_spss('r28i_os_42.sav')
Функция удаления выбросов
# функция удаления выбросов
delete_outliers = function(datafr, column){
lower = quantile(datafr[[column]], 1/4) - 1.5*IQR(datafr[[column]])
upper = quantile(datafr[[column]], 3/4) + 1.5*IQR(datafr[[column]])
datafr = filter(datafr, (datafr[[column]] < upper) & (datafr[[column]] > lower))
return(datafr)
}
Переименовываем
# переименовываем
df_data = df_dirty
df_data = df_data %>% rename("educ" = "x_diplom") %>% rename("otrasl" = "xj4.1") %>% rename("gender" = "xh5") %>% rename("hours_per_week" = "xj6.2") %>% rename("employees" = "xj13") %>% rename("wage" = "xj13.2") %>% rename("if_government" = "xj23") %>% rename("if_foreigners" = "xj24") %>% rename("if_private" = "xj25") %>% rename("if_yours" = "xj26") %>% rename("exper_y" = "xj161.3y") %>% rename("exper_month" = "xj161.3m") %>% rename("age" = "x_age") %>% rename("fam_status" = "x_marst") %>% rename("children_18" = "xj72.173") %>% rename("h_day" = "xj6.1a")
Выбираем нужные переменные
df_data = df_data %>% select(educ, otrasl, gender, hours_per_week, wage, if_government, if_foreigners, if_private, if_yours, exper_y, exper_month, age, employees, fam_status, children_18, region)
Чистка выбросов
# удаляем "Затрудняюсь ответить", "Отказ от ответа"
df_data = df_data %>% filter(educ < 9*10^6) %>% filter(otrasl < 9*10^6) %>% filter(gender < 9*10^6) %>% filter(hours_per_week < 9*10^6) %>% filter(wage < 9*10^6) %>% filter(if_government < 9*10^6) %>% filter(if_foreigners < 9*10^6) %>% filter(if_private < 9*10^6) %>% filter(if_yours < 9*10^6) %>% filter(exper_y < 9*10^6) %>% filter(exper_month < 9*10^6)%>% filter(age < 9*10^6) %>% filter(employees < 9*10^6) %>% filter(fam_status < 9*10^6) %>% filter(children_18 < 9*10^6)
# удаляем NaNы
df_data = df_data %>% na.omit()
Меняем формат
# приводим к правильному формату
df_data$educ = df_data$educ %>% as.factor()
df_data$otrasl = df_data$otrasl %>% as.factor()
df_data$gender = df_data$gender %>% as.factor()
df_data$fam_status = df_data$fam_status %>% as.factor()
df_data$region = df_data$region %>% as.factor()
df_data$if_government = df_data$if_government %>% as.factor()
df_data$if_foreigners = df_data$if_foreigners %>% as.factor()
df_data$if_private = df_data$if_private %>% as.factor()
df_data$if_yours = df_data$if_yours %>% as.factor()
df_data$employees = df_data$employees %>% as.integer()
df_data$wage = df_data$wage %>% as.numeric()
df_data$hours_per_week = df_data$hours_per_week %>% as.numeric()
df_data$exper_y = df_data$exper_y %>% as.numeric()
df_data$exper_month = df_data$exper_month %>% as.numeric()
df_data$age = df_data$age %>% as.numeric()
df_data$children_18 = df_data$children_18 %>% as.numeric()
df_data = df_data %>% filter(wage>0)%>% filter((gender == "1" & age <= 59) | (gender == "2" & age <= 54))
Создаем новые переменные
# создаем стаж
df_data = df_data %>% mutate(experience = exper_y + exper_month/12)
# создаем свою объясняющую переменную
df_data = df_data %>% mutate(skolko_gosva = ifelse(if_government == "2", "0", ifelse((if_private == "1")|(if_foreigners == "1")|(if_yours == "1"), "1", "2")))
df_data$skolko_gosva = df_data$skolko_gosva %>% as.factor()
# нет участия = 0, гос-во частично = 1, гос-во полностью = 2
# в опроснике 1 - да, 2 - нет
Создание таблицы “зарплата-по-отраслям”
# создаем df со всеми отраслями и нормальными названиями
otraslii = data.frame(names(attributes(df_dirty$xj4.1)$labels),
unname(attributes(df_dirty$xj4.1)$labels))
colnames(otraslii) = c("names", "ids")
otraslii$ids = otraslii$ids %>% as.factor()
# считаем среднюю зп ПО ВЫБОРКЕ
df_wage = df_data %>% group_by(otrasl) %>% summarise(kolvo = n(), av_wage = mean(wage)) %>% left_join(otraslii, by = c("otrasl" = "ids"))
# считаем среднюю зп ПО ВСЕМ ДАННЫМ
df_dirty_w = df_dirty %>% filter(!is.na(xj6.2)) %>% filter(xj6.2 < 9*10^6) %>% filter(xj13.2 < 9*10^6) %>% filter(xj4.1 < 9*10^6) %>% filter(xj13.2 > 0)
df_dirty_w = df_dirty_w %>% group_by(xj4.1) %>% summarise(kolvo_R = n(), av_wage_R = mean(xj13.2, na.rm = T))
df_dirty_w$xj4.1 = df_dirty_w$xj4.1 %>% as.factor()
df_dirty_w$av_wage_R = df_dirty_w$av_wage_R %>% as.numeric()
# соединяем
df_wage = df_wage %>% left_join(df_dirty_w, by = c("otrasl" = "xj4.1"))
rm(df_dirty_w, otraslii)
df_wage = df_wage[c("names", "otrasl", "kolvo", "av_wage", "kolvo_R", "av_wage_R")] %>% arrange(otrasl)
Присоединяем поотраслевые зп
df_data = df_data %>% left_join(select(df_wage, c(otrasl, av_wage_R)), by = "otrasl")
Создаем относительную переменную ЗП
df_data = df_data %>% mutate(wage_to_average=wage/av_wage_R)
df_data$wage_to_average = df_data$wage_to_average %>% as.numeric()
Таблица описательных статистик до удаления выбросов
# таблица описательных статистик
table_opis1 = describe(select(df_data, c(wage, wage_to_average, hours_per_week, age, experience, employees, children_18)), quant = c(0.25, 0.75), omit = T) %>% select(-c(vars, trimmed))
table_opis1 %>%
kable(caption = "Таблица характеристик числовых данных до удаления выбросов") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F, position = "center") %>%
column_spec(1, bold = T)
| n | mean | sd | median | mad | min | max | range | skew | kurtosis | se | Q0.25 | Q0.75 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wage | 1830 | 30558.633 | 20182.275 | 25000.000 | 14826.000 | 2600.000 | 180000.00 | 177400.00 | 2.209 | 7.386 | 471.785 | 18000.000 | 38000.00 |
| wage_to_average | 1830 | 1.045 | 0.629 | 0.888 | 0.466 | 0.092 | 5.33 | 5.24 | 1.975 | 5.965 | 0.015 | 0.629 | 1.28 |
| hours_per_week | 1830 | 43.370 | 11.845 | 40.000 | 5.930 | 3.000 | 140.00 | 137.00 | 1.617 | 6.214 | 0.277 | 40.000 | 48.00 |
| age | 1830 | 41.875 | 8.407 | 42.000 | 10.378 | 21.000 | 59.00 | 38.00 | 0.007 | -0.914 | 0.197 | 35.000 | 49.00 |
| experience | 1830 | 18.981 | 9.244 | 18.000 | 10.378 | 0.167 | 45.00 | 44.83 | 0.216 | -0.818 | 0.216 | 12.000 | 26.00 |
| employees | 1830 | 403.125 | 1808.440 | 50.000 | 60.787 | 1.000 | 40000.00 | 39999.00 | 12.490 | 200.773 | 42.274 | 15.000 | 150.00 |
| children_18 | 1830 | 0.993 | 0.931 | 1.000 | 1.483 | 0.000 | 7.00 | 7.00 | 1.046 | 2.660 | 0.022 | 0.000 | 2.00 |
# Создаем переменную skolko_gosva_g с более развернутыми подписями уровней (это понадобится для подписей делений на графиках)
df_data = df_data %>% mutate(skolko_gosva_g = ifelse(skolko_gosva == "2", "2 - полностью\nгосударственная", ifelse(skolko_gosva == "1", "1 - частичное\nучастие государства", "0 - полностью\nчастная")))
Столбчатый график “Распределение респондентов по полу”
ggplot(df_data)+
geom_bar(aes(x = skolko_gosva_g, fill = gender), color = "black", position = "fill")+
scale_y_continuous(labels = percent)+
labs(y = "Пол респондента",x = "Участие государства в капитале компании",
title = NULL)+
theme(plot.title = element_text(hjust = 0.5, size = 18),
plot.title.position = "plot",
plot.caption.position = "plot",
axis.title.x = element_text(size = 14,vjust = 0),
axis.title.y = element_text(size = 14),
axis.text.x = element_text(size=12),
axis.text.y = element_text(size=12),
legend.text = element_text(size = 14),
legend.title = element_text(size = 14))+
scale_fill_manual(values = c("lightblue3", "lightblue"),
name = "Пол",
labels = c("Мужской", "Женский"))+
theme(panel.background = element_rect(fill = "white"), # задний фон
panel.grid = element_line(color = "grey")) #линии на графике
# geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
# colour = "black", size = 0.5, fill = NA)
Рассеивания по возрасту
ggplot(df_data, aes(x = age, y = wage/1000))+
geom_point(color = "lightblue4", alpha = 0.4)+
geom_smooth(method = loess, se = F, formula = y ~ x)+
labs(x = "Возраст, лет",
y = "Среднемесячная заработная плата, тыс.руб.")+
theme(plot.caption = element_text(hjust = 0, face = "plain"),
plot.title.position = "plot",
plot.caption.position = "plot",
panel.background = element_rect(fill = "white"),
panel.grid = element_line(color = "grey"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Рассеивания по стажу
ggplot(df_data, aes(x = experience, y = wage/1000))+
geom_point(color = "lightblue4", alpha = 0.4)+
geom_smooth(method = loess, se = F, formula = y ~ x,color="blue")+
labs(x = "Стаж работы, лет",
y = "Среднемесячная заработная плата, тыс.руб.")+
theme(plot.caption = element_text(hjust = 0, face = "plain"),
plot.title.position = "plot",
plot.caption.position = "plot",
panel.background = element_rect(fill = "white"),
panel.grid = element_line(color = "grey"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Столбчатый график “Доля распондентов по уровню образования”
palet=colorRampPalette(c("lightblue4","lightblue"))
colors=palet(6)
ggplot(df_data)+
geom_bar(aes(x = skolko_gosva_g, fill = educ), color = "black", position = "fill")+
scale_y_continuous(labels = percent)+
labs(y = "Законченное образование",
x = "Участие государства в капитале компании")+
theme(plot.title = element_text(hjust = 0.5, size = 18),
plot.title.position = "plot",
plot.caption.position = "plot",
axis.title.x = element_text(size = 14,vjust = 0),
axis.title.y = element_text(size = 14),
axis.text.x = element_text(size=12),
axis.text.y = element_text(size=12),
legend.text = element_text(size = 14),
legend.title = element_text(size = 14))+
scale_fill_manual(values = colors,
name = "Законченное\nобразование",
labels = c("0-6 кл", "7-8 кл", "7-8 кл + что-то ещё", "среднее", "среднее специальное", "высшее и выше"))+
theme(panel.background = element_rect(fill = "white"), # задний фон
panel.grid = element_line(color = "grey")) #линии на графике
# geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
# colour = "black", size = 0.5, fill = NA)
Боксплоты заработной платы по отраслям
ggplot(df_data)+
geom_boxplot(aes(x = as.factor(otrasl), y = as.numeric(wage)/1000))+
labs(x = "Номер отрасли",
y = "Зарплата, тыс.руб.")+
theme(axis.title.x = element_text(size = 14), # заголовок X
axis.title.y = element_text(size = 14), # заголовок Y
panel.background = element_rect(fill = "white"), # задний фон
panel.grid = element_line(color = "grey"))+ #линии на графике
geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
colour = "black", size = 0.5, fill = NA)
Рассеивания по количеству часов работы в неделю
ggplot(df_data, aes(x = hours_per_week, y = wage/1000))+
geom_point(color = "lightblue4", alpha = 0.4)+
geom_smooth(method = loess, se = F, formula = y ~ x)+
labs(x = "Количество часов работы в неделю",
y = "Среднемесячная заработная плата, тыс. руб.")+
theme(plot.caption = element_text(hjust = 0, face = "plain"),
plot.title.position = "plot",
plot.caption.position = "plot",
panel.background = element_rect(fill = "white"),
panel.grid = element_line(color = "grey"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Боксплот зп до удаления выбросов по зп
ggplot(df_data)+
geom_boxplot(aes(x = wage/1000))+
xlab("Среднемесячная заработная плата индивида, тыс. руб.")+
theme(axis.text=element_text(size=10),
axis.title=element_text(size=14))+
theme(panel.background = element_rect(fill = "white"), # задний фон
panel.grid = element_line(color = "grey"))+ #линии на графике
geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
colour = "black", size = 0.5, fill = NA)
Рассеивания по количеству сотрудников
ggplot(df_data, aes(x = employees/1000, y = wage/1000))+
geom_point(color = "lightblue4", alpha = 0.4)+
geom_smooth(method = loess, se = F, formula = y ~ x)+
labs(x = "Количество сотрудников в компании, тыс. чел.",
y = "Среднемесячная заработная плата, тыс. руб.")+
theme(plot.caption = element_text(hjust = 0, face = "plain"),
plot.title.position = "plot",
plot.caption.position = "plot",
panel.background = element_rect(fill = "white"),
panel.grid = element_line(color = "grey"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Боксплот по количеству часов работы в неделю до удаления выбросов
ggplot(df_data)+
geom_boxplot(aes(x = hours_per_week))+
xlab("Количество часов работы в неделю")+
theme(axis.text=element_text(size=10),
axis.title=element_text(size=14))+
theme(panel.background = element_rect(fill = "white"), # задний фон
panel.grid = element_line(color = "grey"))+ #линии на графике
geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
colour = "black", size = 0.5, fill = NA)
slices <- c(874, 100, 607)
names <- c("полностью \nгосударственная\n","частичное участие \nгосударства\n", "полностью \nчастная\n")
pct <- round(slices/sum(slices)*100)
# lbls <- paste(lbls, pct) # add percents to labels
lbls <- paste(pct,"%",sep="") # ad % to labels
colors =palet(3)
pie(slices,
labels = lbls,
col = colors)
legend(1, 0.5, legend = names, cex=0.75, fill=colors, text.font = 3, )
title(main = "Распределение респондентов по занятости в компаниях с разной долей госучастия, %",
cex.main = 0.9,
font.main = 2)
Столбчатый график “Распределение skolko_gosva по отраслям”
colors=palet(3)
ggplot(df_data)+
geom_bar(aes(x = otrasl, fill = skolko_gosva), color = "black", position = "fill")+
scale_y_continuous(labels = percent)+
labs(y = "Участие государства в капитале компании",x = "Отрасль",
title = NULL)+
theme(plot.title = element_text(hjust = 0.5, size = 12),
plot.title.position = "plot",
plot.caption.position = "plot",
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))+
scale_fill_manual(values = colors,
name = "Участие\nгосударства",
labels = c("частные", "смешанные", "государственные"))+
theme(panel.background = element_rect(fill = "white"), # задний фон
panel.grid = element_line(color = "grey"))+ #линии на графике
geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
colour = "black", size = 0.5, fill = NA)
Рассеивания по логарифму количества сотрудников
ggplot(df_data, aes(x = log(employees), y = log(wage)))+
geom_point(color = "lightblue4", alpha = 0.4)+
geom_smooth(method = loess, se = F, formula = y ~ x)+
labs(x = "Количество сотрудников в компании, тыс. чел.",
y = "Среднемесячная заработная плата, тыс. руб.")+
theme(plot.caption = element_text(hjust = 0, face = "plain"),
plot.title.position = "plot",
plot.caption.position = "plot",
panel.background = element_rect(fill = "white"),
panel.grid = element_line(color = "grey"),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Удаляем выбросы внутри отраслей и по количеству часов работы в неделю
df=df_data
#==================== теперь df_data - датафрейм до удаления выбросов
#==================== теперь df - датафрейм после удаления выбросов
#удаление выбросов по количеству часов работы в неделю
df = df %>% filter(hours_per_week >= 20 & hours_per_week <= 60)
# удаление выбросов по зп внутри отраслей
X = df %>% group_split(otrasl)
for (i in 1:28){
X[[i]] = delete_outliers(X[[i]], "wage")
}
df = bind_rows(X[[1]], X[[2]])
for (i in 3:28){
df = bind_rows(df, X[[i]])
}
Таблица описательных статистик числовых переменных
table_opis2 = describe(select(df, c(wage, wage_to_average, hours_per_week, age, experience, employees, children_18)),
quant = c(0.25, 0.75), omit = T) %>% select(-c(vars, trimmed))
table_opis2 %>%
kable(caption = "Таблица характеристик числовых данных после удаления выбросов") %>%
kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F, position = "center") %>%
column_spec(1, bold = T)
| n | mean | sd | median | mad | min | max | range | skew | kurtosis | se | Q0.25 | Q0.75 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wage | 1581 | 26998.961 | 14086.026 | 25000.000 | 11860.800 | 4000.000 | 120000.00 | 116000.00 | 1.600 | 4.974 | 354.260 | 17000.000 | 35000.00 |
| wage_to_average | 1581 | 0.929 | 0.415 | 0.864 | 0.406 | 0.147 | 2.34 | 2.19 | 0.734 | 0.189 | 0.010 | 0.607 | 1.17 |
| hours_per_week | 1581 | 41.333 | 6.869 | 40.000 | 1.483 | 20.000 | 60.00 | 40.00 | 0.076 | 1.890 | 0.173 | 40.000 | 45.00 |
| age | 1581 | 41.827 | 8.449 | 42.000 | 10.378 | 21.000 | 59.00 | 38.00 | 0.014 | -0.936 | 0.212 | 35.000 | 49.00 |
| experience | 1581 | 19.091 | 9.310 | 18.167 | 10.749 | 0.167 | 42.00 | 41.83 | 0.194 | -0.891 | 0.234 | 11.833 | 26.42 |
| employees | 1581 | 373.765 | 1604.472 | 50.000 | 59.304 | 1.000 | 25000.00 | 24999.00 | 10.882 | 141.668 | 40.352 | 15.000 | 150.00 |
| children_18 | 1581 | 0.997 | 0.917 | 1.000 | 1.483 | 0.000 | 7.00 | 7.00 | 0.936 | 2.031 | 0.023 | 0.000 | 2.00 |
Проводим тесты между основной объясняющей переменной и контрольными, чтобы убедиться в правильности нашего интуитивного понимания механизмов.
aov(employees ~ skolko_gosva, data = df) %>% summary() # подтверждено для нашей выборки
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 44590361 22295181 8.75 0.00017 ***
## Residuals 1578 4022850219 2549335
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(experience ~ skolko_gosva, data = df) %>% summary() # подтверждено для нашей выборки
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 1572 786 9.16 0.00011 ***
## Residuals 1578 135365 86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(age ~ skolko_gosva, data = df) %>% summary() # подтверждено для нашей выборки
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 721 360 5.07 0.0064 **
## Residuals 1578 112063 71
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(hours_per_week ~ skolko_gosva, data = df) %>% summary() # подтверждено для нашей выборки
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 4686 2343 52.9 <0.0000000000000002 ***
## Residuals 1578 69855 44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(children_18 ~ skolko_gosva, data = df) %>% summary() # для нашей выборки не подтверждено
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 0 0.184 0.22 0.8
## Residuals 1578 1329 0.842
chisq.test(df$skolko_gosva, df$gender) #подтверждено для нашей выборки
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$gender
## X-squared = 77, df = 2, p-value <0.0000000000000002
chisq.test(df$skolko_gosva, df$educ) #подтверждено для нашей выборки
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$educ
## X-squared = 45, df = 10, p-value = 0.000002
chisq.test(df$skolko_gosva, df$otrasl) #подтверждено для нашей выборки
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$otrasl
## X-squared = 981, df = 52, p-value <0.0000000000000002
chisq.test(df$skolko_gosva, df$fam_status) #для нашей выборки не подтверждено
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$fam_status
## X-squared = 8, df = 10, p-value = 0.6
Итоговый дф
df_itog = df %>% select(wage_to_average, wage, skolko_gosva, gender, educ, age, experience, hours_per_week, employees, fam_status, children_18, otrasl)
write.csv(df, 'C:/Users/ASUS/Downloads/df_itog.csv')
Модель 1
# Эволюция моделей
model1_1 = lm(log(wage) ~ skolko_gosva, data = df)
model1_2 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week, data = df)
model1_3 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ, data = df)
model1_4 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees), data = df)
model1_5 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18, data = df)
model1_6 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl, data = df)
model1_7 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2), data = df)
model1_8 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender, data = df)
# ============== меняем базовые уровни в 7 и 8 моделях
df_1 <- within(df, skolko_gosva <- relevel(skolko_gosva, ref = 3))
model1_9 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2), data = df_1)
model1_10 = lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender, data = df_1)
# робастность стд ошибок
coef1_1 = coeftest(model1_1, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_2 = coeftest(model1_2, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_3 = coeftest(model1_3, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_4 = coeftest(model1_4, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_5 = coeftest(model1_5, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_6 = coeftest(model1_6, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_7 = coeftest(model1_7, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_8 = coeftest(model1_8, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_9 = coeftest(model1_9, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_10 = coeftest(model1_10, df = Inf, vcov. = vcovHC, type = "HC0")
stargazer(coef1_1, coef1_2, coef1_3, coef1_4, coef1_5, coef1_6, coef1_7, coef1_9, coef1_8, coef1_10, summary = F, type = "text")
##
## =========================================================================================================================
## Dependent variable:
## ---------------------------------------------------------------------------------------------------
##
## (1) (2) (3) (4) (5) (6) (7) (8) (9) (10)
## -------------------------------------------------------------------------------------------------------------------------
## skolko_gosva0 0.144*** 0.204***
## (0.030) (0.042)
##
## skolko_gosva1 0.019 0.022 0.013 -0.102** -0.093** -0.119*** -0.117*** 0.027 -0.102* 0.101
## (0.050) (0.050) (0.048) (0.048) (0.044) (0.044) (0.044) (0.047) (0.057) (0.064)
##
## skolko_gosva2 -0.230*** -0.198*** -0.229*** -0.242*** -0.195*** -0.145*** -0.144*** -0.204***
## (0.026) (0.027) (0.026) (0.024) (0.024) (0.030) (0.030) (0.042)
##
## experience 0.005*** 0.006*** 0.014*** 0.013*** 0.013*** 0.021*** 0.021*** 0.021*** 0.021***
## (0.001) (0.001) (0.002) (0.002) (0.002) (0.007) (0.007) (0.007) (0.007)
##
## hours_per_week 0.012*** 0.014*** 0.013*** 0.010*** 0.009*** 0.009*** 0.009*** 0.009*** 0.009***
## (0.002) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002)
##
## educ2 0.912*** 0.744*** 0.692*** 0.706*** 0.711*** 0.711*** 0.706*** 0.706***
## (0.128) (0.134) (0.124) (0.130) (0.129) (0.129) (0.128) (0.128)
##
## educ3 1.120*** 0.983*** 0.881*** 0.842*** 0.842*** 0.842*** 0.835*** 0.835***
## (0.062) (0.063) (0.064) (0.071) (0.071) (0.071) (0.071) (0.071)
##
## educ4 1.110*** 0.947*** 0.884*** 0.831*** 0.847*** 0.847*** 0.838*** 0.838***
## (0.042) (0.044) (0.048) (0.058) (0.059) (0.059) (0.059) (0.059)
##
## educ5 1.150*** 0.975*** 0.949*** 0.869*** 0.874*** 0.874*** 0.865*** 0.865***
## (0.042) (0.043) (0.046) (0.058) (0.059) (0.059) (0.059) (0.059)
##
## educ6 1.400*** 1.210*** 1.190*** 1.100*** 1.100*** 1.100*** 1.100*** 1.100***
## (0.037) (0.040) (0.042) (0.056) (0.057) (0.057) (0.057) (0.057)
##
## age -0.011*** -0.011*** -0.009*** 0.028 0.028 0.025 0.025
## (0.003) (0.003) (0.003) (0.018) (0.018) (0.018) (0.018)
##
## log(employees) 0.071*** 0.061*** 0.057*** 0.056*** 0.056*** 0.056*** 0.056***
## (0.007) (0.006) (0.007) (0.007) (0.007) (0.007) (0.007)
##
## gender2 -0.278*** -0.251*** -0.266*** -0.266*** -0.294*** -0.191***
## (0.025) (0.026) (0.026) (0.026) (0.030) (0.047)
##
## fam_status2 -0.119** -0.127** -0.129** -0.129** -0.130** -0.130**
## (0.054) (0.053) (0.052) (0.052) (0.052) (0.052)
##
## fam_status3 -0.147** -0.155*** -0.155*** -0.155*** -0.157*** -0.157***
## (0.059) (0.058) (0.057) (0.057) (0.057) (0.057)
##
## fam_status4 -0.047 -0.072 -0.078 -0.078 -0.081 -0.081
## (0.062) (0.061) (0.060) (0.060) (0.060) (0.060)
##
## fam_status5 -0.028 -0.072 -0.067 -0.067 -0.066 -0.066
## (0.078) (0.076) (0.075) (0.075) (0.077) (0.077)
##
## fam_status6 -0.088 -0.071 -0.090 -0.090 -0.101 -0.101
## (0.151) (0.159) (0.165) (0.165) (0.170) (0.170)
##
## children_18 0.029** 0.027** 0.012 0.012 0.012 0.012
## (0.015) (0.014) (0.014) (0.014) (0.014) (0.014)
##
## otrasl2 0.068 0.070 0.070 0.061 0.061
## (0.063) (0.062) (0.062) (0.062) (0.062)
##
## otrasl3 0.055 0.067 0.067 0.068 0.068
## (0.076) (0.079) (0.079) (0.079) (0.079)
##
## otrasl4 0.333*** 0.350*** 0.350*** 0.347*** 0.347***
## (0.083) (0.081) (0.081) (0.081) (0.081)
##
## otrasl5 0.098* 0.107* 0.107* 0.099* 0.099*
## (0.059) (0.060) (0.060) (0.059) (0.059)
##
## otrasl6 0.245*** 0.246*** 0.246*** 0.237*** 0.237***
## (0.059) (0.058) (0.058) (0.058) (0.058)
##
## otrasl7 0.145*** 0.150*** 0.150*** 0.146*** 0.146***
## (0.054) (0.053) (0.053) (0.053) (0.053)
##
## otrasl8 -0.159** -0.141* -0.141* -0.149** -0.149**
## (0.073) (0.074) (0.074) (0.075) (0.075)
##
## otrasl9 0.072 0.073 0.073 0.057 0.057
## (0.085) (0.086) (0.086) (0.087) (0.087)
##
## otrasl10 -0.046 -0.036 -0.036 -0.061 -0.061
## (0.052) (0.052) (0.052) (0.054) (0.054)
##
## otrasl11 0.056 0.059 0.059 0.044 0.044
## (0.078) (0.079) (0.079) (0.078) (0.078)
##
## otrasl12 0.105** 0.112** 0.112** 0.092* 0.092*
## (0.052) (0.052) (0.052) (0.052) (0.052)
##
## otrasl13 0.080 0.078 0.078 0.087 0.087
## (0.074) (0.073) (0.073) (0.073) (0.073)
##
## otrasl14 0.102** 0.108** 0.108** 0.106** 0.106**
## (0.044) (0.044) (0.044) (0.043) (0.043)
##
## otrasl15 0.153** 0.163** 0.163** 0.157** 0.157**
## (0.068) (0.068) (0.068) (0.069) (0.069)
##
## otrasl16 0.216*** 0.226*** 0.226*** 0.219*** 0.219***
## (0.072) (0.069) (0.069) (0.070) (0.070)
##
## otrasl17 -0.179*** -0.171*** -0.171*** -0.170*** -0.170***
## (0.056) (0.055) (0.055) (0.055) (0.055)
##
## otrasl18 0.357 0.379* 0.379* 0.383* 0.383*
## (0.233) (0.221) (0.221) (0.218) (0.218)
##
## otrasl20 0.056 0.051 0.051 0.028 0.028
## (0.116) (0.116) (0.116) (0.119) (0.119)
##
## otrasl21 0.387*** 0.405*** 0.405*** 0.399*** 0.399***
## (0.126) (0.127) (0.127) (0.126) (0.126)
##
## otrasl23 0.378*** 0.416** 0.416** 0.424** 0.424**
## (0.141) (0.166) (0.166) (0.178) (0.178)
##
## otrasl24 -0.015 0.060 0.060 0.053 0.053
## (0.174) (0.182) (0.182) (0.191) (0.191)
##
## otrasl25 0.217** 0.211** 0.211** 0.210** 0.210**
## (0.094) (0.097) (0.097) (0.105) (0.105)
##
## otrasl26 -0.032 -0.040 -0.040 -0.064 -0.064
## (0.123) (0.124) (0.124) (0.130) (0.130)
##
## otrasl27 0.533** 0.543** 0.543** 0.530** 0.530**
## (0.263) (0.250) (0.250) (0.246) (0.246)
##
## otrasl29 -0.482*** -0.472*** -0.472*** -0.462*** -0.462***
## (0.042) (0.042) (0.042) (0.042) (0.042)
##
## otrasl30 0.120 0.126 0.126 0.117 0.117
## (0.116) (0.115) (0.115) (0.114) (0.114)
##
## I(experience2) -0.0002 -0.0002 -0.0002 -0.0002
## (0.0002) (0.0002) (0.0002) (0.0002)
##
## I(age2) -0.0005** -0.0005** -0.0004* -0.0004*
## (0.0002) (0.0002) (0.0002) (0.0002)
##
## skolko_gosva0:gender2 -0.103**
## (0.052)
##
## skolko_gosva1:gender2 -0.027 -0.130
## (0.082) (0.089)
##
## skolko_gosva2:gender2 0.103**
## (0.052)
##
## Constant 10.200*** 9.570*** 8.250*** 8.520*** 8.960*** 8.900*** 8.140*** 8.000*** 8.230*** 8.020***
## (0.017) (0.094) (0.056) (0.102) (0.133) (0.130) (0.351) (0.351) (0.353) (0.352)
##
## =========================================================================================================================
## =========================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Модель 2
model2_1 = lm(wage_to_average ~ skolko_gosva + experience + hours_per_week + educ+ age + log(employees) + gender + fam_status + children_18 + I(experience^2) + I(age^2), data = df)
model2_2 = lm(wage_to_average ~ skolko_gosva + experience + hours_per_week + educ+ age + log(employees) + gender + fam_status + children_18 + I(experience^2) + I(age^2) + skolko_gosva:gender, data = df)
# =================== меняем базовый уровень skolko_gosva
model2_3 = lm(wage_to_average ~ skolko_gosva + experience + hours_per_week + educ+ age + log(employees) + gender + fam_status + children_18 + I(experience^2) + I(age^2), data = df_1)
model2_4 = lm(wage_to_average ~ skolko_gosva + experience + hours_per_week + educ+ age + log(employees) + gender + fam_status + children_18 + I(experience^2) + I(age^2) + skolko_gosva:gender, data = df_1)
#==================== робастность стд ошибок
coef2_1 = coeftest(model2_1, df = Inf, vcov. = vcovHC, type = "HC0")
coef2_2 = coeftest(model2_2, df = Inf, vcov. = vcovHC, type = "HC0")
coef2_3 = coeftest(model2_3, df = Inf, vcov. = vcovHC, type = "HC0")
coef2_4 = coeftest(model2_4, df = Inf, vcov. = vcovHC, type = "HC0")
stargazer(coef2_1, coef2_3, coef2_2, coef2_4, summary = F, type = "text")
##
## =============================================================
## Dependent variable:
## ---------------------------------------
##
## (1) (2) (3) (4)
## -------------------------------------------------------------
## skolko_gosva0 0.091*** 0.182***
## (0.020) (0.036)
##
## skolko_gosva1 -0.134*** -0.043 -0.131** 0.052
## (0.040) (0.041) (0.059) (0.063)
##
## skolko_gosva2 -0.091*** -0.182***
## (0.020) (0.036)
##
## experience 0.020*** 0.020*** 0.020*** 0.020***
## (0.006) (0.006) (0.006) (0.006)
##
## hours_per_week 0.007*** 0.007*** 0.007*** 0.007***
## (0.001) (0.001) (0.001) (0.001)
##
## educ2 0.279*** 0.279*** 0.263*** 0.263***
## (0.094) (0.094) (0.094) (0.094)
##
## educ3 0.392*** 0.392*** 0.370*** 0.370***
## (0.049) (0.049) (0.049) (0.049)
##
## educ4 0.348*** 0.348*** 0.324*** 0.324***
## (0.037) (0.037) (0.037) (0.037)
##
## educ5 0.386*** 0.386*** 0.360*** 0.360***
## (0.037) (0.037) (0.038) (0.038)
##
## educ6 0.588*** 0.588*** 0.562*** 0.562***
## (0.035) (0.035) (0.036) (0.036)
##
## age 0.011 0.011 0.008 0.008
## (0.015) (0.015) (0.015) (0.015)
##
## log(employees) 0.036*** 0.036*** 0.036*** 0.036***
## (0.006) (0.006) (0.006) (0.006)
##
## gender2 -0.189*** -0.189*** -0.233*** -0.094***
## (0.022) (0.022) (0.027) (0.035)
##
## fam_status2 -0.077* -0.077* -0.081* -0.081*
## (0.045) (0.045) (0.046) (0.046)
##
## fam_status3 -0.124** -0.124** -0.130*** -0.130***
## (0.049) (0.049) (0.050) (0.050)
##
## fam_status4 -0.047 -0.047 -0.054 -0.054
## (0.053) (0.053) (0.053) (0.053)
##
## fam_status5 -0.047 -0.047 -0.048 -0.048
## (0.068) (0.068) (0.069) (0.069)
##
## fam_status6 -0.080 -0.080 -0.096 -0.096
## (0.165) (0.165) (0.174) (0.174)
##
## children_18 0.014 0.014 0.014 0.014
## (0.013) (0.013) (0.013) (0.013)
##
## I(experience2) -0.0002* -0.0002* -0.0003* -0.0003*
## (0.0001) (0.0001) (0.0001) (0.0001)
##
## I(age2) -0.0002 -0.0002 -0.0002 -0.0002
## (0.0002) (0.0002) (0.0002) (0.0002)
##
## skolko_gosva0:gender2 -0.139***
## (0.043)
##
## skolko_gosva1:gender2 -0.006 -0.146*
## (0.079) (0.082)
##
## skolko_gosva2:gender2 0.139***
## (0.043)
##
## Constant -0.051 -0.142 0.056 -0.126
## (0.292) (0.291) (0.294) (0.293)
##
## =============================================================
## =============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01