Настройки чанков
Библиотеки, настройки чисел
Загрузка данных
# data: Oct 2019 - Jan 2020
# Keep exactly one read_spss() call active; the others are per-teammate paths.
df_dirty <- read_spss("C:/Users/ASUS/Downloads/r28i_os_42.sav") # Nastya
# df_dirty <- read_spss("r28i_os_42.sav")
# df_dirty <- read_spss("D:/Documents/codes_R/metrika_proj/r28i_os_42_new.sav") # Anton
# df_dirty <- read_spss("~/r28i_os_42_new.sav") # Dima
Функции для поиска
Функция удаления выбросов
# Drop rows whose value in `column` lies outside the Tukey fences
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR); both bounds are exclusive.
#
# datafr: data frame or tibble.
# column: name (string) of a numeric column of `datafr`.
# Returns `datafr` restricted to the rows strictly inside the fences.
# Rows where the value is NA are dropped, and NAs are ignored when the
# quartiles are computed (previously any NA made quantile() fail).
delete_outliers <- function(datafr, column) {
  stopifnot(column %in% names(datafr))
  values <- datafr[[column]]
  # Both quartiles in one call; their difference is the IQR.
  quartiles <- quantile(values, c(1 / 4, 3 / 4), na.rm = TRUE, names = FALSE)
  whisker <- 1.5 * (quartiles[2] - quartiles[1])
  lower <- quartiles[1] - whisker
  upper <- quartiles[2] + whisker
  # which() discards NA comparisons; base subsetting avoids depending on
  # which package's filter() happens to be attached.
  keep <- which(values < upper & values > lower)
  datafr[keep, , drop = FALSE]
}
Переименовываем
# Give the survey columns readable names (new_name = "original_name").
df_data <- df_dirty %>%
  rename(
    lvl_educ = "x_diplom",
    otrasl = "xj4.1",
    gender = "xh5",
    hours_per_week = "xj6.2",
    employees = "xj13",
    wage = "xj13.2",
    if_government = "xj23",
    if_foreigners = "xj24",
    if_private = "xj25",
    if_yours = "xj26",
    exper_y = "xj161.3y",
    exper_month = "xj161.3m",
    age = "x_age",
    fam_status = "x_marst",
    children_18 = "xj72.173",
    h_day = "xj6.1a"
  )
Выбираем нужные переменные
# Keep only the variables used in the analysis.
df_data <- select(
  df_data,
  lvl_educ, otrasl, gender, hours_per_week, wage,
  if_government, if_foreigners, if_private, if_yours,
  exper_y, exper_month, age, employees, fam_status, children_18, region
)
Удаление неинформативных ответов и пропусков
# Remove "hard to say" / "refused to answer" responses: keep only rows where
# every listed variable is below 9 * 10^6.
max_valid <- 9 * 10^6
df_data <- filter(
  df_data,
  lvl_educ < max_valid,
  otrasl < max_valid,
  gender < max_valid,
  hours_per_week < max_valid,
  wage < max_valid,
  if_government < max_valid,
  if_foreigners < max_valid,
  if_private < max_valid,
  if_yours < max_valid,
  exper_y < max_valid,
  exper_month < max_valid,
  age < max_valid,
  employees < max_valid,
  fam_status < max_valid,
  children_18 < max_valid
)
# Drop rows with any remaining missing value.
df <- na.omit(df_data)
Боксплот зарплат по отраслям
# Wage (thousand roubles) by industry code, drawn from df_data.
ggplot(data = df_data) +
  geom_boxplot(
    mapping = aes(x = as.factor(otrasl), y = as.numeric(wage) / 1000)
  ) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  xlab("Номер отрасли") +
  ylab("Зарплата, тыс.руб.") +
  theme(
    axis.title.x = element_text(size = 14), # x-axis title
    axis.title.y = element_text(size = 14), # y-axis title
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Меняем формат
# Cast every column to the type used in the analysis.
factor_cols <- c(
  "lvl_educ", "otrasl", "gender", "fam_status", "region",
  "if_government", "if_foreigners", "if_private", "if_yours"
)
numeric_cols <- c(
  "wage", "hours_per_week", "exper_y", "exper_month", "age", "children_18"
)
df[factor_cols] <- lapply(df[factor_cols], as.factor)
df$employees <- as.integer(df$employees)
df[numeric_cols] <- lapply(df[numeric_cols], as.numeric)
Создаем новые переменные
# Derived variables, all added in one mutate():
#   experience   - tenure in years (years + months / 12)
#   educ         - higher-education dummy: "1" when lvl_educ == "6", else "0"
#   skolko_gosva - main explanatory variable:
#                  0 = no state participation, 1 = partially state-owned,
#                  2 = fully state-owned (questionnaire coding: 1 = yes, 2 = no)
#   log_wage     - natural log of the wage
df <- df %>%
  mutate(
    experience = exper_y + exper_month / 12,
    educ = as.factor(ifelse(lvl_educ == "6", "1", "0")),
    skolko_gosva = as.factor(
      ifelse(
        if_government == "2",
        "0",
        ifelse(
          (if_private == "1") | (if_foreigners == "1") | (if_yours == "1"),
          "1",
          "2"
        )
      )
    ),
    log_wage = as.numeric(log(wage))
  )
Создание таблицы “зарплата-по-отраслям”
# Build df_wage: one row per industry with its label, the respondent count and
# mean wage in the cleaned sample (kolvo, av_wage) and in the full survey
# (kolvo_R, av_wage_R).

# Industry code -> readable label, taken from the value labels of xj4.1.
otraslii <- data.frame(
  names = names(attributes(df_dirty$xj4.1)$labels),
  ids = as.factor(unname(attributes(df_dirty$xj4.1)$labels))
)

# Mean wage within the cleaned SAMPLE.
df_wage <- df %>%
  group_by(otrasl) %>%
  summarise(kolvo = n(), av_wage = mean(wage)) %>%
  left_join(otraslii, by = c("otrasl" = "ids"))

# Mean wage over ALL survey data (only non-response codes removed).
df_dirty_w <- df_dirty %>%
  filter(
    !is.na(xj6.2),
    xj6.2 < 9 * 10^6,
    xj13.2 < 9 * 10^6,
    xj4.1 < 9 * 10^6
  ) %>%
  group_by(xj4.1) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  summarise(kolvo_R = n(), av_wage_R = mean(xj13.2, na.rm = TRUE))
df_dirty_w$xj4.1 <- as.factor(df_dirty_w$xj4.1)
df_dirty_w$av_wage_R <- as.numeric(df_dirty_w$av_wage_R)

# Combine both summaries and drop the temporaries.
df_wage <- left_join(df_wage, df_dirty_w, by = c("otrasl" = "xj4.1"))
rm(df_dirty_w, otraslii)
df_wage <- df_wage[c("names", "otrasl", "kolvo", "av_wage", "kolvo_R", "av_wage_R")] %>%
  arrange(otrasl)
Присоединяем поотраслевые зп
# Attach the all-data industry mean wage (av_wage_R) to every respondent.
industry_wage <- df_wage %>% select(otrasl, av_wage_R)
df <- left_join(df, industry_wage, by = "otrasl")
Создаем относительную переменную ЗП
# Relative wage: individual wage divided by the industry mean wage.
df <- df %>%
  mutate(wage_to_average = as.numeric(wage / av_wage_R))
Начало чистки и таблица описательных статистик
# Restrict by age and weekly hours.
# Keep men aged <= 59 and women aged <= 54 (original note: some jobs are not
# open to people past working age; age and health also play a role).
df <- df %>%
  filter((gender == "1" & age <= 59) | (gender == "2" & age <= 54))
df <- df %>%
  filter(hours_per_week >= 20 & hours_per_week <= 60)
# Keep strictly positive wages only; the per-industry wage outliers are removed
# in a separate step further down.
df <- df %>% filter(wage > 0)
# Descriptive statistics of the numeric variables (before outlier removal).
table_opis1 <- describe(
  select(df, c(wage, wage_to_average, hours_per_week, age, experience, employees, children_18)),
  quant = c(0.25, 0.75),
  omit = TRUE # TRUE rather than T: T can be reassigned
) %>%
  select(-c(vars, trimmed))
view(table_opis1)
# write.csv(table_opis1, "table_opis_before.csv")
# stargazer(table_opis1, type = "latex", summary = F)
Боксплот зп до удаления выбросов по зп
# Wage (thousand roubles) before the per-industry outlier removal.
ggplot(data = df) +
  geom_boxplot(mapping = aes(x = wage / 1000)) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  labs(x = "Среднемесячная заработная плата индивида, тыс.руб.") +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Удаляем выбросы внутри отраслей
# Remove wage outliers separately inside every industry.
# The number of industries is taken from the data instead of being hard-coded
# (the previous loops assumed exactly 28 groups and failed or dropped groups
# for any other count).
X <- lapply(group_split(df, otrasl), delete_outliers, column = "wage")
df <- bind_rows(X)
Выбор “полезных” переменных
# Keep only the variables that enter the plots and models.
df <- select(
  df,
  wage, wage_to_average, skolko_gosva, otrasl, educ, gender,
  fam_status, children_18, hours_per_week, experience, age, employees
)
# write.csv(df, 'D:/Documents/codes_R/metrika_proj/df_itog.csv')
Таблица описательных статистик
# Descriptive statistics of the numeric variables after outlier removal,
# rendered as a styled HTML table.
# TRUE/FALSE are spelled out: T and F are ordinary variables that can be
# reassigned.
table_opis1 <- describe(
  select(df, c(wage, wage_to_average, hours_per_week, age, experience, employees, children_18)),
  quant = c(0.25, 0.75),
  omit = TRUE
) %>%
  select(-c(vars, trimmed))
table_opis1 %>%
  kable(caption = "Таблица характеристик числовых данных после удаления выбросов") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "responsive"),
    full_width = FALSE,
    position = "center"
  ) %>%
  column_spec(1, bold = TRUE)
| n | mean | sd | median | mad | min | max | range | skew | kurtosis | se | Q0.25 | Q0.75 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| wage | 1581 | 26998.961 | 14086.026 | 25000.000 | 11860.800 | 4000.000 | 120000.00 | 116000.0 | 1.600 | 4.974 | 354.260 | 17000.000 | 35000.00 |
| wage_to_average | 1581 | 0.938 | 0.419 | 0.874 | 0.413 | 0.152 | 2.35 | 2.2 | 0.739 | 0.201 | 0.011 | 0.614 | 1.18 |
| hours_per_week | 1581 | 41.333 | 6.869 | 40.000 | 1.483 | 20.000 | 60.00 | 40.0 | 0.076 | 1.890 | 0.173 | 40.000 | 45.00 |
| age | 1581 | 41.827 | 8.449 | 42.000 | 10.378 | 21.000 | 59.00 | 38.0 | 0.014 | -0.936 | 0.212 | 35.000 | 49.00 |
| experience | 1581 | 19.091 | 9.310 | 18.167 | 10.749 | 0.167 | 42.00 | 41.8 | 0.194 | -0.891 | 0.234 | 11.833 | 26.42 |
| employees | 1581 | 373.765 | 1604.472 | 50.000 | 59.304 | 1.000 | 25000.00 | 24999.0 | 10.882 | 141.668 | 40.352 | 15.000 | 150.00 |
| children_18 | 1581 | 0.997 | 0.917 | 1.000 | 1.483 | 0.000 | 7.00 | 7.0 | 0.936 | 2.031 | 0.023 | 0.000 | 2.00 |
# write.csv(table_opis1, "D:/Documents/codes_R/metrika_proj/table_opis1.csv")
# stargazer(table_opis1, type = "latex", summary = F)
Проводим тесты между основной объясняющей переменной и контрольными, чтобы убедиться в правильности нашего интуитивного понимания механизмов.
# One-way ANOVA (numeric controls) and chi-squared tests (categorical controls)
# against skolko_gosva; the trailing note records whether the control is kept.
summary(aov(employees ~ skolko_gosva, data = df)) # keep
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 44590361 22295181 8.75 0.00017 ***
## Residuals 1578 4022850219 2549335
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(experience ~ skolko_gosva, data = df)) # keep
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 1572 786 9.16 0.00011 ***
## Residuals 1578 135365 86
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(age ~ skolko_gosva, data = df)) # keep
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 721 360 5.07 0.0064 **
## Residuals 1578 112063 71
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(hours_per_week ~ skolko_gosva, data = df)) # keep
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 4686 2343 52.9 <0.0000000000000002 ***
## Residuals 1578 69855 44
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(children_18 ~ skolko_gosva, data = df)) # drop
## Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva 2 0 0.184 0.22 0.8
## Residuals 1578 1329 0.842
chisq.test(x = df$skolko_gosva, y = df$gender) # keep
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$gender
## X-squared = 77, df = 2, p-value <0.0000000000000002
chisq.test(x = df$skolko_gosva, y = df$educ) # keep
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$educ
## X-squared = 19, df = 2, p-value = 0.00006
chisq.test(x = df$skolko_gosva, y = df$otrasl) # keep
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$otrasl
## X-squared = 981, df = 52, p-value <0.0000000000000002
chisq.test(x = df$skolko_gosva, y = df$fam_status) # drop
##
## Pearson's Chi-squared test
##
## data: df$skolko_gosva and df$fam_status
## X-squared = 8, df = 10, p-value = 0.6
Столбчатый график “Доля респондентов с высшим образованием”
# Stacked 100% bars: composition by educ within each skolko_gosva group.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = skolko_gosva, fill = educ),
    colour = "black",
    position = "fill"
  ) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    name = "Наличие\nвысшего\nобразования",
    values = c("gray", "lightblue"),
    labels = c("Нет", "Есть")
  ) +
  xlab("Участие государства в капитале компании") +
  ylab("Доля респондентов с высшим образованием") +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Столбчатый график “Доля респондентов, находящихся в браке”
# if_married is "1" when fam_status is "2" or "6", otherwise "0".
df <- df %>%
  mutate(
    if_married = as.factor(
      ifelse(fam_status == "2" | fam_status == "6", "1", "0")
    )
  )
# Stacked 100% bars: composition by if_married within each skolko_gosva group.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = skolko_gosva, fill = if_married),
    colour = "black",
    position = "fill"
  ) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    name = "В браке",
    values = c("gray", "lightblue"),
    labels = c("Нет", "Да")
  ) +
  labs(
    title = NULL,
    x = "Участие государства в капитале компании",
    y = "Доля респондентов по семейному статусу"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Столбчатый график “Распределение респондентов по полу”
# Stacked 100% bars: composition by gender within each skolko_gosva group.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = skolko_gosva, fill = gender),
    colour = "black",
    position = "fill"
  ) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    name = "Пол",
    values = c("gray", "lightblue"),
    labels = c("Мужской", "Женский")
  ) +
  labs(
    title = NULL,
    x = "Участие государства в капитале компании",
    y = "Пол респондента"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Столбчатый график “Распределение skolko_gosva по отраслям”
# Stacked 100% bars: composition by skolko_gosva within each industry.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = otrasl, fill = skolko_gosva),
    colour = "black",
    position = "fill"
  ) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    name = "Участие\nгосударства",
    values = c("lightblue", "skyblue3", "dodgerblue4"),
    labels = c("частные", "смешанные", "государственные")
  ) +
  labs(
    title = NULL,
    x = "Отрасль",
    y = "Участие государства в капитале компании"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Плотность 3 графика по относительной зп{скачали}
# Density of the relative wage, one facet per skolko_gosva level.
ggplot(data = df) +
  geom_density(mapping = aes(x = wage_to_average)) +
  geom_rect( # black frame around every panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  facet_wrap(vars(skolko_gosva), labeller = label_both) +
  labs(
    title = "Плотность относительной зп по факту участия гос-ва",
    x = "Отношение средней заработной платы индивида к средней зп по отрасли",
    y = "Плотность"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 11),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Плотность заработной платы
# Frame layer and theme shared by both density plots below.
wage_density_style <- list(
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ),
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
)
# Density of the wage in thousand roubles.
plot1 <- ggplot(data = df) +
  geom_density(mapping = aes(x = wage / 1000)) +
  labs(
    x = "Cреднемесячная заработная плата респондента, тыс.руб",
    y = "Плотность"
  ) +
  wage_density_style
# Density of the log wage.
plot2 <- ggplot(data = df) +
  geom_density(mapping = aes(x = log(wage))) +
  labs(
    x = "Логарифм среднемесячной заработной платы респондента",
    y = ""
  ) +
  wage_density_style
# Show both side by side.
ggarrange(plot1, plot2, ncol = 2)
Плотность распределения количества сотрудников
# Density of the number of employees in the respondent's company.
plot3 <- ggplot(data = df) +
  geom_density(mapping = aes(x = employees)) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  xlab("Количество сотрудников в компании, чел.") +
  ylab("Плотность") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
# The plot was only assigned and never used, so nothing was rendered for this
# section; print it explicitly.
plot3
Плотность распределения относительной заработной платы
# Density of the relative wage over the whole sample.
ggplot(data = df) +
  geom_density(mapping = aes(x = wage_to_average)) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  labs(
    x = "Отношение среднемесячной зарплаты респондента к средней по отрасли",
    y = "Плотность"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Боксплот 3 коробки, отношение средней заработной платы индивида к средней зп по отрасли по skolko_gosva{скачали}
# Relative wage by skolko_gosva level (one box per level).
ggplot(data = df) +
  geom_boxplot(mapping = aes(x = skolko_gosva, y = wage_to_average)) +
  geom_rect( # black frame around the panel
    mapping = aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf),
    colour = "black", size = 0.5, fill = NA
  ) +
  labs(
    x = "Участие государства",
    y = "Отношение средней зарплаты индивида к средней зарплате по отрасли"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align to the whole plot
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"), # background
    panel.grid = element_line(color = "grey") # grid lines
  )
Рассеивания по стажу{скачали}
# Log wage against experience with a loess trend line (no confidence band).
ggplot(data = df, mapping = aes(x = experience, y = log(wage))) +
  geom_point(colour = "red", alpha = 0.4) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  xlab("Стаж работы, лет") +
  ylab("Логарифм среднемесячной заработной платы") +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
Рассеивания по возрасту
# Log wage against age with a loess trend line (no confidence band).
# y is log(wage): the axis label and the sibling scatter plots all use the log
# wage, while this plot previously drew the raw wage under the same label.
ggplot(data = df, mapping = aes(x = age, y = log(wage))) +
  geom_point(colour = "red", alpha = 0.4) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  xlab("Возраст, лет") +
  ylab("Логарифм среднемесячной заработной платы") +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
Рассеивания по количеству сотрудников
# Log wage against company size with a loess trend line (no confidence band).
ggplot(data = df, mapping = aes(x = employees, y = log(wage))) +
  geom_point(colour = "red", alpha = 0.4) +
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  xlab("Количество сотрудников в компании, чел.") +
  ylab("Логарифм среднемесячной заработной платы") +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )
Итоговый дф
# Final data set: outcome variables first, then the explanatory variables.
df_itog <- select(
  df,
  wage_to_average, wage, skolko_gosva, gender, educ, age, experience,
  hours_per_week, employees, fam_status, children_18, otrasl
)
# write.csv(df_itog, "D:/Documents/codes_R/metrika_proj/df_itog.csv")
Модель 1
# Model evolution: log wage regressed on state participation, with controls
# added step by step. Each specification extends the previous one through
# update(), so the nesting is explicit and no term can be lost by copy-paste
# (this also removes the stray empty argument the second lm() call had).
model1_1 <- lm(log(wage) ~ skolko_gosva, data = df)
model1_2 <- update(model1_1, . ~ . + experience + hours_per_week)
model1_3 <- update(model1_2, . ~ . + educ)
model1_4 <- update(model1_3, . ~ . + age + log(employees))
model1_5 <- update(model1_4, . ~ . + gender + fam_status + children_18)
model1_6 <- update(model1_5, . ~ . + otrasl)
model1_7 <- update(model1_6, . ~ . + I(experience^2) + I(age^2))
model1_8 <- update(model1_7, . ~ . + skolko_gosva:gender)

# Models 7 and 8 refitted with level "2" of skolko_gosva as the base level.
# The level is named explicitly instead of relying on it being the third one.
df_1 <- df
df_1$skolko_gosva <- relevel(df_1$skolko_gosva, ref = "2")
model1_9 <- update(model1_7, data = df_1)
model1_10 <- update(model1_8, data = df_1)

# Coefficient tests with the HC0 covariance estimator; df = Inf is passed
# through to coeftest() unchanged.
hc0_coeftest <- function(model) {
  coeftest(model, df = Inf, vcov. = vcovHC, type = "HC0")
}
coef1_1 <- hc0_coeftest(model1_1)
coef1_2 <- hc0_coeftest(model1_2)
coef1_3 <- hc0_coeftest(model1_3)
coef1_4 <- hc0_coeftest(model1_4)
coef1_5 <- hc0_coeftest(model1_5)
coef1_6 <- hc0_coeftest(model1_6)
coef1_7 <- hc0_coeftest(model1_7)
coef1_8 <- hc0_coeftest(model1_8)
coef1_9 <- hc0_coeftest(model1_9)
coef1_10 <- hc0_coeftest(model1_10)

# Column order is deliberate: each specification is followed by its releveled
# twin (7, 9, 8, 10).
stargazer(
  coef1_1, coef1_2, coef1_3, coef1_4, coef1_5, coef1_6,
  coef1_7, coef1_9, coef1_8, coef1_10,
  summary = FALSE, type = "text"
)
##
## =========================================================================================================================
## Dependent variable:
## ---------------------------------------------------------------------------------------------------
##
## (1) (2) (3) (4) (5) (6) (7) (8) (9) (10)
## -------------------------------------------------------------------------------------------------------------------------
## skolko_gosva0 0.145*** 0.206***
## (0.030) (0.042)
##
## skolko_gosva1 0.019 0.022 0.012 -0.104** -0.094** -0.122*** -0.120*** 0.025 -0.107* 0.100
## (0.050) (0.050) (0.048) (0.047) (0.044) (0.044) (0.043) (0.047) (0.056) (0.063)
##
## skolko_gosva2 -0.230*** -0.198*** -0.225*** -0.239*** -0.189*** -0.145*** -0.145*** -0.206***
## (0.026) (0.027) (0.026) (0.024) (0.024) (0.030) (0.030) (0.042)
##
## experience 0.005*** 0.006*** 0.014*** 0.013*** 0.013*** 0.022*** 0.022*** 0.022*** 0.022***
## (0.001) (0.001) (0.002) (0.002) (0.002) (0.007) (0.007) (0.007) (0.007)
##
## hours_per_week 0.012*** 0.014*** 0.013*** 0.010*** 0.009*** 0.009*** 0.009*** 0.009*** 0.009***
## (0.002) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002) (0.002)
##
## educ1 0.278*** 0.256*** 0.278*** 0.255*** 0.248*** 0.248*** 0.248*** 0.248***
## (0.025) (0.024) (0.023) (0.023) (0.023) (0.023) (0.023) (0.023)
##
## age -0.012*** -0.012*** -0.010*** 0.026 0.026 0.023 0.023
## (0.003) (0.003) (0.003) (0.018) (0.018) (0.018) (0.018)
##
## log(employees) 0.071*** 0.062*** 0.058*** 0.057*** 0.057*** 0.056*** 0.056***
## (0.007) (0.006) (0.007) (0.007) (0.007) (0.007) (0.007)
##
## gender2 -0.272*** -0.248*** -0.263*** -0.263*** -0.292*** -0.186***
## (0.025) (0.026) (0.026) (0.026) (0.030) (0.046)
##
## fam_status2 -0.119** -0.128** -0.130** -0.130** -0.131** -0.131**
## (0.054) (0.053) (0.052) (0.052) (0.052) (0.052)
##
## fam_status3 -0.152** -0.158*** -0.158*** -0.158*** -0.161*** -0.161***
## (0.059) (0.058) (0.057) (0.057) (0.057) (0.057)
##
## fam_status4 -0.048 -0.074 -0.080 -0.080 -0.083 -0.083
## (0.062) (0.061) (0.060) (0.060) (0.060) (0.060)
##
## fam_status5 -0.016 -0.068 -0.064 -0.064 -0.062 -0.062
## (0.078) (0.076) (0.075) (0.075) (0.076) (0.076)
##
## fam_status6 -0.088 -0.066 -0.087 -0.087 -0.099 -0.099
## (0.146) (0.158) (0.165) (0.165) (0.170) (0.170)
##
## children_18 0.026* 0.025* 0.010 0.010 0.011 0.011
## (0.015) (0.014) (0.014) (0.014) (0.014) (0.014)
##
## otrasl2 0.082 0.085 0.085 0.075 0.075
## (0.063) (0.062) (0.062) (0.062) (0.062)
##
## otrasl3 0.071 0.082 0.082 0.083 0.083
## (0.076) (0.079) (0.079) (0.079) (0.079)
##
## otrasl4 0.348*** 0.363*** 0.363*** 0.361*** 0.361***
## (0.083) (0.081) (0.081) (0.081) (0.081)
##
## otrasl5 0.110* 0.117* 0.117* 0.109* 0.109*
## (0.060) (0.061) (0.061) (0.060) (0.060)
##
## otrasl6 0.262*** 0.262*** 0.262*** 0.252*** 0.252***
## (0.059) (0.059) (0.059) (0.058) (0.058)
##
## otrasl7 0.156*** 0.161*** 0.161*** 0.157*** 0.157***
## (0.054) (0.053) (0.053) (0.053) (0.053)
##
## otrasl8 -0.148** -0.133* -0.133* -0.141* -0.141*
## (0.073) (0.074) (0.074) (0.074) (0.074)
##
## otrasl9 0.090 0.091 0.091 0.074 0.074
## (0.086) (0.086) (0.086) (0.087) (0.087)
##
## otrasl10 -0.030 -0.021 -0.021 -0.048 -0.048
## (0.053) (0.053) (0.053) (0.055) (0.055)
##
## otrasl11 0.075 0.076 0.076 0.060 0.060
## (0.079) (0.080) (0.080) (0.079) (0.079)
##
## otrasl12 0.128** 0.132*** 0.132*** 0.111** 0.111**
## (0.051) (0.051) (0.051) (0.052) (0.052)
##
## otrasl13 0.098 0.094 0.094 0.104 0.104
## (0.074) (0.073) (0.073) (0.074) (0.074)
##
## otrasl14 0.115** 0.120*** 0.120*** 0.118*** 0.118***
## (0.045) (0.045) (0.045) (0.044) (0.044)
##
## otrasl15 0.170** 0.179*** 0.179*** 0.172** 0.172**
## (0.068) (0.068) (0.068) (0.069) (0.069)
##
## otrasl16 0.234*** 0.243*** 0.243*** 0.234*** 0.234***
## (0.072) (0.069) (0.069) (0.069) (0.069)
##
## otrasl17 -0.177*** -0.168*** -0.168*** -0.167*** -0.167***
## (0.056) (0.055) (0.055) (0.055) (0.055)
##
## otrasl18 0.369 0.394* 0.394* 0.397* 0.397*
## (0.235) (0.223) (0.223) (0.219) (0.219)
##
## otrasl20 0.069 0.065 0.065 0.040 0.040
## (0.114) (0.114) (0.114) (0.118) (0.118)
##
## otrasl21 0.403*** 0.420*** 0.420*** 0.415*** 0.415***
## (0.126) (0.127) (0.127) (0.126) (0.126)
##
## otrasl23 0.401*** 0.433*** 0.433*** 0.440** 0.440**
## (0.138) (0.164) (0.164) (0.175) (0.175)
##
## otrasl24 0.003 0.070 0.070 0.065 0.065
## (0.166) (0.174) (0.174) (0.184) (0.184)
##
## otrasl25 0.235** 0.229** 0.229** 0.228** 0.228**
## (0.094) (0.098) (0.098) (0.106) (0.106)
##
## otrasl26 -0.020 -0.027 -0.027 -0.053 -0.053
## (0.123) (0.125) (0.125) (0.130) (0.130)
##
## otrasl27 0.552** 0.561** 0.561** 0.547** 0.547**
## (0.265) (0.252) (0.252) (0.248) (0.248)
##
## otrasl29 -0.486*** -0.470*** -0.470*** -0.460*** -0.460***
## (0.040) (0.040) (0.040) (0.040) (0.040)
##
## otrasl30 0.134 0.140 0.140 0.130 0.130
## (0.114) (0.114) (0.114) (0.113) (0.113)
##
## I(experience2) -0.0002 -0.0002 -0.0002 -0.0002
## (0.0002) (0.0002) (0.0002) (0.0002)
##
## I(age2) -0.0004** -0.0004** -0.0004* -0.0004*
## (0.0002) (0.0002) (0.0002) (0.0002)
##
## skolko_gosva0:gender2 -0.107**
## (0.052)
##
## skolko_gosva1:gender2 -0.023 -0.130
## (0.081) (0.088)
##
## skolko_gosva2:gender2 0.107**
## (0.052)
##
## Constant 10.200*** 9.570*** 9.360*** 9.480*** 9.880*** 9.740*** 9.010*** 8.870*** 9.090*** 8.880***
## (0.017) (0.094) (0.093) (0.117) (0.135) (0.141) (0.345) (0.344) (0.347) (0.346)
##
## =========================================================================================================================
## =========================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
#stargazer(model1_1, model1_2, model1_3, model1_4, model1_5, model1_6, model1_7, model1_9, model1_8, model1_10, summary = F, type = "text", keep.stat=c("adj.rsq", "n"))
Тест Бройша — Пагана (стьюдентизированный) на гетероскедастичность для двух моделей
# With log(employees): BP = 101, the smaller (better) of the two statistics.
bptest(
  log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender,
  data = df
)
##
## studentized Breusch-Pagan test
##
## data: log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender
## BP = 101, df = 44, p-value = 0.000002
# Without the logarithm of employees: BP = 122.
bptest(
  log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + employees + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender,
  data = df
)
##
## studentized Breusch-Pagan test
##
## data: log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + employees + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender
## BP = 122, df = 44, p-value = 0.000000003
Модель 2
# Model 2: the relative wage (wage_to_average) as the dependent variable, with
# the same controls as model 1 except the industry dummies.
model2_1 <- lm(
  wage_to_average ~ skolko_gosva + experience + hours_per_week + educ + age +
    log(employees) + gender + fam_status + children_18 +
    I(experience^2) + I(age^2),
  data = df
)
# Same specification plus the state-participation x gender interaction.
model2_2 <- update(model2_1, . ~ . + skolko_gosva:gender)
# Both specifications refitted on df_1, i.e. with a different base level of
# skolko_gosva. update() reuses the formulas, so the pairs cannot diverge.
model2_3 <- update(model2_1, data = df_1)
model2_4 <- update(model2_2, data = df_1)

# Coefficient tests with the HC0 covariance estimator.
coef2_1 <- coeftest(model2_1, df = Inf, vcov. = vcovHC, type = "HC0")
coef2_2 <- coeftest(model2_2, df = Inf, vcov. = vcovHC, type = "HC0")
coef2_3 <- coeftest(model2_3, df = Inf, vcov. = vcovHC, type = "HC0")
coef2_4 <- coeftest(model2_4, df = Inf, vcov. = vcovHC, type = "HC0")

# Column order is deliberate: each specification is followed by its releveled
# twin (1, 3, 2, 4). FALSE rather than F: F can be reassigned.
stargazer(
  coef2_1, coef2_3, coef2_2, coef2_4,
  summary = FALSE, type = "text", keep.stat = c("adj.rsq")
)
##
## =============================================================
## Dependent variable:
## ---------------------------------------
##
## (1) (2) (3) (4)
## -------------------------------------------------------------
## skolko_gosva0 0.094*** 0.187***
## (0.021) (0.036)
##
## skolko_gosva1 -0.137*** -0.043 -0.138** 0.049
## (0.040) (0.041) (0.059) (0.063)
##
## skolko_gosva2 -0.094*** -0.187***
## (0.021) (0.036)
##
## experience 0.021*** 0.021*** 0.021*** 0.021***
## (0.006) (0.006) (0.006) (0.006)
##
## hours_per_week 0.007*** 0.007*** 0.007*** 0.007***
## (0.001) (0.001) (0.001) (0.001)
##
## educ1 0.222*** 0.222*** 0.221*** 0.221***
## (0.021) (0.021) (0.021) (0.021)
##
## age 0.011 0.011 0.008 0.008
## (0.015) (0.015) (0.015) (0.015)
##
## log(employees) 0.036*** 0.036*** 0.036*** 0.036***
## (0.006) (0.006) (0.006) (0.006)
##
## gender2 -0.187*** -0.187*** -0.234*** -0.091***
## (0.022) (0.022) (0.027) (0.035)
##
## fam_status2 -0.079* -0.079* -0.084* -0.084*
## (0.045) (0.045) (0.046) (0.046)
##
## fam_status3 -0.129*** -0.129*** -0.134*** -0.134***
## (0.050) (0.050) (0.050) (0.050)
##
## fam_status4 -0.049 -0.049 -0.056 -0.056
## (0.053) (0.053) (0.053) (0.053)
##
## fam_status5 -0.043 -0.043 -0.045 -0.045
## (0.069) (0.069) (0.070) (0.070)
##
## fam_status6 -0.079 -0.079 -0.095 -0.095
## (0.163) (0.163) (0.173) (0.173)
##
## children_18 0.014 0.014 0.014 0.014
## (0.013) (0.013) (0.013) (0.013)
##
## I(experience2) -0.0003* -0.0003* -0.0003* -0.0003*
## (0.0001) (0.0001) (0.0001) (0.0001)
##
## I(age2) -0.0002 -0.0002 -0.0002 -0.0002
## (0.0002) (0.0002) (0.0002) (0.0002)
##
## skolko_gosva0:gender2 -0.142***
## (0.043)
##
## skolko_gosva1:gender2 0.002 -0.140*
## (0.078) (0.082)
##
## skolko_gosva2:gender2 0.142***
## (0.043)
##
## Constant 0.315 0.222 0.397 0.211
## (0.286) (0.285) (0.288) (0.286)
##
## =============================================================
## =============================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# stargazer(model2_1, model2_3, model2_2, model2_4, summary = F, type = "text", keep.stat=c("adj.rsq", "n"))
Данные для критики
# names(attributes(df_data$region)$labels) # список регионов(названия)
# attributes(df_data$region)$labels # список регионов с номерами