1 ПОДГОТОВКА

Настройки чанков

Библиотеки, настройки чисел

Загрузка данных

# данные окт2019-янв2020

df_dirty = read_spss('C:/Users/ASUS/Downloads/r28i_os_42.sav') # Настя
# df_dirty = read_spss('r28i_os_42.sav')
# df_dirty = read_spss('D:/Documents/codes_R/metrika_proj/r28i_os_42_new.sav') # Антон
#df_dirty = read_spss('~/r28i_os_42_new.sav') # Дима

Функции для поиска

Функция удаления выбросов

# функция удаления выбросов
delete_outliers = function(datafr, column){
  lower = quantile(datafr[[column]], 1/4) - 1.5*IQR(datafr[[column]])
  upper = quantile(datafr[[column]], 3/4) + 1.5*IQR(datafr[[column]])
  datafr = filter(datafr, (datafr[[column]] < upper) & (datafr[[column]] > lower))
  return(datafr)
}

Переименовываем

# переименовываем
df_data = df_dirty
df_data = df_data %>% rename("lvl_educ" = "x_diplom") %>% rename("otrasl" = "xj4.1") %>% rename("gender" = "xh5") %>% rename("hours_per_week" = "xj6.2") %>% rename("employees" = "xj13") %>% rename("wage" = "xj13.2") %>% rename("if_government" = "xj23") %>% rename("if_foreigners" = "xj24") %>% rename("if_private" = "xj25") %>% rename("if_yours" = "xj26") %>% rename("exper_y" = "xj161.3y") %>% rename("exper_month" = "xj161.3m") %>% rename("age" = "x_age") %>% rename("fam_status" = "x_marst") %>% rename("children_18" = "xj72.173") %>% rename("h_day" = "xj6.1a")

Выбираем нужные переменные

df_data = df_data %>% select(lvl_educ, otrasl, gender, hours_per_week, wage, if_government, if_foreigners, if_private, if_yours, exper_y, exper_month, age, employees, fam_status, children_18, region)

Чистка выбросов

# удаляем "Затрудняюсь ответить", "Отказ от ответа" 
df_data = df_data %>% filter(lvl_educ < 9*10^6) %>% filter(otrasl < 9*10^6) %>% filter(gender < 9*10^6) %>% filter(hours_per_week < 9*10^6) %>% filter(wage < 9*10^6) %>% filter(if_government < 9*10^6) %>% filter(if_foreigners < 9*10^6) %>% filter(if_private < 9*10^6) %>% filter(if_yours < 9*10^6) %>% filter(exper_y < 9*10^6) %>% filter(exper_month < 9*10^6)%>% filter(age < 9*10^6) %>% filter(employees < 9*10^6) %>% filter(fam_status < 9*10^6) %>% filter(children_18 < 9*10^6)

# удаляем NaNы
df = df_data %>% na.omit()

Боксплот зарплат по отраслям

ggplot(df_data)+
  geom_boxplot(aes(x = as.factor(otrasl), y = as.numeric(wage)/1000))+
  labs(x = "Номер отрасли",
       y = "Зарплата, тыс.руб.")+
  theme(axis.title.x = element_text(size = 14), # заголовок X
        axis.title.y = element_text(size = 14), # заголовок Y
        panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Меняем формат

# приводим к правильному формату

df$lvl_educ = df$lvl_educ %>% as.factor()
df$otrasl = df$otrasl %>% as.factor()
df$gender = df$gender %>% as.factor()
df$fam_status = df$fam_status %>% as.factor()
df$region = df$region %>% as.factor()

df$if_government = df$if_government %>% as.factor()
df$if_foreigners = df$if_foreigners %>% as.factor()
df$if_private = df$if_private %>% as.factor()
df$if_yours = df$if_yours %>% as.factor()

df$employees = df$employees %>% as.integer()
df$wage = df$wage %>% as.numeric()
df$hours_per_week = df$hours_per_week %>% as.numeric()
df$exper_y = df$exper_y %>% as.numeric()
df$exper_month = df$exper_month %>% as.numeric()
df$age = df$age %>% as.numeric()
df$children_18 = df$children_18 %>% as.numeric()

Создаем новые переменные

# создаем стаж
df = df %>% mutate(experience = exper_y + exper_month/12)

# дамми образование
df = df %>% mutate(educ = ifelse(lvl_educ == "6", "1", "0"))
df$educ = df$educ %>% as.factor()
# 1 - есть высшее, 0 - нет высшего

# создаем свою объясняющую переменную

df = df %>% mutate(skolko_gosva = ifelse(if_government == "2", "0", ifelse((if_private == "1")|(if_foreigners == "1")|(if_yours == "1"), "1", "2")))
df$skolko_gosva = df$skolko_gosva %>% as.factor()
# нет участия = 0, гос-во частично = 1, гос-во полностью = 2
# в опроснике 1 - да, 2 - нет

# создаем логарифм зп
df = df %>% mutate(log_wage = log(wage))
df$log_wage = df$log_wage %>% as.numeric()

1.1 СОЗДАНИЕ ЗАВИСИМОЙ ПЕРЕМЕННОЙ - ОТНОСИТЕЛЬНОЙ ЗП

Создание таблицы “зарплата-по-отраслям”

# создаем df со всеми отраслями и нормальными названиями
otraslii = data.frame(names(attributes(df_dirty$xj4.1)$labels),
                      unname(attributes(df_dirty$xj4.1)$labels))
colnames(otraslii) = c("names", "ids")
otraslii$ids = otraslii$ids %>% as.factor()

# считаем среднюю зп ПО ВЫБОРКЕ
df_wage = df %>% group_by(otrasl) %>% summarise(kolvo = n(), av_wage = mean(wage)) %>% left_join(otraslii, by = c("otrasl" = "ids"))

# считаем среднюю зп ПО ВСЕМ ДАННЫМ
df_dirty_w = df_dirty %>% filter(!is.na(xj6.2)) %>% filter(xj6.2 < 9*10^6) %>% filter(xj13.2 < 9*10^6) %>% filter(xj4.1 < 9*10^6)
df_dirty_w = df_dirty_w %>% group_by(xj4.1) %>% summarise(kolvo_R = n(), av_wage_R = mean(xj13.2, na.rm = T))

df_dirty_w$xj4.1 = df_dirty_w$xj4.1 %>% as.factor()
df_dirty_w$av_wage_R = df_dirty_w$av_wage_R %>% as.numeric()

# соединяем
df_wage = df_wage %>% left_join(df_dirty_w, by = c("otrasl" = "xj4.1"))
rm(df_dirty_w, otraslii)

df_wage = df_wage[c("names", "otrasl", "kolvo", "av_wage", "kolvo_R", "av_wage_R")] %>% arrange(otrasl)

Присоединяем поотраслевые зп

df = df %>% left_join(select(df_wage, c(otrasl, av_wage_R)), by = "otrasl")

Создаем относительную переменную ЗП

df = df %>% mutate(wage_to_average=wage/av_wage_R)
df$wage_to_average = df$wage_to_average %>% as.numeric()

1.2 УДАЛЕНИЕ ВЫБРОСОВ

Начало чистки и таблица описательных статистик

# удаляем выбросы по возрасту, часам в неделю
df = df %>% filter((gender == "1" & age <= 59) | (gender == "2" & age <= 54)) # потому что есть работы, которые запрещены нетрудоспособным(ну и возраст типа, здоровье)

df = df %>% filter(hours_per_week >= 20 & hours_per_week <= 60)

# чистим выбросы по зп внутри каждой отрасли
df = df %>% filter(wage > 0)

# таблица описательных статистик
table_opis1 = describe(select(df, c(wage, wage_to_average, hours_per_week, age, experience, employees, children_18)),
                      quant = c(0.25, 0.75), omit = T) %>% select(-c(vars, trimmed))
view(table_opis1)
# write.csv(table_opis1, "table_opis_before.csv")
# stargazer(table_opis1, type = "latex", summary = F)

Боксплот зп до удаления выбросов по зп

ggplot(df)+
  geom_boxplot(aes(x = wage/1000))+
  xlab("Среднемесячная заработная плата индивида, тыс.руб.")+
  theme(axis.text=element_text(size=10),
        axis.title=element_text(size=14))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Удаляем выбросы внутри отраслей

X = df %>% group_split(otrasl)
for (i in 1:28){
  X[[i]] = delete_outliers(X[[i]], "wage")
}
df = bind_rows(X[[1]], X[[2]])
for (i in 3:28){
  df = bind_rows(df, X[[i]])
}

Выбор “полезных” переменных

df = df %>% select(wage, wage_to_average, skolko_gosva, otrasl, educ, gender, fam_status, children_18, hours_per_week, experience, age, employees)
# write.csv(df, 'D:/Documents/codes_R/metrika_proj/df_itog.csv')

2 ТАБЛИЦА ОПИСАТЕЛЬНЫХ СТАТИСТИК ЧИСЛОВЫХ ПЕРЕМЕННЫХ 1

Таблица описательных переменных

table_opis1 = describe(select(df, c(wage, wage_to_average, hours_per_week, age, experience, employees, children_18)),
                      quant = c(0.25, 0.75), omit = T) %>% select(-c(vars, trimmed))

table_opis1 %>%
 kable(caption = "Таблица характеристик числовых данных после удаления выбросов") %>%
 kable_styling(bootstrap_options = c("striped", "hover", "responsive"), full_width = F, position = "center") %>%
 column_spec(1, bold = T)

Таблица характеристик числовых данных после удаления выбросов
	n	mean	sd	median	mad	min	max	range	skew	kurtosis	se	Q0.25	Q0.75
wage	1581	26998.961	14086.026	25000.000	11860.800	4000.000	120000.00	116000.0	1.600	4.974	354.260	17000.000	35000.00
wage_to_average	1581	0.938	0.419	0.874	0.413	0.152	2.35	2.2	0.739	0.201	0.011	0.614	1.18
hours_per_week	1581	41.333	6.869	40.000	1.483	20.000	60.00	40.0	0.076	1.890	0.173	40.000	45.00
age	1581	41.827	8.449	42.000	10.378	21.000	59.00	38.0	0.014	-0.936	0.212	35.000	49.00
experience	1581	19.091	9.310	18.167	10.749	0.167	42.00	41.8	0.194	-0.891	0.234	11.833	26.42
employees	1581	373.765	1604.472	50.000	59.304	1.000	25000.00	24999.0	10.882	141.668	40.352	15.000	150.00
children_18	1581	0.997	0.917	1.000	1.483	0.000	7.00	7.0	0.936	2.031	0.023	0.000	2.00

# write.csv(table_opis1, "D:/Documents/codes_R/metrika_proj/table_opis1.csv")
# stargazer(table_opis1, type = "latex", summary = F)

Проводим тесты между основной объясняющей переменной и контрольными, чтобы убедиться в правильности нашего интуитивного понимания механизмов.

aov(employees ~ skolko_gosva, data = df) %>% summary() # берем

##                Df     Sum Sq  Mean Sq F value  Pr(>F)    
## skolko_gosva    2   44590361 22295181    8.75 0.00017 ***
## Residuals    1578 4022850219  2549335                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(experience ~ skolko_gosva, data = df) %>% summary() # берем

##                Df Sum Sq Mean Sq F value  Pr(>F)    
## skolko_gosva    2   1572     786    9.16 0.00011 ***
## Residuals    1578 135365      86                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(age ~ skolko_gosva, data = df) %>% summary() #берем

##                Df Sum Sq Mean Sq F value Pr(>F)   
## skolko_gosva    2    721     360    5.07 0.0064 **
## Residuals    1578 112063      71                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(hours_per_week ~ skolko_gosva, data = df) %>% summary() #берем

##                Df Sum Sq Mean Sq F value              Pr(>F)    
## skolko_gosva    2   4686    2343    52.9 <0.0000000000000002 ***
## Residuals    1578  69855      44                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(children_18 ~ skolko_gosva, data = df) %>% summary() #не берем

##                Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva    2      0   0.184    0.22    0.8
## Residuals    1578   1329   0.842

chisq.test(df$skolko_gosva, df$gender) #берем

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$gender
## X-squared = 77, df = 2, p-value <0.0000000000000002

chisq.test(df$skolko_gosva, df$educ) #берем

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$educ
## X-squared = 19, df = 2, p-value = 0.00006

chisq.test(df$skolko_gosva, df$otrasl) #берем

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$otrasl
## X-squared = 981, df = 52, p-value <0.0000000000000002

chisq.test(df$skolko_gosva, df$fam_status) #не берем

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$fam_status
## X-squared = 8, df = 10, p-value = 0.6

3 ГРАФИКИ

Столбчатый график “Доля респондентов с высшим образованием”

ggplot(df)+
  geom_bar(aes(x = skolko_gosva, fill = educ), color = "black", position = "fill")+
  scale_y_continuous(labels = percent)+
  labs(y = "Доля респондентов с высшим образованием",
       x = "Участие государства в капитале компании")+
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.title.position = "plot",
        plot.caption.position = "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  scale_fill_manual(values = c("gray", "lightblue"),
                    name = "Наличие\nвысшего\nобразования",
                    labels = c("Нет", "Есть"))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Столбчатый график “Доля респондентов, находящихся в браке”

df = df %>% mutate(if_married = ifelse(fam_status == "2" | fam_status == "6", "1", "0"))
df$if_married = df$if_married %>% as.factor()
ggplot(df)+
  geom_bar(aes(x = skolko_gosva, fill = if_married), color = "black", position = "fill")+
  scale_y_continuous(labels = percent)+
  labs(y = "Доля респондентов по семейному статусу",x = "Участие государства в капитале компании",
       title = NULL)+
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.title.position = "plot",
        plot.caption.position =  "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  scale_fill_manual(values = c("gray", "lightblue"),
                    name = "В браке",
                    labels = c("Нет", "Да"))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Столбчатый график “Распределение респондентов по полу”

ggplot(df)+
  geom_bar(aes(x = skolko_gosva, fill = gender), color = "black", position = "fill")+
  scale_y_continuous(labels = percent)+
  labs(y = "Пол респондента",x = "Участие государства в капитале компании",
       title = NULL)+
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.title.position = "plot",
        plot.caption.position =  "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  scale_fill_manual(values = c("gray", "lightblue"),
                    name = "Пол",
                    labels = c("Мужской", "Женский"))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
      panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Столбчатый график “Распределение skolko_gosva по отраслям”

ggplot(df)+
  geom_bar(aes(x = otrasl, fill =  skolko_gosva), color = "black", position = "fill")+
  scale_y_continuous(labels = percent)+
  labs(y = "Участие государства в капитале компании",x = "Отрасль",
       title = NULL)+
  theme(plot.title = element_text(hjust = 0.5, size = 12),
        plot.title.position = "plot",
        plot.caption.position =  "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  scale_fill_manual(values = c( "lightblue","skyblue3","dodgerblue4"),
                    name = "Участие\nгосударства",
                    labels = c("частные", "смешанные", "государственные"))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Плотность 3 графика по относительной зп{скачали}

ggplot(df)+
  geom_density(aes(x = wage_to_average))+
  facet_wrap(vars(skolko_gosva), labeller = label_both)+
  xlab("Отношение средней заработной платы индивида к средней зп по отрасли")+
  ylab("Плотность")+
  ggtitle("Плотность относительной зп по факту участия гос-ва")+
  theme(plot.title = element_text(hjust = 0.5), # выравнивание по центру
        plot.title.position = "plot", # подгонка под размер графика
        plot.caption.position = "plot",
        axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 11))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Плотность заработной платы

plot1 <- ggplot(df)+
  geom_density(aes(x = wage/1000))+
  xlab("Cреднемесячная заработная плата респондента, тыс.руб")+
  ylab("Плотность")+
  theme(plot.title = element_text(hjust = 0.5), # выравнивание по центру
        plot.title.position = "plot", # подгонка под размер графика
        plot.caption.position = "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

plot2 <- ggplot(df)+
  geom_density(aes(x = log(wage)))+
  xlab("Логарифм среднемесячной заработной платы респондента")+
  ylab("")+
  theme(plot.title = element_text(hjust = 0.5), # выравнивание по центру
        plot.title.position = "plot", # подгонка под размер графика
        plot.caption.position = "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

ggarrange(plot1, plot2, ncol=2)

Плотность распределения количества сотрудников

plot3 <- ggplot(df)+
  geom_density(aes(x = employees))+
  xlab("Количество сотрудников в компании, чел.")+
  ylab("Плотность")+
  theme(plot.title = element_text(hjust = 0.5), # выравнивание по центру
        plot.title.position = "plot", # подгонка под размер графика
        plot.caption.position = "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Плотность распределения относительной заработной платы

ggplot(df)+
  geom_density(aes(x = wage_to_average))+
  xlab("Отношение среднемесячной зарплаты респондента к средней по отрасли")+
  ylab("Плотность")+
  theme(plot.title = element_text(hjust = 0.5), # выравнивание по центру
        plot.title.position = "plot", # подгонка под размер графика
        plot.caption.position = "plot",
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Боксплот 3 коробки, отношение средней заработной платы индивида к средней зп по отрасли по skolko_gosva{скачали}

ggplot(df)+
  geom_boxplot(aes(x = skolko_gosva, y = wage_to_average))+
  xlab("Участие государства")+
  ylab("Отношение средней зарплаты индивида к средней зарплате по отрасли")+
  theme(plot.title = element_text(hjust = 0.5), # выравнивание по центру
        plot.title.position = "plot", # подгонка под размер графика
        plot.caption.position = "plot")+
  theme(panel.background = element_rect(fill = "white"), # задний фон
        panel.grid = element_line(color = "grey"))+ #линии на графике
  geom_rect(aes(xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf), # рамка
            colour = "black", size = 0.5, fill = NA)

Рассеивания по стажу{скачали}

ggplot(df, aes(x = experience, y = log(wage)))+
  geom_point(color = "red", alpha = 0.4)+
  geom_smooth(method = loess, se = F, formula = y ~ x)+
  labs(x = "Стаж работы, лет",
     y = "Логарифм среднемесячной заработной платы")+
  theme(plot.caption = element_text(hjust = 0, face = "plain"),
      plot.title.position = "plot",
      plot.caption.position =  "plot",
      panel.background = element_rect(fill = "white"),
      panel.grid = element_line(color = "grey"),
      axis.title.x = element_text(size = 14),
      axis.title.y = element_text(size = 14))

Рассеивания по возрасту

ggplot(df, aes(x = age, y = (wage)))+
  geom_point(color = "red", alpha = 0.4)+
  geom_smooth(method = loess, se = F, formula = y ~ x)+
  labs(x = "Возраст, лет",
     y = "Логарифм среднемесячной заработной платы")+
  
  theme(plot.caption = element_text(hjust = 0, face = "plain"),
      plot.title.position = "plot",
      plot.caption.position =  "plot",
      panel.background = element_rect(fill = "white"),
      panel.grid = element_line(color = "grey"),
      axis.title.x = element_text(size = 14),
      axis.title.y = element_text(size = 14))

Рассеивания по количеству сотрудников

ggplot(df, aes(x = employees, y = log(wage)))+
  geom_point(color = "red", alpha = 0.4)+
  geom_smooth(method = loess, se = F, formula = y ~ x)+
  labs(x = "Количество сотрудников в компании, чел.",
     y = "Логарифм среднемесячной заработной платы")+
  theme(plot.caption = element_text(hjust = 0, face = "plain"),
      plot.title.position = "plot",
      plot.caption.position =  "plot",
      panel.background = element_rect(fill = "white"),
      panel.grid = element_line(color = "grey"),
      axis.title.x = element_text(size = 14),
      axis.title.y = element_text(size = 14))

4 РЕГРЕССИОННЫЙ АНАЛИЗ

Итоговый дф

df_itog = df %>% select(wage_to_average, wage, skolko_gosva, gender, educ, age, experience, hours_per_week, employees, fam_status, children_18, otrasl)

# write.csv(df_itog, "D:/Documents/codes_R/metrika_proj/df_itog.csv")

Модель 1

model1 = lm(log(wage) ~ skolko_gosva+ skolko_gosva:gender + log(employees) + age +  I(age^2) + hours_per_week + gender + educ + otrasl + fam_status + children_18 + experience + I(experience^2), data = df)

# se_robust <- sqrt(diag(vcovHC(model1, type="HC0")))
# model1_robust <- summary(model1)
# model1_robust$coefficients[,2] <- se_robust
# model1_robust

#coeftest(model1, df = Inf, vcov. = vcovHC, type = "HC0")

model1_2 = lm(log(wage) ~ skolko_gosva + skolko_gosva:gender + employees + age + I(age^2) + hours_per_week + gender + educ + otrasl + fam_status + children_18 + experience + I(experience^2), data = df)
# se_robust_2 <- sqrt(diag(vcovHC(model1_2, type="HC0")))
# model1_2_robust <- summary(model1_2)
# model1_2_robust$coefficients[,2] <- se_robust_2
# model1_2_robust

#model1_3 = lm(log(wage) ~ skolko_gosva + log(employees) + age +  I(age^2) + hours_per_week + gender + educ + fam_status + children_18 + experience + I(experience^2) + skolko_gosva:gender, data = df) 
#model1_3%>% summary()

coef1 = coeftest(model1, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_2 = coeftest(model1_2, df = Inf, vcov. = vcovHC, type = "HC0")
stargazer(coef1, coef1_2, summary = F, type = "text")

## 
## ==================================================
##                           Dependent variable:     
##                       ----------------------------
##                                                   
##                            (1)            (2)     
## --------------------------------------------------
## skolko_gosva1            -0.107*        -0.071    
##                          (0.056)        (0.054)   
##                                                   
## skolko_gosva2           -0.206***      -0.188***  
##                          (0.042)        (0.043)   
##                                                   
## log(employees)           0.056***                 
##                          (0.007)                  
##                                                   
## employees                              0.00001*   
##                                        (0.00001)  
##                                                   
## age                       0.023          0.022    
##                          (0.018)        (0.019)   
##                                                   
## I(age2)                  -0.0004*      -0.0004*   
##                          (0.0002)      (0.0002)   
##                                                   
## hours_per_week           0.009***      0.010***   
##                          (0.002)        (0.002)   
##                                                   
## gender2                 -0.292***      -0.300***  
##                          (0.030)        (0.031)   
##                                                   
## educ1                    0.248***      0.266***   
##                          (0.023)        (0.024)   
##                                                   
## otrasl2                   0.075         0.105*    
##                          (0.062)        (0.063)   
##                                                   
## otrasl3                   0.083         0.144*    
##                          (0.079)        (0.074)   
##                                                   
## otrasl4                  0.361***      0.403***   
##                          (0.081)        (0.086)   
##                                                   
## otrasl5                   0.109*       0.159***   
##                          (0.060)        (0.059)   
##                                                   
## otrasl6                  0.252***      0.221***   
##                          (0.058)        (0.058)   
##                                                   
## otrasl7                  0.157***       0.103*    
##                          (0.053)        (0.053)   
##                                                   
## otrasl8                  -0.141*       -0.176**   
##                          (0.074)        (0.075)   
##                                                   
## otrasl9                   0.074         -0.029    
##                          (0.087)        (0.087)   
##                                                   
## otrasl10                  -0.048       -0.123**   
##                          (0.055)        (0.054)   
##                                                   
## otrasl11                  0.060         -0.025    
##                          (0.079)        (0.079)   
##                                                   
## otrasl12                 0.111**        0.085*    
##                          (0.052)        (0.052)   
##                                                   
## otrasl13                  0.104          0.065    
##                          (0.074)        (0.074)   
##                                                   
## otrasl14                 0.118***        0.028    
##                          (0.044)        (0.042)   
##                                                   
## otrasl15                 0.172**         0.116    
##                          (0.069)        (0.071)   
##                                                   
## otrasl16                 0.234***      0.259***   
##                          (0.069)        (0.068)   
##                                                   
## otrasl17                -0.167***      -0.206***  
##                          (0.055)        (0.055)   
##                                                   
## otrasl18                  0.397*         0.339    
##                          (0.219)        (0.246)   
##                                                   
## otrasl20                  0.040          0.022    
##                          (0.118)        (0.133)   
##                                                   
## otrasl21                 0.415***       0.299**   
##                          (0.126)        (0.117)   
##                                                   
## otrasl23                 0.440**       0.548***   
##                          (0.175)        (0.172)   
##                                                   
## otrasl24                  0.065          0.036    
##                          (0.184)        (0.175)   
##                                                   
## otrasl25                 0.228**         0.078    
##                          (0.106)        (0.112)   
##                                                   
## otrasl26                  -0.053        -0.153    
##                          (0.130)        (0.128)   
##                                                   
## otrasl27                 0.547**        0.516**   
##                          (0.248)        (0.250)   
##                                                   
## otrasl29                -0.460***      -0.466***  
##                          (0.040)        (0.066)   
##                                                   
## otrasl30                  0.130          0.057    
##                          (0.113)        (0.112)   
##                                                   
## fam_status2              -0.131**      -0.114**   
##                          (0.052)        (0.054)   
##                                                   
## fam_status3             -0.161***      -0.139**   
##                          (0.057)        (0.059)   
##                                                   
## fam_status4               -0.083        -0.060    
##                          (0.060)        (0.062)   
##                                                   
## fam_status5               -0.062        -0.056    
##                          (0.076)        (0.077)   
##                                                   
## fam_status6               -0.099        -0.040    
##                          (0.170)        (0.156)   
##                                                   
## children_18               0.011          0.010    
##                          (0.014)        (0.014)   
##                                                   
## experience               0.022***      0.026***   
##                          (0.007)        (0.007)   
##                                                   
## I(experience2)           -0.0002        -0.0003   
##                          (0.0002)      (0.0002)   
##                                                   
## skolko_gosva1:gender2     -0.023        -0.030    
##                          (0.081)        (0.080)   
##                                                   
## skolko_gosva2:gender2    0.107**        0.113**   
##                          (0.052)        (0.054)   
##                                                   
## Constant                 9.090***      9.310***   
##                          (0.347)        (0.359)   
##                                                   
## ==================================================
## ==================================================
## Note:                  *p<0.1; **p<0.05; ***p<0.01

bptest(model1) # с логарифмом BP=101, это меньше и лучше
bptest(model1_2)# без логарифма BP=122

#options(error=recover)
#waldtest(model1, model2)

# stargazer(reg_ols_1, reg_ols_5, reg_ols_6, reg_iv_1, reg_iv_2,
#           se = list(se_ols_1, se_ols_5, se_ols_6, se_iv_1, se_iv_2), 
#           title = "Regression results",
#           omit = "Constant",
#           keep.stat = "n",
#           notes = "Robust standard errors in parentheses",
#           type = 'text')

#============== меняем базовый уровень в первой модели
# model1 = lm(log(wage) ~ skolko_gosva + log(employees) + age +  I(age^2) + hours_per_week + gender + educ + otrasl + fam_status + children_18 + experience + I(experience^2) + skolko_gosva:gender, data = df)
# df_1 <- within(df, skolko_gosva <- relevel(skolko_gosva, ref = 3))
# model1_3 = lm(log(wage) ~ skolko_gosva + experience + employees + otrasl + age + I(age^2) + hours_per_week + gender + educ + fam_status + children_18 + I(experience^2) + gender:skolko_gosva, data = df_1)
# model1_3 %>% summary()

Предельные эффекты

# as.numeric(model1_robust$coefficients["skolko_gosva2"]+model1_robust$coefficients["skolko_gosva2:gender2"])*100 #то, как изменяется зп в процентах при переходе от частного к государственному у женщин
# as.numeric(model1_robust$coefficients["skolko_gosva2"])*100 #то, как изменяется зп в процентах при переходе от частного к государственному у мужчин
# as.numeric(model1_robust$coefficients["skolko_gosva1"]+model1_robust$coefficients["skolko_gosva1:gender2"])*100 #то, как изменяется зп в процентах при переходе от частного к получастному у женщин
# as.numeric(model1_robust$coefficients["skolko_gosva1"])*100 #то, как изменяется зп в процентах при переходе от частного к получастному у мужчин

# df$gender = as.numeric(df$gender)
# df$educ = as.numeric(df$educ)
# df$otrasl = as.numeric(df$otrasl)
# df$fam_status = as.numeric(df$fam_status)
# avg_1 <- sum(model1_robust$coefficients * 
#                    c(1, 0, 0, mean(df$log_employees), mean(df$age), mean(df$age_sq), 
#                      mean(df$hours_per_week), mean(df$gender), mean(df$educ), mean(df$otrasl), mean(df$fam_status), mean(df$children_18), mean(df$experience), mean(df$exp_sq)))
# 
# df = df %>% mutate(log_employees = log(employees), age_sq = age^2, exp_sq = experience^2)
# 
# 
# skolko_gosva+ skolko_gosva:gender + log(employees) + age +  I(age^2) + hours_per_week + gender + educ + otrasl + fam_status + children_18 + experience + I(experience^2)

Модель 2

model2 = lm(wage_to_average ~ skolko_gosva + log(employees) + age + I(age^2) + hours_per_week + gender + educ + fam_status + children_18 + experience + I(experience^2) + skolko_gosva:gender, data = df)
se_robust <- sqrt(diag(vcovHC(model2, type="HC0")))

# model2_robust <- summary(model2)
# model2_robust$coefficients[,2] <- se_robust
# model2_robust
# bptest(model2)


coef2 =  coeftest(model2, df = Inf, vcov. = vcovHC, type = "HC0")

# as.numeric(model2$coefficients["skolko_gosva2"]+model2$coefficients["skolko_gosva2:gender2"]) #то, как изменяется средняя зп при переходе от частного к государственному у женщин
# as.numeric(model2$coefficients["skolko_gosva2"]) #то, как изменяется средняя зп при переходе от частного к государственному у мужчин
# as.numeric(model2$coefficients["skolko_gosva1"]+model2$coefficients["skolko_gosva1:gender2"]) #то, как изменяется средняя зп при переходе от частного к получастному у женщин
# as.numeric(model2$coefficients["skolko_gosva1"]) #то, как изменяется средняя зп при переходе от частного к получастному у мужчин

Таблица регрессий

# stargazer(model1, summary = F, type = "text")
# stargazer(model2, summary = F, type = "text")

Данные для критики

# names(attributes(df_data$region)$labels) # список регионов(названия)
# attributes(df_data$region)$labels # список регионов с номерами