1 ПОДГОТОВКА

Настройки чанков

Библиотеки, настройки чисел

Загрузка данных

# Data for Oct 2019 - Jan 2020

# Load the raw SPSS file into `df_dirty`. The path is machine-specific: each
# team member has their own variant below; only one line should be active.
# NOTE(review): two of the commented-out variants point to
# `r28i_os_42_new.sav` rather than `r28i_os_42.sav` - confirm both files hold
# the same data before switching between them.
df_dirty = read_spss('C:/Users/ASUS/Downloads/r28i_os_42.sav') # Nastya
# df_dirty = read_spss('r28i_os_42.sav')
# df_dirty = read_spss('D:/Documents/codes_R/metrika_proj/r28i_os_42_new.sav') # Anton
#df_dirty = read_spss('~/r28i_os_42_new.sav') # Dima

Функции для поиска

Функция удаления выбросов

#' Remove outliers from one numeric column using Tukey's IQR fences.
#'
#' Keeps the rows whose value lies strictly between `Q1 - k * IQR` and
#' `Q3 + k * IQR`, where Q1/Q3 are the 25% and 75% quantiles of the column.
#' Rows whose value is `NA` are dropped (they cannot be compared with the
#' fences); the quantiles themselves are computed on the non-missing values.
#'
#' NOTE(review): the comparison is strict, so when the IQR is zero (e.g. a
#' group with a single observation) every row is removed. This matches the
#' previous behaviour; switch to `>=` / `<=` if such groups should survive.
#'
#' @param datafr A data frame or tibble.
#' @param column Name (or position) of the numeric column to screen.
#' @param k Fence multiplier; 1.5 is the conventional Tukey value.
#' @return `datafr` restricted to the rows that fall inside the fences.
delete_outliers <- function(datafr, column, k = 1.5) {
  values <- datafr[[column]]
  if (is.null(values)) {
    stop("Column '", column, "' not found in `datafr`.", call. = FALSE)
  }

  # Compute both quartiles once; their difference is exactly what IQR() returns.
  quartiles <- quantile(values, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  lower <- quartiles[1] - k * spread
  upper <- quartiles[2] + k * spread

  # which() discards NA comparisons, so rows with a missing value are dropped.
  keep <- which(values > lower & values < upper)
  datafr[keep, , drop = FALSE]
}

Переименовываем

# Give the raw survey columns readable names ("new name" = "survey code").
# All renames are independent, so they are done in a single call.
df_data <- df_dirty %>%
  rename(
    "lvl_educ" = "x_diplom",
    "otrasl" = "xj4.1",
    "gender" = "xh5",
    "hours_per_week" = "xj6.2",
    "employees" = "xj13",
    "wage" = "xj13.2",
    "if_government" = "xj23",
    "if_foreigners" = "xj24",
    "if_private" = "xj25",
    "if_yours" = "xj26",
    "exper_y" = "xj161.3y",
    "exper_month" = "xj161.3m",
    "age" = "x_age",
    "fam_status" = "x_marst",
    "children_18" = "xj72.173",
    "h_day" = "xj6.1a"
  )

Выбираем нужные переменные

# Keep only the columns used in the analysis. `h_day` was renamed above but is
# not selected here, and `region` is kept under its original name.
df_data <- select(
  df_data,
  lvl_educ, otrasl, gender, hours_per_week, wage,
  if_government, if_foreigners, if_private, if_yours,
  exper_y, exper_month, age, employees, fam_status, children_18, region
)

Удаление отказов от ответа и пропусков

# Drop "difficult to answer" / "refused to answer" responses: a row is kept
# only if every listed column is below 9 * 10^6 (presumably the service codes
# are stored as very large numbers - confirm against the codebook).
# filter() also drops rows where a comparison is NA. `region` is not checked.
df_data <- df_data %>%
  filter(
    lvl_educ < 9 * 10^6,
    otrasl < 9 * 10^6,
    gender < 9 * 10^6,
    hours_per_week < 9 * 10^6,
    wage < 9 * 10^6,
    if_government < 9 * 10^6,
    if_foreigners < 9 * 10^6,
    if_private < 9 * 10^6,
    if_yours < 9 * 10^6,
    exper_y < 9 * 10^6,
    exper_month < 9 * 10^6,
    age < 9 * 10^6,
    employees < 9 * 10^6,
    fam_status < 9 * 10^6,
    children_18 < 9 * 10^6
  )

# Drop any remaining rows with missing values; the result is the working
# data set `df` (`df_data` itself is left as is).
df <- na.omit(df_data)

Боксплот зарплат по отраслям

# Boxplot of wages (thousand roubles) per industry code, built from `df_data`
# (i.e. before rows with missing values are dropped).
ggplot(df_data) +
  geom_boxplot(aes(x = as.factor(otrasl), y = as.numeric(wage) / 1000)) +
  labs(
    x = "Номер отрасли",
    y = "Зарплата, тыс.руб."
  ) +
  theme(
    axis.title.x = element_text(size = 14), # x-axis title
    axis.title.y = element_text(size = 14), # y-axis title
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Меняем формат

# Cast every column to the type the analysis expects.
df <- df %>%
  mutate(
    # categorical variables -> factors
    lvl_educ = as.factor(lvl_educ),
    otrasl = as.factor(otrasl),
    gender = as.factor(gender),
    fam_status = as.factor(fam_status),
    region = as.factor(region),

    # ownership questions -> factors
    if_government = as.factor(if_government),
    if_foreigners = as.factor(if_foreigners),
    if_private = as.factor(if_private),
    if_yours = as.factor(if_yours),

    # quantitative variables -> plain integer / numeric vectors
    employees = as.integer(employees),
    wage = as.numeric(wage),
    hours_per_week = as.numeric(hours_per_week),
    exper_y = as.numeric(exper_y),
    exper_month = as.numeric(exper_month),
    age = as.numeric(age),
    children_18 = as.numeric(children_18)
  )

Создаем новые переменные

# Derived variables, added in one pass (columns are appended in this order).
df <- df %>%
  mutate(
    # work experience in years: whole years plus months converted to years
    experience = exper_y + exper_month / 12,

    # higher-education dummy: "1" = has higher education (lvl_educ == "6"),
    # "0" = does not
    educ = as.factor(ifelse(lvl_educ == "6", "1", "0")),

    # main explanatory variable - state participation in the company:
    #   "0" = no state participation,
    #   "1" = state participates partially (some private / foreign / own share),
    #   "2" = fully state-owned.
    # In the questionnaire 1 means "yes" and 2 means "no".
    skolko_gosva = as.factor(
      ifelse(
        if_government == "2",
        "0",
        ifelse(
          (if_private == "1") | (if_foreigners == "1") | (if_yours == "1"),
          "1",
          "2"
        )
      )
    ),

    # natural logarithm of the wage
    log_wage = as.numeric(log(wage))
  )

1.1 СОЗДАНИЕ ЗАВИСИМОЙ ПЕРЕМЕННОЙ - ОТНОСИТЕЛЬНОЙ ЗП

Создание таблицы “зарплата-по-отраслям”

# Lookup table of all industries: readable label (`names`) and numeric code
# (`ids`), taken from the value labels attached to the raw column `xj4.1`.
otraslii <- data.frame(
  names = names(attributes(df_dirty$xj4.1)$labels),
  ids = unname(attributes(df_dirty$xj4.1)$labels)
)
otraslii$ids <- as.factor(otraslii$ids)

# Average wage per industry WITHIN THE CLEANED SAMPLE, with industry labels.
df_wage <- df %>%
  group_by(otrasl) %>%
  summarise(kolvo = n(), av_wage = mean(wage)) %>%
  left_join(otraslii, by = c("otrasl" = "ids"))

# Average wage per industry OVER ALL RAW DATA: only non-response codes and
# missing working hours are excluded.
df_dirty_w <- df_dirty %>%
  filter(
    !is.na(xj6.2),
    xj6.2 < 9 * 10^6,
    xj13.2 < 9 * 10^6,
    xj4.1 < 9 * 10^6
  ) %>%
  group_by(xj4.1) %>%
  summarise(kolvo_R = n(), av_wage_R = mean(xj13.2, na.rm = TRUE)) %>%
  mutate(
    xj4.1 = as.factor(xj4.1),
    av_wage_R = as.numeric(av_wage_R)
  )

# Put the sample-level and raw-data-level figures side by side, then discard
# the helper tables.
df_wage <- left_join(df_wage, df_dirty_w, by = c("otrasl" = "xj4.1"))
rm(df_dirty_w, otraslii)

# Fix the column order and sort by industry code.
df_wage <- df_wage %>%
  select("names", "otrasl", "kolvo", "av_wage", "kolvo_R", "av_wage_R") %>%
  arrange(otrasl)

Присоединяем поотраслевые зп

# Attach the raw-data industry average wage (`av_wage_R`) to every respondent.
df <- left_join(
  df,
  df_wage %>% select(otrasl, av_wage_R),
  by = "otrasl"
)

Создаем относительную переменную ЗП

# Relative wage: the respondent's wage divided by the average wage of their
# industry.
df <- df %>%
  mutate(wage_to_average = as.numeric(wage / av_wage_R))

1.2 УДАЛЕНИЕ ВЫБРОСОВ

Начало чистки и таблица описательных статистик

# Restrict the sample by age, weekly hours and wage:
#  * age: men (gender "1") up to 59, women (gender "2") up to 54 - some jobs
#    are closed to people past working age (plus age / health considerations);
#  * weekly working hours between 20 and 60 inclusive;
#  * strictly positive wage (per-industry wage outliers are handled later).
df <- df %>%
  filter(
    (gender == "1" & age <= 59) | (gender == "2" & age <= 54),
    hours_per_week >= 20 & hours_per_week <= 60,
    wage > 0
  )

# Descriptive statistics of the numeric variables before wage outliers are
# removed; the `vars` and `trimmed` columns of describe() are not needed.
table_opis1 <- df %>%
  select(wage, wage_to_average, hours_per_week, age, experience, employees, children_18) %>%
  describe(quant = c(0.25, 0.75), omit = TRUE) %>%
  select(-c(vars, trimmed))
view(table_opis1)
# write.csv(table_opis1, "table_opis_before.csv")
# stargazer(table_opis1, type = "latex", summary = F)

Боксплот зп до удаления выбросов по зп

# Horizontal boxplot of individual wages (thousand roubles) before the
# per-industry outlier removal.
ggplot(df) +
  geom_boxplot(aes(x = wage / 1000)) +
  xlab("Среднемесячная заработная плата индивида, тыс.руб.") +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Удаляем выбросы внутри отраслей

# Remove wage outliers separately inside every industry.
# The data are split by `otrasl`, `delete_outliers()` is applied to each piece
# and the pieces are stacked back together. Iterating over the list itself
# (instead of a hard-coded 1:28) keeps this correct for any number of
# industries present in the data.
X <- df %>%
  group_split(otrasl) %>%
  lapply(delete_outliers, column = "wage")
df <- bind_rows(X)

Выбор “полезных” переменных

# Keep only the variables needed for the tables, plots and models below.
df <- select(
  df,
  wage, wage_to_average, skolko_gosva, otrasl, educ, gender,
  fam_status, children_18, hours_per_week, experience, age, employees
)
# write.csv(df, 'D:/Documents/codes_R/metrika_proj/df_itog.csv')

2 ТАБЛИЦА ОПИСАТЕЛЬНЫХ СТАТИСТИК ЧИСЛОВЫХ ПЕРЕМЕННЫХ 1

Таблица описательных статистик

# Descriptive statistics of the numeric variables after outlier removal
# (same layout as the earlier table: `vars` and `trimmed` are dropped).
table_opis1 <- df %>%
  select(wage, wage_to_average, hours_per_week, age, experience, employees, children_18) %>%
  describe(quant = c(0.25, 0.75), omit = TRUE) %>%
  select(-c(vars, trimmed))

# Render the table as a styled HTML table with the variable names in bold.
styled_opis1 <- kable(
  table_opis1,
  caption = "Таблица характеристик числовых данных после удаления выбросов"
)
styled_opis1 <- kable_styling(
  styled_opis1,
  bootstrap_options = c("striped", "hover", "responsive"),
  full_width = F,
  position = "center"
)
column_spec(styled_opis1, 1, bold = T)

Таблица характеристик числовых данных после удаления выбросов
	n	mean	sd	median	mad	min	max	range	skew	kurtosis	se	Q0.25	Q0.75
wage	1581	26998.961	14086.026	25000.000	11860.800	4000.000	120000.00	116000.0	1.600	4.974	354.260	17000.000	35000.00
wage_to_average	1581	0.938	0.419	0.874	0.413	0.152	2.35	2.2	0.739	0.201	0.011	0.614	1.18
hours_per_week	1581	41.333	6.869	40.000	1.483	20.000	60.00	40.0	0.076	1.890	0.173	40.000	45.00
age	1581	41.827	8.449	42.000	10.378	21.000	59.00	38.0	0.014	-0.936	0.212	35.000	49.00
experience	1581	19.091	9.310	18.167	10.749	0.167	42.00	41.8	0.194	-0.891	0.234	11.833	26.42
employees	1581	373.765	1604.472	50.000	59.304	1.000	25000.00	24999.0	10.882	141.668	40.352	15.000	150.00
children_18	1581	0.997	0.917	1.000	1.483	0.000	7.00	7.0	0.936	2.031	0.023	0.000	2.00

# write.csv(table_opis1, "D:/Documents/codes_R/metrika_proj/table_opis1.csv")
# stargazer(table_opis1, type = "latex", summary = F)

Проводим тесты между основной объясняющей переменной и контрольными, чтобы убедиться в правильности нашего интуитивного понимания механизмов.

# Association checks between the main explanatory variable `skolko_gosva` and
# each candidate control: one-way ANOVA (aov) for numeric controls, Pearson's
# chi-squared test for categorical ones. The trailing comment on each call
# records the decision taken ("keep" / "do not keep" as a control).
# Lines starting with `##` are the printed results of the call above them.
aov(employees ~ skolko_gosva, data = df) %>% summary() # keep

##                Df     Sum Sq  Mean Sq F value  Pr(>F)    
## skolko_gosva    2   44590361 22295181    8.75 0.00017 ***
## Residuals    1578 4022850219  2549335                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(experience ~ skolko_gosva, data = df) %>% summary() # keep

##                Df Sum Sq Mean Sq F value  Pr(>F)    
## skolko_gosva    2   1572     786    9.16 0.00011 ***
## Residuals    1578 135365      86                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(age ~ skolko_gosva, data = df) %>% summary() # keep

##                Df Sum Sq Mean Sq F value Pr(>F)   
## skolko_gosva    2    721     360    5.07 0.0064 **
## Residuals    1578 112063      71                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(hours_per_week ~ skolko_gosva, data = df) %>% summary() # keep

##                Df Sum Sq Mean Sq F value              Pr(>F)    
## skolko_gosva    2   4686    2343    52.9 <0.0000000000000002 ***
## Residuals    1578  69855      44                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(children_18 ~ skolko_gosva, data = df) %>% summary() # do not keep

##                Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva    2      0   0.184    0.22    0.8
## Residuals    1578   1329   0.842

chisq.test(df$skolko_gosva, df$gender) # keep

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$gender
## X-squared = 77, df = 2, p-value <0.0000000000000002

chisq.test(df$skolko_gosva, df$educ) # keep

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$educ
## X-squared = 19, df = 2, p-value = 0.00006

chisq.test(df$skolko_gosva, df$otrasl) # keep

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$otrasl
## X-squared = 981, df = 52, p-value <0.0000000000000002

chisq.test(df$skolko_gosva, df$fam_status) # do not keep

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$fam_status
## X-squared = 8, df = 10, p-value = 0.6

3 ГРАФИКИ

Столбчатый график “Доля респондентов с высшим образованием”

# Stacked 100% bar chart: share of respondents with / without higher education
# (`educ`) within each level of state participation (`skolko_gosva`).
ggplot(df) +
  geom_bar(aes(x = skolko_gosva, fill = educ), color = "black", position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(
    y = "Доля респондентов с высшим образованием",
    x = "Участие государства в капитале компании"
  ) +
  scale_fill_manual(
    values = c("gray", "lightblue"),
    name = "Наличие\nвысшего\nобразования",
    labels = c("Нет", "Есть")
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Столбчатый график “Доля респондентов, находящихся в браке”

# Marriage dummy: "1" when `fam_status` is "2" or "6", "0" otherwise.
# NOTE(review): presumably codes 2 and 6 are the "married" answers of the
# questionnaire - confirm against the codebook.
df <- df %>%
  mutate(
    if_married = as.factor(
      ifelse(fam_status == "2" | fam_status == "6", "1", "0")
    )
  )

# Stacked 100% bar chart: share of married / not married respondents within
# each level of state participation.
ggplot(df) +
  geom_bar(aes(x = skolko_gosva, fill = if_married), color = "black", position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(
    y = "Доля респондентов по семейному статусу",
    x = "Участие государства в капитале компании",
    title = NULL
  ) +
  scale_fill_manual(
    values = c("gray", "lightblue"),
    name = "В браке",
    labels = c("Нет", "Да")
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Столбчатый график “Распределение респондентов по полу”

# Stacked 100% bar chart: gender composition within each level of state
# participation. The fill labels follow the factor level order of `gender`
# ("1" is labelled male, "2" female).
ggplot(df) +
  geom_bar(aes(x = skolko_gosva, fill = gender), color = "black", position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(
    y = "Пол респондента",
    x = "Участие государства в капитале компании",
    title = NULL
  ) +
  scale_fill_manual(
    values = c("gray", "lightblue"),
    name = "Пол",
    labels = c("Мужской", "Женский")
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Столбчатый график “Распределение skolko_gosva по отраслям”

# Stacked 100% bar chart: composition of state participation (`skolko_gosva`)
# within each industry (`otrasl`).
ggplot(df) +
  geom_bar(aes(x = otrasl, fill = skolko_gosva), color = "black", position = "fill") +
  scale_y_continuous(labels = percent) +
  labs(
    y = "Участие государства в капитале компании",
    x = "Отрасль",
    title = NULL
  ) +
  scale_fill_manual(
    values = c("lightblue", "skyblue3", "dodgerblue4"),
    name = "Участие\nгосударства",
    labels = c("частные", "смешанные", "государственные")
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Плотность 3 графика по относительной зп{скачали}

# Density of the relative wage, one facet per level of state participation.
ggplot(df) +
  geom_density(aes(x = wage_to_average)) +
  facet_wrap(vars(skolko_gosva), labeller = label_both) +
  xlab("Отношение средней заработной платы индивида к средней зп по отрасли") +
  ylab("Плотность") +
  ggtitle("Плотность относительной зп по факту участия гос-ва") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align the title to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 11),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame, drawn once in every facet. annotate() draws a single
  # rectangle; a geom_rect() layer with aes() would inherit the plot data and
  # draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Плотность заработной платы

# Density of the wage in levels (thousand roubles) ...
plot1 <- ggplot(df) +
  geom_density(aes(x = wage / 1000)) +
  xlab("Среднемесячная заработная плата респондента, тыс.руб") +
  ylab("Плотность") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align the title to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

# ... and of its natural logarithm (no y-axis title: it shares the one of plot1).
plot2 <- ggplot(df) +
  geom_density(aes(x = log(wage))) +
  xlab("Логарифм среднемесячной заработной платы респондента") +
  ylab("") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align the title to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame, drawn once (see plot1).
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

# Show both densities side by side.
ggarrange(plot1, plot2, ncol = 2)

Плотность распределения количества сотрудников

# Density of the number of employees in the respondent's company.
# The plot is stored in `plot3`; it is not printed by this chunk.
plot3 <- ggplot(df) +
  geom_density(aes(x = employees)) +
  xlab("Количество сотрудников в компании, чел.") +
  ylab("Плотность") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align the title to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Плотность распределения относительной заработной платы

# Density of the relative wage (respondent's wage / industry average) for the
# whole sample.
ggplot(df) +
  geom_density(aes(x = wage_to_average)) +
  xlab("Отношение среднемесячной зарплаты респондента к средней по отрасли") +
  ylab("Плотность") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align the title to the whole plot
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Боксплот 3 коробки, отношение средней заработной платы индивида к средней зп по отрасли по skolko_gosva{скачали}

# Boxplots of the relative wage, one box per level of state participation.
ggplot(df) +
  geom_boxplot(aes(x = skolko_gosva, y = wage_to_average)) +
  xlab("Участие государства") +
  ylab("Отношение средней зарплаты индивида к средней зарплате по отрасли") +
  theme(
    plot.title = element_text(hjust = 0.5), # centre the title
    plot.title.position = "plot", # align the title to the whole plot
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"), # panel background
    panel.grid = element_line(color = "grey") # grid lines
  ) +
  # Panel frame. annotate() draws a single rectangle; a geom_rect() layer with
  # aes() would inherit the plot data and draw one rectangle per data row.
  annotate(
    "rect",
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf,
    colour = "black", size = 0.5, fill = NA
  )

Рассеивания по стажу{скачали}

# Scatter plot of log wage against work experience with a loess trend line
# (no confidence band).
ggplot(data = df, mapping = aes(x = experience, y = log(wage))) +
  geom_point(color = "red", alpha = 0.4) +
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  xlab("Стаж работы, лет") +
  ylab("Логарифм среднемесячной заработной платы") +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

Рассеивания по возрасту

# Scatter plot of log wage against age with a loess trend line (no confidence
# band). The y aesthetic is log(wage) so that it agrees with the axis label
# and with the other two scatter plots of this section.
ggplot(df, aes(x = age, y = log(wage))) +
  geom_point(color = "red", alpha = 0.4) +
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  labs(
    x = "Возраст, лет",
    y = "Логарифм среднемесячной заработной платы"
  ) +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

Рассеивания по количеству сотрудников

# Scatter plot of log wage against company size (number of employees) with a
# loess trend line (no confidence band).
ggplot(data = df, mapping = aes(x = employees, y = log(wage))) +
  geom_point(color = "red", alpha = 0.4) +
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  xlab("Количество сотрудников в компании, чел.") +
  ylab("Логарифм среднемесячной заработной платы") +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

4 РЕГРЕССИОННЫЙ АНАЛИЗ

Итоговый дф

# Final data set: the two wage variables, the main regressor and all controls.
df_itog <- select(
  df,
  wage_to_average, wage, skolko_gosva, gender, educ, age, experience,
  hours_per_week, employees, fam_status, children_18, otrasl
)

# write.csv(df_itog, "D:/Documents/codes_R/metrika_proj/df_itog.csv")

Модель 1

# Model evolution: log wage regressed on state participation, adding controls
# step by step (human capital -> firm size -> demographics -> industry ->
# quadratic terms -> interaction of state participation with gender).
model1_1 <- lm(log(wage) ~ skolko_gosva, data = df)
model1_2 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week, data = df)
model1_3 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ, data = df)
model1_4 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees), data = df)
model1_5 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18, data = df)
model1_6 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl, data = df)
model1_7 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2), data = df)
model1_8 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender, data = df)

# Models 9 and 10 repeat models 7 and 8 with a different base level of
# `skolko_gosva`: "2" (fully state-owned) instead of "0". The level is given
# by name so the choice does not depend on the position of the level.
df_1 <- within(df, skolko_gosva <- relevel(skolko_gosva, ref = "2"))
model1_9 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2), data = df_1)
model1_10 <- lm(log(wage) ~ skolko_gosva + experience + hours_per_week + educ + age + log(employees) + gender + fam_status + children_18 + otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender, data = df_1)

# Coefficient tests with heteroskedasticity-consistent (HC0) standard errors;
# df = Inf requests the normal approximation instead of the t distribution.
coef1_1 <- coeftest(model1_1, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_2 <- coeftest(model1_2, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_3 <- coeftest(model1_3, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_4 <- coeftest(model1_4, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_5 <- coeftest(model1_5, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_6 <- coeftest(model1_6, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_7 <- coeftest(model1_7, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_8 <- coeftest(model1_8, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_9 <- coeftest(model1_9, df = Inf, vcov. = vcovHC, type = "HC0")
coef1_10 <- coeftest(model1_10, df = Inf, vcov. = vcovHC, type = "HC0")

# Column order in the printed table: each specification is followed by its
# re-levelled twin, so columns (7)-(10) hold models 7, 9, 8 and 10.
stargazer(coef1_1, coef1_2, coef1_3, coef1_4, coef1_5, coef1_6, coef1_7, coef1_9, coef1_8, coef1_10, summary = FALSE, type = "text")

## 
## =========================================================================================================================
##                                                               Dependent variable:                                        
##                       ---------------------------------------------------------------------------------------------------
##                                                                                                                          
##                          (1)       (2)       (3)       (4)       (5)       (6)       (7)       (8)       (9)      (10)   
## -------------------------------------------------------------------------------------------------------------------------
## skolko_gosva0                                                                               0.145***            0.206*** 
##                                                                                              (0.030)             (0.042) 
##                                                                                                                          
## skolko_gosva1           0.019     0.022     0.012   -0.104**  -0.094**  -0.122*** -0.120***   0.025    -0.107*    0.100  
##                        (0.050)   (0.050)   (0.048)   (0.047)   (0.044)   (0.044)   (0.043)   (0.047)   (0.056)   (0.063) 
##                                                                                                                          
## skolko_gosva2         -0.230*** -0.198*** -0.225*** -0.239*** -0.189*** -0.145*** -0.145***           -0.206***          
##                        (0.026)   (0.027)   (0.026)   (0.024)   (0.024)   (0.030)   (0.030)             (0.042)           
##                                                                                                                          
## experience                      0.005***  0.006***  0.014***  0.013***  0.013***  0.022***  0.022***  0.022***  0.022*** 
##                                  (0.001)   (0.001)   (0.002)   (0.002)   (0.002)   (0.007)   (0.007)   (0.007)   (0.007) 
##                                                                                                                          
## hours_per_week                  0.012***  0.014***  0.013***  0.010***  0.009***  0.009***  0.009***  0.009***  0.009*** 
##                                  (0.002)   (0.002)   (0.002)   (0.002)   (0.002)   (0.002)   (0.002)   (0.002)   (0.002) 
##                                                                                                                          
## educ1                                     0.278***  0.256***  0.278***  0.255***  0.248***  0.248***  0.248***  0.248*** 
##                                            (0.025)   (0.024)   (0.023)   (0.023)   (0.023)   (0.023)   (0.023)   (0.023) 
##                                                                                                                          
## age                                                 -0.012*** -0.012*** -0.010***   0.026     0.026     0.023     0.023  
##                                                      (0.003)   (0.003)   (0.003)   (0.018)   (0.018)   (0.018)   (0.018) 
##                                                                                                                          
## log(employees)                                      0.071***  0.062***  0.058***  0.057***  0.057***  0.056***  0.056*** 
##                                                      (0.007)   (0.006)   (0.007)   (0.007)   (0.007)   (0.007)   (0.007) 
##                                                                                                                          
## gender2                                                       -0.272*** -0.248*** -0.263*** -0.263*** -0.292*** -0.186***
##                                                                (0.025)   (0.026)   (0.026)   (0.026)   (0.030)   (0.046) 
##                                                                                                                          
## fam_status2                                                   -0.119**  -0.128**  -0.130**  -0.130**  -0.131**  -0.131** 
##                                                                (0.054)   (0.053)   (0.052)   (0.052)   (0.052)   (0.052) 
##                                                                                                                          
## fam_status3                                                   -0.152**  -0.158*** -0.158*** -0.158*** -0.161*** -0.161***
##                                                                (0.059)   (0.058)   (0.057)   (0.057)   (0.057)   (0.057) 
##                                                                                                                          
## fam_status4                                                    -0.048    -0.074    -0.080    -0.080    -0.083    -0.083  
##                                                                (0.062)   (0.061)   (0.060)   (0.060)   (0.060)   (0.060) 
##                                                                                                                          
## fam_status5                                                    -0.016    -0.068    -0.064    -0.064    -0.062    -0.062  
##                                                                (0.078)   (0.076)   (0.075)   (0.075)   (0.076)   (0.076) 
##                                                                                                                          
## fam_status6                                                    -0.088    -0.066    -0.087    -0.087    -0.099    -0.099  
##                                                                (0.146)   (0.158)   (0.165)   (0.165)   (0.170)   (0.170) 
##                                                                                                                          
## children_18                                                    0.026*    0.025*     0.010     0.010     0.011     0.011  
##                                                                (0.015)   (0.014)   (0.014)   (0.014)   (0.014)   (0.014) 
##                                                                                                                          
## otrasl2                                                                   0.082     0.085     0.085     0.075     0.075  
##                                                                          (0.063)   (0.062)   (0.062)   (0.062)   (0.062) 
##                                                                                                                          
## otrasl3                                                                   0.071     0.082     0.082     0.083     0.083  
##                                                                          (0.076)   (0.079)   (0.079)   (0.079)   (0.079) 
##                                                                                                                          
## otrasl4                                                                 0.348***  0.363***  0.363***  0.361***  0.361*** 
##                                                                          (0.083)   (0.081)   (0.081)   (0.081)   (0.081) 
##                                                                                                                          
## otrasl5                                                                  0.110*    0.117*    0.117*    0.109*    0.109*  
##                                                                          (0.060)   (0.061)   (0.061)   (0.060)   (0.060) 
##                                                                                                                          
## otrasl6                                                                 0.262***  0.262***  0.262***  0.252***  0.252*** 
##                                                                          (0.059)   (0.059)   (0.059)   (0.058)   (0.058) 
##                                                                                                                          
## otrasl7                                                                 0.156***  0.161***  0.161***  0.157***  0.157*** 
##                                                                          (0.054)   (0.053)   (0.053)   (0.053)   (0.053) 
##                                                                                                                          
## otrasl8                                                                 -0.148**   -0.133*   -0.133*   -0.141*   -0.141* 
##                                                                          (0.073)   (0.074)   (0.074)   (0.074)   (0.074) 
##                                                                                                                          
## otrasl9                                                                   0.090     0.091     0.091     0.074     0.074  
##                                                                          (0.086)   (0.086)   (0.086)   (0.087)   (0.087) 
##                                                                                                                          
## otrasl10                                                                 -0.030    -0.021    -0.021    -0.048    -0.048  
##                                                                          (0.053)   (0.053)   (0.053)   (0.055)   (0.055) 
##                                                                                                                          
## otrasl11                                                                  0.075     0.076     0.076     0.060     0.060  
##                                                                          (0.079)   (0.080)   (0.080)   (0.079)   (0.079) 
##                                                                                                                          
## otrasl12                                                                 0.128**  0.132***  0.132***   0.111**   0.111** 
##                                                                          (0.051)   (0.051)   (0.051)   (0.052)   (0.052) 
##                                                                                                                          
## otrasl13                                                                  0.098     0.094     0.094     0.104     0.104  
##                                                                          (0.074)   (0.073)   (0.073)   (0.074)   (0.074) 
##                                                                                                                          
## otrasl14                                                                 0.115**  0.120***  0.120***  0.118***  0.118*** 
##                                                                          (0.045)   (0.045)   (0.045)   (0.044)   (0.044) 
##                                                                                                                          
## otrasl15                                                                 0.170**  0.179***  0.179***   0.172**   0.172** 
##                                                                          (0.068)   (0.068)   (0.068)   (0.069)   (0.069) 
##                                                                                                                          
## otrasl16                                                                0.234***  0.243***  0.243***  0.234***  0.234*** 
##                                                                          (0.072)   (0.069)   (0.069)   (0.069)   (0.069) 
##                                                                                                                          
## otrasl17                                                                -0.177*** -0.168*** -0.168*** -0.167*** -0.167***
##                                                                          (0.056)   (0.055)   (0.055)   (0.055)   (0.055) 
##                                                                                                                          
## otrasl18                                                                  0.369    0.394*    0.394*    0.397*    0.397*  
##                                                                          (0.235)   (0.223)   (0.223)   (0.219)   (0.219) 
##                                                                                                                          
## otrasl20                                                                  0.069     0.065     0.065     0.040     0.040  
##                                                                          (0.114)   (0.114)   (0.114)   (0.118)   (0.118) 
##                                                                                                                          
## otrasl21                                                                0.403***  0.420***  0.420***  0.415***  0.415*** 
##                                                                          (0.126)   (0.127)   (0.127)   (0.126)   (0.126) 
##                                                                                                                          
## otrasl23                                                                0.401***  0.433***  0.433***   0.440**   0.440** 
##                                                                          (0.138)   (0.164)   (0.164)   (0.175)   (0.175) 
##                                                                                                                          
## otrasl24                                                                  0.003     0.070     0.070     0.065     0.065  
##                                                                          (0.166)   (0.174)   (0.174)   (0.184)   (0.184) 
##                                                                                                                          
## otrasl25                                                                 0.235**   0.229**   0.229**   0.228**   0.228** 
##                                                                          (0.094)   (0.098)   (0.098)   (0.106)   (0.106) 
##                                                                                                                          
## otrasl26                                                                 -0.020    -0.027    -0.027    -0.053    -0.053  
##                                                                          (0.123)   (0.125)   (0.125)   (0.130)   (0.130) 
##                                                                                                                          
## otrasl27                                                                 0.552**   0.561**   0.561**   0.547**   0.547** 
##                                                                          (0.265)   (0.252)   (0.252)   (0.248)   (0.248) 
##                                                                                                                          
## otrasl29                                                                -0.486*** -0.470*** -0.470*** -0.460*** -0.460***
##                                                                          (0.040)   (0.040)   (0.040)   (0.040)   (0.040) 
##                                                                                                                          
## otrasl30                                                                  0.134     0.140     0.140     0.130     0.130  
##                                                                          (0.114)   (0.114)   (0.114)   (0.113)   (0.113) 
##                                                                                                                          
## I(experience2)                                                                     -0.0002   -0.0002   -0.0002   -0.0002 
##                                                                                   (0.0002)  (0.0002)  (0.0002)  (0.0002) 
##                                                                                                                          
## I(age2)                                                                           -0.0004** -0.0004** -0.0004*  -0.0004* 
##                                                                                   (0.0002)  (0.0002)  (0.0002)  (0.0002) 
##                                                                                                                          
## skolko_gosva0:gender2                                                                                           -0.107** 
##                                                                                                                  (0.052) 
##                                                                                                                          
## skolko_gosva1:gender2                                                                                  -0.023    -0.130  
##                                                                                                        (0.081)   (0.088) 
##                                                                                                                          
## skolko_gosva2:gender2                                                                                  0.107**           
##                                                                                                        (0.052)           
##                                                                                                                          
## Constant              10.200*** 9.570***  9.360***  9.480***  9.880***  9.740***  9.010***  8.870***  9.090***  8.880*** 
##                        (0.017)   (0.094)   (0.093)   (0.117)   (0.135)   (0.141)   (0.345)   (0.344)   (0.347)   (0.346) 
##                                                                                                                          
## =========================================================================================================================
## =========================================================================================================================
## Note:                                                                                         *p<0.1; **p<0.05; ***p<0.01

#stargazer(model1_1, model1_2, model1_3, model1_4, model1_5, model1_6, model1_7, model1_9, model1_8, model1_10, summary = F, type = "text", keep.stat=c("adj.rsq", "n"))

Тест Бройша–Пагана (стьюдентизированный) на гетероскедастичность для двух моделей

# Heteroskedasticity test for the log(wage) specification in which firm size
# enters as log(employees).
# Original note: with the logarithm BP = 101, which is lower and better.
bptest(
  formula = log(wage) ~ skolko_gosva + experience + hours_per_week + educ +
    age + log(employees) + gender + fam_status + children_18 + otrasl +
    I(experience^2) + I(age^2) + skolko_gosva:gender,
  data = df
)

## 
##  studentized Breusch-Pagan test
## 
## data:  log(wage) ~ skolko_gosva + experience + hours_per_week + educ +     age + log(employees) + gender + fam_status + children_18 +     otrasl + I(experience^2) + I(age^2) + skolko_gosva:gender
## BP = 101, df = 44, p-value = 0.000002

# Heteroskedasticity test for the same log(wage) specification, but with
# firm size entering in levels (employees) instead of log(employees).
# Original note: without the logarithm BP = 122.
bptest(
  formula = log(wage) ~ skolko_gosva + experience + hours_per_week + educ +
    age + employees + gender + fam_status + children_18 + otrasl +
    I(experience^2) + I(age^2) + skolko_gosva:gender,
  data = df
)

## 
##  studentized Breusch-Pagan test
## 
## data:  log(wage) ~ skolko_gosva + experience + hours_per_week + educ +     age + employees + gender + fam_status + children_18 + otrasl +     I(experience^2) + I(age^2) + skolko_gosva:gender
## BP = 122, df = 44, p-value = 0.000000003

Модель 2

# Model 2: OLS regressions with wage_to_average as the dependent variable.
#   model2_1 / model2_3 - main effects only
#   model2_2 / model2_4 - main effects plus the skolko_gosva:gender interaction
# Formulas are kept inline in each lm() call (not stored in a variable) so that
# model$call$formula stays a literal formula for any downstream consumer.

# Coefficient tests with HC0 heteroskedasticity-consistent standard errors.
# df = Inf makes coeftest() use the normal approximation (z tests).
robust_coeftest_hc0 <- function(model) {
  coeftest(model, df = Inf, vcov. = vcovHC, type = "HC0")
}

model2_1 <- lm(
  wage_to_average ~ skolko_gosva + experience + hours_per_week + educ + age +
    log(employees) + gender + fam_status + children_18 +
    I(experience^2) + I(age^2),
  data = df
)
model2_2 <- lm(
  wage_to_average ~ skolko_gosva + experience + hours_per_week + educ + age +
    log(employees) + gender + fam_status + children_18 +
    I(experience^2) + I(age^2) + skolko_gosva:gender,
  data = df
)

# Same two specifications estimated on df_1 (built elsewhere in the file).
# Original note: the base (reference) level of skolko_gosva is changed here.
model2_3 <- lm(
  wage_to_average ~ skolko_gosva + experience + hours_per_week + educ + age +
    log(employees) + gender + fam_status + children_18 +
    I(experience^2) + I(age^2),
  data = df_1
)
model2_4 <- lm(
  wage_to_average ~ skolko_gosva + experience + hours_per_week + educ + age +
    log(employees) + gender + fam_status + children_18 +
    I(experience^2) + I(age^2) + skolko_gosva:gender,
  data = df_1
)

coef2_1 <- robust_coeftest_hc0(model2_1)
coef2_2 <- robust_coeftest_hc0(model2_2)
coef2_3 <- robust_coeftest_hc0(model2_3)
coef2_4 <- robust_coeftest_hc0(model2_4)

# Column order is deliberate: each specification on df is placed next to its
# df_1 counterpart (1 vs 3 without interaction, then 2 vs 4 with interaction).
# NOTE(review): the rendered table shows no fit-statistics rows for these
# coeftest inputs, so keep.stat appears to have no visible effect - confirm.
stargazer(
  coef2_1, coef2_3, coef2_2, coef2_4,
  summary = FALSE,
  type = "text",
  keep.stat = c("adj.rsq")
)

## 
## =============================================================
##                                 Dependent variable:          
##                       ---------------------------------------
##                                                              
##                          (1)       (2)       (3)       (4)   
## -------------------------------------------------------------
## skolko_gosva0                   0.094***            0.187*** 
##                                  (0.021)             (0.036) 
##                                                              
## skolko_gosva1         -0.137***  -0.043   -0.138**    0.049  
##                        (0.040)   (0.041)   (0.059)   (0.063) 
##                                                              
## skolko_gosva2         -0.094***           -0.187***          
##                        (0.021)             (0.036)           
##                                                              
## experience            0.021***  0.021***  0.021***  0.021*** 
##                        (0.006)   (0.006)   (0.006)   (0.006) 
##                                                              
## hours_per_week        0.007***  0.007***  0.007***  0.007*** 
##                        (0.001)   (0.001)   (0.001)   (0.001) 
##                                                              
## educ1                 0.222***  0.222***  0.221***  0.221*** 
##                        (0.021)   (0.021)   (0.021)   (0.021) 
##                                                              
## age                     0.011     0.011     0.008     0.008  
##                        (0.015)   (0.015)   (0.015)   (0.015) 
##                                                              
## log(employees)        0.036***  0.036***  0.036***  0.036*** 
##                        (0.006)   (0.006)   (0.006)   (0.006) 
##                                                              
## gender2               -0.187*** -0.187*** -0.234*** -0.091***
##                        (0.022)   (0.022)   (0.027)   (0.035) 
##                                                              
## fam_status2            -0.079*   -0.079*   -0.084*   -0.084* 
##                        (0.045)   (0.045)   (0.046)   (0.046) 
##                                                              
## fam_status3           -0.129*** -0.129*** -0.134*** -0.134***
##                        (0.050)   (0.050)   (0.050)   (0.050) 
##                                                              
## fam_status4            -0.049    -0.049    -0.056    -0.056  
##                        (0.053)   (0.053)   (0.053)   (0.053) 
##                                                              
## fam_status5            -0.043    -0.043    -0.045    -0.045  
##                        (0.069)   (0.069)   (0.070)   (0.070) 
##                                                              
## fam_status6            -0.079    -0.079    -0.095    -0.095  
##                        (0.163)   (0.163)   (0.173)   (0.173) 
##                                                              
## children_18             0.014     0.014     0.014     0.014  
##                        (0.013)   (0.013)   (0.013)   (0.013) 
##                                                              
## I(experience2)        -0.0003*  -0.0003*  -0.0003*  -0.0003* 
##                       (0.0001)  (0.0001)  (0.0001)  (0.0001) 
##                                                              
## I(age2)                -0.0002   -0.0002   -0.0002   -0.0002 
##                       (0.0002)  (0.0002)  (0.0002)  (0.0002) 
##                                                              
## skolko_gosva0:gender2                               -0.142***
##                                                      (0.043) 
##                                                              
## skolko_gosva1:gender2                       0.002    -0.140* 
##                                            (0.078)   (0.082) 
##                                                              
## skolko_gosva2:gender2                     0.142***           
##                                            (0.043)           
##                                                              
## Constant                0.315     0.222     0.397     0.211  
##                        (0.286)   (0.285)   (0.288)   (0.286) 
##                                                              
## =============================================================
## =============================================================
## Note:                             *p<0.1; **p<0.05; ***p<0.01

# stargazer(model2_1, model2_3, model2_2, model2_4, summary = F, type = "text", keep.stat=c("adj.rsq", "n"))

Данные для критики

# names(attributes(df_data$region)$labels) # список регионов(названия)
# attributes(df_data$region)$labels # список регионов с номерами