1 ПОДГОТОВКА

Настройки чанков

Библиотеки, настройки чисел

Загрузка данных

# Survey data, Oct 2019 - Jan 2020.
# Alternative file locations used on other machines:
#   df_dirty = read_spss('r28i_os_42.sav')
#   df_dirty = read_spss('D:/Documents/codes_R/metrika_proj/r28i_os_42_new.sav') # Anton
# NOTE(review): the active path is absolute and machine-specific (Nastya's).
df_dirty <- read_spss(file = 'C:/Users/ASUS/Downloads/r28i_os_42.sav')

Функции для поиска

# функции для поиска
# Collect the variable label of every column of a data frame.
#
# Args:
#   datafr: a data frame whose columns may carry a "label" attribute
#           (presumably as produced by read_spss -- confirm).
#
# Returns:
#   An unnamed character vector with exactly one element per column, in column
#   order. Columns without a "label" attribute yield NA, so position i always
#   corresponds to column i.
vse_atributi <- function(datafr) {
  vapply(
    datafr,
    function(stolbec) {
      # Ask for the label by name: relying on "the first attribute" breaks as
      # soon as a column has no label or stores another attribute first.
      podpis <- attr(stolbec, "label", exact = TRUE)
      if (is.null(podpis)) {
        NA_character_
      } else {
        paste(as.character(podpis), collapse = " ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# Print every column whose variable label matches a search pattern.
#
# Args:
#   datafr: a data frame whose columns may carry a "label" attribute.
#   stroka: search pattern (a regular expression); matched case-insensitively.
#
# Output:
#   One line per matching column, written with cat() in the form
#   "[column_name]{column_index}: label". Returns NULL invisibly.
podpis_search <- function(datafr, stroka) {
  # One label per column (NA when a column has none), so that index i in
  # `podpisi` always refers to column i of `datafr`.
  podpisi <- vapply(
    datafr,
    function(stolbec) {
      podpis <- attr(stolbec, "label", exact = TRUE)
      if (is.null(podpis)) {
        NA_character_
      } else {
        paste(as.character(podpis), collapse = " ")
      }
    },
    character(1),
    USE.NAMES = FALSE
  )
  imena_stolb <- colnames(datafr)

  # Ignore case on both sides; NA labels never match.
  num <- grep(as.character(stroka), podpisi, ignore.case = TRUE)

  for (i in num) {
    cat(
      podpisi[i],
      sep = "\n",
      fill = TRUE,
      labels = paste0("[", imena_stolb[i], "]", "{", i, "}:")
    )
  }
  invisible(NULL)
}

# vse_atributi(df)
# podpis_search(df_dirty, "тип")

Функция удаления выбросов

# функция удаления выбросов
# Remove rows whose value in `column` lies outside the 1.5 * IQR fences.
#
# Args:
#   datafr: a data frame (or tibble).
#   column: name of a numeric column, given as a string.
#
# Returns:
#   `datafr` restricted to rows with
#   Q1 - 1.5 * IQR <= value <= Q3 + 1.5 * IQR.
#   Values lying exactly on a fence are kept, so a constant column (IQR = 0)
#   is returned unchanged instead of being emptied. Rows with NA in `column`
#   are dropped, and NA no longer makes the quantile computation fail.
delete_outliers <- function(datafr, column) {
  znacheniya <- datafr[[column]]

  # Both quartiles in one call; their difference is the IQR.
  kvartili <- quantile(
    znacheniya,
    probs = c(0.25, 0.75),
    na.rm = TRUE,
    names = FALSE
  )
  razmah <- kvartili[2] - kvartili[1]
  lower <- kvartili[1] - 1.5 * razmah
  upper <- kvartili[2] + 1.5 * razmah

  ostavit <- !is.na(znacheniya) & znacheniya >= lower & znacheniya <= upper
  datafr[ostavit, , drop = FALSE]
}

Переименовываем

# Give the questionnaire variables readable names (new_name = old_name).
df_data <- df_dirty %>%
  rename(
    lvl_educ = x_diplom,
    otrasl = xj4.1,
    gender = xh5,
    hours_per_week = xj6.2,
    employees = xj13,
    wage = xj13.2,
    if_government = xj23,
    if_foreigners = xj24,
    if_private = xj25,
    if_yours = xj26,
    exper_y = xj161.3y,
    exper_month = xj161.3m,
    age = x_age,
    fam_status = x_marst,
    children_18 = xj72.173
  )

Выбираем нужные переменные

# Keep only the variables used in the analysis.
df_data <- select(
  df_data,
  lvl_educ, otrasl, gender, hours_per_week, wage,
  if_government, if_foreigners, if_private, if_yours,
  exper_y, exper_month, age, employees, fam_status, children_18
)

Чистка выбросов

# Drop "don't know" / "refused to answer" responses: keep only rows where every
# variable is below 9e6. Rows where a comparison is NA are dropped by filter()
# as well.
df_data <- df_data %>%
  filter(
    lvl_educ < 9e6,
    otrasl < 9e6,
    gender < 9e6,
    hours_per_week < 9e6,
    wage < 9e6,
    if_government < 9e6,
    if_foreigners < 9e6,
    if_private < 9e6,
    if_yours < 9e6,
    exper_y < 9e6,
    exper_month < 9e6,
    age < 9e6,
    employees < 9e6,
    fam_status < 9e6,
    children_18 < 9e6
  )

# Drop rows that still contain missing values.
df <- na.omit(df_data)

Боксплот зарплат по отраслям

# Wage (thousand roubles) by industry code, one box per industry.
ggplot(df_data) +
  geom_boxplot(aes(x = as.factor(otrasl), y = as.numeric(wage) / 1000)) +
  labs(
    x = "Номер отрасли",
    y = "Зарплата, тыс.руб."
  ) +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame around the panel. A theme element draws it once; the previous
    # geom_rect(aes(...)) layer drew one full-panel rectangle per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Меняем формат

# Convert every column to the type used in the analysis:
# categorical variables become factors, counts become integers, the rest numeric.
df <- df %>%
  mutate(
    lvl_educ = as.factor(lvl_educ),
    otrasl = as.factor(otrasl),
    gender = as.factor(gender),
    fam_status = as.factor(fam_status),
    if_government = as.factor(if_government),
    if_foreigners = as.factor(if_foreigners),
    if_private = as.factor(if_private),
    if_yours = as.factor(if_yours),
    employees = as.integer(employees),
    children_18 = as.integer(children_18),
    wage = as.numeric(wage),
    hours_per_week = as.numeric(hours_per_week),
    exper_y = as.numeric(exper_y),
    exper_month = as.numeric(exper_month),
    age = as.numeric(age)
  )

Создаем новые переменные

# Derived variables. Each one depends only on columns that already exist, so
# they are all created in a single mutate().
df <- df %>%
  mutate(
    # Work experience in years (whole years plus months / 12).
    experience = exper_y + exper_month / 12,

    # Higher-education dummy: "1" when lvl_educ is "6", otherwise "0".
    educ = as.factor(ifelse(lvl_educ == "6", "1", "0")),

    # Main explanatory variable, state participation in the company:
    #   "0" - no state participation (if_government == "2"),
    #   "1" - partial: state plus any private / foreign / respondent's own share,
    #   "2" - fully state-owned.
    # In the questionnaire 1 means "yes" and 2 means "no".
    skolko_gosva = as.factor(
      ifelse(
        if_government == "2",
        "0",
        ifelse(
          (if_private == "1") | (if_foreigners == "1") | (if_yours == "1"),
          "1",
          "2"
        )
      )
    ),

    # Marital-status dummy: "1" (married) for fam_status codes "2" and "6",
    # otherwise "0".
    if_married = as.factor(
      ifelse(fam_status == "2" | fam_status == "6", "1", "0")
    ),

    # Log of wage and log of the number of employees.
    log_wage = as.numeric(log(wage)),
    log_employees = log(employees)
  )

1.1 СОЗДАНИЕ ЗАВИСИМОЙ ПЕРЕМЕННОЙ - ОТНОСИТЕЛЬНОЙ ЗП

Создание таблицы “зарплата-по-отраслям”

# Lookup table: industry code -> readable industry name, taken from the value
# labels of the raw industry variable.
otraslii <- data.frame(
  names = names(attributes(df_dirty$xj4.1)$labels),
  ids = unname(attributes(df_dirty$xj4.1)$labels)
)
otraslii$ids <- as.factor(otraslii$ids)

# Mean wage per industry WITHIN THE CLEANED SAMPLE.
df_wage <- df %>%
  group_by(otrasl) %>%
  summarise(kolvo = n(), av_wage = mean(wage)) %>%
  left_join(otraslii, by = c("otrasl" = "ids"))

# Mean wage per industry OVER ALL RAW DATA (only rows with valid hours, wage
# and industry codes are kept).
df_dirty_w <- df_dirty %>%
  filter(
    !is.na(xj6.2),
    xj6.2 < 9e6,
    xj13.2 < 9e6,
    xj4.1 < 9e6
  ) %>%
  group_by(xj4.1) %>%
  summarise(kolvo_R = n(), av_wage_R = mean(xj13.2, na.rm = TRUE)) %>%
  mutate(
    xj4.1 = as.factor(xj4.1),
    av_wage_R = as.numeric(av_wage_R)
  )

# Combine both tables and remove the temporaries.
df_wage <- left_join(df_wage, df_dirty_w, by = c("otrasl" = "xj4.1"))
rm(df_dirty_w, otraslii)

# Fix the column order and sort by industry.
df_wage <- df_wage[
  c("names", "otrasl", "kolvo", "av_wage", "kolvo_R", "av_wage_R")
] %>%
  arrange(otrasl)

Присоединяем поотраслевые зп

# Attach the all-data industry mean wage (av_wage_R) to every respondent.
df <- left_join(
  df,
  df_wage %>% select(otrasl, av_wage_R),
  by = "otrasl"
)

Создаем относительную переменную ЗП

# Relative wage: individual wage divided by the industry mean wage.
df <- df %>%
  mutate(wage_to_average = as.numeric(wage / av_wage_R))

1.2 УДАЛЕНИЕ ВЫБРОСОВ

Начало чистки и таблица описательных статистик

# Keep respondents up to age 59 for gender code "1" and up to age 54 for
# gender code "2" (original note: some jobs are closed to people past working
# age, plus age and health effects), and keep only strictly positive wages.
df <- df %>%
  filter(
    (gender == "1" & age <= 59) | (gender == "2" & age <= 54),
    wage > 0
  )

# Descriptive statistics of the numeric variables before outlier removal.
# NOTE(review): describe() with quant/omit looks like psych::describe -- confirm.
table_opis1 <- df %>%
  select(
    wage, wage_to_average, hours_per_week, age,
    experience, employees, children_18
  ) %>%
  describe(quant = c(0.25, 0.75), omit = TRUE) %>%
  select(-c(vars, trimmed))

# write.csv(table_opis1, "table_opis_before.csv")
# stargazer(table_opis1, type = "latex", summary = F)

Боксплот зп до удаления выбросов

# Boxplot of individual monthly wage (thousand roubles) before outlier removal.
ggplot(df) +
  geom_boxplot(aes(x = wage / 1000)) +
  xlab("Среднемесячная заработная плата индивида, тыс.руб.") +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Удаляем выбросы

# Remove wage outliers separately within each industry, then stack the groups
# back together. Works for any number of industries: the previous hard-coded
# 1:28 loop failed with fewer than 28 groups and silently discarded every
# industry beyond the 28th.
df <- df %>%
  group_split(otrasl) %>%
  lapply(delete_outliers, column = "wage") %>%
  bind_rows()

# Remove outliers in the remaining numeric variables over the whole sample.
df <- delete_outliers(df, "hours_per_week")
df <- delete_outliers(df, "children_18")
df <- delete_outliers(df, "employees")

Выбор “полезных” переменных

# Keep only the variables needed for the tables, plots and regressions.
df <- select(
  df,
  wage, wage_to_average, skolko_gosva, otrasl, educ, gender,
  if_married, children_18, hours_per_week, experience, age, employees
)
# write.csv(df, 'D:/Documents/codes_R/metrika_proj/df_itog.csv')

2 ТАБЛИЦА ОПИСАТЕЛЬНЫХ СТАТИСТИК ЧИСЛОВЫХ ПЕРЕМЕННЫХ 1

Таблица описательных статистик

# Descriptive statistics of the numeric variables after outlier removal.
table_opis1 <- df %>%
  select(
    wage, wage_to_average, hours_per_week, age,
    experience, employees, children_18
  ) %>%
  describe(quant = c(0.25, 0.75), omit = TRUE) %>%
  select(-c(vars, trimmed))

# Render the table for the report.
table_opis1 %>%
  kable(
    caption = "Таблица характеристик числовых данных после удаления выбросов"
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "responsive"),
    full_width = FALSE,
    position = "center"
  ) %>%
  column_spec(1, bold = TRUE)

Таблица характеристик числовых данных после удаления выбросов
	n	mean	sd	median	mad	min	max	range	skew	kurtosis	se	Q0.25	Q0.75
wage	1242	26044.717	12963.358	23000.000	10378.200	5000.000	120000.0	115000.00	1.427	4.087	367.838	17000.000	32000.00
wage_to_average	1242	0.927	0.417	0.862	0.402	0.190	2.6	2.41	0.865	0.508	0.012	0.601	1.14
hours_per_week	1242	41.295	5.022	40.000	0.000	30.000	56.0	26.00	0.397	0.281	0.142	40.000	45.00
age	1242	41.634	8.467	41.000	10.378	21.000	59.0	38.00	0.032	-0.948	0.240	35.000	48.75
experience	1242	18.736	9.213	18.000	10.934	0.167	41.5	41.33	0.208	-0.874	0.261	11.167	26.00
employees	1242	62.677	73.923	30.000	32.617	1.000	350.0	349.00	1.897	3.129	2.098	12.000	80.00
children_18	1242	0.967	0.871	1.000	1.483	0.000	4.0	4.00	0.648	0.154	0.025	0.000	2.00

# write.csv(table_opis1, "D:/Documents/codes_R/metrika_proj/table_opis1.csv")
# stargazer(table_opis1, type = "latex", summary = F)

Проводим тесты между основной объясняющей переменной и контрольными, чтобы убедиться в правильности нашего интуитивного понимания механизмов.

summary(aov(formula = employees ~ skolko_gosva, data = df)) # keep (use as a control)

##                Df  Sum Sq Mean Sq F value Pr(>F)   
## skolko_gosva    2   73558   36779    6.79 0.0012 **
## Residuals    1239 6708071    5414                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = experience ~ skolko_gosva, data = df)) # keep (use as a control)

##                Df Sum Sq Mean Sq F value  Pr(>F)    
## skolko_gosva    2   1238     619    7.37 0.00066 ***
## Residuals    1239 104089      84                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = age ~ skolko_gosva, data = df)) # keep (use as a control)

##                Df Sum Sq Mean Sq F value Pr(>F)   
## skolko_gosva    2    709     354    4.97 0.0071 **
## Residuals    1239  88259      71                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = hours_per_week ~ skolko_gosva, data = df)) # keep (use as a control)

##                Df Sum Sq Mean Sq F value              Pr(>F)    
## skolko_gosva    2   2368    1184    50.7 <0.0000000000000002 ***
## Residuals    1239  28924      23                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(aov(formula = children_18 ~ skolko_gosva, data = df)) # do not keep

##                Df Sum Sq Mean Sq F value Pr(>F)
## skolko_gosva    2      1   0.361    0.48   0.62
## Residuals    1239    941   0.759

chisq.test(x = df$skolko_gosva, y = df$gender) # keep (use as a control)

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$gender
## X-squared = 62, df = 2, p-value = 0.00000000000004

chisq.test(x = df$skolko_gosva, y = df$educ) # keep (use as a control)

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$educ
## X-squared = 25, df = 2, p-value = 0.000003

chisq.test(x = df$skolko_gosva, y = df$otrasl) # keep (use as a control)

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$otrasl
## X-squared = 762, df = 50, p-value <0.0000000000000002

chisq.test(x = df$skolko_gosva, y = df$if_married) # do not keep

## 
##  Pearson's Chi-squared test
## 
## data:  df$skolko_gosva and df$if_married
## X-squared = 3, df = 2, p-value = 0.2

3 ГРАФИКИ

Столбчатый график “Доля респондентов с высшим образованием”

# Share of respondents with higher education within each level of state
# participation (bars are normalised to 100%).
ggplot(df) +
  geom_bar(
    aes(x = skolko_gosva, fill = educ),
    color = "black",
    position = "fill"
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    values = c("gray", "lightblue"),
    name = "Наличие\nвысшего\nобразования",
    labels = c("Нет", "Есть")
  ) +
  labs(
    y = "Доля респондентов с высшим образованием",
    x = "Участие государства в капитале компании"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Столбчатый график “Доля респондентов, находящихся в браке”

# Share of married respondents within each level of state participation
# (bars are normalised to 100%).
ggplot(df) +
  geom_bar(
    aes(x = skolko_gosva, fill = if_married),
    color = "black",
    position = "fill"
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    values = c("gray", "lightblue"),
    name = "Находится\nв браке",
    labels = c("Нет", "Да")
  ) +
  labs(
    y = "Доля респондентов, находящихся в браке",
    x = "Участие государства в капитале компании",
    title = NULL
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Столбчатый график “Распределение респондентов по полу”

# Gender composition within each level of state participation
# (bars are normalised to 100%).
ggplot(df) +
  geom_bar(
    aes(x = skolko_gosva, fill = gender),
    color = "black",
    position = "fill"
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    values = c("gray", "lightblue"),
    name = "Пол",
    labels = c("Мужской", "Женский")
  ) +
  labs(
    y = "Пол респондента",
    x = "Участие государства в капитале компании",
    title = NULL
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Столбчатый график “Распределение skolko_gosva по отраслям”

# Composition of state participation (private / mixed / state) within each
# industry (bars are normalised to 100%).
ggplot(df) +
  geom_bar(
    aes(x = otrasl, fill = skolko_gosva),
    color = "black",
    position = "fill"
  ) +
  scale_y_continuous(labels = percent) +
  scale_fill_manual(
    values = c("lightblue", "skyblue3", "dodgerblue4"),
    name = "Участие\nгосударства",
    labels = c("частные", "смешанные", "государственные")
  ) +
  labs(
    y = "Участие государства в капитале компании",
    x = "Отрасль",
    title = NULL
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 12),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Три графика плотности относительной зп по степени участия государства {скачали}

# Density of the relative wage, one facet per level of state participation.
ggplot(df) +
  geom_density(aes(x = wage_to_average)) +
  facet_wrap(vars(skolko_gosva), labeller = label_both) +
  labs(
    x = "Отношение средней заработной платы индивида к средней зп по отрасли",
    y = "Плотность",
    title = "Плотность относительной зп по факту участия гос-ва"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centred title
    plot.title.position = "plot", # align to the whole plot area
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 10),
    axis.title.y = element_text(size = 11),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame around every facet, drawn once per panel by the theme instead of
    # one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Плотность заработной платы

# Density of the monthly wage (thousand roubles).
# The first letter of the x label is now a Cyrillic "С"; the original appeared
# to start with a Latin "C".
plot1 <- ggplot(df) +
  geom_density(aes(x = wage / 1000)) +
  labs(
    x = "Среднемесячная заработная плата респондента, тыс.руб",
    y = "Плотность"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centred title
    plot.title.position = "plot", # align to the whole plot area
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

# Density of the log of the monthly wage.
plot2 <- ggplot(df) +
  geom_density(aes(x = log(wage))) +
  labs(
    x = "Логарифм среднемесячной заработной платы респондента",
    y = ""
  ) +
  theme(
    plot.title = element_text(hjust = 0.5),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

# Show both densities side by side.
ggarrange(plot1, plot2, ncol = 2)

Плотность распределения количества сотрудников

# Density of the number of employees in the company.
# NOTE(review): plot3 is only assigned here; nothing in the visible code
# prints or arranges it.
plot3 <- ggplot(df) +
  geom_density(aes(x = employees)) +
  labs(
    x = "Количество сотрудников компании",
    y = "Плотность"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centred title
    plot.title.position = "plot", # align to the whole plot area
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Плотность распределения относительной заработной платы

# Density of the relative wage (individual wage / industry mean wage).
ggplot(df) +
  geom_density(aes(x = wage_to_average)) +
  labs(
    x = "Отношение среднемесячной зарплаты респондента к средней по отрасли",
    y = "Плотность"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centred title
    plot.title.position = "plot", # align to the whole plot area
    plot.caption.position = "plot",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Боксплот 3 коробки, отношение средней заработной платы индивида к средней зп по отрасли по skolko_gosva{скачали}

# Relative wage by level of state participation, one box per level.
ggplot(df) +
  geom_boxplot(aes(x = skolko_gosva, y = wage_to_average)) +
  labs(
    x = "Участие государства",
    y = "Отношение средней зарплаты индивида к средней зарплате по отрасли"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5), # centred title
    plot.title.position = "plot", # align to the whole plot area
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    # Frame drawn once by the theme instead of one geom_rect per data row.
    panel.border = element_rect(colour = "black", fill = NA, size = 0.5)
  )

Диаграмма рассеяния по стажу {скачали}

# Log wage against work experience, with a loess trend line (no confidence band).
ggplot(df, aes(x = experience, y = log(wage))) +
  geom_point(color = "red", alpha = 0.4) +
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  xlab("Стаж работы, лет") +
  ylab("Логарифм среднемесячной заработной платы") +
  theme(
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot"
  )

Диаграмма рассеяния по возрасту

# Log wage against age, with a loess trend line (no confidence band).
# y is log(wage) so that the data match the axis label and the neighbouring
# scatter plots; the original mapped the raw wage under a "log wage" label.
ggplot(df, aes(x = age, y = log(wage))) +
  geom_point(color = "red", alpha = 0.4) +
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  labs(
    x = "Возраст, лет",
    y = "Логарифм среднемесячной заработной платы"
  ) +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

Диаграмма рассеяния по количеству сотрудников

# Log wage against the number of employees, with a loess trend line
# (no confidence band). The x label now uses the nominative "Количество".
ggplot(df, aes(x = employees, y = log(wage))) +
  geom_point(color = "red", alpha = 0.4) +
  geom_smooth(method = loess, se = FALSE, formula = y ~ x) +
  labs(
    x = "Количество сотрудников в компании, чел.",
    y = "Логарифм среднемесячной заработной платы"
  ) +
  theme(
    plot.caption = element_text(hjust = 0, face = "plain"),
    plot.title.position = "plot",
    plot.caption.position = "plot",
    panel.background = element_rect(fill = "white"),
    panel.grid = element_line(color = "grey"),
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

4 РЕГРЕССИОННЫЙ АНАЛИЗ

Похожие на нормальные регрессии

# Subset with the main regression variables.
# NOTE(review): df_itog is not used by the models below, which are fitted on df.
df_itog <- df %>%
  select(
    wage_to_average, skolko_gosva, gender, educ,
    age, experience, hours_per_week
  )

# Model 1: log wage on state participation and controls, with a quadratic
# experience term and a state-participation x gender interaction.
# The formula is now fully inside lm(); the original closed the call right
# after the response ("lm(log(wage)) ~ ..."), which does not parse.
lm(
  log(wage) ~ skolko_gosva + experience + age + hours_per_week + gender +
    educ + otrasl + if_married + children_18 + I(experience^2) +
    skolko_gosva:gender,
  data = df
) %>%
  summary()

# Model 2: the same specification with the relative wage as the response.
lm(
  wage_to_average ~ skolko_gosva + experience + age + hours_per_week + gender +
    educ + otrasl + if_married + children_18 + I(experience^2) +
    skolko_gosva:gender,
  data = df
) %>%
  summary()