- Threshold: a variable is treated as significant when any one of its levels reaches an attrition rate of 30%.
### External, not significant
# Headcount per education_field level.
total <- work %>%
  dplyr::count(education_field) %>%
  as.data.frame()

# Leavers (attrition == "1") per education_field level.
yes_count <- work %>%
  filter(attrition == "1") %>%
  dplyr::count(education_field) %>%
  dplyr::rename(yes = n) %>%
  as.data.frame()
head(yes_count)
## education_field yes
## 1 human_resources 4
## 2 marketing 20
## 3 medical 37
## 4 other 9
## 5 technical_degree 17
## 6 life_sciences 53
head(total)
## education_field n
## 1 human_resources 15
## 2 marketing 100
## 3 medical 270
## 4 other 52
## 5 technical_degree 75
## 6 life_sciences 358
# Attrition rate (%) per education_field level. Levels with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0
# leavers so the rate is 0 rather than NA.
join <- full_join(total, yes_count, by = "education_field") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )
join$education_field <- as.factor(join$education_field)
head(join)
## education_field n yes perc
## 1 human_resources 15 4 26.66667
## 2 marketing 100 20 20.00000
## 3 medical 270 37 13.70370
## 4 other 52 9 17.30769
## 5 technical_degree 75 17 22.66667
## 6 life_sciences 358 53 14.80447
# Bar chart of the attrition rate (%) for each education field.
ggplot(join, aes(x = education_field, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Education Field and Attrition")

### external, significant
# Headcount per age bin (6 interval bins computed on the full data).
total <- work %>%
  mutate(age = discretize(age, method = "interval", breaks = 6)) %>%
  dplyr::count(age) %>%
  as.data.frame()

# Leavers (attrition == "1") per age bin; binning runs before the filter so
# the bins match those used for `total`.
yes_count <- work %>%
  mutate(age = discretize(age, method = "interval", breaks = 6)) %>%
  filter(attrition == "1") %>%
  dplyr::count(age) %>%
  dplyr::rename(yes = n) %>%
  as.data.frame()
head(total)
## age n
## 1 [18,25) 55
## 2 [25,32) 206
## 3 [32,39) 287
## 4 [39,46) 165
## 5 [46,53) 103
## 6 [53,60] 54
head(yes_count)
## age yes
## 1 [18,25) 20
## 2 [25,32) 47
## 3 [32,39) 36
## 4 [39,46) 17
## 5 [46,53) 13
## 6 [53,60] 7
# Attrition rate (%) per age bin. Bins with no leavers are absent from
# yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "age") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = age, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Age and Attrition")

### internal, not significant
# Headcount per business_travel level.
total <- work %>%
  count(business_travel) %>%
  as.data.frame()

# Leavers (attrition == "1") per business_travel level.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(business_travel) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## business_travel n
## 1 Non-Travel 94
## 2 Travel_Frequently 158
## 3 Travel_Rarely 618
yes_count
## business_travel yes
## 1 Non-Travel 11
## 2 Travel_Frequently 35
## 3 Travel_Rarely 94
# Attrition rate (%) per business_travel level. Levels with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "business_travel") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = business_travel, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Business Travel and Attrition")

### internal, not significant
# Headcount per department.
total <- work %>%
  count(department) %>%
  as.data.frame()

# Leavers (attrition == "1") per department.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(department) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## department n
## 1 human_resources 35
## 2 research_and_development 562
## 3 sales 273
yes_count
## department yes
## 1 human_resources 6
## 2 research_and_development 75
## 3 sales 59
# Attrition rate (%) per department. Departments with no leavers are absent
# from yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "department") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = department, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Department and Attrition")

### external, significant
# Headcount per distance_from_home bin (6 interval bins on the full data).
total <- work %>%
  mutate(
    distance_from_home = discretize(
      distance_from_home, method = "interval", breaks = 6
    )
  ) %>%
  dplyr::count(distance_from_home) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(
    distance_from_home = discretize(
      distance_from_home, method = "interval", breaks = 6
    )
  ) %>%
  filter(attrition == "1") %>%
  dplyr::count(distance_from_home) %>%
  dplyr::rename(yes = n) %>%
  as.data.frame()
head(total)
## distance_from_home n
## 1 [1,5.67) 367
## 2 [5.67,10.3) 233
## 3 [10.3,15) 53
## 4 [15,19.7) 78
## 5 [19.7,24.3) 74
## 6 [24.3,29] 65
head(yes_count)
## distance_from_home yes
## 1 [1,5.67) 52
## 2 [5.67,10.3) 30
## 3 [10.3,15) 12
## 4 [15,19.7) 13
## 5 [19.7,24.3) 24
## 6 [24.3,29] 9
# Attrition rate (%) per distance_from_home bin. Bins with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "distance_from_home") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = distance_from_home, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Distance From Home and Attrition")

### external, not significant
# Headcount per education level.
total <- work %>%
  count(education) %>%
  as.data.frame()

# Leavers (attrition == "1") per education level.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(education) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## education n
## 1 1 98
## 2 2 182
## 3 3 324
## 4 4 240
## 5 5 26
yes_count
## education yes
## 1 1 18
## 2 2 32
## 3 3 55
## 4 4 32
## 5 5 3
# Attrition rate (%) per education level. Levels with no leavers are absent
# from yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "education") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = education, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Education and Attrition")

### internal, not significant
# Headcount per environment_satisfaction score.
total <- work %>%
  count(environment_satisfaction) %>%
  as.data.frame()

# Leavers (attrition == "1") per environment_satisfaction score.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(environment_satisfaction) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## environment_satisfaction n
## 1 1 172
## 2 2 178
## 3 3 258
## 4 4 262
yes_count
## environment_satisfaction yes
## 1 1 42
## 2 2 24
## 3 3 35
## 4 4 39
# Attrition rate (%) per environment_satisfaction score. Scores with no
# leavers are absent from yes_count, so the join leaves yes = NA there; count
# those as 0 leavers.
join <- full_join(total, yes_count, by = "environment_satisfaction") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = environment_satisfaction, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Environment Satisfaction and Attrition")

### external, not significant
# Headcount per gender.
total <- work %>%
  count(gender) %>%
  as.data.frame()

# Leavers (attrition == "1") per gender.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(gender) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## gender n
## 1 Female 354
## 2 Male 516
yes_count
## gender yes
## 1 Female 53
## 2 Male 87
# Attrition rate (%) per gender. A level with no leavers is absent from
# yes_count, so the join leaves yes = NA there; count it as 0 leavers.
join <- full_join(total, yes_count, by = "gender") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = gender, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Gender and Attrition")

### internal, not significant
# Headcount per hourly_rate bin (10 interval bins on the full data).
total <- work %>%
  mutate(
    hourly_rate = discretize(hourly_rate, method = "interval", breaks = 10)
  ) %>%
  count(hourly_rate) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(
    hourly_rate = discretize(hourly_rate, method = "interval", breaks = 10)
  ) %>%
  filter(attrition == "1") %>%
  count(hourly_rate) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## hourly_rate n
## 1 [30,37) 69
## 2 [37,44) 87
## 3 [44,51) 88
## 4 [51,58) 102
## 5 [58,65) 79
## 6 [65,72) 73
yes_count
## hourly_rate yes
## 1 [30,37) 10
## 2 [37,44) 10
## 3 [44,51) 14
## 4 [51,58) 15
## 5 [58,65) 11
## 6 [65,72) 20
## 7 [72,79) 13
## 8 [79,86) 18
## 9 [86,93) 10
## 10 [93,100] 19
# Attrition rate (%) per hourly_rate bin. Bins with no leavers are absent
# from yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "hourly_rate") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = hourly_rate, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Hourly Rate and Attrition")

### internal, super significant
# Headcount per job_involvement score.
total <- work %>%
  count(job_involvement) %>%
  as.data.frame()

# Leavers (attrition == "1") per job_involvement score.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(job_involvement) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## job_involvement n
## 1 1 47
## 2 2 228
## 3 3 514
## 4 4 81
yes_count
## job_involvement yes
## 1 1 22
## 2 2 44
## 3 3 67
## 4 4 7
# Attrition rate (%) per job_involvement score. Scores with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "job_involvement") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = job_involvement, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Job Involvement and Attrition")

### internal, not significant
# Headcount per job_level.
total <- work %>%
  count(job_level) %>%
  as.data.frame()

# Leavers (attrition == "1") per job_level.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(job_level) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## job_level n
## 1 1 329
## 2 2 312
## 3 3 132
## 4 4 60
## 5 5 37
yes_count
## job_level yes
## 1 1 86
## 2 2 30
## 3 3 17
## 4 4 3
## 5 5 4
# Attrition rate (%) per job_level. Levels with no leavers are absent from
# yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "job_level") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = job_level, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Job Level and Attrition")

### internal, worth investigating
# Headcount per job_role.
total <- work %>%
  count(job_role) %>%
  as.data.frame()

# Leavers (attrition == "1") per job_role.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(job_role) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## job_role n
## 1 healthcare_representative 76
## 2 human_resources 27
## 3 laboratory_technician 153
## 4 manager 51
## 5 manufacturing_director 87
## 6 research_director 51
yes_count
## job_role yes
## 1 healthcare_representative 8
## 2 human_resources 6
## 3 laboratory_technician 30
## 4 manager 4
## 5 manufacturing_director 2
## 6 research_director 1
## 7 research_scientist 32
## 8 sales_executive 33
## 9 sales_representative 24
# Attrition rate (%) per job_role. Roles with no leavers are absent from
# yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "job_role") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

# Bars are flipped to horizontal via coord_flip().
ggplot(join, aes(x = job_role, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  coord_flip() +
  ggtitle("Job Role and Attrition")

### internal, worth exploring
# Headcount per job_satisfaction score.
total <- work %>%
  count(job_satisfaction) %>%
  as.data.frame()

# Leavers (attrition == "1") per job_satisfaction score.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(job_satisfaction) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## job_satisfaction n
## 1 1 179
## 2 2 166
## 3 3 254
## 4 4 271
yes_count
## job_satisfaction yes
## 1 1 38
## 2 2 31
## 3 3 43
## 4 4 28
# Attrition rate (%) per job_satisfaction score. Scores with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "job_satisfaction") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = job_satisfaction, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Job Satisfaction and Attrition")

### external, not significant
# Headcount per marital_status.
total <- work %>%
  count(marital_status) %>%
  as.data.frame()

# Leavers (attrition == "1") per marital_status.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(marital_status) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## marital_status n
## 1 Divorced 191
## 2 Married 410
## 3 Single 269
yes_count
## marital_status yes
## 1 Divorced 12
## 2 Married 58
## 3 Single 70
# Attrition rate (%) per marital_status. Levels with no leavers are absent
# from yes_count, so the join leaves yes = NA there; count those as 0 leavers.
join <- full_join(total, yes_count, by = "marital_status") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = marital_status, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Marital Status and Attrition")

### internal, significant
# Headcount per monthly_income bin. discretize() is called without `method`,
# so its default binning applies (unlike the "interval" bins used for the
# other numeric variables in this script).
total <- work %>%
  mutate(monthly_income = discretize(monthly_income, breaks = 6)) %>%
  count(monthly_income) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(monthly_income = discretize(monthly_income, breaks = 6)) %>%
  filter(attrition == "1") %>%
  count(monthly_income) %>%
  rename(yes = n) %>%
  as.data.frame()
yes_count
## monthly_income yes
## 1 [1.08e+03,2.51e+03) 49
## 2 [2.51e+03,3.57e+03) 28
## 3 [3.57e+03,4.95e+03) 20
## 4 [4.95e+03,6.48e+03) 14
## 5 [6.48e+03,1.04e+04) 19
## 6 [1.04e+04,2e+04] 10
# Pair each income bin's headcount with its leaver count, then display it.
join <- total %>%
  full_join(yes_count, by = "monthly_income")
join
## monthly_income n yes
## 1 [1.08e+03,2.51e+03) 145 49
## 2 [2.51e+03,3.57e+03) 145 28
## 3 [3.57e+03,4.95e+03) 145 20
## 4 [4.95e+03,6.48e+03) 145 14
## 5 [6.48e+03,1.04e+04) 145 19
## 6 [1.04e+04,2e+04] 145 10
# Attrition rate (%) per monthly_income bin. A bin with no leavers would have
# yes = NA after the join; count it as 0 leavers so its rate is 0, not NA.
join <- join %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

# Bars are flipped to horizontal via coord_flip().
ggplot(join, aes(x = monthly_income, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  coord_flip() +
  ggtitle("Income and Attrition")

### external, not significant
# Headcount per num_companies_worked value.
total <- work %>%
  count(num_companies_worked) %>%
  as.data.frame()

# Leavers (attrition == "1") per num_companies_worked value.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(num_companies_worked) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## num_companies_worked n
## 1 0 111
## 2 1 320
## 3 2 74
## 4 3 91
## 5 4 85
## 6 5 43
yes_count
## num_companies_worked yes
## 1 0 11
## 2 1 60
## 3 2 9
## 4 3 9
## 5 4 7
## 6 5 11
## 7 6 9
## 8 7 11
## 9 8 5
## 10 9 8
# Attrition rate (%) per num_companies_worked value. Values with no leavers
# are absent from yes_count, so the join leaves yes = NA there; count those
# as 0 leavers.
join <- full_join(total, yes_count, by = "num_companies_worked") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = num_companies_worked, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Number of Companies Worked and Attrition")

### internal, significant
# Headcount per over_time level.
total <- work %>%
  count(over_time) %>%
  as.data.frame()

# Leavers (attrition == "1") per over_time level.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(over_time) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## over_time n
## 1 No 618
## 2 Yes 252
yes_count
## over_time yes
## 1 No 60
## 2 Yes 80
# Attrition rate (%) per over_time level. A level with no leavers is absent
# from yes_count, so the join leaves yes = NA there; count it as 0 leavers.
join <- full_join(total, yes_count, by = "over_time") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = over_time, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Over Time and Attrition")

### internal, significant
# Headcount per percent_salary_hike value.
total <- work %>%
  count(percent_salary_hike) %>%
  as.data.frame()

# Leavers (attrition == "1") per percent_salary_hike value.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(percent_salary_hike) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## percent_salary_hike n
## 1 11 126
## 2 12 119
## 3 13 123
## 4 14 120
## 5 15 54
## 6 16 43
yes_count
## percent_salary_hike yes
## 1 11 24
## 2 12 17
## 3 13 21
## 4 14 13
## 5 15 8
## 6 16 10
## 7 17 10
## 8 18 9
## 9 19 5
## 10 20 3
## 11 21 2
## 12 22 7
## 13 23 5
## 14 24 5
## 15 25 1
# Attrition rate (%) per percent_salary_hike value. Values with no leavers
# are absent from yes_count, so the join leaves yes = NA there; count those
# as 0 leavers.
join <- full_join(total, yes_count, by = "percent_salary_hike") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = percent_salary_hike, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Percent Salary Hike and Attrition")

### internal, not significant
# Headcount per performance_rating.
total <- work %>%
  count(performance_rating) %>%
  as.data.frame()

# Leavers (attrition == "1") per performance_rating.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(performance_rating) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## performance_rating n
## 1 3 738
## 2 4 132
yes_count
## performance_rating yes
## 1 3 117
## 2 4 23
# Attrition rate (%) per performance_rating. A rating with no leavers is
# absent from yes_count, so the join leaves yes = NA there; count it as 0.
join <- full_join(total, yes_count, by = "performance_rating") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = performance_rating, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Performance Rating and Attrition")

### external, not significant
# Headcount per relationship_satisfaction score.
total <- work %>%
  count(relationship_satisfaction) %>%
  as.data.frame()

# Leavers (attrition == "1") per relationship_satisfaction score.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(relationship_satisfaction) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## relationship_satisfaction n
## 1 1 174
## 2 2 171
## 3 3 261
## 4 4 264
yes_count
## relationship_satisfaction yes
## 1 1 35
## 2 2 27
## 3 3 36
## 4 4 42
# Attrition rate (%) per relationship_satisfaction score. Scores with no
# leavers are absent from yes_count, so the join leaves yes = NA there; count
# those as 0 leavers.
join <- full_join(total, yes_count, by = "relationship_satisfaction") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = relationship_satisfaction, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Relationship Satisfaction and Attrition")

### internal, not significant
# Headcount per stock_option_level.
total <- work %>%
  count(stock_option_level) %>%
  as.data.frame()

# Leavers (attrition == "1") per stock_option_level.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(stock_option_level) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## stock_option_level n
## 1 0 379
## 2 1 355
## 3 2 81
## 4 3 55
yes_count
## stock_option_level yes
## 1 0 98
## 2 1 27
## 3 2 3
## 4 3 12
# Attrition rate (%) per stock_option_level. Levels with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "stock_option_level") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = stock_option_level, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Stock Option Level and Attrition")

### external, worth exploring
# Headcount per total_working_years bin (5 interval bins on the full data).
total <- work %>%
  mutate(
    total_working_years = discretize(
      total_working_years, method = "interval", breaks = 5
    )
  ) %>%
  count(total_working_years) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin. The binning must run on the full data
# BEFORE filtering: discretizing only the leavers derives the bin edges from
# that subset, so the labels need not match the bins in `total` and the join
# on total_working_years could misalign. A single filter after binning also
# replaces the duplicated filter calls.
yes_count <- work %>%
  mutate(
    total_working_years = discretize(
      total_working_years, method = "interval", breaks = 5
    )
  ) %>%
  filter(attrition == "1") %>%
  dplyr::count(total_working_years) %>%
  dplyr::rename(yes = n) %>%
  as.data.frame()
head(total)
## total_working_years n
## 1 [0,8) 305
## 2 [8,16) 370
## 3 [16,24) 129
## 4 [24,32) 47
## 5 [32,40] 19
yes_count
## total_working_years yes
## 1 [0,8) 79
## 2 [8,16) 42
## 3 [16,24) 13
## 4 [24,32) 4
## 5 [32,40] 2
# Attrition rate (%) per total_working_years bin. Bins with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "total_working_years") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = total_working_years, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Total Working Years and Attrition")

### internal?, not significant
# Headcount per training_times_last_year value.
total <- work %>%
  count(training_times_last_year) %>%
  as.data.frame()

# Leavers (attrition == "1") per training_times_last_year value.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(training_times_last_year) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## training_times_last_year n
## 1 0 30
## 2 1 39
## 3 2 309
## 4 3 308
## 5 4 73
## 6 5 75
yes_count
## training_times_last_year yes
## 1 0 8
## 2 1 5
## 3 2 57
## 4 3 43
## 5 4 16
## 6 5 7
## 7 6 4
# Attrition rate (%) per training_times_last_year value. Values with no
# leavers are absent from yes_count, so the join leaves yes = NA there; count
# those as 0 leavers.
join <- full_join(total, yes_count, by = "training_times_last_year") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = training_times_last_year, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Training Times Last Year and Attrition")

### internal/external, significant
# Headcount per work_life_balance score.
total <- work %>%
  count(work_life_balance) %>%
  as.data.frame()

# Leavers (attrition == "1") per work_life_balance score.
yes_count <- work %>%
  filter(attrition == "1") %>%
  count(work_life_balance) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## work_life_balance n
## 1 1 48
## 2 2 192
## 3 3 532
## 4 4 98
yes_count
## work_life_balance yes
## 1 1 17
## 2 2 30
## 3 3 80
## 4 4 13
# Attrition rate (%) per work_life_balance score. Scores with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "work_life_balance") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = work_life_balance, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Work Life Balance and Attrition")

### external, not significant
# Headcount per years_at_company bin (4 interval bins on the full data).
total <- work %>%
  mutate(
    years_at_company = discretize(
      years_at_company, method = "interval", breaks = 4
    )
  ) %>%
  count(years_at_company) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(
    years_at_company = discretize(
      years_at_company, method = "interval", breaks = 4
    )
  ) %>%
  filter(attrition == "1") %>%
  count(years_at_company) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## years_at_company n
## 1 [0,10) 644
## 2 [10,20) 175
## 3 [20,30) 42
## 4 [30,40] 9
yes_count
## years_at_company yes
## 1 [0,10) 114
## 2 [10,20) 21
## 3 [20,30) 3
## 4 [30,40] 2
# Attrition rate (%) per years_at_company bin. Bins with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "years_at_company") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = years_at_company, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Years at Company and Attrition")

### external, not significant
# Headcount per years_in_current_role bin (6 interval bins on the full data).
total <- work %>%
  mutate(
    years_in_current_role = discretize(
      years_in_current_role, method = "interval", breaks = 6
    )
  ) %>%
  count(years_in_current_role) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(
    years_in_current_role = discretize(
      years_in_current_role, method = "interval", breaks = 6
    )
  ) %>%
  filter(attrition == "1") %>%
  count(years_in_current_role) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## years_in_current_role n
## 1 [0,3) 412
## 2 [3,6) 147
## 3 [6,9) 209
## 4 [9,12) 69
## 5 [12,15) 23
## 6 [15,18] 10
yes_count
## years_in_current_role yes
## 1 [0,3) 93
## 2 [3,6) 15
## 3 [6,9) 23
## 4 [9,12) 5
## 5 [12,15) 3
## 6 [15,18] 1
# Attrition rate (%) per years_in_current_role bin. Bins with no leavers are
# absent from yes_count, so the join leaves yes = NA there; count those as 0.
join <- full_join(total, yes_count, by = "years_in_current_role") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = years_in_current_role, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Years in Current Role")

### internal, not significant
# Headcount per years_since_last_promotion bin (5 interval bins on the full
# data).
total <- work %>%
  mutate(
    years_since_last_promotion = discretize(
      years_since_last_promotion, method = "interval", breaks = 5
    )
  ) %>%
  count(years_since_last_promotion) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(
    years_since_last_promotion = discretize(
      years_since_last_promotion, method = "interval", breaks = 5
    )
  ) %>%
  filter(attrition == "1") %>%
  count(years_since_last_promotion) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## years_since_last_promotion n
## 1 [0,3) 650
## 2 [3,6) 94
## 3 [6,9) 76
## 4 [9,12) 27
## 5 [12,15] 23
yes_count
## years_since_last_promotion yes
## 1 [0,3) 108
## 2 [3,6) 9
## 3 [6,9) 13
## 4 [9,12) 6
## 5 [12,15] 4
# Attrition rate (%) per years_since_last_promotion bin. Bins with no leavers
# are absent from yes_count, so the join leaves yes = NA there; count those
# as 0 leavers.
join <- full_join(total, yes_count, by = "years_since_last_promotion") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = years_since_last_promotion, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Years Since Last Promotion")

### internal, not significant
# Headcount per years_with_curr_manager bin (5 interval bins on the full
# data).
total <- work %>%
  mutate(
    years_with_curr_manager = discretize(
      years_with_curr_manager, method = "interval", breaks = 5
    )
  ) %>%
  count(years_with_curr_manager) %>%
  as.data.frame()

# Leavers (attrition == "1") per bin; binning runs before the filter so the
# bins match those used for `total`.
yes_count <- work %>%
  mutate(
    years_with_curr_manager = discretize(
      years_with_curr_manager, method = "interval", breaks = 5
    )
  ) %>%
  filter(attrition == "1") %>%
  count(years_with_curr_manager) %>%
  rename(yes = n) %>%
  as.data.frame()
head(total)
## years_with_curr_manager n
## 1 [0,3.4) 484
## 2 [3.4,6.8) 85
## 3 [6.8,10.2) 261
## 4 [10.2,13.6) 31
## 5 [13.6,17] 9
yes_count
## years_with_curr_manager yes
## 1 [0,3.4) 95
## 2 [3.4,6.8) 11
## 3 [6.8,10.2) 32
## 4 [10.2,13.6) 1
## 5 [13.6,17] 1
# Attrition rate (%) per years_with_curr_manager bin. Bins with no leavers
# are absent from yes_count, so the join leaves yes = NA there; count those
# as 0 leavers.
join <- full_join(total, yes_count, by = "years_with_curr_manager") %>%
  mutate(
    yes = replace(yes, is.na(yes), 0L),
    perc = (yes / n) * 100
  )

ggplot(join, aes(x = years_with_curr_manager, y = perc)) +
  geom_bar(stat = "identity", fill = "skyblue4") +
  theme_minimal() +
  ggtitle("Years with Current Manager and Attrition")
