#Loading libraries
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.8
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(here)
## here() starts at /Users/florence/Downloads
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(readr)
library(tibble)
library(ggpubr)
library(corrplot)
## corrplot 0.92 loaded
employee_survey_data <- read_csv(file="~/Downloads/HR Analytics Case Study/employee_survey_data.csv")
## Rows: 4410 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (4): EmployeeID, EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(employee_survey_data)
## # A tibble: 6 × 4
## EmployeeID EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
## <dbl> <dbl> <dbl> <dbl>
## 1 1 3 4 2
## 2 2 3 2 4
## 3 3 2 2 1
## 4 4 4 4 3
## 5 5 4 1 3
## 6 6 3 2 2
colnames(employee_survey_data)
## [1] "EmployeeID" "EnvironmentSatisfaction"
## [3] "JobSatisfaction" "WorkLifeBalance"
general_data <- read_csv(file = "~/Downloads/HR Analytics Case Study/general_data.csv")
## Rows: 4410 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (16): Age, DistanceFromHome, Education, EmployeeCount, EmployeeID, JobLe...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(general_data)
## # A tibble: 6 × 24
## Age Attrition BusinessTravel Department DistanceFromHome Education
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 51 No Travel_Rarely Sales 6 2
## 2 31 Yes Travel_Frequently Research & Devel… 10 1
## 3 32 No Travel_Frequently Research & Devel… 17 4
## 4 38 No Non-Travel Research & Devel… 2 5
## 5 32 No Travel_Rarely Research & Devel… 10 1
## 6 46 No Travel_Rarely Research & Devel… 8 3
## # … with 18 more variables: EducationField <chr>, EmployeeCount <dbl>,
## # EmployeeID <dbl>, Gender <chr>, JobLevel <dbl>, JobRole <chr>,
## # MaritalStatus <chr>, MonthlyIncome <dbl>, NumCompaniesWorked <dbl>,
## # Over18 <chr>, PercentSalaryHike <dbl>, StandardHours <dbl>,
## # StockOptionLevel <dbl>, TotalWorkingYears <dbl>,
## # TrainingTimesLastYear <dbl>, YearsAtCompany <dbl>,
## # YearsSinceLastPromotion <dbl>, YearsWithCurrManager <dbl>
colnames(general_data)
## [1] "Age" "Attrition"
## [3] "BusinessTravel" "Department"
## [5] "DistanceFromHome" "Education"
## [7] "EducationField" "EmployeeCount"
## [9] "EmployeeID" "Gender"
## [11] "JobLevel" "JobRole"
## [13] "MaritalStatus" "MonthlyIncome"
## [15] "NumCompaniesWorked" "Over18"
## [17] "PercentSalaryHike" "StandardHours"
## [19] "StockOptionLevel" "TotalWorkingYears"
## [21] "TrainingTimesLastYear" "YearsAtCompany"
## [23] "YearsSinceLastPromotion" "YearsWithCurrManager"
manager_survey_data <- read_csv(file = "~/Downloads/HR Analytics Case Study/manager_survey_data.csv")
## Rows: 4410 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): EmployeeID, JobInvolvement, PerformanceRating
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(manager_survey_data)
## # A tibble: 6 × 3
## EmployeeID JobInvolvement PerformanceRating
## <dbl> <dbl> <dbl>
## 1 1 3 3
## 2 2 2 4
## 3 3 3 3
## 4 4 2 3
## 5 5 3 3
## 6 6 3 3
colnames(manager_survey_data)
## [1] "EmployeeID" "JobInvolvement" "PerformanceRating"
in_time <-read_csv(file = "~/Downloads/HR Analytics Case Study/in_time.csv")
## New names:
## * `` -> ...1
## Rows: 4410 Columns: 262── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): ...1
## lgl (12): 2015-01-01, 2015-01-14, 2015-01-26, 2015-03-05, 2015-05-01, 2015...
## dttm (249): 2015-01-02, 2015-01-05, 2015-01-06, 2015-01-07, 2015-01-08, 2015...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(in_time)
## # A tibble: 6 × 262
## ...1 `2015-01-01` `2015-01-02` `2015-01-05` `2015-01-06`
## <dbl> <lgl> <dttm> <dttm> <dttm>
## 1 1 NA 2015-01-02 09:43:45 2015-01-05 10:08:48 2015-01-06 09:54:26
## 2 2 NA 2015-01-02 10:15:44 2015-01-05 10:21:05 NA
## 3 3 NA 2015-01-02 10:17:41 2015-01-05 09:50:50 2015-01-06 10:14:13
## 4 4 NA 2015-01-02 10:05:06 2015-01-05 09:56:32 2015-01-06 10:11:07
## 5 5 NA 2015-01-02 10:28:17 2015-01-05 09:49:58 2015-01-06 09:45:28
## 6 6 NA 2015-01-02 09:43:08 2015-01-05 10:14:00 2015-01-06 10:08:42
## # … with 257 more variables: `2015-01-07` <dttm>, `2015-01-08` <dttm>,
## # `2015-01-09` <dttm>, `2015-01-12` <dttm>, `2015-01-13` <dttm>,
## # `2015-01-14` <lgl>, `2015-01-15` <dttm>, `2015-01-16` <dttm>,
## # `2015-01-19` <dttm>, `2015-01-20` <dttm>, `2015-01-21` <dttm>,
## # `2015-01-22` <dttm>, `2015-01-23` <dttm>, `2015-01-26` <lgl>,
## # `2015-01-27` <dttm>, `2015-01-28` <dttm>, `2015-01-29` <dttm>,
## # `2015-01-30` <dttm>, `2015-02-02` <dttm>, `2015-02-03` <dttm>, …
colnames(in_time)
## [1] "...1" "2015-01-01" "2015-01-02" "2015-01-05" "2015-01-06"
## [6] "2015-01-07" "2015-01-08" "2015-01-09" "2015-01-12" "2015-01-13"
## [11] "2015-01-14" "2015-01-15" "2015-01-16" "2015-01-19" "2015-01-20"
## [16] "2015-01-21" "2015-01-22" "2015-01-23" "2015-01-26" "2015-01-27"
## [21] "2015-01-28" "2015-01-29" "2015-01-30" "2015-02-02" "2015-02-03"
## [26] "2015-02-04" "2015-02-05" "2015-02-06" "2015-02-09" "2015-02-10"
## [31] "2015-02-11" "2015-02-12" "2015-02-13" "2015-02-16" "2015-02-17"
## [36] "2015-02-18" "2015-02-19" "2015-02-20" "2015-02-23" "2015-02-24"
## [41] "2015-02-25" "2015-02-26" "2015-02-27" "2015-03-02" "2015-03-03"
## [46] "2015-03-04" "2015-03-05" "2015-03-06" "2015-03-09" "2015-03-10"
## [51] "2015-03-11" "2015-03-12" "2015-03-13" "2015-03-16" "2015-03-17"
## [56] "2015-03-18" "2015-03-19" "2015-03-20" "2015-03-23" "2015-03-24"
## [61] "2015-03-25" "2015-03-26" "2015-03-27" "2015-03-30" "2015-03-31"
## [66] "2015-04-01" "2015-04-02" "2015-04-03" "2015-04-06" "2015-04-07"
## [71] "2015-04-08" "2015-04-09" "2015-04-10" "2015-04-13" "2015-04-14"
## [76] "2015-04-15" "2015-04-16" "2015-04-17" "2015-04-20" "2015-04-21"
## [81] "2015-04-22" "2015-04-23" "2015-04-24" "2015-04-27" "2015-04-28"
## [86] "2015-04-29" "2015-04-30" "2015-05-01" "2015-05-04" "2015-05-05"
## [91] "2015-05-06" "2015-05-07" "2015-05-08" "2015-05-11" "2015-05-12"
## [96] "2015-05-13" "2015-05-14" "2015-05-15" "2015-05-18" "2015-05-19"
## [101] "2015-05-20" "2015-05-21" "2015-05-22" "2015-05-25" "2015-05-26"
## [106] "2015-05-27" "2015-05-28" "2015-05-29" "2015-06-01" "2015-06-02"
## [111] "2015-06-03" "2015-06-04" "2015-06-05" "2015-06-08" "2015-06-09"
## [116] "2015-06-10" "2015-06-11" "2015-06-12" "2015-06-15" "2015-06-16"
## [121] "2015-06-17" "2015-06-18" "2015-06-19" "2015-06-22" "2015-06-23"
## [126] "2015-06-24" "2015-06-25" "2015-06-26" "2015-06-29" "2015-06-30"
## [131] "2015-07-01" "2015-07-02" "2015-07-03" "2015-07-06" "2015-07-07"
## [136] "2015-07-08" "2015-07-09" "2015-07-10" "2015-07-13" "2015-07-14"
## [141] "2015-07-15" "2015-07-16" "2015-07-17" "2015-07-20" "2015-07-21"
## [146] "2015-07-22" "2015-07-23" "2015-07-24" "2015-07-27" "2015-07-28"
## [151] "2015-07-29" "2015-07-30" "2015-07-31" "2015-08-03" "2015-08-04"
## [156] "2015-08-05" "2015-08-06" "2015-08-07" "2015-08-10" "2015-08-11"
## [161] "2015-08-12" "2015-08-13" "2015-08-14" "2015-08-17" "2015-08-18"
## [166] "2015-08-19" "2015-08-20" "2015-08-21" "2015-08-24" "2015-08-25"
## [171] "2015-08-26" "2015-08-27" "2015-08-28" "2015-08-31" "2015-09-01"
## [176] "2015-09-02" "2015-09-03" "2015-09-04" "2015-09-07" "2015-09-08"
## [181] "2015-09-09" "2015-09-10" "2015-09-11" "2015-09-14" "2015-09-15"
## [186] "2015-09-16" "2015-09-17" "2015-09-18" "2015-09-21" "2015-09-22"
## [191] "2015-09-23" "2015-09-24" "2015-09-25" "2015-09-28" "2015-09-29"
## [196] "2015-09-30" "2015-10-01" "2015-10-02" "2015-10-05" "2015-10-06"
## [201] "2015-10-07" "2015-10-08" "2015-10-09" "2015-10-12" "2015-10-13"
## [206] "2015-10-14" "2015-10-15" "2015-10-16" "2015-10-19" "2015-10-20"
## [211] "2015-10-21" "2015-10-22" "2015-10-23" "2015-10-26" "2015-10-27"
## [216] "2015-10-28" "2015-10-29" "2015-10-30" "2015-11-02" "2015-11-03"
## [221] "2015-11-04" "2015-11-05" "2015-11-06" "2015-11-09" "2015-11-10"
## [226] "2015-11-11" "2015-11-12" "2015-11-13" "2015-11-16" "2015-11-17"
## [231] "2015-11-18" "2015-11-19" "2015-11-20" "2015-11-23" "2015-11-24"
## [236] "2015-11-25" "2015-11-26" "2015-11-27" "2015-11-30" "2015-12-01"
## [241] "2015-12-02" "2015-12-03" "2015-12-04" "2015-12-07" "2015-12-08"
## [246] "2015-12-09" "2015-12-10" "2015-12-11" "2015-12-14" "2015-12-15"
## [251] "2015-12-16" "2015-12-17" "2015-12-18" "2015-12-21" "2015-12-22"
## [256] "2015-12-23" "2015-12-24" "2015-12-25" "2015-12-28" "2015-12-29"
## [261] "2015-12-30" "2015-12-31"
out_time <- read_csv(file = "~/Downloads/HR Analytics Case Study/out_time.csv")
## New names:
## * `` -> ...1
## Rows: 4410 Columns: 262── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): ...1
## lgl (12): 2015-01-01, 2015-01-14, 2015-01-26, 2015-03-05, 2015-05-01, 2015...
## dttm (249): 2015-01-02, 2015-01-05, 2015-01-06, 2015-01-07, 2015-01-08, 2015...
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(out_time)
## # A tibble: 6 × 262
## ...1 `2015-01-01` `2015-01-02` `2015-01-05` `2015-01-06`
## <dbl> <lgl> <dttm> <dttm> <dttm>
## 1 1 NA 2015-01-02 16:56:15 2015-01-05 17:20:11 2015-01-06 17:19:05
## 2 2 NA 2015-01-02 18:22:17 2015-01-05 17:48:22 NA
## 3 3 NA 2015-01-02 16:59:14 2015-01-05 17:06:46 2015-01-06 16:38:32
## 4 4 NA 2015-01-02 17:25:24 2015-01-05 17:14:03 2015-01-06 17:07:42
## 5 5 NA 2015-01-02 18:31:37 2015-01-05 17:49:15 2015-01-06 17:26:25
## 6 6 NA 2015-01-02 20:29:54 2015-01-05 20:57:19 2015-01-06 21:06:31
## # … with 257 more variables: `2015-01-07` <dttm>, `2015-01-08` <dttm>,
## # `2015-01-09` <dttm>, `2015-01-12` <dttm>, `2015-01-13` <dttm>,
## # `2015-01-14` <lgl>, `2015-01-15` <dttm>, `2015-01-16` <dttm>,
## # `2015-01-19` <dttm>, `2015-01-20` <dttm>, `2015-01-21` <dttm>,
## # `2015-01-22` <dttm>, `2015-01-23` <dttm>, `2015-01-26` <lgl>,
## # `2015-01-27` <dttm>, `2015-01-28` <dttm>, `2015-01-29` <dttm>,
## # `2015-01-30` <dttm>, `2015-02-02` <dttm>, `2015-02-03` <dttm>, …
colnames(out_time)
## [1] "...1" "2015-01-01" "2015-01-02" "2015-01-05" "2015-01-06"
## [6] "2015-01-07" "2015-01-08" "2015-01-09" "2015-01-12" "2015-01-13"
## [11] "2015-01-14" "2015-01-15" "2015-01-16" "2015-01-19" "2015-01-20"
## [16] "2015-01-21" "2015-01-22" "2015-01-23" "2015-01-26" "2015-01-27"
## [21] "2015-01-28" "2015-01-29" "2015-01-30" "2015-02-02" "2015-02-03"
## [26] "2015-02-04" "2015-02-05" "2015-02-06" "2015-02-09" "2015-02-10"
## [31] "2015-02-11" "2015-02-12" "2015-02-13" "2015-02-16" "2015-02-17"
## [36] "2015-02-18" "2015-02-19" "2015-02-20" "2015-02-23" "2015-02-24"
## [41] "2015-02-25" "2015-02-26" "2015-02-27" "2015-03-02" "2015-03-03"
## [46] "2015-03-04" "2015-03-05" "2015-03-06" "2015-03-09" "2015-03-10"
## [51] "2015-03-11" "2015-03-12" "2015-03-13" "2015-03-16" "2015-03-17"
## [56] "2015-03-18" "2015-03-19" "2015-03-20" "2015-03-23" "2015-03-24"
## [61] "2015-03-25" "2015-03-26" "2015-03-27" "2015-03-30" "2015-03-31"
## [66] "2015-04-01" "2015-04-02" "2015-04-03" "2015-04-06" "2015-04-07"
## [71] "2015-04-08" "2015-04-09" "2015-04-10" "2015-04-13" "2015-04-14"
## [76] "2015-04-15" "2015-04-16" "2015-04-17" "2015-04-20" "2015-04-21"
## [81] "2015-04-22" "2015-04-23" "2015-04-24" "2015-04-27" "2015-04-28"
## [86] "2015-04-29" "2015-04-30" "2015-05-01" "2015-05-04" "2015-05-05"
## [91] "2015-05-06" "2015-05-07" "2015-05-08" "2015-05-11" "2015-05-12"
## [96] "2015-05-13" "2015-05-14" "2015-05-15" "2015-05-18" "2015-05-19"
## [101] "2015-05-20" "2015-05-21" "2015-05-22" "2015-05-25" "2015-05-26"
## [106] "2015-05-27" "2015-05-28" "2015-05-29" "2015-06-01" "2015-06-02"
## [111] "2015-06-03" "2015-06-04" "2015-06-05" "2015-06-08" "2015-06-09"
## [116] "2015-06-10" "2015-06-11" "2015-06-12" "2015-06-15" "2015-06-16"
## [121] "2015-06-17" "2015-06-18" "2015-06-19" "2015-06-22" "2015-06-23"
## [126] "2015-06-24" "2015-06-25" "2015-06-26" "2015-06-29" "2015-06-30"
## [131] "2015-07-01" "2015-07-02" "2015-07-03" "2015-07-06" "2015-07-07"
## [136] "2015-07-08" "2015-07-09" "2015-07-10" "2015-07-13" "2015-07-14"
## [141] "2015-07-15" "2015-07-16" "2015-07-17" "2015-07-20" "2015-07-21"
## [146] "2015-07-22" "2015-07-23" "2015-07-24" "2015-07-27" "2015-07-28"
## [151] "2015-07-29" "2015-07-30" "2015-07-31" "2015-08-03" "2015-08-04"
## [156] "2015-08-05" "2015-08-06" "2015-08-07" "2015-08-10" "2015-08-11"
## [161] "2015-08-12" "2015-08-13" "2015-08-14" "2015-08-17" "2015-08-18"
## [166] "2015-08-19" "2015-08-20" "2015-08-21" "2015-08-24" "2015-08-25"
## [171] "2015-08-26" "2015-08-27" "2015-08-28" "2015-08-31" "2015-09-01"
## [176] "2015-09-02" "2015-09-03" "2015-09-04" "2015-09-07" "2015-09-08"
## [181] "2015-09-09" "2015-09-10" "2015-09-11" "2015-09-14" "2015-09-15"
## [186] "2015-09-16" "2015-09-17" "2015-09-18" "2015-09-21" "2015-09-22"
## [191] "2015-09-23" "2015-09-24" "2015-09-25" "2015-09-28" "2015-09-29"
## [196] "2015-09-30" "2015-10-01" "2015-10-02" "2015-10-05" "2015-10-06"
## [201] "2015-10-07" "2015-10-08" "2015-10-09" "2015-10-12" "2015-10-13"
## [206] "2015-10-14" "2015-10-15" "2015-10-16" "2015-10-19" "2015-10-20"
## [211] "2015-10-21" "2015-10-22" "2015-10-23" "2015-10-26" "2015-10-27"
## [216] "2015-10-28" "2015-10-29" "2015-10-30" "2015-11-02" "2015-11-03"
## [221] "2015-11-04" "2015-11-05" "2015-11-06" "2015-11-09" "2015-11-10"
## [226] "2015-11-11" "2015-11-12" "2015-11-13" "2015-11-16" "2015-11-17"
## [231] "2015-11-18" "2015-11-19" "2015-11-20" "2015-11-23" "2015-11-24"
## [236] "2015-11-25" "2015-11-26" "2015-11-27" "2015-11-30" "2015-12-01"
## [241] "2015-12-02" "2015-12-03" "2015-12-04" "2015-12-07" "2015-12-08"
## [246] "2015-12-09" "2015-12-10" "2015-12-11" "2015-12-14" "2015-12-15"
## [251] "2015-12-16" "2015-12-17" "2015-12-18" "2015-12-21" "2015-12-22"
## [256] "2015-12-23" "2015-12-24" "2015-12-25" "2015-12-28" "2015-12-29"
## [261] "2015-12-30" "2015-12-31"
unique(employee_survey_data$EmployeeID)
unique(general_data$EmployeeID)
unique(manager_survey_data$EmployeeID)
sum(duplicated(employee_survey_data))
## [1] 0
sum(duplicated(general_data))
## [1] 0
sum(duplicated(in_time))
## [1] 0
sum(duplicated(manager_survey_data))
## [1] 0
sum(duplicated(out_time))
## [1] 0
In in_time and out_time, I will format
data-time value as time only.
in_time_1 <- in_time %>%
mutate("2015-01-02" = strftime("2015-01-02", format = "%I:%M:%S %p", tz=Sys.timezone()))
head(in_time_1 )
## # A tibble: 6 × 262
## ...1 `2015-01-01` `2015-01-02` `2015-01-05` `2015-01-06`
## <dbl> <lgl> <chr> <dttm> <dttm>
## 1 1 NA 12:00:00 am 2015-01-05 10:08:48 2015-01-06 09:54:26
## 2 2 NA 12:00:00 am 2015-01-05 10:21:05 NA
## 3 3 NA 12:00:00 am 2015-01-05 09:50:50 2015-01-06 10:14:13
## 4 4 NA 12:00:00 am 2015-01-05 09:56:32 2015-01-06 10:11:07
## 5 5 NA 12:00:00 am 2015-01-05 09:49:58 2015-01-06 09:45:28
## 6 6 NA 12:00:00 am 2015-01-05 10:14:00 2015-01-06 10:08:42
## # … with 257 more variables: `2015-01-07` <dttm>, `2015-01-08` <dttm>,
## # `2015-01-09` <dttm>, `2015-01-12` <dttm>, `2015-01-13` <dttm>,
## # `2015-01-14` <lgl>, `2015-01-15` <dttm>, `2015-01-16` <dttm>,
## # `2015-01-19` <dttm>, `2015-01-20` <dttm>, `2015-01-21` <dttm>,
## # `2015-01-22` <dttm>, `2015-01-23` <dttm>, `2015-01-26` <lgl>,
## # `2015-01-27` <dttm>, `2015-01-28` <dttm>, `2015-01-29` <dttm>,
## # `2015-01-30` <dttm>, `2015-02-02` <dttm>, `2015-02-03` <dttm>, …
col_num<-ncol(in_time)
for (i in 2:col_num){
a <- colnames(in_time[i])
b <- colnames(out_time[i])
temp<-as.numeric(difftime(in_time[[a]], out_time[[b]], units ="hours"))
in_time[ , ncol(in_time) + 1] <- temp
}
I will merge general_data
employee_survey_data manager_survey_data into
one consolidated dataframe.
Since merge only merge two tables once a time, I will first combine
general data and employee_survey_data. Then I
combine with manger_survey_data.
HR_data <- merge (general_data, employee_survey_data, by=c("EmployeeID"))
head(HR_data)
## EmployeeID Age Attrition BusinessTravel Department
## 1 1 51 No Travel_Rarely Sales
## 2 2 31 Yes Travel_Frequently Research & Development
## 3 3 32 No Travel_Frequently Research & Development
## 4 4 38 No Non-Travel Research & Development
## 5 5 32 No Travel_Rarely Research & Development
## 6 6 46 No Travel_Rarely Research & Development
## DistanceFromHome Education EducationField EmployeeCount Gender JobLevel
## 1 6 2 Life Sciences 1 Female 1
## 2 10 1 Life Sciences 1 Female 1
## 3 17 4 Other 1 Male 4
## 4 2 5 Life Sciences 1 Male 3
## 5 10 1 Medical 1 Male 1
## 6 8 3 Life Sciences 1 Female 4
## JobRole MaritalStatus MonthlyIncome NumCompaniesWorked
## 1 Healthcare Representative Married 131160 1
## 2 Research Scientist Single 41890 0
## 3 Sales Executive Married 193280 1
## 4 Human Resources Married 83210 3
## 5 Sales Executive Single 23420 4
## 6 Research Director Married 40710 3
## Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears
## 1 Y 11 8 0 1
## 2 Y 23 8 1 6
## 3 Y 15 8 3 5
## 4 Y 11 8 3 13
## 5 Y 12 8 2 9
## 6 Y 13 8 0 28
## TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion
## 1 6 1 0
## 2 3 5 1
## 3 2 5 0
## 4 5 8 7
## 5 2 6 0
## 6 5 7 7
## YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
## 1 0 3 4 2
## 2 4 3 2 4
## 3 3 2 2 1
## 4 5 4 4 3
## 5 4 4 1 3
## 6 7 3 2 2
write_csv(HR_data,"HR_data.csv")
Now, I will combine with manager_survey_data.
HR_data_combined <- merge (HR_data, manager_survey_data, by=c("EmployeeID"))
head(HR_data_combined)
## EmployeeID Age Attrition BusinessTravel Department
## 1 1 51 No Travel_Rarely Sales
## 2 2 31 Yes Travel_Frequently Research & Development
## 3 3 32 No Travel_Frequently Research & Development
## 4 4 38 No Non-Travel Research & Development
## 5 5 32 No Travel_Rarely Research & Development
## 6 6 46 No Travel_Rarely Research & Development
## DistanceFromHome Education EducationField EmployeeCount Gender JobLevel
## 1 6 2 Life Sciences 1 Female 1
## 2 10 1 Life Sciences 1 Female 1
## 3 17 4 Other 1 Male 4
## 4 2 5 Life Sciences 1 Male 3
## 5 10 1 Medical 1 Male 1
## 6 8 3 Life Sciences 1 Female 4
## JobRole MaritalStatus MonthlyIncome NumCompaniesWorked
## 1 Healthcare Representative Married 131160 1
## 2 Research Scientist Single 41890 0
## 3 Sales Executive Married 193280 1
## 4 Human Resources Married 83210 3
## 5 Sales Executive Single 23420 4
## 6 Research Director Married 40710 3
## Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears
## 1 Y 11 8 0 1
## 2 Y 23 8 1 6
## 3 Y 15 8 3 5
## 4 Y 11 8 3 13
## 5 Y 12 8 2 9
## 6 Y 13 8 0 28
## TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion
## 1 6 1 0
## 2 3 5 1
## 3 2 5 0
## 4 5 8 7
## 5 2 6 0
## 6 5 7 7
## YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
## 1 0 3 4 2
## 2 4 3 2 4
## 3 3 2 2 1
## 4 5 4 4 3
## 5 4 4 1 3
## 6 7 3 2 2
## JobInvolvement PerformanceRating
## 1 3 3
## 2 2 4
## 3 3 3
## 4 2 3
## 5 3 3
## 6 3 3
Replace numeric value to what it represents as follows:
HR_data_combined$Education[HR_data_combined$Education == "1"] <- "Below College"
HR_data_combined$Education[HR_data_combined$Education == "2"] <- "College"
HR_data_combined$Education[HR_data_combined$Education == "3"] <- "Bachelor"
HR_data_combined$Education[HR_data_combined$Education == "4"] <- "Master"
HR_data_combined$Education[HR_data_combined$Education == "5"] <- "Doctor"
HR_data_combined$EnvironmentSatisfaction[HR_data_combined$EnvironmentSatisfaction == "1"] <- "Low"
HR_data_combined$EnvironmentSatisfaction[HR_data_combined$EnvironmentSatisfaction == "2"] <- "Medium"
HR_data_combined$EnvironmentSatisfaction[HR_data_combined$EnvironmentSatisfaction == "3"] <- "High"
HR_data_combined$EnvironmentSatisfaction[HR_data_combined$EnvironmentSatisfaction == "4"] <- "Very High"
HR_data_combined$JobSatisfaction[HR_data_combined$JobSatisfaction == "1"] <- "Low"
HR_data_combined$JobSatisfaction[HR_data_combined$JobSatisfaction == "2"] <- "Medium"
HR_data_combined$JobSatisfaction[HR_data_combined$JobSatisfaction == "3"] <- "High"
HR_data_combined$JobSatisfaction[HR_data_combined$JobSatisfaction == "4"] <- "Very High"
HR_data_combined$WorkLifeBalance[HR_data_combined$WorkLifeBalance == "1"] <- "Bad"
HR_data_combined$WorkLifeBalance[HR_data_combined$WorkLifeBalance == "2"] <- "Good"
HR_data_combined$WorkLifeBalance[HR_data_combined$WorkLifeBalance == "3"] <- "Better"
HR_data_combined$WorkLifeBalance[HR_data_combined$WorkLifeBalance == "4"] <- "Best"
HR_data_combined$JobInvolvement[HR_data_combined$JobInvolvement == "1"] <- "Low"
HR_data_combined$JobInvolvement[HR_data_combined$JobInvolvement == "2"] <- "Medium"
HR_data_combined$JobInvolvement[HR_data_combined$JobInvolvement == "3"] <- "High"
HR_data_combined$JobInvolvement[HR_data_combined$JobInvolvement == "4"] <- "Very High"
HR_data_combined$PerformanceRating[HR_data_combined$PerformanceRating == "1"] <- "Low"
HR_data_combined$PerformanceRating[HR_data_combined$PerformanceRating == "2"] <- "Good"
HR_data_combined$PerformanceRating[HR_data_combined$PerformanceRating == "3"] <- "Excellent"
HR_data_combined$PerformanceRating[HR_data_combined$PerformanceRating == "4"] <- "Outstanding"
head(HR_data_combined)
## EmployeeID Age Attrition BusinessTravel Department
## 1 1 51 No Travel_Rarely Sales
## 2 2 31 Yes Travel_Frequently Research & Development
## 3 3 32 No Travel_Frequently Research & Development
## 4 4 38 No Non-Travel Research & Development
## 5 5 32 No Travel_Rarely Research & Development
## 6 6 46 No Travel_Rarely Research & Development
## DistanceFromHome Education EducationField EmployeeCount Gender JobLevel
## 1 6 College Life Sciences 1 Female 1
## 2 10 Below College Life Sciences 1 Female 1
## 3 17 Master Other 1 Male 4
## 4 2 Doctor Life Sciences 1 Male 3
## 5 10 Below College Medical 1 Male 1
## 6 8 Bachelor Life Sciences 1 Female 4
## JobRole MaritalStatus MonthlyIncome NumCompaniesWorked
## 1 Healthcare Representative Married 131160 1
## 2 Research Scientist Single 41890 0
## 3 Sales Executive Married 193280 1
## 4 Human Resources Married 83210 3
## 5 Sales Executive Single 23420 4
## 6 Research Director Married 40710 3
## Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears
## 1 Y 11 8 0 1
## 2 Y 23 8 1 6
## 3 Y 15 8 3 5
## 4 Y 11 8 3 13
## 5 Y 12 8 2 9
## 6 Y 13 8 0 28
## TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion
## 1 6 1 0
## 2 3 5 1
## 3 2 5 0
## 4 5 8 7
## 5 2 6 0
## 6 5 7 7
## YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
## 1 0 High Very High Good
## 2 4 High Medium Best
## 3 3 Medium Medium Bad
## 4 5 Very High Very High Better
## 5 4 Very High Low Better
## 6 7 High Medium Good
## JobInvolvement PerformanceRating
## 1 High Excellent
## 2 Medium Outstanding
## 3 High Excellent
## 4 Medium Excellent
## 5 High Excellent
## 6 High Excellent
write_csv(HR_data_combined, "HR_data_combined.csv")
monthlyincome_attrition <- read_csv(file = "~/Downloads/HR Analytics Case Study/monthlyincome_attrition.csv")
## Rows: 4362 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Attrition
## dbl (2): MonthlyIncome, EmployeeID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Add another column that contains income type
income_type <- monthlyincome_attrition %>%
mutate(income_type = case_when(
MonthlyIncome <= 52200 ~ "Low Income",
MonthlyIncome >= 52200 & MonthlyIncome <= 156600 ~ "Middle Income",
MonthlyIncome > 156600 ~"Upper Income"
))
head(income_type)
## # A tibble: 6 × 4
## MonthlyIncome Attrition EmployeeID income_type
## <dbl> <chr> <dbl> <chr>
## 1 131160 No 1 Middle Income
## 2 41890 Yes 2 Low Income
## 3 193280 No 3 Upper Income
## 4 83210 No 4 Middle Income
## 5 23420 No 5 Low Income
## 6 40710 No 6 Low Income
write_csv(income_type, "income_type.csv")
Now we will classify Distance From Home by three groups.
distancefromhome_attrition <- read_csv(file = "~/Downloads/HR Analytics Case Study/distancefromhome_attrition.csv")
## Rows: 4362 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Attrition
## dbl (2): DistanceFromHome, EmployeeID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
max(distancefromhome_attrition$DistanceFromHome)
## [1] 29
min(distancefromhome_attrition$DistanceFromHome)
## [1] 1
median(distancefromhome_attrition$DistanceFromHome)
## [1] 7
mean(distancefromhome_attrition$DistanceFromHome)
## [1] 9.202201
distance_type <- distancefromhome_attrition %>%
mutate(distance_type = case_when(
DistanceFromHome > 1 & DistanceFromHome < 7 ~"Short",
DistanceFromHome >= 7 & DistanceFromHome < 10 ~"Middle",
DistanceFromHome >=10 ~"Long"
))
head(distance_type)
## # A tibble: 6 × 4
## DistanceFromHome Attrition EmployeeID distance_type
## <dbl> <chr> <dbl> <chr>
## 1 6 No 1 Short
## 2 10 Yes 2 Long
## 3 17 No 3 Long
## 4 2 No 4 Short
## 5 10 No 5 Long
## 6 8 No 6 Middle
write_csv(distance_type, "distance_type.csv")
totalworkingyears_attrition <- read_csv(file = "~/Downloads/HR Analytics Case Study/totalworkingyears_attrition.csv")
## Rows: 4362 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Attrition
## dbl (2): TotalWorkingYears, EmployeeID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
min(totalworkingyears_attrition$TotalWorkingYears)
## [1] 0
max(totalworkingyears_attrition$TotalWorkingYears)
## [1] 40
mean(totalworkingyears_attrition$TotalWorkingYears)
## [1] 11.28496
median(totalworkingyears_attrition$TotalWorkingYears)
## [1] 10
workyears_type <- totalworkingyears_attrition %>%
mutate(workyears_type = case_when(
TotalWorkingYears >0 & TotalWorkingYears <=10 ~'Juniro',
TotalWorkingYears >=11 & TotalWorkingYears <20 ~'Senior',
TotalWorkingYears >=20 ~'Expert'
))
head(workyears_type)
## # A tibble: 6 × 4
## TotalWorkingYears Attrition EmployeeID workyears_type
## <dbl> <chr> <dbl> <chr>
## 1 1 No 1 Juniro
## 2 6 Yes 2 Juniro
## 3 5 No 3 Juniro
## 4 13 No 4 Senior
## 5 9 No 5 Juniro
## 6 28 No 6 Expert
write_csv(workyears_type,"workyears_type.csv")
yearsatcompany<-read_csv(file = "~/Downloads/HR Analytics Case Study/yearsatcompany_attrition.csv")
## Rows: 4362 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Attrition
## dbl (2): YearsAtCompany, EmployeeID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
min(yearsatcompany$YearsAtCompany)
## [1] 0
max(yearsatcompany$YearsAtCompany)
## [1] 40
mean(yearsatcompany$YearsAtCompany)
## [1] 7.008941
median(yearsatcompany$YearsAtCompany)
## [1] 5
loyalty_type <- yearsatcompany %>%
mutate(loyalty_type = case_when(
YearsAtCompany >0 & YearsAtCompany <=5 ~'Newbie',
YearsAtCompany >5 & YearsAtCompany <=8 ~'Stable Employee',
YearsAtCompany >8 ~'Loyal Employee'
))
head(loyalty_type)
## # A tibble: 6 × 4
## YearsAtCompany Attrition EmployeeID loyalty_type
## <dbl> <chr> <dbl> <chr>
## 1 1 No 1 Newbie
## 2 5 Yes 2 Newbie
## 3 5 No 3 Newbie
## 4 8 No 4 Stable Employee
## 5 6 No 5 Stable Employee
## 6 7 No 6 Stable Employee
write_csv(loyalty_type, "loyal_type.csv")
age_attrition <- read_csv(file = "~/Downloads/HR Analytics Case Study/Age_Attrition.csv")
## Rows: 82 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Attrition
## dbl (2): Age, COUNT(EmployeeID)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
age_group <- age_attrition %>%
mutate(age_group = case_when(
Age >=17 & Age <=30 ~"Young Adults",
Age >=31 & Age <=45 ~"Middle-Age Adults",
Age >45 ~"Old-Aged Adults "
))
head(age_group)
## # A tibble: 6 × 4
## Age Attrition `COUNT(EmployeeID)` age_group
## <dbl> <chr> <dbl> <chr>
## 1 51 No 51 "Old-Aged Adults "
## 2 31 Yes 54 "Middle-Age Adults"
## 3 32 No 149 "Middle-Age Adults"
## 4 38 No 167 "Middle-Age Adults"
## 5 46 No 87 "Old-Aged Adults "
## 6 28 Yes 40 "Young Adults"
write_csv(age_group, "age_group.csv")
HR_data <- read_csv(file = '~/Downloads/HR_analytics_csv/CT_JobSatisfactiomployee_Attrition.csv')
## Rows: 8 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Attrition
## dbl (2): JobSatisfaction, COUNT
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
For categorical values with multiple values, I use fastDummies to sort out.
HR_data_new <- fastDummies::dummy_cols(HR_data_combined)
head(HR_data_new)
## EmployeeID Age Attrition BusinessTravel Department
## 1 1 51 No Travel_Rarely Sales
## 2 2 31 Yes Travel_Frequently Research & Development
## 3 3 32 No Travel_Frequently Research & Development
## 4 4 38 No Non-Travel Research & Development
## 5 5 32 No Travel_Rarely Research & Development
## 6 6 46 No Travel_Rarely Research & Development
## DistanceFromHome Education EducationField EmployeeCount Gender JobLevel
## 1 6 College Life Sciences 1 Female 1
## 2 10 Below College Life Sciences 1 Female 1
## 3 17 Master Other 1 Male 4
## 4 2 Doctor Life Sciences 1 Male 3
## 5 10 Below College Medical 1 Male 1
## 6 8 Bachelor Life Sciences 1 Female 4
## JobRole MaritalStatus MonthlyIncome NumCompaniesWorked
## 1 Healthcare Representative Married 131160 1
## 2 Research Scientist Single 41890 0
## 3 Sales Executive Married 193280 1
## 4 Human Resources Married 83210 3
## 5 Sales Executive Single 23420 4
## 6 Research Director Married 40710 3
## Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears
## 1 Y 11 8 0 1
## 2 Y 23 8 1 6
## 3 Y 15 8 3 5
## 4 Y 11 8 3 13
## 5 Y 12 8 2 9
## 6 Y 13 8 0 28
## TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion
## 1 6 1 0
## 2 3 5 1
## 3 2 5 0
## 4 5 8 7
## 5 2 6 0
## 6 5 7 7
## YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
## 1 0 High Very High Good
## 2 4 High Medium Best
## 3 3 Medium Medium Bad
## 4 5 Very High Very High Better
## 5 4 Very High Low Better
## 6 7 High Medium Good
## JobInvolvement PerformanceRating Attrition_No Attrition_Yes
## 1 High Excellent 1 0
## 2 Medium Outstanding 0 1
## 3 High Excellent 1 0
## 4 Medium Excellent 1 0
## 5 High Excellent 1 0
## 6 High Excellent 1 0
## BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently
## 1 0 0
## 2 0 1
## 3 0 1
## 4 1 0
## 5 0 0
## 6 0 0
## BusinessTravel_Travel_Rarely Department_Human Resources
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 1 0
## 6 1 0
## Department_Research & Development Department_Sales Education_Bachelor
## 1 0 1 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 1
## Education_Below College Education_College Education_Doctor Education_Master
## 1 0 1 0 0
## 2 1 0 0 0
## 3 0 0 0 1
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## EducationField_Human Resources EducationField_Life Sciences
## 1 0 1
## 2 0 1
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 1
## EducationField_Marketing EducationField_Medical EducationField_Other
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## EducationField_Technical Degree Gender_Female Gender_Male
## 1 0 1 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 1 0
## JobRole_Healthcare Representative JobRole_Human Resources
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 0
## JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive
## 1 0 0 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 1
## 6 1 0 0
## JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married
## 1 0 0 1
## 2 0 0 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 0
## 6 0 0 1
## MaritalStatus_Single Over18_Y EnvironmentSatisfaction_High
## 1 0 1 1
## 2 1 1 1
## 3 0 1 0
## 4 0 1 0
## 5 1 1 0
## 6 0 1 1
## EnvironmentSatisfaction_Low EnvironmentSatisfaction_Medium
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 0
## 5 0 0
## 6 0 0
## EnvironmentSatisfaction_Very High EnvironmentSatisfaction_NA
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 0 0
## JobSatisfaction_High JobSatisfaction_Low JobSatisfaction_Medium
## 1 0 0 0
## 2 0 0 1
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 1
## JobSatisfaction_Very High JobSatisfaction_NA WorkLifeBalance_Bad
## 1 1 0 0
## 2 0 0 0
## 3 0 0 1
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
## WorkLifeBalance_Best WorkLifeBalance_Better WorkLifeBalance_Good
## 1 0 0 1
## 2 1 0 0
## 3 0 0 0
## 4 0 1 0
## 5 0 1 0
## 6 0 0 1
## WorkLifeBalance_NA JobInvolvement_High JobInvolvement_Low
## 1 0 1 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 1 0
## 6 0 1 0
## JobInvolvement_Medium JobInvolvement_Very High PerformanceRating_Excellent
## 1 0 0 1
## 2 1 0 0
## 3 0 0 1
## 4 1 0 1
## 5 0 0 1
## 6 0 0 1
## PerformanceRating_Outstanding
## 1 0
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
HR_data_new$Attrition[HR_data_new$Attrition == "Yes"] <- 1
HR_data_new$Attrition[HR_data_new$Attrition == "No"] <- 0
HR_data_new$Attrition <- as.numeric(HR_data_new$Attrition)
head(HR_data_new)
## EmployeeID Age Attrition BusinessTravel Department
## 1 1 51 0 Travel_Rarely Sales
## 2 2 31 1 Travel_Frequently Research & Development
## 3 3 32 0 Travel_Frequently Research & Development
## 4 4 38 0 Non-Travel Research & Development
## 5 5 32 0 Travel_Rarely Research & Development
## 6 6 46 0 Travel_Rarely Research & Development
## DistanceFromHome Education EducationField EmployeeCount Gender JobLevel
## 1 6 College Life Sciences 1 Female 1
## 2 10 Below College Life Sciences 1 Female 1
## 3 17 Master Other 1 Male 4
## 4 2 Doctor Life Sciences 1 Male 3
## 5 10 Below College Medical 1 Male 1
## 6 8 Bachelor Life Sciences 1 Female 4
## JobRole MaritalStatus MonthlyIncome NumCompaniesWorked
## 1 Healthcare Representative Married 131160 1
## 2 Research Scientist Single 41890 0
## 3 Sales Executive Married 193280 1
## 4 Human Resources Married 83210 3
## 5 Sales Executive Single 23420 4
## 6 Research Director Married 40710 3
## Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears
## 1 Y 11 8 0 1
## 2 Y 23 8 1 6
## 3 Y 15 8 3 5
## 4 Y 11 8 3 13
## 5 Y 12 8 2 9
## 6 Y 13 8 0 28
## TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion
## 1 6 1 0
## 2 3 5 1
## 3 2 5 0
## 4 5 8 7
## 5 2 6 0
## 6 5 7 7
## YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
## 1 0 High Very High Good
## 2 4 High Medium Best
## 3 3 Medium Medium Bad
## 4 5 Very High Very High Better
## 5 4 Very High Low Better
## 6 7 High Medium Good
## JobInvolvement PerformanceRating Attrition_No Attrition_Yes
## 1 High Excellent 1 0
## 2 Medium Outstanding 0 1
## 3 High Excellent 1 0
## 4 Medium Excellent 1 0
## 5 High Excellent 1 0
## 6 High Excellent 1 0
## BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently
## 1 0 0
## 2 0 1
## 3 0 1
## 4 1 0
## 5 0 0
## 6 0 0
## BusinessTravel_Travel_Rarely Department_Human Resources
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 1 0
## 6 1 0
## Department_Research & Development Department_Sales Education_Bachelor
## 1 0 1 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 1
## Education_Below College Education_College Education_Doctor Education_Master
## 1 0 1 0 0
## 2 1 0 0 0
## 3 0 0 0 1
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## EducationField_Human Resources EducationField_Life Sciences
## 1 0 1
## 2 0 1
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 1
## EducationField_Marketing EducationField_Medical EducationField_Other
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## EducationField_Technical Degree Gender_Female Gender_Male
## 1 0 1 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 1 0
## JobRole_Healthcare Representative JobRole_Human Resources
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 0
## JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive
## 1 0 0 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 1
## 6 1 0 0
## JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married
## 1 0 0 1
## 2 0 0 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 0
## 6 0 0 1
## MaritalStatus_Single Over18_Y EnvironmentSatisfaction_High
## 1 0 1 1
## 2 1 1 1
## 3 0 1 0
## 4 0 1 0
## 5 1 1 0
## 6 0 1 1
## EnvironmentSatisfaction_Low EnvironmentSatisfaction_Medium
## 1 0 0
## 2 0 0
## 3 0 1
## 4 0 0
## 5 0 0
## 6 0 0
## EnvironmentSatisfaction_Very High EnvironmentSatisfaction_NA
## 1 0 0
## 2 0 0
## 3 0 0
## 4 1 0
## 5 1 0
## 6 0 0
## JobSatisfaction_High JobSatisfaction_Low JobSatisfaction_Medium
## 1 0 0 0
## 2 0 0 1
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 1
## JobSatisfaction_Very High JobSatisfaction_NA WorkLifeBalance_Bad
## 1 1 0 0
## 2 0 0 0
## 3 0 0 1
## 4 1 0 0
## 5 0 0 0
## 6 0 0 0
## WorkLifeBalance_Best WorkLifeBalance_Better WorkLifeBalance_Good
## 1 0 0 1
## 2 1 0 0
## 3 0 0 0
## 4 0 1 0
## 5 0 1 0
## 6 0 0 1
## WorkLifeBalance_NA JobInvolvement_High JobInvolvement_Low
## 1 0 1 0
## 2 0 0 0
## 3 0 1 0
## 4 0 0 0
## 5 0 1 0
## 6 0 1 0
## JobInvolvement_Medium JobInvolvement_Very High PerformanceRating_Excellent
## 1 0 0 1
## 2 1 0 0
## 3 0 0 1
## 4 1 0 1
## 5 0 0 1
## 6 0 0 1
## PerformanceRating_Outstanding
## 1 0
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
Mat_data = subset(HR_data_new, select = -c(EmployeeID, BusinessTravel, Department, Education, EducationField, Gender, JobRole ,MaritalStatus, EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance, JobInvolvement, PerformanceRating, Over18_Y,Over18, EmployeeCount, StandardHours, Attrition_Yes, Attrition_No))
Mat_data[is.na(Mat_data)] <- 0
head(Mat_data)
## Age Attrition DistanceFromHome JobLevel MonthlyIncome NumCompaniesWorked
## 1 51 0 6 1 131160 1
## 2 31 1 10 1 41890 0
## 3 32 0 17 4 193280 1
## 4 38 0 2 3 83210 3
## 5 32 0 10 1 23420 4
## 6 46 0 8 4 40710 3
## PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear
## 1 11 0 1 6
## 2 23 1 6 3
## 3 15 3 5 2
## 4 11 3 13 5
## 5 12 2 9 2
## 6 13 0 28 5
## YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager
## 1 1 0 0
## 2 5 1 4
## 3 5 0 3
## 4 8 7 5
## 5 6 0 4
## 6 7 7 7
## BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently
## 1 0 0
## 2 0 1
## 3 0 1
## 4 1 0
## 5 0 0
## 6 0 0
## BusinessTravel_Travel_Rarely Department_Human Resources
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 1 0
## 6 1 0
## Department_Research & Development Department_Sales Education_Bachelor
## 1 0 1 0
## 2 1 0 0
## 3 1 0 0
## 4 1 0 0
## 5 1 0 0
## 6 1 0 1
## Education_Below College Education_College Education_Doctor Education_Master
## 1 0 1 0 0
## 2 1 0 0 0
## 3 0 0 0 1
## 4 0 0 1 0
## 5 1 0 0 0
## 6 0 0 0 0
## EducationField_Human Resources EducationField_Life Sciences
## 1 0 1
## 2 0 1
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 1
## EducationField_Marketing EducationField_Medical EducationField_Other
## 1 0 0 0
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 1 0
## 6 0 0 0
## EducationField_Technical Degree Gender_Female Gender_Male
## 1 0 1 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 0 1 0
## JobRole_Healthcare Representative JobRole_Human Resources
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 1
## 5 0 0
## 6 0 0
## JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 0
## 6 0 0 0
## JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive
## 1 0 0 0
## 2 0 1 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 1
## 6 1 0 0
## JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married
## 1 0 0 1
## 2 0 0 0
## 3 0 0 1
## 4 0 0 1
## 5 0 0 0
## 6 0 0 1
## MaritalStatus_Single EnvironmentSatisfaction_High EnvironmentSatisfaction_Low
## 1 0 1 0
## 2 1 1 0
## 3 0 0 0
## 4 0 0 0
## 5 1 0 0
## 6 0 1 0
## EnvironmentSatisfaction_Medium EnvironmentSatisfaction_Very High
## 1 0 0
## 2 0 0
## 3 1 0
## 4 0 1
## 5 0 1
## 6 0 0
## EnvironmentSatisfaction_NA JobSatisfaction_High JobSatisfaction_Low
## 1 0 0 0
## 2 0 0 0
## 3 0 0 0
## 4 0 0 0
## 5 0 0 1
## 6 0 0 0
## JobSatisfaction_Medium JobSatisfaction_Very High JobSatisfaction_NA
## 1 0 1 0
## 2 1 0 0
## 3 1 0 0
## 4 0 1 0
## 5 0 0 0
## 6 1 0 0
## WorkLifeBalance_Bad WorkLifeBalance_Best WorkLifeBalance_Better
## 1 0 0 0
## 2 0 1 0
## 3 1 0 0
## 4 0 0 1
## 5 0 0 1
## 6 0 0 0
## WorkLifeBalance_Good WorkLifeBalance_NA JobInvolvement_High
## 1 1 0 1
## 2 0 0 0
## 3 0 0 1
## 4 0 0 0
## 5 0 0 1
## 6 1 0 1
## JobInvolvement_Low JobInvolvement_Medium JobInvolvement_Very High
## 1 0 0 0
## 2 0 1 0
## 3 0 0 0
## 4 0 1 0
## 5 0 0 0
## 6 0 0 0
## PerformanceRating_Excellent PerformanceRating_Outstanding
## 1 1 0
## 2 0 1
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
res <- round(cor(Mat_data), digits = 2)
head(res)
## Age Attrition DistanceFromHome JobLevel MonthlyIncome
## Age 1.00 -0.16 0.01 0.00 -0.04
## Attrition -0.16 1.00 -0.01 -0.01 -0.03
## DistanceFromHome 0.01 -0.01 1.00 -0.04 -0.02
## JobLevel 0.00 -0.01 -0.04 1.00 0.05
## MonthlyIncome -0.04 -0.03 -0.02 0.05 1.00
## NumCompaniesWorked 0.30 0.04 -0.01 -0.01 -0.02
## NumCompaniesWorked PercentSalaryHike StockOptionLevel
## Age 0.30 -0.03 -0.03
## Attrition 0.04 0.03 -0.01
## DistanceFromHome -0.01 0.04 0.01
## JobLevel -0.01 0.01 0.00
## MonthlyIncome -0.02 0.00 0.03
## NumCompaniesWorked 1.00 0.03 0.02
## TotalWorkingYears TrainingTimesLastYear YearsAtCompany
## Age 0.68 -0.03 0.31
## Attrition -0.17 -0.05 -0.13
## DistanceFromHome 0.01 -0.01 0.03
## JobLevel -0.04 -0.03 -0.06
## MonthlyIncome -0.03 0.05 0.00
## NumCompaniesWorked 0.24 -0.03 -0.12
## YearsSinceLastPromotion YearsWithCurrManager
## Age 0.22 0.20
## Attrition -0.03 -0.16
## DistanceFromHome 0.00 0.02
## JobLevel -0.06 -0.06
## MonthlyIncome 0.07 0.02
## NumCompaniesWorked -0.04 -0.11
## BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently
## Age -0.01 -0.02
## Attrition -0.07 0.12
## DistanceFromHome -0.03 0.00
## JobLevel -0.04 -0.01
## MonthlyIncome 0.06 -0.03
## NumCompaniesWorked 0.00 -0.04
## BusinessTravel_Travel_Rarely Department_Human Resources
## Age 0.03 -0.01
## Attrition -0.05 0.08
## DistanceFromHome 0.02 -0.02
## JobLevel 0.04 0.04
## MonthlyIncome -0.01 -0.03
## NumCompaniesWorked 0.03 -0.06
## Department_Research & Development Department_Sales
## Age 0.02 -0.02
## Attrition -0.02 -0.02
## DistanceFromHome 0.01 0.00
## JobLevel -0.04 0.02
## MonthlyIncome 0.06 -0.05
## NumCompaniesWorked 0.05 -0.03
## Education_Bachelor Education_Below College Education_College
## Age -0.03 0.04 0.00
## Attrition -0.01 -0.01 0.04
## DistanceFromHome -0.02 0.05 -0.03
## JobLevel 0.03 -0.06 -0.01
## MonthlyIncome 0.04 -0.02 0.00
## NumCompaniesWorked 0.01 0.01 0.00
## Education_Doctor Education_Master
## Age -0.05 0.02
## Attrition -0.01 -0.01
## DistanceFromHome 0.00 0.02
## JobLevel 0.00 0.01
## MonthlyIncome 0.00 -0.03
## NumCompaniesWorked 0.01 -0.03
## EducationField_Human Resources EducationField_Life Sciences
## Age -0.05 0.00
## Attrition 0.09 0.01
## DistanceFromHome 0.03 0.01
## JobLevel 0.01 0.01
## MonthlyIncome -0.01 0.01
## NumCompaniesWorked -0.04 0.01
## EducationField_Marketing EducationField_Medical
## Age -0.03 0.01
## Attrition 0.00 0.00
## DistanceFromHome 0.01 -0.01
## JobLevel 0.04 -0.03
## MonthlyIncome -0.04 0.03
## NumCompaniesWorked 0.01 -0.01
## EducationField_Other EducationField_Technical Degree
## Age 0.00 0.05
## Attrition -0.03 -0.04
## DistanceFromHome -0.01 -0.01
## JobLevel -0.01 0.00
## MonthlyIncome 0.00 -0.01
## NumCompaniesWorked 0.00 0.01
## Gender_Female Gender_Male JobRole_Healthcare Representative
## Age 0.04 -0.04 -0.05
## Attrition -0.02 0.02 -0.01
## DistanceFromHome 0.04 -0.04 -0.02
## JobLevel 0.03 -0.03 0.01
## MonthlyIncome -0.01 0.01 -0.03
## NumCompaniesWorked 0.07 -0.07 0.01
## JobRole_Human Resources JobRole_Laboratory Technician
## Age 0.00 0.03
## Attrition -0.01 0.00
## DistanceFromHome 0.00 0.05
## JobLevel 0.03 -0.03
## MonthlyIncome -0.03 0.01
## NumCompaniesWorked 0.03 0.01
## JobRole_Manager JobRole_Manufacturing Director
## Age 0.01 0.00
## Attrition -0.02 -0.05
## DistanceFromHome -0.04 0.00
## JobLevel 0.03 0.00
## MonthlyIncome -0.01 0.03
## NumCompaniesWorked -0.01 0.00
## JobRole_Research Director JobRole_Research Scientist
## Age -0.02 0.01
## Attrition 0.05 0.03
## DistanceFromHome 0.00 0.00
## JobLevel 0.03 0.02
## MonthlyIncome 0.00 0.00
## NumCompaniesWorked 0.01 -0.02
## JobRole_Sales Executive JobRole_Sales Representative
## Age 0.01 -0.01
## Attrition 0.01 -0.01
## DistanceFromHome -0.01 0.01
## JobLevel -0.02 -0.02
## MonthlyIncome 0.00 0.00
## NumCompaniesWorked -0.02 0.01
## MaritalStatus_Divorced MaritalStatus_Married
## Age 0.03 0.08
## Attrition -0.09 -0.09
## DistanceFromHome 0.00 0.04
## JobLevel 0.01 0.01
## MonthlyIncome 0.03 0.02
## NumCompaniesWorked 0.04 -0.01
## MaritalStatus_Single EnvironmentSatisfaction_High
## Age -0.12 -0.01
## Attrition 0.18 -0.04
## DistanceFromHome -0.04 0.01
## JobLevel -0.02 -0.03
## MonthlyIncome -0.05 -0.03
## NumCompaniesWorked -0.02 -0.03
## EnvironmentSatisfaction_Low EnvironmentSatisfaction_Medium
## Age 0.01 -0.02
## Attrition 0.12 -0.02
## DistanceFromHome -0.01 -0.02
## JobLevel 0.02 0.02
## MonthlyIncome -0.01 0.05
## NumCompaniesWorked 0.01 -0.02
## EnvironmentSatisfaction_Very High EnvironmentSatisfaction_NA
## Age 0.02 -0.02
## Attrition -0.05 0.01
## DistanceFromHome 0.02 0.00
## JobLevel -0.01 -0.01
## MonthlyIncome 0.00 0.01
## NumCompaniesWorked 0.03 0.01
## JobSatisfaction_High JobSatisfaction_Low
## Age 0.01 0.00
## Attrition 0.01 0.09
## DistanceFromHome 0.02 0.00
## JobLevel -0.03 0.03
## MonthlyIncome 0.05 0.01
## NumCompaniesWorked -0.01 0.03
## JobSatisfaction_Medium JobSatisfaction_Very High
## Age 0.00 -0.01
## Attrition 0.00 -0.09
## DistanceFromHome 0.00 -0.02
## JobLevel 0.00 0.01
## MonthlyIncome -0.05 -0.02
## NumCompaniesWorked 0.03 -0.05
## JobSatisfaction_NA WorkLifeBalance_Bad WorkLifeBalance_Best
## Age 0.00 0.01 -0.01
## Attrition -0.02 0.10 0.02
## DistanceFromHome -0.01 0.03 -0.02
## JobLevel -0.02 0.01 -0.03
## MonthlyIncome -0.02 0.03 0.02
## NumCompaniesWorked 0.01 0.04 0.04
## WorkLifeBalance_Better WorkLifeBalance_Good
## Age -0.01 0.02
## Attrition -0.06 0.01
## DistanceFromHome 0.05 -0.06
## JobLevel 0.00 0.01
## MonthlyIncome -0.01 -0.02
## NumCompaniesWorked -0.04 0.00
## WorkLifeBalance_NA JobInvolvement_High JobInvolvement_Low
## Age 0.02 -0.02 0.00
## Attrition -0.01 -0.03 0.04
## DistanceFromHome 0.01 -0.04 0.01
## JobLevel -0.01 0.01 -0.01
## MonthlyIncome 0.00 0.03 -0.03
## NumCompaniesWorked -0.01 -0.02 -0.01
## JobInvolvement_Medium JobInvolvement_Very High
## Age -0.01 0.04
## Attrition 0.00 0.02
## DistanceFromHome 0.02 0.03
## JobLevel 0.01 -0.03
## MonthlyIncome -0.01 -0.01
## NumCompaniesWorked -0.01 0.05
## PerformanceRating_Excellent PerformanceRating_Outstanding
## Age 0.03 -0.03
## Attrition -0.02 0.02
## DistanceFromHome -0.04 0.04
## JobLevel 0.00 0.00
## MonthlyIncome -0.02 0.02
## NumCompaniesWorked -0.02 0.02
Melt the correlation Matrix
melted_res <- reshape2::melt(res, value.name = "Correlation")
head(melted_res)
## Var1 Var2 Correlation
## 1 Age Age 1.00
## 2 Attrition Age -0.16
## 3 DistanceFromHome Age 0.01
## 4 JobLevel Age 0.00
## 5 MonthlyIncome Age -0.04
## 6 NumCompaniesWorked Age 0.30
write_csv(melted_res,"melted_res.csv")
Extract the Correlation of Attrition from Matrix.
attrition_ext <- melted_res[melted_res$Var2=="Attrition", c("Var1", "Var2", "Correlation")]
head(attrition_ext)
## Var1 Var2 Correlation
## 66 Age Attrition -0.16
## 67 Attrition Attrition 1.00
## 68 DistanceFromHome Attrition -0.01
## 69 JobLevel Attrition -0.01
## 70 MonthlyIncome Attrition -0.03
## 71 NumCompaniesWorked Attrition 0.04
write.csv(attrition_ext, "attrition_ext.csv")
Visualize the Matrix of Attrition above in a Bar chart by descending order.
ggplot(attrition_ext, aes(reorder(Var1, -Correlation), Correlation))+geom_bar(stat = "identity", fill="#99CCFF")+
theme_bw()+theme(axis.text.x = element_text(angle=90, size=8, vjust=0.5, hjust=1))+
scale_y_continuous(name="Correlation", limits = c(-0.2,1)) + geom_hline(yintercept = 0)