Load the dataset and appropriate packages

Dataset Description

Dataset > IBM HR Analytics Employee Attrition & Performance This is a fictional data set created by IBM data scientists. Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’.

Four ways to get initial understanding of the data

str(df)
## spec_tbl_df [1,470 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                     : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : chr [1:1470] "Yes" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ DailyRate               : num [1:1470] 1102 279 1373 1392 591 ...
##  $ Department              : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EmployeeCount           : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr [1:1470] "Female" "Male" "Male" "Female" ...
##  $ HourlyRate              : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
##  $ JobSatisfaction         : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr [1:1470] "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : num [1:1470] 5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num [1:1470] 19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : chr [1:1470] "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Attrition = col_character(),
##   ..   BusinessTravel = col_character(),
##   ..   DailyRate = col_double(),
##   ..   Department = col_character(),
##   ..   DistanceFromHome = col_double(),
##   ..   Education = col_double(),
##   ..   EducationField = col_character(),
##   ..   EmployeeCount = col_double(),
##   ..   EmployeeNumber = col_double(),
##   ..   EnvironmentSatisfaction = col_double(),
##   ..   Gender = col_character(),
##   ..   HourlyRate = col_double(),
##   ..   JobInvolvement = col_double(),
##   ..   JobLevel = col_double(),
##   ..   JobRole = col_character(),
##   ..   JobSatisfaction = col_double(),
##   ..   MaritalStatus = col_character(),
##   ..   MonthlyIncome = col_double(),
##   ..   MonthlyRate = col_double(),
##   ..   NumCompaniesWorked = col_double(),
##   ..   Over18 = col_character(),
##   ..   OverTime = col_character(),
##   ..   PercentSalaryHike = col_double(),
##   ..   PerformanceRating = col_double(),
##   ..   RelationshipSatisfaction = col_double(),
##   ..   StandardHours = col_double(),
##   ..   StockOptionLevel = col_double(),
##   ..   TotalWorkingYears = col_double(),
##   ..   TrainingTimesLastYear = col_double(),
##   ..   WorkLifeBalance = col_double(),
##   ..   YearsAtCompany = col_double(),
##   ..   YearsInCurrentRole = col_double(),
##   ..   YearsSinceLastPromotion = col_double(),
##   ..   YearsWithCurrManager = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(df)
##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:1470        Length:1470        Min.   : 102.0  
##  1st Qu.:30.00   Class :character   Class :character   1st Qu.: 465.0  
##  Median :36.00   Mode  :character   Mode  :character   Median : 802.0  
##  Mean   :36.92                                         Mean   : 802.5  
##  3rd Qu.:43.00                                         3rd Qu.:1157.0  
##  Max.   :60.00                                         Max.   :1499.0  
##   Department        DistanceFromHome   Education     EducationField    
##  Length:1470        Min.   : 1.000   Min.   :1.000   Length:1470       
##  Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
##                     Mean   : 9.193   Mean   :2.913                     
##                     3rd Qu.:14.000   3rd Qu.:4.000                     
##                     Max.   :29.000   Max.   :5.000                     
##  EmployeeCount EmployeeNumber   EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   1.0   Min.   :1.000           Length:1470       
##  1st Qu.:1     1st Qu.: 491.2   1st Qu.:2.000           Class :character  
##  Median :1     Median :1020.5   Median :3.000           Mode  :character  
##  Mean   :1     Mean   :1024.9   Mean   :2.722                             
##  3rd Qu.:1     3rd Qu.:1555.8   3rd Qu.:4.000                             
##  Max.   :1     Max.   :2068.0   Max.   :4.000                             
##    HourlyRate     JobInvolvement    JobLevel       JobRole         
##  Min.   : 30.00   Min.   :1.00   Min.   :1.000   Length:1470       
##  1st Qu.: 48.00   1st Qu.:2.00   1st Qu.:1.000   Class :character  
##  Median : 66.00   Median :3.00   Median :2.000   Mode  :character  
##  Mean   : 65.89   Mean   :2.73   Mean   :2.064                     
##  3rd Qu.: 83.75   3rd Qu.:3.00   3rd Qu.:3.000                     
##  Max.   :100.00   Max.   :4.00   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.000   Length:1470        Min.   : 1009   Min.   : 2094  
##  1st Qu.:2.000   Class :character   1st Qu.: 2911   1st Qu.: 8047  
##  Median :3.000   Mode  :character   Median : 4919   Median :14236  
##  Mean   :2.729                      Mean   : 6503   Mean   :14313  
##  3rd Qu.:4.000                      3rd Qu.: 8379   3rd Qu.:20462  
##  Max.   :4.000                      Max.   :19999   Max.   :26999  
##  NumCompaniesWorked    Over18            OverTime         PercentSalaryHike
##  Min.   :0.000      Length:1470        Length:1470        Min.   :11.00    
##  1st Qu.:1.000      Class :character   Class :character   1st Qu.:12.00    
##  Median :2.000      Mode  :character   Mode  :character   Median :14.00    
##  Mean   :2.693                                            Mean   :15.21    
##  3rd Qu.:4.000                                            3rd Qu.:18.00    
##  Max.   :9.000                                            Max.   :25.00    
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.154     Mean   :2.712            Mean   :80    Mean   :0.7939  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.28     Mean   :2.799         Mean   :2.761   Mean   : 7.008  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 3.000     Median : 1.000          Median : 3.000      
##  Mean   : 4.229     Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :18.000     Max.   :15.000          Max.   :17.000
colnames(df)
##  [1] "Age"                      "Attrition"               
##  [3] "BusinessTravel"           "DailyRate"               
##  [5] "Department"               "DistanceFromHome"        
##  [7] "Education"                "EducationField"          
##  [9] "EmployeeCount"            "EmployeeNumber"          
## [11] "EnvironmentSatisfaction"  "Gender"                  
## [13] "HourlyRate"               "JobInvolvement"          
## [15] "JobLevel"                 "JobRole"                 
## [17] "JobSatisfaction"          "MaritalStatus"           
## [19] "MonthlyIncome"            "MonthlyRate"             
## [21] "NumCompaniesWorked"       "Over18"                  
## [23] "OverTime"                 "PercentSalaryHike"       
## [25] "PerformanceRating"        "RelationshipSatisfaction"
## [27] "StandardHours"            "StockOptionLevel"        
## [29] "TotalWorkingYears"        "TrainingTimesLastYear"   
## [31] "WorkLifeBalance"          "YearsAtCompany"          
## [33] "YearsInCurrentRole"       "YearsSinceLastPromotion" 
## [35] "YearsWithCurrManager"
dim(df)
## [1] 1470   35
class(df)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"
head(df)
## # A tibble: 6 x 35
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <dbl> <chr>     <chr>              <dbl> <chr>                 <dbl>     <dbl>
## 1    41 Yes       Travel_Rarely       1102 Sales                     1         2
## 2    49 No        Travel_Freque~       279 Research ~                8         1
## 3    37 Yes       Travel_Rarely       1373 Research ~                2         2
## 4    33 No        Travel_Freque~      1392 Research ~                3         4
## 5    27 No        Travel_Rarely        591 Research ~                2         1
## 6    32 No        Travel_Freque~      1005 Research ~                2         2
## # ... with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## #   EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## #   PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## #   RelationshipSatisfaction <dbl>, StandardHours <dbl>, ...
tail(df)
## # A tibble: 6 x 35
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <dbl> <chr>     <chr>              <dbl> <chr>                 <dbl>     <dbl>
## 1    26 No        Travel_Rarely       1167 Sales                     5         3
## 2    36 No        Travel_Freque~       884 Research ~               23         2
## 3    39 No        Travel_Rarely        613 Research ~                6         1
## 4    27 No        Travel_Rarely        155 Research ~                4         3
## 5    49 No        Travel_Freque~      1023 Sales                     2         3
## 6    34 No        Travel_Rarely        628 Research ~                8         3
## # ... with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## #   EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## #   PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## #   RelationshipSatisfaction <dbl>, StandardHours <dbl>, ...

Four ways of subsetting / choosing row or columns

  1. Department Names Get the ‘Sales’ department
sd<-df[df$Department=="Sales",]

head(sd,5)
## # A tibble: 5 x 35
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <dbl> <chr>     <chr>              <dbl> <chr>                 <dbl>     <dbl>
## 1    41 Yes       Travel_Rarely       1102 Sales                     1         2
## 2    53 No        Travel_Rarely       1219 Sales                     2         4
## 3    36 Yes       Travel_Rarely       1218 Sales                     9         4
## 4    42 No        Travel_Rarely        691 Sales                     8         4
## 5    46 No        Travel_Rarely        705 Sales                     2         4
## # ... with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## #   EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## #   PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## #   RelationshipSatisfaction <dbl>, StandardHours <dbl>, ...
  1. Indexing Features Sub-setting Age & Attrition for Sales Department
sd_age_attri<-sd[,c(1:2,5)]

head(sd_age_attri,5)
## # A tibble: 5 x 3
##     Age Attrition Department
##   <dbl> <chr>     <chr>     
## 1    41 Yes       Sales     
## 2    53 No        Sales     
## 3    36 Yes       Sales     
## 4    42 No        Sales     
## 5    46 No        Sales
  1. Subset Function
  1. Sub-setting Age & Attrition for Sales Department that is older than 35
  2. Sub-setting Department & Age from Education Field is Marketing
sd_age35above_attri<-subset(sd_age_attri,Age>35)
head(sd_age35above_attri,5)
## # A tibble: 5 x 3
##     Age Attrition Department
##   <dbl> <chr>     <chr>     
## 1    41 Yes       Sales     
## 2    53 No        Sales     
## 3    36 Yes       Sales     
## 4    42 No        Sales     
## 5    46 No        Sales
edu_marketing_age_d<-subset(df,EducationField=="Marketing",select = c(Age,Department,EducationField))
head(edu_marketing_age_d,5)
## # A tibble: 5 x 3
##     Age Department EducationField
##   <dbl> <chr>      <chr>         
## 1    42 Sales      Marketing     
## 2    46 Sales      Marketing     
## 3    50 Sales      Marketing     
## 4    35 Sales      Marketing     
## 5    34 Sales      Marketing
  1. Using filter() function from deplyr package Sub-setting monthly income > 3000
income_3kabove<-filter(df,MonthlyIncome>3000)


head(income_3kabove,5)
## # A tibble: 5 x 35
##     Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
##   <dbl> <chr>     <chr>              <dbl> <chr>                 <dbl>     <dbl>
## 1    41 Yes       Travel_Rarely       1102 Sales                     1         2
## 2    49 No        Travel_Freque~       279 Research ~                8         1
## 3    27 No        Travel_Rarely        591 Research ~                2         1
## 4    32 No        Travel_Freque~      1005 Research ~                2         2
## 5    38 No        Travel_Freque~       216 Research ~               23         3
## # ... with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## #   EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## #   HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## #   JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## #   MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## #   PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## #   RelationshipSatisfaction <dbl>, StandardHours <dbl>, ...

Four ways to Preprocess data (Cleaning, etc)

  1. “Janitor” package
##removing empty column or rows
df_clean<-df%>%
  remove_empty(which=c("rows"))%>%
  remove_empty(which=c("cols"))

##removing duplicate records
df_clean2<-df_clean[!duplicated(df_clean), ]

dim(df_clean2)
## [1] 1470   35
  1. Factor all character columns
## The conversion
df_clean2[sapply(df_clean2, is.character)] <- lapply(df_clean2[sapply(df_clean2, is.character)],as.factor)
str(df_clean2)
## tibble [1,470 x 35] (S3: tbl_df/tbl/data.frame)
##  $ Age                     : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ DailyRate               : num [1:1470] 1102 279 1373 1392 591 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ DistanceFromHome        : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ EmployeeCount           : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ HourlyRate              : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 7 3 7 3 3 3 3 5 1 ...
##  $ JobSatisfaction         : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ MonthlyIncome           : num [1:1470] 5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num [1:1470] 19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ PercentSalaryHike       : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
  1. Dealing with missing data using is.na() function
sum(is.na(df_clean2))
## [1] 0
df_clean2[df_clean2=="N/A"]<-NA
df_clean2[df_clean2=="NA"]<-NA
sum(is.na(df_clean2))
## [1] 0
  1. Remove irrelevant column
df_clean3 = subset(df_clean2, select = -c(StockOptionLevel,Over18,JobRole,EmployeeCount,EmployeeNumber) )
dim(df_clean3)
## [1] 1470   30
  1. Create a new column
df_clean3$PastWorkingYears <- (df_clean3$TotalWorkingYears - df_clean3$YearsAtCompany)
df_clean3 = subset(df_clean3, select = -c(TotalWorkingYears) )
dim(df_clean3)
## [1] 1470   30
str(df_clean3)
## tibble [1,470 x 30] (S3: tbl_df/tbl/data.frame)
##  $ Age                     : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 1 1 1 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 2 3 2 3 2 3 3 2 3 ...
##  $ DailyRate               : num [1:1470] 1102 279 1373 1392 591 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 2 2 2 2 2 2 2 ...
##  $ DistanceFromHome        : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 2 5 2 4 2 4 2 2 4 ...
##  $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 1 2 2 1 2 2 1 2 2 2 ...
##  $ HourlyRate              : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobSatisfaction         : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 2 3 2 2 3 2 1 3 2 ...
##  $ MonthlyIncome           : num [1:1470] 5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num [1:1470] 19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 2 1 2 2 1 1 2 1 1 1 ...
##  $ PercentSalaryHike       : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
##  $ TrainingTimesLastYear   : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
##  $ PastWorkingYears        : num [1:1470] 2 0 7 0 4 1 11 0 1 10 ...

Write one function and use the function for the dataset

grouped_mean <- function(data, group_var, summary_var) {
  data %>%
    group_by(group_var) %>%
    summarise(mean = mean(summary_var))
  
 # message(paste("The mean is: "),mean_col, paste(" by "),group_col)
}

Mean <- function(data, group_col,mean_col){
  group_col <- enquo(group_col)
  mean_col<- enquo(mean_col)
  
  data %>%
    group_by((!!group_col)) %>%
    summarise(mean((!!mean_col)))
  
}
  

AverageIncome<-Mean(df_clean3, Department, MonthlyIncome)
AverageAge<-Mean(df_clean3, Department, Age)

Average monthly income by department

AverageIncome
## # A tibble: 3 x 2
##   Department             `mean(MonthlyIncome)`
##   <fct>                                  <dbl>
## 1 Human Resources                        6655.
## 2 Research & Development                 6281.
## 3 Sales                                  6959.

Average monthly income by department

AverageAge
## # A tibble: 3 x 2
##   Department             `mean(Age)`
##   <fct>                        <dbl>
## 1 Human Resources               37.8
## 2 Research & Development        37.0
## 3 Sales                         36.5

EDA

  1. Bivariate EDA
table(df_clean3$Attrition) #237 employee quit job
## 
##   No  Yes 
## 1233  237
ggplot(df_clean3, aes(Attrition)) +
  geom_bar(position = "dodge", aes(y=(..count..)/sum(..count..), fill=Attrition)) + 
  scale_y_continuous(labels=scales::percent) +
  ylab("relative frequencies") +
  xlab("Attriton") +
  geom_text(aes(label = scales::percent((..count..)/sum(..count..)), y=(..count..)/sum(..count..)), stat= "count",vjust =-.5)+
  scale_fill_brewer(palette="Set1")

Age - Age of the employee

ggplot(df_clean3, aes(Age, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "Age vs. Attrition")+
  scale_fill_brewer(palette="Set1")#The attriton of younger employees are more than the old employees.

ggplot(df_clean3, aes(x=Attrition, Age, color=Attrition)) +
  geom_boxplot() +
  scale_color_manual(values=c("#CB181D", "#2171B5")) #There is a two outlier for Attrition is yes. #The people age betwwon 40 and 33 quiting their jobs

#Test stat
shapiro.test(df_clean3$Age) #the age distribution is not normal. Therefore we conduct the wilcox test
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$Age
## W = 0.97745, p-value = 2.037e-14
wilcox.test(Age ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the difference of ages between attritions is statistically significant
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Age by Attrition
## W = 185362, p-value = 5.304e-11
## alternative hypothesis: true location shift is not equal to 0

Department - Department in company

summary(df_clean3$Department)
##        Human Resources Research & Development                  Sales 
##                     63                    961                    446
table(df_clean3$Department, df_clean3$Attrition)
##                         
##                           No Yes
##   Human Resources         51  12
##   Research & Development 828 133
##   Sales                  354  92
ggplot(df_clean3,aes(x=Attrition,group=Department))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Department)+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Attrition vs. Department")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_y_continuous(labels=scales::percent) +
  ylab("relative frequencies") +
  scale_fill_brewer(palette="Set1")

#Test stat
chisq.test(df_clean3$Department, df_clean3$Attrition) #there is a significant relationship between department and Attrition
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$Department and df_clean3$Attrition
## X-squared = 10.796, df = 2, p-value = 0.004526
#HR has the highest attrition between departments 
#However in R&D the highest number of employees quit their job
#Even the employees in R&D has the highest number of employee quit their job, in the scales the percentage of employees in HR higher than R&D

BusinessTravel - How frequently the employees travelled for business purposes in the last year

summary(df_clean3$BusinessTravel)
##        Non-Travel Travel_Frequently     Travel_Rarely 
##               150               277              1043
ggplot(df_clean3,aes(x=Attrition,group=BusinessTravel))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~BusinessTravel)+
  labs(x="Attrition",y="Percentage",title="Attrition vs. BusinessTravel")+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_y_continuous(labels=scales::percent) +
  ylab("relative frequencies") +
  scale_fill_brewer(palette="Set1")

#Test stat
chisq.test(df_clean3$BusinessTravel, df_clean3$Attrition) #there is a significant relationship between two variables
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$BusinessTravel and df_clean3$Attrition
## X-squared = 24.182, df = 2, p-value = 5.609e-06
#There is a relationship between business travel and attrition and we can see that in the scales plot 
#the highest attriton rate in employees travel frequently
# In the numerical the highest employee quiting their job is travel_rarely

DistanceFromHome - Distance from home in kms

df_clean3 %>% 
  summarise(Median = median(DistanceFromHome),
            Mean = mean(DistanceFromHome), 
            Max = max(DistanceFromHome), 
            Min = min(DistanceFromHome))
## # A tibble: 1 x 4
##   Median  Mean   Max   Min
##    <dbl> <dbl> <dbl> <dbl>
## 1      7  9.19    29     1
#Plots
ggplot(df_clean3, aes(DistanceFromHome, fill=Attrition)) +
  geom_density()

ggplot(df_clean3, aes(x=Attrition, DistanceFromHome)) +
  geom_boxplot() #There is no outlier

#Test stat
shapiro.test(df_clean3$DistanceFromHome) #the DistanceFromHome  distribution is not normal. Therefore we conduct the wilcox test
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$DistanceFromHome
## W = 0.86159, p-value < 2.2e-16
wilcox.test(DistanceFromHome ~ Attrition, data=df_clean3) # we can conclude that there is not significant difference between attritions in terms of DistanceFromHome
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  DistanceFromHome by Attrition
## W = 127996, p-value = 0.002387
## alternative hypothesis: true location shift is not equal to 0

Education - Education Level 1 ‘Below College’ 2 ‘College’ 3 ‘Bachelor’ 4 ‘Master’ 5 ‘Doctor’

table(df_clean3$Education, df_clean3$Attrition)
##    
##      No Yes
##   1 139  31
##   2 238  44
##   3 473  99
##   4 340  58
##   5  43   5
ggplot(df_clean3,aes(x=Attrition,group=Education))+
  geom_bar(aes(y=..prop..,fill=factor(..group..)),stat="count")+
  facet_grid(~Education)+
  labs(x="Attrition",y="Percentage",title="Attrition vs. EducationLevel")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_y_continuous(labels=scales::percent) +
  ylab("relative frequencies") +
  scale_fill_discrete(name="Education Level", label=c("Below College", "College", "Bachelor", "Master", "Doctor") )

#Test stat
chisq.test(df_clean3$Education, df_clean3$Attrition) #Since p>0.05 there is no significant relationshio between Attrition and education
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$Education and df_clean3$Attrition
## X-squared = 3.074, df = 4, p-value = 0.5455
#In the plot we can conclude that the attrition rate is simillar for all education groups.

EducationField - Field of education

summary(df_clean3$EducationField)
##  Human Resources    Life Sciences        Marketing          Medical 
##               27              606              159              464 
##            Other Technical Degree 
##               82              132
options(repr.plot.width=10, repr.plot.height=6) 
attr.edu <- df_clean3 %>% select(EducationField, Attrition) %>% group_by(EducationField, Attrition) %>% summarize(amount=n(), .groups='drop') %>%
  mutate(pct=round(prop.table(amount),2) * 100) %>% arrange(pct)

nofunc <- colorRampPalette(c("#2171B5"))
yesfunc <- colorRampPalette(c("#CB181D"))

yes.attr <- attr.edu %>% filter(Attrition == "Yes") %>% arrange(EducationField) 
no.attr <- attr.edu %>% filter(Attrition == "No") %>% arrange(EducationField)

par(mar = pyramid.plot(no.attr$pct, yes.attr$pct, labels = unique(attr.edu$EducationField),
                       top.labels=c("No","","Yes"), main = "Attrition by Field of Education", 
                       gap=10, show.values = T, rxcol = yesfunc(6), lxcol = nofunc(6)))

## 35 35
#Test statistics
chisq.test(df_clean3$EducationField, df_clean3$Attrition)
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be
## incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$EducationField and df_clean3$Attrition
## X-squared = 16.025, df = 5, p-value = 0.006774

Gender

table(df_clean3$Gender, df_clean3$Attrition)
##         
##           No Yes
##   Female 501  87
##   Male   732 150
ggplot(df_clean3,aes(x=Attrition,group=Gender))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Gender)+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Gender Vs Attrition %")+
  scale_y_continuous(labels=scales::percent) +
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3,aes(x=Gender,group=Attrition))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Attrition)+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Gender Vs Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

#Test stat
chisq.test(df_clean3$Gender, df_clean3$Attrition) #Insignifcant, There is no relationship between gender and attrition
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df_clean3$Gender and df_clean3$Attrition
## X-squared = 1.117, df = 1, p-value = 0.2906

JobLevel Job level at company on a scale of 1 to 5

table(df_clean3$JobLevel)
## 
##   1   2   3   4   5 
## 543 534 218 106  69
ggplot(df_clean3,aes(x=Attrition,group=JobLevel))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~JobLevel)+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Job Level Vs Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

chisq.test(df_clean3$JobLevel, df_clean3$Attrition) #significant
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$JobLevel and df_clean3$Attrition
## X-squared = 72.529, df = 4, p-value = 6.635e-15

MaritalStatus - Marital status of the employee

table(df_clean3$MaritalStatus)
## 
## Divorced  Married   Single 
##      327      673      470
summary(df_clean3$MaritalStatus)
## Divorced  Married   Single 
##      327      673      470
ggplot(df_clean3,aes(x=Attrition,group=MaritalStatus))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~MaritalStatus)+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Marital Status Vs Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

#Test stat
chisq.test(df_clean3$MaritalStatus, df_clean3$Attrition) #significant
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$MaritalStatus and df_clean3$Attrition
## X-squared = 46.164, df = 2, p-value = 9.456e-11

MonthlyIncome - Monthly income in rupees per month

df_clean3 %>% 
  summarise(Median = median(MonthlyIncome), 
            Mean = mean(MonthlyIncome),
            Max = max(MonthlyIncome), 
            Min = min(MonthlyIncome))
## # A tibble: 1 x 4
##   Median  Mean   Max   Min
##    <dbl> <dbl> <dbl> <dbl>
## 1   4919 6503. 19999  1009
#plots
ggplot(df_clean3, aes(MonthlyIncome, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "Monthly Income vs. Attrition")+
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3, aes(x=Attrition, MonthlyIncome, color=Attrition)) +
  geom_boxplot() +
  scale_color_brewer(palette="Set1") 

#There are several outlier however, the outliers not excluded in the dataset because the employees with manager
#level also quit their job and stay in the company.
#Therefore, we can conclude that montly income not change with attrition but in the analysis, we can see 
#that several relationship bwteen MonthlyIncome and Attrition
#Test stat
shapiro.test(df_clean3$MonthlyIncome) #the MonthlyIncome  distribution is not normal. Therefore we conduct the wilcox test
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$MonthlyIncome
## W = 0.82791, p-value < 2.2e-16
wilcox.test(MonthlyIncome ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the difference of 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  MonthlyIncome by Attrition
## W = 191601, p-value = 2.951e-14
## alternative hypothesis: true location shift is not equal to 0
#Mounthly Income between attritions is statistically significant

NumCompaniesWorked - Total number of companies the employee has worked for

table(df_clean3$NumCompaniesWorked, df_clean3$Attrition)
##    
##      No Yes
##   0 174  23
##   1 423  98
##   2 130  16
##   3 143  16
##   4 122  17
##   5  47  16
##   6  54  16
##   7  57  17
##   8  43   6
##   9  40  12
#Plot
ggplot(df_clean3,aes(x=Attrition,group=NumCompaniesWorked))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~NumCompaniesWorked)+
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="NumCompaniesWorked Vs Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1") #If an emplyee worked 5 or more companies they are tendencing to subject to attrition

#Test statistic
shapiro.test(df_clean3$NumCompaniesWorked) #NumCompaniesWorked is not normal. Therefore we conduct the wilcox test
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$NumCompaniesWorked
## W = 0.84878, p-value < 2.2e-16
wilcox.test(NumCompaniesWorked ~ Attrition, data=df_clean3) #Not significant
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  NumCompaniesWorked by Attrition
## W = 139292, p-value = 0.2424
## alternative hypothesis: true location shift is not equal to 0

PercentSalaryHike - Percent salary hike for last year

df_clean3 %>% 
  summarise(Median = median(PercentSalaryHike), 
            Mean = mean(PercentSalaryHike),
            Max = max(PercentSalaryHike), 
            Min = min(PercentSalaryHike))
## # A tibble: 1 x 4
##   Median  Mean   Max   Min
##    <dbl> <dbl> <dbl> <dbl>
## 1     14  15.2    25    11
ggplot(df_clean3, aes(PercentSalaryHike, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "PercentSalaryHike vs. Attrition")+
  scale_fill_brewer(palette="Set1")#The attriton of emplyees are high when the percentsalaryhike increase

ggplot(df_clean3, aes(x=Attrition, PercentSalaryHike, color=Attrition)) +
  geom_boxplot() +
  scale_color_manual(values=c("#CB181D", "#2171B5")) #There is a no outlier.

#Test stat
shapiro.test(df_clean3$Age) #the age distribution is not normal. Therefore we conduct the wilcox test
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$Age
## W = 0.97745, p-value = 2.037e-14
wilcox.test(Age ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the difference of ages between attritions is statistically significant
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Age by Attrition
## W = 185362, p-value = 5.304e-11
## alternative hypothesis: true location shift is not equal to 0

TrainingTimesLastYear - Number of times training was conducted for this employee last year

table(df_clean3$TrainingTimesLastYear)
## 
##   0   1   2   3   4   5   6 
##  54  71 547 491 123 119  65
#Plots
ggplot(df_clean3, aes(TrainingTimesLastYear, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "TrainingTimesLastYear vs. Attrition")+
  scale_fill_brewer(palette="Set1")#

ggplot(df_clean3, aes(x=Attrition, TrainingTimesLastYear, color=Attrition)) +
  geom_boxplot() +
  scale_color_manual(values=c("#CB181D", "#2171B5")) #There is a several outlier in the attrition ratio "Yes". However, we can think that this outliers can be retirement age and they are exit the labor force.

#Test stat
shapiro.test(df_clean3$TrainingTimesLastYear) #the TrainingTimesLastYear distribution is not normal. 
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$TrainingTimesLastYear
## W = 0.89509, p-value < 2.2e-16
#Therefore we conduct the wilcox test
wilcox.test(TrainingTimesLastYear ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the difference 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  TrainingTimesLastYear by Attrition
## W = 157436, p-value = 0.0473
## alternative hypothesis: true location shift is not equal to 0
#of TrainingTimesLastYear between attritions is statistically significant

YearsAtCompany - Total number of years spent at the company by the employee

df_clean3 %>% 
  summarise(Median = median(YearsAtCompany), 
            Mean = mean(YearsAtCompany),
            Max = max(YearsAtCompany), 
            Min = min(YearsAtCompany))
## # A tibble: 1 x 4
##   Median  Mean   Max   Min
##    <dbl> <dbl> <dbl> <dbl>
## 1      5  7.01    40     0
ggplot(df_clean3, aes(YearsAtCompany, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "YearsAtCompany vs. Attrition")+
  scale_fill_brewer(palette="Set1")#

ggplot(df_clean3, aes(x=Attrition, YearsAtCompany, color=Attrition)) +
  geom_boxplot() +
  scale_color_manual(values=c("#CB181D", "#2171B5")) #There is a several outlier in the attrition ratio "Yes". However, we can think that this outliers can be retirement age and they are exit the labor force.

#Test stat
shapiro.test(df_clean3$YearsAtCompany) #the YearsAtCompany distribution is not normal. Therefore we conduct the wilcox test
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$YearsAtCompany
## W = 0.83899, p-value < 2.2e-16
wilcox.test(YearsAtCompany ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the difference of 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsAtCompany by Attrition
## W = 189639, p-value = 2.916e-13
## alternative hypothesis: true location shift is not equal to 0
#YearsAtCompany between attritions is statistically significant

YearsSinceLastPromotion - Number of years since last promotion

df_clean3 %>% 
  summarise(Median = median(YearsSinceLastPromotion), 
            Mean = mean(YearsSinceLastPromotion),
            Max = max(YearsSinceLastPromotion), 
            Min = min(YearsSinceLastPromotion))
## # A tibble: 1 x 4
##   Median  Mean   Max   Min
##    <dbl> <dbl> <dbl> <dbl>
## 1      1  2.19    15     0
#Plots
ggplot(df_clean3, aes(YearsSinceLastPromotion, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "YearsSinceLastPromotion vs. Attrition")+
  scale_fill_brewer(palette="Set1")#The last promotion between years 5 to 10, the employees have mor tendecing to subject to attrition

ggplot(df_clean3, aes(x=Attrition, YearsSinceLastPromotion, color=Attrition)) +
  geom_boxplot() +
  scale_color_manual(values=c("#CB181D", "#2171B5")) #There is a several outlier in the attrition ratio "Yes". However, we can think that this outliers can be retirement age and they are exit the labor force.

#Test stat
shapiro.test(df_clean3$YearsSinceLastPromotion) #the YearsSinceLastPromotion distribution is not normal. 
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$YearsSinceLastPromotion
## W = 0.70373, p-value < 2.2e-16
#Therefore we conduct the wilcox test
wilcox.test(YearsSinceLastPromotion ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the difference 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsSinceLastPromotion by Attrition
## W = 157847, p-value = 0.04118
## alternative hypothesis: true location shift is not equal to 0
#of YearsSinceLastPromotion between attritions is statistically significant

YearsWithCurrManager - Number of years under current manager

df_clean3 %>% 
  summarise(Median = median(YearsWithCurrManager), 
            Mean = mean(YearsWithCurrManager),
            Max = max(YearsWithCurrManager), 
            Min = min(YearsWithCurrManager))
## # A tibble: 1 x 4
##   Median  Mean   Max   Min
##    <dbl> <dbl> <dbl> <dbl>
## 1      3  4.12    17     0
#Plots
ggplot(df_clean3, aes(YearsWithCurrManager, color=Attrition, fill=Attrition)) +
  geom_density() +
  labs(title = "YearsWithCurrManager vs. Attrition")+
  scale_fill_brewer(palette="Set1")#The attriton of employees with same manager is lower

ggplot(df_clean3, aes(x=Attrition, YearsWithCurrManager, color=Attrition)) +
  geom_boxplot() +
  scale_color_manual(values=c("#CB181D", "#2171B5"))

#Test statistics
shapiro.test(df_clean3$YearsWithCurrManager) #the YearsWithCurrManager distribution is not normal. 
## 
##  Shapiro-Wilk normality test
## 
## data:  df_clean3$YearsWithCurrManager
## W = 0.89746, p-value < 2.2e-16
#Therefore we conduct the wilcox test
wilcox.test(YearsWithCurrManager ~ Attrition, data=df_clean3) #As p value is lower than 0,05 than the 
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  YearsWithCurrManager by Attrition
## W = 185860, p-value = 1.807e-11
## alternative hypothesis: true location shift is not equal to 0
#difference of YearsWithCurrManager between attritions is statistically significant

ggplots of employee survey with their percentages

survey_eplot <- gather(df_clean3, Satisfaction, value, EnvironmentSatisfaction, JobSatisfaction, WorkLifeBalance, na.rm = TRUE)
ggplot(survey_eplot, aes(factor(Satisfaction), fill=factor(value))) +
  geom_bar(position = "dodge", aes(y=(..count..)/sum(..count..))) + 
  scale_y_continuous(labels=scales::percent) +
  ylab("relative frequencies")+
  xlab(" ") +
  scale_fill_brewer(palette = "Set1")

EnvironmentSatisfaction

ggplot(df_clean3,aes(x=Attrition,group=EnvironmentSatisfaction), ordered=T)+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~EnvironmentSatisfaction)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Environment Satisfaction Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3,aes(x=EnvironmentSatisfaction,group=Attrition))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Attrition)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Environment Satisfaction Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

#Test.stat
chisq.test(df_clean3$EnvironmentSatisfaction, df_clean3$Attrition)#Significant
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$EnvironmentSatisfaction and df_clean3$Attrition
## X-squared = 22.504, df = 3, p-value = 5.123e-05

JobSatisfaction

ggplot(df_clean3,aes(x=Attrition,group=JobSatisfaction))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~JobSatisfaction)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Job Satisfaction Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3,aes(x=JobSatisfaction,group=Attrition))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Attrition)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Job Satisfaction Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

chisq.test(df_clean3$JobSatisfaction, df_clean3$Attrition)#Significant
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$JobSatisfaction and df_clean3$Attrition
## X-squared = 17.505, df = 3, p-value = 0.0005563

WorkLifeBalance

ggplot(df_clean3,aes(x=Attrition,group=WorkLifeBalance))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~WorkLifeBalance)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Work Life Balance Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3,aes(x=WorkLifeBalance,group=Attrition))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Attrition)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Work Life Balance Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

#Test.stat
chisq.test(df_clean3$WorkLifeBalance, df_clean3$Attrition)#Significant
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$WorkLifeBalance and df_clean3$Attrition
## X-squared = 16.325, df = 3, p-value = 0.0009726

ggplot of manager survey with their percentages WorkLifeBalance

ggplot(df_clean3,aes(x=Attrition,group=PerformanceRating))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~PerformanceRating)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Performance Rating Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3,aes(x=PerformanceRating,group=Attrition))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Attrition)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Performance Rating Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

#Test.stat
chisq.test(df_clean3$PerformanceRating, df_clean3$Attrition)#Insignificant
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  df_clean3$PerformanceRating and df_clean3$Attrition
## X-squared = 0.00015475, df = 1, p-value = 0.9901

JobInvolvement

ggplot(df_clean3,aes(x=Attrition,group=JobInvolvement))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~JobInvolvement)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="JobInvolvement Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

ggplot(df_clean3,aes(x=JobInvolvement,group=Attrition))+
  geom_bar(aes(y=..prop..,fill=factor(..x..)),stat="count")+
  facet_grid(~Attrition)+
  scale_y_continuous(labels=scales::percent) +
  theme(axis.text.x=element_text(angle=90,vjust=0.5),legend.position="none",plot.title=element_text(size=16,hjust=0.5))+
  labs(x="Attrition",y="Percentage",title="Job Involvement Vs. Attrition %")+
  geom_text(aes(label = scales::percent(..prop..), y = ..prop.. ),stat= "count",vjust =-.5) +
  scale_fill_brewer(palette="Set1")

#Test.stat
chisq.test(df_clean3$JobInvolvement, df_clean3$Attrition)#Insignificant
## 
##  Pearson's Chi-squared test
## 
## data:  df_clean3$JobInvolvement and df_clean3$Attrition
## X-squared = 28.492, df = 3, p-value = 2.863e-06

Correlation of variables to attrition

df_clean3$AttritionNum<-unclass(df_clean3$Attrition)
df_clean3$AttritionNum <- as.numeric(df_clean3$AttritionNum)
numerical_proper <- c("Age","MonthlyIncome",
               "YearsAtCompany", "YearsInCurrentRole",
               "YearsSinceLastPromotion", "YearsWithCurrManager",
               "PerformanceRating","NumCompaniesWorked","AttritionNum")

ggcorrplot(cor(df_clean3 %>%
                     select(any_of(numerical_proper)) %>%
                     rename("income" = "MonthlyIncome",
                            "y_comp" = "YearsAtCompany",
                            "y_role" = "YearsInCurrentRole",
                            "y_promo" = "YearsSinceLastPromotion",
                            "y_w_boss" = "YearsWithCurrManager",
                            "perform_r" = "PerformanceRating",
                            "past_job" = "NumCompaniesWorked",
                            "attrition"= "AttritionNum")),
         method = 'square', type = 'lower',lab = TRUE,
         colors = c("#E46726", "white", "#6D9EC1"))

Logistic Regression for Prediction of Attrition Rate

df_clean3$AttritionNum <- NULL
logisticdata <- df_clean3
#Logistic model with all variables
log_model_all <- glm(Attrition~., data=logisticdata, family = binomial)
summary(log_model_all)
## 
## Call:
## glm(formula = Attrition ~ ., family = binomial, data = logisticdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8224  -0.5175  -0.2669  -0.0993   3.3150  
## 
## Coefficients: (1 not defined because of singularities)
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       3.613e+00  1.400e+00   2.581 0.009848 ** 
## Age                              -3.049e-02  1.332e-02  -2.288 0.022125 *  
## BusinessTravelTravel_Frequently   1.972e+00  4.138e-01   4.765 1.89e-06 ***
## BusinessTravelTravel_Rarely       1.081e+00  3.837e-01   2.816 0.004856 ** 
## DailyRate                        -2.555e-04  2.169e-04  -1.178 0.238754    
## DepartmentResearch & Development -4.098e-01  5.518e-01  -0.743 0.457684    
## DepartmentSales                   2.818e-01  5.743e-01   0.491 0.623654    
## DistanceFromHome                  4.419e-02  1.050e-02   4.211 2.55e-05 ***
## Education                        -2.306e-02  8.597e-02  -0.268 0.788532    
## EducationFieldLife Sciences      -7.622e-01  7.884e-01  -0.967 0.333670    
## EducationFieldMarketing          -3.984e-01  8.346e-01  -0.477 0.633060    
## EducationFieldMedical            -8.994e-01  7.875e-01  -1.142 0.253423    
## EducationFieldOther              -7.179e-01  8.489e-01  -0.846 0.397773    
## EducationFieldTechnical Degree    1.056e-01  8.075e-01   0.131 0.895913    
## EnvironmentSatisfaction          -4.292e-01  8.154e-02  -5.263 1.42e-07 ***
## GenderMale                        4.230e-01  1.812e-01   2.335 0.019565 *  
## HourlyRate                        2.812e-04  4.296e-03   0.065 0.947808    
## JobInvolvement                   -5.437e-01  1.209e-01  -4.496 6.91e-06 ***
## JobLevel                         -3.780e-01  2.867e-01  -1.318 0.187387    
## JobSatisfaction                  -4.176e-01  7.993e-02  -5.225 1.74e-07 ***
## MaritalStatusMarried              3.699e-01  2.527e-01   1.464 0.143218    
## MaritalStatusSingle               1.409e+00  2.568e-01   5.487 4.08e-08 ***
## MonthlyIncome                    -3.825e-05  6.849e-05  -0.559 0.576490    
## MonthlyRate                       4.538e-06  1.221e-05   0.372 0.710154    
## NumCompaniesWorked                1.876e-01  3.787e-02   4.953 7.30e-07 ***
## OverTimeYes                       1.869e+00  1.870e-01   9.994  < 2e-16 ***
## PercentSalaryHike                -2.442e-02  3.823e-02  -0.639 0.522885    
## PerformanceRating                 1.686e-01  3.900e-01   0.432 0.665605    
## RelationshipSatisfaction         -2.470e-01  8.059e-02  -3.065 0.002180 ** 
## StandardHours                            NA         NA      NA       NA    
## TrainingTimesLastYear            -1.717e-01  7.163e-02  -2.397 0.016514 *  
## WorkLifeBalance                  -3.529e-01  1.206e-01  -2.927 0.003428 ** 
## YearsAtCompany                    4.760e-02  3.982e-02   1.195 0.231951    
## YearsInCurrentRole               -1.492e-01  4.460e-02  -3.345 0.000824 ***
## YearsSinceLastPromotion           1.797e-01  4.179e-02   4.300 1.71e-05 ***
## YearsWithCurrManager             -1.438e-01  4.656e-02  -3.089 0.002007 ** 
## PastWorkingYears                 -4.939e-02  2.832e-02  -1.744 0.081191 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1298.58  on 1469  degrees of freedom
## Residual deviance:  885.51  on 1434  degrees of freedom
## AIC: 957.51
## 
## Number of Fisher Scoring iterations: 6
mypreds <- predict(log_model_all, newdata = logisticdata)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
mypreds1 <- ifelse(exp(mypreds) < 0.5, "No", "Yes")
#Confusion matrix 
confusionMatrix(factor(mypreds1), factor(logisticdata$Attrition))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1140   94
##        Yes   93  143
##                                           
##                Accuracy : 0.8728          
##                  95% CI : (0.8547, 0.8894)
##     No Information Rate : 0.8388          
##     P-Value [Acc > NIR] : 0.0001521       
##                                           
##                   Kappa : 0.5289          
##                                           
##  Mcnemar's Test P-Value : 1.0000000       
##                                           
##             Sensitivity : 0.9246          
##             Specificity : 0.6034          
##          Pos Pred Value : 0.9238          
##          Neg Pred Value : 0.6059          
##              Prevalence : 0.8388          
##          Detection Rate : 0.7755          
##    Detection Prevalence : 0.8395          
##       Balanced Accuracy : 0.7640          
##                                           
##        'Positive' Class : No              
##