#The term attrition refers to the rate at which employees of an organization leave the workforce over a given period of time

#install.packages("tidyverse")
library(tidyverse)  # you will use the readr package in tidyverse to read in this data
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#install.packages('rpart')
#install.packages('caret')
#install.packages('dplyr')
#install.packages('ggplot2')


library(rpart)
library(caret)
## Warning: package 'caret' was built under R version 4.1.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
library(dplyr)

# Import the data set from the local CSV export.
# NOTE(review): the path is machine-specific; adjust it before running elsewhere.
HREmployeeAttrition <- read.csv(
  file = "/Users/Sileshi Boru/Downloads/HR_EmployeeAttrition.csv"
)

# HREmployeeAttrition is a data frame created by read.csv() above, not a
# package, so there is nothing to install for it.

# View the structure of the data set: column names, types and first values.
str(HREmployeeAttrition)
## 'data.frame':    1470 obs. of  27 variables:
##  $ Age                     : int  41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : logi  TRUE FALSE TRUE FALSE FALSE FALSE ...
##  $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ Department              : chr  "Sales" "Research & Development" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : int  1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : int  2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr  "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EnvironmentSatisfaction : int  2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr  "Female" "Male" "Male" "Female" ...
##  $ JobInvolvement          : int  3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : int  2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr  "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
##  $ JobSatisfaction         : int  4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr  "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : int  5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
##  $ NumCompaniesWorked      : int  8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : chr  "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr  "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : int  11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : int  3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: int  1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : int  8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : int  0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : int  1 3 3 3 3 2 2 3 3 2 ...
#names(HREmployeeAttrition)
# Print a compact overview of the data frame's columns, types and first values.
glimpse(HREmployeeAttrition)
## Rows: 1,470
## Columns: 27
## $ Age                      <int> 41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 2~
## $ Attrition                <lgl> TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE~
## $ BusinessTravel           <chr> "Travel_Rarely", "Travel_Frequently", "Travel~
## $ Department               <chr> "Sales", "Research & Development", "Research ~
## $ DistanceFromHome         <int> 1, 8, 2, 3, 2, 2, 3, 24, 23, 27, 16, 15, 26, ~
## $ Education                <int> 2, 1, 2, 4, 1, 2, 3, 1, 3, 3, 3, 2, 1, 2, 3, ~
## $ EducationField           <chr> "Life Sciences", "Life Sciences", "Other", "L~
## $ EmployeeCount            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ EnvironmentSatisfaction  <int> 2, 3, 4, 4, 1, 4, 3, 4, 4, 3, 1, 4, 1, 2, 3, ~
## $ Gender                   <chr> "Female", "Male", "Male", "Female", "Male", "~
## $ JobInvolvement           <int> 3, 2, 2, 3, 3, 3, 4, 3, 2, 3, 4, 2, 3, 3, 2, ~
## $ JobLevel                 <int> 2, 2, 1, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 1, 1, ~
## $ JobRole                  <chr> "Sales Executive", "Research Scientist", "Lab~
## $ JobSatisfaction          <int> 4, 2, 3, 3, 2, 4, 1, 3, 3, 3, 2, 3, 3, 4, 3, ~
## $ MaritalStatus            <chr> "Single", "Married", "Single", "Married", "Ma~
## $ MonthlyIncome            <int> 5993, 5130, 2090, 2909, 3468, 3068, 2670, 269~
## $ NumCompaniesWorked       <int> 8, 1, 6, 1, 9, 0, 4, 1, 0, 6, 0, 0, 1, 0, 5, ~
## $ Over18                   <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", ~
## $ OverTime                 <chr> "Yes", "No", "Yes", "Yes", "No", "No", "Yes",~
## $ PercentSalaryHike        <int> 11, 23, 15, 11, 12, 13, 20, 22, 21, 13, 13, 1~
## $ PerformanceRating        <int> 3, 4, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 3, ~
## $ RelationshipSatisfaction <int> 1, 4, 2, 3, 4, 3, 1, 2, 2, 2, 3, 4, 4, 3, 2, ~
## $ StandardHours            <int> 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 8~
## $ StockOptionLevel         <int> 0, 1, 0, 0, 1, 0, 3, 1, 0, 2, 1, 0, 1, 1, 0, ~
## $ TotalWorkingYears        <int> 8, 10, 7, 8, 6, 8, 12, 1, 10, 17, 6, 10, 5, 3~
## $ TrainingTimesLastYear    <int> 0, 3, 3, 3, 3, 2, 3, 2, 2, 3, 5, 3, 1, 2, 4, ~
## $ WorkLifeBalance          <int> 1, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3, 2, 3, 3, ~
#summary(HREmployeeAttrition)
# Force the name of the first column to "Age".
# NOTE(review): presumably guards against a mangled header on import; the
# str() output already shows "Age" -- confirm whether this is still needed.
colnames(HREmployeeAttrition)[1L] <- "Age"

# Checking for missing values
#
# Count the NA entries in every column; the result is a named integer vector
# with one element per column.
vapply(
  HREmployeeAttrition,
  function(column) sum(is.na(column)),
  integer(1)
)
##                      Age                Attrition           BusinessTravel 
##                        0                        0                        0 
##               Department         DistanceFromHome                Education 
##                        0                        0                        0 
##           EducationField            EmployeeCount  EnvironmentSatisfaction 
##                        0                        0                        0 
##                   Gender           JobInvolvement                 JobLevel 
##                        0                        0                        0 
##                  JobRole          JobSatisfaction            MaritalStatus 
##                        0                        0                        0 
##            MonthlyIncome       NumCompaniesWorked                   Over18 
##                        0                        0                        0 
##                 OverTime        PercentSalaryHike        PerformanceRating 
##                        0                        0                        0 
## RelationshipSatisfaction            StandardHours         StockOptionLevel 
##                        0                        0                        0 
##        TotalWorkingYears    TrainingTimesLastYear          WorkLifeBalance 
##                        0                        0                        0

# Checking for null (blank) values
#
# is.null() on a data frame returns a single FALSE, so the previous
# sum(is.null(...)) was always 0 and checked nothing. The closest thing to a
# "null" cell in a CSV import is an empty or whitespace-only string, so count
# those across the character columns instead (NA cells are counted separately
# by the missing-value check).
sum(vapply(
  Filter(is.character, HREmployeeAttrition),
  function(column) sum(trimws(column) == "", na.rm = TRUE),
  integer(1)
))
## [1] 0

# Checking for duplicate records
#
# duplicated() flags every row that repeats an earlier row and never returns
# NA, so the flags are summed directly. The previous is.na() wrapper turned
# every flag into FALSE and therefore always reported 0 duplicates.
sum(duplicated(HREmployeeAttrition))
## [1] 0

# Remove non-informative columns: each of them holds the same value for every
# observation. Names that are not present in the data frame are simply ignored.
HREmployeeAttrition <- HREmployeeAttrition[
  !names(HREmployeeAttrition) %in%
    c("EmployeeCount", "EmployeeNumber", "StandardHours", "Over18")
]

# Convert the categorical attributes that are stored as integer codes in the
# data set into factors, so they are treated as discrete categories.
HREmployeeAttrition <- HREmployeeAttrition %>%
  mutate(
    across(
      all_of(c(
        "Education",
        "EnvironmentSatisfaction",
        "JobInvolvement",
        "JobLevel",
        "JobSatisfaction",
        "PerformanceRating",
        "RelationshipSatisfaction",
        "StockOptionLevel",
        "WorkLifeBalance"
      )),
      factor
    )
  )
#HREmployeeAttrition$Attritiontable(c(Attrition)
# Number of employees in each Attrition group (FALSE = stayed, TRUE = left).
attritions_number <- summarise(
  group_by(HREmployeeAttrition, Attrition),
  Count = n()
)
print(attritions_number)
## # A tibble: 2 x 2
##   Attrition Count
##   <lgl>     <int>
## 1 FALSE      1233
## 2 TRUE        237

## Fig1: Attrition vs OverTime

# Bar chart of the share of employees in each OverTime category, drawn in one
# facet per Attrition group, with percentage labels above the bars.
# `group = Attrition` makes the count stat's `prop` a proportion within each
# Attrition group rather than within each bar.
# after_stat() replaces the `..prop..` / `..x..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
HREmployeeAttrition %>%
  ggplot(aes(x = OverTime, group = Attrition)) +
  geom_bar(
    aes(y = after_stat(prop), fill = factor(after_stat(x))),
    stat = "count",
    alpha = 0.7
  ) +
  geom_text(
    aes(label = scales::percent(after_stat(prop)), y = after_stat(prop)),
    stat = "count",
    vjust = -0.5
  ) +
  labs(y = "Percentage", fill = "over time") +
  facet_grid(~Attrition) +
  theme(legend.position = "right", plot.title = element_text(hjust = 0.5)) +
  ggtitle("Attrition")

## Fig2: BusinessTravel vs Attrition

# Bar chart of the share of employees in each BusinessTravel category, drawn
# in one facet per Attrition group, with percentage labels above the bars.
# `group = Attrition` makes the count stat's `prop` a proportion within each
# Attrition group rather than within each bar.
# after_stat() replaces the `..prop..` / `..x..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
HREmployeeAttrition %>%
  ggplot(aes(x = BusinessTravel, group = Attrition)) +
  geom_bar(
    aes(y = after_stat(prop), fill = factor(after_stat(x))),
    stat = "count",
    alpha = 0.7
  ) +
  geom_text(
    aes(label = scales::percent(after_stat(prop)), y = after_stat(prop)),
    stat = "count",
    vjust = -0.5
  ) +
  labs(y = "Percentage", fill = "business travel") +
  facet_grid(~Attrition) +
  theme(legend.position = "right", plot.title = element_text(hjust = 0.5)) +
  ggtitle("Attrition")

## Fig3: Marital status vs Attrition

# Bar chart of the share of employees in each MaritalStatus category, drawn in
# one facet per Attrition group, with percentage labels above the bars.
# `group = Attrition` makes the count stat's `prop` a proportion within each
# Attrition group rather than within each bar.
# after_stat() replaces the `..prop..` / `..x..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
HREmployeeAttrition %>%
  ggplot(aes(x = MaritalStatus, group = Attrition)) +
  geom_bar(
    aes(y = after_stat(prop), fill = factor(after_stat(x))),
    stat = "count",
    alpha = 0.7
  ) +
  geom_text(
    aes(label = scales::percent(after_stat(prop)), y = after_stat(prop)),
    stat = "count",
    vjust = -0.5
  ) +
  labs(y = "Percentage", fill = "marital status") +
  facet_grid(~Attrition) +
  theme(legend.position = "right", plot.title = element_text(hjust = 0.5)) +
  ggtitle("Attrition")

## Fig4: JobInvolvement vs Attrition

# Bar chart of the share of employees at each JobInvolvement level, drawn in
# one facet per Attrition group, with percentage labels above the bars. The
# legend is hidden for this figure.
# `group = Attrition` makes the count stat's `prop` a proportion within each
# Attrition group rather than within each bar.
# after_stat() replaces the `..prop..` / `..x..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
HREmployeeAttrition %>%
  ggplot(aes(x = JobInvolvement, group = Attrition)) +
  geom_bar(
    aes(y = after_stat(prop), fill = factor(after_stat(x))),
    stat = "count",
    alpha = 0.7
  ) +
  geom_text(
    aes(label = scales::percent(after_stat(prop)), y = after_stat(prop)),
    stat = "count",
    vjust = -0.5
  ) +
  labs(y = "Percentage", fill = "JobInvolvement") +
  facet_grid(~Attrition) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) +
  ggtitle("Attrition")

## Fig5: JobSatisfaction vs Attrition

# Bar chart of the share of employees at each JobSatisfaction level, drawn in
# one facet per Attrition group, with percentage labels above the bars. The
# legend is hidden for this figure.
# `group = Attrition` makes the count stat's `prop` a proportion within each
# Attrition group rather than within each bar.
# after_stat() replaces the `..prop..` / `..x..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
HREmployeeAttrition %>%
  ggplot(aes(x = JobSatisfaction, group = Attrition)) +
  geom_bar(
    aes(y = after_stat(prop), fill = factor(after_stat(x))),
    stat = "count",
    alpha = 0.7
  ) +
  geom_text(
    aes(label = scales::percent(after_stat(prop)), y = after_stat(prop)),
    stat = "count",
    vjust = -0.5
  ) +
  labs(y = "Percentage", fill = "JobSatisfaction") +
  facet_grid(~Attrition) +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) +
  ggtitle("Attrition")

## Fig6: JobRole vs Attrition

# JobRole is read in as a character column, and `levels<-` on a character
# vector only attaches an attribute without renaming any value, so the
# abbreviations never reached the plot. Convert to a factor first.
# factor() sorts its levels alphabetically, and the short labels below are
# listed in that same alphabetical order of the full role names.
HREmployeeAttrition$JobRole <- factor(HREmployeeAttrition$JobRole)
levels(HREmployeeAttrition$JobRole) <- c(
  "HC Rep", "HR", "LT", "Man", "MD", "RD", "RsScientist", "SalesEx", "SalesRep"
)

# Bar chart of the share of employees in each JobRole, drawn in one facet per
# Attrition group, with percentage labels above the bars and the x-axis labels
# rotated by 90 degrees.
# `group = Attrition` makes the count stat's `prop` a proportion within each
# Attrition group rather than within each bar.
# after_stat() replaces the `..prop..` / `..x..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0.
HREmployeeAttrition %>%
  ggplot(aes(x = JobRole, group = Attrition)) +
  geom_bar(
    aes(y = after_stat(prop), fill = factor(after_stat(x))),
    stat = "count",
    alpha = 0.7
  ) +
  geom_text(
    aes(label = scales::percent(after_stat(prop)), y = after_stat(prop)),
    stat = "count",
    vjust = -0.5
  ) +
  labs(y = "Percentage", fill = "job role") +
  facet_grid(~Attrition) +
  theme(legend.position = "right", plot.title = element_text(hjust = 0.5)) +
  ggtitle("Attrition") +
  theme(axis.text.x = element_text(angle = 90))

## Fig7: MonthlyIncome, Gender vs Attrition

# Box plots of MonthlyIncome by Attrition status, one panel per Gender.
box.attrition <- ggplot(
  select(HREmployeeAttrition, Attrition, MonthlyIncome, Gender),
  aes(x = Attrition, y = MonthlyIncome, fill = Attrition)
) +
  geom_boxplot(color = "black") +
  theme_minimal() +
  facet_wrap(~Gender) +
  scale_fill_manual(values = c("#FA5858", "#9FF781"))
print(box.attrition)

## Fig8: DistanceFromHome, Gender vs Attrition

# Box plots of DistanceFromHome by Attrition status, one panel per Gender.
box.attrition <- ggplot(
  select(HREmployeeAttrition, Attrition, DistanceFromHome, Gender),
  aes(x = Attrition, y = DistanceFromHome, fill = Attrition)
) +
  geom_boxplot(color = "black") +
  theme_minimal() +
  facet_wrap(~Gender) +
  scale_fill_manual(values = c("#FA5858", "#9FF781"))
print(box.attrition)

## Fig9: Age, Gender vs Attrition

# Box plots of Age by Attrition status, one panel per Gender.
box.attrition <- ggplot(
  select(HREmployeeAttrition, Attrition, Age, Gender),
  aes(x = Attrition, y = Age, fill = Attrition)
) +
  geom_boxplot(color = "black") +
  theme_minimal() +
  facet_wrap(~Gender) +
  scale_fill_manual(values = c("#FA5858", "#9FF781"))
print(box.attrition)

`

Introduction

The data set used for this analysis comes from the course's data source folder, under the name HR_EmployeeAttrition. It contains information on 1470 IBM employees, with 27 variables describing specific characteristics of each employee, including their attrition status (whether or not they left the company). The data consist of both numerical and categorical variables. The dependent variable is Attrition, a binary categorical variable: “TRUE” means the employee has left the company, and “FALSE” means the employee is still employed with the company.

To clean and inspect the data, I did the following:

- Viewed the structure of the data with str() and glimpse()
- Checked for duplicate records
- Checked for missing values
- Checked for null values
- Removed non-informative columns, because each of them has the same value for all observations
- Converted categorical attributes that are stored as integers in the data set to factors

Code used

The plots for each variable were produced with the “ggplot2” package.

Area of Interest

1. Perform a data analysis to identify the factors that lead to attrition, using visualizations and graphs.
2. Identify the top factors that contribute to turnover, based on evidence obtained from the data analysis.
3. Note any other interesting trends and observations from the data analysis.

Conclusion and Findings

The top 5 variables that impacted attrition were:

1. Overtime
2. Frequent business travel
3. Job role
4. Job satisfaction
5. Marital status (single)

 * People who have to work overtime show a higher proportion of leavers compared to their counterparts.

 * People who travel frequently show a higher proportion of leavers compared to their counterparts.

 * People in some specific job roles, such as sales executive, research scientist, and laboratory technician, have higher attrition rates.

 * People with low satisfaction with the work environment had higher attrition rates.

 * Single employees show the largest proportion of leavers, compared to their married and divorced counterparts.