Introduction
Company ABC has very poor employee satisfaction and retention. The company has decided to look into paying its employees for overtime hours. The information available for the sample employees includes satisfaction level, number of projects, and salary level, as well as hours worked.
Project purpose:
Using the data, we can build a predictive model that estimates the average hours an employee is likely to work based on other factors (such as satisfaction and salary level).
We will follow three steps to estimate pay for overtime employees: exploratory data analysis (EDA), data transformation, and model building.
I downloaded the data file from Kaggle and stored it in my GitHub repository.
Here is the source from Kaggle : https://www.kaggle.com/giripujar/hr-analytics
Below, we load the required libraries for the project.
library(data.table)
library(corrplot)
library(ggplot2)
library(gcookbook)
library(caret)
library(hexbin)
library(leaps)
library(plyr)
library(plotly)
library(waffle)
library(dummies)
library(caTools)
library(wesanderson)
library(visreg)
library(rpart)
Load the data from GitHub into RStudio using read.csv().
hr_data <- read.csv("https://raw.githubusercontent.com/SubhalaxmiRout002/Data-606-Final-Project/master/HR_comma_sep.csv", stringsAsFactors = FALSE)
# number of rows and columns
dim(hr_data)
## [1] 14999 10
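# column names (call assumed; the output below lists them)
names(hr_data)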
## [1] "satisfaction_level" "last_evaluation" "number_project"
## [4] "average_montly_hours" "time_spend_company" "Work_accident"
## [7] "left" "promotion_last_5years" "Department"
## [10] "salary"
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
#rename columns
colNames <- c("satLevel", "lastEval", "numProj", "avgHrs", "timeCpny", "wrkAcdnt", "left", "fiveYrPrmo", "department", "salary")
setnames(hr_data, colNames)
# raw data
head(hr_data)
## satLevel lastEval numProj avgHrs timeCpny wrkAcdnt left fiveYrPrmo department
## 1 0.38 0.53 2 157 3 0 1 0 sales
## 2 0.80 0.86 5 262 6 0 1 0 sales
## 3 0.11 0.88 7 272 4 0 1 0 sales
## 4 0.72 0.87 5 223 5 0 1 0 sales
## 5 0.37 0.52 2 159 3 0 1 0 sales
## 6 0.41 0.50 2 153 3 0 1 0 sales
## salary
## 1 low
## 2 medium
## 3 medium
## 4 low
## 5 low
## 6 low
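# summary statistics (call assumed from the output below)
summary(hr_data)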
## satLevel lastEval numProj avgHrs
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
## timeCpny wrkAcdnt left fiveYrPrmo
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.498 Mean :0.1446 Mean :0.2381 Mean :0.02127
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## department salary
## Length:14999 Length:14999
## Class :character Class :character
## Mode :character Mode :character
##
##
##
We are trying to create a predictive model to estimate average monthly hours worked (avgHrs, originally “average_montly_hours”). This will allow us to estimate how many hours our current employee base is likely to work.
EDA (Exploratory data analysis)
Correlation helps us understand how variables are related to each other. The correlation between two columns can range from -1 to +1; the closer the value is to either -1 or +1, the stronger the association between the columns. A positive correlation means that as one column gets bigger, the other tends to get bigger; a negative correlation means that as one column gets bigger, the other tends to get smaller.
# reference column by their name
attach(hr_data)
# Correlations of columns from 1 to 8
cor_mat <- cor(hr_data[1:8], use="complete.obs", method="pearson")
#round to two decimals
DT::datatable(round(cor_mat, 2))
From the correlation table above, we can see that lastEval and numProj have the highest correlations with the column we are trying to predict, avgHrs. This means that an employee who worked on more projects, or who received a higher evaluation, is likely to work more hours.
# histograms of avgHrs and lastEval
hist(avgHrs/4, main="Distribution of Average Hours per Week", xlab="Avg Hours", breaks=7, col="lightblue")
hist(lastEval, main="Distribution of Last Evaluations", xlab="Last Eval", breaks=7, col="lightblue")
# density plot showing the average hours per week by salary category
qplot(avgHrs/4, data=hr_data, geom="density", fill=salary, alpha=I(.5),
main="Avg Weekly Hours by Salary Category", xlab="Average Weekly Hours",
ylab="Density")From the density plot we found, higher salary levels does not work for more hours.
hr_data$left_new <- factor(left,levels=c(0,1),
labels=c("Not Leave Company","Left Company"))
# average hours per week by employee retention
qplot(avgHrs/4, data=hr_data, geom="density", fill=left_new, alpha=I(.5),
main="Average Weekly Hours and Employee Retention", xlab="Average Weekly Hours",
ylab="Density")We view a density plot of hours worked by retained vs non-retained employees.
# distribution of average hours per week by department
boxplot(avgHrs~department, data=hr_data, main="Distribution of average hours per week by department", xlab="Department", ylab="Avg Hours", col="lightblue")
Transform
In this section we prepare our data for predictive modeling and divide the dataset into two parts: training data and test data.
One of the easiest ways to transform our data is to create dummy variables. The salary column has three values: high, medium, and low. Since predictive models need numeric inputs, we create three dummy variables (NAhigh, NAmedium, and NAlow), each coded 0 or 1: 1 means the row has that salary level, and 0 means it does not.
Example: if a row's salary is high, then NAhigh = 1, NAmedium = 0, and NAlow = 0.
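Note: the dummies package has since been archived on CRAN. As a minimal base-R sketch (an alternative, not the code used here), the same 0/1 indicator columns could be built with model.matrix():
# hypothetical base-R alternative to dummies::dummy()
salary_dummies <- model.matrix(~ salary - 1, data = hr_data)    # one 0/1 column per salary level
dept_dummies <- model.matrix(~ department - 1, data = hr_data)  # one 0/1 column per department
hr_dummy_alt <- cbind(hr_data, dept_dummies, salary_dummies)    # hr_dummy_alt is an assumed name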
# drop the new column left_new
hr_data <- hr_data[, -11]
#create dummy variables for department and salary
hr_dummy <- cbind(hr_data, dummies::dummy(hr_data$department), dummies::dummy(hr_data$salary))
names(hr_dummy)[11:23] <- c("NAaccounting", "NAhr", "NAIT", "NAmanagement", "NAmarketing",
                            "NAproduct_mng", "NARandD", "NAsales", "NAsupport", "NAtechnical",
                            "NAhigh", "NAlow", "NAmedium")
names(hr_dummy)
## [1] "satLevel" "lastEval" "numProj" "avgHrs"
## [5] "timeCpny" "wrkAcdnt" "left" "fiveYrPrmo"
## [9] "department" "salary" "NAaccounting" "NAhr"
## [13] "NAIT" "NAmanagement" "NAmarketing" "NAproduct_mng"
## [17] "NARandD" "NAsales" "NAsupport" "NAtechnical"
## [21] "NAhigh" "NAlow" "NAmedium"
## satLevel lastEval numProj avgHrs timeCpny wrkAcdnt left fiveYrPrmo department
## 1 0.38 0.53 2 157 3 0 1 0 sales
## 2 0.80 0.86 5 262 6 0 1 0 sales
## 3 0.11 0.88 7 272 4 0 1 0 sales
## 4 0.72 0.87 5 223 5 0 1 0 sales
## 5 0.37 0.52 2 159 3 0 1 0 sales
## 6 0.41 0.50 2 153 3 0 1 0 sales
## salary NAaccounting NAhr NAIT NAmanagement NAmarketing NAproduct_mng NARandD
## 1 low 0 0 0 0 0 0 0
## 2 medium 0 0 0 0 0 0 0
## 3 medium 0 0 0 0 0 0 0
## 4 low 0 0 0 0 0 0 0
## 5 low 0 0 0 0 0 0 0
## 6 low 0 0 0 0 0 0 0
## NAsales NAsupport NAtechnical NAhigh NAlow NAmedium
## 1 1 0 0 0 1 0
## 2 1 0 0 0 0 1
## 3 1 0 0 0 0 1
## 4 1 0 0 0 1 0
## 5 1 0 0 0 1 0
## 6 1 0 0 0 1 0
Applying log, square-root, and scale transforms to variables can make our data set a little more robust to outliers and to differences in units between variables.
hr_dummy$satLevelLog <- log(satLevel)
hr_dummy$lastEvalLog <- log(lastEval)
hr_dummy$satLevelSqrt <- sqrt(satLevel)
hr_dummy$lastEvalSqrt <- sqrt(lastEval)
hr_dummy$satLevelScale <- scale(satLevel)
hr_dummy$lastEvalScale <- scale(lastEval)
Let's look at employees who received a high evaluation but have a low satisfaction score. We will create an indicator variable for this group and add it to the dummy data set.
hr_dummy$greatEvalLowSat <- ifelse(lastEval>0.8 & satLevel <0.2, 1, 0)
x <- ggplot(hr_dummy, aes(factor(greatEvalLowSat), avgHrs))
x <- x + geom_boxplot(aes(fill=factor(greatEvalLowSat)), outlier.color="black", outlier.size=1)
x + scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest2"))
Split the data set into two parts, train and test. The train data contains roughly 75% of the rows and the test data contains the remaining 25%.
set.seed(123)
sample = sample.split(hr_dummy, SplitRatio = .75)
train = subset(hr_dummy, sample == TRUE)
test = subset(hr_dummy, sample == FALSE)
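Note: sample.split() from caTools normally takes the outcome vector rather than the whole data frame (passing the data frame is why the split above comes out near 73%/27% instead of exactly 75%/25%). A sketch of the conventional call:
# conventional usage: split on the outcome vector
sample <- sample.split(hr_dummy$avgHrs, SplitRatio = 0.75)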
dim(train)
## [1] 10999 30
dim(test)
## [1] 4000 30
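# summary of the training set (call assumed from the output below)
summary(train)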
## satLevel lastEval numProj avgHrs
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6500 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6147 Mean :0.7157 Mean :3.797 Mean :200.9
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
## timeCpny wrkAcdnt left fiveYrPrmo
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.497 Mean :0.1466 Mean :0.2382 Mean :0.02209
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## department salary NAaccounting NAhr
## Length:10999 Length:10999 Min. :0.00000 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000 Median :0.00000
## Mean :0.05182 Mean :0.04919
## 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000
## NAIT NAmanagement NAmarketing NAproduct_mng
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.08173 Mean :0.04346 Mean :0.05846 Mean :0.05919
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## NARandD NAsales NAsupport NAtechnical
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.05182 Mean :0.276 Mean :0.1471 Mean :0.1812
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.0000
## NAhigh NAlow NAmedium satLevelLog
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :-2.4079
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:-0.8210
## Median :0.00000 Median :0.0000 Median :0.0000 Median :-0.4308
## Mean :0.08064 Mean :0.4856 Mean :0.4338 Mean :-0.6182
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:-0.1985
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. : 0.0000
## lastEvalLog satLevelSqrt lastEvalSqrt satLevelScale.V1
## Min. :-1.0217 Min. :0.3000 Min. :0.6000 Min. :-2.1028522
## 1st Qu.:-0.5798 1st Qu.:0.6633 1st Qu.:0.7483 1st Qu.:-0.6951417
## Median :-0.3285 Median :0.8062 Median :0.8485 Median : 0.1494847
## Mean :-0.3650 Mean :0.7624 Mean :0.8397 Mean : 0.0075563
## 3rd Qu.:-0.1393 3rd Qu.:0.9055 3rd Qu.:0.9327 3rd Qu.: 0.8332298
## Max. : 0.0000 Max. :1.0000 Max. :1.0000 Max. : 1.5571953
## lastEvalScale.V1 greatEvalLowSat
## Min. :-2.0804089 Min. :0.00000
## 1st Qu.:-0.9119738 1st Qu.:0.00000
## Median : 0.0227743 Median :0.00000
## Mean :-0.0023228 Mean :0.05855
## 3rd Qu.: 0.8991007 3rd Qu.:0.00000
## Max. : 1.6585835 Max. :1.00000
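# summary of the test set (call assumed from the output below)
summary(test)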
## satLevel lastEval numProj avgHrs
## Min. :0.0900 Min. :0.3600 Min. :2.00 Min. : 97.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.00 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.00 Median :200.0
## Mean :0.6077 Mean :0.7172 Mean :3.82 Mean :201.5
## 3rd Qu.:0.8100 3rd Qu.:0.8700 3rd Qu.:5.00 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.00 Max. :310.0
## timeCpny wrkAcdnt left fiveYrPrmo
## Min. : 2.000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.: 3.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000
## Median : 3.000 Median :0.000 Median :0.0000 Median :0.000
## Mean : 3.501 Mean :0.139 Mean :0.2377 Mean :0.019
## 3rd Qu.: 4.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :10.000 Max. :1.000 Max. :1.0000 Max. :1.000
## department salary NAaccounting NAhr
## Length:4000 Length:4000 Min. :0.00000 Min. :0.0000
## Class :character Class :character 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Mode :character Median :0.00000 Median :0.0000
## Mean :0.04925 Mean :0.0495
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
## NAIT NAmanagement NAmarketing NAproduct_mng
## Min. :0.000 Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.000 Median :0.00000 Median :0.00000
## Mean :0.082 Mean :0.038 Mean :0.05375 Mean :0.06275
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.000 Max. :1.00000 Max. :1.00000
## NARandD NAsales NAsupport NAtechnical
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.05425 Mean :0.276 Mean :0.1527 Mean :0.1817
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.0000
## NAhigh NAlow NAmedium satLevelLog
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :-2.4079
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:-0.8210
## Median :0.0000 Median :0.0000 Median :0.0000 Median :-0.4463
## Mean :0.0875 Mean :0.4938 Mean :0.4188 Mean :-0.6301
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:-0.2107
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. : 0.0000
## lastEvalLog satLevelSqrt lastEvalSqrt satLevelScale.V1
## Min. :-1.0217 Min. :0.3000 Min. :0.6000 Min. :-2.1028522
## 1st Qu.:-0.5798 1st Qu.:0.6633 1st Qu.:0.7483 1st Qu.:-0.6951417
## Median :-0.3285 Median :0.8000 Median :0.8485 Median : 0.1092644
## Mean :-0.3623 Mean :0.7580 Mean :0.8407 Mean :-0.0207779
## 3rd Qu.:-0.1393 3rd Qu.:0.9000 3rd Qu.:0.9327 3rd Qu.: 0.7930095
## Max. : 0.0000 Max. :1.0000 Max. :1.0000 Max. : 1.5571953
## lastEvalScale.V1 greatEvalLowSat
## Min. :-2.0804089 Min. :0.00000
## 1st Qu.:-0.9119738 1st Qu.:0.00000
## Median : 0.0227743 Median :0.00000
## Mean : 0.0063870 Mean :0.06275
## 3rd Qu.: 0.8991007 3rd Qu.:0.00000
## Max. : 1.6585835 Max. :1.00000
Model
We will create several models and see which performs best. We start with a linear regression model using all variables in our data set.
Model 1: Linear Regression With All Variables
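Fit the model and view its summary (the formula is reconstructed from the Call line in the output below):
model.lr <- lm(avgHrs ~ ., data = train)
summary(model.lr)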
##
## Call:
## lm(formula = avgHrs ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132.232 -29.495 -2.441 31.864 143.177
##
## Coefficients: (15 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4890.2915 1051.2484 -4.652 3.33e-06 ***
## satLevel -1066.5293 66.7986 -15.966 < 2e-16 ***
## lastEval -1404.2705 469.2540 -2.993 0.00277 **
## numProj 7.7406 0.4203 18.418 < 2e-16 ***
## timeCpny 1.3710 0.2931 4.677 2.94e-06 ***
## wrkAcdnt 0.5092 1.1582 0.440 0.66021
## left 2.1593 1.1417 1.891 0.05860 .
## fiveYrPrmo 0.6399 2.8012 0.228 0.81932
## departmenthr -0.2786 2.5459 -0.109 0.91287
## departmentIT 1.1853 2.2725 0.522 0.60197
## departmentmanagement -1.7592 2.6837 -0.655 0.51217
## departmentmarketing -0.1665 2.4415 -0.068 0.94564
## departmentproduct_mng 0.4638 2.4338 0.191 0.84886
## departmentRandD 0.7972 2.5146 0.317 0.75123
## departmentsales 1.1327 1.9366 0.585 0.55863
## departmentsupport -0.8150 2.0669 -0.394 0.69336
## departmenttechnical -0.3196 2.0150 -0.159 0.87397
## salarylow -1.5149 1.5936 -0.951 0.34182
## salarymedium -1.3487 1.5895 -0.849 0.39615
## NAaccounting NA NA NA NA
## NAhr NA NA NA NA
## NAIT NA NA NA NA
## NAmanagement NA NA NA NA
## NAmarketing NA NA NA NA
## NAproduct_mng NA NA NA NA
## NARandD NA NA NA NA
## NAsales NA NA NA NA
## NAsupport NA NA NA NA
## NAtechnical NA NA NA NA
## NAhigh NA NA NA NA
## NAlow NA NA NA NA
## NAmedium NA NA NA NA
## satLevelLog -521.6129 27.1330 -19.224 < 2e-16 ***
## lastEvalLog -835.6357 306.0401 -2.730 0.00633 **
## satLevelSqrt 3095.1387 174.7076 17.716 < 2e-16 ***
## lastEvalSqrt 4443.3827 1522.2016 2.919 0.00352 **
## satLevelScale NA NA NA NA
## lastEvalScale NA NA NA NA
## greatEvalLowSat -0.7864 3.0617 -0.257 0.79730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.38 on 10975 degrees of freedom
## Multiple R-squared: 0.2781, Adjusted R-squared: 0.2766
## F-statistic: 183.8 on 23 and 10975 DF, p-value: < 2.2e-16
- Adjusted R-squared - Like R-squared, this measures the amount of variation in the variable you are trying to predict (average monthly hours worked) that is explained by your predictors, but it also adjusts for the number of predictors included in the model. A higher adjusted R-squared value is better.
- AIC - The Akaike information criterion attempts to measure the bias/variance trade-off by rewarding accuracy and penalizing over-fitting. I described the bias/variance trade-off in a past blog post. A lower AIC value is better.
- BIC - The Bayesian information criterion is similar to AIC, except that it carries a higher penalty for including more variables in the model. A lower BIC value is also better.
- Mean Squared Prediction Error - The average squared difference between the predicted values and the actual values in the test set (see the formula after this list). A lower mean squared prediction error is better.
- Standard Error - The standard deviation of the squared prediction errors above, divided by the number of test observations. A lower standard error is better.
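In symbols, for \(n\) test observations with actual values \(y_i\) and predictions \(\hat{y}_i\), the two prediction metrics computed below are
\[ \text{MSPE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2, \qquad \text{SE} = \frac{\operatorname{sd}\left((y_i - \hat{y}_i)^2\right)}{n}. \]
(Note that the code divides by \(n\); the conventional standard error of a mean divides by \(\sqrt{n}\).)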
# aic value
aic <- AIC(model.lr)
# bic value
bic <-BIC(model.lr)
#turn off warnings
options(warn=-1)
# validation predictions
pred.model.lr <- predict(model.lr, newdata = test)
#turn on warnings
options(warn=1)
# mean prediction error
meanPred <- mean((test$avgHrs - pred.model.lr)^2)
# std error
stdError <- sd((test$avgHrs - pred.model.lr)^2)/length(test$avgHrs)
# create model matrix
MM <- data.frame("Model" = character(), "adjRsq" = numeric(), "AIC" = numeric(), "BIC" = numeric(), "Mean Prediction Error" = numeric(), "Standard Error" = numeric(), stringsAsFactors = FALSE)
# add one row to mm
MM[nrow(MM) + 1, ] <- c( "model.lr", 0.2766, aic, bic, meanPred, stdError)
MM
## Model adjRsq AIC BIC Mean.Prediction.Error
## 1 model.lr 0.2766 113661.307912075 113843.946903039 1782.3827021275
## Standard.Error
## 1 0.538189821104025
Let's create another model with fewer variables, using linear regression with backward selection.
Model 2: Linear Regression Backward Selection
model.bckwd <- lm(avgHrs ~ satLevel + lastEval + numProj + timeCpny + left + satLevelLog +
lastEvalLog + satLevelSqrt + lastEvalSqrt, train)
summary(model.bckwd)
##
## Call:
## lm(formula = avgHrs ~ satLevel + lastEval + numProj + timeCpny +
## left + satLevelLog + lastEvalLog + satLevelSqrt + lastEvalSqrt,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.071 -29.457 -2.426 31.854 142.989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4879.7583 1049.9582 -4.648 3.40e-06 ***
## satLevel -1060.3614 64.0887 -16.545 < 2e-16 ***
## lastEval -1406.1566 468.2164 -3.003 0.00268 **
## numProj 7.7251 0.4195 18.413 < 2e-16 ***
## timeCpny 1.3840 0.2880 4.806 1.56e-06 ***
## left 1.9356 1.1149 1.736 0.08258 .
## satLevelLog -518.5068 24.9384 -20.792 < 2e-16 ***
## lastEvalLog -835.2846 305.4702 -2.734 0.00626 **
## satLevelSqrt 3077.4120 165.0633 18.644 < 2e-16 ***
## lastEvalSqrt 4445.2617 1519.1426 2.926 0.00344 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.37 on 10989 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2771
## F-statistic: 469.5 on 9 and 10989 DF, p-value: < 2.2e-16
aic <- AIC(model.bckwd)
bic <-BIC(model.bckwd)
pred.model.bckwd <- predict(model.bckwd, newdata = test)
meanPred <- mean((test$avgHrs - pred.model.bckwd )^2)
stdError <- sd((test$avgHrs - pred.model.bckwd )^2)/length(test$avgHrs)
MM[nrow(MM) + 1, ] <- c( "model.bckwd", 0.2771, aic, bic, meanPred, stdError)
MM
## Model adjRsq AIC BIC Mean.Prediction.Error
## 1 model.lr 0.2766 113661.307912075 113843.946903039 1782.3827021275
## 2 model.bckwd 0.2771 113638.565847037 113718.927003062 1780.80197949829
## Standard.Error
## 1 0.538189821104025
## 2 0.537485453973232
Next we will look at which variables are most highly correlated with the others and build a model from the variables most correlated with avgHrs. In the train data set, department and salary are character columns, so we remove those two columns, because cor() only accepts numeric data.
Model 3: Highly Correlated Linear Regression
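The code that builds the numeric-only data set is not shown; a minimal sketch consistent with the 28-column structure below (an assumed reconstruction):
# drop the two character columns so cor() receives numeric data only
cor2 <- train[, !(names(train) %in% c("department", "salary"))]
head(cor2)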
## satLevel lastEval numProj avgHrs timeCpny wrkAcdnt left fiveYrPrmo
## 1 0.38 0.53 2 157 3 0 1 0
## 2 0.80 0.86 5 262 6 0 1 0
## 3 0.11 0.88 7 272 4 0 1 0
## 6 0.41 0.50 2 153 3 0 1 0
## 7 0.10 0.77 6 247 4 0 1 0
## 9 0.89 1.00 5 224 5 0 1 0
## NAaccounting NAhr NAIT NAmanagement NAmarketing NAproduct_mng NARandD NAsales
## 1 0 0 0 0 0 0 0 1
## 2 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 1
## 7 0 0 0 0 0 0 0 1
## 9 0 0 0 0 0 0 0 1
## NAsupport NAtechnical NAhigh NAlow NAmedium satLevelLog lastEvalLog
## 1 0 0 0 1 0 -0.9675840 -0.6348783
## 2 0 0 0 0 1 -0.2231436 -0.1508229
## 3 0 0 0 0 1 -2.2072749 -0.1278334
## 6 0 0 0 1 0 -0.8915981 -0.6931472
## 7 0 0 0 1 0 -2.3025851 -0.2613648
## 9 0 0 0 1 0 -0.1165338 0.0000000
## satLevelSqrt lastEvalSqrt satLevelScale lastEvalScale greatEvalLowSat
## 1 0.6164414 0.7280110 -0.9364635 -1.0872390 0
## 2 0.8944272 0.9273618 0.7527892 0.8406789 0
## 3 0.3316625 0.9380832 -2.0224116 0.9575224 1
## 6 0.6403124 0.7071068 -0.8158026 -1.2625043 0
## 7 0.3162278 0.8774964 -2.0626319 0.3148831 0
## 9 0.9433981 1.0000000 1.1147720 1.6585835 0
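# structure of the numeric-only training data (call assumed from the output below)
str(cor2)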
## 'data.frame': 10999 obs. of 28 variables:
## $ satLevel : num 0.38 0.8 0.11 0.41 0.1 0.89 0.42 0.11 0.84 0.41 ...
## $ lastEval : num 0.53 0.86 0.88 0.5 0.77 1 0.53 0.81 0.92 0.55 ...
## $ numProj : int 2 5 7 2 6 5 2 6 4 2 ...
## $ avgHrs : int 157 262 272 153 247 224 142 305 234 148 ...
## $ timeCpny : int 3 6 4 3 4 5 3 4 5 3 ...
## $ wrkAcdnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ fiveYrPrmo : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAaccounting : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAhr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAIT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAmanagement : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAmarketing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAproduct_mng : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NARandD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAsales : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NAsupport : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAtechnical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAhigh : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAlow : int 1 0 0 1 1 1 1 1 1 1 ...
## $ NAmedium : int 0 1 1 0 0 0 0 0 0 0 ...
## $ satLevelLog : num -0.968 -0.223 -2.207 -0.892 -2.303 ...
## $ lastEvalLog : num -0.635 -0.151 -0.128 -0.693 -0.261 ...
## $ satLevelSqrt : num 0.616 0.894 0.332 0.64 0.316 ...
## $ lastEvalSqrt : num 0.728 0.927 0.938 0.707 0.877 ...
## $ satLevelScale : num [1:10999, 1] -0.936 0.753 -2.022 -0.816 -2.063 ...
## $ lastEvalScale : num [1:10999, 1] -1.087 0.841 0.958 -1.263 0.315 ...
## $ greatEvalLowSat: num 0 0 1 0 0 0 0 1 0 0 ...
cor2 <- cor(cor2, use="complete.obs", method="pearson")
corrplot(cor2, type = "upper", order = "hclust",
col = c("black", "white"), bg = "lightblue", tl.col = "black")Above plot we can see highest correlated variables. Inclused those variables in out model.
# create model
model.cor <- lm(avgHrs ~ numProj + lastEvalLog + timeCpny + left + greatEvalLowSat, train)
summary(model.cor)
##
## Call:
## lm(formula = avgHrs ~ numProj + lastEvalLog + timeCpny + left +
## greatEvalLowSat, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.623 -31.557 -2.854 33.286 131.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 168.3481 1.9905 84.575 <2e-16 ***
## numProj 11.2184 0.3976 28.217 <2e-16 ***
## lastEvalLog 42.2722 1.7915 23.596 <2e-16 ***
## timeCpny 0.9453 0.2939 3.216 0.0013 **
## left 1.7508 1.0643 1.645 0.1000 .
## greatEvalLowSat 28.2793 2.1215 13.330 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.64 on 10993 degrees of freedom
## Multiple R-squared: 0.2333, Adjusted R-squared: 0.233
## F-statistic: 669 on 5 and 10993 DF, p-value: < 2.2e-16
aic <- AIC(model.cor)
bic <-BIC(model.cor)
pred.model.cor <- predict(model.cor, newdata = test)
meanPred <- mean((test$avgHrs - pred.model.cor )^2)
stdError <- sd((test$avgHrs - pred.model.cor )^2)/length(test$avgHrs)
MM[nrow(MM) + 1, ] <- c( "model.cor", 0.233, aic, bic, meanPred, stdError)
MM
## Model adjRsq AIC BIC Mean.Prediction.Error
## 1 model.lr 0.2766 113661.307912075 113843.946903039 1782.3827021275
## 2 model.bckwd 0.2771 113638.565847037 113718.927003062 1780.80197949829
## 3 model.cor 0.233 114287.123002206 114338.261919676 1911.14435850578
## Standard.Error
## 1 0.538189821104025
## 2 0.537485453973232
## 3 0.575043653658238
Comparing all three models, we can see that the linear regression with backward selection performs best: it has the highest adjusted \(R^2\) and the lowest AIC, BIC, and prediction-error values.
Using the same model, we will see how the number of working hours (avgHrs) relates to the number of projects (numProj).
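The plotting code is not shown here; one way to draw this relationship with the already-loaded visreg package (an assumed reconstruction, not necessarily the original call):
# plot predicted avgHrs against numProj, with other predictors held at their medians
visreg(model.bckwd, "numProj")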
In the graph above, we can see that avgHrs increases with the number of projects; these two variables are positively correlated.
Salary Prediction
The salary variable has three levels: high, medium, and low.
Assume: high salary = $50/hour, medium salary = $25/hour, and low salary = $10/hour.
Since an average work month is about 160 hours, we can calculate the overtime hours and an estimated payout based on these assumed salary levels.
# sample: the first 1,000 rows of the main data set
hr_data2 <- head(hr_data, 1000)
hr_data2$left_new <- factor(hr_data2$left,levels=c(0,1),
labels=c("Not Leave Company","Left Company"))
hr_data2$fiveYrPrmo_new <- factor(hr_data2$fiveYrPrmo,levels=c(0,1),
labels=c("Not Promoted","Promoted"))
hr_data2$wrkAcdnt_new <- factor(hr_data2$wrkAcdnt,levels=c(0,1),
labels=c("No Accident","Accident"))
#dummy variables
hr_data2 <- cbind(hr_data2, dummy(hr_data2$department), dummy(hr_data2$salary))
names(hr_data2)[14:26] <- c("NAaccounting", "NAhr", "NAIT", "NAmanagement", "NAmarketing",
                            "NAproduct_mng", "NARandD", "NAsales", "NAsupport", "NAtechnical",
                            "NAhigh", "NAlow", "NAmedium")
#Log Transforms for: satLevel, lastEval
hr_data2$satLevelLog <- log(hr_data2$satLevel)
hr_data2$lastEvalLog <- log(hr_data2$lastEval)
#SQRT Transforms for: satLevel, lastEval
hr_data2$satLevelSqrt <- sqrt(hr_data2$satLevel)
hr_data2$lastEvalSqrt <- sqrt(hr_data2$lastEval)
#Scale Transforms for: satLevel, lastEval
hr_data2$satLevelScale <- scale(hr_data2$satLevel)
hr_data2$lastEvalScale <- scale(hr_data2$lastEval)
# new interaction variable
hr_data2$greatEvalLowSat <- ifelse(hr_data2$lastEval>0.8 & hr_data2$satLevel <0.2, 1, 0)
# model prediction
hr_data2$Predictions <- predict(model.bckwd, hr_data2)
# overtime hours
hr_data2$Predictedovertime = hr_data2$Predictions - 160
# for employees who worked less than 160 hours per month, set negative values to 0
hr_data2$totalPredictedOvertime <- hr_data2$Predictedovertime
hr_data2$totalPredictedOvertime[hr_data2$totalPredictedOvertime < 0 ] <- 0
# total predicted overtime hours by salary level (note: these sums use the unclamped Predictedovertime column; summing totalPredictedOvertime instead would exclude the negative values)
lowSalary = sum(hr_data2[hr_data2$salary=='low',]$Predictedovertime)
mediumSalary = sum(hr_data2[hr_data2$salary=='medium',]$Predictedovertime)
highSalary = sum(hr_data2[hr_data2$salary=='high',]$Predictedovertime)
TotalPaidPerMonth = lowSalary*10 + mediumSalary*25 + highSalary*50
TotalPaidPerMonth
## [1] 762573.9
TotalPaidPerMonth * 12
## [1] 9150887
Summary
From the results above, we can see that if the company were to pay these 1,000 employees for overtime, it would cost them about $762,573.90 per month, or $9,150,887 per year.