Introduction
Company ABC has very poor employee satisfaction and retention. The company has decided to look into paying its employees for overtime hours. The information available for the sample employees includes satisfaction level, number of projects, and salary level, as well as hours worked.
Project purpose:
Using the data, we can build a predictive model that estimates the average hours an employee is likely to work based on other factors (such as satisfaction and salary level).
We will follow three steps to estimate pay for overtime employees: exploratory data analysis (EDA), data transformation, and model building.
I downloaded the data file from Kaggle and stored it in my GitHub repository.
Here is the source from Kaggle : https://www.kaggle.com/giripujar/hr-analytics
Below, we load the required libraries for the project.
library(data.table)
library(corrplot)
library(ggplot2)
library(gcookbook)
library(caret)
library(hexbin)
library(leaps)
library(plyr)
library(plotly)
library(waffle)
library(dummies)
library(caTools)
library(wesanderson)
library(visreg)
library(rpart)
Load the data from GitHub into RStudio using read.csv().
hr_data <- read.csv("https://raw.githubusercontent.com/SubhalaxmiRout002/Data-606-Final-Project/master/HR_comma_sep.csv", stringsAsFactors = FALSE)
# number of rows and columns
dim(hr_data)
## [1] 14999 10
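# column names (call assumed; the output below lists them)
names(hr_data)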
## [1] "satisfaction_level" "last_evaluation" "number_project"
## [4] "average_montly_hours" "time_spend_company" "Work_accident"
## [7] "left" "promotion_last_5years" "Department"
## [10] "salary"
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Department : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
#rename columns
colNames <- c("satLevel", "lastEval", "numProj", "avgHrs", "timeCpny", "wrkAcdnt", "left", "fiveYrPrmo", "department", "salary")
setnames(hr_data, colNames)
# raw data
head(hr_data)
## satLevel lastEval numProj avgHrs timeCpny wrkAcdnt left fiveYrPrmo department
## 1 0.38 0.53 2 157 3 0 1 0 sales
## 2 0.80 0.86 5 262 6 0 1 0 sales
## 3 0.11 0.88 7 272 4 0 1 0 sales
## 4 0.72 0.87 5 223 5 0 1 0 sales
## 5 0.37 0.52 2 159 3 0 1 0 sales
## 6 0.41 0.50 2 153 3 0 1 0 sales
## salary
## 1 low
## 2 medium
## 3 medium
## 4 low
## 5 low
## 6 low
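# summary statistics (call assumed from the output below)
summary(hr_data)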
## satLevel lastEval numProj avgHrs
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
## timeCpny wrkAcdnt left fiveYrPrmo
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.498 Mean :0.1446 Mean :0.2381 Mean :0.02127
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## department salary
## Length:14999 Length:14999
## Class :character Class :character
## Mode :character Mode :character
##
##
##
We are trying to create a predictive model to estimate average monthly hours worked (avgHrs, originally “average_montly_hours”). This will allow us to estimate how many hours our current employee base is likely to work.
EDA (Exploratory data analysis)
Correlation helps us understand how variables are related to each other. The correlation between two columns can range from -1 to +1; the closer the value is to either -1 or +1, the stronger the association between the columns. A positive correlation means that as one column gets bigger, the other tends to get bigger; a negative correlation means that as one column gets bigger, the other tends to get smaller.
# reference column by their name
attach(hr_data)
# Correlations of columns from 1 to 8
cor_mat <- cor(hr_data[1:8], use="complete.obs", method="pearson")
#round to two decimals
DT::datatable(round(cor_mat, 2))
From the correlation table above, we can see that lastEval and numProj have the highest correlations with the column we are trying to predict, avgHrs. This means that an employee who worked on more projects, or who received a higher evaluation, is likely to work more hours.
# histograms of avgHrs and lastEval
hist(avgHrs/4, main="Distribution of Average Hours per Week", xlab="Avg Hours", breaks=7, col="lightblue")
hist(lastEval, main="Distribution of Last Evaluations", xlab="Last Eval", breaks=7, col="lightblue")
# density plot showing the average hours per week by salary category
qplot(avgHrs/4, data=hr_data, geom="density", fill=salary, alpha=I(.5),
main="Avg Weekly Hours by Salary Category", xlab="Average Weekly Hours",
ylab="Density")From the density plot we found, higher salary levels does not work for more hours.
hr_data$left_new <- factor(left,levels=c(0,1),
labels=c("Not Leave Company","Left Company"))
# average hours per week by employee retention
qplot(avgHrs/4, data=hr_data, geom="density", fill=left_new, alpha=I(.5),
main="Average Weekly Hours and Employee Retention", xlab="Average Weekly Hours",
ylab="Density")We view a density plot of hours worked by retained vs non-retained employees.
# distribution of average hours per week by department
boxplot(avgHrs~department, data=hr_data, main="Distribution of average hours per week by department", xlab="Department", ylab="Avg Hours", col="lightblue")
Transform
In this section we prepare our data for predictive modeling and divide the dataset into two parts: training data and test data.
One of the easiest ways to transform our data is to create dummy variables. The salary column has three values: high, medium, and low. Since predictive models need numeric inputs, we create three dummy variables (NAhigh, NAmedium, and NAlow), each coded 0 or 1: 1 means the row has that salary level, and 0 means it does not.
Example: if a row's salary is high, then NAhigh = 1, NAmedium = 0, and NAlow = 0.
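Note: the dummies package has since been archived on CRAN. As a minimal base-R sketch (an alternative, not the code used here), the same 0/1 indicator columns could be built with model.matrix():
# hypothetical base-R alternative to dummies::dummy()
salary_dummies <- model.matrix(~ salary - 1, data = hr_data)    # one 0/1 column per salary level
dept_dummies <- model.matrix(~ department - 1, data = hr_data)  # one 0/1 column per department
hr_dummy_alt <- cbind(hr_data, dept_dummies, salary_dummies)    # hr_dummy_alt is an assumed name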
# drop the new column left_new
hr_data <- hr_data[, -11]
#create dummy variables for department and salary
hr_dummy <- cbind(hr_data, dummies::dummy(hr_data$department), dummies::dummy(hr_data$salary))
names(hr_dummy)[11:23] <- c("NAaccounting", "NAhr", "NAIT", "NAmanagement", "NAmarketing",
                            "NAproduct_mng", "NARandD", "NAsales", "NAsupport", "NAtechnical",
                            "NAhigh", "NAlow", "NAmedium")
names(hr_dummy)
## [1] "satLevel" "lastEval" "numProj" "avgHrs"
## [5] "timeCpny" "wrkAcdnt" "left" "fiveYrPrmo"
## [9] "department" "salary" "NAaccounting" "NAhr"
## [13] "NAIT" "NAmanagement" "NAmarketing" "NAproduct_mng"
## [17] "NARandD" "NAsales" "NAsupport" "NAtechnical"
## [21] "NAhigh" "NAlow" "NAmedium"
## satLevel lastEval numProj avgHrs timeCpny wrkAcdnt left fiveYrPrmo department
## 1 0.38 0.53 2 157 3 0 1 0 sales
## 2 0.80 0.86 5 262 6 0 1 0 sales
## 3 0.11 0.88 7 272 4 0 1 0 sales
## 4 0.72 0.87 5 223 5 0 1 0 sales
## 5 0.37 0.52 2 159 3 0 1 0 sales
## 6 0.41 0.50 2 153 3 0 1 0 sales
## salary NAaccounting NAhr NAIT NAmanagement NAmarketing NAproduct_mng NARandD
## 1 low 0 0 0 0 0 0 0
## 2 medium 0 0 0 0 0 0 0
## 3 medium 0 0 0 0 0 0 0
## 4 low 0 0 0 0 0 0 0
## 5 low 0 0 0 0 0 0 0
## 6 low 0 0 0 0 0 0 0
## NAsales NAsupport NAtechnical NAhigh NAlow NAmedium
## 1 1 0 0 0 1 0
## 2 1 0 0 0 0 1
## 3 1 0 0 0 0 1
## 4 1 0 0 0 1 0
## 5 1 0 0 0 1 0
## 6 1 0 0 0 1 0
Applying log, square-root, and scale transforms to variables can make our data set a little more robust to outliers and to differences in units between variables.
hr_dummy$satLevelLog <- log(satLevel)
hr_dummy$lastEvalLog <- log(lastEval)
hr_dummy$satLevelSqrt <- sqrt(satLevel)
hr_dummy$lastEvalSqrt <- sqrt(lastEval)
hr_dummy$satLevelScale <- scale(satLevel)
hr_dummy$lastEvalScale <- scale(lastEval)
Let's look at employees who received a high evaluation but have a low satisfaction score. We will create an indicator variable for this group and add it to the dummy data set.
hr_dummy$greatEvalLowSat <- ifelse(lastEval>0.8 & satLevel <0.2, 1, 0)
x <- ggplot(hr_dummy, aes(factor(greatEvalLowSat), avgHrs))
x <- x + geom_boxplot(aes(fill=factor(greatEvalLowSat)), outlier.color="black", outlier.size=1)
x + scale_fill_manual(values=wes_palette(n=2, name="GrandBudapest2"))
Split the data set into two parts, train and test. The train data contains roughly 75% of the rows and the test data contains the remaining 25%.
set.seed(123)
sample = sample.split(hr_dummy, SplitRatio = .75)
train = subset(hr_dummy, sample == TRUE)
test = subset(hr_dummy, sample == FALSE)
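Note: sample.split() from caTools normally takes the outcome vector rather than the whole data frame (passing the data frame is why the split above comes out near 73%/27% instead of exactly 75%/25%). A sketch of the conventional call:
# conventional usage: split on the outcome vector
sample <- sample.split(hr_dummy$avgHrs, SplitRatio = 0.75)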
dim(train)
## [1] 10999 30
dim(test)
## [1] 4000 30
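# summary of the training set (call assumed from the output below)
summary(train)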
## satLevel lastEval numProj avgHrs
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6500 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6147 Mean :0.7157 Mean :3.797 Mean :200.9
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
## timeCpny wrkAcdnt left fiveYrPrmo
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.497 Mean :0.1466 Mean :0.2382 Mean :0.02209
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## department salary NAaccounting NAhr
## Length:10999 Length:10999 Min. :0.00000 Min. :0.00000
## Class :character Class :character 1st Qu.:0.00000 1st Qu.:0.00000
## Mode :character Mode :character Median :0.00000 Median :0.00000
## Mean :0.05182 Mean :0.04919
## 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000
## NAIT NAmanagement NAmarketing NAproduct_mng
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.08173 Mean :0.04346 Mean :0.05846 Mean :0.05919
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## NARandD NAsales NAsupport NAtechnical
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.05182 Mean :0.276 Mean :0.1471 Mean :0.1812
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.0000
## NAhigh NAlow NAmedium satLevelLog
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :-2.4079
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:-0.8210
## Median :0.00000 Median :0.0000 Median :0.0000 Median :-0.4308
## Mean :0.08064 Mean :0.4856 Mean :0.4338 Mean :-0.6182
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:-0.1985
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. : 0.0000
## lastEvalLog satLevelSqrt lastEvalSqrt satLevelScale.V1
## Min. :-1.0217 Min. :0.3000 Min. :0.6000 Min. :-2.1028522
## 1st Qu.:-0.5798 1st Qu.:0.6633 1st Qu.:0.7483 1st Qu.:-0.6951417
## Median :-0.3285 Median :0.8062 Median :0.8485 Median : 0.1494847
## Mean :-0.3650 Mean :0.7624 Mean :0.8397 Mean : 0.0075563
## 3rd Qu.:-0.1393 3rd Qu.:0.9055 3rd Qu.:0.9327 3rd Qu.: 0.8332298
## Max. : 0.0000 Max. :1.0000 Max. :1.0000 Max. : 1.5571953
## lastEvalScale.V1 greatEvalLowSat
## Min. :-2.0804089 Min. :0.00000
## 1st Qu.:-0.9119738 1st Qu.:0.00000
## Median : 0.0227743 Median :0.00000
## Mean :-0.0023228 Mean :0.05855
## 3rd Qu.: 0.8991007 3rd Qu.:0.00000
## Max. : 1.6585835 Max. :1.00000
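# summary of the test set (call assumed from the output below)
summary(test)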
## satLevel lastEval numProj avgHrs
## Min. :0.0900 Min. :0.3600 Min. :2.00 Min. : 97.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.00 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.00 Median :200.0
## Mean :0.6077 Mean :0.7172 Mean :3.82 Mean :201.5
## 3rd Qu.:0.8100 3rd Qu.:0.8700 3rd Qu.:5.00 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.00 Max. :310.0
## timeCpny wrkAcdnt left fiveYrPrmo
## Min. : 2.000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.: 3.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.000
## Median : 3.000 Median :0.000 Median :0.0000 Median :0.000
## Mean : 3.501 Mean :0.139 Mean :0.2377 Mean :0.019
## 3rd Qu.: 4.000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.000
## Max. :10.000 Max. :1.000 Max. :1.0000 Max. :1.000
## department salary NAaccounting NAhr
## Length:4000 Length:4000 Min. :0.00000 Min. :0.0000
## Class :character Class :character 1st Qu.:0.00000 1st Qu.:0.0000
## Mode :character Mode :character Median :0.00000 Median :0.0000
## Mean :0.04925 Mean :0.0495
## 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000
## NAIT NAmanagement NAmarketing NAproduct_mng
## Min. :0.000 Min. :0.000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.000 Median :0.000 Median :0.00000 Median :0.00000
## Mean :0.082 Mean :0.038 Mean :0.05375 Mean :0.06275
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.000 Max. :1.000 Max. :1.00000 Max. :1.00000
## NARandD NAsales NAsupport NAtechnical
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.05425 Mean :0.276 Mean :0.1527 Mean :0.1817
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.0000
## NAhigh NAlow NAmedium satLevelLog
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :-2.4079
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:-0.8210
## Median :0.0000 Median :0.0000 Median :0.0000 Median :-0.4463
## Mean :0.0875 Mean :0.4938 Mean :0.4188 Mean :-0.6301
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:-0.2107
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. : 0.0000
## lastEvalLog satLevelSqrt lastEvalSqrt satLevelScale.V1
## Min. :-1.0217 Min. :0.3000 Min. :0.6000 Min. :-2.1028522
## 1st Qu.:-0.5798 1st Qu.:0.6633 1st Qu.:0.7483 1st Qu.:-0.6951417
## Median :-0.3285 Median :0.8000 Median :0.8485 Median : 0.1092644
## Mean :-0.3623 Mean :0.7580 Mean :0.8407 Mean :-0.0207779
## 3rd Qu.:-0.1393 3rd Qu.:0.9000 3rd Qu.:0.9327 3rd Qu.: 0.7930095
## Max. : 0.0000 Max. :1.0000 Max. :1.0000 Max. : 1.5571953
## lastEvalScale.V1 greatEvalLowSat
## Min. :-2.0804089 Min. :0.00000
## 1st Qu.:-0.9119738 1st Qu.:0.00000
## Median : 0.0227743 Median :0.00000
## Mean : 0.0063870 Mean :0.06275
## 3rd Qu.: 0.8991007 3rd Qu.:0.00000
## Max. : 1.6585835 Max. :1.00000
Model
We will create several models and see which performs best. We start with a linear regression model using all variables in our data set.
Model 1: Linear Regression With All Variables
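Fit the model and view its summary (the formula is reconstructed from the Call line in the output below):
model.lr <- lm(avgHrs ~ ., data = train)
summary(model.lr)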
##
## Call:
## lm(formula = avgHrs ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132.232 -29.495 -2.441 31.864 143.177
##
## Coefficients: (15 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4890.2915 1051.2484 -4.652 3.33e-06 ***
## satLevel -1066.5293 66.7986 -15.966 < 2e-16 ***
## lastEval -1404.2705 469.2540 -2.993 0.00277 **
## numProj 7.7406 0.4203 18.418 < 2e-16 ***
## timeCpny 1.3710 0.2931 4.677 2.94e-06 ***
## wrkAcdnt 0.5092 1.1582 0.440 0.66021
## left 2.1593 1.1417 1.891 0.05860 .
## fiveYrPrmo 0.6399 2.8012 0.228 0.81932
## departmenthr -0.2786 2.5459 -0.109 0.91287
## departmentIT 1.1853 2.2725 0.522 0.60197
## departmentmanagement -1.7592 2.6837 -0.655 0.51217
## departmentmarketing -0.1665 2.4415 -0.068 0.94564
## departmentproduct_mng 0.4638 2.4338 0.191 0.84886
## departmentRandD 0.7972 2.5146 0.317 0.75123
## departmentsales 1.1327 1.9366 0.585 0.55863
## departmentsupport -0.8150 2.0669 -0.394 0.69336
## departmenttechnical -0.3196 2.0150 -0.159 0.87397
## salarylow -1.5149 1.5936 -0.951 0.34182
## salarymedium -1.3487 1.5895 -0.849 0.39615
## NAaccounting NA NA NA NA
## NAhr NA NA NA NA
## NAIT NA NA NA NA
## NAmanagement NA NA NA NA
## NAmarketing NA NA NA NA
## NAproduct_mng NA NA NA NA
## NARandD NA NA NA NA
## NAsales NA NA NA NA
## NAsupport NA NA NA NA
## NAtechnical NA NA NA NA
## NAhigh NA NA NA NA
## NAlow NA NA NA NA
## NAmedium NA NA NA NA
## satLevelLog -521.6129 27.1330 -19.224 < 2e-16 ***
## lastEvalLog -835.6357 306.0401 -2.730 0.00633 **
## satLevelSqrt 3095.1387 174.7076 17.716 < 2e-16 ***
## lastEvalSqrt 4443.3827 1522.2016 2.919 0.00352 **
## satLevelScale NA NA NA NA
## lastEvalScale NA NA NA NA
## greatEvalLowSat -0.7864 3.0617 -0.257 0.79730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.38 on 10975 degrees of freedom
## Multiple R-squared: 0.2781, Adjusted R-squared: 0.2766
## F-statistic: 183.8 on 23 and 10975 DF, p-value: < 2.2e-16
- Adjusted R-squared - Like R-squared, this measures the amount of variation in the variable you are trying to predict (average monthly hours worked) that is explained by your predictors, but it also adjusts for the number of predictors included in the model. A higher adjusted R-squared value is better.
- AIC - The Akaike information criterion attempts to measure the bias/variance trade-off by rewarding accuracy and penalizing over-fitting. I described the bias/variance trade-off in a past blog post. A lower AIC value is better.
- BIC - The Bayesian information criterion is similar to AIC, except that it carries a higher penalty for including more variables in the model. A lower BIC value is also better.
- Mean Squared Prediction Error - The average squared difference between the predicted values and the actual values in the test set (see the formula after this list). A lower mean squared prediction error is better.
- Standard Error - The standard deviation of the squared prediction errors above, divided by the number of test observations. A lower standard error is better.
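In symbols, for \(n\) test observations with actual values \(y_i\) and predictions \(\hat{y}_i\), the two prediction metrics computed below are
\[ \text{MSPE} = \frac{1}{n}\sum_{i=1}^{n}\left(y_i - \hat{y}_i\right)^2, \qquad \text{SE} = \frac{\operatorname{sd}\left((y_i - \hat{y}_i)^2\right)}{n}. \]
(Note that the code divides by \(n\); the conventional standard error of a mean divides by \(\sqrt{n}\).)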
# aic value
aic <- AIC(model.lr)
# bic value
bic <-BIC(model.lr)
#turn off warnings
options(warn=-1)
# validation predictions
pred.model.lr <- predict(model.lr, newdata = test)
#turn on warnings
options(warn=1)
# mean prediction error
meanPred <- mean((test$avgHrs - pred.model.lr)^2)
# std error
stdError <- sd((test$avgHrs - pred.model.lr)^2)/length(test$avgHrs)
# create model matrix
MM <- data.frame("Model" = character(), "adjRsq" = numeric(), "AIC" = numeric(), "BIC" = numeric(), "Mean Prediction Error" = numeric(), "Standard Error" = numeric(), stringsAsFactors = FALSE)
# add one row to mm
MM[nrow(MM) + 1, ] <- c( "model.lr", 0.2766, aic, bic, meanPred, stdError)
MM
## Model adjRsq AIC BIC Mean.Prediction.Error
## 1 model.lr 0.2766 113661.307912075 113843.946903039 1782.3827021275
## Standard.Error
## 1 0.538189821104025
Let's create another model with fewer variables, using linear regression with backward selection.
Model 2: Linear Regression Backward Selection
model.bckwd <- lm(avgHrs ~ satLevel + lastEval + numProj + timeCpny + left + satLevelLog +
lastEvalLog + satLevelSqrt + lastEvalSqrt, train)
summary(model.bckwd)
##
## Call:
## lm(formula = avgHrs ~ satLevel + lastEval + numProj + timeCpny +
## left + satLevelLog + lastEvalLog + satLevelSqrt + lastEvalSqrt,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -133.071 -29.457 -2.426 31.854 142.989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4879.7583 1049.9582 -4.648 3.40e-06 ***
## satLevel -1060.3614 64.0887 -16.545 < 2e-16 ***
## lastEval -1406.1566 468.2164 -3.003 0.00268 **
## numProj 7.7251 0.4195 18.413 < 2e-16 ***
## timeCpny 1.3840 0.2880 4.806 1.56e-06 ***
## left 1.9356 1.1149 1.736 0.08258 .
## satLevelLog -518.5068 24.9384 -20.792 < 2e-16 ***
## lastEvalLog -835.2846 305.4702 -2.734 0.00626 **
## satLevelSqrt 3077.4120 165.0633 18.644 < 2e-16 ***
## lastEvalSqrt 4445.2617 1519.1426 2.926 0.00344 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42.37 on 10989 degrees of freedom
## Multiple R-squared: 0.2777, Adjusted R-squared: 0.2771
## F-statistic: 469.5 on 9 and 10989 DF, p-value: < 2.2e-16
aic <- AIC(model.bckwd)
bic <-BIC(model.bckwd)
pred.model.bckwd <- predict(model.bckwd, newdata = test)
meanPred <- mean((test$avgHrs - pred.model.bckwd )^2)
stdError <- sd((test$avgHrs - pred.model.bckwd )^2)/length(test$avgHrs)
MM[nrow(MM) + 1, ] <- c( "model.bckwd", 0.2771, aic, bic, meanPred, stdError)
MM
## Model adjRsq AIC BIC Mean.Prediction.Error
## 1 model.lr 0.2766 113661.307912075 113843.946903039 1782.3827021275
## 2 model.bckwd 0.2771 113638.565847037 113718.927003062 1780.80197949829
## Standard.Error
## 1 0.538189821104025
## 2 0.537485453973232
Next we will look at which variables are most highly correlated with the others and build a model from the variables most correlated with avgHrs. In the train data set, department and salary are character columns, so we remove those two columns, because cor() only accepts numeric data.
Model 3: Highly Correlated Linear Regression
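The code that builds the numeric-only data set is not shown; a minimal sketch consistent with the 28-column structure below (an assumed reconstruction):
# drop the two character columns so cor() receives numeric data only
cor2 <- train[, !(names(train) %in% c("department", "salary"))]
head(cor2)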
## satLevel lastEval numProj avgHrs timeCpny wrkAcdnt left fiveYrPrmo
## 1 0.38 0.53 2 157 3 0 1 0
## 2 0.80 0.86 5 262 6 0 1 0
## 3 0.11 0.88 7 272 4 0 1 0
## 6 0.41 0.50 2 153 3 0 1 0
## 7 0.10 0.77 6 247 4 0 1 0
## 9 0.89 1.00 5 224 5 0 1 0
## NAaccounting NAhr NAIT NAmanagement NAmarketing NAproduct_mng NARandD NAsales
## 1 0 0 0 0 0 0 0 1
## 2 0 0 0 0 0 0 0 1
## 3 0 0 0 0 0 0 0 1
## 6 0 0 0 0 0 0 0 1
## 7 0 0 0 0 0 0 0 1
## 9 0 0 0 0 0 0 0 1
## NAsupport NAtechnical NAhigh NAlow NAmedium satLevelLog lastEvalLog
## 1 0 0 0 1 0 -0.9675840 -0.6348783
## 2 0 0 0 0 1 -0.2231436 -0.1508229
## 3 0 0 0 0 1 -2.2072749 -0.1278334
## 6 0 0 0 1 0 -0.8915981 -0.6931472
## 7 0 0 0 1 0 -2.3025851 -0.2613648
## 9 0 0 0 1 0 -0.1165338 0.0000000
## satLevelSqrt lastEvalSqrt satLevelScale lastEvalScale greatEvalLowSat
## 1 0.6164414 0.7280110 -0.9364635 -1.0872390 0
## 2 0.8944272 0.9273618 0.7527892 0.8406789 0
## 3 0.3316625 0.9380832 -2.0224116 0.9575224 1
## 6 0.6403124 0.7071068 -0.8158026 -1.2625043 0
## 7 0.3162278 0.8774964 -2.0626319 0.3148831 0
## 9 0.9433981 1.0000000 1.1147720 1.6585835 0
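# structure of the numeric-only training data (call assumed from the output below)
str(cor2)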
## 'data.frame': 10999 obs. of 28 variables:
## $ satLevel : num 0.38 0.8 0.11 0.41 0.1 0.89 0.42 0.11 0.84 0.41 ...
## $ lastEval : num 0.53 0.86 0.88 0.5 0.77 1 0.53 0.81 0.92 0.55 ...
## $ numProj : int 2 5 7 2 6 5 2 6 4 2 ...
## $ avgHrs : int 157 262 272 153 247 224 142 305 234 148 ...
## $ timeCpny : int 3 6 4 3 4 5 3 4 5 3 ...
## $ wrkAcdnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ fiveYrPrmo : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAaccounting : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAhr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAIT : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAmanagement : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAmarketing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAproduct_mng : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NARandD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAsales : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NAsupport : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAtechnical : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAhigh : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NAlow : int 1 0 0 1 1 1 1 1 1 1 ...
## $ NAmedium : int 0 1 1 0 0 0 0 0 0 0 ...
## $ satLevelLog : num -0.968 -0.223 -2.207 -0.892 -2.303 ...
## $ lastEvalLog : num -0.635 -0.151 -0.128 -0.693 -0.261 ...
## $ satLevelSqrt : num 0.616 0.894 0.332 0.64 0.316 ...
## $ lastEvalSqrt : num 0.728 0.927 0.938 0.707 0.877 ...
## $ satLevelScale : num [1:10999, 1] -0.936 0.753 -2.022 -0.816 -2.063 ...
## $ lastEvalScale : num [1:10999, 1] -1.087 0.841 0.958 -1.263 0.315 ...
## $ greatEvalLowSat: num 0 0 1 0 0 0 0 1 0 0 ...
cor2 <- cor(cor2, use="complete.obs", method="pearson")
corrplot(cor2, type = "upper", order = "hclust",
col = c("black", "white"), bg = "lightblue", tl.col = "black")Above plot we can see highest correlated variables. Inclused those variables in out model.
# create model
model.cor <- lm(avgHrs ~ numProj + lastEvalLog + timeCpny + left + greatEvalLowSat, train)
summary(model.cor)
##
## Call:
## lm(formula = avgHrs ~ numProj + lastEvalLog + timeCpny + left +
## greatEvalLowSat, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -167.623 -31.557 -2.854 33.286 131.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 168.3481 1.9905 84.575 <2e-16 ***
## numProj 11.2184 0.3976 28.217 <2e-16 ***
## lastEvalLog 42.2722 1.7915 23.596 <2e-16 ***
## timeCpny 0.9453 0.2939 3.216 0.0013 **
## left 1.7508 1.0643 1.645 0.1000 .
## greatEvalLowSat 28.2793 2.1215 13.330 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.64 on 10993 degrees of freedom
## Multiple R-squared: 0.2333, Adjusted R-squared: 0.233
## F-statistic: 669 on 5 and 10993 DF, p-value: < 2.2e-16
aic <- AIC(model.cor)
bic <-BIC(model.cor)
pred.model.cor <- predict(model.cor, newdata = test)
meanPred <- mean((test$avgHrs - pred.model.cor )^2)
stdError <- sd((test$avgHrs - pred.model.cor )^2)/length(test$avgHrs)
MM[nrow(MM) + 1, ] <- c( "model.cor", 0.233, aic, bic, meanPred, stdError)
MM
## Model adjRsq AIC BIC Mean.Prediction.Error
## 1 model.lr 0.2766 113661.307912075 113843.946903039 1782.3827021275
## 2 model.bckwd 0.2771 113638.565847037 113718.927003062 1780.80197949829
## 3 model.cor 0.233 114287.123002206 114338.261919676 1911.14435850578
## Standard.Error
## 1 0.538189821104025
## 2 0.537485453973232
## 3 0.575043653658238
Comparing all three models, we can see that the linear regression with backward selection performs best: it has the highest adjusted \(R^2\) and the lowest AIC, BIC, and prediction-error values.
Using the same model, we will see how the number of working hours (avgHrs) relates to the number of projects (numProj).
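The plotting code is not shown here; one way to draw this relationship with the already-loaded visreg package (an assumed reconstruction, not necessarily the original call):
# plot predicted avgHrs against numProj, with other predictors held at their medians
visreg(model.bckwd, "numProj")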
In the graph above, we can see that avgHrs increases with the number of projects; these two variables are positively correlated.
Salary Prediction
The salary variable has three levels: high, medium, and low.
Assume: high salary = $50/hour, medium salary = $25/hour, and low salary = $10/hour.
Since an average work month is about 160 hours, we can calculate the overtime hours and an estimated payout based on these assumed salary levels.
# sample: the first 1,000 rows of the main data set
hr_data2 <- head(hr_data, 1000)
hr_data2$left_new <- factor(hr_data2$left,levels=c(0,1),
labels=c("Not Leave Company","Left Company"))
hr_data2$fiveYrPrmo_new <- factor(hr_data2$fiveYrPrmo,levels=c(0,1),
labels=c("Not Promoted","Promoted"))
hr_data2$wrkAcdnt_new <- factor(hr_data2$wrkAcdnt,levels=c(0,1),
labels=c("No Accident","Accident"))
#dummy variables
hr_data2 <- cbind(hr_data2, dummy(hr_data2$department), dummy(hr_data2$salary))
names(hr_data2)[14:26] <- c("NAaccounting", "NAhr", "NAIT", "NAmanagement", "NAmarketing",
                            "NAproduct_mng", "NARandD", "NAsales", "NAsupport", "NAtechnical",
                            "NAhigh", "NAlow", "NAmedium")
#Log Transforms for: satLevel, lastEval
hr_data2$satLevelLog <- log(hr_data2$satLevel)
hr_data2$lastEvalLog <- log(hr_data2$lastEval)
#SQRT Transforms for: satLevel, lastEval
hr_data2$satLevelSqrt <- sqrt(hr_data2$satLevel)
hr_data2$lastEvalSqrt <- sqrt(hr_data2$lastEval)
#Scale Transforms for: satLevel, lastEval
hr_data2$satLevelScale <- scale(hr_data2$satLevel)
hr_data2$lastEvalScale <- scale(hr_data2$lastEval)
# new interaction variable
hr_data2$greatEvalLowSat <- ifelse(hr_data2$lastEval>0.8 & hr_data2$satLevel <0.2, 1, 0)
# model prediction
hr_data2$Predictions <- predict(model.bckwd, hr_data2)
# overtime hours
hr_data2$Predictedovertime = hr_data2$Predictions - 160
# for employees who worked less than 160 hours per month, set negative values to 0
hr_data2$totalPredictedOvertime <- hr_data2$Predictedovertime
hr_data2$totalPredictedOvertime[hr_data2$totalPredictedOvertime < 0 ] <- 0
# total predicted overtime hours by salary level (note: these sums use the unclamped Predictedovertime column; summing totalPredictedOvertime instead would exclude the negative values)
lowSalary = sum(hr_data2[hr_data2$salary=='low',]$Predictedovertime)
mediumSalary = sum(hr_data2[hr_data2$salary=='medium',]$Predictedovertime)
highSalary = sum(hr_data2[hr_data2$salary=='high',]$Predictedovertime)
TotalPaidPerMonth = lowSalary*10 + mediumSalary*25 + highSalary*50
TotalPaidPerMonth
## [1] 762573.9
TotalPaidPerMonth * 12
## [1] 9150887
Summary
From the results above, we can see that if the company were to pay these 1,000 employees for overtime, it would cost them about $762,573.90 per month, or $9,150,887 per year.