Goal: Examine relationships and try to understand this data set well

Data is provided using the peopleanalytics package in R

rm(list = ls())
library(peopleanalyticsdata)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4     ✓ purrr   0.3.4
## ✓ tibble  3.1.2     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   1.4.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(DataExplorer)
library(runway)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
employee = employee_performance

### Looking at what is contained in this data set
head(employee)
##   sales new_customers region gender rating
## 1 10.10             9  North      M      2
## 2 11.67            17   West      F      3
## 3  8.91            13  North      M      3
## 4  7.73            10  South      F      2
## 5 11.36            12  North      M      2
## 6  8.06            12  North      M      2
summary(employee)
##      sales        new_customers       region             gender         
##  Min.   : 2.000   Min.   : 1.000   Length:366         Length:366        
##  1st Qu.: 5.362   1st Qu.: 6.000   Class :character   Class :character  
##  Median : 7.485   Median : 8.000   Mode  :character   Mode  :character  
##  Mean   : 7.543   Mean   : 8.025                                        
##  3rd Qu.: 9.877   3rd Qu.:10.750                                        
##  Max.   :15.660   Max.   :20.000                                        
##      rating     
##  Min.   :1.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :1.937  
##  3rd Qu.:3.000  
##  Max.   :3.000
employee %>% introduce()
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  366       5                2                  3                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           366               1830        13976
## Adding new column
employee = employee %>% mutate(employee_ID = 1:366)

Exploratory Data Analysis

employee %>% plot_bar()

employee %>% select(-employee_ID) %>% plot_histogram()

employee %>% select(-c(employee_ID, rating)) %>% plot_qq()

employee %>% select(-employee_ID) %>% plot_prcomp()

ggplot(employee, aes(x = rating, group = rating, y = sales)) +
  geom_boxplot(aes(fill = as.integer(rating))) +
  labs(title = "Sales by Rating")

ggplot(employee, aes(x = gender, group = gender, y = sales)) +
  geom_boxplot(aes(fill = gender)) +
  labs(title = "Sales by Gender")

ggplot(employee, aes(x = region, group = region, y = sales)) +
  geom_boxplot(aes(fill = region)) +
  labs(title = "Sales by Region")

ggplot(employee, aes(x = rating, group = rating, y = new_customers)) +
  geom_boxplot(aes(fill = as.integer(rating))) +
  labs(title = "New Customers by Rating")

ggplot(employee, aes(x = gender, group = gender, y = new_customers)) +
  geom_boxplot(aes(fill = gender)) +
  labs(title = "New Customers by Gender")

ggplot(employee, aes(x = region, group = region, y = new_customers)) +
  geom_boxplot(aes(fill = region)) +
  labs(title = "New Customers by Region")

ggplot(employee, aes(x = gender, group = gender, y = rating)) +
  geom_boxplot(aes(fill = gender))

employee %>%
  group_by(gender, rating) %>%
  summarise(count = n()) %>%
  mutate(rel.freq = count / sum(count))
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups:   gender [2]
##   gender rating count rel.freq
##   <chr>   <int> <int>    <dbl>
## 1 F           1    52    0.302
## 2 F           2    69    0.401
## 3 F           3    51    0.297
## 4 M           1    67    0.345
## 5 M           2    82    0.423
## 6 M           3    45    0.232

Build a model

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
set.seed(1234)

employee = employee %>% mutate(is_rating_3 = ifelse(rating ==3, 1, 0))


data_idx = createDataPartition(employee$is_rating_3, p = .75, list = F)
train = employee[data_idx,]
test = employee[-data_idx,]

logistic_reg3 = glm(is_rating_3 ~ sales + new_customers + gender, data = train, family = "binomial")
summary(logistic_reg3)
## 
## Call:
## glm(formula = is_rating_3 ~ sales + new_customers + gender, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.55302  -0.57148  -0.31518  -0.07596   2.60314  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -7.35854    0.97357  -7.558 4.08e-14 ***
## sales          0.32052    0.07294   4.394 1.11e-05 ***
## new_customers  0.40020    0.06967   5.744 9.25e-09 ***
## genderM       -0.24771    0.35599  -0.696    0.487    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 298.40  on 274  degrees of freedom
## Residual deviance: 201.37  on 271  degrees of freedom
## AIC: 209.37
## 
## Number of Fisher Scoring iterations: 6
logistic_preds3 = predict(logistic_reg3, newdata = test, type = "response")
logistic_roc3 = roc(test$is_rating_3, logistic_preds3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(logistic_roc3)

auc(logistic_roc3)
## Area under the curve: 0.7998
logistic_val3 = ifelse(logistic_preds3 > 0.5, 1, 0)
trn_tab = table(predicted = logistic_val3, actual = test$is_rating_3)
confusionMatrix(trn_tab, positive = '1')
## Confusion Matrix and Statistics
## 
##          actual
## predicted  0  1
##         0 55 13
##         1  4 19
##                                           
##                Accuracy : 0.8132          
##                  95% CI : (0.7178, 0.8872)
##     No Information Rate : 0.6484          
##     P-Value [Acc > NIR] : 0.0004363       
##                                           
##                   Kappa : 0.5621          
##                                           
##  Mcnemar's Test P-Value : 0.0523451       
##                                           
##             Sensitivity : 0.5938          
##             Specificity : 0.9322          
##          Pos Pred Value : 0.8261          
##          Neg Pred Value : 0.8088          
##              Prevalence : 0.3516          
##          Detection Rate : 0.2088          
##    Detection Prevalence : 0.2527          
##       Balanced Accuracy : 0.7630          
##                                           
##        'Positive' Class : 1               
## 
test = cbind(test, logistic_preds3)

cal_plot(test, outcome = 'is_rating_3', prediction = 'logistic_preds3')
## Warning: Ignoring unknown aesthetics: ymin, ymax
## Warning: Removed 2 rows containing missing values (geom_bar).

Permutation importance

test_vars = test %>% select(sales, new_customers, gender, is_rating_3)

gender_perm = data.frame(sales = test$sales, new_customers = test$new_customers,
                         gender = sample(test$gender) ,is_rating_3 = test$is_rating_3)

gender_perm_preds = predict(logistic_reg3, newdata = gender_perm, type = "response")
gender_perm_roc = roc(test$is_rating_3, gender_perm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(gender_perm_roc)

auc(gender_perm_roc)
## Area under the curve: 0.795
sales_perm = data.frame(sales = sample(test$sales), new_customers = test$new_customers,
                         gender = test$gender ,is_rating_3 = test$is_rating_3)

sales_perm_preds = predict(logistic_reg3, newdata = sales_perm, type = "response")
sales_perm_roc = roc(test$is_rating_3, sales_perm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(sales_perm_roc)

auc(sales_perm_roc)
## Area under the curve: 0.7606
new_customers_perm = data.frame(sales = test$sales, new_customers = sample(test$new_customers),
                        gender = test$gender ,is_rating_3 = test$is_rating_3)

new_customers_perm_preds = predict(logistic_reg3, newdata = new_customers_perm, type = "response")
new_customers_perm_roc = roc(test$is_rating_3, new_customers_perm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(new_customers_perm_roc)

auc(new_customers_perm_roc)
## Area under the curve: 0.6618
## Mean Squared Error
baseline_error = sum(test$is_rating_3**2 - logistic_preds3**2)
gender_perm_importance = sum(test$is_rating_3**2 - gender_perm_preds**2) - baseline_error
new_customers_perm_importance = sum(test$is_rating_3**2 - new_customers_perm_preds**2) - baseline_error
sales_perm_importance = sum(test$is_rating_3**2 - sales_perm_preds**2) - baseline_error

perm_importance = data.frame(variable = c("gender", "new_customers", "sales"),
                             importance = c(gender_perm_importance,new_customers_perm_importance,
                                            sales_perm_importance))
head(perm_importance)
##        variable importance
## 1        gender  0.1037726
## 2 new_customers  4.2277270
## 3         sales  3.6979977
ggplot(perm_importance, aes(x = importance, y = variable,color = variable)) +
  geom_point(size = 4) +
  labs(title = "Feature Importance in Predicting a 3 Rating", subtitle = "Measured by permuation importance",
       x = "Importance", y = "Variable")

Conclusion

Through this analysis, we were able to take a diver deep into what features drive employee rating (new customers and sales) and what features do not (region and gender).

We can see through the feature importance plot that sales and new customers are what drive an employee's rating. The gender variable was almost negligible. I believe that this finding has to be encouraging for those in HR. Gender discrimination has absolutely no place in the workforce.

This type of analysis would be ideal for any corporation or organization's HR department. Ensuring that rating is driven by actual relevant KPIs instead of discrimination or regional differences is of the utmost importance.

In a real world scenario, the company trying to recreate this process should use more key data points to better understand the relationships between employee rating and employee performance.