rm(list = ls())
library(peopleanalyticsdata)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.4 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(DataExplorer)
library(runway)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
employee = employee_performance
### Looking at what is contained in this data set
head(employee)
## sales new_customers region gender rating
## 1 10.10 9 North M 2
## 2 11.67 17 West F 3
## 3 8.91 13 North M 3
## 4 7.73 10 South F 2
## 5 11.36 12 North M 2
## 6 8.06 12 North M 2
summary(employee)
## sales new_customers region gender
## Min. : 2.000 Min. : 1.000 Length:366 Length:366
## 1st Qu.: 5.362 1st Qu.: 6.000 Class :character Class :character
## Median : 7.485 Median : 8.000 Mode :character Mode :character
## Mean : 7.543 Mean : 8.025
## 3rd Qu.: 9.877 3rd Qu.:10.750
## Max. :15.660 Max. :20.000
## rating
## Min. :1.000
## 1st Qu.:1.000
## Median :2.000
## Mean :1.937
## 3rd Qu.:3.000
## Max. :3.000
employee %>% introduce()
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 366 5 2 3 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 366 1830 13976
## Adding new column
employee = employee %>% mutate(employee_ID = 1:366)
employee %>% plot_bar()
employee %>% select(-employee_ID) %>% plot_histogram()
employee %>% select(-c(employee_ID, rating)) %>% plot_qq()
employee %>% select(-employee_ID) %>% plot_prcomp()
ggplot(employee, aes(x = rating, group = rating, y = sales)) +
geom_boxplot(aes(fill = as.integer(rating))) +
labs(title = "Sales by Rating")
ggplot(employee, aes(x = gender, group = gender, y = sales)) +
geom_boxplot(aes(fill = gender)) +
labs(title = "Sales by Gender")
ggplot(employee, aes(x = region, group = region, y = sales)) +
geom_boxplot(aes(fill = region)) +
labs(title = "Sales by Region")
ggplot(employee, aes(x = rating, group = rating, y = new_customers)) +
geom_boxplot(aes(fill = as.integer(rating))) +
labs(title = "New Customers by Rating")
ggplot(employee, aes(x = gender, group = gender, y = new_customers)) +
geom_boxplot(aes(fill = gender)) +
labs(title = "New Customers by Gender")
ggplot(employee, aes(x = region, group = region, y = new_customers)) +
geom_boxplot(aes(fill = region)) +
labs(title = "New Customers by Region")
ggplot(employee, aes(x = gender, group = gender, y = rating)) +
geom_boxplot(aes(fill = gender))
employee %>%
group_by(gender, rating) %>%
summarise(count = n()) %>%
mutate(rel.freq = count / sum(count))
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
## # A tibble: 6 x 4
## # Groups: gender [2]
## gender rating count rel.freq
## <chr> <int> <int> <dbl>
## 1 F 1 52 0.302
## 2 F 2 69 0.401
## 3 F 3 51 0.297
## 4 M 1 67 0.345
## 5 M 2 82 0.423
## 6 M 3 45 0.232
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(1234)
employee = employee %>% mutate(is_rating_3 = ifelse(rating ==3, 1, 0))
data_idx = createDataPartition(employee$is_rating_3, p = .75, list = F)
train = employee[data_idx,]
test = employee[-data_idx,]
logistic_reg3 = glm(is_rating_3 ~ sales + new_customers + gender, data = train, family = "binomial")
summary(logistic_reg3)
##
## Call:
## glm(formula = is_rating_3 ~ sales + new_customers + gender, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.55302 -0.57148 -0.31518 -0.07596 2.60314
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.35854 0.97357 -7.558 4.08e-14 ***
## sales 0.32052 0.07294 4.394 1.11e-05 ***
## new_customers 0.40020 0.06967 5.744 9.25e-09 ***
## genderM -0.24771 0.35599 -0.696 0.487
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 298.40 on 274 degrees of freedom
## Residual deviance: 201.37 on 271 degrees of freedom
## AIC: 209.37
##
## Number of Fisher Scoring iterations: 6
logistic_preds3 = predict(logistic_reg3, newdata = test, type = "response")
logistic_roc3 = roc(test$is_rating_3, logistic_preds3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(logistic_roc3)
auc(logistic_roc3)
## Area under the curve: 0.7998
logistic_val3 = ifelse(logistic_preds3 > 0.5, 1, 0)
trn_tab = table(predicted = logistic_val3, actual = test$is_rating_3)
confusionMatrix(trn_tab, positive = '1')
## Confusion Matrix and Statistics
##
## actual
## predicted 0 1
## 0 55 13
## 1 4 19
##
## Accuracy : 0.8132
## 95% CI : (0.7178, 0.8872)
## No Information Rate : 0.6484
## P-Value [Acc > NIR] : 0.0004363
##
## Kappa : 0.5621
##
## Mcnemar's Test P-Value : 0.0523451
##
## Sensitivity : 0.5938
## Specificity : 0.9322
## Pos Pred Value : 0.8261
## Neg Pred Value : 0.8088
## Prevalence : 0.3516
## Detection Rate : 0.2088
## Detection Prevalence : 0.2527
## Balanced Accuracy : 0.7630
##
## 'Positive' Class : 1
##
test = cbind(test, logistic_preds3)
cal_plot(test, outcome = 'is_rating_3', prediction = 'logistic_preds3')
## Warning: Ignoring unknown aesthetics: ymin, ymax
## Warning: Removed 2 rows containing missing values (geom_bar).
test_vars = test %>% select(sales, new_customers, gender, is_rating_3)
gender_perm = data.frame(sales = test$sales, new_customers = test$new_customers,
gender = sample(test$gender) ,is_rating_3 = test$is_rating_3)
gender_perm_preds = predict(logistic_reg3, newdata = gender_perm, type = "response")
gender_perm_roc = roc(test$is_rating_3, gender_perm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(gender_perm_roc)
auc(gender_perm_roc)
## Area under the curve: 0.795
sales_perm = data.frame(sales = sample(test$sales), new_customers = test$new_customers,
gender = test$gender ,is_rating_3 = test$is_rating_3)
sales_perm_preds = predict(logistic_reg3, newdata = sales_perm, type = "response")
sales_perm_roc = roc(test$is_rating_3, sales_perm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(sales_perm_roc)
auc(sales_perm_roc)
## Area under the curve: 0.7606
new_customers_perm = data.frame(sales = test$sales, new_customers = sample(test$new_customers),
gender = test$gender ,is_rating_3 = test$is_rating_3)
new_customers_perm_preds = predict(logistic_reg3, newdata = new_customers_perm, type = "response")
new_customers_perm_roc = roc(test$is_rating_3, new_customers_perm_preds)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(new_customers_perm_roc)
auc(new_customers_perm_roc)
## Area under the curve: 0.6618
## Mean Squared Error
baseline_error = sum(test$is_rating_3**2 - logistic_preds3**2)
gender_perm_importance = sum(test$is_rating_3**2 - gender_perm_preds**2) - baseline_error
new_customers_perm_importance = sum(test$is_rating_3**2 - new_customers_perm_preds**2) - baseline_error
sales_perm_importance = sum(test$is_rating_3**2 - sales_perm_preds**2) - baseline_error
perm_importance = data.frame(variable = c("gender", "new_customers", "sales"),
importance = c(gender_perm_importance,new_customers_perm_importance,
sales_perm_importance))
head(perm_importance)
## variable importance
## 1 gender 0.1037726
## 2 new_customers 4.2277270
## 3 sales 3.6979977
ggplot(perm_importance, aes(x = importance, y = variable,color = variable)) +
geom_point(size = 4) +
labs(title = "Feature Importance in Predicting a 3 Rating", subtitle = "Measured by permuation importance",
x = "Importance", y = "Variable")
Through this analysis, we were able to take a diver deep into what features drive employee rating (new customers and sales) and what features do not (region and gender).
We can see through the feature importance plot that sales and new customers are what drive an employee's rating. The gender variable was almost negligible. I believe that this finding has to be encouraging for those in HR. Gender discrimination has absolutely no place in the workforce.
This type of analysis would be ideal for any corporation or organization's HR department. Ensuring that rating is driven by actual relevant KPIs instead of discrimination or regional differences is of the utmost importance.
In a real world scenario, the company trying to recreate this process should use more key data points to better understand the relationships between employee rating and employee performance.