Data Exploration and Transformation
# Load the packages used throughout this analysis
library(tidyverse) # read_csv, dplyr, ggplot2
library(caret) # createDataPartition, confusionMatrix
library(car) # vif
library(randomForest)
library(e1071) # svm
library(pROC) # roc
# Note: setwd() returns the *previous* working directory, so store the path directly
path <- "C:\\Users\\yogdhar\\OneDrive - Publicis Groupe\\Documents\\IIM\\Project - Attrition Prediction"
file_name <- "emp_data_dummy.csv"
file_path <- file.path(path, file_name)
# Import the employee data
emp_data <- read_csv(file_path)
# Data Transformation
# Convert emp_ID (employee ID) to character and Turnover to a factor
emp_data$emp_ID <- as.character(emp_data$emp_ID)
emp_data$Turnover <- as.factor(emp_data$Turnover)
# Convert percentage-string (chr) variables into numeric
emp_data$prod_utilization <- as.numeric(sub("%", "", emp_data$prod_utilization))
emp_data$bench <- as.numeric(sub("%", "", emp_data$bench))
emp_data$compa_ratio <- as.numeric(sub("%", "", emp_data$compa_ratio))
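An equivalent one-step alternative (a sketch; readr::parse_number(), loaded with the tidyverse, strips "%" and other non-numeric characters from a character vector):
parse_number(c("108%", "85%", "66%")) # returns 108 85 66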
# Check the structure of the dataset
glimpse(emp_data)
## Rows: 11,007
## Columns: 19
## $ emp_ID <chr> "7238486", "6923304", "1036303", "5984817", "3982465"…
## $ Turnover <fct> Active, Active, Active, Active, Active, Active, Activ…
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male…
## $ Career_Stage <chr> "Sr Asso", "Asso", "Asso", "Sr Asso", "Asso", "Sr Ass…
## $ skill_complexity <chr> "Normal", "Complex", "Complex", "Normal", "Complex", …
## $ Age <dbl> 29, 26, 25, 29, 28, 29, 32, 28, 25, 29, 30, 33, 29, 2…
## $ Location <chr> "Gurgaon", "Bangalore", "Gurgaon", "Gurgaon", "Gurgao…
## $ Base_Pay <dbl> 1843520, 1372000, 1231999, 1959760, 1427900, 1521265,…
## $ Is_promoted <chr> "No", "No", "No", "No", "No", "No", "No", "No", "Yes"…
## $ Time_in_title <dbl> 14.9, 3.2, 4.6, 22.6, 20.0, 13.1, 25.2, 25.8, 7.0, 28…
## $ Tenure <dbl> 14.9, 3.2, 4.6, 22.6, 20.0, 13.1, 25.2, 25.8, 28.3, 2…
## $ Last_merit_inc <dbl> 10.0, 0.0, 0.0, 10.0, 9.0, 6.0, 10.0, 13.0, 40.0, 12.…
## $ Talent_Profile <chr> "Valued Contributor", "Valued Contributor", "Valued C…
## $ Engagement_Score <dbl> 8.0, 8.0, 8.0, 8.0, 8.0, 9.4, 8.1, 8.0, 5.1, 8.0, 8.0…
## $ prod_utilization <dbl> 108, 85, 66, 97, 78, 89, 86, 104, 91, 0, 94, 82, 69, …
## $ bench <dbl> 0, 0, 29, 0, 20, 0, 0, 0, 0, 95, 0, 0, 28, 0, 0, 0, 0…
## $ compa_ratio <dbl> 122.9, 142.0, 123.2, 130.7, 142.8, 125.7, 108.9, 106.…
## $ compa_level <chr> "Above", "Above", "Above", "Above", "Above", "Above",…
## $ Status <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
head(emp_data)
## # A tibble: 6 × 19
## emp_ID Turnover Gender Career_Stage skill_complexity Age Location Base_Pay
## <chr> <fct> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 7238486 Active Male Sr Asso Normal 29 Gurgaon 1843520
## 2 6923304 Active Male Asso Complex 26 Bangalore 1372000
## 3 1036303 Active Male Asso Complex 25 Gurgaon 1231999
## 4 5984817 Active Male Sr Asso Normal 29 Gurgaon 1959760
## 5 3982465 Active Male Asso Complex 28 Gurgaon 1427900
## 6 2936716 Active Male Sr Asso Complex 29 Bangalore 1521265
## # ℹ 11 more variables: Is_promoted <chr>, Time_in_title <dbl>, Tenure <dbl>,
## # Last_merit_inc <dbl>, Talent_Profile <chr>, Engagement_Score <dbl>,
## # prod_utilization <dbl>, bench <dbl>, compa_ratio <dbl>, compa_level <chr>,
## # Status <dbl>
# Feature Engineering: note that the 'compa_level' variable was created in Excel
Exploratory Data Analysis
# Count Active and Inactive employees
emp_data %>%
count(Status)
## # A tibble: 2 × 2
## Status n
## <dbl> <int>
## 1 0 7398
## 2 1 3609
# Calculate turnover rate
emp_data %>%
summarise(avg_turnover_rate = mean(Status))
## # A tibble: 1 × 1
## avg_turnover_rate
## <dbl>
## 1 0.328
# Calculate level-wise turnover rate
df_level <- emp_data %>%
group_by(Career_Stage) %>%
summarise(turnover_level = mean(Status))
# Check the results
df_level
## # A tibble: 3 × 2
## Career_Stage turnover_level
## <chr> <dbl>
## 1 Asso 0.334
## 2 Mgr 0.239
## 3 Sr Asso 0.345
# Visualize the results
ggplot(df_level, aes(x = Career_Stage, y = turnover_level, fill = Career_Stage)) +
geom_bar(stat = "identity") +
geom_text(aes(label = scales::percent(turnover_level)), vjust = -0.5, color = "black", size=3) +
labs(title = "Turnover Rate by Career Stage",
x = "Career Stage",
y = "Turnover Rate",
fill = "Career Stage") +
theme_minimal() +
theme(legend.position = "none") + # Remove the legend
scale_y_continuous(labels = scales::percent_format()) # Format y-axis as a percentage (values are proportions, so the default scale = 100 applies)

# Calculate location-wise turnover rate
df_location <- emp_data %>%
group_by(Location) %>%
summarize(turnover_location = mean(Status))
# Check the results
df_location
## # A tibble: 4 × 2
## Location turnover_location
## <chr> <dbl>
## 1 Bangalore 0.303
## 2 Gurgaon 0.349
## 3 Mumbai 0.329
## 4 Noida 0.334
# Visualize the results
# Create a bar chart with data labels in 00.0% format
ggplot(df_location, aes(x = Location, y = turnover_location, fill = Location)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(formatC(turnover_location * 100, format = "f", digits = 1), "%")), vjust = -0.5, size = 3) + # Add data labels above bars
labs(title = "Turnover Rate by Location",
x = "Location",
y = "Turnover Rate",
fill = "Location") +
theme_minimal() +
theme(legend.position = "none") # Remove the legend

# Count the number of employees across Career Stages
emp_data %>%
count(Career_Stage)
## # A tibble: 3 × 2
## Career_Stage n
## <chr> <int>
## 1 Asso 3117
## 2 Mgr 1446
## 3 Sr Asso 6444
# Count the occurrences of each Career_Stage
# Calculate career stage proportions
career_stage_props <- emp_data %>%
count(Career_Stage) %>%
mutate(proportion = n / sum(n))
career_stage_props
## # A tibble: 3 × 3
## Career_Stage n proportion
## <chr> <int> <dbl>
## 1 Asso 3117 0.283
## 2 Mgr 1446 0.131
## 3 Sr Asso 6444 0.585
# Calculate turnover rate for each career stage
career_stage_turnover <- emp_data %>%
group_by(Career_Stage) %>%
summarize(turnover_rate = mean(Status))
career_stage_turnover
## # A tibble: 3 × 2
## Career_Stage turnover_rate
## <chr> <dbl>
## 1 Asso 0.334
## 2 Mgr 0.239
## 3 Sr Asso 0.345
# Merge career stage proportions and turnover rates
merged_data <- merge(career_stage_props, career_stage_turnover, by = "Career_Stage")
p <- ggplot(merged_data, aes(x = Career_Stage)) +
geom_bar(aes(y = proportion * 100), stat = "identity", fill = "darkgreen", alpha = 0.7) +
geom_line(aes(y = turnover_rate * 100, group = 1), color = "red", size = 1) +
geom_point(aes(y = turnover_rate * 100), color = "red", size = 3) +
labs(title = "Distribution of Career Stages and Turnover Rate",
x = "Career Stage",
y = "Proportion / Turnover Rate (%)") +
scale_y_continuous(sec.axis = sec_axis(~., name = "Turnover Rate (%)")) + # Both series are already on a 0-100 percent scale, so an identity transform is correct
theme_minimal() +
theme(legend.position = "right")
print(p)

# Compare Talent Profiles on Turnover
# Calculate talent-profile-wise turnover rate
df_talent_profile <- emp_data %>%
group_by(Talent_Profile) %>%
summarise(turnover_rating = mean(Status))
# Create a bar chart with data labels as percentages in 00.0% format
ggplot(df_talent_profile, aes(x = Talent_Profile, y = turnover_rating)) +
geom_bar(stat = "identity", fill = "5A9BD5") +
geom_text(aes(label = paste0(formatC(turnover_rating * 100, format = "f", digits = 1), "%")), vjust = -0.5, size = 3) + # Add data labels above bars
labs(title = "Comparison of Talent Profiles on Turnover",
x = "Talent Profile",
y = "Turnover Rating") +
theme_minimal()

# Compare Skill Complexity on Turnover
# Calculate skill-complexity-wise turnover rate and count
df_skill_complexity <- emp_data %>%
group_by(skill_complexity) %>%
summarise(
turnover_rating = mean(Status),
count = n())
# Create a bar chart with data labels and absolute count
ggplot(df_skill_complexity, aes(x = skill_complexity, y = turnover_rating)) +
geom_bar(stat = "identity", fill = "light green") +
geom_text(aes(label = paste0("Count: ", count, "\nTurnover: ", scales::percent(turnover_rating))),
vjust = -0.1, size = 3) + # Add combined data labels above bars
labs(title = "Comparison of Skill Complexity on Turnover",
x = "Skill Complexity",
y = "Turnover and Count") +
theme_minimal()

# Happiness Scores (engagement data) and Attrition Analysis
# Create a violin plot with jittered points
ggplot(emp_data, aes(x = Career_Stage, y = Engagement_Score)) +
geom_violin() +
geom_jitter(width = 0.2, alpha = 0.5, color = "#5A9BD5") + # Add jittered points for individual data
labs(title = "Comparison of Happiness Scores by Career Stage",
x = "Career Stage",
y = "Engagement Score") +
theme_minimal()
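A numeric companion to the plot (a sketch; this summary is not part of the original output):
# Median engagement score by career stage
emp_data %>%
  group_by(Career_Stage) %>%
  summarise(median_engagement = median(Engagement_Score))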

# Number of variables in the dataset
variables <- ncol(emp_data)
# Compensation and Attrition Analysis
# Set a custom color for the bars
bar_color <- "#5A9BD5"
# Plot the distribution of compensation
ggplot(emp_data, aes(x = Base_Pay)) +
geom_histogram(binwidth = 120000, color = "white", fill = bar_color, alpha = 0.7) +
labs(title = "Distribution of Compensation",
x = "Base Pay",
y = "Frequency") +
theme_minimal() +
theme(plot.title = element_text(size = 16, face = "bold"),
axis.text = element_text(size = 12),
axis.title = element_text(size = 14),
legend.title = element_blank(),
legend.text = element_text(size = 12),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill = "white"),
plot.background = element_rect(fill = "white"))+
scale_x_continuous(labels = scales::comma)

library(scales) # Load the scales package for formatting
# Plot the distribution of compensation across levels
ggplot(emp_data, aes(x = Career_Stage, y = Base_Pay)) +
geom_boxplot() +
scale_y_continuous(labels = scales::label_number(scale = 1e-6, suffix = "M")) + # Pay is in INR, so avoid the "$" prefix that dollar_format() would add
labs(title = "Distribution of Compensation",
x = "Career Stage",
y = "Base Pay (Millions)") +
theme_minimal()

# Convert Status to a factor
emp_data$Status <- factor(emp_data$Status)
# Compare compensation of Active and Inactive employees across levels
ggplot(emp_data, aes(x = Career_Stage, y = Base_Pay, fill = Status, group = interaction(Career_Stage, Status))) +
geom_boxplot() +
scale_fill_manual(values = c("0" = "gray", "1" = "#5A9BD5")) +
labs(title = "Comparison of Compensation for Active & Inactive Employees",
x = "Career Stage",
y = "Base Pay (Millions)") +
theme_minimal() +
scale_y_continuous(labels = scales::comma_format(scale = 1e-6)) # Format Y-axis labels in millions

# Calculate percentage distribution within each category
compa_level_counts <- emp_data %>%
group_by(Status, compa_level) %>%
summarize(count = n()) %>%
group_by(Status) %>%
mutate(percentage = count / sum(count))
# Create an enhanced grouped bar chart with data labels
ggplot(compa_level_counts, aes(x = Status, y = percentage, fill = compa_level)) +
geom_bar(stat = "identity", position = "fill") +
geom_text(aes(label = scales::percent(percentage)), position = position_fill(vjust = 0.5), size = 3) + # Add data labels
annotate("text", x = c(1, 2), y = -0.08, label = c("Active", "Inactive"), size = 3, color = "black") + # Add category labels
labs(title = "Comparison of compa_level for Active & Inactive Employees",
x = "Status",
y = "Percentage",
fill = "compa_level") +
theme_minimal() +
scale_y_continuous(labels = scales::percent_format()) + # Format y-axis as percentages (values are proportions, so the default scale = 100 applies)
theme(legend.position = "top") # Move legend to the top for better placement

The chart shows that a noticeably greater proportion of Inactive
employees were paid below the median compensation (compa_level = "Below").
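A quick numeric check of this observation (a sketch reusing the compa_level field already in the data; output not shown in the original):
# Share of employees in each Status group paid below the median
emp_data %>%
  group_by(Status) %>%
  summarise(pct_below_median = mean(compa_level == "Below"))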
Predicting Attrition
Training and Testing Datasets: Splitting the Data
# Set seed - This is to ensure that our research is reproducible
set.seed(123)
# Store row numbers for training dataset
index_train <- createDataPartition(emp_data$Status, p = 0.7, list = FALSE)
# Create training dataset
train_set <- emp_data[index_train, ]
head(train_set)
## # A tibble: 6 × 19
## emp_ID Turnover Gender Career_Stage skill_complexity Age Location Base_Pay
## <chr> <fct> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 7238486 Active Male Sr Asso Normal 29 Gurgaon 1843520
## 2 6923304 Active Male Asso Complex 26 Bangalore 1372000
## 3 3982465 Active Male Asso Complex 28 Gurgaon 1427900
## 4 2936716 Active Male Sr Asso Complex 29 Bangalore 1521265
## 5 2264655 Active Male Asso Super Complex 25 Gurgaon 1177817
## 6 5138103 Active Female Sr Asso Complex 29 Gurgaon 1718131
## # ℹ 11 more variables: Is_promoted <chr>, Time_in_title <dbl>, Tenure <dbl>,
## # Last_merit_inc <dbl>, Talent_Profile <chr>, Engagement_Score <dbl>,
## # prod_utilization <dbl>, bench <dbl>, compa_ratio <dbl>, compa_level <chr>,
## # Status <fct>
# Create testing dataset
test_set <- emp_data[-index_train, ]
head(test_set)
## # A tibble: 6 × 19
## emp_ID Turnover Gender Career_Stage skill_complexity Age Location Base_Pay
## <chr> <fct> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 1036303 Active Male Asso Complex 25 Gurgaon 1231999
## 2 5984817 Active Male Sr Asso Normal 29 Gurgaon 1959760
## 3 2305469 Active Male Asso Normal 32 Gurgaon 707680
## 4 7890396 Active Male Sr Asso Complex 28 Gurgaon 1598724
## 5 1791423 Active Female Sr Asso Complex 29 Gurgaon 1924966
## 6 9304541 Active Male Sr Asso Normal 30 Gurgaon 1839240
## # ℹ 11 more variables: Is_promoted <chr>, Time_in_title <dbl>, Tenure <dbl>,
## # Last_merit_inc <dbl>, Talent_Profile <chr>, Engagement_Score <dbl>,
## # prod_utilization <dbl>, bench <dbl>, compa_ratio <dbl>, compa_level <chr>,
## # Status <fct>
Corroborate the splits
# Let's make sure both train_set and test_set have the same proportion of active
# and inactive employees.
# Calculate turnover proportion in train_set
train_set %>%
count(Status) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 × 3
## Status n prop
## <fct> <int> <dbl>
## 1 0 5179 0.672
## 2 1 2527 0.328
# Calculate Skill Complexity proportion in train_set
train_set %>%
count(skill_complexity) %>%
mutate(prop = n / sum(n))
## # A tibble: 3 × 3
## skill_complexity n prop
## <chr> <int> <dbl>
## 1 Complex 2997 0.389
## 2 Normal 4040 0.524
## 3 Super Complex 669 0.0868
# Calculate turnover proportion in test_set
test_set %>%
count(Status) %>%
mutate(prop = n / sum(n))
## # A tibble: 2 × 3
## Status n prop
## <fct> <int> <dbl>
## 1 0 2219 0.672
## 2 1 1082 0.328
# Calculate Skill Complexity proportion in test_set
test_set %>%
count(skill_complexity) %>%
mutate(prop = n / sum(n))
## # A tibble: 3 × 3
## skill_complexity n prop
## <chr> <int> <dbl>
## 1 Complex 1291 0.391
## 2 Normal 1727 0.523
## 3 Super Complex 283 0.0857
# Drop non-required variables and save the resulting object as train_set_multi
# We are dropping emp_ID: it is only the unique identifier
# We are dropping Turnover because we already have 'Status'
train_set_multi <- train_set %>%
select(-c(emp_ID, Turnover))
Developing different classification models and testing their accuracy
Logistic Regression
Logistic regression is a classification technique that predicts the
probability of occurrence of an event. The dependent variable is
binary/dichotomous (here, Status is either 0 or 1), while the independent
variables can be continuous or categorical (age, tenure, compensation,
etc.). A small illustration of the logit link follows.
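As a minimal illustration (not part of the original analysis), the logit link
maps a linear predictor eta = b0 + b1*x1 + ... to a probability
p = 1 / (1 + exp(-eta)); in R, plogis() computes exactly this:
eta <- 0.5 # an example value of the linear predictor
plogis(eta) # ~0.62, the implied probability of turnover
1 / (1 + exp(-eta)) # identical, written out explicitly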
multi_log_regression <- glm(Status ~ ., family = binomial(link = "logit"),
data = train_set_multi)
# Print summary
summary(multi_log_regression)
##
## Call:
## glm(formula = Status ~ ., family = binomial(link = "logit"),
## data = train_set_multi)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0362 -0.8715 -0.6051 1.0837 2.8340
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.523e+00 5.277e-01 10.465 < 2e-16 ***
## GenderMale 4.177e-01 6.250e-02 6.684 2.33e-11 ***
## Career_StageMgr 1.013e+00 1.926e-01 5.262 1.43e-07 ***
## Career_StageSr Asso 5.830e-01 9.561e-02 6.098 1.08e-09 ***
## skill_complexityNormal -4.525e-01 6.050e-02 -7.480 7.44e-14 ***
## skill_complexitySuper Complex -1.527e-01 1.045e-01 -1.462 0.143851
## Age -3.592e-02 9.557e-03 -3.758 0.000171 ***
## LocationGurgaon 2.302e-01 5.674e-02 4.056 4.99e-05 ***
## LocationMumbai 5.312e-01 3.156e-01 1.683 0.092334 .
## LocationNoida 2.603e-01 8.925e-02 2.917 0.003533 **
## Base_Pay -4.265e-07 1.022e-07 -4.175 2.98e-05 ***
## Is_promotedYes 5.985e-01 1.074e-01 5.572 2.52e-08 ***
## Time_in_title 3.160e-02 2.448e-03 12.909 < 2e-16 ***
## Tenure -1.804e-02 1.960e-03 -9.203 < 2e-16 ***
## Last_merit_inc -6.372e-03 3.517e-03 -1.812 0.070013 .
## Talent_ProfileTop Talent 1.452e-01 1.906e-01 0.762 0.445995
## Talent_ProfileValued Contributor -5.791e-01 1.756e-01 -3.297 0.000976 ***
## Engagement_Score -1.489e-01 3.478e-02 -4.282 1.85e-05 ***
## prod_utilization -8.161e-03 1.860e-03 -4.389 1.14e-05 ***
## bench 1.299e-02 2.819e-03 4.609 4.05e-06 ***
## compa_ratio -2.762e-02 2.670e-03 -10.345 < 2e-16 ***
## compa_levelBelow -9.898e-02 9.014e-02 -1.098 0.272185
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9751.2 on 7705 degrees of freedom
## Residual deviance: 8649.7 on 7684 degrees of freedom
## AIC: 8693.7
##
## Number of Fisher Scoring iterations: 4
Detecting and handling Multicollinearity in R
Notice that a few variables appear insignificant. In multiple regression
models this can happen due to multicollinearity, which occurs when one
independent variable is highly collinear with a set of two or more other
independent variables. For a predictor x_j, the variance inflation factor is
VIF_j = 1 / (1 - R_j^2), where R_j^2 comes from regressing x_j on the
remaining predictors; a hand-rolled example follows. Let us check
multicollinearity through the VIF (Variance Inflation Factor).
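A minimal sketch of that definition using a few numeric predictors
(illustrative only; the car::vif() call below generalizes this to factors via
the GVIF):
# R-squared from regressing Tenure on some of the other predictors
r2 <- summary(lm(Tenure ~ Age + Time_in_title + Base_Pay,
                 data = train_set_multi))$r.squared
1 / (1 - r2) # hand-rolled VIF for Tenure, numeric predictors only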
# Calculate VIF - Variance Inflation Factor
vif(multi_log_regression)
## GVIF Df GVIF^(1/(2*Df))
## Gender 1.082482 1 1.040424
## Career_Stage 4.875551 2 1.485956
## skill_complexity 1.434420 2 1.094382
## Age 3.297683 1 1.815952
## Location 1.077138 3 1.012462
## Base_Pay 6.966035 1 2.639325
## Is_promoted 3.642592 1 1.908558
## Time_in_title 4.124042 1 2.030774
## Tenure 7.346106 1 2.710370
## Last_merit_inc 1.491341 1 1.221205
## Talent_Profile 1.239627 2 1.055171
## Engagement_Score 1.011992 1 1.005978
## prod_utilization 1.772916 1 1.331509
## bench 1.812765 1 1.346390
## compa_ratio 4.537252 1 2.130083
## compa_level 2.918516 1 1.708367
How to interpret the VIF value:
VIF               Interpretation
1                 Not multicollinear
Between 1 and 5   Moderately multicollinear
Greater than 5    Highly multicollinear
STEP 1 - Compute the VIF.
STEP 2 - Check whether any variable has VIF > 5 in the first column of the
output.
STEP 2a - Remove the variable with VIF > 5 from the model.
STEP 2b - If multiple variables have VIF > 5, remove only the variable with
the highest VIF from the model.
STEP 3 - Repeat steps 1 and 2 until the VIF of each variable is < 5.
A sketch that automates this loop follows.
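The following sketch (not part of the original analysis) automates STEPS 1-3,
using the rule above of comparing the first GVIF column against 5; the helper
name prune_vif is hypothetical:
prune_vif <- function(data, response = "Status", threshold = 5) {
  predictors <- setdiff(names(data), response)
  repeat {
    model <- glm(reformulate(predictors, response),
                 family = binomial(link = "logit"), data = data)
    v <- car::vif(model)
    gvif <- if (is.matrix(v)) v[, "GVIF"] else v # vif() returns a matrix when any Df > 1
    if (max(gvif) < threshold) return(model)
    worst <- names(gvif)[which.max(gvif)] # predictor with the highest GVIF
    message("Dropping: ", worst)
    predictors <- setdiff(predictors, worst)
  }
}
# Usage: pruned_model <- prune_vif(train_set_multi)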
We will now remove the highly multicollinear variables from the model.
As identified by vif(multi_log_regression), Tenure has the highest VIF, so
let us remove it and re-check the model.
multi_log_regression_vif1 <- glm(Status ~ . - Tenure, family = binomial(link="logit"),
data = train_set_multi)
# Check for multicollinearity in this model
vif(multi_log_regression_vif1)
## GVIF Df GVIF^(1/(2*Df))
## Gender 1.081181 1 1.039799
## Career_Stage 4.701170 2 1.472487
## skill_complexity 1.429399 2 1.093423
## Age 3.279669 1 1.810986
## Location 1.075145 3 1.012149
## Base_Pay 6.851123 1 2.617465
## Is_promoted 1.623125 1 1.274019
## Time_in_title 1.779951 1 1.334148
## Last_merit_inc 1.464252 1 1.210063
## Talent_Profile 1.241350 2 1.055537
## Engagement_Score 1.011887 1 1.005926
## prod_utilization 1.765832 1 1.328846
## bench 1.804678 1 1.343383
## compa_ratio 4.525427 1 2.127305
## compa_level 2.905871 1 1.704661
Note that we can still observe multicollinearity for Base_Pay in the output.
Hence, remove Base_Pay (the variable with the highest remaining VIF) and
re-run the model.
multi_log_regression_vif2 <- glm(Status ~ . - Tenure - Base_Pay, family = binomial(link="logit"),
data = train_set_multi)
# Check for multicollinearity in this model
vif(multi_log_regression_vif2)
## GVIF Df GVIF^(1/(2*Df))
## Gender 1.061552 1 1.030317
## Career_Stage 2.256580 2 1.225639
## skill_complexity 1.255881 2 1.058613
## Age 3.109022 1 1.763242
## Location 1.074043 3 1.011976
## Is_promoted 1.610728 1 1.269145
## Time_in_title 1.779038 1 1.333806
## Last_merit_inc 1.449365 1 1.203896
## Talent_Profile 1.233087 2 1.053776
## Engagement_Score 1.011197 1 1.005583
## prod_utilization 1.761817 1 1.327335
## bench 1.803021 1 1.342766
## compa_ratio 3.139777 1 1.771942
## compa_level 2.905263 1 1.704483
Now all the GVIF values are < 5, so we can start building the final
logistic model.
LOGISTIC REGRESSION MODEL
Drop the variables Tenure and Base_Pay and save the resulting object as
'train_set_final'.
Let's first convert train_set_multi into a plain data frame so that any
grouping is removed and we do not get an error while removing fields from
the dataset.
train_set_final <- as.data.frame(train_set_multi) # convert train_set_multi to a plain data frame
train_set_final <- select(train_set_final, -c(Tenure, Base_Pay)) # drop Tenure and Base_Pay from the converted object
glimpse(train_set_final)
## Rows: 7,706
## Columns: 15
## $ Gender <chr> "Male", "Male", "Male", "Male", "Male", "Female", "Ma…
## $ Career_Stage <chr> "Sr Asso", "Asso", "Asso", "Sr Asso", "Asso", "Sr Ass…
## $ skill_complexity <chr> "Normal", "Complex", "Complex", "Complex", "Super Com…
## $ Age <dbl> 29, 26, 28, 29, 25, 29, 23, 28, 39, 29, 26, 46, 29, 3…
## $ Location <chr> "Gurgaon", "Bangalore", "Gurgaon", "Bangalore", "Gurg…
## $ Is_promoted <chr> "No", "No", "No", "No", "Yes", "No", "No", "Yes", "No…
## $ Time_in_title <dbl> 14.9, 3.2, 20.0, 13.1, 7.0, 36.8, 9.4, 7.0, 34.0, 2.5…
## $ Last_merit_inc <dbl> 10.0, 0.0, 9.0, 6.0, 40.0, 10.0, 0.0, 23.0, 11.0, 0.0…
## $ Talent_Profile <chr> "Valued Contributor", "Valued Contributor", "Valued C…
## $ Engagement_Score <dbl> 8.0, 8.0, 8.0, 9.4, 5.1, 7.1, 8.0, 8.0, 8.0, 8.0, 8.0…
## $ prod_utilization <dbl> 108, 85, 78, 89, 91, 69, 92, 83, 91, 83, 95, 94, 85, …
## $ bench <dbl> 0, 0, 20, 0, 0, 28, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ compa_ratio <dbl> 122.9, 142.0, 142.8, 125.7, 78.5, 114.5, 145.6, 88.5,…
## $ compa_level <chr> "Above", "Above", "Above", "Above", "Below", "Above",…
## $ Status <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
# Build the final logistic regression model
logistic_regression_final <- glm(Status ~ ., family = binomial("logit"),
data = train_set_final)
# Print summary
summary(logistic_regression_final)
##
## Call:
## glm(formula = Status ~ ., family = binomial("logit"), data = train_set_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1162 -0.8758 -0.6215 1.1053 2.7499
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.385189 0.507565 12.580 < 2e-16 ***
## GenderMale 0.399837 0.061458 6.506 7.72e-11 ***
## Career_StageMgr 0.284632 0.131393 2.166 0.030291 *
## Career_StageSr Asso 0.337219 0.073687 4.576 4.73e-06 ***
## skill_complexityNormal -0.403153 0.058968 -6.837 8.09e-12 ***
## skill_complexitySuper Complex -0.284322 0.100602 -2.826 0.004710 **
## Age -0.061527 0.009182 -6.701 2.07e-11 ***
## LocationGurgaon 0.216442 0.056349 3.841 0.000122 ***
## LocationMumbai 0.503605 0.314539 1.601 0.109357
## LocationNoida 0.219967 0.088374 2.489 0.012808 *
## Is_promotedYes -0.189104 0.070176 -2.695 0.007045 **
## Time_in_title 0.015515 0.001578 9.830 < 2e-16 ***
## Last_merit_inc -0.004873 0.003446 -1.414 0.157378
## Talent_ProfileTop Talent 0.071220 0.188931 0.377 0.706200
## Talent_ProfileValued Contributor -0.600172 0.174759 -3.434 0.000594 ***
## Engagement_Score -0.146019 0.034381 -4.247 2.17e-05 ***
## prod_utilization -0.008165 0.001843 -4.431 9.37e-06 ***
## bench 0.013169 0.002803 4.698 2.63e-06 ***
## compa_ratio -0.032915 0.002210 -14.895 < 2e-16 ***
## compa_levelBelow -0.088901 0.089260 -0.996 0.319259
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9751.2 on 7705 degrees of freedom
## Residual deviance: 8756.6 on 7686 degrees of freedom
## AIC: 8796.6
##
## Number of Fisher Scoring iterations: 4
Understanding Model Predictions: generate and validate predictions
# Make predictions for training dataset
logistic_prediction_train <- predict(logistic_regression_final, newdata = train_set_final,
type = "response")
# Look at the prediction range
hist(logistic_prediction_train)

# Make predictions for testing dataset
logistic_prediction_test <- predict(logistic_regression_final, newdata = test_set,
type = "response")
# Look at the prediction range
hist(logistic_prediction_test)

# Classify predictions using a cut-off of 0.55 (despite the '50' in the
# object name, 0.55 is the threshold actually applied)
pred_cutoff_50_test <- ifelse(logistic_prediction_test > 0.55, 1, 0)
## Creating confusion matrix
table(pred_cutoff_50_test, test_set$Status)
##
## pred_cutoff_50_test 0 1
## 0 2095 829
## 1 124 253
# Method to compute model accuracy
# Caution: confusionMatrix() on a table treats rows as predictions and columns
# as the reference, so passing (truth, prediction) here swaps the roles of
# sensitivity/specificity and PPV/NPV relative to the later models
conf_matrix_logistic <- confusionMatrix(table(test_set$Status, pred_cutoff_50_test))
conf_matrix_logistic
## Confusion Matrix and Statistics
##
## pred_cutoff_50_test
## 0 1
## 0 2095 124
## 1 829 253
##
## Accuracy : 0.7113
## 95% CI : (0.6955, 0.7267)
## No Information Rate : 0.8858
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2136
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7165
## Specificity : 0.6711
## Pos Pred Value : 0.9441
## Neg Pred Value : 0.2338
## Prevalence : 0.8858
## Detection Rate : 0.6347
## Detection Prevalence : 0.6722
## Balanced Accuracy : 0.6938
##
## 'Positive' Class : 0
##
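Since 0.55 is a somewhat arbitrary threshold, a quick sweep (a sketch; this
output is not part of the original analysis) shows how test accuracy responds
to the chosen cut-off:
cutoffs <- seq(0.30, 0.70, by = 0.05)
actual <- as.numeric(as.character(test_set$Status)) # factor back to 0/1
accuracy <- sapply(cutoffs, function(k) {
  mean(ifelse(logistic_prediction_test > k, 1, 0) == actual)
})
data.frame(cutoff = cutoffs, accuracy = accuracy)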
RANDOM FOREST MODEL
# Train a Random Forest model
random_forest_model <- randomForest(
  Status ~ .,
  data = train_set_final,
  ntree = 1000,
  mtry = floor(sqrt(ncol(train_set_final) - 1)) # mtry must be an integer
  # Note: randomForest() has no maxdepth argument; tree size is controlled
  # with nodesize or maxnodes instead
)
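# Optional diagnostic (a sketch; this output is not shown in the original):
# inspect which predictors the forest relies on most
importance(random_forest_model) # mean decrease in Gini per predictor
varImpPlot(random_forest_model) # the same information, plotted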
# Make predictions using the trained Random Forest model
prediction_random_forest <- predict(random_forest_model, newdata = test_set)
# Create confusion matrix
conf_matrix_random_forest <- confusionMatrix(prediction_random_forest, test_set$Status)
# Print the confusion matrix
print(conf_matrix_random_forest)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2048 498
## 1 171 584
##
## Accuracy : 0.7973
## 95% CI : (0.7832, 0.8109)
## No Information Rate : 0.6722
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5015
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9229
## Specificity : 0.5397
## Pos Pred Value : 0.8044
## Neg Pred Value : 0.7735
## Prevalence : 0.6722
## Detection Rate : 0.6204
## Detection Prevalence : 0.7713
## Balanced Accuracy : 0.7313
##
## 'Positive' Class : 0
##
SUPPORT VECTOR MACHINE (SVM) MODEL
# Train a Support Vector Machine (SVM) model
svm_model <- svm(
Status ~ .,
data = train_set_final,
kernel = "radial", # Kernel type (e.g., radial for Gaussian RBF)
cost = 1, # Cost parameter (C)
gamma = 0.1 # Gamma parameter (for radial kernel)
)
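# The cost and gamma above are common defaults rather than tuned values.
# A small grid search with e1071::tune.svm() is one way to pick them
# (a sketch; slow on ~7,700 rows, and its output is not shown here):
set.seed(123)
svm_tuned <- tune.svm(Status ~ ., data = train_set_final,
                      gamma = 10^(-2:0), cost = 10^(0:2))
summary(svm_tuned) # best (gamma, cost) by cross-validated error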
# Make predictions using the trained SVM model
prediction_svm <- predict(svm_model, newdata = test_set)
# Create confusion matrix for SVM
conf_matrix_svm <- confusionMatrix(prediction_svm, test_set$Status)
# Print the confusion matrix for SVM
print(conf_matrix_svm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2064 699
## 1 155 383
##
## Accuracy : 0.7413
## 95% CI : (0.726, 0.7562)
## No Information Rate : 0.6722
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.3261
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9301
## Specificity : 0.3540
## Pos Pred Value : 0.7470
## Neg Pred Value : 0.7119
## Prevalence : 0.6722
## Detection Rate : 0.6253
## Detection Prevalence : 0.8370
## Balanced Accuracy : 0.6421
##
## 'Positive' Class : 0
##
Calculate accuracies of three models
accuracy_logistic <- conf_matrix_logistic$overall["Accuracy"]
accuracy_random_forest <- conf_matrix_random_forest$overall["Accuracy"]
accuracy_svm <- conf_matrix_svm$overall["Accuracy"]
Create a data frame for plotting
accuracy_data <- data.frame(
Model = c("Logistic Regression", "Random Forest", "SVM"),
Accuracy = c(accuracy_logistic, accuracy_random_forest, accuracy_svm)
)
Create a bar plot
ggplot(accuracy_data, aes(x = Model, y = Accuracy, fill = Model)) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(formatC(Accuracy * 100, format = "f", digits = 2), "%")), vjust = -0.5, size = 3) + # Add data labels above bars
labs(title = "Model Accuracies",
x = "Model",
y = "Accuracy") +
theme_minimal()

Calculate Precision, Recall, and ROC-AUC
precision_logistic <- conf_matrix_logistic$byClass["Precision"]
recall_logistic <- conf_matrix_logistic$byClass["Recall"]
roc_auc_logistic <- roc(test_set$Status, as.numeric(logistic_prediction_test))$auc
precision_random_forest <- conf_matrix_random_forest$byClass["Precision"]
recall_random_forest <- conf_matrix_random_forest$byClass["Recall"]
# Note: for Random Forest and SVM the AUC below is computed from hard class
# labels rather than predicted probabilities, which understates the achievable AUC
roc_auc_random_forest <- roc(test_set$Status, as.numeric(prediction_random_forest))$auc
precision_svm <- conf_matrix_svm$byClass["Precision"]
recall_svm <- conf_matrix_svm$byClass["Recall"]
roc_auc_svm <- roc(test_set$Status, as.numeric(prediction_svm))$auc
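A probability-based AUC for the Random Forest would be fairer (a sketch;
predict(..., type = "prob") is provided by the randomForest package, and this
output is not part of the original analysis):
rf_prob <- predict(random_forest_model, newdata = test_set, type = "prob")[, "1"]
roc(test_set$Status, rf_prob)$auc # AUC from class-1 probabilities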
Create a data frame for plotting
metrics_data <- data.frame(
Model = rep(c("Logistic Regression", "Random Forest", "SVM"), each = 3),
Metric = rep(c("Precision", "Recall", "ROC-AUC"), times = 3),
Value = c(
precision_logistic, recall_logistic, roc_auc_logistic,
precision_random_forest, recall_random_forest, roc_auc_random_forest,
precision_svm, recall_svm, roc_auc_svm
)
)
Create a faceted bar plot with data labels
ggplot(metrics_data, aes(x = Model, y = Value, fill = Metric)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = paste0(round(Value * 100, 1), "%")),
position = position_dodge(width = 0.9),
vjust = -0.5) +
labs(title = "Model Evaluation Metrics",
x = "Model",
y = "Value") +
facet_wrap(vars(Metric), scales = "free_y") +
theme_minimal()

Analysing Final Outputs
Understanding the Outputs step by step (note that in the caret outputs above,
the 'Positive' class is 0, i.e. Active):
PRECISION:
The Logistic Regression model has the highest precision (94.4%),
indicating that when it predicts a positive outcome, it is likely to be
correct.
RECALL:
Random Forest and SVM have higher recall values (92.3% and 93.0%
respectively, per their confusion matrices), indicating that these models
capture a higher percentage of actual positive cases.
ROC-AUC (Area Under the Receiver Operating Characteristic curve):
The area under the ROC curve quantifies the overall performance of the
model. In our case, Random Forest has the highest ROC-AUC (73.3%),
suggesting that it provides the best balance between the true positive rate
and the false positive rate.
INTERPRETING THE VALUES
A) If we prioritize correctly identifying actual positive cases
(high recall), the Random Forest or SVM might be suitable choices.
B) If we prioritize minimizing false positives and having highly
accurate positive predictions (high precision), the Logistic Regression
model might be preferred.
C) If we want our model to provide a good balance between true
positive rate and false positive rate, the Random Forest model with the
highest ROC-AUC might be a consideration.
OUR CHOICE
We should select Random Forest as the model of choice to get the
best balance in true positive and false positive rates.