Load library

library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(pscl)
## Warning: package 'pscl' was built under R version 4.5.3
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.

Deskripsi Data

data <- read_csv("D:/Semester 4/ANMUL_MODUL 4/Employee Attrition.csv")
## Rows: 15787 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): dept, salary
## dbl (8): Emp ID, satisfaction_level, last_evaluation, number_project, averag...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data)
## # A tibble: 6 × 10
##   `Emp ID` satisfaction_level last_evaluation number_project
##      <dbl>              <dbl>           <dbl>          <dbl>
## 1        1               0.38            0.53              2
## 2        2               0.8             0.86              5
## 3        3               0.11            0.88              7
## 4        4               0.72            0.87              5
## 5        5               0.37            0.52              2
## 6        6               0.41            0.5               2
## # ℹ 6 more variables: average_montly_hours <dbl>, time_spend_company <dbl>,
## #   Work_accident <dbl>, promotion_last_5years <dbl>, dept <chr>, salary <chr>
str(data)
## spc_tbl_ [15,787 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Emp ID               : num [1:15787] 1 2 3 4 5 6 7 8 9 10 ...
##  $ satisfaction_level   : num [1:15787] 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num [1:15787] 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : num [1:15787] 2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : num [1:15787] 157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : num [1:15787] 3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : num [1:15787] 0 0 0 0 0 0 0 0 0 0 ...
##  $ promotion_last_5years: num [1:15787] 0 0 0 0 0 0 0 0 0 0 ...
##  $ dept                 : chr [1:15787] "sales" "sales" "sales" "sales" ...
##  $ salary               : chr [1:15787] "low" "medium" "medium" "low" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Emp ID` = col_double(),
##   ..   satisfaction_level = col_double(),
##   ..   last_evaluation = col_double(),
##   ..   number_project = col_double(),
##   ..   average_montly_hours = col_double(),
##   ..   time_spend_company = col_double(),
##   ..   Work_accident = col_double(),
##   ..   promotion_last_5years = col_double(),
##   ..   dept = col_character(),
##   ..   salary = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
summary(data)
##      Emp ID      satisfaction_level last_evaluation  number_project 
##  Min.   :    1   Min.   :0.0900     Min.   :0.3600   Min.   :2.000  
##  1st Qu.: 3750   1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000  
##  Median : 7500   Median :0.6400     Median :0.7200   Median :4.000  
##  Mean   : 7500   Mean   :0.6128     Mean   :0.7161   Mean   :3.803  
##  3rd Qu.:11250   3rd Qu.:0.8200     3rd Qu.:0.8700   3rd Qu.:5.000  
##  Max.   :14999   Max.   :1.0000     Max.   :1.0000   Max.   :7.000  
##  NA's   :788     NA's   :788        NA's   :788      NA's   :788    
##  average_montly_hours time_spend_company Work_accident    promotion_last_5years
##  Min.   : 96.0        Min.   : 2.000     Min.   :0.0000   Min.   :0.00000      
##  1st Qu.:156.0        1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.00000      
##  Median :200.0        Median : 3.000     Median :0.0000   Median :0.00000      
##  Mean   :201.1        Mean   : 3.498     Mean   :0.1446   Mean   :0.02127      
##  3rd Qu.:245.0        3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :310.0        Max.   :10.000     Max.   :1.0000   Max.   :1.00000      
##  NA's   :788          NA's   :788        NA's   :788      NA's   :788          
##      dept              salary         
##  Length:15787       Length:15787      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 
colSums(is.na(data))
##                Emp ID    satisfaction_level       last_evaluation 
##                   788                   788                   788 
##        number_project  average_montly_hours    time_spend_company 
##                   788                   788                   788 
##         Work_accident promotion_last_5years                  dept 
##                   788                   788                   788 
##                salary 
##                   788

Analisis Data

data <- na.omit(data)

colSums(is.na(data))
##                Emp ID    satisfaction_level       last_evaluation 
##                     0                     0                     0 
##        number_project  average_montly_hours    time_spend_company 
##                     0                     0                     0 
##         Work_accident promotion_last_5years                  dept 
##                     0                     0                     0 
##                salary 
##                     0

Statistika Deskriptif

hist(data$satisfaction_level, main="Distribusi Kepuasan", xlab="Satisfaction")

hist(data$number_project, main="Jumlah Proyek", xlab="Project")

hist(data$average_montly_hours, main="Jam Kerja", xlab="Hours")

boxplot(data$satisfaction_level, main="Outlier Satisfaction")

boxplot(data$average_montly_hours, main="Outlier Jam Kerja")

boxplot(data$number_project, main="Outlier Project")

barplot(table(data$salary), main="Distribusi Gaji")

barplot(table(data$dept), main="Distribusi Departemen")

barplot(table(data$promotion_last_5years), main="Promosi")

boxplot(satisfaction_level ~ salary, data=data, main="Satisfaction vs Salary")

boxplot(satisfaction_level ~ promotion_last_5years, data=data, main="Satisfaction vs Promotion")

boxplot(satisfaction_level ~ Work_accident, data=data, main="Satisfaction vs Work Accident")

summary(data)
##      Emp ID      satisfaction_level last_evaluation  number_project 
##  Min.   :    1   Min.   :0.0900     Min.   :0.3600   Min.   :2.000  
##  1st Qu.: 3750   1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000  
##  Median : 7500   Median :0.6400     Median :0.7200   Median :4.000  
##  Mean   : 7500   Mean   :0.6128     Mean   :0.7161   Mean   :3.803  
##  3rd Qu.:11250   3rd Qu.:0.8200     3rd Qu.:0.8700   3rd Qu.:5.000  
##  Max.   :14999   Max.   :1.0000     Max.   :1.0000   Max.   :7.000  
##  average_montly_hours time_spend_company Work_accident    promotion_last_5years
##  Min.   : 96.0        Min.   : 2.000     Min.   :0.0000   Min.   :0.00000      
##  1st Qu.:156.0        1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.00000      
##  Median :200.0        Median : 3.000     Median :0.0000   Median :0.00000      
##  Mean   :201.1        Mean   : 3.498     Mean   :0.1446   Mean   :0.02127      
##  3rd Qu.:245.0        3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :310.0        Max.   :10.000     Max.   :1.0000   Max.   :1.00000      
##      dept              salary         
##  Length:14999       Length:14999      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
dim(data)
## [1] 14999    10

Pembentukan Variabel Ordinal

data$satisfaction_cat <- cut(
  data$satisfaction_level,
  breaks = quantile(data$satisfaction_level, probs = c(0, 0.33, 0.66, 1)),
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)

data$satisfaction_cat <- factor(
  data$satisfaction_cat,
  levels = c("Low","Medium","High"),
  ordered = TRUE
)

table(data$satisfaction_cat)
## 
##    Low Medium   High 
##   4999   5066   4934
data$promotion_last_5years <- factor(data$promotion_last_5years)

data$salary <- factor(
  data$salary,
  levels = c("low","medium","high"),
  ordered = TRUE
)

Uji Independensi

chisq.test(table(data$satisfaction_cat, data$salary))
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$satisfaction_cat, data$salary)
## X-squared = 72.28, df = 4, p-value = 7.49e-15
chisq.test(table(data$satisfaction_cat, data$promotion_last_5years))
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$satisfaction_cat, data$promotion_last_5years)
## X-squared = 10.886, df = 2, p-value = 0.004326
chisq.test(table(data$satisfaction_cat, data$number_project))
## 
##  Pearson's Chi-squared test
## 
## data:  table(data$satisfaction_cat, data$number_project)
## X-squared = 5058.7, df = 10, p-value < 2.2e-16

Model Regresi Logistik Ordinal

model <- polr(
  satisfaction_cat ~ 
    number_project +
    average_montly_hours +
    time_spend_company +
    promotion_last_5years +
    salary,
  data = data,
  Hess = TRUE
)

summary(model)
## Call:
## polr(formula = satisfaction_cat ~ number_project + average_montly_hours + 
##     time_spend_company + promotion_last_5years + salary, data = data, 
##     Hess = TRUE)
## 
## Coefficients:
##                            Value Std. Error t value
## number_project          0.009999  0.0140636  0.7110
## average_montly_hours    0.004794  0.0003344 14.3355
## time_spend_company     -0.069873  0.0104694 -6.6740
## promotion_last_5years1  0.307199  0.1038422  2.9583
## salary.L                0.163821  0.0395886  4.1381
## salary.Q               -0.025003  0.0294278 -0.8496
## 
## Intercepts:
##             Value   Std. Error t value
## Low|Medium   0.0191  0.0762     0.2503
## Medium|High  1.4490  0.0772    18.7672
## 
## Residual Deviance: 32646.74 
## AIC: 32662.74

Pengujian Signifikansi Parameter

a. Uji Serentak

model_null <- polr(satisfaction_cat ~ 1, data = data)

anova(model_null, model)
## Likelihood ratio tests of ordinal regression models
## 
## Response: satisfaction_cat
##                                                                                         Model
## 1                                                                                           1
## 2 number_project + average_montly_hours + time_spend_company + promotion_last_5years + salary
##   Resid. df Resid. Dev   Test    Df LR stat. Pr(Chi)
## 1     14997   32954.43                              
## 2     14991   32646.74 1 vs 2     6 307.6848       0

b. Uji Parsial

coef_table <- coef(summary(model))

p_values <- pnorm(abs(coef_table[, "t value"]), lower.tail = FALSE) * 2

coef_table <- cbind(coef_table, "p value" = p_values)

coef_table
##                               Value   Std. Error    t value      p value
## number_project          0.009999052 0.0140636010  0.7109880 4.770917e-01
## average_montly_hours    0.004793702 0.0003343928 14.3355411 1.312198e-46
## time_spend_company     -0.069872563 0.0104694339 -6.6739581 2.489947e-11
## promotion_last_5years1  0.307198535 0.1038422425  2.9583195 3.093213e-03
## salary.L                0.163821425 0.0395885524  4.1381009 3.501923e-05
## salary.Q               -0.025002554 0.0294278391 -0.8496225 3.955350e-01
## Low|Medium              0.019073304 0.0762117164  0.2502673 8.023806e-01
## Medium|High             1.448954921 0.0772066039 18.7672407 1.399691e-78

Odds Ratio

exp(coef(model))
##         number_project   average_montly_hours     time_spend_company 
##              1.0100492              1.0048052              0.9325126 
## promotion_last_5years1               salary.L               salary.Q 
##              1.3596109              1.1780039              0.9753074

Uji Multikolinearitas

vif(lm(
  as.numeric(satisfaction_cat) ~ 
    number_project +
    average_montly_hours +
    time_spend_company +
    promotion_last_5years +
    salary,
  data = data
))
##                           GVIF Df GVIF^(1/(2*Df))
## number_project        1.242729  1        1.114778
## average_montly_hours  1.214001  1        1.101817
## time_spend_company    1.050325  1        1.024854
## promotion_last_5years 1.014353  1        1.007151
## salary                1.012160  2        1.003026

Uji Kebaikan Model

pR2(model)
## fitting null model for pseudo-r2
##           llh       llhNull            G2      McFadden          r2ML 
## -1.632337e+04 -1.647721e+04  3.076848e+02  9.336677e-03  2.030472e-02 
##          r2CU 
##  2.284314e-02

Evaluasi Model

# Prediksi
pred <- predict(model, data)

head(pred)
## [1] Low  High High Low  Low  Low 
## Levels: Low Medium High
# Confusion matrix
table(
  Actual = data$satisfaction_cat,
  Predicted = pred
)
##         Predicted
## Actual    Low Medium High
##   Low    2834    430 1735
##   Medium 1996    882 2188
##   High   1760   1006 2168
# Akurasi
accuracy <- mean(as.character(pred) == as.character(data$satisfaction_cat))
accuracy
## [1] 0.3922928