Load library
library(readr)
## Warning: package 'readr' was built under R version 4.5.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(car)
## Warning: package 'car' was built under R version 4.5.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.5.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(pscl)
## Warning: package 'pscl' was built under R version 4.5.3
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
Deskripsi Data
data <- read_csv("D:/Semester 4/ANMUL_MODUL 4/Employee Attrition.csv")
## Rows: 15787 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): dept, salary
## dbl (8): Emp ID, satisfaction_level, last_evaluation, number_project, averag...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(data)
## # A tibble: 6 × 10
## `Emp ID` satisfaction_level last_evaluation number_project
## <dbl> <dbl> <dbl> <dbl>
## 1 1 0.38 0.53 2
## 2 2 0.8 0.86 5
## 3 3 0.11 0.88 7
## 4 4 0.72 0.87 5
## 5 5 0.37 0.52 2
## 6 6 0.41 0.5 2
## # ℹ 6 more variables: average_montly_hours <dbl>, time_spend_company <dbl>,
## # Work_accident <dbl>, promotion_last_5years <dbl>, dept <chr>, salary <chr>
str(data)
## spc_tbl_ [15,787 × 10] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Emp ID : num [1:15787] 1 2 3 4 5 6 7 8 9 10 ...
## $ satisfaction_level : num [1:15787] 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num [1:15787] 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : num [1:15787] 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : num [1:15787] 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : num [1:15787] 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : num [1:15787] 0 0 0 0 0 0 0 0 0 0 ...
## $ promotion_last_5years: num [1:15787] 0 0 0 0 0 0 0 0 0 0 ...
## $ dept : chr [1:15787] "sales" "sales" "sales" "sales" ...
## $ salary : chr [1:15787] "low" "medium" "medium" "low" ...
## - attr(*, "spec")=
## .. cols(
## .. `Emp ID` = col_double(),
## .. satisfaction_level = col_double(),
## .. last_evaluation = col_double(),
## .. number_project = col_double(),
## .. average_montly_hours = col_double(),
## .. time_spend_company = col_double(),
## .. Work_accident = col_double(),
## .. promotion_last_5years = col_double(),
## .. dept = col_character(),
## .. salary = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
summary(data)
## Emp ID satisfaction_level last_evaluation number_project
## Min. : 1 Min. :0.0900 Min. :0.3600 Min. :2.000
## 1st Qu.: 3750 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000
## Median : 7500 Median :0.6400 Median :0.7200 Median :4.000
## Mean : 7500 Mean :0.6128 Mean :0.7161 Mean :3.803
## 3rd Qu.:11250 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000
## Max. :14999 Max. :1.0000 Max. :1.0000 Max. :7.000
## NA's :788 NA's :788 NA's :788 NA's :788
## average_montly_hours time_spend_company Work_accident promotion_last_5years
## Min. : 96.0 Min. : 2.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:156.0 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :200.0 Median : 3.000 Median :0.0000 Median :0.00000
## Mean :201.1 Mean : 3.498 Mean :0.1446 Mean :0.02127
## 3rd Qu.:245.0 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :310.0 Max. :10.000 Max. :1.0000 Max. :1.00000
## NA's :788 NA's :788 NA's :788 NA's :788
## dept salary
## Length:15787 Length:15787
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
colSums(is.na(data))
## Emp ID satisfaction_level last_evaluation
## 788 788 788
## number_project average_montly_hours time_spend_company
## 788 788 788
## Work_accident promotion_last_5years dept
## 788 788 788
## salary
## 788
Analisis Data
data <- na.omit(data)
colSums(is.na(data))
## Emp ID satisfaction_level last_evaluation
## 0 0 0
## number_project average_montly_hours time_spend_company
## 0 0 0
## Work_accident promotion_last_5years dept
## 0 0 0
## salary
## 0
Statistika Deskriptif
hist(data$satisfaction_level, main="Distribusi Kepuasan", xlab="Satisfaction")

hist(data$number_project, main="Jumlah Proyek", xlab="Project")

hist(data$average_montly_hours, main="Jam Kerja", xlab="Hours")

boxplot(data$satisfaction_level, main="Outlier Satisfaction")

boxplot(data$average_montly_hours, main="Outlier Jam Kerja")

boxplot(data$number_project, main="Outlier Project")

barplot(table(data$salary), main="Distribusi Gaji")

barplot(table(data$dept), main="Distribusi Departemen")

barplot(table(data$promotion_last_5years), main="Promosi")

boxplot(satisfaction_level ~ salary, data=data, main="Satisfaction vs Salary")

boxplot(satisfaction_level ~ promotion_last_5years, data=data, main="Satisfaction vs Promotion")

boxplot(satisfaction_level ~ Work_accident, data=data, main="Satisfaction vs Work Accident")

summary(data)
## Emp ID satisfaction_level last_evaluation number_project
## Min. : 1 Min. :0.0900 Min. :0.3600 Min. :2.000
## 1st Qu.: 3750 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000
## Median : 7500 Median :0.6400 Median :0.7200 Median :4.000
## Mean : 7500 Mean :0.6128 Mean :0.7161 Mean :3.803
## 3rd Qu.:11250 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000
## Max. :14999 Max. :1.0000 Max. :1.0000 Max. :7.000
## average_montly_hours time_spend_company Work_accident promotion_last_5years
## Min. : 96.0 Min. : 2.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:156.0 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :200.0 Median : 3.000 Median :0.0000 Median :0.00000
## Mean :201.1 Mean : 3.498 Mean :0.1446 Mean :0.02127
## 3rd Qu.:245.0 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :310.0 Max. :10.000 Max. :1.0000 Max. :1.00000
## dept salary
## Length:14999 Length:14999
## Class :character Class :character
## Mode :character Mode :character
##
##
##
dim(data)
## [1] 14999 10
Pembentukan Variabel Ordinal
data$satisfaction_cat <- cut(
data$satisfaction_level,
breaks = quantile(data$satisfaction_level, probs = c(0, 0.33, 0.66, 1)),
labels = c("Low", "Medium", "High"),
include.lowest = TRUE
)
data$satisfaction_cat <- factor(
data$satisfaction_cat,
levels = c("Low","Medium","High"),
ordered = TRUE
)
table(data$satisfaction_cat)
##
## Low Medium High
## 4999 5066 4934
data$promotion_last_5years <- factor(data$promotion_last_5years)
data$salary <- factor(
data$salary,
levels = c("low","medium","high"),
ordered = TRUE
)
Uji Independensi
chisq.test(table(data$satisfaction_cat, data$salary))
##
## Pearson's Chi-squared test
##
## data: table(data$satisfaction_cat, data$salary)
## X-squared = 72.28, df = 4, p-value = 7.49e-15
chisq.test(table(data$satisfaction_cat, data$promotion_last_5years))
##
## Pearson's Chi-squared test
##
## data: table(data$satisfaction_cat, data$promotion_last_5years)
## X-squared = 10.886, df = 2, p-value = 0.004326
chisq.test(table(data$satisfaction_cat, data$number_project))
##
## Pearson's Chi-squared test
##
## data: table(data$satisfaction_cat, data$number_project)
## X-squared = 5058.7, df = 10, p-value < 2.2e-16
Model Regresi Logistik Ordinal
model <- polr(
satisfaction_cat ~
number_project +
average_montly_hours +
time_spend_company +
promotion_last_5years +
salary,
data = data,
Hess = TRUE
)
summary(model)
## Call:
## polr(formula = satisfaction_cat ~ number_project + average_montly_hours +
## time_spend_company + promotion_last_5years + salary, data = data,
## Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## number_project 0.009999 0.0140636 0.7110
## average_montly_hours 0.004794 0.0003344 14.3355
## time_spend_company -0.069873 0.0104694 -6.6740
## promotion_last_5years1 0.307199 0.1038422 2.9583
## salary.L 0.163821 0.0395886 4.1381
## salary.Q -0.025003 0.0294278 -0.8496
##
## Intercepts:
## Value Std. Error t value
## Low|Medium 0.0191 0.0762 0.2503
## Medium|High 1.4490 0.0772 18.7672
##
## Residual Deviance: 32646.74
## AIC: 32662.74
Pengujian Signifikansi Parameter
a. Uji Serentak
model_null <- polr(satisfaction_cat ~ 1, data = data)
anova(model_null, model)
## Likelihood ratio tests of ordinal regression models
##
## Response: satisfaction_cat
## Model
## 1 1
## 2 number_project + average_montly_hours + time_spend_company + promotion_last_5years + salary
## Resid. df Resid. Dev Test Df LR stat. Pr(Chi)
## 1 14997 32954.43
## 2 14991 32646.74 1 vs 2 6 307.6848 0
b. Uji Parsial
coef_table <- coef(summary(model))
p_values <- pnorm(abs(coef_table[, "t value"]), lower.tail = FALSE) * 2
coef_table <- cbind(coef_table, "p value" = p_values)
coef_table
## Value Std. Error t value p value
## number_project 0.009999052 0.0140636010 0.7109880 4.770917e-01
## average_montly_hours 0.004793702 0.0003343928 14.3355411 1.312198e-46
## time_spend_company -0.069872563 0.0104694339 -6.6739581 2.489947e-11
## promotion_last_5years1 0.307198535 0.1038422425 2.9583195 3.093213e-03
## salary.L 0.163821425 0.0395885524 4.1381009 3.501923e-05
## salary.Q -0.025002554 0.0294278391 -0.8496225 3.955350e-01
## Low|Medium 0.019073304 0.0762117164 0.2502673 8.023806e-01
## Medium|High 1.448954921 0.0772066039 18.7672407 1.399691e-78
Odds Ratio
exp(coef(model))
## number_project average_montly_hours time_spend_company
## 1.0100492 1.0048052 0.9325126
## promotion_last_5years1 salary.L salary.Q
## 1.3596109 1.1780039 0.9753074
Uji Multikolinearitas
vif(lm(
as.numeric(satisfaction_cat) ~
number_project +
average_montly_hours +
time_spend_company +
promotion_last_5years +
salary,
data = data
))
## GVIF Df GVIF^(1/(2*Df))
## number_project 1.242729 1 1.114778
## average_montly_hours 1.214001 1 1.101817
## time_spend_company 1.050325 1 1.024854
## promotion_last_5years 1.014353 1 1.007151
## salary 1.012160 2 1.003026
Uji Kebaikan Model
pR2(model)
## fitting null model for pseudo-r2
## llh llhNull G2 McFadden r2ML
## -1.632337e+04 -1.647721e+04 3.076848e+02 9.336677e-03 2.030472e-02
## r2CU
## 2.284314e-02
Evaluasi Model
# Prediksi
pred <- predict(model, data)
head(pred)
## [1] Low High High Low Low Low
## Levels: Low Medium High
# Confusion matrix
table(
Actual = data$satisfaction_cat,
Predicted = pred
)
## Predicted
## Actual Low Medium High
## Low 2834 430 1735
## Medium 1996 882 2188
## High 1760 1006 2168
# Akurasi
accuracy <- mean(as.character(pred) == as.character(data$satisfaction_cat))
accuracy
## [1] 0.3922928