# 1. Import data CSV
df <- read.csv("HR-Employee-Attrition.csv", header = TRUE, stringsAsFactors = FALSE)
# 2. Lihat struktur data
str(df)
## 'data.frame': 1476 obs. of 35 variables:
## $ Age : chr "41" "49" "37" "33" ...
## $ Attrition : chr "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : int 1102 279 1373 1392 591 1005 1324 1358 216 1299 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : int 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : int 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : int 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr "Female" "Male" "Male" "Female" ...
## $ HourlyRate : int 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : int 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : int 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : int 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : int 5993 5130 2090 2909 3468 3068 2670 2693 9526 5237 ...
## $ MonthlyRate : int 19479 24907 2396 23159 16632 11864 9964 13335 8787 16577 ...
## $ NumCompaniesWorked : int 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : int 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : int 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: int 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : int 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : int 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : int 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : int 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : int 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : int 5 7 0 0 2 6 0 0 8 7 ...
# 4. Lihat 6 baris pertama
head(df)
## Age Attrition BusinessTravel DailyRate Department
## 1 41 Yes Travel_Rarely 1102 Sales
## 2 49 No Travel_Frequently 279 Research & Development
## 3 37 Yes Travel_Rarely 1373 Research & Development
## 4 33 No Travel_Frequently 1392 Research & Development
## 5 27 No Travel_Rarely 591 Research & Development
## 6 32 No Travel_Frequently 1005 Research & Development
## DistanceFromHome Education EducationField EmployeeCount EmployeeNumber
## 1 1 2 Life Sciences 1 1
## 2 8 1 Life Sciences 1 2
## 3 2 2 Other 1 4
## 4 3 4 Life Sciences 1 5
## 5 2 1 Medical 1 7
## 6 2 2 Life Sciences 1 8
## EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel
## 1 2 Female 94 3 2
## 2 3 Male 61 2 2
## 3 4 Male 92 2 1
## 4 4 Female 56 3 1
## 5 1 Male 40 3 1
## 6 4 Male 79 3 1
## JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## 1 Sales Executive 4 Single 5993 19479
## 2 Research Scientist 2 Married 5130 24907
## 3 Laboratory Technician 3 Single 2090 2396
## 4 Research Scientist 3 Married 2909 23159
## 5 Laboratory Technician 2 Married 3468 16632
## 6 Laboratory Technician 4 Single 3068 11864
## NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating
## 1 8 Y Yes 11 3
## 2 1 Y No 23 4
## 3 6 Y Yes 15 3
## 4 1 Y Yes 11 3
## 5 9 Y No 12 3
## 6 0 Y No 13 3
## RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears
## 1 1 80 0 8
## 2 4 80 1 10
## 3 2 80 0 7
## 4 3 80 0 8
## 5 4 80 1 6
## 6 3 80 0 8
## TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole
## 1 0 1 6 4
## 2 3 3 10 7
## 3 3 3 0 0
## 4 3 3 8 7
## 5 3 3 2 2
## 6 2 2 7 7
## YearsSinceLastPromotion YearsWithCurrManager
## 1 0 5
## 2 1 7
## 3 0 0
## 4 3 0
## 5 2 2
## 6 3 6
# 5. Cek dimensi data
dim(df)
## [1] 1476 35
# 6. Cek nama variabel
colnames(df)
## [1] "Age" "Attrition"
## [3] "BusinessTravel" "DailyRate"
## [5] "Department" "DistanceFromHome"
## [7] "Education" "EducationField"
## [9] "EmployeeCount" "EmployeeNumber"
## [11] "EnvironmentSatisfaction" "Gender"
## [13] "HourlyRate" "JobInvolvement"
## [15] "JobLevel" "JobRole"
## [17] "JobSatisfaction" "MaritalStatus"
## [19] "MonthlyIncome" "MonthlyRate"
## [21] "NumCompaniesWorked" "Over18"
## [23] "OverTime" "PercentSalaryHike"
## [25] "PerformanceRating" "RelationshipSatisfaction"
## [27] "StandardHours" "StockOptionLevel"
## [29] "TotalWorkingYears" "TrainingTimesLastYear"
## [31] "WorkLifeBalance" "YearsAtCompany"
## [33] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [35] "YearsWithCurrManager"
# Ambil kolom penting
df2 <- df[, c("YearsAtCompany", "Attrition", "Gender", "Department", "MaritalStatus","BusinessTravel", "OverTime", "PerformanceRating" )]
head(df2)
## YearsAtCompany Attrition Gender Department MaritalStatus
## 1 6 Yes Female Sales Single
## 2 10 No Male Research & Development Married
## 3 0 Yes Male Research & Development Single
## 4 8 No Female Research & Development Married
## 5 2 No Male Research & Development Married
## 6 7 No Male Research & Development Single
## BusinessTravel OverTime PerformanceRating
## 1 Travel_Rarely Yes 3
## 2 Travel_Frequently No 4
## 3 Travel_Rarely Yes 3
## 4 Travel_Frequently Yes 3
## 5 Travel_Rarely No 3
## 6 Travel_Frequently No 3
str(df2)
## 'data.frame': 1476 obs. of 8 variables:
## $ YearsAtCompany : int 6 10 0 8 2 7 1 1 9 7 ...
## $ Attrition : chr "Yes" "No" "Yes" "No" ...
## $ Gender : chr "Female" "Male" "Male" "Female" ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ MaritalStatus : chr "Single" "Married" "Single" "Married" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ OverTime : chr "Yes" "No" "Yes" "Yes" ...
## $ PerformanceRating: int 3 4 3 3 3 3 4 4 4 3 ...
#ubah status Event ke Biner
df2$status <- ifelse(df2$Attrition == "Yes", 1, 0)
#Cek missing value
colSums(is.na(df2))
## YearsAtCompany Attrition Gender Department
## 6 0 0 0
## MaritalStatus BusinessTravel OverTime PerformanceRating
## 0 0 0 6
## status
## 0
#Menangani missing value
df_clean <- df2[!is.na(df2$YearsAtCompany), ]
colSums(is.na(df_clean))
## YearsAtCompany Attrition Gender Department
## 0 0 0 0
## MaritalStatus BusinessTravel OverTime PerformanceRating
## 0 0 0 0
## status
## 0
Pada tahap ini dilakukan pendefinisian variabel survival yang terdiri atas waktu kejadian (time) dan status kejadian (event). Variabel YearsAtCompany digunakan sebagai waktu survival, sedangkan Attrition digunakan sebagai indikator kejadian, di mana nilai 1 menunjukkan karyawan keluar (event) dan 0 menunjukkan data tersensor (karyawan masih bekerja).
#Membuat variabel status (event)
df_clean$status <- ifelse(df_clean$Attrition == "Yes", 1, 0)
table(df_clean$status)
##
## 0 1
## 1233 237
Artinya: 1233 karyawan sensor (masih kerja) dan 237 karyawan event (keluar)
#Membuat objek survival
library(survival)
surv_object <- Surv(time = df_clean$YearsAtCompany,
event = df_clean$status)
head(surv_object)
## [1] 6 10+ 0 8+ 2+ 7+
# Memastikan tipe data grouping untuk perbandingan Gender
df_clean$Gender <- factor(df_clean$Gender)
levels(df_clean$Gender)
## [1] "Female" "Male"
# Memastikan tipe data grouping untuk perbandingan Department
df_clean$Department <- factor(df_clean$Department)
levels(df_clean$Department)
## [1] "Human Resources" "Research & Development" "Sales"
# Memastikan tipe data grouping untuk perbandingan Marital Status
df_clean$MaritalStatus <- factor(df_clean$MaritalStatus)
levels(df_clean$MaritalStatus)
## [1] "Divorced" "Married" "Single"
# Memastikan tipe data grouping untuk perbandingan BusinessTravel
df_clean$BusinessTravel <- factor(df_clean$BusinessTravel)
levels(df_clean$ BusinessTravel)
## [1] "Non-Travel" "Travel_Frequently" "Travel_Rarely"
# Memastikan tipe data grouping untuk perbandingan OverTime
df_clean$ OverTime <- factor(df_clean$OverTime)
levels(df_clean$ OverTime)
## [1] "No" "Yes"
# Memastikan tipe data grouping untuk perbandingan PerformanceRating
df_clean$PerformanceRating <- factor(df_clean$PerformanceRating )
levels(df_clean$PerformanceRating )
## [1] "3" "4"
library(survival)
fit_interval <- survfit(
Surv(YearsAtCompany, status) ~ 1,
data = df_clean
)
life_table_5yr <- summary(
fit_interval,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ 1, data = df_clean)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1470 16 0.989 0.00271 0.984 0.994
## 5 890 146 0.873 0.00947 0.855 0.892
## 10 366 55 0.777 0.01516 0.748 0.807
## 15 158 7 0.749 0.01789 0.715 0.785
## 20 93 5 0.717 0.02233 0.674 0.762
## 25 29 4 0.654 0.03686 0.586 0.731
## 30 17 0 0.654 0.03686 0.586 0.731
## 35 4 3 0.510 0.08037 0.374 0.694
## 40 1 1 0.000 NaN NA NA
# Gender
library(survival)
fit_gender <- survfit(
Surv(YearsAtCompany, status) ~ Gender,
data = df_clean
)
life_table_gender_5yr <- summary(
fit_gender,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_gender_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ Gender, data = df_clean)
##
## Gender=Female
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 588 4 0.993 0.00339 0.987 1.000
## 5 374 52 0.891 0.01398 0.864 0.918
## 10 149 22 0.797 0.02314 0.753 0.844
## 15 72 3 0.770 0.02710 0.719 0.825
## 20 38 1 0.757 0.02991 0.700 0.818
## 25 9 2 0.650 0.07665 0.516 0.819
## 30 3 0 0.650 0.07665 0.516 0.819
## 35 1 2 0.217 0.17864 0.043 1.000
## 40 1 1 0.000 NaN NA NA
##
## Gender=Male
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 882 12 0.986 0.0039 0.979 0.994
## 5 516 94 0.861 0.0127 0.836 0.886
## 10 217 33 0.763 0.0200 0.725 0.804
## 15 86 4 0.735 0.0239 0.689 0.783
## 20 55 4 0.689 0.0315 0.630 0.754
## 25 20 2 0.647 0.0420 0.570 0.735
## 30 14 0 0.647 0.0420 0.570 0.735
## 35 3 1 0.597 0.0615 0.488 0.731
# Department
library(survival)
fit_Department <- survfit(
Surv(YearsAtCompany, status) ~ Department,
data = df_clean
)
life_table_department_5yr <- summary(
fit_Department,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_department_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ Department,
## data = df_clean)
##
## Department=Human Resources
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 63 0 1.000 0.0000 1.000 1.000
## 5 38 10 0.825 0.0510 0.731 0.931
## 10 15 1 0.791 0.0593 0.683 0.916
## 15 7 0 0.791 0.0593 0.683 0.916
## 20 7 1 0.678 0.1163 0.484 0.949
## 25 2 0 0.678 0.1163 0.484 0.949
## 30 2 0 0.678 0.1163 0.484 0.949
##
## Department=Research & Development
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 961 9 0.991 0.00311 0.985 0.997
## 5 571 83 0.889 0.01115 0.867 0.911
## 10 230 34 0.789 0.01936 0.752 0.828
## 15 100 1 0.783 0.02018 0.744 0.823
## 20 59 2 0.762 0.02416 0.716 0.811
## 25 17 1 0.739 0.03313 0.676 0.806
## 30 10 0 0.739 0.03313 0.676 0.806
## 35 2 2 0.525 0.13483 0.318 0.869
## 40 1 1 0.000 NaN NA NA
##
## Department=Sales
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 446 7 0.984 0.00589 0.973 0.996
## 5 281 53 0.847 0.01836 0.812 0.884
## 10 121 20 0.750 0.02651 0.700 0.804
## 15 51 6 0.682 0.03619 0.615 0.757
## 20 27 2 0.644 0.04320 0.564 0.734
## 25 10 3 0.519 0.07495 0.391 0.689
## 30 5 0 0.519 0.07495 0.391 0.689
## 35 2 1 0.415 0.11050 0.246 0.699
# Marital Status
library(survival)
fit_MaritalStatus <- survfit(
Surv(YearsAtCompany, status) ~ MaritalStatus,
data = df_clean
)
life_table_MaritalStatus_5yr <- summary(
fit_MaritalStatus,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_MaritalStatus_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ MaritalStatus,
## data = df_clean)
##
## MaritalStatus=Divorced
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 327 0 1.000 0.0000 1.000 1.000
## 5 208 26 0.908 0.0174 0.874 0.942
## 10 78 6 0.866 0.0241 0.820 0.914
## 15 40 0 0.866 0.0241 0.820 0.914
## 20 22 1 0.837 0.0367 0.768 0.912
## 25 8 0 0.837 0.0367 0.768 0.912
## 30 4 0 0.837 0.0367 0.768 0.912
##
## MaritalStatus=Married
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 673 2 0.997 0.0021 0.993 1.000
## 5 418 47 0.916 0.0116 0.894 0.939
## 10 188 24 0.824 0.0211 0.784 0.866
## 15 75 4 0.789 0.0264 0.739 0.843
## 20 46 2 0.760 0.0328 0.698 0.827
## 25 15 2 0.691 0.0560 0.590 0.810
## 30 8 0 0.691 0.0560 0.590 0.810
## 35 2 2 0.474 0.1343 0.272 0.826
## 40 1 1 0.000 NaN NA NA
##
## MaritalStatus=Single
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 470 14 0.970 0.00784 0.955 0.986
## 5 264 73 0.786 0.02076 0.746 0.828
## 10 100 25 0.651 0.03074 0.593 0.714
## 15 43 3 0.615 0.03557 0.549 0.689
## 20 25 2 0.577 0.04245 0.499 0.666
## 25 6 2 0.473 0.07836 0.342 0.654
## 30 5 0 0.473 0.07836 0.342 0.654
## 35 2 1 0.378 0.10531 0.219 0.653
# Business Travel
library(survival)
fit_BusinessTravel <- survfit(
Surv(YearsAtCompany, status) ~ BusinessTravel,
data = df_clean
)
life_table_BusinessTravel_5yr <- summary(
fit_BusinessTravel,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_BusinessTravel_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ BusinessTravel,
## data = df_clean)
##
## BusinessTravel=Non-Travel
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 150 1 0.993 0.00664 0.980 1.000
## 5 94 9 0.921 0.02450 0.874 0.970
## 10 38 2 0.872 0.04063 0.796 0.955
## 15 15 0 0.872 0.04063 0.796 0.955
## 20 8 0 0.872 0.04063 0.796 0.955
## 25 3 0 0.872 0.04063 0.796 0.955
## 30 3 0 0.872 0.04063 0.796 0.955
##
## BusinessTravel=Travel_Frequently
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 277 7 0.975 0.00943 0.956 0.993
## 5 168 40 0.814 0.02479 0.767 0.864
## 10 71 17 0.682 0.03687 0.613 0.758
## 15 31 2 0.650 0.04152 0.573 0.736
## 20 19 1 0.625 0.04684 0.539 0.724
## 25 5 2 0.510 0.08574 0.367 0.709
## 30 3 0 0.510 0.08574 0.367 0.709
##
## BusinessTravel=Travel_Rarely
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1043 8 0.992 0.0027 0.987 0.998
## 5 628 97 0.882 0.0110 0.861 0.904
## 10 257 36 0.791 0.0178 0.757 0.826
## 15 112 5 0.761 0.0216 0.720 0.805
## 20 66 4 0.724 0.0276 0.672 0.780
## 25 21 2 0.676 0.0421 0.598 0.764
## 30 11 0 0.676 0.0421 0.598 0.764
## 35 4 3 0.478 0.1011 0.316 0.724
## 40 1 1 0.000 NaN NA NA
# OverTime
fit_OverTime <- survfit(
Surv(YearsAtCompany, status) ~ OverTime,
data = df_clean
)
life_table_OverTime_5yr <- summary(
fit_OverTime,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_OverTime_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ OverTime, data = df_clean)
##
## OverTime=No
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1054 6 0.994 0.00232 0.990 0.999
## 5 648 67 0.920 0.00909 0.902 0.938
## 10 269 24 0.857 0.01527 0.828 0.888
## 15 108 5 0.828 0.01978 0.790 0.867
## 20 65 3 0.796 0.02627 0.746 0.849
## 25 17 2 0.741 0.04532 0.657 0.835
## 30 9 0 0.741 0.04532 0.657 0.835
## 35 3 2 0.540 0.12663 0.341 0.855
## 40 1 1 0.000 NaN NA NA
##
## OverTime=Yes
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 416 10 0.976 0.00751 0.961 0.991
## 5 242 79 0.759 0.02265 0.716 0.804
## 10 97 31 0.592 0.03254 0.531 0.659
## 15 50 2 0.571 0.03460 0.507 0.643
## 20 28 2 0.540 0.03909 0.469 0.623
## 25 12 2 0.476 0.05595 0.378 0.599
## 30 8 0 0.476 0.05595 0.378 0.599
## 35 1 1 0.408 0.07916 0.279 0.597
# Performance Rating
fit_PerformanceRating <- survfit(
Surv(YearsAtCompany, status) ~ PerformanceRating,
data = df_clean
)
life_table_PerformanceRating_5yr <- summary(
fit_PerformanceRating,
times = seq(0, max(df_clean$YearsAtCompany), by = 5)
)
life_table_PerformanceRating_5yr
## Call: survfit(formula = Surv(YearsAtCompany, status) ~ PerformanceRating,
## data = df_clean)
##
## PerformanceRating=3
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1244 12 0.990 0.00277 0.985 0.996
## 5 746 124 0.873 0.01033 0.853 0.894
## 10 307 47 0.776 0.01659 0.744 0.809
## 15 136 4 0.757 0.01866 0.721 0.795
## 20 81 5 0.720 0.02426 0.674 0.769
## 25 26 4 0.649 0.04105 0.573 0.734
## 30 16 0 0.649 0.04105 0.573 0.734
## 35 3 3 0.493 0.08576 0.351 0.694
## 40 1 1 0.000 NaN NA NA
##
## PerformanceRating=4
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 226 4 0.982 0.00877 0.965 1.000
## 5 144 22 0.872 0.02381 0.826 0.919
## 10 59 8 0.783 0.03710 0.713 0.859
## 15 22 3 0.707 0.05455 0.608 0.823
## 20 12 0 0.707 0.05455 0.608 0.823
## 25 3 0 0.707 0.05455 0.608 0.823
## 30 1 0 0.707 0.05455 0.608 0.823
## 35 1 0 0.707 0.05455 0.608 0.823
fit_km <- survfit(surv_object ~ 1, data = df_clean)
summary(fit_km)
## Call: survfit(formula = surv_object ~ 1, data = df_clean)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1470 16 0.989 0.00271 0.984 0.994
## 1 1426 59 0.948 0.00583 0.937 0.960
## 2 1255 27 0.928 0.00690 0.914 0.941
## 3 1128 20 0.911 0.00769 0.896 0.927
## 4 1000 19 0.894 0.00851 0.877 0.911
## 5 890 21 0.873 0.00947 0.855 0.892
## 6 694 9 0.862 0.01007 0.842 0.882
## 7 618 11 0.846 0.01091 0.825 0.868
## 8 528 9 0.832 0.01173 0.809 0.855
## 9 448 8 0.817 0.01264 0.793 0.842
## 10 366 18 0.777 0.01516 0.748 0.807
## 11 246 2 0.770 0.01568 0.740 0.802
## 13 200 2 0.763 0.01644 0.731 0.796
## 14 176 2 0.754 0.01736 0.721 0.789
## 15 158 1 0.749 0.01789 0.715 0.785
## 16 138 1 0.744 0.01857 0.708 0.781
## 17 126 1 0.738 0.01934 0.701 0.777
## 18 117 1 0.732 0.02018 0.693 0.772
## 19 104 1 0.725 0.02117 0.684 0.767
## 20 93 1 0.717 0.02233 0.674 0.762
## 21 66 1 0.706 0.02449 0.660 0.756
## 22 52 1 0.692 0.02753 0.641 0.749
## 23 37 1 0.674 0.03253 0.613 0.741
## 24 35 1 0.654 0.03686 0.586 0.731
## 31 16 1 0.614 0.05256 0.519 0.726
## 32 13 1 0.566 0.06641 0.450 0.713
## 33 10 1 0.510 0.08037 0.374 0.694
## 40 1 1 0.000 NaN NA NA
plot(fit_km,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Keseluruhan",
col = "blue",
lwd = 2)
Berdasarkan hasil estimasi Kaplan–Meier, diperoleh bahwa probabilitas
karyawan untuk tetap bertahan di perusahaan menurun seiring bertambahnya
masa kerja. Pada awal masa kerja (tahun ke-0), probabilitas bertahan
sebesar 0.989, yang menunjukkan bahwa sekitar 98.9% karyawan masih
bertahan. Probabilitas ini terus menurun menjadi 0.948 pada tahun
pertama, dan 0.873 pada tahun ke-5. Hal ini mengindikasikan bahwa
sekitar 12.7% karyawan keluar dalam lima tahun pertama masa kerja.
Pada tahun ke-10, probabilitas bertahan turun menjadi 0.777, yang berarti hanya sekitar 77.7% karyawan yang masih bertahan. Penurunan ini berlanjut hingga mencapai sekitar 51% pada tahun ke-33, yang menunjukkan bahwa hanya setengah karyawan yang mampu bertahan lebih dari 30 tahun. Secara umum, hasil ini menunjukkan bahwa risiko attrition paling tinggi terjadi pada awal hingga pertengahan masa kerja, sedangkan pada masa kerja yang sangat panjang jumlah karyawan yang tersisa relatif sedikit.
df_clean$Gender <- as.factor(df_clean$Gender)
surv_object <- Surv(df_clean$YearsAtCompany, df_clean$status)
fit_km_Gender <- survfit(
surv_object ~ Gender,
data = df_clean
)
summary(fit_km_Gender)
## Call: survfit(formula = surv_object ~ Gender, data = df_clean)
##
## Gender=Female
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 588 4 0.993 0.00339 0.987 1.000
## 1 573 21 0.957 0.00845 0.940 0.974
## 2 509 10 0.938 0.01016 0.918 0.958
## 3 465 7 0.924 0.01133 0.902 0.946
## 4 409 5 0.913 0.01226 0.889 0.937
## 5 374 9 0.891 0.01398 0.864 0.918
## 6 287 3 0.881 0.01483 0.853 0.911
## 7 258 5 0.864 0.01639 0.833 0.897
## 8 222 4 0.849 0.01785 0.814 0.884
## 9 185 4 0.830 0.01968 0.793 0.870
## 10 149 6 0.797 0.02314 0.753 0.844
## 11 113 1 0.790 0.02399 0.744 0.838
## 13 92 1 0.781 0.02522 0.733 0.832
## 15 72 1 0.770 0.02710 0.719 0.825
## 17 56 1 0.757 0.02991 0.700 0.818
## 22 18 1 0.715 0.04966 0.624 0.819
## 24 11 1 0.650 0.07665 0.516 0.819
## 32 3 1 0.433 0.18404 0.188 0.996
## 33 2 1 0.217 0.17864 0.043 1.000
## 40 1 1 0.000 NaN NA NA
##
## Gender=Male
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 882 12 0.986 0.00390 0.979 0.994
## 1 853 38 0.942 0.00790 0.927 0.958
## 2 746 17 0.921 0.00928 0.903 0.939
## 3 663 13 0.903 0.01036 0.883 0.923
## 4 591 14 0.882 0.01159 0.859 0.905
## 5 516 12 0.861 0.01274 0.836 0.886
## 6 407 6 0.848 0.01356 0.822 0.875
## 7 360 6 0.834 0.01452 0.806 0.863
## 8 306 5 0.821 0.01551 0.791 0.852
## 9 263 4 0.808 0.01648 0.776 0.841
## 10 217 12 0.763 0.01999 0.725 0.804
## 11 133 1 0.758 0.02065 0.718 0.799
## 13 108 1 0.751 0.02161 0.709 0.794
## 14 94 2 0.735 0.02392 0.689 0.783
## 16 76 1 0.725 0.02549 0.677 0.777
## 18 66 1 0.714 0.02737 0.662 0.770
## 19 60 1 0.702 0.02938 0.647 0.762
## 20 55 1 0.689 0.03150 0.630 0.754
## 21 42 1 0.673 0.03476 0.608 0.745
## 23 26 1 0.647 0.04197 0.570 0.735
## 31 13 1 0.597 0.06154 0.488 0.731
fit_gender <- survfit(surv_object ~ Gender, data = df_clean)
plot(fit_gender,
col = c("red", "blue"),
lwd = 2,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Berdasarkan Gender")
legend("bottomleft",
legend = levels(df_clean$Gender),
col = c("red", "blue"),
lwd = 2)
df_clean$Department <- as.factor(df_clean$Department)
fit_km_Department <- survfit(
surv_object ~ Department,
data = df_clean
)
summary(fit_km_Department)
## Call: survfit(formula = surv_object ~ Department, data = df_clean)
##
## Department=Human Resources
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 63 4 0.937 0.0307 0.878 0.999
## 2 58 2 0.904 0.0372 0.834 0.980
## 3 50 2 0.868 0.0436 0.787 0.958
## 4 42 1 0.847 0.0472 0.760 0.945
## 5 38 1 0.825 0.0510 0.731 0.931
## 7 24 1 0.791 0.0593 0.683 0.916
## 20 7 1 0.678 0.1163 0.484 0.949
##
## Department=Research & Development
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 961 9 0.991 0.00311 0.985 0.997
## 1 934 38 0.950 0.00706 0.937 0.964
## 2 812 13 0.935 0.00811 0.919 0.951
## 3 735 8 0.925 0.00879 0.908 0.942
## 4 650 11 0.909 0.00982 0.890 0.929
## 5 571 13 0.889 0.01115 0.867 0.911
## 6 441 4 0.881 0.01176 0.858 0.904
## 7 389 8 0.862 0.01314 0.837 0.889
## 8 332 4 0.852 0.01397 0.825 0.880
## 9 280 4 0.840 0.01504 0.811 0.870
## 10 230 14 0.789 0.01936 0.752 0.828
## 13 127 1 0.783 0.02018 0.744 0.823
## 17 81 1 0.773 0.02212 0.731 0.817
## 18 74 1 0.762 0.02416 0.716 0.811
## 22 32 1 0.739 0.03313 0.676 0.806
## 31 9 1 0.657 0.08279 0.513 0.841
## 33 5 1 0.525 0.13483 0.318 0.869
## 40 1 1 0.000 NaN NA NA
##
## Department=Sales
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 446 7 0.984 0.00589 0.973 0.996
## 1 429 17 0.945 0.01086 0.924 0.967
## 2 385 12 0.916 0.01344 0.890 0.943
## 3 343 10 0.889 0.01548 0.859 0.920
## 4 308 7 0.869 0.01691 0.836 0.903
## 5 281 7 0.847 0.01836 0.812 0.884
## 6 226 5 0.829 0.01977 0.791 0.868
## 7 205 2 0.820 0.02039 0.781 0.861
## 8 176 5 0.797 0.02232 0.755 0.842
## 9 150 4 0.776 0.02412 0.730 0.825
## 10 121 4 0.750 0.02651 0.700 0.804
## 11 82 2 0.732 0.02885 0.678 0.791
## 13 66 1 0.721 0.03047 0.664 0.783
## 14 57 2 0.696 0.03425 0.632 0.766
## 15 51 1 0.682 0.03619 0.615 0.757
## 16 41 1 0.665 0.03895 0.593 0.746
## 19 31 1 0.644 0.04320 0.564 0.734
## 21 21 1 0.613 0.05087 0.521 0.721
## 23 13 1 0.566 0.06526 0.452 0.710
## 24 12 1 0.519 0.07495 0.391 0.689
## 32 5 1 0.415 0.11050 0.246 0.699
fit_dept <- survfit(surv_object ~ Department, data = df_clean)
plot(fit_dept,
col = c("darkgreen", "orange", "purple"),
lwd = 2,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Berdasarkan Department")
legend("bottomleft",
legend = levels(df_clean$Department),
col = c("darkgreen", "orange", "purple"),
lwd = 2)
df_clean$MaritalStatus <- as.factor(df_clean$MaritalStatus)
fit_km_MaritalStatus <- survfit(
surv_object ~ MaritalStatus,
data = df_clean
)
summary(fit_km_MaritalStatus)
## Call: survfit(formula = surv_object ~ MaritalStatus, data = df_clean)
##
## MaritalStatus=Divorced
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 1 323 8 0.975 0.00865 0.958 0.992
## 2 284 5 0.958 0.01141 0.936 0.981
## 3 258 7 0.932 0.01473 0.904 0.961
## 4 234 5 0.912 0.01690 0.880 0.946
## 5 208 1 0.908 0.01738 0.874 0.942
## 6 161 1 0.902 0.01816 0.867 0.938
## 7 143 4 0.877 0.02159 0.836 0.920
## 10 78 1 0.866 0.02407 0.820 0.914
## 18 30 1 0.837 0.03669 0.768 0.912
##
## MaritalStatus=Married
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 673 2 0.997 0.00210 0.993 1.000
## 1 661 21 0.965 0.00710 0.952 0.979
## 2 589 9 0.951 0.00852 0.934 0.967
## 3 531 7 0.938 0.00964 0.919 0.957
## 4 465 3 0.932 0.01019 0.912 0.952
## 5 418 7 0.916 0.01160 0.894 0.939
## 6 330 2 0.911 0.01218 0.887 0.935
## 7 302 5 0.896 0.01372 0.869 0.923
## 8 269 3 0.886 0.01473 0.857 0.915
## 9 227 4 0.870 0.01641 0.839 0.903
## 10 188 10 0.824 0.02107 0.784 0.866
## 11 120 1 0.817 0.02199 0.775 0.861
## 13 97 1 0.809 0.02332 0.764 0.856
## 14 84 2 0.789 0.02644 0.739 0.843
## 16 63 1 0.777 0.02884 0.722 0.835
## 20 46 1 0.760 0.03278 0.698 0.827
## 22 27 1 0.732 0.04195 0.654 0.819
## 23 18 1 0.691 0.05595 0.590 0.810
## 32 7 1 0.592 0.10323 0.421 0.834
## 33 5 1 0.474 0.13435 0.272 0.826
## 40 1 1 0.000 NaN NA NA
##
## MaritalStatus=Single
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 470 14 0.970 0.00784 0.955 0.986
## 1 442 30 0.904 0.01372 0.878 0.932
## 2 382 13 0.874 0.01568 0.843 0.905
## 3 339 6 0.858 0.01663 0.826 0.891
## 4 301 11 0.827 0.01851 0.791 0.864
## 5 264 13 0.786 0.02076 0.746 0.828
## 6 203 6 0.763 0.02221 0.721 0.808
## 7 173 2 0.754 0.02281 0.711 0.800
## 8 148 6 0.723 0.02507 0.676 0.774
## 9 122 4 0.700 0.02691 0.649 0.754
## 10 100 7 0.651 0.03074 0.593 0.714
## 11 69 1 0.641 0.03171 0.582 0.707
## 13 54 1 0.629 0.03327 0.567 0.698
## 15 43 1 0.615 0.03557 0.549 0.689
## 17 35 1 0.597 0.03865 0.526 0.678
## 19 29 1 0.577 0.04245 0.499 0.666
## 21 16 1 0.541 0.05293 0.446 0.655
## 24 8 1 0.473 0.07836 0.342 0.654
## 31 5 1 0.378 0.10531 0.219 0.653
fit_marital <- survfit(surv_object ~ MaritalStatus, data = df_clean)
plot(fit_marital,
col = c("red", "blue", "green"),
lwd = 2,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Berdasarkan Status Pernikahan")
legend("bottomleft",
legend = levels(df_clean$MaritalStatus),
col = c("red", "blue", "green"),
lwd = 2)
#### D. Berdasarkan BusinessTravel
df_clean$BusinessTravel <- as.factor(df_clean$BusinessTravel)
fit_km_BusinessTravel <- survfit(
surv_object ~ BusinessTravel,
data = df_clean
)
summary(fit_km_BusinessTravel)
## Call: survfit(formula = surv_object ~ BusinessTravel, data = df_clean)
##
## BusinessTravel=Non-Travel
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 150 1 0.993 0.00664 0.980 1.000
## 1 143 4 0.966 0.01514 0.936 0.996
## 2 125 1 0.958 0.01688 0.925 0.991
## 3 119 1 0.950 0.01856 0.914 0.987
## 4 103 1 0.941 0.02054 0.901 0.982
## 5 94 2 0.921 0.02450 0.874 0.970
## 10 38 2 0.872 0.04063 0.796 0.955
##
## BusinessTravel=Travel_Frequently
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 277 7 0.975 0.00943 0.956 0.993
## 1 266 17 0.912 0.01708 0.880 0.947
## 2 241 7 0.886 0.01930 0.849 0.925
## 3 211 6 0.861 0.02131 0.820 0.904
## 4 191 6 0.834 0.02333 0.789 0.881
## 5 168 4 0.814 0.02479 0.767 0.864
## 6 142 4 0.791 0.02661 0.740 0.845
## 7 123 3 0.772 0.02820 0.718 0.829
## 8 107 3 0.750 0.03005 0.693 0.811
## 9 90 2 0.733 0.03161 0.674 0.798
## 10 71 5 0.682 0.03687 0.613 0.758
## 11 46 1 0.667 0.03893 0.595 0.748
## 13 39 1 0.650 0.04152 0.573 0.736
## 18 26 1 0.625 0.04684 0.539 0.724
## 21 15 1 0.583 0.05942 0.478 0.712
## 23 8 1 0.510 0.08574 0.367 0.709
##
## BusinessTravel=Travel_Rarely
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1043 8 0.992 0.00270 0.987 0.998
## 1 1017 38 0.955 0.00645 0.943 0.968
## 2 889 19 0.935 0.00783 0.920 0.950
## 3 798 13 0.920 0.00877 0.903 0.937
## 4 706 12 0.904 0.00971 0.885 0.923
## 5 628 15 0.882 0.01096 0.861 0.904
## 6 479 5 0.873 0.01160 0.851 0.896
## 7 429 8 0.857 0.01273 0.832 0.882
## 8 361 6 0.843 0.01378 0.816 0.870
## 9 306 6 0.826 0.01507 0.797 0.856
## 10 257 11 0.791 0.01780 0.757 0.826
## 11 171 1 0.786 0.01829 0.751 0.823
## 13 139 1 0.780 0.01901 0.744 0.819
## 14 126 2 0.768 0.02063 0.729 0.810
## 15 112 1 0.761 0.02156 0.720 0.805
## 16 100 1 0.754 0.02264 0.711 0.799
## 17 89 1 0.745 0.02392 0.700 0.794
## 19 73 1 0.735 0.02568 0.686 0.787
## 20 66 1 0.724 0.02760 0.672 0.780
## 22 37 1 0.704 0.03307 0.642 0.772
## 24 25 1 0.676 0.04207 0.598 0.764
## 31 11 1 0.615 0.06998 0.492 0.768
## 32 9 1 0.546 0.08952 0.396 0.753
## 33 8 1 0.478 0.10108 0.316 0.724
## 40 1 1 0.000 NaN NA NA
fit_BusinessTravel <- survfit(surv_object ~ BusinessTravel, data = df_clean)
plot(fit_BusinessTravel,
col = c("red", "blue"),
lwd = 2,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Berdasarkan Business Travel")
legend("bottomleft",
legend = levels(df_clean$BusinessTravel),
col = c("red", "blue"),
lwd = 2)
df_clean$OverTime <- as.factor(df_clean$OverTime)
surv_object <- Surv(df_clean$YearsAtCompany, df_clean$status)
fit_km_OverTime <- survfit(
surv_object ~ OverTime,
data = df_clean
)
summary(fit_km_OverTime)
## Call: survfit(formula = surv_object ~ OverTime, data = df_clean)
##
## OverTime=No
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1054 6 0.994 0.00232 0.990 0.999
## 1 1024 31 0.964 0.00578 0.953 0.976
## 2 908 12 0.951 0.00677 0.938 0.965
## 3 816 6 0.944 0.00730 0.930 0.959
## 4 728 11 0.930 0.00836 0.914 0.947
## 5 648 7 0.920 0.00909 0.902 0.938
## 6 509 4 0.913 0.00971 0.894 0.932
## 7 457 3 0.907 0.01025 0.887 0.927
## 8 388 3 0.900 0.01094 0.879 0.922
## 9 332 6 0.884 0.01260 0.859 0.909
## 10 269 8 0.857 0.01527 0.828 0.888
## 11 181 1 0.853 0.01590 0.822 0.884
## 13 146 2 0.841 0.01770 0.807 0.876
## 14 126 2 0.828 0.01978 0.790 0.867
## 17 89 1 0.818 0.02163 0.777 0.862
## 18 82 1 0.808 0.02356 0.763 0.856
## 20 65 1 0.796 0.02627 0.746 0.849
## 22 35 1 0.773 0.03397 0.709 0.843
## 23 24 1 0.741 0.04532 0.657 0.835
## 32 8 1 0.648 0.09528 0.486 0.865
## 33 6 1 0.540 0.12663 0.341 0.855
## 40 1 1 0.000 NaN NA NA
##
## OverTime=Yes
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 416 10 0.976 0.00751 0.961 0.991
## 1 402 28 0.908 0.01423 0.881 0.936
## 2 347 15 0.869 0.01684 0.836 0.902
## 3 312 14 0.830 0.01903 0.793 0.868
## 4 272 8 0.805 0.02034 0.766 0.846
## 5 242 14 0.759 0.02265 0.716 0.804
## 6 185 5 0.738 0.02383 0.693 0.786
## 7 161 8 0.702 0.02593 0.653 0.754
## 8 140 6 0.672 0.02757 0.620 0.728
## 9 116 2 0.660 0.02829 0.607 0.718
## 10 97 10 0.592 0.03254 0.531 0.659
## 11 65 1 0.583 0.03329 0.521 0.652
## 15 50 1 0.571 0.03460 0.507 0.643
## 16 43 1 0.558 0.03626 0.491 0.634
## 19 32 1 0.540 0.03909 0.469 0.623
## 21 22 1 0.516 0.04437 0.436 0.611
## 24 13 1 0.476 0.05595 0.378 0.599
## 31 7 1 0.408 0.07916 0.279 0.597
fit_OverTime <- survfit(surv_object ~ OverTime, data = df_clean)
plot(fit_OverTime,
col = c("red", "blue"),
lwd = 2,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Berdasarkan Over Time")
legend("bottomleft",
legend = levels(df_clean$OverTime),
col = c("red", "blue"),
lwd = 2)
df_clean$PerformanceRating <- as.factor(df_clean$PerformanceRating)
fit_km_PerformanceRating <- survfit(
surv_object ~ PerformanceRating,
data = df_clean
)
summary(fit_km_PerformanceRating)
## Call: survfit(formula = surv_object ~ PerformanceRating, data = df_clean)
##
## PerformanceRating=3
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 1244 12 0.990 0.00277 0.985 0.996
## 1 1207 49 0.950 0.00622 0.938 0.962
## 2 1065 24 0.929 0.00746 0.914 0.943
## 3 949 17 0.912 0.00835 0.896 0.929
## 4 841 16 0.895 0.00925 0.877 0.913
## 5 746 18 0.873 0.01033 0.853 0.894
## 6 586 8 0.861 0.01102 0.840 0.883
## 7 521 11 0.843 0.01207 0.820 0.867
## 8 440 7 0.830 0.01290 0.805 0.855
## 9 373 5 0.819 0.01365 0.792 0.846
## 10 307 16 0.776 0.01659 0.744 0.809
## 11 202 1 0.772 0.01695 0.740 0.806
## 13 168 1 0.767 0.01746 0.734 0.802
## 14 150 2 0.757 0.01866 0.721 0.795
## 16 118 1 0.751 0.01958 0.713 0.790
## 17 109 1 0.744 0.02057 0.705 0.785
## 18 103 1 0.737 0.02161 0.696 0.780
## 19 91 1 0.729 0.02283 0.685 0.775
## 20 81 1 0.720 0.02426 0.674 0.769
## 21 58 1 0.707 0.02683 0.656 0.762
## 22 46 1 0.692 0.03033 0.635 0.754
## 23 32 1 0.670 0.03628 0.603 0.745
## 24 31 1 0.649 0.04105 0.573 0.734
## 31 15 1 0.605 0.05668 0.504 0.727
## 32 12 1 0.555 0.07094 0.432 0.713
## 33 9 1 0.493 0.08576 0.351 0.694
## 40 1 1 0.000 NaN NA NA
##
## PerformanceRating=4
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 0 226 4 0.982 0.00877 0.965 1.000
## 1 219 10 0.937 0.01619 0.906 0.970
## 2 190 3 0.923 0.01805 0.888 0.959
## 3 179 3 0.907 0.01983 0.869 0.947
## 4 159 3 0.890 0.02178 0.848 0.934
## 5 144 3 0.872 0.02381 0.826 0.919
## 6 108 1 0.863 0.02492 0.816 0.914
## 8 88 2 0.844 0.02795 0.791 0.900
## 9 75 3 0.810 0.03293 0.748 0.877
## 10 59 2 0.783 0.03710 0.713 0.859
## 11 44 1 0.765 0.04030 0.690 0.848
## 13 32 1 0.741 0.04558 0.657 0.836
## 15 22 1 0.707 0.05455 0.608 0.823
fit_PerformanceRating <- survfit(surv_object ~ PerformanceRating, data = df_clean)
plot(fit_PerformanceRating,
col = c("red", "blue"),
lwd = 2,
xlab = "Years At Company",
ylab = "Probabilitas Bertahan",
main = "Kurva Kaplan-Meier Berdasarkan Performance Rating")
legend("bottomleft",
legend = levels(df_clean$PerformanceRating),
col = c("red", "blue"),
lwd = 2)
survdiff(surv_object ~ Gender, data = df_clean)
## Call:
## survdiff(formula = surv_object ~ Gender, data = df_clean)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## Gender=Female 588 87 97 1.022 1.79
## Gender=Male 882 150 140 0.708 1.79
##
## Chisq= 1.8 on 1 degrees of freedom, p= 0.2
Berdasarkan hasil uji log-rank, diperoleh nilai p-value sebesar 0.2, yang lebih besar dari taraf signifikansi 5% (α = 0.05). Hal ini menunjukkan bahwa tidak terdapat perbedaan yang signifikan secara statistik pada kurva survival antara karyawan laki-laki dan perempuan. Risiko keluar kerja (attrition) tidak berbeda secara signifikan antara karyawan pria dan wanita. Faktor gender bukan determinan utama dalam lamanya karyawan bertahan di perusahaan.
survdiff(surv_object ~ Department, data = df_clean)
## Call:
## survdiff(formula = surv_object ~ Department, data = df_clean)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## Department=Human Resources 63 12 10.5 0.215 0.231
## Department=Research & Development 961 133 152.6 2.521 7.283
## Department=Sales 446 92 73.9 4.440 6.629
##
## Chisq= 7.4 on 2 degrees of freedom, p= 0.02
Hasil uji log-rank menunjukkan nilai p-value sebesar 0.02, yang lebih kecil dari 0.05, sehingga dapat disimpulkan bahwa terdapat perbedaan kurva survival yang signifikan antar departemen. Departemen memengaruhi lama masa kerja karyawan. Perbedaan karakteristik pekerjaan, tekanan kerja, lingkungan, dan beban tugas antar departemen berkontribusi terhadap perbedaan tingkat attrition.
Departemen Sales menunjukkan observed event lebih besar dari expected, menandakan risiko keluar lebih tinggi. Departemen R&D menunjukkan observed < expected, artinya retensi karyawan lebih baik.
survdiff(surv_object ~ MaritalStatus, data = df_clean)
## Call:
## survdiff(formula = surv_object ~ MaritalStatus, data = df_clean)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## MaritalStatus=Divorced 327 33 53.9 8.1 10.8
## MaritalStatus=Married 673 84 113.1 7.5 14.8
## MaritalStatus=Single 470 120 70.0 35.8 52.2
##
## Chisq= 52.8 on 2 degrees of freedom, p= 3e-12
Hasil uji log-rank menunjukkan nilai p-value yang sangat kecil (p < 0.001), sehingga dapat disimpulkan bahwa terdapat perbedaan yang sangat signifikan pada kurva survival berdasarkan status pernikahan. Status pernikahan sangat memengaruhi stabilitas kerja. Karyawan single memiliki risiko keluar yang jauh lebih tinggi dibandingkan karyawan yang sudah menikah atau bercerai.
survdiff(surv_object ~ BusinessTravel, data = df_clean)
## Call:
## survdiff(formula = surv_object ~ BusinessTravel, data = df_clean)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## BusinessTravel=Non-Travel 150 12 24.5 6.413 7.34
## BusinessTravel=Travel_Frequently 277 69 45.0 12.820 16.25
## BusinessTravel=Travel_Rarely 1043 156 167.5 0.785 2.75
##
## Chisq= 20.6 on 2 degrees of freedom, p= 3e-05
Hasil uji log-rank menunjukkan bahwa terdapat perbedaan yang sangat signifikan pada kurva survival berdasarkan intensitas perjalanan dinas (business travel). Karyawan dengan kategori Travel_Frequently memiliki risiko keluar kerja paling tinggi, sedangkan karyawan Non-Travel menunjukkan tingkat ketahanan kerja yang lebih baik.
survdiff(surv_object ~ OverTime, data = df_clean)
## Call:
## survdiff(formula = surv_object ~ OverTime, data = df_clean)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## OverTime=No 1054 110 171.4 22.0 81.7
## OverTime=Yes 416 127 65.6 57.5 81.7
##
## Chisq= 81.7 on 1 degrees of freedom, p= <2e-16
Uji log-rank menunjukkan bahwa terdapat perbedaan yang sangat signifikan pada kurva survival antara karyawan yang bekerja lembur dan yang tidak lembur. Karyawan yang sering lembur (OverTime = Yes) memiliki risiko keluar kerja jauh lebih tinggi dibandingkan karyawan tanpa lembur.
survdiff(surv_object ~ PerformanceRating, data = df_clean)
## Call:
## survdiff(formula = surv_object ~ PerformanceRating, data = df_clean)
##
## N Observed Expected (O-E)^2/E (O-E)^2/V
## PerformanceRating=3 1244 200 200.4 0.000964 0.00642
## PerformanceRating=4 226 37 36.6 0.005284 0.00642
##
## Chisq= 0 on 1 degrees of freedom, p= 0.9
Hasil uji log-rank menunjukkan bahwa tidak terdapat perbedaan signifikan pada kurva survival berdasarkan performance rating. Dengan demikian, tingkat penilaian kinerja tidak berpengaruh terhadap lama masa kerja karyawan.
Berdasarkan hasil analisis survival menggunakan metode Kaplan–Meier dan uji log-rank, diperoleh bahwa probabilitas karyawan untuk bertahan di perusahaan menurun seiring bertambahnya masa kerja, dengan tingkat attrition yang relatif lebih tinggi pada periode awal hingga pertengahan masa kerja.
Hasil uji log-rank menunjukkan bahwa faktor gender dan performance rating tidak memberikan pengaruh yang signifikan terhadap perbedaan kurva survival karyawan. Hal ini mengindikasikan bahwa risiko keluar kerja relatif sama antara karyawan laki-laki dan perempuan, serta antara karyawan dengan tingkat kinerja yang berbeda.
Sebaliknya, departemen, status pernikahan, intensitas perjalanan dinas (business travel), dan kerja lembur (overtime) terbukti memiliki pengaruh yang signifikan terhadap lama masa kerja karyawan. Karyawan pada departemen Sales, karyawan dengan status single, karyawan yang sering melakukan perjalanan dinas, serta karyawan yang sering bekerja lembur menunjukkan tingkat risiko attrition yang lebih tinggi. Sementara itu, karyawan di departemen Research & Development, karyawan married, serta karyawan yang jarang melakukan perjalanan dinas dan tidak lembur cenderung memiliki stabilitas kerja yang lebih baik.
Secara keseluruhan, hasil penelitian ini menegaskan bahwa faktor beban kerja, karakteristik pekerjaan, serta kondisi sosial memiliki peranan penting dalam memengaruhi ketahanan karyawan di perusahaan. Oleh karena itu, perusahaan disarankan untuk merancang strategi retensi yang berfokus pada pengurangan beban lembur, pengelolaan intensitas perjalanan dinas, serta peningkatan kenyamanan kerja pada departemen dengan risiko attrition tinggi, guna menekan tingkat pergantian karyawan dan meningkatkan stabilitas sumber daya manusia.
df_empiris <- df_clean[df_clean$status == 1, ]
#hitung proporsi manual
table(df_empiris$YearsAtCompany)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 13 14 15 16 17 18 19 20 21 22 23 24 31 32
## 16 59 27 20 19 21 9 11 9 8 18 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1
## 33 40
## 1 1