Load Libraries

library(dplyr)
library(survival)
library(survminer)

Read Data

# Load the HR records from disk.
data <- read.csv("HR data.csv")

# Salary band is ordinal: low < medium < high.
data[["salary"]] <- factor(
  data[["salary"]],
  levels = c("low", "medium", "high"),
  ordered = TRUE
)

# The accident and promotion columns are recoded as unordered factors.
data[["Work_accident"]] <- factor(data[["Work_accident"]])
data[["promotion_last_5years"]] <- factor(data[["promotion_last_5years"]])

# Drop the `sales` column; every later step works on `newdata`.
newdata <- select(data, -sales)
str(newdata)
## 'data.frame':    14999 obs. of  9 variables:
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ salary               : Ord.factor w/ 3 levels "low"<"medium"<..: 1 2 2 1 1 1 1 1 1 1 ...

Making a Survival Object

# Build the survival response: years spent at the company is the follow-up
# time and `left` is the event indicator.
survObj <- Surv(
  time = newdata$time_spend_company,
  event = newdata$left
)
head(survObj)
## [1] 3  6  4  5  3  3

Kaplan-Meier Survival Curve

# Overall Kaplan-Meier estimate: the intercept-only formula pools all rows
# into a single curve.
fit <- survfit(formula = survObj ~ 1)
print(fit)
## Call: survfit(formula = survObj ~ 1)
## 
##       n  events  median 0.95LCL 0.95UCL 
##   14999    3571       6       5       6

Survival Table

# Life table for the overall fit: at-risk counts, events and the survival
# estimate with its confidence limits at each event time.
fit_table <- summary(fit)
print(fit_table)
## Call: survfit(formula = survObj ~ 1)
## 
##  time n.risk n.event survival  std.err lower 95% CI upper 95% CI
##     2  14999      53    0.996 0.000485        0.996        0.997
##     3  11755    1586    0.862 0.003168        0.856        0.868
##     4   5312     890    0.718 0.005144        0.708        0.728
##     5   2755     833    0.501 0.007232        0.487        0.515
##     6   1282     209    0.419 0.007957        0.404        0.435

Main KM Survival Plot

# Plot the overall Kaplan-Meier curve with its confidence band and a risk
# table underneath.
#
# `data` is passed explicitly because `fit` was built from a standalone Surv
# object without a `data =` argument, so survminer cannot recover the data
# frame from the fit's call.
#
# No p-value is requested: `fit` is an intercept-only model with a single
# curve, so there are no groups to compare.
ggsurvplot(
  fit,
  data = newdata,
  break.time.by = 1,
  palette = c("#E7B800", "#2E9FDF"),
  xlim = c(0, 6),
  conf.int = TRUE, # Add confidence interval
  risk.table = TRUE, # Add risk table
  risk.table.height = 0.25, # Useful to change when you have multiple groups
  ggtheme = theme_light() # Change ggplot2 theme
)

Group by Salary

# Kaplan-Meier curves stratified by salary band (low / medium / high).
fit.sal <- survfit(formula = survObj ~ newdata$salary)
print(fit.sal)
## Call: survfit(formula = survObj ~ newdata$salary)
## 
##                          n events median 0.95LCL 0.95UCL
## newdata$salary=low    7316   2172      5       5       5
## newdata$salary=medium 6446   1317      6       6      NA
## newdata$salary=high   1237     82     NA      NA      NA
# Plot the salary-stratified curves with confidence bands, the p-value for
# the between-group comparison, and a risk table coloured by stratum.
#
# `data` is passed explicitly because `fit.sal` was built without a `data =`
# argument, so survminer cannot recover the data frame from the fit's call.
ggsurvplot(
  fit.sal,
  data = newdata,
  break.time.by = 1,
  conf.int = TRUE, # Add confidence interval
  pval = TRUE, # Add p-value
  risk.table = TRUE, # Add risk table
  risk.table.col = "strata", # Risk table color by groups
  risk.table.height = 0.50, # Useful to change when you have multiple groups
  ggtheme = theme_light() # Change ggplot2 theme
)

Group by Projects

# Kaplan-Meier curves stratified by number of projects; the integer count is
# converted to a factor so each distinct value gets its own curve.
fit.proj <- survfit(formula = survObj ~ as.factor(newdata$number_project))
print(fit.proj)
## Call: survfit(formula = survObj ~ as.factor(newdata$number_project))
## 
##                                        n events median 0.95LCL 0.95UCL
## as.factor(newdata$number_project)=2 2388   1567      3       3       3
## as.factor(newdata$number_project)=3 4055     72     NA      NA      NA
## as.factor(newdata$number_project)=4 4365    409     NA       6      NA
## as.factor(newdata$number_project)=5 2761    612      6       5       6
## as.factor(newdata$number_project)=6 1174    655      4       4       4
## as.factor(newdata$number_project)=7  256    256      4       4       4
# Plot the project-count-stratified curves with confidence bands, the p-value
# for the between-group comparison, and a risk table coloured by stratum.
#
# `data` is passed explicitly because `fit.proj` was built without a `data =`
# argument, so survminer cannot recover the data frame from the fit's call.
ggsurvplot(
  fit.proj,
  data = newdata,
  break.time.by = 1,
  conf.int = TRUE, # Add confidence interval
  pval = TRUE, # Add p-value
  risk.table = TRUE, # Add risk table
  risk.table.col = "strata", # Risk table color by groups
  risk.table.height = 0.6, # Useful to change when you have multiple groups
  surv.plot.height = 0.9,
  ggtheme = theme_light() # Change ggplot2 theme
)

Group by Promotion

# Kaplan-Meier curves stratified by whether the employee was promoted in the
# last five years (factor levels "0" and "1").
fit.prom <- survfit(
  formula = survObj ~ as.factor(newdata$promotion_last_5years)
)
print(fit.prom)
## Call: survfit(formula = survObj ~ as.factor(newdata$promotion_last_5years))
## 
##                                                n events median 0.95LCL
## as.factor(newdata$promotion_last_5years)=0 14680   3552      5       5
## as.factor(newdata$promotion_last_5years)=1   319     19     NA      NA
##                                            0.95UCL
## as.factor(newdata$promotion_last_5years)=0       6
## as.factor(newdata$promotion_last_5years)=1      NA
# Plot the promotion-stratified curves with confidence bands, the p-value for
# the between-group comparison, and a risk table coloured by stratum.
#
# `data` is passed explicitly because `fit.prom` was built without a `data =`
# argument, so survminer cannot recover the data frame from the fit's call.
ggsurvplot(
  fit.prom,
  data = newdata,
  break.time.by = 1,
  conf.int = TRUE, # Add confidence interval
  pval = TRUE, # Add p-value
  risk.table = TRUE, # Add risk table
  risk.table.col = "strata", # Risk table color by groups
  risk.table.height = 0.3, # Useful to change when you have multiple groups
  ggtheme = theme_light() # Change ggplot2 theme
)