Load Libraries
library(dplyr)
library(survival)
library(survminer)
Read Data
# Read the HR records and recode the categorical columns in one step:
# salary becomes an ordered factor (low < medium < high) and the two
# 0/1 indicator columns become plain factors.
data <- read.csv("HR data.csv") %>%
  mutate(
    salary = ordered(salary, levels = c("low", "medium", "high")),
    Work_accident = factor(Work_accident),
    promotion_last_5years = factor(promotion_last_5years)
  )

# Analysis table: everything except the `sales` column.
newdata <- select(data, -sales)
str(newdata)
## 'data.frame': 14999 obs. of 9 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ salary : Ord.factor w/ 3 levels "low"<"medium"<..: 1 2 2 1 1 1 1 1 1 1 ...
Making a Survival Object
# Survival response: time_spend_company is the follow-up time and `left`
# is the event indicator (second positional argument of Surv()).
survObj <- with(newdata, Surv(time_spend_company, left))
head(survObj)
## [1] 3 6 4 5 3 3
Kaplan-Meier Survival Curve
# Intercept-only formula (~ 1): a single Kaplan-Meier curve pooled over
# every row of the survival object.
fit <- survfit(formula = survObj ~ 1)
print(fit)
## Call: survfit(formula = survObj ~ 1)
##
## n events median 0.95LCL 0.95UCL
## 14999 3571 6 5 6
Survival Table
# Life table for the pooled fit: for each event time it lists the number at
# risk, the number of events, the survival estimate, its standard error and
# the 95% confidence limits.
summary(fit)
## Call: survfit(formula = survObj ~ 1)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 2 14999 53 0.996 0.000485 0.996 0.997
## 3 11755 1586 0.862 0.003168 0.856 0.868
## 4 5312 890 0.718 0.005144 0.708 0.728
## 5 2755 833 0.501 0.007232 0.487 0.515
## 6 1282 209 0.419 0.007957 0.404 0.435
Main KM Survival Plot
# Plot the pooled Kaplan-Meier curve with its confidence band and a risk
# table underneath.
#
# * `data` is passed explicitly: the fit was created from a standalone Surv
#   object without a `data` argument, so survminer cannot recover the data
#   from the fit's call.
# * No p-value is requested: an intercept-only model has no groups to
#   compare, so `pval = TRUE` only produces a warning.
# * A single palette colour is supplied because there is a single curve.
ggsurvplot(fit,
  data = newdata,
  break.time.by = 1,
  palette = "#E7B800",
  xlim = c(0, 6),                # x-axis restricted to times 0 through 6
  conf.int = TRUE,               # Add confidence interval
  risk.table = TRUE,             # Add risk table
  risk.table.height = 0.25,      # Share of the figure given to the risk table
  ggtheme = theme_light()        # Change ggplot2 theme
)

Group by Salary
# One Kaplan-Meier curve per salary level (low / medium / high).
fit.sal <- survfit(formula = survObj ~ newdata$salary)
print(fit.sal)
## Call: survfit(formula = survObj ~ newdata$salary)
##
## n events median 0.95LCL 0.95UCL
## newdata$salary=low 7316 2172 5 5 5
## newdata$salary=medium 6446 1317 6 6 NA
## newdata$salary=high 1237 82 NA NA NA
# Survival curves by salary level, with confidence bands, a p-value comparing
# the curves, and a risk table coloured to match the curves.
#
# `data` is passed explicitly: the fit's formula refers to objects directly
# (survObj, newdata$salary) rather than using a `data` argument, so survminer
# cannot extract the data from the fit on its own.
ggsurvplot(fit.sal,
  data = newdata,
  break.time.by = 1,
  conf.int = TRUE,               # Add confidence interval
  pval = TRUE,                   # Add p-value
  risk.table = TRUE,             # Add risk table
  risk.table.col = "strata",     # Risk table color by groups
  risk.table.height = 0.50,      # Taller table: one row per salary level
  ggtheme = theme_light()        # Change ggplot2 theme
)

Group by Projects
# One Kaplan-Meier curve per project count; number_project is an integer
# column, so it is converted to a factor to act as a grouping variable.
fit.proj <- survfit(formula = survObj ~ as.factor(newdata$number_project))
print(fit.proj)
## Call: survfit(formula = survObj ~ as.factor(newdata$number_project))
##
## n events median 0.95LCL 0.95UCL
## as.factor(newdata$number_project)=2 2388 1567 3 3 3
## as.factor(newdata$number_project)=3 4055 72 NA NA NA
## as.factor(newdata$number_project)=4 4365 409 NA 6 NA
## as.factor(newdata$number_project)=5 2761 612 6 5 6
## as.factor(newdata$number_project)=6 1174 655 4 4 4
## as.factor(newdata$number_project)=7 256 256 4 4 4
# Survival curves by number of projects, with confidence bands, a p-value
# comparing the curves, and a risk table coloured to match the curves.
#
# `data` is passed explicitly: the fit's formula refers to objects directly
# (survObj, newdata$number_project) rather than using a `data` argument, so
# survminer cannot extract the data from the fit on its own.
ggsurvplot(fit.proj,
  data = newdata,
  break.time.by = 1,
  conf.int = TRUE,               # Add confidence interval
  pval = TRUE,                   # Add p-value
  risk.table = TRUE,             # Add risk table
  risk.table.col = "strata",     # Risk table color by groups
  risk.table.height = 0.6,       # Taller table: one row per project count
  surv.plot.height = 0.9,
  ggtheme = theme_light()        # Change ggplot2 theme
)
