Load libraries
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
theme for nice plotting
theme_nice <- theme_classic()+
theme(
axis.line.y.left = element_line(colour = "black"),
axis.line.y.right = element_line(colour = "black"),
axis.line.x.bottom = element_line(colour = "black"),
axis.line.x.top = element_line(colour = "black"),
axis.text.y = element_text(colour = "black", size = 12),
axis.text.x = element_text(color = "black", size = 12),
axis.ticks = element_line(color = "black")) +
theme(
axis.ticks.length = unit(-0.25, "cm"),
axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")),
axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
Load Data
# Absences data
abs_data <- read_excel("Anon_ESTH_Absences.xlsx")
## New names:
## • `Department name` -> `Department name...3`
## • `Department name` -> `Department name...19`
Making all variable lower case
names(abs_data) <- tolower(x = names(abs_data))
Viewing the structure of the data
str(abs_data)
## tibble [839 × 19] (S3: tbl_df/tbl/data.frame)
## $ id : num [1:839] 1 2 3 4 5 6 7 8 9 10 ...
## $ employee id : num [1:839] 26794649 27006215 26751119 26765779 22834377 ...
## $ department name...3 : chr [1:839] "A & E (St Helier) - Med Staff" "Pharmacy (Epsom) - OHP" "Infection Control - Non Clinical" "Cardiology (St Helier) - Med Staff" ...
## $ staff group : chr [1:839] "Medical and Dental" "Add Prof Scientific and Technic" "Administrative and Clerical" "Medical and Dental" ...
## $ location : chr [1:839] "St Helier" "Epsom" "St Helier" "St Helier" ...
## $ manager id : num [1:839] NA 20403303 26245662 NA 10122920 ...
## $ fte : num [1:839] 1 1 1 1 1 1 0.43 1 1 0.6 ...
## $ absence start : POSIXct[1:839], format: "2018-05-31 12:00:00" "2018-05-31 09:00:00" ...
## $ last absent day : POSIXct[1:839], format: "2018-06-01" "2018-05-31" ...
## $ rtw date : POSIXct[1:839], format: "2018-06-04 16:00:00" "2018-06-01 09:00:00" ...
## $ days lost : num [1:839] 2 1 1 1 0.75 2 2 17 1 1 ...
## $ days lost in period : num [1:839] 1 1 1 1 0.75 1 1 1 1 1 ...
## $ calendar days lost : num [1:839] 4 1 1 1 2 4 7 25 2 5 ...
## $ calendardays lost in period: num [1:839] 1 1 1 1 1 1 1 1 1 1 ...
## $ hours lost : num [1:839] 20 7.5 7.5 7.5 6 15 16 126 8.5 7.5 ...
## $ hours lost in period : num [1:839] 10 7.5 7.5 7.5 6 7.5 8 5.5 8.5 7.5 ...
## $ type : chr [1:839] "Medical" "Medical" "Medical" "Medical" ...
## $ department_name2 : chr [1:839] "Others" "OHP" "Non Clinical" "Med Staff" ...
## $ department name...19 : chr [1:839] "A & E (St Helier) - Med Staff" "Pharmacy (Epsom) - OHP" "Infection Control - Non Clinical" "Cardiology (St Helier) - Med Staff" ...
# At first glance the data contains 10 numerical variables, 4 character variables, and 3 data variable
Data Cleaning
anyNA(abs_data)
## [1] TRUE
# Data included missing values
Check to see which variables have missing value
colSums(is.na(abs_data))
## id employee id
## 0 0
## department name...3 staff group
## 0 0
## location manager id
## 0 171
## fte absence start
## 0 0
## last absent day rtw date
## 0 0
## days lost days lost in period
## 0 0
## calendar days lost calendardays lost in period
## 0 0
## hours lost hours lost in period
## 0 0
## type department_name2
## 0 0
## department name...19
## 0
# All missing value was from the managers ID
Remove manager ID variable
abs_data <- abs_data[-c(6)]
Checking variables distribution to decide which is important
table(abs_data$department_name2)
##
## EGH Med Staff Non Clinical Nursing OHP Others
## 95 23 46 253 94 197
## STH
## 131
# Deparment variabel has so many levels, its best to recode
I used this line of code to clean and recreate the department name
and finished the rest in excel.
# The data shows similar pattern, and can splitted on "-"
# load stringr library
#library(stringr)
#abs_data2[c('Prefix', 'department_name2')] <- str_split_fixed(abs_data2$`department name`, '-', 2)
Here are the summary of the numerical data sets
summary(abs_data
%>% select_if(is.numeric))
## fte days lost days lost in period calendar days lost
## Min. :0.170 Min. : 0.25 Min. : 0.000 Min. : 1.00
## 1st Qu.:0.800 1st Qu.: 1.00 1st Qu.: 1.000 1st Qu.: 2.00
## Median :1.000 Median : 2.00 Median : 2.000 Median : 5.00
## Mean :0.886 Mean : 11.14 Mean : 4.338 Mean : 19.92
## 3rd Qu.:1.000 3rd Qu.: 6.00 3rd Qu.: 4.250 3rd Qu.: 13.00
## Max. :1.000 Max. :336.00 Max. :27.000 Max. :469.00
## calendardays lost in period hours lost hours lost in period
## Min. : 0.0 Min. : 2.00 Min. : 0.00
## 1st Qu.: 2.0 1st Qu.: 11.50 1st Qu.: 9.00
## Median : 4.0 Median : 22.50 Median : 16.50
## Mean : 7.9 Mean : 91.56 Mean : 36.93
## 3rd Qu.: 8.0 3rd Qu.: 50.50 3rd Qu.: 37.50
## Max. :32.0 Max. :2520.00 Max. :276.00
and here is the categorical feature along with the number of unique
values:
abs_data %>% select_if(is.character) %>%
summarise_all(~n_distinct(.)) %>%
pivot_longer(., everything(), names_to = "columns", values_to = "count_unique_values")
Correlation Visauls
pairs(abs_data
%>% select_if(is.numeric))

library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
#Correlation Visual
ggcorr(abs_data, method = c("everything", "pearson"))
## Warning in ggcorr(abs_data, method = c("everything", "pearson")): data in
## column(s) 'staff group', 'location', 'absence start', 'last absent day', 'rtw
## date', 'type', 'department_name2' are not numeric and were ignored

Correlation Values
library(readr)
corr_results<- cor(abs_data
%>% select_if(is.numeric))
round(corr_results, 2)
## fte days lost days lost in period
## fte 1.00 -0.01 0.02
## days lost -0.01 1.00 0.71
## days lost in period 0.02 0.71 1.00
## calendar days lost -0.09 0.97 0.68
## calendardays lost in period -0.12 0.64 0.91
## hours lost 0.02 0.98 0.71
## hours lost in period 0.08 0.66 0.96
## calendar days lost calendardays lost in period
## fte -0.09 -0.12
## days lost 0.97 0.64
## days lost in period 0.68 0.91
## calendar days lost 1.00 0.69
## calendardays lost in period 0.69 1.00
## hours lost 0.97 0.66
## hours lost in period 0.65 0.91
## hours lost hours lost in period
## fte 0.02 0.08
## days lost 0.98 0.66
## days lost in period 0.71 0.96
## calendar days lost 0.97 0.65
## calendardays lost in period 0.66 0.91
## hours lost 1.00 0.70
## hours lost in period 0.70 1.00
abc<- data.frame(round(corr_results, 2))
write_csv(abc, "Correlation result.csv")
distribution of the type
table(abs_data$type )
##
## Medical Non-Medical
## 724 115
Percentage and graphical representation of the type
type<- abs_data %>% select(type) %>% count(type) %>%
mutate(percent=round((n/sum(n))*100,2),
lab_ypos = cumsum(percent) - 0.7*percent) %>%
ggplot(aes(x=2, y=percent, fill = factor(type, levels = c("Non-Medical", "Medical")))) +
geom_bar(stat="identity", start=0) +
coord_polar(theta = "y", start=0) +
geom_text(aes(y = lab_ypos,
label = paste0(percent,' ','%')), color = "white") +
theme_void() + theme(legend.position = "bottom") + xlim(0.5, 2.5) +
labs(title = "Percenatage of Absentees by Type ", fill = "type")
## Warning: Ignoring unknown parameters: start
ggsave(filename = "type.png",height=8, width=10, dpi = "print")
type

Initial distribution of location(Before reclassifying)
table(abs_data$location)
##
## Epsom Epsom and St Helier (across sites)
## 215 31
## Leatherhead Orchard Hill
## 1 1
## Queen Marys St Helier
## 1 517
## Sutton SWLEOC
## 12 61
Reclassifying to get fancy result
library(forcats)
#Location
abs_data$location<- car::Recode(abs_data$location,
recodes="'Leatherhead'='Others'; 'Orchard Hill'='Others'; 'Queen Marys'='Others' ;
'Epsom and St Helier (across sites)'='Epsom'",
as.factor=T)
abs_data$location<-fct_relevel(abs_data$location,'St Helier','Epsom','SWLEOC', 'Sutton', 'Others')
Percentage distribution by location
round((prop.table(table(abs_data$location)))*100,2)
##
## St Helier Epsom SWLEOC Sutton Others
## 61.62 29.32 7.27 1.43 0.36
Graphical representation of the location
myplot <- ggplot(abs_data, aes(location)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
scale_y_continuous(labels=scales::percent) +
ylab("Percent of Absentees") + xlab(" Location ") + labs(title = "Percenatage of Absentees by Location") +theme_nice
ggsave(filename = "location.png",height=8, width=10, dpi = "print")
myplot

Distribution of the staff group data and reclassifying
table(abs_data$`staff group`)
##
## Add Prof Scientific and Technic Additional Clinical Services
## 25 231
## Administrative and Clerical Allied Health Professionals
## 187 25
## Estates and Ancillary Healthcare Scientists
## 32 24
## Medical and Dental Nursing and Midwifery Registered
## 46 269
abs_data$`staff group`<- car::Recode(abs_data$`staff group`,
recodes="'Add Prof Scientific and Technic'='Sci & Tec'; 'Additional Clinical Services'='Cli Serv'; 'Administrative and Clerical'='Adm & Cl' ; 'Estates and Ancillary'='Est & Anc' ; 'Healthcare Scientists'='Healc Sci' ;
'Medical and Dental'='Med & Den' ; 'Nursing and Midwifery Registered'='Nurs & Mid' ;
'Allied Health Professionals'='Health Prof'",
as.factor=T)
table(abs_data$`staff group`)
##
## Adm & Cl Cli Serv Est & Anc Healc Sci Health Prof Med & Den
## 187 231 32 24 25 46
## Nurs & Mid Sci & Tec
## 269 25
abs_data$`staff group`<-fct_relevel(abs_data$`staff group`,'Nurs & Mid','Cli Serv','Adm & Cl', 'Med & Den', 'Est & Anc', 'Health Prof', 'Sci & Tec', 'Healc Sci')
myplot2 <- ggplot(abs_data, aes(`staff group`)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
scale_y_continuous(labels=scales::percent) +
ylab("Percent of Absentees") + xlab(" Staff Group ") + labs(title = "Percenatage of Absentees by Staff group") +theme_nice
ggsave(filename = "staff_group.png",height=8, width=10, dpi = "print")
myplot2

Distribution of the department data after cleaning in R and
Excel
table(abs_data$department_name2)
##
## EGH Med Staff Non Clinical Nursing OHP Others
## 95 23 46 253 94 197
## STH
## 131
abs_data$department_name2<-fct_relevel(abs_data$department_name2,'Nursing','Others','STH', 'EGH', 'OHP', 'Non Clinical', 'Med Staff')
Percentage distribution of the department variable
myplot3 <- ggplot(abs_data, aes(department_name2)) +
geom_bar(aes(y = (..count..)/sum(..count..))) +
scale_y_continuous(labels=scales::percent) +
ylab("Percent of Absentees") + xlab("Department Name") + labs(title = "Percenatage of Absentees by Department")+theme_nice
ggsave(filename = "Department_Name.png",height=8, width=10, dpi = "print")
myplot3

---
title: "Attrition Analysis"
author: "Gloria Falomo"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
   html_document:
    df_print: paged
    fig_height: 7
    fig_width: 7
    toc: yes
    toc_float: yes
    code_download: true
---

# Load libraries
```{r}
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
```

# theme for nice plotting
```{r}

theme_nice <- theme_classic()+
                theme(
                  axis.line.y.left = element_line(colour = "black"),
                  axis.line.y.right = element_line(colour = "black"),
                  axis.line.x.bottom = element_line(colour = "black"),
                  axis.line.x.top = element_line(colour = "black"),
                  axis.text.y = element_text(colour = "black", size = 12),
                  axis.text.x = element_text(color = "black", size = 12),
                  axis.ticks = element_line(color = "black")) +
                theme(
                  axis.ticks.length = unit(-0.25, "cm"), 
                  axis.text.x = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")), 
                  axis.text.y = element_text(margin=unit(c(0.5,0.5,0.5,0.5), "cm")))
```



# Load Data
```{r}
# Absences data
abs_data <- read_excel("Anon_ESTH_Absences.xlsx")
```

# Making all variable lower case
```{r}
names(abs_data) <- tolower(x =  names(abs_data))

```


# Viewing the structure of the data
```{r}
str(abs_data)

# At first glance the data contains 10 numerical variables, 4 character variables, and 3 data variable
```



# Data Cleaning
```{r}
anyNA(abs_data)
# Data included missing values
```

# Check to see which variables have missing value 
```{r}
colSums(is.na(abs_data))

# All missing value was from the managers ID 
```

# Remove manager ID variable 
```{r}
abs_data <- abs_data[-c(6)]
```

# Checking variables distribution to decide which is important

```{r}
table(abs_data$department_name2)
# Deparment variabel has so many levels, its best to recode
```




# I used this line of code to clean and recreate the department name and finished the rest in excel. 
```{r}
# The data shows similar pattern, and can splitted on "-"
# load stringr library
#library(stringr)
 
#abs_data2[c('Prefix', 'department_name2')] <- str_split_fixed(abs_data2$`department name`, '-', 2)
 
```



# There are some variables that can be removed as they do not give useful information nor relevant to the analysis

```{r}
abs_data <- abs_data %>% select(-c("employee id", "id", "department name...19", "department name...3"))

```


# Here are the summary of the numerical data sets

```{r}
 summary(abs_data 
        %>% select_if(is.numeric))
```

# and here is the categorical feature along with the number of unique values:

```{r}
abs_data %>% select_if(is.character) %>%
  summarise_all(~n_distinct(.)) %>% 
 pivot_longer(., everything(), names_to = "columns", values_to = "count_unique_values")
```




# Correlation Visauls
```{r}
pairs(abs_data 
        %>% select_if(is.numeric))
```

```{r}
library(GGally)
#Correlation Visual
ggcorr(abs_data, method = c("everything", "pearson"))
```

# Correlation Values
```{r}
library(readr)
corr_results<- cor(abs_data 
        %>% select_if(is.numeric))

round(corr_results, 2)
abc<- data.frame(round(corr_results, 2))

write_csv(abc, "Correlation result.csv")  
```

# distribution of the type
```{r}
table(abs_data$type )
```


# Percentage and graphical representation of the type
```{r}
type<- abs_data %>% select(type) %>% count(type) %>% 
  mutate(percent=round((n/sum(n))*100,2), 
         lab_ypos = cumsum(percent) - 0.7*percent) %>% 
  ggplot(aes(x=2, y=percent, fill = factor(type, levels = c("Non-Medical", "Medical")))) +
  geom_bar(stat="identity", start=0) +
  coord_polar(theta = "y", start=0) +
  geom_text(aes(y = lab_ypos, 
                label = paste0(percent,' ','%')), color = "white") +
  theme_void() + theme(legend.position = "bottom") + xlim(0.5, 2.5) +
  labs(title = "Percenatage of Absentees by Type ", fill = "type")

ggsave(filename = "type.png",height=8, width=10, dpi = "print")

type

```



# Initial distribution of location(Before reclassifying)
```{r}
table(abs_data$location)
```

# Reclassifying to get fancy result
```{r}
library(forcats)
#Location
abs_data$location<- car::Recode(abs_data$location,
                     recodes="'Leatherhead'='Others'; 'Orchard Hill'='Others'; 'Queen Marys'='Others' ; 
                     'Epsom and St Helier (across sites)'='Epsom'",
                    as.factor=T)

abs_data$location<-fct_relevel(abs_data$location,'St Helier','Epsom','SWLEOC', 'Sutton', 'Others') 
```


# Percentage distribution by location
```{r}
 round((prop.table(table(abs_data$location)))*100,2)
```

# Graphical representation of the location
```{r}
myplot <- ggplot(abs_data, aes(location)) + 
          geom_bar(aes(y = (..count..)/sum(..count..))) + 
          scale_y_continuous(labels=scales::percent) +
  ylab("Percent of Absentees") + xlab(" Location ")  + labs(title = "Percenatage of Absentees by Location") +theme_nice
ggsave(filename = "location.png",height=8, width=10, dpi = "print")

myplot 
```

# Distribution of the staff group data and reclassifying
```{r}
table(abs_data$`staff group`)


abs_data$`staff group`<- car::Recode(abs_data$`staff group`,
                     recodes="'Add Prof Scientific and Technic'='Sci & Tec'; 'Additional Clinical Services'='Cli Serv'; 'Administrative and Clerical'='Adm & Cl' ; 'Estates and Ancillary'='Est & Anc' ; 'Healthcare Scientists'='Healc Sci' ; 
                     'Medical and Dental'='Med & Den' ; 'Nursing and Midwifery Registered'='Nurs & Mid' ; 
                     'Allied Health Professionals'='Health Prof'",
                    as.factor=T)
table(abs_data$`staff group`)

abs_data$`staff group`<-fct_relevel(abs_data$`staff group`,'Nurs & Mid','Cli Serv','Adm & Cl', 'Med & Den', 'Est & Anc', 'Health Prof', 'Sci & Tec', 'Healc Sci')
```


```{r}
myplot2 <- ggplot(abs_data, aes(`staff group`)) + 
          geom_bar(aes(y = (..count..)/sum(..count..))) + 
          scale_y_continuous(labels=scales::percent) +
  ylab("Percent of Absentees") + xlab(" Staff Group ") + labs(title = "Percenatage of Absentees by Staff group") +theme_nice
ggsave(filename = "staff_group.png",height=8, width=10, dpi = "print")
myplot2

```

# Distribution of the department data after cleaning in R and Excel
```{r}
table(abs_data$department_name2)

abs_data$department_name2<-fct_relevel(abs_data$department_name2,'Nursing','Others','STH', 'EGH', 'OHP', 'Non Clinical', 'Med Staff')
```

# Percentage distribution of the department variable
```{r}
myplot3 <- ggplot(abs_data, aes(department_name2)) + 
          geom_bar(aes(y = (..count..)/sum(..count..))) + 
          scale_y_continuous(labels=scales::percent) +
  ylab("Percent of Absentees") + xlab("Department Name") + labs(title = "Percenatage of Absentees by Department")+theme_nice

ggsave(filename = "Department_Name.png",height=8, width=10, dpi = "print")

myplot3 
```



