Interactive Exploratory Data Analysis and Summary Statistics for Attrition Analysis

loading the data visualization libraries

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.0.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.5     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## Warning: package 'ggplot2' was built under R version 4.0.3

## Warning: package 'tibble' was built under R version 4.0.3

## Warning: package 'tidyr' was built under R version 4.0.3

## Warning: package 'readr' was built under R version 4.0.3

## Warning: package 'purrr' was built under R version 4.0.3

## Warning: package 'dplyr' was built under R version 4.0.3

## Warning: package 'stringr' was built under R version 4.0.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(readr)
library(devtools)

## Warning: package 'devtools' was built under R version 4.0.3

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.0.3

#install_github("ramnathv/htmlwidgets") 
#install_github("smartinsightsfromdata/rpivotTable")
library(rpivotTable)
library(easyalluvial)
library(parcats)

## Warning: package 'parcats' was built under R version 4.0.3

library(ggplot2)
library(plotly)

## Warning: package 'plotly' was built under R version 4.0.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(gapminder)

## Warning: package 'gapminder' was built under R version 4.0.3

library(rgl)

## Warning: package 'rgl' was built under R version 4.0.3

library(ggmosaic)

## Warning: package 'ggmosaic' was built under R version 4.0.3

loading and reading the data

# Read the HR employee attrition CSV; readr guesses the column types.
# NOTE(review): the path is absolute and specific to one machine -- confirm it
# before running elsewhere.
WA_Fn_UseC_HR_Employee_Attrition <- readr::read_csv(
  file = "C:/Users/Nikhil/Desktop/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   .default = col_double(),
##   Attrition = col_character(),
##   BusinessTravel = col_character(),
##   Department = col_character(),
##   EducationField = col_character(),
##   Gender = col_character(),
##   JobRole = col_character(),
##   MaritalStatus = col_character(),
##   Over18 = col_character(),
##   OverTime = col_character()
## )
## i Use `spec()` for the full column specifications.

# Open the spreadsheet-style viewer only when someone is at the console;
# View() has no viewer to open when the document is knitted or run in batch.
if (interactive()) {
  View(WA_Fn_UseC_HR_Employee_Attrition)
}

# Shorter working name used by all of the code below.
dataHR <- WA_Fn_UseC_HR_Employee_Attrition

# Print each column's name, type and first few values.
str(dataHR)

## tibble [1,470 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                     : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
##  $ Attrition               : chr [1:1470] "Yes" "No" "Yes" "No" ...
##  $ BusinessTravel          : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
##  $ DailyRate               : num [1:1470] 1102 279 1373 1392 591 ...
##  $ Department              : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
##  $ DistanceFromHome        : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
##  $ Education               : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
##  $ EducationField          : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
##  $ EmployeeCount           : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
##  $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
##  $ Gender                  : chr [1:1470] "Female" "Male" "Male" "Female" ...
##  $ HourlyRate              : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
##  $ JobInvolvement          : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
##  $ JobLevel                : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
##  $ JobRole                 : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
##  $ JobSatisfaction         : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
##  $ MaritalStatus           : chr [1:1470] "Single" "Married" "Single" "Married" ...
##  $ MonthlyIncome           : num [1:1470] 5993 5130 2090 2909 3468 ...
##  $ MonthlyRate             : num [1:1470] 19479 24907 2396 23159 16632 ...
##  $ NumCompaniesWorked      : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
##  $ Over18                  : chr [1:1470] "Y" "Y" "Y" "Y" ...
##  $ OverTime                : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
##  $ PercentSalaryHike       : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
##  $ PerformanceRating       : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
##  $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
##  $ StandardHours           : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
##  $ TotalWorkingYears       : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
##  $ TrainingTimesLastYear   : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
##  $ WorkLifeBalance         : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
##  $ YearsInCurrentRole      : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
##  $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
##  $ YearsWithCurrManager    : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Attrition = col_character(),
##   ..   BusinessTravel = col_character(),
##   ..   DailyRate = col_double(),
##   ..   Department = col_character(),
##   ..   DistanceFromHome = col_double(),
##   ..   Education = col_double(),
##   ..   EducationField = col_character(),
##   ..   EmployeeCount = col_double(),
##   ..   EmployeeNumber = col_double(),
##   ..   EnvironmentSatisfaction = col_double(),
##   ..   Gender = col_character(),
##   ..   HourlyRate = col_double(),
##   ..   JobInvolvement = col_double(),
##   ..   JobLevel = col_double(),
##   ..   JobRole = col_character(),
##   ..   JobSatisfaction = col_double(),
##   ..   MaritalStatus = col_character(),
##   ..   MonthlyIncome = col_double(),
##   ..   MonthlyRate = col_double(),
##   ..   NumCompaniesWorked = col_double(),
##   ..   Over18 = col_character(),
##   ..   OverTime = col_character(),
##   ..   PercentSalaryHike = col_double(),
##   ..   PerformanceRating = col_double(),
##   ..   RelationshipSatisfaction = col_double(),
##   ..   StandardHours = col_double(),
##   ..   StockOptionLevel = col_double(),
##   ..   TotalWorkingYears = col_double(),
##   ..   TrainingTimesLastYear = col_double(),
##   ..   WorkLifeBalance = col_double(),
##   ..   YearsAtCompany = col_double(),
##   ..   YearsInCurrentRole = col_double(),
##   ..   YearsSinceLastPromotion = col_double(),
##   ..   YearsWithCurrManager = col_double()
##   .. )

# Per-column summary: quantiles and mean for numeric columns, length/class/mode
# for character columns.
summary(object = dataHR)

##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:1470        Length:1470        Min.   : 102.0  
##  1st Qu.:30.00   Class :character   Class :character   1st Qu.: 465.0  
##  Median :36.00   Mode  :character   Mode  :character   Median : 802.0  
##  Mean   :36.92                                         Mean   : 802.5  
##  3rd Qu.:43.00                                         3rd Qu.:1157.0  
##  Max.   :60.00                                         Max.   :1499.0  
##   Department        DistanceFromHome   Education     EducationField    
##  Length:1470        Min.   : 1.000   Min.   :1.000   Length:1470       
##  Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
##                     Mean   : 9.193   Mean   :2.913                     
##                     3rd Qu.:14.000   3rd Qu.:4.000                     
##                     Max.   :29.000   Max.   :5.000                     
##  EmployeeCount EmployeeNumber   EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   1.0   Min.   :1.000           Length:1470       
##  1st Qu.:1     1st Qu.: 491.2   1st Qu.:2.000           Class :character  
##  Median :1     Median :1020.5   Median :3.000           Mode  :character  
##  Mean   :1     Mean   :1024.9   Mean   :2.722                             
##  3rd Qu.:1     3rd Qu.:1555.8   3rd Qu.:4.000                             
##  Max.   :1     Max.   :2068.0   Max.   :4.000                             
##    HourlyRate     JobInvolvement    JobLevel       JobRole         
##  Min.   : 30.00   Min.   :1.00   Min.   :1.000   Length:1470       
##  1st Qu.: 48.00   1st Qu.:2.00   1st Qu.:1.000   Class :character  
##  Median : 66.00   Median :3.00   Median :2.000   Mode  :character  
##  Mean   : 65.89   Mean   :2.73   Mean   :2.064                     
##  3rd Qu.: 83.75   3rd Qu.:3.00   3rd Qu.:3.000                     
##  Max.   :100.00   Max.   :4.00   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.000   Length:1470        Min.   : 1009   Min.   : 2094  
##  1st Qu.:2.000   Class :character   1st Qu.: 2911   1st Qu.: 8047  
##  Median :3.000   Mode  :character   Median : 4919   Median :14236  
##  Mean   :2.729                      Mean   : 6503   Mean   :14313  
##  3rd Qu.:4.000                      3rd Qu.: 8379   3rd Qu.:20462  
##  Max.   :4.000                      Max.   :19999   Max.   :26999  
##  NumCompaniesWorked    Over18            OverTime         PercentSalaryHike
##  Min.   :0.000      Length:1470        Length:1470        Min.   :11.00    
##  1st Qu.:1.000      Class :character   Class :character   1st Qu.:12.00    
##  Median :2.000      Mode  :character   Mode  :character   Median :14.00    
##  Mean   :2.693                                            Mean   :15.21    
##  3rd Qu.:4.000                                            3rd Qu.:18.00    
##  Max.   :9.000                                            Max.   :25.00    
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.154     Mean   :2.712            Mean   :80    Mean   :0.7939  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.28     Mean   :2.799         Mean   :2.761   Mean   : 7.008  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 3.000     Median : 1.000          Median : 3.000      
##  Mean   : 4.229     Mean   : 2.188          Mean   : 4.123      
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.000      
##  Max.   :18.000     Max.   :15.000          Max.   :17.000

# Number of rows and columns in the data set.
dim(x = dataHR)

## [1] 1470   35

Bubblechart

## Warning: Using size for a discrete variable is not advised.

Interactive Heatmaps

## Warning: package 'heatmaply' was built under R version 4.0.3

## Loading required package: viridis

## Warning: package 'viridis' was built under R version 4.0.3

## Loading required package: viridisLite

## Warning: package 'viridisLite' was built under R version 4.0.3

## 
## ======================
## Welcome to heatmaply version 1.2.1
## 
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
## 
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================

Interactive Histograms for visualizing the numerical variables

Interactive Subplots

Interactive Histograms

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Alluvial Plots

# parcats is required by this section, so attach it with library(), which
# stops with an error if the package is missing; require() would only return
# FALSE and let the failure surface later at the parcats() call.
suppressPackageStartupMessages(library(parcats))

# Build a wide-format alluvial plot from dataHR, limited to 7 variables.
p <- alluvial_wide(dataHR, max_variables = 7)

# Convert the alluvial plot into an interactive parallel-categories widget
# with marginal histograms; data_input supplies the underlying data.
parcats(p, marginal_histograms = TRUE, data_input = dataHR)

Boxplots

# Shared base plot: HourlyRate on the y axis, drawn in translucent black, with
# only suspected outliers shown as individual points.
p <- plot_ly(
  data = dataHR,
  y = ~HourlyRate,
  color = I("black"),
  alpha = 0.1,
  boxpoints = "suspectedoutliers"
)

# One box-plot panel per grouping variable, both derived from the base plot.
p1 <- add_boxplot(p, x = ~Gender)
p2 <- add_boxplot(p, x = ~Attrition)

# Put the panels side by side on a common y axis, using 20% / 80% of the
# width and no margin between them.
subplot(p1, p2, widths = c(0.2, 0.8), margin = 0, shareY = TRUE)

## Interactive Boxplots for visualizing the numerical and categorical variables

 # Horizontal box plots of HourlyRate for every Attrition x JobLevel
 # combination, with one coloured box per Gender; the y-axis title is blanked.
 plot_ly(
   data = dataHR,
   x = ~HourlyRate,
   y = ~interaction(Attrition, JobLevel)
 ) %>%
   add_boxplot(color = ~Gender) %>%
   layout(yaxis = list(title = ""))

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

# 3D Scatter plots

# 3D scatter of salary hike, years at the company and daily rate, with markers
# coloured by Attrition. Every axis uses a bare column name so plot_ly()
# resolves it inside `dataHR`; writing `~dataHR$PercentSalaryHike` bypassed the
# data argument and put "dataHR$PercentSalaryHike" on the x-axis title.
plot_ly(
  dataHR,
  x = ~PercentSalaryHike,
  y = ~YearsAtCompany,
  z = ~DailyRate
) %>%
  add_markers(color = ~Attrition)

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels

Correlation plots

# Correlate only the numeric columns that actually vary. A constant column has
# zero standard deviation, which makes cor() warn and fill that column's row
# and column of the matrix with NA (blank bands in the heatmap).
corr <- cor(
  Filter(
    function(column) is.numeric(column) && length(unique(column)) > 1L,
    dataHR
  )
)

## Warning in cor(dplyr::select_if(dataHR, is.numeric)): the standard deviation is
## zero

# Interactive heatmap of the correlation matrix on a red-blue palette, with
# the colour bar pinned to the full correlation range [-1, 1].
plot_ly(colors = "RdBu") %>%
  add_heatmap(
    z = corr,
    x = rownames(corr),
    y = colnames(corr)
  ) %>%
  colorbar(limits = c(-1, 1))

Scatter plots

library(htmlwidgets)

# Scatter plot of MonthlyIncome against Age. The trace type and mode are given
# explicitly so plotly does not have to infer them (which prints "No trace
# type specified" / "No scatter mode specifed" messages).
# onRender() attaches JavaScript handlers that log the hover, click and
# selection event payloads to the browser console.
plot_ly(
  dataHR,
  x = ~Age,
  y = ~MonthlyIncome,
  type = "scatter",
  mode = "markers"
) %>%
  onRender("
    function(el) { 
      el.on('plotly_hover', function(d) { 
        console.log('Hover: ', d); 
      });
      el.on('plotly_click', function(d) { 
        console.log('Click: ', d);
      });
      el.on('plotly_selected', function(d) { 
        console.log('Select: ', d); 
      });
    }
  ")

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

library(htmlwidgets)

# Scatter plot of PercentSalaryHike against Age. The trace type and mode are
# given explicitly so plotly does not have to infer them (which prints "No
# trace type specified" / "No scatter mode specifed" messages).
# onRender() attaches JavaScript handlers that log the hover, click and
# selection event payloads to the browser console.
plot_ly(
  dataHR,
  x = ~Age,
  y = ~PercentSalaryHike,
  type = "scatter",
  mode = "markers"
) %>%
  onRender("
    function(el) { 
      el.on('plotly_hover', function(d) { 
        console.log('Hover: ', d); 
      });
      el.on('plotly_click', function(d) { 
        console.log('Click: ', d);
      });
      el.on('plotly_selected', function(d) { 
        console.log('Select: ', d); 
      });
    }
  ")

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Density plots

# Key the data by Attrition so that hovering a group in one panel highlights
# the same group in the other panel.
m <- highlight_key(dataHR, ~Attrition)

# Left panel: MonthlyIncome density, filled by Attrition.
p1 <- ggplot(m, aes(MonthlyIncome, fill = Attrition)) +
  geom_density()

# Right panel: DailyRate against MonthlyIncome. The default point shape is
# drawn with `colour` and ignores `fill`, so Attrition is mapped to colour;
# with `fill` every point was drawn in the same colour.
p2 <- ggplot(m, aes(MonthlyIncome, DailyRate, colour = Attrition)) +
  geom_point()

subplot(p1, p2) %>%
  hide_legend() %>%
  highlight("plotly_hover")

## Warning: All elements of `...` must be named.
## Did you want `key = c(key)`?

## Setting the `off` event (i.e., 'plotly_doubleclick') to match the `on` event (i.e., 'plotly_hover'). You can change this default via the `highlight()` function.

# Key the data by Attrition so that hovering a group in one panel highlights
# the same group in the other panel.
m <- highlight_key(dataHR, ~Attrition)

# Left panel: PercentSalaryHike density, filled by Attrition.
p1 <- ggplot(m, aes(PercentSalaryHike, fill = Attrition)) +
  geom_density()

# Right panel: PercentSalaryHike against MonthlyIncome. The default point
# shape is drawn with `colour` and ignores `fill`, so Attrition is mapped to
# colour; with `fill` every point was drawn in the same colour.
p2 <- ggplot(m, aes(MonthlyIncome, PercentSalaryHike, colour = Attrition)) +
  geom_point()

subplot(p1, p2) %>%
  hide_legend() %>%
  highlight("plotly_hover")

## Warning: All elements of `...` must be named.
## Did you want `key = c(key)`?

## Setting the `off` event (i.e., 'plotly_doubleclick') to match the `on` event (i.e., 'plotly_hover'). You can change this default via the `highlight()` function.

Interactive Pivot Table visualization

# Interactive pivot table: Education in rows, Attrition in columns, rendered
# as a treemap aggregated over JobLevel.
# The aggregator name has to match the pivottable.js aggregator name exactly;
# the previous value carried a trailing space and so did not match
# "Count Unique Values". The column argument is spelled out as `cols` rather
# than relying on partial matching of `col`.
rpivotTable(
  dataHR,
  rows = "Education",
  cols = "Attrition",
  aggregatorName = "Count Unique Values",
  vals = "JobLevel",
  rendererName = "Treemap"
)

Summary Stats Plots

# Seed the random number generator so repeated runs give the same results.
set.seed(seed = 123)
library(ggstatsplot)

## Warning: package 'ggstatsplot' was built under R version 4.0.3

## Registered S3 methods overwritten by 'lme4':
##   method                          from
##   cooks.distance.influence.merMod car 
##   influence.merMod                car 
##   dfbeta.influence.merMod         car 
##   dfbetas.influence.merMod        car

## In case you would like cite this package, cite it as:
##      Patil, I. (2018). ggstatsplot: "ggplot2" Based Plots with Statistical Details. CRAN.
##      Retrieved from https://cran.r-project.org/web/packages/ggstatsplot/index.html

# Between-groups comparison of MonthlyIncome across the Department groups.
ggstatsplot::ggbetweenstats(
  title = "Distribution of Monthly Income across Department",
  data = dataHR,
  y = MonthlyIncome,
  x = Department
)

library(wesanderson)

## Warning: package 'wesanderson' was built under R version 4.0.3

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.0.3

# Stacked bar charts of Gender within each BusinessTravel category, one panel
# per Attrition value, arranged in two rows.
# The axis label and the overall annotation title now describe the HR
# variables being plotted; the previous "Party affiliation" / "Race, religion,
# and political affiliation" text belonged to an unrelated example data set.
ggstatsplot::grouped_ggbarstats(
  data = dataHR,
  x = Gender,
  y = BusinessTravel,
  grouping.var = Attrition,
  title.prefix = "Attrition",
  label = "both",
  xlab = "Business travel",
  package = "wesanderson",
  palette = "Darjeeling2",
  ggtheme = ggthemes::theme_tufte(base_size = 5),
  ggstatsplot.layer = FALSE,
  annotation.args = list(title = "Gender and business travel by attrition"),
  plotgrid.args = list(nrow = 2)
)

## Warning in (function (x, y = NULL, correct = TRUE, p = rep(1/length(x), : Chi-
## squared approximation may be incorrect

# Pie charts of the Department split, one pie per Attrition value.
ggstatsplot::ggpiestats(
  data = dataHR,
  y = Attrition,
  x = Department,
  # plot title and legend title
  title = "Dataset: HR Attrition",
  legend.title = "Department",
  # plotmath caption with "Source" set in italics
  caption = substitute(
    paste(italic("Source"), ": Employee Attrition Analysis Kaggle")
  )
)

# Pie charts of the JobLevel split, one pie per Attrition value.
ggstatsplot::ggpiestats(
  data = dataHR,
  y = Attrition,
  x = JobLevel,
  # plot title and legend title
  title = "Dataset: HR Attrition",
  legend.title = "Job level ",
  # plotmath caption with "Source" set in italics
  caption = substitute(
    paste(italic("Source"), ": Employee Attrition Analysis Kaggle")
  )
)

# Pie charts of the BusinessTravel split, one pie per Attrition value.
ggstatsplot::ggpiestats(
  data = dataHR,
  y = Attrition,
  x = BusinessTravel,
  # plot title and legend title
  title = "Dataset: HR Attrition",
  legend.title = "BusinessTravel",
  # plotmath caption with "Source" set in italics
  caption = substitute(
    paste(italic("Source"), ": Employee Attrition Analysis Kaggle")
  )
)

# Between-groups comparison of MonthlyIncome across the EducationField groups.
ggstatsplot::ggbetweenstats(
  title = "Distribution of Monthly Income across EducationField",
  data = dataHR,
  y = MonthlyIncome,
  x = EducationField
)

# Between-groups comparison of MonthlyIncome across the JobSatisfaction levels.
ggstatsplot::ggbetweenstats(
  title = "Distribution of Monthly Income across JobSatisfaction",
  data = dataHR,
  y = MonthlyIncome,
  x = JobSatisfaction
)

# Between-groups comparison of MonthlyIncome across the MaritalStatus groups.
ggstatsplot::ggbetweenstats(
  title = "Distribution of Monthly Income across  MaritalStatus",
  data = dataHR,
  y = MonthlyIncome,
  x = MaritalStatus
)

# Scatter plot with statistical annotations: MonthlyRate against Age.
ggstatsplot::ggscatterstats(
  title = "Understanding Age through MonthlyRate",
  data = dataHR,
  x = Age,
  xlab = "Age",
  y = MonthlyRate,
  ylab = "MonthlyRate"
)

# Scatter plot with statistical annotations: MonthlyIncome against
# PercentSalaryHike.
ggstatsplot::ggscatterstats(
  title = "Understanding PercentSalaryHike through MonthlyIncome",
  data = dataHR,
  x = PercentSalaryHike,
  xlab = "PercentSalaryHike",
  y = MonthlyIncome,
  ylab = "MonthlyIncome"
)

Interactive Exploratory Data Analysis and Summary Statistics for Attrition Analysis

Akshata Kishore Moharir

2/16/2021

loading the data visualization libraries

loading and reading the data

Bubblechart

Interactive Heatmaps

Interactive Histograms for visualizing the numerical variables

Interactive Subplots

Interactive Histograms

Alluvial Plots

Boxplots

# 3D Scatter plots

Correlation plots

Scatter plots

Density plots

Interactive Pivot Table visualization

Summary Stats Plots