library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.5 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'tidyr' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'purrr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.3
## Warning: package 'stringr' was built under R version 4.0.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(readr)
library(devtools)
## Warning: package 'devtools' was built under R version 4.0.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.0.3
#install_github("ramnathv/htmlwidgets")
#install_github("smartinsightsfromdata/rpivotTable")
library(rpivotTable)
library(easyalluvial)
library(parcats)
## Warning: package 'parcats' was built under R version 4.0.3
library(ggplot2)
library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.0.3
library(rgl)
## Warning: package 'rgl' was built under R version 4.0.3
library(ggmosaic)
## Warning: package 'ggmosaic' was built under R version 4.0.3
# Location of the HR employee attrition CSV. The original machine-specific
# path stays as the default; set the HR_ATTRITION_CSV environment variable to
# run the script against a copy stored elsewhere.
hr_csv_path <- Sys.getenv(
  "HR_ATTRITION_CSV",
  unset = "C:/Users/Nikhil/Desktop/WA_Fn-UseC_-HR-Employee-Attrition.csv"
)
# Fail early with an actionable message instead of a bare reader error.
if (!file.exists(hr_csv_path)) {
  stop(
    "HR attrition CSV not found at '", hr_csv_path,
    "'; set HR_ATTRITION_CSV to its location.",
    call. = FALSE
  )
}
WA_Fn_UseC_HR_Employee_Attrition <- read_csv(hr_csv_path)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## Attrition = col_character(),
## BusinessTravel = col_character(),
## Department = col_character(),
## EducationField = col_character(),
## Gender = col_character(),
## JobRole = col_character(),
## MaritalStatus = col_character(),
## Over18 = col_character(),
## OverTime = col_character()
## )
## i Use `spec()` for the full column specifications.
# View() opens a data viewer window, which is only meaningful in an
# interactive session; skip it when the script is run non-interactively.
if (interactive()) {
  View(WA_Fn_UseC_HR_Employee_Attrition)
}
# Short alias used by every plot below.
dataHR <- WA_Fn_UseC_HR_Employee_Attrition
# Print the column names, types and first values.
str(dataHR)
## tibble [1,470 x 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr [1:1470] "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : num [1:1470] 1102 279 1373 1392 591 ...
## $ Department : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr [1:1470] "Female" "Male" "Male" "Female" ...
## $ HourlyRate : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr [1:1470] "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num [1:1470] 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num [1:1470] 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr [1:1470] "Y" "Y" "Y" "Y" ...
## $ OverTime : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. Department = col_character(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. EducationField = col_character(),
## .. EmployeeCount = col_double(),
## .. EmployeeNumber = col_double(),
## .. EnvironmentSatisfaction = col_double(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. JobLevel = col_double(),
## .. JobRole = col_character(),
## .. JobSatisfaction = col_double(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. Over18 = col_character(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. StandardHours = col_double(),
## .. StockOptionLevel = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double()
## .. )
# Per-column summary of dataHR (printed below).
summary(dataHR)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:1470 Length:1470 Min. : 102.0
## 1st Qu.:30.00 Class :character Class :character 1st Qu.: 465.0
## Median :36.00 Mode :character Mode :character Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
## Department DistanceFromHome Education EducationField
## Length:1470 Min. : 1.000 Min. :1.000 Length:1470
## Class :character 1st Qu.: 2.000 1st Qu.:2.000 Class :character
## Mode :character Median : 7.000 Median :3.000 Mode :character
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 1.0 Min. :1.000 Length:1470
## 1st Qu.:1 1st Qu.: 491.2 1st Qu.:2.000 Class :character
## Median :1 Median :1020.5 Median :3.000 Mode :character
## Mean :1 Mean :1024.9 Mean :2.722
## 3rd Qu.:1 3rd Qu.:1555.8 3rd Qu.:4.000
## Max. :1 Max. :2068.0 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 30.00 Min. :1.00 Min. :1.000 Length:1470
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.00 Median :2.000 Mode :character
## Mean : 65.89 Mean :2.73 Mean :2.064
## 3rd Qu.: 83.75 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :100.00 Max. :4.00 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:1470 Min. : 1009 Min. : 2094
## 1st Qu.:2.000 Class :character 1st Qu.: 2911 1st Qu.: 8047
## Median :3.000 Mode :character Median : 4919 Median :14236
## Mean :2.729 Mean : 6503 Mean :14313
## 3rd Qu.:4.000 3rd Qu.: 8379 3rd Qu.:20462
## Max. :4.000 Max. :19999 Max. :26999
## NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. :0.000 Length:1470 Length:1470 Min. :11.00
## 1st Qu.:1.000 Class :character Class :character 1st Qu.:12.00
## Median :2.000 Mode :character Mode :character Median :14.00
## Mean :2.693 Mean :15.21
## 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :9.000 Max. :25.00
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.154 Mean :2.712 Mean :80 Mean :0.7939
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.28 Mean :2.799 Mean :2.761 Mean : 7.008
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 3.000 Median : 1.000 Median : 3.000
## Mean : 4.229 Mean : 2.188 Mean : 4.123
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :18.000 Max. :15.000 Max. :17.000
# Number of rows and columns in dataHR.
dim(dataHR)
## [1] 1470 35
## Warning: Using size for a discrete variable is not advised.
## Warning: package 'heatmaply' was built under R version 4.0.3
## Loading required package: viridis
## Warning: package 'viridis' was built under R version 4.0.3
## Loading required package: viridisLite
## Warning: package 'viridisLite' was built under R version 4.0.3
##
## ======================
## Welcome to heatmaply version 1.2.1
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Attach parcats quietly. library() is used instead of require() so that a
# missing package stops the script here rather than returning FALSE and
# failing later at the parcats() call.
suppressPackageStartupMessages(library(parcats))
# Build an alluvial plot from dataHR (capped at 7 variables via
# max_variables) and render it with parcats, with marginal histograms enabled.
p <- alluvial_wide(dataHR, max_variables = 7)
parcats(p, marginal_histograms = TRUE, data_input = dataHR)
# Shared base plot: HourlyRate on the y axis, drawn in black at 10% opacity,
# with boxpoints set to "suspectedoutliers".
p <- plot_ly(
  dataHR,
  y = ~HourlyRate,
  color = I("black"),
  alpha = 0.1,
  boxpoints = "suspectedoutliers"
)
# Two boxplot variants of the same base: split by Gender and by Attrition.
p1 <- add_boxplot(p, x = ~Gender)
p2 <- add_boxplot(p, x = ~Attrition)
# Place both panels side by side on a shared y axis, using 20% / 80% of the
# width and no margin between them.
subplot(p1, p2, shareY = TRUE, widths = c(0.2, 0.8), margin = 0)
## Interactive boxplots for visualizing the numerical and categorical variables
# HourlyRate boxplots for every Attrition/JobLevel combination, coloured by
# Gender; the y-axis title is blanked.
hourly_rate_boxes <- add_boxplot(
  plot_ly(dataHR, x = ~HourlyRate, y = ~interaction(Attrition, JobLevel)),
  color = ~Gender
)
layout(hourly_rate_boxes, yaxis = list(title = ""))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Marker plot over three axes (PercentSalaryHike, YearsAtCompany, DailyRate),
# coloured by Attrition. Columns are referenced by bare name inside the
# formulas so they are resolved in the data passed to plot_ly() and the x-axis
# title reads "PercentSalaryHike" rather than "dataHR$PercentSalaryHike".
plot_ly(dataHR, x = ~PercentSalaryHike, y = ~YearsAtCompany, z = ~DailyRate) %>%
  add_markers(color = ~Attrition)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
## Warning in RColorBrewer::brewer.pal(N, "Set2"): minimal value for n is 3, returning requested palette with 3 different levels
# Correlation heatmap of the numeric columns.
# Constant columns (zero standard deviation, e.g. EmployeeCount and
# StandardHours in the summary above) are dropped first: cor() cannot compute
# a correlation for them, warns "the standard deviation is zero", and fills
# their rows/columns with NA.
numeric_cols <- dplyr::select_if(dataHR, is.numeric)
has_variance <- vapply(
  numeric_cols,
  function(column) isTRUE(stats::sd(column, na.rm = TRUE) > 0),
  logical(1)
)
varying_cols <- names(has_variance)[has_variance]
corr <- cor(numeric_cols[, varying_cols, drop = FALSE])
# Diverging palette with the colour bar pinned to the full [-1, 1] range.
plot_ly(colors = "RdBu") %>%
  add_heatmap(x = rownames(corr), y = colnames(corr), z = corr) %>%
  colorbar(limits = c(-1, 1))
library(htmlwidgets)
# Age vs. MonthlyIncome scatter whose hover, click and selection events are
# logged to the browser console by the JavaScript passed to onRender().
# The trace type and mode are given explicitly so plotly does not have to
# guess them and print "No trace type specified" / "No scatter mode specifed"
# messages.
plot_ly(
  dataHR,
  x = ~Age, y = ~MonthlyIncome,
  type = "scatter", mode = "markers"
) %>%
  onRender("
function(el) {
el.on('plotly_hover', function(d) {
console.log('Hover: ', d);
});
el.on('plotly_click', function(d) {
console.log('Click: ', d);
});
el.on('plotly_selected', function(d) {
console.log('Select: ', d);
});
}
")
library(htmlwidgets)
# Age vs. PercentSalaryHike scatter whose hover, click and selection events
# are logged to the browser console by the JavaScript passed to onRender().
# The trace type and mode are given explicitly so plotly does not have to
# guess them and print "No trace type specified" / "No scatter mode specifed"
# messages.
plot_ly(
  dataHR,
  x = ~Age, y = ~PercentSalaryHike,
  type = "scatter", mode = "markers"
) %>%
  onRender("
function(el) {
el.on('plotly_hover', function(d) {
console.log('Hover: ', d);
});
el.on('plotly_click', function(d) {
console.log('Click: ', d);
});
el.on('plotly_selected', function(d) {
console.log('Select: ', d);
});
}
")
library(gapminder)
# Key the data by Attrition so both panels share one highlight group.
m <- highlight_key(dataHR, ~Attrition)
# Left panel: MonthlyIncome density, filled by Attrition.
p1 <- ggplot(m, aes(MonthlyIncome, fill = Attrition)) + geom_density()
# Right panel: MonthlyIncome vs. DailyRate. Points are mapped with `colour`:
# the default point shape has no fill, so `fill = Attrition` left every point
# the same colour.
p2 <- ggplot(m, aes(MonthlyIncome, DailyRate, colour = Attrition)) +
  geom_point()
# Combine the panels, drop the legend, and highlight on hover.
subplot(p1, p2) %>% hide_legend() %>% highlight("plotly_hover")
## Warning: All elements of `...` must be named.
## Did you want `key = c(key)`?
## Setting the `off` event (i.e., 'plotly_doubleclick') to match the `on` event (i.e., 'plotly_hover'). You can change this default via the `highlight()` function.
library(gapminder)
# Key the data by Attrition so both panels share one highlight group.
m <- highlight_key(dataHR, ~Attrition)
# Left panel: PercentSalaryHike density, filled by Attrition.
p1 <- ggplot(m, aes(PercentSalaryHike, fill = Attrition)) + geom_density()
# Right panel: MonthlyIncome vs. PercentSalaryHike. Points are mapped with
# `colour`: the default point shape has no fill, so `fill = Attrition` left
# every point the same colour.
p2 <- ggplot(m, aes(MonthlyIncome, PercentSalaryHike, colour = Attrition)) +
  geom_point()
# Combine the panels, drop the legend, and highlight on hover.
subplot(p1, p2) %>% hide_legend() %>% highlight("plotly_hover")
## Warning: All elements of `...` must be named.
## Did you want `key = c(key)`?
## Setting the `off` event (i.e., 'plotly_doubleclick') to match the `on` event (i.e., 'plotly_hover'). You can change this default via the `highlight()` function.
# Interactive pivot: Education in rows, Attrition in columns, counting the
# unique JobLevel values per cell and rendering the result as a treemap.
# The aggregator name must match the pivottable name exactly (the original
# had a trailing space), and `cols` is spelled out instead of relying on
# partial matching of `col`.
rpivotTable(
  dataHR,
  rows = "Education",
  cols = "Attrition",
  aggregatorName = "Count Unique Values",
  vals = "JobLevel",
  rendererName = "Treemap"
)
# Fix the random seed so results that depend on random number generation are
# reproducible between runs.
set.seed(123)
# Attach ggstatsplot, used for the statistical plots that follow.
library(ggstatsplot)
## Warning: package 'ggstatsplot' was built under R version 4.0.3
## Registered S3 methods overwritten by 'lme4':
## method from
## cooks.distance.influence.merMod car
## influence.merMod car
## dfbeta.influence.merMod car
## dfbetas.influence.merMod car
## In case you would like cite this package, cite it as:
## Patil, I. (2018). ggstatsplot: "ggplot2" Based Plots with Statistical Details. CRAN.
## Retrieved from https://cran.r-project.org/web/packages/ggstatsplot/index.html
# Between-group comparison of MonthlyIncome across the Department groups.
ggstatsplot::ggbetweenstats(
  dataHR,
  x = Department, y = MonthlyIncome,
  title = "Distribution of Monthly Income across Department"
)
library(wesanderson)
## Warning: package 'wesanderson' was built under R version 4.0.3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.3
# Bar charts of Gender within each BusinessTravel category, one panel per
# Attrition value, stacked in two rows.
# The axis label and overall title now describe the HR variables actually
# plotted; the previous text ("Party affiliation", "Race, religion, and
# political affiliation") was left over from an unrelated example.
ggstatsplot::grouped_ggbarstats(
  data = dataHR,
  x = Gender,
  y = BusinessTravel,
  grouping.var = Attrition,
  title.prefix = "Attrition",
  label = "both",
  xlab = "Business travel",
  package = "wesanderson",
  palette = "Darjeeling2",
  ggtheme = ggthemes::theme_tufte(base_size = 5),
  ggstatsplot.layer = FALSE,
  annotation.args = list(title = "Gender and business travel by attrition"),
  plotgrid.args = list(nrow = 2)
)
## Warning in (function (x, y = NULL, correct = TRUE, p = rep(1/length(x), : Chi-
## squared approximation may be incorrect
# Pie charts of three variables, each split by Attrition. All three share the
# same plot title and source caption; only `x` and the legend title differ.

# Department by Attrition.
ggstatsplot::ggpiestats(
  dataHR,
  x = Department, y = Attrition,
  title = "Dataset: HR Attrition",
  legend.title = "Department",
  caption = substitute(paste(italic("Source"), ": Employee Attrition Analysis Kaggle"))
)

# JobLevel by Attrition.
ggstatsplot::ggpiestats(
  dataHR,
  x = JobLevel, y = Attrition,
  title = "Dataset: HR Attrition",
  legend.title = "Job level ",
  caption = substitute(paste(italic("Source"), ": Employee Attrition Analysis Kaggle"))
)

# BusinessTravel by Attrition.
ggstatsplot::ggpiestats(
  dataHR,
  x = BusinessTravel, y = Attrition,
  title = "Dataset: HR Attrition",
  legend.title = "BusinessTravel",
  caption = substitute(paste(italic("Source"), ": Employee Attrition Analysis Kaggle"))
)
# Between-group comparisons of MonthlyIncome, one plot per grouping variable.

# Grouped by EducationField.
ggstatsplot::ggbetweenstats(
  dataHR,
  x = EducationField, y = MonthlyIncome,
  title = "Distribution of Monthly Income across EducationField"
)

# Grouped by JobSatisfaction.
ggstatsplot::ggbetweenstats(
  dataHR,
  x = JobSatisfaction, y = MonthlyIncome,
  title = "Distribution of Monthly Income across JobSatisfaction"
)

# Grouped by MaritalStatus.
ggstatsplot::ggbetweenstats(
  dataHR,
  x = MaritalStatus, y = MonthlyIncome,
  title = "Distribution of Monthly Income across MaritalStatus"
)
# Scatterplots with statistical annotations for two pairs of numeric columns.

# Age against MonthlyRate.
ggstatsplot::ggscatterstats(
  dataHR,
  x = Age, y = MonthlyRate,
  xlab = "Age", ylab = "MonthlyRate",
  title = "Understanding Age through MonthlyRate"
)

# PercentSalaryHike against MonthlyIncome.
ggstatsplot::ggscatterstats(
  dataHR,
  x = PercentSalaryHike, y = MonthlyIncome,
  xlab = "PercentSalaryHike", ylab = "MonthlyIncome",
  title = "Understanding PercentSalaryHike through MonthlyIncome"
)