#install.packages("tidyr")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("readr")
# install.packages("lubridate")
# install.packages("Plotly")
# install.packages("hrbrthemes")
# install.packages("viridis")
# install.packages("ggridges")
# install.packages("highcharter")
# install.packages("rio")
# install.packages("purrr")

library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)
## Loading required package: viridisLite
library(ggridges)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(rio)
## 
## Attaching package: 'rio'
## The following object is masked from 'package:plotly':
## 
##     export
library(purrr)
library(stringr)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## The following object is masked from 'package:readr':
## 
##     col_factor
library(stringr)
df <- read_csv("HRdataset_v14.csv")
## Rows: 311 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (19): Employee_Name, Position, State, Zip, DOB, Sex, MaritalDesc, Citize...
## dbl (17): EmpID, MarriedID, MaritalStatusID, GenderID, EmpStatusID, DeptID, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
## Rows: 311
## Columns: 36
## $ Employee_Name              <chr> "Adinolfi, Wilson  K", "Ait Sidi, Karthikey…
## $ EmpID                      <dbl> 10026, 10084, 10196, 10088, 10069, 10002, 1…
## $ MarriedID                  <dbl> 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ MaritalStatusID            <dbl> 0, 1, 1, 1, 2, 0, 0, 4, 0, 2, 1, 1, 2, 0, 2…
## $ GenderID                   <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ EmpStatusID                <dbl> 1, 5, 5, 1, 5, 1, 1, 1, 3, 1, 5, 5, 1, 1, 5…
## $ DeptID                     <dbl> 5, 3, 5, 5, 5, 5, 4, 5, 5, 3, 5, 5, 3, 5, 5…
## $ PerfScoreID                <dbl> 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3…
## $ FromDiversityJobFairID     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0…
## $ Salary                     <dbl> 62506, 104437, 64955, 64991, 50825, 57568, …
## $ Termd                      <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1…
## $ PositionID                 <dbl> 19, 27, 20, 19, 19, 19, 24, 19, 19, 14, 19,…
## $ Position                   <chr> "Production Technician I", "Sr. DBA", "Prod…
## $ State                      <chr> "MA", "MA", "MA", "MA", "MA", "MA", "MA", "…
## $ Zip                        <chr> "01960", "02148", "01810", "01886", "02169"…
## $ DOB                        <chr> "07/10/83", "05/05/75", "09/19/88", "09/27/…
## $ Sex                        <chr> "M", "M", "F", "F", "F", "F", "F", "M", "F"…
## $ MaritalDesc                <chr> "Single", "Married", "Married", "Married", …
## $ CitizenDesc                <chr> "US Citizen", "US Citizen", "US Citizen", "…
## $ HispanicLatino             <chr> "No", "No", "No", "No", "No", "No", "No", "…
## $ RaceDesc                   <chr> "White", "White", "White", "White", "White"…
## $ DateofHire                 <chr> "7/5/2011", "3/30/2015", "7/5/2011", "1/7/2…
## $ DateofTermination          <chr> NA, "6/16/2016", "9/24/2012", NA, "9/6/2016…
## $ TermReason                 <chr> "N/A-StillEmployed", "career change", "hour…
## $ EmploymentStatus           <chr> "Active", "Voluntarily Terminated", "Volunt…
## $ Department                 <chr> "Production", "IT/IS", "Production", "Produ…
## $ ManagerName                <chr> "Michael Albert", "Simon Roup", "Kissy Sull…
## $ ManagerID                  <dbl> 22, 4, 20, 16, 39, 11, 10, 19, 12, 7, 14, 2…
## $ RecruitmentSource          <chr> "LinkedIn", "Indeed", "LinkedIn", "Indeed",…
## $ PerformanceScore           <chr> "Exceeds", "Fully Meets", "Fully Meets", "F…
## $ EngagementSurvey           <dbl> 4.60, 4.96, 3.02, 4.84, 5.00, 5.00, 3.04, 5…
## $ EmpSatisfaction            <dbl> 5, 3, 3, 5, 4, 5, 3, 4, 3, 5, 4, 3, 4, 4, 5…
## $ SpecialProjectsCount       <dbl> 0, 6, 0, 0, 0, 0, 4, 0, 0, 6, 0, 0, 5, 0, 0…
## $ LastPerformanceReview_Date <chr> "1/17/2019", "2/24/2016", "5/15/2012", "1/3…
## $ DaysLateLast30             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Absences                   <dbl> 1, 17, 3, 15, 2, 15, 19, 19, 4, 16, 12, 15,…
class(df$CitizenDesc)
## [1] "character"
unique(df$CitizenDesc)
## [1] "US Citizen"          "Eligible NonCitizen" "Non-Citizen"

CLEANSING

To Factor

  1. RecruitmentSource : 9 levels
  2. PerformanceScore : 3 levels
  3. EmpSatisfaction : 3 levels
  4. Department : 5 levels
  5. EmploymentStatus : 2 levels
  6. TermReason : 18 levels
  7. sex : 2 levels
  8. CitizenDesc : 2 levels

To Date

  1. DOB
  2. DateofHire
  3. DateofTermination
  4. LastPerformanceReview_Date

To Logical

  1. Married Id
  2. FromDiversityJobFairID

#drop columns 1. ZIP 2. HispanicLatino 3. RaceDesc

df_clean <- df %>% 
  mutate_at(.vars = c("RecruitmentSource", "PerformanceScore", "EmpSatisfaction", "Department", "EmploymentStatus", "TermReason", "Sex","CitizenDesc" ), 
            .funs = as.factor) %>% 
  mutate_at(.vars = c("MarriedID", "FromDiversityJobFairID"), 
            .funs = as.logical)

df_clean$DateofHire <- mdy(df$DateofHire) 
df_clean$DOB <- mdy(df$DOB) 
df_clean$DateofTermination <- mdy(df$DateofTermination)
df_clean$LastPerformanceReview_Date <- mdy(df$LastPerformanceReview_Date)

df_clean <- select(df_clean, -Zip, -HispanicLatino, -RaceDesc)
glimpse(df_clean)
## Rows: 311
## Columns: 33
## $ Employee_Name              <chr> "Adinolfi, Wilson  K", "Ait Sidi, Karthikey…
## $ EmpID                      <dbl> 10026, 10084, 10196, 10088, 10069, 10002, 1…
## $ MarriedID                  <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALS…
## $ MaritalStatusID            <dbl> 0, 1, 1, 1, 2, 0, 0, 4, 0, 2, 1, 1, 2, 0, 2…
## $ GenderID                   <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1…
## $ EmpStatusID                <dbl> 1, 5, 5, 1, 5, 1, 1, 1, 3, 1, 5, 5, 1, 1, 5…
## $ DeptID                     <dbl> 5, 3, 5, 5, 5, 5, 4, 5, 5, 3, 5, 5, 3, 5, 5…
## $ PerfScoreID                <dbl> 4, 3, 3, 3, 3, 4, 3, 3, 3, 3, 3, 3, 4, 3, 3…
## $ FromDiversityJobFairID     <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, F…
## $ Salary                     <dbl> 62506, 104437, 64955, 64991, 50825, 57568, …
## $ Termd                      <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1…
## $ PositionID                 <dbl> 19, 27, 20, 19, 19, 19, 24, 19, 19, 14, 19,…
## $ Position                   <chr> "Production Technician I", "Sr. DBA", "Prod…
## $ State                      <chr> "MA", "MA", "MA", "MA", "MA", "MA", "MA", "…
## $ DOB                        <date> 1983-07-10, 1975-05-05, 1988-09-19, 1988-0…
## $ Sex                        <fct> M, M, F, F, F, F, F, M, F, M, F, M, M, M, M…
## $ MaritalDesc                <chr> "Single", "Married", "Married", "Married", …
## $ CitizenDesc                <fct> US Citizen, US Citizen, US Citizen, US Citi…
## $ DateofHire                 <date> 2011-07-05, 2015-03-30, 2011-07-05, 2008-0…
## $ DateofTermination          <date> NA, 2016-06-16, 2012-09-24, NA, 2016-09-06…
## $ TermReason                 <fct> "N/A-StillEmployed", "career change", "hour…
## $ EmploymentStatus           <fct> Active, Voluntarily Terminated, Voluntarily…
## $ Department                 <fct> Production, IT/IS, Production, Production, …
## $ ManagerName                <chr> "Michael Albert", "Simon Roup", "Kissy Sull…
## $ ManagerID                  <dbl> 22, 4, 20, 16, 39, 11, 10, 19, 12, 7, 14, 2…
## $ RecruitmentSource          <fct> LinkedIn, Indeed, LinkedIn, Indeed, Google …
## $ PerformanceScore           <fct> Exceeds, Fully Meets, Fully Meets, Fully Me…
## $ EngagementSurvey           <dbl> 4.60, 4.96, 3.02, 4.84, 5.00, 5.00, 3.04, 5…
## $ EmpSatisfaction            <fct> 5, 3, 3, 5, 4, 5, 3, 4, 3, 5, 4, 3, 4, 4, 5…
## $ SpecialProjectsCount       <dbl> 0, 6, 0, 0, 0, 0, 4, 0, 0, 6, 0, 0, 5, 0, 0…
## $ LastPerformanceReview_Date <date> 2019-01-17, 2016-02-24, 2012-05-15, 2019-0…
## $ DaysLateLast30             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Absences                   <dbl> 1, 17, 3, 15, 2, 15, 19, 19, 4, 16, 12, 15,…

ANALYSING

df_clean
## # A tibble: 311 × 33
##    Employe…¹ EmpID Marri…² Marit…³ Gende…⁴ EmpSt…⁵ DeptID PerfS…⁶ FromD…⁷ Salary
##    <chr>     <dbl> <lgl>     <dbl>   <dbl>   <dbl>  <dbl>   <dbl> <lgl>    <dbl>
##  1 Adinolfi… 10026 FALSE         0       1       1      5       4 FALSE    62506
##  2 Ait Sidi… 10084 TRUE          1       1       5      3       3 FALSE   104437
##  3 Akinkuol… 10196 TRUE          1       0       5      5       3 FALSE    64955
##  4 Alagbe,T… 10088 TRUE          1       0       1      5       3 FALSE    64991
##  5 Anderson… 10069 FALSE         2       0       5      5       3 FALSE    50825
##  6 Anderson… 10002 FALSE         0       0       1      5       4 FALSE    57568
##  7 Andreola… 10194 FALSE         0       0       1      4       3 FALSE    95660
##  8 Athwal, … 10062 FALSE         4       1       1      5       3 FALSE    59365
##  9 Bachioch… 10114 FALSE         0       0       3      5       3 TRUE     47837
## 10 Bacong, … 10250 FALSE         2       1       1      3       3 FALSE    50178
## # … with 301 more rows, 23 more variables: Termd <dbl>, PositionID <dbl>,
## #   Position <chr>, State <chr>, DOB <date>, Sex <fct>, MaritalDesc <chr>,
## #   CitizenDesc <fct>, DateofHire <date>, DateofTermination <date>,
## #   TermReason <fct>, EmploymentStatus <fct>, Department <fct>,
## #   ManagerName <chr>, ManagerID <dbl>, RecruitmentSource <fct>,
## #   PerformanceScore <fct>, EngagementSurvey <dbl>, EmpSatisfaction <fct>,
## #   SpecialProjectsCount <dbl>, LastPerformanceReview_Date <date>, …
plot1 <- 
  ggplot(data= df_clean, mapping = aes(x = Department, y = EngagementSurvey)) +
  geom_boxplot(mapping = aes(fill = Department))

ggplotly(plot1)
plot5 <- 
  ggplot(data = df_clean, mapping = aes(x = df_clean$EngagementSurvey, y = Salary))+
  geom_line()

ggplotly(plot5)
## Warning: Use of `df_clean$EngagementSurvey` is discouraged. Use
## `EngagementSurvey` instead.
hc <- hchart(
  density(df_clean$Salary), type = "area", 
  color = "steelblue", name = "Male"
  )

hc
p2 <- ggplot(data=df_clean, aes(x=Salary, group=EmpSatisfaction, fill=EmpSatisfaction)) +
    geom_density(adjust=1.5, alpha=.4) +
    theme_ipsum()

ggplotly(p2)
hchart(df_clean, 
         type = "column", 
         hcaes(x = PerformanceScore, 
               y = Salary, 
               group = Department))
hchart(df_clean,
       type = "column", 
       hcaes(x = Department, 
             y = Salary,
             group = EmpSatisfaction))%>% 
    hc_tooltip(
      useHTML = TRUE
    ) %>% hc_exporting(enabled = TRUE) 
  # %>% 
  #   hc_tooltip(crosshairs = TRUE,
  #              shared = TRUE, borderWidth = 2)
hc <- hchart(df_clean, "packedbubble", hcaes(name = Position,value = EngagementSurvey,group = RecruitmentSource))

q95 <- as.numeric(quantile(df_clean$EngagementSurvey, .95))

hc %>% 
  hc_tooltip(

        useHTML = TRUE,
    pointFormat = "<b>{point.name}:</b> {point.value}"
  ) %>% 
  hc_plotOptions(
    packedbubble = list(
      maxSize = "50%",
      zMin = 0,
      layoutAlgorithm = list(
        gravitationalConstant =  0.05,
        splitSeries =  TRUE, # TRUE to group points
        seriesInteraction = TRUE,
        dragBetweenSeries = TRUE,
        parentNodeLimit = TRUE
      ),
      dataLabels = list(
        enabled = TRUE,
        format = "{point.name}",
        filter = list(
          property = "y",
          operator = ">",
          value = q95
        ),
        style = list(
          color = "black",
          textOutline = "none",
          fontWeight = "normal"
        )
      )
    )
  )
options(dplyr.summarise.inform = FALSE)

df_baru <- 
  select(df_clean, Department, Position, Salary) %>% 
  arrange(desc(Salary)) 
  
  df_baru_2 <- df_baru %>% 
  group_by(Department) %>% 
  summarise(dailyAvg = mean(Salary))

drilldown <- df_baru %>%
  # group_by(Department, Position, .add = T) %>% 
  group_nest(Department) %>%
  mutate(
    id = Department,
    type = "column",
    data = map(data, ~ .x %>%
      mutate(
        name = Position,
        y    = Salary
      ) %>%
      list_parse())
  )

mygraph <- hchart(
  df_baru_2,
  "column",
  hcaes(x = Department, y = dailyAvg, name = Department, drilldown = Department),
  name = "Salary",
  colorByPoint = TRUE
)

mygraph <- mygraph %>% 
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series = list_parse(drilldown)
  ) 

mygraph <- mygraph %>%
  hc_tooltip(
    useHTML = TRUE 
  ) %>% 
  hc_yAxis( title = "") %>% 
  hc_xAxis( title = "" ) %>%
  hc_title(text = "Salary" )

mygraph
unique(df_clean$Department)
## [1] Production           IT/IS                Software Engineering
## [4] Admin Offices        Sales                Executive Office    
## 6 Levels: Admin Offices Executive Office IT/IS Production ... Software Engineering
hchart(df_clean, "scatter", hcaes(x= EngagementSurvey,y= SpecialProjectsCount, name = Position, group = Department)) %>% 
    hc_chart(type = "scatter") %>%
    hc_tooltip(
      headerFormat = "",
      pointFormat = "<b>{point.Position}</b> is in <b>{point.Department}</b>"
    ) 
unique(df_clean$ManagerName)
##  [1] "Michael Albert"     "Simon Roup"         "Kissy Sullivan"    
##  [4] "Elijiah Gray"       "Webster Butler"     "Amy Dunn"          
##  [7] "Alex Sweetwater"    "Ketsia Liebig"      "Brannon Miller"    
## [10] "Peter Monroe"       "David Stanley"      "Kelley Spirea"     
## [13] "Brandon R. LeBlanc" "Janet King"         "John Smith"        
## [16] "Jennifer Zamora"    "Lynn Daneault"      "Eric Dougall"      
## [19] "Debra Houlihan"     "Brian Champaigne"   "Board of Directors"
hchart(df_clean, "organization", hcaes(from = ManagerName, to = Employee_Name)) %>% 
  hc_chart(inverted = TRUE)
hchart(df_clean, "lollipop", hcaes(name = Employee_Name, low = Absences ),name = "Absences") %>% 
  hc_xAxis(type = "category") %>% 
  hc_yAxis(labels = list(format = "{value}"))
df_clean_2 <- df_clean %>% mutate_at(.vars = c("EmpSatisfaction"), .funs = as.numeric) 
hchart(df_clean_2, "lollipop", hcaes(name = Employee_Name, low = EmpSatisfaction),name = "EmpSatisfaction") %>% 
      hc_xAxis(type = "category") %>% 
      hc_yAxis(labels = list(format = "{value}")) %>% 
      hc_title(text="Employee Satisfaction under {var}",align="center")
mean(df$Absences)
## [1] 10.23794
# https://rafif.shinyapps.io/HumanResource/