#establish general directory
getwd()
## [1] "/cloud/project"
# list files in thespecific directory
list.files("/cloud/project/hr_dept")
## [1] "HR_Employees.csv"   "HR_Performance.csv" "HR_training.csv"   
## [4] "HR_Turnover.csv"    "salary_costs.csv"
# load the 3 datasets in the specific directory

#1 employees dataset
employees <- read.csv("/cloud/project/hr_dept/HR_Employees.csv")
colnames(employees)
## [1] "EmployeeID"      "Name"            "Position"        "Work_Location"  
## [5] "Employment_Type" "Hire_Date"       "SupervisorID"
#2 performance dataset
performance <- read.csv("/cloud/project/hr_dept/HR_Performance.csv")
colnames(performance)
## [1] "EmployeeID"         "Performance_Rating" "Bonus_Awarded"     
## [4] "Review_Date"
#3 turnover dataset
turnover <- read.csv("/cloud/project/hr_dept/HR_Turnover.csv")
colnames(turnover)
## [1] "EmployeeID" "Exit_Date"  "Reason"     "NewHireID"
#4 training dataset
training <- read.csv("/cloud/project/hr_dept/HR_training.csv")
colnames(training)
## [1] "TrainingID"               "EmployeeID"              
## [3] "TrainingProgram"          "TrainingDate"            
## [5] "TrainingHours"            "TrainingCompletionStatus"
## [7] "TrainingScore"            "TrainerName"             
## [9] "DepartmentID"
#5 salary-cost dataset
costs <- read.csv("/cloud/project/hr_dept/salary_costs.csv")
colnames(costs)
## [1] "Salary_Cost_ID" "Employee_ID"    "Salary"         "Misc_Costs"    
## [5] "Cost_Date"
#Q1: Establish employee performance and productivity

# step1: load r packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# step2: aggregate performance count and group by performance rating
employee_performance <- performance %>%
  group_by(Performance_Rating)%>%
  summarise(count=n(),.groups='drop')

# step3: print results
print(employee_performance)
## # A tibble: 5 × 2
##   Performance_Rating count
##                <int> <int>
## 1                  1    20
## 2                  2    21
## 3                  3    25
## 4                  4    17
## 5                  5    17
# step4: visualize output
library(ggplot2)
ggplot(data=employee_performance,mapping=aes(x=Performance_Rating,y=count))+
  geom_col(color='black',fill='lightgreen')+
  labs(title='Performance Rating by Count')

#Q2: Is there a correlation between performance and bonus awarded?

# what is the correlation between performance and bonus awarded
correlation <- cor(performance$Performance_Rating,performance$Bonus_Awarded,use='complete.obs')
print(correlation)
## [1] -0.1114384
#Q3: How does training impact employee performance?
library(dplyr)
library(ggplot2)

#step1: join the training and employee tables
training_performance <- left_join(training,performance,by='EmployeeID')
## Warning in left_join(training, performance, by = "EmployeeID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 94 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#step2: count peformance and group
training_impact <- training_performance %>%
  group_by(Performance_Rating,TrainingCompletionStatus)%>%
  summarise(count=n(),.groups='drop')

#step3: print results
print(training_impact)
## # A tibble: 18 × 3
##    Performance_Rating TrainingCompletionStatus count
##                 <int> <chr>                    <int>
##  1                  1 Completed                    3
##  2                  1 Failed                       6
##  3                  1 In Progress                  4
##  4                  2 Completed                    4
##  5                  2 Failed                      11
##  6                  2 In Progress                  7
##  7                  3 Completed                    3
##  8                  3 Failed                       7
##  9                  3 In Progress                  2
## 10                  4 Completed                    5
## 11                  4 Failed                       6
## 12                  4 In Progress                  8
## 13                  5 Completed                    6
## 14                  5 Failed                       2
## 15                  5 In Progress                 10
## 16                 NA Completed                   11
## 17                 NA Failed                      16
## 18                 NA In Progress                 14
#step4: visualize results
ggplot(data=training_impact,mapping=aes(x=Performance_Rating,y=count,fill=TrainingCompletionStatus))+
  geom_col(color='black',position='dodge') +
  labs(title='How training impacts employee performance')
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_col()`).

# Q4: What are the most common reasons employees leave?

#step1: aggregate using count and group by reason employees left
top_churn <- turnover %>%
  group_by(Reason)%>%
  summarise(count=n(),groups='drop')%>%
  arrange(desc(count))

#step2: print results
print(top_churn)
## # A tibble: 3 × 3
##   Reason     count groups
##   <chr>      <int> <chr> 
## 1 Resigned      38 drop  
## 2 Retired       33 drop  
## 3 Terminated    29 drop
#step3: visualize results
ggplot(data=top_churn,mapping=aes(x=Reason,y=count))+
  geom_col(color='black',fill='orange')+
labs(title='Reasons employees leave')

# Q5: Is turnover higher in specific job position?

#step1: join turnover and employees table
turnover_employees <- left_join(turnover,employees,by='EmployeeID')

#step2: aggregate turnover by job role
role_turnover <- turnover_employees %>%
  group_by(Position,Reason)%>%
  summarise(count=n(),.groups='drop')%>%
  arrange(desc(count),Position)

#step3: print results
print(role_turnover)
## # A tibble: 12 × 3
##    Position      Reason     count
##    <chr>         <chr>      <int>
##  1 Manager       Resigned      13
##  2 Manager       Retired       13
##  3 Manager       Terminated    13
##  4 Analyst       Resigned      11
##  5 HR Specialist Retired        8
##  6 Analyst       Terminated     7
##  7 Developer     Resigned       7
##  8 Developer     Retired        7
##  9 HR Specialist Resigned       7
## 10 Analyst       Retired        5
## 11 HR Specialist Terminated     5
## 12 Developer     Terminated     4
#step4: visualize results
ggplot(data=role_turnover,mapping=aes(x=Position,y=count,fill=Reason))+
  geom_col(color='black', position='dodge')+
  labs(title='Turnover by job position')

#Q6 How does employment type impact churn rate?

#step1: join the employees table and the turnover table
employees_turnover <- left_join(employees,turnover,by='EmployeeID')

#step2: calculate retention rate
type_impact <- employees_turnover %>%
  group_by (Employment_Type)%>%
  summarise(
    churn_count=sum(!is.na(Reason)),
    total_employees=n(),
    churn_rate=churn_count/total_employees
  ,.groups='drop'
  )

#step3: print results
print(type_impact)
## # A tibble: 3 × 4
##   Employment_Type churn_count total_employees churn_rate
##   <chr>                 <int>           <int>      <dbl>
## 1 Contract                 47              57      0.825
## 2 Full-time                25              41      0.610
## 3 Part-time                28              44      0.636
#step4: visualize results
ggplot(data=type_impact,mapping=aes(x=Employment_Type,y=churn_rate,fill=Employment_Type))+
  geom_col(color='black')+labs(title='How employment impacts churn rate')

#Q7 How does position impact retention rate?

#step1: join the employees table and the turnover table
employees_turnover <- left_join(employees,turnover,by='EmployeeID')

#step2: calculate retention rate
position_impact <- employees_turnover %>%
  group_by (Position)%>%
  summarise(
    retention_count=sum(is.na(Reason)),
    total_employees=n(),
    retention_rate=retention_count/total_employees
    ,.groups='drop'
  )

#step3: print results
print(position_impact)
## # A tibble: 4 × 4
##   Position      retention_count total_employees retention_rate
##   <chr>                   <int>           <int>          <dbl>
## 1 Analyst                    12              35          0.343
## 2 Developer                  11              29          0.379
## 3 HR Specialist               8              28          0.286
## 4 Manager                    11              50          0.22
#step4: visualize results
ggplot(data=position_impact,mapping=aes(x=Position,y=retention_rate,fill=Position))+
  geom_col(color='black')+labs(title='How Position impacts retention rate')

#Q8: what is the training completion status by training program?

#step1: aggregate completion status and group by training program
completion_status <- training %>%
  group_by (TrainingProgram)%>%
  summarise (count=n(),groups='drop')%>%
  arrange(desc(count))

#step2: print results
print(completion_status)
## # A tibble: 10 × 3
##    TrainingProgram             count groups
##    <chr>                       <int> <chr> 
##  1 Financial Modeling Bootcamp    13 drop  
##  2 Leadership Development         12 drop  
##  3 Technical Skills Workshop      12 drop  
##  4 Marketing Strategies           11 drop  
##  5 Customer Service Training      10 drop  
##  6 Project Management Basics      10 drop  
##  7 Cybersecurity Awareness         9 drop  
##  8 Sales Excellence Program        9 drop  
##  9 Data Analytics Essentials       8 drop  
## 10 HR Compliance Training          6 drop
#step3: visualize results
ggplot(data=completion_status,mapping=aes(x=TrainingProgram,y=count))+
  geom_col(color='black',fill='lightblue')+
  labs(title='Training Completion Status by Training Program')+
  theme(axis.text.x = element_text(angle=45,hjust=1))

#Q9: Are there positions with excessive overtime costs?

#step1: Join the employees and costs table
employees_costs <- left_join(employees,costs,by=c("EmployeeID" ="Employee_ID"))

#step2: sum costs and group by position
position_costs <- employees_costs%>%
  group_by (Position)%>%
  summarise(total_costs=sum(Misc_Costs,na.rm=TRUE),.groups='drop')%>%
  arrange(desc(total_costs))

#step3: print results
print(position_costs)
## # A tibble: 4 × 2
##   Position      total_costs
##   <chr>               <dbl>
## 1 Analyst            78535.
## 2 Manager            69085.
## 3 HR Specialist      62902.
## 4 Developer          48720.
#visualize results
ggplot(data=position_costs, mapping=aes(x=Position,y=total_costs))+
  geom_col(color='black',fill='lightgreen')+
  labs(title='Costs by Position')

#Q10: What is the relationship between salary and performance rating by location

#step1: join the costs and performance table
costs_performance <- left_join (costs,performance,by=c("Employee_ID"="EmployeeID"))
## Warning in left_join(costs, performance, by = c(Employee_ID = "EmployeeID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 11 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#step2: join the costs_performance table above to employees table
costs_performance_employees <- left_join(costs_performance,employees,by=c("Employee_ID"="EmployeeID"))

#step3: aggregate salary and group by location and performance rating
salary_performance <- costs_performance_employees %>%
  group_by (Performance_Rating,Work_Location)%>%
  summarise (avg_salary = mean(Salary,na.rm=TRUE),.groups='drop')

#step4: print results
print(salary_performance)
## # A tibble: 18 × 3
##    Performance_Rating Work_Location avg_salary
##                 <int> <chr>              <dbl>
##  1                  1 Chicago           86004 
##  2                  1 Los Angeles       82909.
##  3                  1 New York          84341.
##  4                  2 Chicago           83129.
##  5                  2 Los Angeles       54848.
##  6                  2 New York          87564.
##  7                  3 Chicago           91364.
##  8                  3 Los Angeles       57073 
##  9                  3 New York          69315.
## 10                  4 Chicago           81881.
## 11                  4 Los Angeles       87901.
## 12                  4 New York          63588.
## 13                  5 Chicago           66598.
## 14                  5 Los Angeles       69678 
## 15                  5 New York          64120.
## 16                 NA Chicago           68531.
## 17                 NA Los Angeles       80199.
## 18                 NA New York          63708.
#step5: visualize results
ggplot(data=salary_performance,mapping=aes(x=Performance_Rating,y=avg_salary,fill=Work_Location))+
  geom_col(color='black',position='stack')+
  labs(title='Relationship between Salary and Performance Rating by Location')
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_col()`).

#Q11:Does an employee’s salary level influence the likelihood of turnover type 

#load nnet package that contains multinom function
library(nnet)

#step1:join the costs and turnover tables
costs_turnover <- left_join(costs,turnover,by=c("Employee_ID"="EmployeeID"))
## Warning in left_join(costs, turnover, by = c(Employee_ID = "EmployeeID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8 of `x` matches multiple rows in `y`.
## ℹ Row 31 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#step2:establish the logistic regression rate
multi_logistic_model <- multinom(Reason~Salary,data=costs_turnover)
## # weights:  9 (4 variable)
## initial  value 98.875106 
## final  value 95.499055 
## converged
#step2: print results
print (multi_logistic_model)
## Call:
## multinom(formula = Reason ~ Salary, data = costs_turnover)
## 
## Coefficients:
##            (Intercept)        Salary
## Retired     -0.6644148  5.636636e-06
## Terminated   0.1055740 -9.382469e-06
## 
## Residual Deviance: 190.9981 
## AIC: 198.9981
#Is there a relationship between employee position and salary?

#step1: calculate logistic regression
logistic_model <-multinom(Position~Salary,data=employees_costs)
## # weights:  12 (6 variable)
## initial  value 138.629436 
## final  value 136.198836 
## converged
#step2: print results
print(logistic_model)
## Call:
## multinom(formula = Position ~ Salary, data = employees_costs)
## 
## Coefficients:
##               (Intercept)        Salary
## Developer      -0.1914699 -3.741696e-06
## HR Specialist  -0.1121410 -9.964701e-07
## Manager         0.7522235 -1.020568e-05
## 
## Residual Deviance: 272.3977 
## AIC: 284.3977
#Do employees who received training have significantly higher salaries 
#than those who didn’t?

#step1:join the training and costs table
training_costs <- left_join(training,costs,by=c("EmployeeID"="Employee_ID"))
## Warning in left_join(training, costs, by = c(EmployeeID = "Employee_ID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 7 of `x` matches multiple rows in `y`.
## ℹ Row 80 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
#step2:establish means for the multiple groups
anova_result <- aov(Salary~TrainingCompletionStatus,data=training_costs)

#step3: print results
print(anova_result)
## Call:
##    aov(formula = Salary ~ TrainingCompletionStatus, data = training_costs)
## 
## Terms:
##                 TrainingCompletionStatus   Residuals
## Sum of Squares                 219121974 83767167464
## Deg. of Freedom                        2         106
## 
## Residual standard error: 28111.5
## Estimated effects may be unbalanced
## 32 observations deleted due to missingness
#What is the cost of each employment type as a percentage of the total salary costs

#step1: join the cost and employee tables
costs_employees <- left_join(costs,employees,by=c("Employee_ID"="EmployeeID"))

#step2: calculate the rate
salaries_costs <- costs_employees %>%
  group_by (Employment_Type)%>%
  summarise(
    total_costs = sum(Misc_Costs),
    total_salary = sum(Salary),
    cost_rate = total_costs/total_salary*100
  ,.groups='drop')

#step3: print results
print(salaries_costs)
## # A tibble: 3 × 4
##   Employment_Type total_costs total_salary cost_rate
##   <chr>                 <dbl>        <int>     <dbl>
## 1 Contract             91272.      2716520      3.36
## 2 Full-time            76995.      2136316      3.60
## 3 Part-time            90974.      2615555      3.48
#step4: visualize results
ggplot(data=salaries_costs,mapping=aes(x=cost_rate,y=Employment_Type,fill=Employment_Type))+
  geom_bar(stat='identity',color='black')+
  labs(title='Costs as a percentage of Salaries by Position')

#Q15:Is there a relationship between the position and the reason for turnover? 

# Step 1: Ensure both variables are factors
employees_turnover$Position <- as.factor(employees_turnover$Position)
employees_turnover$Reason <- as.factor(employees_turnover$Reason)

# Step 2: Establish relationship using Chi-square test
chi_result <- chisq.test(employees_turnover$Position, employees_turnover$Reason)

# Step 3: Print results
print(chi_result)
## 
##  Pearson's Chi-squared test
## 
## data:  employees_turnover$Position and employees_turnover$Reason
## X-squared = 2.8946, df = 6, p-value = 0.822
#Q16:Are there significant salary outliers by employee positions? 

# Identify salary outliers within each department
salary_outliers <- employees_costs %>%
  group_by(Position) %>% 
  mutate(
    q1_salary = quantile(Salary, 0.25, na.rm = TRUE),
    q3_salary = quantile(Salary, 0.75, na.rm = TRUE),
    salary_iqr = q3_salary - q1_salary,
    lower_bounds_salary = q1_salary - 1.5 * salary_iqr,
    upper_bounds_salary = q3_salary + 1.5 * salary_iqr,
    is_outlier = Salary < lower_bounds_salary | Salary > upper_bounds_salary
  ) %>%
  filter(is_outlier)  # Keep only rows where is_outlier is TRUE

# Print results
print(salary_outliers)
## # A tibble: 0 × 17
## # Groups:   Position [0]
## # ℹ 17 variables: EmployeeID <int>, Name <chr>, Position <chr>,
## #   Work_Location <chr>, Employment_Type <chr>, Hire_Date <chr>,
## #   SupervisorID <int>, Salary_Cost_ID <int>, Salary <int>, Misc_Costs <dbl>,
## #   Cost_Date <chr>, q1_salary <dbl>, q3_salary <dbl>, salary_iqr <dbl>,
## #   lower_bounds_salary <dbl>, upper_bounds_salary <dbl>, is_outlier <lgl>