#establish general directory
getwd()
## [1] "/cloud/project"
# list files in thespecific directory
list.files("/cloud/project/hr_dept")
## [1] "HR_Employees.csv" "HR_Performance.csv" "HR_training.csv"
## [4] "HR_Turnover.csv" "salary_costs.csv"
# load the 3 datasets in the specific directory
#1 employees dataset
employees <- read.csv("/cloud/project/hr_dept/HR_Employees.csv")
colnames(employees)
## [1] "EmployeeID" "Name" "Position" "Work_Location"
## [5] "Employment_Type" "Hire_Date" "SupervisorID"
#2 performance dataset
performance <- read.csv("/cloud/project/hr_dept/HR_Performance.csv")
colnames(performance)
## [1] "EmployeeID" "Performance_Rating" "Bonus_Awarded"
## [4] "Review_Date"
#3 turnover dataset
turnover <- read.csv("/cloud/project/hr_dept/HR_Turnover.csv")
colnames(turnover)
## [1] "EmployeeID" "Exit_Date" "Reason" "NewHireID"
#4 training dataset
training <- read.csv("/cloud/project/hr_dept/HR_training.csv")
colnames(training)
## [1] "TrainingID" "EmployeeID"
## [3] "TrainingProgram" "TrainingDate"
## [5] "TrainingHours" "TrainingCompletionStatus"
## [7] "TrainingScore" "TrainerName"
## [9] "DepartmentID"
#5 salary-cost dataset
costs <- read.csv("/cloud/project/hr_dept/salary_costs.csv")
colnames(costs)
## [1] "Salary_Cost_ID" "Employee_ID" "Salary" "Misc_Costs"
## [5] "Cost_Date"
#Q1: Establish employee performance and productivity
# step1: load r packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# step2: aggregate performance count and group by performance rating
employee_performance <- performance %>%
group_by(Performance_Rating)%>%
summarise(count=n(),.groups='drop')
# step3: print results
print(employee_performance)
## # A tibble: 5 × 2
## Performance_Rating count
## <int> <int>
## 1 1 20
## 2 2 21
## 3 3 25
## 4 4 17
## 5 5 17
# step4: visualize output
library(ggplot2)
ggplot(data=employee_performance,mapping=aes(x=Performance_Rating,y=count))+
geom_col(color='black',fill='lightgreen')+
labs(title='Performance Rating by Count')

#Q2: Is there a correlation between performance and bonus awarded?
# what is the correlation between performance and bonus awarded
correlation <- cor(performance$Performance_Rating,performance$Bonus_Awarded,use='complete.obs')
print(correlation)
## [1] -0.1114384
#Q3: How does training impact employee performance?
library(dplyr)
library(ggplot2)
#step1: join the training and employee tables
training_performance <- left_join(training,performance,by='EmployeeID')
## Warning in left_join(training, performance, by = "EmployeeID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 94 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#step2: count peformance and group
training_impact <- training_performance %>%
group_by(Performance_Rating,TrainingCompletionStatus)%>%
summarise(count=n(),.groups='drop')
#step3: print results
print(training_impact)
## # A tibble: 18 × 3
## Performance_Rating TrainingCompletionStatus count
## <int> <chr> <int>
## 1 1 Completed 3
## 2 1 Failed 6
## 3 1 In Progress 4
## 4 2 Completed 4
## 5 2 Failed 11
## 6 2 In Progress 7
## 7 3 Completed 3
## 8 3 Failed 7
## 9 3 In Progress 2
## 10 4 Completed 5
## 11 4 Failed 6
## 12 4 In Progress 8
## 13 5 Completed 6
## 14 5 Failed 2
## 15 5 In Progress 10
## 16 NA Completed 11
## 17 NA Failed 16
## 18 NA In Progress 14
#step4: visualize results
ggplot(data=training_impact,mapping=aes(x=Performance_Rating,y=count,fill=TrainingCompletionStatus))+
geom_col(color='black',position='dodge') +
labs(title='How training impacts employee performance')
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_col()`).

# Q4: What are the most common reasons employees leave?
#step1: aggregate using count and group by reason employees left
top_churn <- turnover %>%
group_by(Reason)%>%
summarise(count=n(),groups='drop')%>%
arrange(desc(count))
#step2: print results
print(top_churn)
## # A tibble: 3 × 3
## Reason count groups
## <chr> <int> <chr>
## 1 Resigned 38 drop
## 2 Retired 33 drop
## 3 Terminated 29 drop
#step3: visualize results
ggplot(data=top_churn,mapping=aes(x=Reason,y=count))+
geom_col(color='black',fill='orange')+
labs(title='Reasons employees leave')

# Q5: Is turnover higher in specific job position?
#step1: join turnover and employees table
turnover_employees <- left_join(turnover,employees,by='EmployeeID')
#step2: aggregate turnover by job role
role_turnover <- turnover_employees %>%
group_by(Position,Reason)%>%
summarise(count=n(),.groups='drop')%>%
arrange(desc(count),Position)
#step3: print results
print(role_turnover)
## # A tibble: 12 × 3
## Position Reason count
## <chr> <chr> <int>
## 1 Manager Resigned 13
## 2 Manager Retired 13
## 3 Manager Terminated 13
## 4 Analyst Resigned 11
## 5 HR Specialist Retired 8
## 6 Analyst Terminated 7
## 7 Developer Resigned 7
## 8 Developer Retired 7
## 9 HR Specialist Resigned 7
## 10 Analyst Retired 5
## 11 HR Specialist Terminated 5
## 12 Developer Terminated 4
#step4: visualize results
ggplot(data=role_turnover,mapping=aes(x=Position,y=count,fill=Reason))+
geom_col(color='black', position='dodge')+
labs(title='Turnover by job position')

#Q6 How does employment type impact churn rate?
#step1: join the employees table and the turnover table
employees_turnover <- left_join(employees,turnover,by='EmployeeID')
#step2: calculate retention rate
type_impact <- employees_turnover %>%
group_by (Employment_Type)%>%
summarise(
churn_count=sum(!is.na(Reason)),
total_employees=n(),
churn_rate=churn_count/total_employees
,.groups='drop'
)
#step3: print results
print(type_impact)
## # A tibble: 3 × 4
## Employment_Type churn_count total_employees churn_rate
## <chr> <int> <int> <dbl>
## 1 Contract 47 57 0.825
## 2 Full-time 25 41 0.610
## 3 Part-time 28 44 0.636
#step4: visualize results
ggplot(data=type_impact,mapping=aes(x=Employment_Type,y=churn_rate,fill=Employment_Type))+
geom_col(color='black')+labs(title='How employment impacts churn rate')

#Q7 How does position impact retention rate?
#step1: join the employees table and the turnover table
employees_turnover <- left_join(employees,turnover,by='EmployeeID')
#step2: calculate retention rate
position_impact <- employees_turnover %>%
group_by (Position)%>%
summarise(
retention_count=sum(is.na(Reason)),
total_employees=n(),
retention_rate=retention_count/total_employees
,.groups='drop'
)
#step3: print results
print(position_impact)
## # A tibble: 4 × 4
## Position retention_count total_employees retention_rate
## <chr> <int> <int> <dbl>
## 1 Analyst 12 35 0.343
## 2 Developer 11 29 0.379
## 3 HR Specialist 8 28 0.286
## 4 Manager 11 50 0.22
#step4: visualize results
ggplot(data=position_impact,mapping=aes(x=Position,y=retention_rate,fill=Position))+
geom_col(color='black')+labs(title='How Position impacts retention rate')

#Q8: what is the training completion status by training program?
#step1: aggregate completion status and group by training program
completion_status <- training %>%
group_by (TrainingProgram)%>%
summarise (count=n(),groups='drop')%>%
arrange(desc(count))
#step2: print results
print(completion_status)
## # A tibble: 10 × 3
## TrainingProgram count groups
## <chr> <int> <chr>
## 1 Financial Modeling Bootcamp 13 drop
## 2 Leadership Development 12 drop
## 3 Technical Skills Workshop 12 drop
## 4 Marketing Strategies 11 drop
## 5 Customer Service Training 10 drop
## 6 Project Management Basics 10 drop
## 7 Cybersecurity Awareness 9 drop
## 8 Sales Excellence Program 9 drop
## 9 Data Analytics Essentials 8 drop
## 10 HR Compliance Training 6 drop
#step3: visualize results
ggplot(data=completion_status,mapping=aes(x=TrainingProgram,y=count))+
geom_col(color='black',fill='lightblue')+
labs(title='Training Completion Status by Training Program')+
theme(axis.text.x = element_text(angle=45,hjust=1))

#Q9: Are there positions with excessive overtime costs?
#step1: Join the employees and costs table
employees_costs <- left_join(employees,costs,by=c("EmployeeID" ="Employee_ID"))
#step2: sum costs and group by position
position_costs <- employees_costs%>%
group_by (Position)%>%
summarise(total_costs=sum(Misc_Costs,na.rm=TRUE),.groups='drop')%>%
arrange(desc(total_costs))
#step3: print results
print(position_costs)
## # A tibble: 4 × 2
## Position total_costs
## <chr> <dbl>
## 1 Analyst 78535.
## 2 Manager 69085.
## 3 HR Specialist 62902.
## 4 Developer 48720.
#visualize results
ggplot(data=position_costs, mapping=aes(x=Position,y=total_costs))+
geom_col(color='black',fill='lightgreen')+
labs(title='Costs by Position')

#Q10: What is the relationship between salary and performance rating by location
#step1: join the costs and performance table
costs_performance <- left_join (costs,performance,by=c("Employee_ID"="EmployeeID"))
## Warning in left_join(costs, performance, by = c(Employee_ID = "EmployeeID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 11 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#step2: join the costs_performance table above to employees table
costs_performance_employees <- left_join(costs_performance,employees,by=c("Employee_ID"="EmployeeID"))
#step3: aggregate salary and group by location and performance rating
salary_performance <- costs_performance_employees %>%
group_by (Performance_Rating,Work_Location)%>%
summarise (avg_salary = mean(Salary,na.rm=TRUE),.groups='drop')
#step4: print results
print(salary_performance)
## # A tibble: 18 × 3
## Performance_Rating Work_Location avg_salary
## <int> <chr> <dbl>
## 1 1 Chicago 86004
## 2 1 Los Angeles 82909.
## 3 1 New York 84341.
## 4 2 Chicago 83129.
## 5 2 Los Angeles 54848.
## 6 2 New York 87564.
## 7 3 Chicago 91364.
## 8 3 Los Angeles 57073
## 9 3 New York 69315.
## 10 4 Chicago 81881.
## 11 4 Los Angeles 87901.
## 12 4 New York 63588.
## 13 5 Chicago 66598.
## 14 5 Los Angeles 69678
## 15 5 New York 64120.
## 16 NA Chicago 68531.
## 17 NA Los Angeles 80199.
## 18 NA New York 63708.
#step5: visualize results
ggplot(data=salary_performance,mapping=aes(x=Performance_Rating,y=avg_salary,fill=Work_Location))+
geom_col(color='black',position='stack')+
labs(title='Relationship between Salary and Performance Rating by Location')
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_col()`).

#Q11:Does an employee’s salary level influence the likelihood of turnover type
#load nnet package that contains multinom function
library(nnet)
#step1:join the costs and turnover tables
costs_turnover <- left_join(costs,turnover,by=c("Employee_ID"="EmployeeID"))
## Warning in left_join(costs, turnover, by = c(Employee_ID = "EmployeeID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 8 of `x` matches multiple rows in `y`.
## ℹ Row 31 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#step2:establish the logistic regression rate
multi_logistic_model <- multinom(Reason~Salary,data=costs_turnover)
## # weights: 9 (4 variable)
## initial value 98.875106
## final value 95.499055
## converged
#step2: print results
print (multi_logistic_model)
## Call:
## multinom(formula = Reason ~ Salary, data = costs_turnover)
##
## Coefficients:
## (Intercept) Salary
## Retired -0.6644148 5.636636e-06
## Terminated 0.1055740 -9.382469e-06
##
## Residual Deviance: 190.9981
## AIC: 198.9981
#Is there a relationship between employee position and salary?
#step1: calculate logistic regression
logistic_model <-multinom(Position~Salary,data=employees_costs)
## # weights: 12 (6 variable)
## initial value 138.629436
## final value 136.198836
## converged
#step2: print results
print(logistic_model)
## Call:
## multinom(formula = Position ~ Salary, data = employees_costs)
##
## Coefficients:
## (Intercept) Salary
## Developer -0.1914699 -3.741696e-06
## HR Specialist -0.1121410 -9.964701e-07
## Manager 0.7522235 -1.020568e-05
##
## Residual Deviance: 272.3977
## AIC: 284.3977
#Do employees who received training have significantly higher salaries
#than those who didn’t?
#step1:join the training and costs table
training_costs <- left_join(training,costs,by=c("EmployeeID"="Employee_ID"))
## Warning in left_join(training, costs, by = c(EmployeeID = "Employee_ID")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 7 of `x` matches multiple rows in `y`.
## ℹ Row 80 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
#step2:establish means for the multiple groups
anova_result <- aov(Salary~TrainingCompletionStatus,data=training_costs)
#step3: print results
print(anova_result)
## Call:
## aov(formula = Salary ~ TrainingCompletionStatus, data = training_costs)
##
## Terms:
## TrainingCompletionStatus Residuals
## Sum of Squares 219121974 83767167464
## Deg. of Freedom 2 106
##
## Residual standard error: 28111.5
## Estimated effects may be unbalanced
## 32 observations deleted due to missingness
#What is the cost of each employment type as a percentage of the total salary costs
#step1: join the cost and employee tables
costs_employees <- left_join(costs,employees,by=c("Employee_ID"="EmployeeID"))
#step2: calculate the rate
salaries_costs <- costs_employees %>%
group_by (Employment_Type)%>%
summarise(
total_costs = sum(Misc_Costs),
total_salary = sum(Salary),
cost_rate = total_costs/total_salary*100
,.groups='drop')
#step3: print results
print(salaries_costs)
## # A tibble: 3 × 4
## Employment_Type total_costs total_salary cost_rate
## <chr> <dbl> <int> <dbl>
## 1 Contract 91272. 2716520 3.36
## 2 Full-time 76995. 2136316 3.60
## 3 Part-time 90974. 2615555 3.48
#step4: visualize results
ggplot(data=salaries_costs,mapping=aes(x=cost_rate,y=Employment_Type,fill=Employment_Type))+
geom_bar(stat='identity',color='black')+
labs(title='Costs as a percentage of Salaries by Position')

#Q15:Is there a relationship between the position and the reason for turnover?
# Step 1: Ensure both variables are factors
employees_turnover$Position <- as.factor(employees_turnover$Position)
employees_turnover$Reason <- as.factor(employees_turnover$Reason)
# Step 2: Establish relationship using Chi-square test
chi_result <- chisq.test(employees_turnover$Position, employees_turnover$Reason)
# Step 3: Print results
print(chi_result)
##
## Pearson's Chi-squared test
##
## data: employees_turnover$Position and employees_turnover$Reason
## X-squared = 2.8946, df = 6, p-value = 0.822
#Q16:Are there significant salary outliers by employee positions?
# Identify salary outliers within each department
salary_outliers <- employees_costs %>%
group_by(Position) %>%
mutate(
q1_salary = quantile(Salary, 0.25, na.rm = TRUE),
q3_salary = quantile(Salary, 0.75, na.rm = TRUE),
salary_iqr = q3_salary - q1_salary,
lower_bounds_salary = q1_salary - 1.5 * salary_iqr,
upper_bounds_salary = q3_salary + 1.5 * salary_iqr,
is_outlier = Salary < lower_bounds_salary | Salary > upper_bounds_salary
) %>%
filter(is_outlier) # Keep only rows where is_outlier is TRUE
# Print results
print(salary_outliers)
## # A tibble: 0 × 17
## # Groups: Position [0]
## # ℹ 17 variables: EmployeeID <int>, Name <chr>, Position <chr>,
## # Work_Location <chr>, Employment_Type <chr>, Hire_Date <chr>,
## # SupervisorID <int>, Salary_Cost_ID <int>, Salary <int>, Misc_Costs <dbl>,
## # Cost_Date <chr>, q1_salary <dbl>, q3_salary <dbl>, salary_iqr <dbl>,
## # lower_bounds_salary <dbl>, upper_bounds_salary <dbl>, is_outlier <lgl>