library(readr)

# Read the HR dataset (a comma-separated file hosted on GitHub) into `hr`.
hr <- read_csv(
  file = 'https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv'
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Tag every mtcars row with a transmission label derived from `am`
# (am == 0 -> 'Automatic', anything else -> 'Manual'), stored as a factor.
plot_data <- mutate(
  mtcars,
  Transmision = as.factor(ifelse(am == 0, 'Automatic', 'Manual'))
)

# Box plot of `mpg`, one box per transmission label.
plot_data %>%
  plot_ly(
    x = ~Transmision,
    y = ~mpg,
    type = 'box'
  )
# Copy `hr` and add a text label for the `left` column:
# rows with left == 1 become "Left", every other row becomes "Active".
hr1 <- mutate(hr, employee_status = ifelse(left == 1, "Left", "Active"))

#1a.

# Two-sample t-test of average monthly hours between the two status groups
# (printed directly rather than stored).
t.test(hr1$average_montly_hours ~ hr1$employee_status)
## 
##  Welch Two Sample t-test
## 
## data:  hr1$average_montly_hours by hr1$employee_status
## t = -7.5323, df = 4875.1, p-value = 5.907e-14
## alternative hypothesis: true difference in means between group Active and group Left is not equal to 0
## 95 percent confidence interval:
##  -10.534631  -6.183384
## sample estimates:
## mean in group Active   mean in group Left 
##             199.0602             207.4192

#1b.
# There is a significant difference between the group means: employees who
# left worked at least 6 more hours per month on average (95% CI for the
# difference: 6.18 to 10.53 hours).

#1c. Employees who left worked more hours on average, at least 3% more than those who stayed.

#1d.

# Box plot of average monthly hours, one coloured box per employee status.
hr1 %>%
  plot_ly(
    x = ~employee_status,
    y = ~average_montly_hours,
    type = 'box',
    color = ~employee_status,
    colors = c('#29a21a', 'blue')
  ) %>%
  layout(
    title = 'employees that left on average, work more hours, at least 3% more',
    # Pin the y axis to 0-350 so it starts at zero.
    yaxis = list(title = 'Average Monthly Hours', range = c(0, 350)),
    xaxis = list(title = 'employee status')
  )
# Convert the grouping columns used by the plots below into factors so they
# are treated as categories rather than numbers or free text.
hr <- hr %>%
  mutate(across(c(left, promotion_last_5years, Department), as.factor))

#2a.

# Two-sample t-test (Welch by default): do average monthly hours differ
# between employees promoted in the last 5 years and those who were not?
# The formula's columns are looked up in `hr1` through `data =`, which avoids
# repeating `hr1$` on every term and gives a clean data name in the result.
t_test1 <- t.test(average_montly_hours ~ promotion_last_5years, data = hr1)

#2b.
# The p-value is the probability of observing a difference in average monthly
# hours between promoted and non-promoted employees at least as large as the
# one in the data, if the true group means were equal.

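#2c. Promoted employees work a significantly different number of hours on average each month compared to those who weren’t promoted.
# NOTE(review): this holds only if t_test1$p.value < 0.05; the test result is never printed in this script, so confirm before relying on it.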

#2d.

# Box plot of average monthly hours split by promotion status.
# The title is chosen from the stored t-test result (5% significance level).
hr %>%
  plot_ly(
    x = ~promotion_last_5years,
    y = ~average_montly_hours,
    type = 'box',
    color = ~promotion_last_5years,
    colors = c('#D46A6A', '#4DAF7C')  # one colour per promotion group
  ) %>%
  layout(
    title = if (t_test1$p.value < 0.05) {
      'Promoted employees work a different number of hours per month'
    } else {
      'No difference in hours worked between promoted and non-promoted employees'
    },
    yaxis = list(title = 'Average Monthly Hours', range = c(0, 350)),
    xaxis = list(title = 'Promotion Status (0 = Not Promoted, 1 = Promoted)')
  )

#3a.

# Two-sample t-test (Welch by default): does satisfaction level differ between
# employees who left (left == 1) and those who stayed (left == 0)?
# The formula's columns are looked up in `hr1` through `data =`, which avoids
# repeating `hr1$` on every term and gives a clean data name in the result.
t_test2 <- t.test(satisfaction_level ~ left, data = hr1)

#3b.
# The p-value is the probability of observing a difference in satisfaction
# levels between employees who left and those who stayed at least as large as
# the one in the data, if the true group means were equal.

#3c. Employees who left the company had a significantly different level of satisfaction compared to those who stayed. This suggests that satisfaction levels may influence whether an employee decides to leave.

#3d.

# Box plot of satisfaction level for employees who stayed vs. left.
# The title is chosen from the stored t-test result (5% significance level).
hr %>%
  plot_ly(
    x = ~left,
    y = ~satisfaction_level,
    type = 'box',
    color = ~left,
    colors = c('#F1C40F', '#2980B9')  # one colour per `left` group
  ) %>%
  layout(
    title = if (t_test2$p.value < 0.05) {
      'Employees who left had different satisfaction levels than those who stayed'
    } else {
      'No difference in satisfaction levels between employees who left and those who stayed'
    },
    yaxis = list(title = 'Satisfaction Level', range = c(0, 1)),
    xaxis = list(title = 'Employee Status (0 = Stayed, 1 = Left)')
  )

#4a.

# Two-sample t-test (Welch by default) of last-evaluation scores between the
# sales and technical departments only.
# `subset =` applies one row filter to the whole model frame, so the response
# and the grouping column are always restricted to exactly the same rows of
# `hr1` (no separate masks built from different data frames).
t_test3 <- t.test(
  last_evaluation ~ Department,
  data = hr1,
  subset = Department %in% c("sales", "technical")
)

#4b.
# The p-value is the probability of observing a difference in last-evaluation
# scores between the sales and technical departments at least as large as the
# one in the data, if the true department means were equal.

#4c. Employees in the sales department have different evaluation scores on average than those in the technical department. This suggests that department may influence evaluation outcomes.

#4d.

# Box plot of last-evaluation scores for the two departments compared in
# t_test3. The data is filtered to sales and technical first so the plot
# matches its title, the t-test, and the two-colour palette (unfiltered, every
# department in `hr1` would get its own box).
# The title is chosen from the stored t-test result (5% significance level).
hr1 %>%
  dplyr::filter(Department %in% c("sales", "technical")) %>%
  plot_ly(
    x = ~Department,
    y = ~last_evaluation,
    type = 'box',
    color = ~Department,
    colors = c('#FF5733', '#1F618D')  # one colour per department
  ) %>%
  layout(
    title = if (t_test3$p.value < 0.05) {
      'Evaluation scores differ between sales and technical departments'
    } else {
      'No difference in evaluation scores between sales and technical departments'
    },
    yaxis = list(title = 'Last Evaluation Score', range = c(0, 1)),
    xaxis = list(title = 'Department')
  )