library(readr)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the HR CSV from GitHub into `hr`; readr guesses the column types
# and reports them in the message below.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
For each of the four t-tests:
- Perform the t-test (.5 point): Choose any two appropriate variables from the data and perform the t-test, displaying the results.
- Interpret the results in technical terms (.5 point): For each t-test, explain what the test’s p-value means (significance).
- Interpret the results in non-technical terms (1 point): For each t-test, what do the results mean in non-technical terms?
- Create a plot that helps visualize the t-test (.5 point): For each t-test, create a graph to help visualize the difference between means, if any. The title must be the non-technical interpretation.
# Welch two-sample t-test: does mean satisfaction_level differ between
# employees who stayed (left == 0) and employees who left (left == 1)?
# Technical reading of the recorded result: p < 2.2e-16, far below 0.05, so
# the difference in group means (about 0.67 vs 0.44) is statistically
# significant.
# Non-technical reading: employees who left were, on average, noticeably
# less satisfied than those who stayed.
t_test_satisfaction <- t.test(hr$satisfaction_level ~ hr$left)
print(t_test_satisfaction)
##
## Welch Two Sample t-test
##
## data: hr$satisfaction_level by hr$left
## t = 46.636, df = 5167, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 0.2171815 0.2362417
## sample estimates:
## mean in group 0 mean in group 1
## 0.6668096 0.4400980
# Box plot of satisfaction level for employees who stayed vs. left.
# `left` is coded 0/1; label the groups and keep them in the same order as
# the t-test output (group 0 = stayed, group 1 = left).
plot_data <- hr %>%
  mutate(
    Attrition = factor(left, levels = c(0, 1), labels = c("Stayed", "Left"))
  )

# boxmean = TRUE draws each group's mean inside its box. The t-test compares
# means, while a plain box plot only shows medians and quartiles.
plot_ly(
  plot_data,
  x = ~Attrition,
  y = ~satisfaction_level,
  type = "box",
  boxmean = TRUE
) %>%
  layout(
    title = "Employees with lower satisfaction levels are more likely to leave"
  )
# Welch two-sample t-test: does mean average_montly_hours differ between
# employees who stayed (left == 0) and employees who left (left == 1)?
# The column name's spelling ("montly") comes from the dataset itself.
t_test_hours <- t.test(hr$average_montly_hours ~ hr$left)
print(t_test_hours)
##
## Welch Two Sample t-test
##
## data: hr$average_montly_hours by hr$left
## t = -7.5323, df = 4875.1, p-value = 5.907e-14
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -10.534631 -6.183384
## sample estimates:
## mean in group 0 mean in group 1
## 199.0602 207.4192
Since the p-value is much lower than 0.05, there is a statistically significant difference in average monthly hours between employees who left and those who stayed.
The results show employees with higher average monthly hours were more likely to leave the company.
# Box plot of average monthly hours for employees who stayed vs. left.
# `left` is coded 0/1; label the groups and keep them in the same order as
# the t-test output (group 0 = stayed, group 1 = left).
plot_data <- hr %>%
  mutate(
    Attrition = factor(left, levels = c(0, 1), labels = c("Stayed", "Left"))
  )

# boxmean = TRUE draws each group's mean inside its box. The t-test compares
# means, while a plain box plot only shows medians and quartiles.
plot_ly(
  plot_data,
  x = ~Attrition,
  y = ~average_montly_hours,
  type = "box",
  boxmean = TRUE
) %>%
  layout(
    title = "Employees with higher average monthly hours are more likely to leave"
  )
# Welch two-sample t-test: does mean last_evaluation differ between
# employees who stayed (left == 0) and employees who left (left == 1)?
t_test_evaluation <- t.test(hr$last_evaluation ~ hr$left)
print(t_test_evaluation)
##
## Welch Two Sample t-test
##
## data: hr$last_evaluation by hr$left
## t = -0.72534, df = 5154.9, p-value = 0.4683
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.009772224 0.004493874
## sample estimates:
## mean in group 0 mean in group 1
## 0.7154734 0.7181126
The p-value is greater than 0.05, meaning there is no statistically significant difference in last evaluation scores between employees who left and those who stayed.
The results show that evaluation scores do not appear to be a factor in whether an employee stays or leaves.
# Box plot of last evaluation score for employees who stayed vs. left.
# `left` is coded 0/1; label the groups and keep them in the same order as
# the t-test output (group 0 = stayed, group 1 = left).
plot_data <- hr %>%
  mutate(
    Attrition = factor(left, levels = c(0, 1), labels = c("Stayed", "Left"))
  )

# boxmean = TRUE draws each group's mean inside its box. The t-test compares
# means, while a plain box plot only shows medians and quartiles.
plot_ly(
  plot_data,
  x = ~Attrition,
  y = ~last_evaluation,
  type = "box",
  boxmean = TRUE
) %>%
  layout(
    title = "Evaluation scores do not seem to impact employee attrition"
  )
# Welch two-sample t-test: does mean time_spend_company differ between
# employees who stayed (left == 0) and employees who left (left == 1)?
t_test_time_spent <- t.test(hr$time_spend_company ~ hr$left)
print(t_test_time_spent)
##
## Welch Two Sample t-test
##
## data: hr$time_spend_company by hr$left
## t = -22.631, df = 9625.6, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.5394767 -0.4534706
## sample estimates:
## mean in group 0 mean in group 1
## 3.380032 3.876505
Since the p-value is extremely low, we conclude that the difference in time spent at the company between employees who left and those who stayed is statistically significant.
The results show employees who have been at the company for more years are more likely to leave.
# Box plot of time spent at the company for employees who stayed vs. left.
# `left` is coded 0/1; label the groups and keep them in the same order as
# the t-test output (group 0 = stayed, group 1 = left).
plot_data <- hr %>%
  mutate(
    Attrition = factor(left, levels = c(0, 1), labels = c("Stayed", "Left"))
  )

# boxmean = TRUE draws each group's mean inside its box. The t-test compares
# means, while a plain box plot only shows medians and quartiles.
plot_ly(
  plot_data,
  x = ~Attrition,
  y = ~time_spend_company,
  type = "box",
  boxmean = TRUE
) %>%
  layout(
    title = "Employees with longer tenure are more likely to leave"
  )