library(readr)
library(ggplot2)
# Load the HR dataset straight from its public GitHub location into a tibble.
# Column types are left for readr to guess from the file contents.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pearson correlation test between satisfaction level and average monthly hours.
# `alternative` and `method` are spelled out but equal cor.test()'s defaults.
# The x/y expressions are kept as `hr$...` because cor.test() echoes them in
# the "data:" line of the printed result.
cor_result <- cor.test(
  x = hr$satisfaction_level,
  y = hr$average_montly_hours,
  alternative = "two.sided",
  method = "pearson"
)
print(cor_result)
##
## Pearson's product-moment correlation
##
## data: hr$satisfaction_level and hr$average_montly_hours
## t = -2.4556, df = 14997, p-value = 0.01408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.036040356 -0.004045605
## sample estimates:
## cor
## -0.02004811
Average monthly hours vs. satisfaction level: the correlation of r = -0.02004811 indicates an inverse relationship, where satisfaction level decreases as monthly hours increase. However, the relationship is weak, if not nonexistent, because the magnitude of the correlation (0.02) is very close to zero. The p-value (0.014) is below 0.05, but with nearly 15,000 observations even a negligible correlation can reach statistical significance.
In practical terms, there is no meaningful relationship between satisfaction level and average monthly hours.
# Scatter plot of satisfaction level against average monthly hours, overlaid
# with a straight-line fit.
ggplot(
  data = hr,
  mapping = aes(x = average_montly_hours, y = satisfaction_level)
) +
  # Translucent markers keep densely overlapping regions readable
  geom_point(color = "steelblue", alpha = 0.4) +
  # Linear fit drawn together with its confidence band
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  # Base theme first, then the tweaks that override it
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    plot.title = element_text(face = "bold", size = 12, hjust = 0.5)
  ) +
  labs(
    x = "Average Monthly Hours",
    y = "Satisfaction Level",
    title = "There is basically no connection between how satisfied employees are\nand how many hours they work each month"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Pearson correlation test between last evaluation score and average monthly
# hours. `alternative` and `method` are written out but match the defaults.
# The `hr$...` expressions are kept as-is since they appear in the printed
# "data:" line.
cor_result_eval_hours <- cor.test(
  x = hr$last_evaluation,
  y = hr$average_montly_hours,
  alternative = "two.sided",
  method = "pearson"
)
print(cor_result_eval_hours)
##
## Pearson's product-moment correlation
##
## data: hr$last_evaluation and hr$average_montly_hours
## t = 44.237, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3255078 0.3538218
## sample estimates:
## cor
## 0.3397418
The correlation coefficient of r = 0.34 indicates a low-to-moderate positive relationship. The p-value < 2.2e-16 means the result is statistically significant: we can reject the null hypothesis that there is no relationship.
As the number of monthly hours increases, the evaluation score tends to increase.
# Scatter plot of last evaluation score against average monthly hours with a
# fitted linear trend and its confidence band.
ggplot(
  data = hr,
  mapping = aes(x = average_montly_hours, y = last_evaluation)
) +
  # Low alpha so overlapping observations show up as darker regions
  geom_point(color = "blue", alpha = 0.3) +
  geom_smooth(method = "lm", se = TRUE, color = "red") +
  theme_minimal() +
  labs(
    x = "Average Monthly Hours",
    y = "Last Evaluation Score",
    title = "Relationship between employee evaluation scores and monthly hours worked"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Pearson correlation test between satisfaction level and number of projects.
# `alternative` and `method` are stated explicitly but equal the defaults.
# The `hr$...` expressions are preserved because cor.test() prints them in
# its "data:" line.
cor_result_projects <- cor.test(
  x = hr$satisfaction_level,
  y = hr$number_project,
  alternative = "two.sided",
  method = "pearson"
)
print(cor_result_projects)
##
## Pearson's product-moment correlation
##
## data: hr$satisfaction_level and hr$number_project
## t = -17.69, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1586105 -0.1272570
## sample estimates:
## cor
## -0.1429696
The correlation is r = -0.1429696, which indicates a weak negative relationship. The p-value < 2.2e-16 means the result is statistically significant, so we reject the null hypothesis of no correlation with extreme confidence.
As the number of projects increases, the level of satisfaction tends to decrease, and we are confident that this is a real pattern and not a coincidence.
# Scatter plot of satisfaction level against number of projects with a fitted
# linear trend and its confidence band.
#
# number_project appears to take only a handful of distinct values (the script
# uses its unique values as axis breaks elsewhere), so un-jittered points pile
# up in a few vertical strips. Points are therefore jittered horizontally only
# (height = 0), which spreads each strip out while leaving every satisfaction
# value at its true y position. geom_smooth() still fits on the original,
# un-jittered data because the jitter applies to the point layer alone.
ggplot(hr, aes(x = number_project, y = satisfaction_level)) +
  geom_jitter(alpha = 0.3, color = "blue", width = 0.15, height = 0) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  # One tick per observed project count instead of automatic breaks
  scale_x_continuous(breaks = sort(unique(hr$number_project))) +
  labs(
    title = "The more projects employees have, the less satisfied they tend to be",
    x = "Number of Projects",
    y = "Satisfaction Level"
  ) +
  # Same base theme as the other plots in this report
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Pearson correlation test between time spent at the company and number of
# projects. `alternative` and `method` are explicit but match the defaults.
# The `hr$...` expressions are kept verbatim since they are echoed in the
# printed "data:" line.
cor_result_time_projects <- cor.test(
  x = hr$time_spend_company,
  y = hr$number_project,
  alternative = "two.sided",
  method = "pearson"
)
print(cor_result_time_projects)
##
## Pearson's product-moment correlation
##
## data: hr$time_spend_company and hr$number_project
## t = 24.579, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1813532 0.2121217
## sample estimates:
## cor
## 0.1967859
r = 0.197, meaning a weak positive relationship is present.
Employees who have been at the company longer tend to have slightly more projects, but the relationship is weak.
# Jittered scatter plot of number of projects against time spent at the
# company, with a fitted linear trend and its confidence band.
#
# Points are nudged slightly in both directions so that observations sharing
# the same (x, y) pair do not collapse into a single marker; axis ticks are
# placed at the values actually observed in each column.
ggplot(hr, aes(x = time_spend_company, y = number_project)) +
  geom_jitter(alpha = 0.3, color = "blue", width = 0.1, height = 0.1) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  scale_x_continuous(breaks = unique(hr$time_spend_company)) +
  scale_y_continuous(breaks = unique(hr$number_project)) +
  labs(
    # Explicit "\n" instead of a raw line break inside the literal: a raw
    # break would carry any indentation of the continuation line into the
    # rendered (centred) title.
    title = "Employees who have been at the company longer\ntend to have slightly more projects",
    x = "Years at Company",
    y = "Number of Projects"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 11))
## `geom_smooth()` using formula = 'y ~ x'