# Set the CRAN mirror that install.packages() will download from.
options(repos = c(CRAN = "https://cran.rstudio.com/"))
# Install readr only when it is not already available, so re-running the
# analysis does not download and reinstall the package every time.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}

## 
## The downloaded binary packages are in
##  /var/folders/6q/3nbyz_h95ks8z6lqpx_k5x_40000gn/T//RtmpqkBIVU/downloaded_packages

# Attach readr, which provides read_csv().
library(readr)

# Read the HR dataset from the GitHub-hosted CSV into `hr`.
hr <- read_csv(
  file = "https://raw.githubusercontent.com/aiplanethub/Datasets/refs/heads/master/HR_comma_sep.csv"
)

## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

1. Perform the correlation (.5 point) Choose any two appropriate variables from the data and perform the correlation, displaying the results.

# Pearson correlation test: satisfaction level vs. last evaluation score.
cor1 <- cor.test(
  x = hr$satisfaction_level,
  y = hr$last_evaluation,
  method = "pearson"
)

# Display the test statistic, p-value, confidence interval and estimate.
print(cor1)

## 
##  Pearson's product-moment correlation
## 
## data:  hr$satisfaction_level and hr$last_evaluation
## t = 12.933, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.08916727 0.12082195
## sample estimates:
##       cor 
## 0.1050212

# Attach ggplot2, which provides ggplot(), the geoms and labs().
library(ggplot2)

# Scatter plot of last evaluation score against satisfaction level, with a
# fitted straight line (method = "lm") and no confidence band (se = FALSE).
ggplot(hr, aes(x = satisfaction_level, y = last_evaluation)) +
  # Semi-transparent points: with roughly 15,000 rows, opaque points overlap
  # so heavily that the density of the cloud cannot be seen.
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between Satisfaction Level and Last Evaluation Score",
    x = "Satisfaction Level",
    y = "Last Evaluation Score"
  )

## `geom_smooth()` using formula = 'y ~ x'

#Because the p-value is extremely low (about 4.7e-38, far below 0.05), the correlation between satisfaction level and the last evaluation of workers is statistically significant. The correlation itself is positive but weak (r ≈ 0.105).

#The results suggest that employees' satisfaction with their job is connected to how well they perform in their evaluations. For example, employees who are more satisfied may be performing better or receiving higher scores in evaluations

2. Interpret the results in technical terms (.5 point) For each correlation, explain what the test’s p-value means (significance).

# Pearson correlation test: satisfaction level vs. average monthly hours.
cor2 <- cor.test(
  x = hr$satisfaction_level,
  y = hr$average_montly_hours,
  method = "pearson"
)

# Display the test statistic, p-value, confidence interval and estimate.
print(cor2)

## 
##  Pearson's product-moment correlation
## 
## data:  hr$satisfaction_level and hr$average_montly_hours
## t = -2.4556, df = 14997, p-value = 0.01408
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.036040356 -0.004045605
## sample estimates:
##         cor 
## -0.02004811

# Scatter plot of average monthly hours against satisfaction level, with a
# fitted straight line (method = "lm") and no confidence band (se = FALSE).
ggplot(hr, aes(x = satisfaction_level, y = average_montly_hours)) +
  # Semi-transparent points: with roughly 15,000 rows, opaque points overlap
  # so heavily that the density of the cloud cannot be seen.
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Is Employee Satisfaction Linked to Monthly Working Hours?",
    x = "Satisfaction Level",
    y = "Average Monthly Hours"
  )

## `geom_smooth()` using formula = 'y ~ x'

#Because the p-value (0.0141) is below 0.05, the correlation is statistically significant. This means the relationship between satisfaction level and average monthly hours worked is unlikely to be due to chance alone, although the correlation is negative and very weak (r ≈ -0.02).

#This result suggests that there is some connection between how satisfied employees feel and the amount of time they work each month. For instance, employees who work more hours might be less satisfied, or perhaps those with higher satisfaction work a moderate number of hours.

3. Interpret the results in non-technical terms (1 point) For each correlation, what do the results mean in non-technical terms.

# Pearson correlation test: number of projects vs. last evaluation score.
cor3 <- cor.test(
  x = hr$number_project,
  y = hr$last_evaluation,
  method = "pearson"
)

# Display the test statistic, p-value, confidence interval and estimate.
print(cor3)

## 
##  Pearson's product-moment correlation
## 
## data:  hr$number_project and hr$last_evaluation
## t = 45.656, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3352028 0.3633053
## sample estimates:
##       cor 
## 0.3493326

# Scatter plot of last evaluation score against number of projects, with a
# fitted straight line (method = "lm") and no confidence band (se = FALSE).
ggplot(hr, aes(x = number_project, y = last_evaluation)) +
  # Semi-transparent points: with roughly 15,000 rows, opaque points overlap
  # so heavily that the density of the cloud cannot be seen.
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Do More Projects Lead to Higher Evaluation Scores?",
    x = "Number of Projects",
    y = "Last Evaluation Score"
  )

## `geom_smooth()` using formula = 'y ~ x'

#Because the p-value is below 2.2e-16 (effectively 0, and far below 0.05), the result is highly statistically significant. There is a moderate positive correlation (r ≈ 0.349) between the number of projects completed and the last evaluation of workers.

#The results show that employees who work on more projects tend to have higher evaluation scores. This could mean that taking on additional projects may be associated with better evaluations, perhaps reflecting high engagement or productivity.

4. Create a plot that helps visualize the correlation (.5 point) For each correlation, create a graph to help visualize the relationship between the two variables. The title must be the non-technical interpretation.

# Pearson correlation test: years at the company vs. average monthly hours.
cor4 <- cor.test(
  x = hr$time_spend_company,
  y = hr$average_montly_hours,
  method = "pearson"
)

# Display the test statistic, p-value, confidence interval and estimate.
print(cor4)

## 
##  Pearson's product-moment correlation
## 
## data:  hr$time_spend_company and hr$average_montly_hours
## t = 15.774, df = 14997, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1119801 0.1434654
## sample estimates:
##       cor 
## 0.1277549

# Scatter plot of average monthly hours against time spent at the company,
# with a fitted straight line (method = "lm") and no confidence band
# (se = FALSE).
ggplot(hr, aes(x = time_spend_company, y = average_montly_hours)) +
  # Semi-transparent points: with roughly 15,000 rows, opaque points overlap
  # so heavily that the density of the cloud cannot be seen.
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Does Time at the Company Affect Monthly Working Hours?",
    x = "Time Spent at Company (years)",
    y = "Average Monthly Hours"
  )

## `geom_smooth()` using formula = 'y ~ x'

#Because the p-value is 1.31e-55, which is lower than 0.05, there is a statistically significant positive correlation between time spent with the company and average monthly hours worked, although the correlation is weak (r ≈ 0.128).

#This finding suggests that the length of time employees have been with the company is related to how many hours they work each month. For example, employees who have been with the company longer might be taking on heavier workloads, or perhaps newer employees work fewer hours as they are onboarded.

DATA3210 Assignment7

Andrew deLaricheliere

2024-11-06

1. Perform the correlation (.5 point) Choose any two appropriate variables from the data and perform the correlation, displaying the results.

2. Interpret the results in technical terms (.5 point) For each correlation, explain what the test’s p-value means (significance).

3. Interpret the results in non-technical terms (1 point) For each correlation, what do the results mean in non-technical terms.

4. Create a plot that helps visualize the correlation (.5 point) For each correlation, create a graph to help visualize the relationship between the two variables. The title must be the non-technical interpretation.