# Load necessary libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggcorrplot)

# Load the HR dataset
library(readr)
# Read the HR export into `data`. The path is relative, so the CSV has to be in
# the working directory the script is run from.
data <- read_csv("HR_comma_sep-1.csv")
## Rows: 14999 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sales, salary
## dbl (8): satisfaction_level, last_evaluation, number_project, average_montly...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only when someone is at the console:
# View() needs a GUI, so it is skipped under Rscript or when knitting.
if (interactive()) {
  View(data)
}

1. Create a box plot for employee satisfaction and one for last evaluation, each broken out by the variable `left`. This means that for each variable there will be two box plots, side by side: each box represents the same variable, but one is filtered for left = 0 and the other for left = 1.

#1.(a) Box Plot for Employee Satisfaction

# Box Plot for Employee Satisfaction

# Recode `left` from its 0/1 encoding into a labelled factor so the plots show
# "Stayed" / "Left" on the x axis. The guard keeps this step safe to re-run:
# calling factor(..., levels = c(0, 1)) on the already-labelled factor matches
# none of the levels and would silently turn every value into NA.
if (!is.factor(data$left)) {
  data$left <- factor(
    data$left,
    levels = c(0, 1),
    labels = c("Stayed", "Left")
  )
}

# Box plots of satisfaction_level, one box per level of `left`
satisfaction_plot <- ggplot(data, aes(x = left, y = satisfaction_level)) +
  geom_boxplot() +
  labs(
    title = "Employee Satisfaction by Left Status",
    x = "Employee Status",
    y = "Satisfaction Level"
  ) +
  theme_minimal()
satisfaction_plot

###1.(b) Box Plot for Last Evaluation

# Last-evaluation scores drawn as one box per value of `left`
evaluation_plot <- data %>%
  ggplot(mapping = aes(x = left, y = last_evaluation)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Last Evaluation by Left Status",
    x = "Employee Status",
    y = "Last Evaluation"
  )
evaluation_plot

###2. Using the HR_comma_sep dataset, create a correlogram, only for continuous variables

# Columns that feed the correlogram.
# NOTE(review): judging by their names, Work_accident and promotion_last_5years
# look like 0/1 flags rather than continuous measures — confirm against the
# data before presenting this as "continuous variables only".
continuous_vars <- select(
  data,
  satisfaction_level,
  last_evaluation,
  number_project,
  average_montly_hours,
  Work_accident,
  promotion_last_5years
)

# Pairwise correlations between the selected columns (cor() defaults)
correlation_matrix <- continuous_vars %>% cor()

# Upper triangle only, drawn as circles, with each coefficient printed as a label
ggcorrplot(
  corr = correlation_matrix,
  method = "circle",
  type = "upper",
  lab = TRUE,
  colors = c("blue", "white", "orange"),
  title = "Correlogram of Continuous Variables"
)

# Print summary statistics for `cars`. Nothing in the code shown here creates
# this object, and the speed/dist columns in the output below are not columns
# of the HR data loaded above.
# NOTE(review): presumably R's built-in `cars` dataset left over from the
# default R Markdown template — confirm, and drop it if it is not part of the
# analysis.
summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00