Data Preparation

# Generate synthetic employee dataset
# Fix the RNG seed so the simulated data -- and every table, plot and model
# fit derived from it -- is identical on each run.
set.seed(42)

# Number of simulated employees; every column below is drawn at this length.
n_employees <- 200L

# Each column is sampled independently of the others, so no real relationship
# between the variables is built into the data.
employee_data <- tibble(
  employee_id = seq_len(n_employees),
  department = sample(
    c("Sales", "Marketing", "IT", "HR", "Finance"),
    n_employees,
    replace = TRUE
  ),
  # Drawn without replacement: every employee gets a distinct hire day
  hire_date = sample(
    seq(as.Date("2010-01-01"), as.Date("2023-12-31"), by = "day"),
    n_employees
  ),
  performance_score = runif(n_employees, 60, 100),
  salary = round(runif(n_employees, 40000, 120000)),
  age = round(runif(n_employees, 22, 60)),
  years_experience = round(runif(n_employees, 1, 20)),
  # TRUE is drawn with probability 0.2, i.e. roughly 20% simulated turnover
  left_company = sample(
    c(TRUE, FALSE),
    n_employees,
    prob = c(0.2, 0.8),
    replace = TRUE
  )
)

Descriptive Statistics

# Per-department roll-up: mean performance score, mean salary, share of
# employees who stayed (as a percentage) and headcount, ordered from the
# highest to the lowest average performance.
employee_summary <- arrange(
  summarise(
    group_by(employee_data, department),
    avg_performance = mean(performance_score),
    avg_salary = mean(salary),
    # left_company is logical, so the mean of its negation is the stay rate
    retention_rate = 100 * mean(!left_company),
    total_employees = n()
  ),
  desc(avg_performance)
)

# Render the department summary as a styled HTML table
kable_styling(
  kable(
    employee_summary,
    format = "html",
    caption = "Department Performance Overview"
  ),
  bootstrap_options = c("striped", "hover", "condensed")
)
Department Performance Overview
department avg_performance avg_salary retention_rate total_employees
Sales 80.38183 83703.80 80.43478 46
HR 79.56821 78968.09 91.17647 34
Finance 79.36061 75825.72 69.76744 43
Marketing 78.97990 82973.00 82.50000 40
IT 77.33365 80541.57 75.67568 37

Performance and Retention Analysis

# Overlaid density curves of performance score, one per retention outcome
ggplot(
  data = employee_data,
  mapping = aes(x = performance_score, fill = left_company)
) +
  # Semi-transparent fills keep both curves visible where they overlap
  geom_density(alpha = 0.5) +
  scale_fill_brewer(palette = "Set1") +
  theme_minimal() +
  labs(
    fill = "Left Company",
    y = "Density",
    x = "Performance Score",
    title = "Performance Score Distribution by Retention"
  )

# Pairwise correlations (cor() defaults) among the four numeric employee
# attributes
cor_matrix <- cor(
  employee_data[, c("performance_score", "salary", "age", "years_experience")]
)

# Render the correlation matrix as a styled table
kable_styling(
  kable(cor_matrix, caption = "Correlation Matrix"),
  bootstrap_options = c("striped", "hover")
)
Correlation Matrix
performance_score salary age years_experience
performance_score 1.0000000 0.0533547 -0.0213555 -0.0397686
salary 0.0533547 1.0000000 0.0098798 -0.0114841
age -0.0213555 0.0098798 1.0000000 -0.0585347
years_experience -0.0397686 -0.0114841 -0.0585347 1.0000000

Retention Prediction Model

# Fit a binomial GLM (logistic regression) of the turnover flag left_company
# on performance score, salary, years of experience and age
retention_model <- glm(
  formula = left_company ~
    performance_score + salary + years_experience + age,
  family = binomial(),
  data = employee_data
)

# Coefficient estimates with standard errors and p-values, plus deviances
# and AIC for the fitted model
summary(retention_model)
## 
## Call:
## glm(formula = left_company ~ performance_score + salary + years_experience + 
##     age, family = binomial(), data = employee_data)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)       -1.562e+00  1.583e+00  -0.987    0.324
## performance_score  2.870e-04  1.551e-02   0.019    0.985
## salary             6.027e-06  8.064e-06   0.747    0.455
## years_experience   9.752e-03  3.399e-02   0.287    0.774
## age               -1.025e-02  1.584e-02  -0.647    0.518
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 202.90  on 199  degrees of freedom
## Residual deviance: 201.83  on 195  degrees of freedom
## AIC: 211.83
## 
## Number of Fisher Scoring iterations: 4

Key Insights

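  1. Average performance differs only slightly across departments (about 77 to 80), and no significance test was run
  2. Performance score shows no meaningful association with retention (logistic regression coefficient p = 0.985)
  3. Salary and years of experience are not statistically significant predictors of turnover (p = 0.455 and p = 0.774)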

Conclusion

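This analysis demonstrates a workflow for examining employee performance and potential factors influencing retention. Because the dataset is synthetic and each variable is generated independently at random, none of the examined factors shows a meaningful effect on retention.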

# Save the per-department summary table as a CSV file; the relative path
# resolves against the current working directory
employee_summary %>%
  write_csv("department_performance_summary.csv")