library(tidyverse)
library(dplyr)
Data Load & Pre-compute
# data
employees <- read_csv("employees_data.csv") %>%
mutate(Remote = tolower(as.character(RemoteWorker)) %in% c("yes","true","1"))
## Rows: 100 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Department, RemoteWorker
## dbl (7): EmployeeID, YearsAtCompany, MonthlySalary, PerformanceRating, Age, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
avg_tr <- mean(employees$TrainingHours)
avg_sal <- mean(employees$MonthlySalary)
Q1
q1 <- employees %>%
filter(YearsAtCompany > 8) %>%
summarise(Count = n())
q1 #52
## # A tibble: 1 × 1
## Count
## <int>
## 1 52
Q2
q2 <- employees %>%
filter(!Remote, YearsAtCompany < 3, TrainingHours > avg_tr) %>%
summarise(Count = n())
q2 # 3
## # A tibble: 1 × 1
## Count
## <int>
## 1 3
Q3
q3 <- employees %>%
filter(YearsAtCompany <= 2) %>%
count(Department, name = "NewerCount")
q3 # A tibble: 5 × 2
## # A tibble: 5 × 2
## Department NewerCount
## <chr> <int>
## 1 Engineering 1
## 2 Finance 2
## 3 HR 5
## 4 Marketing 3
## 5 Sales 4
Q4
q4_overall <- tibble(AverageTraining = avg_tr)
q4_by_dept <- employees %>%
group_by(Department) %>%
summarise(AverageTraining = mean(TrainingHours, na.rm = TRUE))
q4_by_dept
## # A tibble: 5 × 2
## Department AverageTraining
## <chr> <dbl>
## 1 Engineering 50.7
## 2 Finance 51.5
## 3 HR 47.4
## 4 Marketing 43.3
## 5 Sales 46.1
Q5
q5 <- employees %>%
filter(TrainingHours > avg_tr) %>%
group_by(Department) %>%
summarise(AveragePerformanceRating = mean(PerformanceRating, na.rm = TRUE)) %>%
arrange(desc(AveragePerformanceRating))
q5
## # A tibble: 5 × 2
## Department AveragePerformanceRating
## <chr> <dbl>
## 1 HR 3
## 2 Engineering 2.89
## 3 Sales 2.89
## 4 Marketing 2.73
## 5 Finance 2.67
Q6
q6 <- employees %>%
group_by(Department) %>%
summarise(AverageSalary = mean(MonthlySalary, na.rm = TRUE)) %>%
filter(AverageSalary > avg_sal) %>%
arrange(desc(AverageSalary))
q6
## # A tibble: 3 × 2
## Department AverageSalary
## <chr> <dbl>
## 1 Sales 10172
## 2 Engineering 9165.
## 3 Marketing 9128.
Q7
q7 <- employees %>%
group_by(Department) %>%
summarise(AverageSalary = mean(MonthlySalary, na.rm = TRUE)) %>%
arrange(desc(AverageSalary))
q7
## # A tibble: 5 × 2
## Department AverageSalary
## <chr> <dbl>
## 1 Sales 10172
## 2 Engineering 9165.
## 3 Marketing 9128.
## 4 Finance 8611.
## 5 HR 8346.
Q9
q9 <- employees %>%
group_by(Department) %>%
summarise(
Headcount = n(),
AvgTraining = mean(TrainingHours, na.rm = TRUE)
) %>%
arrange(desc(Headcount), desc(AvgTraining))
q9
## # A tibble: 5 × 3
## Department Headcount AvgTraining
## <chr> <int> <dbl>
## 1 Marketing 26 43.3
## 2 Finance 21 51.5
## 3 Sales 19 46.1
## 4 HR 18 47.4
## 5 Engineering 16 50.7
Q10
q10 <- employees %>%
group_by(Department) %>%
summarise(RemoteShare = mean(Remote, na.rm = TRUE)) %>%
arrange(desc(RemoteShare))
q10
## # A tibble: 5 × 2
## Department RemoteShare
## <chr> <dbl>
## 1 Sales 0.421
## 2 HR 0.389
## 3 Marketing 0.231
## 4 Finance 0.190
## 5 Engineering 0.188
Q11 (long view, ordered by higher avg within dept)
q11 <- employees %>%
group_by(Department, Remote) %>%
summarise(AverageSalary = mean(MonthlySalary, na.rm = TRUE)) %>%
mutate(Remote = if_else(Remote, "Remote", "NonRemote")) %>%
arrange(Department, desc(AverageSalary))
## `summarise()` has grouped output by 'Department'. You can override using the
## `.groups` argument.
q11
## # A tibble: 10 × 3
## # Groups: Department [5]
## Department Remote AverageSalary
## <chr> <chr> <dbl>
## 1 Engineering NonRemote 9648.
## 2 Engineering Remote 7075.
## 3 Finance NonRemote 8779.
## 4 Finance Remote 7899.
## 5 HR NonRemote 8399.
## 6 HR Remote 8264.
## 7 Marketing Remote 9474
## 8 Marketing NonRemote 9024.
## 9 Sales Remote 10961.
## 10 Sales NonRemote 9598.
Q12 (counts + top remote dept)
q12_counts <- employees %>%
group_by(Department, Remote) %>%
summarise(EmployeeCount = n()) %>%
mutate(Remote = if_else(Remote, "Remote", "NonRemote"))
## `summarise()` has grouped output by 'Department'. You can override using the
## `.groups` argument.
q12_top_remote_dept <- employees %>%
filter(Remote) %>%
count(Department, name = "RemoteCount") %>%
slice_max(RemoteCount, n = 1, with_ties = TRUE) %>%
pull(Department)
q12_counts
## # A tibble: 10 × 3
## # Groups: Department [5]
## Department Remote EmployeeCount
## <chr> <chr> <int>
## 1 Engineering NonRemote 13
## 2 Engineering Remote 3
## 3 Finance NonRemote 17
## 4 Finance Remote 4
## 5 HR NonRemote 11
## 6 HR Remote 7
## 7 Marketing NonRemote 20
## 8 Marketing Remote 6
## 9 Sales NonRemote 11
## 10 Sales Remote 8
q12_top_remote_dept
## [1] "Sales"
Q13 (two tables)
q13_remote <- employees %>%
filter(Remote) %>%
group_by(Department) %>%
summarise(AvgSalary = mean(MonthlySalary)) %>%
arrange(desc(AvgSalary))
q13_nonremote <- employees %>%
filter(!Remote) %>%
group_by(Department) %>%
summarise(AvgSalary = mean(MonthlySalary)) %>%
arrange(desc(AvgSalary))
q13_remote
## # A tibble: 5 × 2
## Department AvgSalary
## <chr> <dbl>
## 1 Sales 10961.
## 2 Marketing 9474
## 3 HR 8264.
## 4 Finance 7899.
## 5 Engineering 7075.
q13_nonremote
## # A tibble: 5 × 2
## Department AvgSalary
## <chr> <dbl>
## 1 Engineering 9648.
## 2 Sales 9598.
## 3 Marketing 9024.
## 4 Finance 8779.
## 5 HR 8399.
Q14 (top 5 employees)
q14 <- employees %>%
arrange(desc(MonthlySalary)) %>%
slice_head(n = 5) %>%
select(EmployeeID, Department, MonthlySalary)
q14
## # A tibble: 5 × 3
## EmployeeID Department MonthlySalary
## <dbl> <chr> <dbl>
## 1 1072 Sales 14983
## 2 1069 Marketing 14969
## 3 1041 Marketing 14915
## 4 1038 Sales 14835
## 5 1005 Sales 14589
Q15 (high satisfaction counts, low -> high)
q15 <- employees %>%
filter(JobSatisfaction >= 8) %>%
count(Department, name = "EmployeeCount") %>%
arrange(EmployeeCount)
q15
## # A tibble: 0 × 2
## # ℹ 2 variables: Department <chr>, EmployeeCount <int>
Q16 (dept x rating, sorted by higher avg in each dept)
q16 <- employees %>%
group_by(Department, PerformanceRating) %>%
summarise(
AverageSalary = mean(MonthlySalary, na.rm = TRUE),
Headcount = n()
) %>%
arrange(Department, desc(AverageSalary))
## `summarise()` has grouped output by 'Department'. You can override using the
## `.groups` argument.
q16
## # A tibble: 22 × 4
## # Groups: Department [5]
## Department PerformanceRating AverageSalary Headcount
## <chr> <dbl> <dbl> <int>
## 1 Engineering 3 10383. 10
## 2 Engineering 2 9327 2
## 3 Engineering 4 6118 3
## 4 Engineering 1 5811 1
## 5 Finance 4 10708. 4
## 6 Finance 5 10005. 3
## 7 Finance 1 9801 2
## 8 Finance 2 9227. 3
## 9 Finance 3 6745 9
## 10 HR 5 11680 1
## # ℹ 12 more rows
Q17 (per rating)
q17 <- employees %>%
group_by(PerformanceRating) %>%
summarise(
MeanJobSatisfaction = mean(JobSatisfaction, na.rm = TRUE),
EmployeeCount = n()
) %>%
arrange(desc(MeanJobSatisfaction))
q17
## # A tibble: 5 × 3
## PerformanceRating MeanJobSatisfaction EmployeeCount
## <dbl> <dbl> <int>
## 1 2 3.29 17
## 2 5 3.17 6
## 3 4 2.89 18
## 4 3 2.87 53
## 5 1 1.83 6