Import data
data <- read_excel(here::here("00_data", "Salaries.xlsx"))
glimpse(data)
## Rows: 397
## Columns: 6
## $ rank <chr> "Prof", "Prof", "AsstProf", "Prof", "Prof", "AssocProf",…
## $ discipline <chr> "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "…
## $ yrs.since.phd <dbl> 19, 20, 4, 45, 40, 6, 30, 45, 21, 18, 12, 7, 1, 2, 20, 1…
## $ yrs.service <dbl> 18, 16, 3, 39, 41, 6, 23, 45, 20, 18, 8, 2, 1, 0, 18, 3,…
## $ sex <chr> "Male", "Male", "Male", "Male", "Male", "Male", "Male", …
## $ salary <dbl> 139750, 173200, 79750, 115000, 141500, 97000, 175000, 14…
Apply the following dplyr verbs to your data
Filter rows
filtered <- data %>%
filter(rank == "Prof", yrs.since.phd >= 10, salary >= 100000)
filtered %>% slice_head(n = 8)
## # A tibble: 8 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Prof B 19 18 Male 139750
## 2 Prof B 20 16 Male 173200
## 3 Prof B 45 39 Male 115000
## 4 Prof B 40 41 Male 141500
## 5 Prof B 30 23 Male 175000
## 6 Prof B 45 45 Male 147765
## 7 Prof B 21 20 Male 119250
## 8 Prof B 18 18 Female 129000
Arrange rows
arranged <- filtered %>%
arrange(desc(salary), yrs.since.phd, rank)
arranged %>% slice_head(n = 8)
## # A tibble: 8 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Prof B 38 38 Male 231545
## 2 Prof A 43 43 Male 205500
## 3 Prof A 29 7 Male 204000
## 4 Prof A 42 18 Male 194800
## 5 Prof B 26 19 Male 193000
## 6 Prof B 49 60 Male 192253
## 7 Prof B 34 33 Male 189409
## 8 Prof B 56 49 Male 186960
Select columns
selected <- arranged %>%
select(rank, sex, discipline,
years_phd = yrs.since.phd,
years_service = yrs.service,
salary)
selected %>% slice_head(n = 8)
## # A tibble: 8 × 6
## rank sex discipline years_phd years_service salary
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Prof Male B 38 38 231545
## 2 Prof Male A 43 43 205500
## 3 Prof Male A 29 7 204000
## 4 Prof Male A 42 18 194800
## 5 Prof Male B 26 19 193000
## 6 Prof Male B 49 60 192253
## 7 Prof Male B 34 33 189409
## 8 Prof Male B 56 49 186960
Add columns
mutated <- selected %>%
mutate(
salary_k = salary / 1000,
total_exp = years_phd + years_service,
seniority = case_when(
years_phd < 5 ~ "early",
years_phd < 15 ~ "mid",
TRUE ~ "late"
),
high_salary = salary > quantile(salary, 0.75, na.rm = TRUE)
)
mutated %>% slice_head(n = 8)
## # A tibble: 8 × 10
## rank sex discipline years_phd years_service salary salary_k total_exp
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Prof Male B 38 38 231545 232. 76
## 2 Prof Male A 43 43 205500 206. 86
## 3 Prof Male A 29 7 204000 204 36
## 4 Prof Male A 42 18 194800 195. 60
## 5 Prof Male B 26 19 193000 193 45
## 6 Prof Male B 49 60 192253 192. 109
## 7 Prof Male B 34 33 189409 189. 67
## 8 Prof Male B 56 49 186960 187. 105
## # ℹ 2 more variables: seniority <chr>, high_salary <lgl>
Summarize by groups
summary_table <- mutated %>%
group_by(rank, sex) %>%
summarise(
n = n(),
mean_salary = mean(salary, na.rm = TRUE),
median_salary = median(salary, na.rm = TRUE),
sd_salary = sd(salary, na.rm = TRUE),
mean_experience = mean(total_exp, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(rank, sex)
summary_table
## # A tibble: 2 × 7
## rank sex n mean_salary median_salary sd_salary mean_experience
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Prof Female 16 125873. 123636 17024. 40.7
## 2 Prof Male 213 133343. 129600 25244. 50.1