# Read the salary workbook into a tibble and print it for a first look.
myData <- readxl::read_excel(path = "../00_data/Salaries.xlsx")
myData
## # A tibble: 397 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Prof B 19 18 Male 139750
## 2 Prof B 20 16 Male 173200
## 3 AsstProf B 4 3 Male 79750
## 4 Prof B 45 39 Male 115000
## 5 Prof B 40 41 Male 141500
## 6 AssocProf B 6 6 Male 97000
## 7 Prof B 30 23 Male 175000
## 8 Prof B 45 45 Male 147765
## 9 Prof B 21 20 Male 119250
## 10 Prof B 18 18 Female 129000
## # … with 387 more rows
Make two bar charts here - one before ordering, another after.
Unordered factor levels
# Transform data: one row per sex holding the mean salary.
# na.rm = TRUE drops missing salaries before averaging.
myDataTransofmed <- summarise(
  group_by(myData, sex),
  avg_salary = mean(salary, na.rm = TRUE)
)
myDataTransofmed
## # A tibble: 2 × 2
## sex avg_salary
## <chr> <dbl>
## 1 Female 101002.
## 2 Male 115090.
# Plot: mean salary (x) against sex (y); sex is used as-is, with no reordering.
# NOTE(review): the heading above asks for bar charts, while geom_point()
# draws points - confirm whether geom_col() was intended.
ggplot(myDataTransofmed, aes(x = avg_salary, y = sex)) +
  geom_point()
Ordered factor levels
# Same plot, but the y axis uses sex reordered by avg_salary via fct_reorder().
# The y axis title is removed and the x axis gets a descriptive title.
ggplot(
  myDataTransofmed,
  aes(x = avg_salary, y = fct_reorder(sex, avg_salary))
) +
  geom_point() +
  labs(x = "Mean Salary by sex", y = NULL)
Show examples of three functions:
# List the distinct values of sex present in the summary table.
distinct(myDataTransofmed, sex)
## # A tibble: 2 × 1
## sex
## <chr>
## 1 Female
## 2 Male
# Recode
myDataTransofmed %>%
  # Rename levels: fct_recode() takes new_name = "old_name", so this relabels
  # the existing "Male" level as "Man".
  mutate(sex_rev = fct_recode(sex, "Man" = "Male")) %>%
  select(sex, sex_rev) %>%
  # Filter on the original (unchanged) column to show the row that was recoded.
  filter(sex == "Male")
## # A tibble: 1 × 2
## sex sex_rev
## <chr> <fct>
## 1 Male Man
# Collapse multiple levels into one
# NOTE(review): "Other" is not a value of sex in this table, which is what
# triggers the "Unknown levels" warning shown in the output below.
myDataTransofmed %>%
  mutate(
    sex_col = fct_collapse(sex, "Male" = c("Male", "Other"))
  ) %>%
  select(sex, sex_col) %>%
  filter(sex != "Female")
## Warning: Unknown levels in `f`: Other
## # A tibble: 1 × 2
## sex sex_col
## <chr> <fct>
## 1 Male Male
# Lump small levels into other levels
# First, count how many rows each value of sex has in the summary table.
count(myDataTransofmed, sex)
## # A tibble: 2 × 2
## sex n
## <chr> <int>
## 1 Female 1
## 2 Male 1
# fct_lump_lowfreq() is the rule the generic fct_lump() applies when neither
# `n` nor `prop` is supplied; calling it directly makes the lumping rule
# explicit and follows the forcats recommendation to use a specific variant.
myDataTransofmed %>%
  mutate(sex_lump = fct_lump_lowfreq(sex)) %>%
  distinct(sex_lump)
## # A tibble: 2 × 1
## sex_lump
## <fct>
## 1 Female
## 2 Male
No need to do anything here.