Data description

Data wrangling

# Read in the datafile
salaries <- read_csv("Salaries.csv")
## Rows: 397 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): rank, discipline, sex
## dbl (3): yrs.since.phd, yrs.service, salary
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
knitr::kable(head(salaries, 5))
rank discipline yrs.since.phd yrs.service sex salary
Prof B 19 18 Male 139750
Prof B 20 16 Male 173200
AsstProf B 4 3 Male 79750
Prof B 45 39 Male 115000
Prof B 40 41 Male 141500
# use select to grab just these columns
salaries_subset <- salaries %>% 
  select(rank, discipline, sex, salary)

knitr::kable(head(salaries_subset, 5))
rank discipline sex salary
Prof B Male 139750
Prof B Male 173200
AsstProf B Male 79750
Prof B Male 115000
Prof B Male 141500
# get rid of any lines that don't have a value in salary (there were none)
salaries_filtered <- salaries_subset %>% 
  drop_na(salary)

knitr::kable(head(salaries_filtered, 5))
rank discipline sex salary
Prof B Male 139750
Prof B Male 173200
AsstProf B Male 79750
Prof B Male 115000
Prof B Male 141500
# Group by the three features
salaries_grouped <- salaries_filtered %>% 
  group_by(rank,discipline, sex)

knitr::kable(head(salaries_grouped, 5))
rank discipline sex salary
Prof B Male 139750
Prof B Male 173200
AsstProf B Male 79750
Prof B Male 115000
Prof B Male 141500
# After that, we can use spread to split out the values of "sex" to new columns
# ("Female" and "Male")
# We will make a temp column with all TRUE values and use that to populate the
# new column that is associated with the value of "sex" (and fill FALSE in the
# other column)
# We need a unique value in each row so we'll add an ID column in addition to
# the temp column to get a value from
salaries_wide <- salaries_grouped %>%  
  mutate(row_id = row_number(), temp = TRUE) %>% 
  spread(key = sex, value = temp, fill = FALSE)

knitr::kable(head(salaries_wide, 5))
rank discipline salary row_id Female Male
AssocProf A 62884 2 TRUE FALSE
AssocProf A 70000 10 FALSE TRUE
AssocProf A 70700 17 FALSE TRUE
AssocProf A 73300 4 TRUE FALSE
AssocProf A 73877 7 FALSE TRUE
# Merge the columns back from the one hot encoding (as we called this in AI
# classes). I knew by looking at the data that there were only two columns
# added and it would have been more intuitive to name the columns in the gather
# function but I wanted to parallel the slides.
# I'm doing a filter and then removing the temp column.
salaries_long <- salaries_wide %>% 
  gather(key = "sex", value = "temp", (ncol(salaries_wide) - 1):ncol(salaries_wide)) %>% 
  filter(temp == TRUE) %>% 
  select(-temp)

knitr::kable(head(salaries_long, 5))
rank discipline salary row_id sex
AssocProf A 62884 2 Female
AssocProf A 73300 4 Female
AssocProf A 74830 1 Female
AssocProf A 77500 3 Female
AssocProf B 71065 5 Female