Data wrangling
# Read in the datafile
salaries <- read_csv("Salaries.csv")
## Rows: 397 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): rank, discipline, sex
## dbl (3): yrs.since.phd, yrs.service, salary
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
knitr::kable(head(salaries, 5))
| Prof |
B |
19 |
18 |
Male |
139750 |
| Prof |
B |
20 |
16 |
Male |
173200 |
| AsstProf |
B |
4 |
3 |
Male |
79750 |
| Prof |
B |
45 |
39 |
Male |
115000 |
| Prof |
B |
40 |
41 |
Male |
141500 |
- Step 3: Select four columns from the dataframe.
# use select to grab just these columns
salaries_subset <- salaries %>%
select(rank, discipline, sex, salary)
knitr::kable(head(salaries_subset, 5))
| Prof |
B |
Male |
139750 |
| Prof |
B |
Male |
173200 |
| AsstProf |
B |
Male |
79750 |
| Prof |
B |
Male |
115000 |
| Prof |
B |
Male |
141500 |
- Step 4: Remove rows without a salary value.
# get rid of any lines that don't have a value in salary (there were none)
salaries_filtered <- salaries_subset %>%
drop_na(salary)
knitr::kable(head(salaries_filtered, 5))
| Prof |
B |
Male |
139750 |
| Prof |
B |
Male |
173200 |
| AsstProf |
B |
Male |
79750 |
| Prof |
B |
Male |
115000 |
| Prof |
B |
Male |
141500 |
- Step 5: Group by rank, discipline, and sex.
# Group by the three features
salaries_grouped <- salaries_filtered %>%
group_by(rank,discipline, sex)
knitr::kable(head(salaries_grouped, 5))
| Prof |
B |
Male |
139750 |
| Prof |
B |
Male |
173200 |
| AsstProf |
B |
Male |
79750 |
| Prof |
B |
Male |
115000 |
| Prof |
B |
Male |
141500 |
- Step 6: Convert the sex column into separate Male and Female columns.
# After that, we can use spread to split out the values of "sex" to new columns
# ("Female" and "Male")
# We will make a temp column with all TRUE values and use that to populate the
# new column that is associated with the value of "sex" (and fill FALSE in the
# other column)
# We need a unique value in each row so we'll add an ID column in addition to
# the temp column to get a value from
salaries_wide <- salaries_grouped %>%
mutate(row_id = row_number(), temp = TRUE) %>%
spread(key = sex, value = temp, fill = FALSE)
knitr::kable(head(salaries_wide, 5))
| AssocProf |
A |
62884 |
2 |
TRUE |
FALSE |
| AssocProf |
A |
70000 |
10 |
FALSE |
TRUE |
| AssocProf |
A |
70700 |
17 |
FALSE |
TRUE |
| AssocProf |
A |
73300 |
4 |
TRUE |
FALSE |
| AssocProf |
A |
73877 |
7 |
FALSE |
TRUE |
- Step 7: Convert the separate Male and Female columns back to one sex column.
# Merge the columns back from the one hot encoding (as we called this in AI
# classes). I knew by looking at the data that there were only two columns
# added and it would have been more intuitive to name the columns in the gather
# function but I wanted to parallel the slides.
# I'm doing a filter and then removing the temp column.
salaries_long <- salaries_wide %>%
gather(key = "sex", value = "temp", (ncol(salaries_wide) - 1):ncol(salaries_wide)) %>%
filter(temp == TRUE) %>%
select(-temp)
knitr::kable(head(salaries_long, 5))
| AssocProf |
A |
62884 |
2 |
Female |
| AssocProf |
A |
73300 |
4 |
Female |
| AssocProf |
A |
74830 |
1 |
Female |
| AssocProf |
A |
77500 |
3 |
Female |
| AssocProf |
B |
71065 |
5 |
Female |