Data wrangling

Step 1: Loading required libraries
Step 2: Load dataframe from CSV

# Read in the datafile
salaries <- read_csv("Salaries.csv")

## Rows: 397 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): rank, discipline, sex
## dbl (3): yrs.since.phd, yrs.service, salary
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

knitr::kable(head(salaries, 5))

rank	discipline	yrs.since.phd	yrs.service	sex	salary
Prof	B	19	18	Male	139750
Prof	B	20	16	Male	173200
AsstProf	B	4	3	Male	79750
Prof	B	45	39	Male	115000
Prof	B	40	41	Male	141500

Step 3: Select four columns from the dataframe.

# use select to grab just these columns
salaries_subset <- salaries %>% 
  select(rank, discipline, sex, salary)

knitr::kable(head(salaries_subset, 5))

rank	discipline	sex	salary
Prof	B	Male	139750
Prof	B	Male	173200
AsstProf	B	Male	79750
Prof	B	Male	115000
Prof	B	Male	141500

Step 4: Remove rows without a salary value.

# get rid of any lines that don't have a value in salary (there were none)
salaries_filtered <- salaries_subset %>% 
  drop_na(salary)

knitr::kable(head(salaries_filtered, 5))

rank	discipline	sex	salary
Prof	B	Male	139750
Prof	B	Male	173200
AsstProf	B	Male	79750
Prof	B	Male	115000
Prof	B	Male	141500

Step 5: Group by rank, discipline, and sex.

# Group by the three features
salaries_grouped <- salaries_filtered %>% 
  group_by(rank,discipline, sex)

knitr::kable(head(salaries_grouped, 5))

rank	discipline	sex	salary
Prof	B	Male	139750
Prof	B	Male	173200
AsstProf	B	Male	79750
Prof	B	Male	115000
Prof	B	Male	141500

Step 6: Convert the sex column into separate Male and Female columns.

# After that, we can use spread to split out the values of "sex" to new columns
# ("Female" and "Male")
# We will make a temp column with all TRUE values and use that to populate the
# new column that is associated with the value of "sex" (and fill FALSE in the
# other column)
# We need a unique value in each row so we'll add an ID column in addition to
# the temp column to get a value from
salaries_wide <- salaries_grouped %>%  
  mutate(row_id = row_number(), temp = TRUE) %>% 
  spread(key = sex, value = temp, fill = FALSE)

knitr::kable(head(salaries_wide, 5))

rank	discipline	salary	row_id	Female	Male
AssocProf	A	62884	2	TRUE	FALSE
AssocProf	A	70000	10	FALSE	TRUE
AssocProf	A	70700	17	FALSE	TRUE
AssocProf	A	73300	4	TRUE	FALSE
AssocProf	A	73877	7	FALSE	TRUE

Step 7: Convert the separate Male and Female columns back to one sex column.

# Merge the columns back from the one hot encoding (as we called this in AI
# classes). I knew by looking at the data that there were only two columns
# added and it would have been more intuitive to name the columns in the gather
# function but I wanted to parallel the slides.
# I'm doing a filter and then removing the temp column.
salaries_long <- salaries_wide %>% 
  gather(key = "sex", value = "temp", (ncol(salaries_wide) - 1):ncol(salaries_wide)) %>% 
  filter(temp == TRUE) %>% 
  select(-temp)

knitr::kable(head(salaries_long, 5))

rank	discipline	salary	row_id	sex
AssocProf	A	62884	2	Female
AssocProf	A	73300	4	Female
AssocProf	A	74830	1	Female
AssocProf	A	77500	3	Female
AssocProf	B	71065	5	Female

The basic R Markdown

Marshall Uy Alexander

5/16/2026

Data description

Data wrangling