- janitor
- janitor is tidyverse-orientient package.
- janitor is optimized for cleaning data.
- pacman
[SITE] https://github.com/sfirke/janitor?fbclid=IwAR3nwT_zjicyR8tReiVeZBKpjsur1wKMrdasWhIFS2j0wgcXfRxuscqseZc
packages loading
library(janitor)
library(pacman) # for loading packages
p_load(readxl, janitor, dplyr, here)
data loading
roster_raw <- read_excel(here("dirty_data.xlsx")) # available at http://github.com/sfirke/janitor
glimpse(roster_raw)
## Observations: 13
## Variables: 11
## $ `First Name` <chr> "Jason", "Jason", "Alicia", "Ada", "Desus"...
## $ `Last Name` <chr> "Bourne", "Bourne", "Keys", "Lovelace", "N...
## $ `Employee Status` <chr> "Teacher", "Teacher", "Teacher", "Teacher"...
## $ Subject <chr> "PE", "Drafting", "Music", NA, "Dean", "Ph...
## $ `Hire Date` <dbl> 39690, 39690, 37118, 27515, 41431, 11037, ...
## $ `% Allocated` <dbl> 0.75, 0.25, 1.00, 1.00, 1.00, 0.50, 0.50, ...
## $ `Full time?` <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", ...
## $ `do not edit! --->` <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ Certification...9 <chr> "Physical ed", "Physical ed", "Instr. musi...
## $ Certification...10 <chr> "Theater", "Theater", "Vocal music", "Comp...
## $ Certification...11 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
data cleaning
roster <- roster_raw %>%
clean_names() %>%
remove_empty(c("rows", "cols")) %>%
mutate(hire_date = excel_numeric_to_date(hire_date),
cert = coalesce(certification_9, certification_10)) %>% # from dplyr
select(-certification_9, -certification_10) # drop unwanted columns
roster
## # A tibble: 12 x 8
## first_name last_name employee_status subject hire_date percent_allocat~
## <chr> <chr> <chr> <chr> <date> <dbl>
## 1 Jason Bourne Teacher PE 2008-08-30 0.75
## 2 Jason Bourne Teacher Drafti~ 2008-08-30 0.25
## 3 Alicia Keys Teacher Music 2001-08-15 1
## 4 Ada Lovelace Teacher <NA> 1975-05-01 1
## 5 Desus Nice Administration Dean 2013-06-06 1
## 6 Chien-Shi~ Wu Teacher Physics 1930-03-20 0.5
## 7 Chien-Shi~ Wu Teacher Chemis~ 1930-03-20 0.5
## 8 James Joyce Teacher English 1990-05-01 0.5
## 9 Hedy Lamarr Teacher Science 1976-06-08 0.5
## 10 Carlos Boozer Coach Basket~ 2015-08-05 NA
## 11 Young Boozer Coach <NA> 1995-01-01 NA
## 12 Micheal Larsen Teacher English 2009-09-15 0.8
## # ... with 2 more variables: full_time <chr>, cert <chr>
compare_df_cols(roster)
## column_name roster
## 1 first_name character
## 2 last_name character
## 3 employee_status character
## 4 subject character
## 5 hire_date Date
## 6 percent_allocated numeric
## 7 full_time character
## 8 cert character
data loading
roster_raw <- read_excel(here("dirty_data.xlsx"), .name_repair = make_clean_names) # available at http://github.com/sfirke/janitor
glimpse(roster_raw)
## Observations: 13
## Variables: 11
## $ first_name <chr> "Jason", "Jason", "Alicia", "Ada", "Desus", ...
## $ last_name <chr> "Bourne", "Bourne", "Keys", "Lovelace", "Nic...
## $ employee_status <chr> "Teacher", "Teacher", "Teacher", "Teacher", ...
## $ subject <chr> "PE", "Drafting", "Music", NA, "Dean", "Phys...
## $ hire_date <dbl> 39690, 39690, 37118, 27515, 41431, 11037, 11...
## $ percent_allocated <dbl> 0.75, 0.25, 1.00, 1.00, 1.00, 0.50, 0.50, NA...
## $ full_time <chr> "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Y...
## $ do_not_edit <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ certification <chr> "Physical ed", "Physical ed", "Instr. music"...
## $ certification_2 <chr> "Theater", "Theater", "Vocal music", "Comput...
## $ certification_3 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
data cleaning
roster %>% get_dupes(first_name, last_name)
## # A tibble: 4 x 9
## first_name last_name dupe_count employee_status subject hire_date
## <chr> <chr> <int> <chr> <chr> <date>
## 1 Chien-Shi~ Wu 2 Teacher Physics 1930-03-20
## 2 Chien-Shi~ Wu 2 Teacher Chemis~ 1930-03-20
## 3 Jason Bourne 2 Teacher PE 2008-08-30
## 4 Jason Bourne 2 Teacher Drafti~ 2008-08-30
## # ... with 3 more variables: percent_allocated <dbl>, full_time <chr>,
## # cert <chr>
tabyl(roster$subject)
## roster$subject n percent valid_percent
## Basketball 1 0.08333333 0.1
## Chemistry 1 0.08333333 0.1
## Dean 1 0.08333333 0.1
## Drafting 1 0.08333333 0.1
## English 2 0.16666667 0.2
## Music 1 0.08333333 0.1
## PE 1 0.08333333 0.1
## Physics 1 0.08333333 0.1
## Science 1 0.08333333 0.1
## <NA> 2 0.16666667 NA
roster %>%
tabyl(subject)
## subject n percent valid_percent
## Basketball 1 0.08333333 0.1
## Chemistry 1 0.08333333 0.1
## Dean 1 0.08333333 0.1
## Drafting 1 0.08333333 0.1
## English 2 0.16666667 0.2
## Music 1 0.08333333 0.1
## PE 1 0.08333333 0.1
## Physics 1 0.08333333 0.1
## Science 1 0.08333333 0.1
## <NA> 2 0.16666667 NA
roster %>%
filter(hire_date > as.Date("1950-01-01")) %>%
tabyl(employee_status, full_time)
## employee_status No Yes
## Administration 0 1
## Coach 2 0
## Teacher 3 4
roster %>%
tabyl(full_time, subject, employee_status, show_missing_levels = FALSE)
## $Administration
## full_time Dean
## Yes 1
##
## $Coach
## full_time Basketball NA_
## No 1 1
##
## $Teacher
## full_time Chemistry Drafting English Music PE Physics Science NA_
## No 0 0 2 0 0 0 1 0
## Yes 1 1 0 1 1 1 0 1
data cleaning_2
roster %>%
tabyl(employee_status, full_time)
## employee_status No Yes
## Administration 0 1
## Coach 2 0
## Teacher 3 6
roster %>%
tabyl(employee_status, full_time) %>%
adorn_totals("row") %>%
adorn_percentages("row")
## employee_status No Yes
## Administration 0.0000000 1.0000000
## Coach 1.0000000 0.0000000
## Teacher 0.3333333 0.6666667
## Total 0.4166667 0.5833333
roster %>%
tabyl(employee_status, full_time) %>%
adorn_totals("row") %>%
adorn_percentages("row") %>%
adorn_pct_formatting() %>%
adorn_ns() %>%
adorn_title("combined")
## employee_status/full_time No Yes
## Administration 0.0% (0) 100.0% (1)
## Coach 100.0% (2) 0.0% (0)
## Teacher 33.3% (3) 66.7% (6)
## Total 41.7% (5) 58.3% (7)