I used to spend 30 minutes manually cleaning each dataset. Then I found these 5 functions. Now I spend 30 seconds.
Data cleaning doesn’t have to be painful. After cleaning 50+ datasets for work and side projects, I’ve discovered that 80% of cleaning tasks can be solved with just 5 functions. Here they are, with real examples you can copy-paste today.
clean_names() from janitorProblem: Column names with spaces, capitals, or special characters
# Messy column names
messy_df <- data.frame(
`Employee ID` = 1:5,
`First Name` = c("John", "Jane", "Bob", "Martha", "Eli"),
`Salary (USD)` = c(50000, 60000, 55000, 45000, 65000),
check.names = FALSE
)
names(messy_df)
[1] "Employee ID" "First Name" "Salary (USD)"
Solution:
library(janitor)
clean_df <- messy_df %>%
clean_names()
names(clean_df)
[1] "employee_id" "first_name" "salary_usd"
Why it works: Converts to snake_case automatically (lowercase, underscores instead of spaces)
drop_na() from tidyrProblem: Rows with missing values breaking your calculations
# Data with missing values
df_with_na <- data.frame(
id = 1:5,
value = c(10, NA, 30, NA, 50)
)
df_with_na
| id | value |
|---|---|
| 1 | 10 |
| 2 | NA |
| 3 | 30 |
| 4 | NA |
| 5 | 50 |
library(tidyr)
df_clean <- df_with_na %>%
drop_na()
df_clean
| id | value |
|---|---|
| 1 | 10 |
| 3 | 30 |
| 5 | 50 |
Pro tip: Use drop_na(value) to only remove NAs from
specific columns
str_trim() from stringrProblem: Invisible spaces breaking text matches
# text with extra spaces
messy_text <- data.frame(
name = c("John ", " Jane", " Bob ", "Sarah")
)
messy_text
| name |
|---|
| John |
| Jane |
| Bob |
| Sarah |
df_trimmed <- messy_text %>%
mutate(name = str_trim(name))
df_trimmed
| name |
|---|
| John |
| Jane |
| Bob |
| Sarah |
case_when() from dplyrProblem: Converting messy categories into clean groups
# Inconsistent categories
df_status <- data.frame(
status = c("Active", "active", "ACTIVE", "Inactive", "inactive", "Term")
)
df_status
| status |
|---|
| Active |
| active |
| ACTIVE |
| Inactive |
| inactive |
| Term |
Solution:
df_clean_status <- df_status %>%
mutate(status_clean = case_when(
str_to_lower(status) %in% c("active", "act") ~ "Active",
str_to_lower(status) %in% c("inactive", "inact") ~ "Inactive",
str_to_lower(status) == "term" ~ "Terminated",
TRUE ~ "Unknown"
))
df_clean_status
| status | status_clean |
|---|---|
| Active | Active |
| active | Active |
| ACTIVE | Active |
| Inactive | Inactive |
| inactive | Inactive |
| Term | Terminated |
distinct from dplyrProblem: Duplicate rows skewing your analysis
# Data with duplicates
df_dupes <- data.frame(
id = c(1,2,2,3,3,3),
name = c("A", "B", "B", "C", "C", "D")
)
df_dupes
| id | name |
|---|---|
| 1 | A |
| 2 | B |
| 2 | B |
| 3 | C |
| 3 | C |
| 3 | D |
Solution:
df_unique <- df_dupes %>%
distinct
df_unique
| id | name |
|---|---|
| 1 | A |
| 2 | B |
| 3 | C |
| 3 | D |
Pro tip: distinct(id, .keep_all = TRUE) keeps first
occurrence of each unique ID
Here’s how I use ALL 5 functions together:
# start with ANY messy dataset
raw_data <- data.frame(
`Employee ID` = c(101, 102, 102, 103, NA),
`First Name` = c("John ", " Jane ", " Jane", "Bob", "Sarah"),
`Status` = c("Active", "active", "ACTIVE", "Inactive", "pending"),
`Salary` = c(50000, 60000, 60000, NA, 55000),
check.names = FALSE
)
# One pipeline to clean everything
cleaned_data <- raw_data %>%
clean_names() %>% # Fix column names
mutate(first_name = str_trim(first_name)) %>% # Remove spaces
mutate(status_clean = case_when( # Standardized categories
str_to_lower(status) %in% c("Active", "act") ~ "Active",
str_to_lower(status) %in% c("inactive", "inact") ~ "Inactive",
str_to_lower(status) == "pending" ~ "Pending",
TRUE ~ "Unknown"
)) %>%
drop_na(employee_id) %>% # Remove rows missing critical data
distinct() # Removes duplicates
# Before and After
cat("BEFORE:", nrow(raw_data), "rows,", ncol(raw_data), "columns\n")
BEFORE: 5 rows, 4 columns
cat("AFTER:", nrow(cleaned_data), "rows,", ncol(cleaned_data), "columns\n")
AFTER: 4 rows, 5 columns
cleaned_data
| employee_id | first_name | status | salary | status_clean |
|---|---|---|---|---|
| 101 | John | Active | 50000 | Unknown |
| 102 | Jane | active | 60000 | Unknown |
| 102 | Jane | ACTIVE | 60000 | Unknown |
| 103 | Bob | Inactive | NA | Inactive |
Try cleaning this dataset using the 5 functions:
practice_data <- data.frame(
`Customer ID` = c(1001, 1002, 1002, 1003, NA),
`Full Name` = c("Alice Smith ", " Bob Jones", "Bob Jones", "Carol Lee", "David Kim"),
`Membership` = c("Gold", "gold", "GOLD", "Silver", "Bronze"),
check.names = FALSE
)
# Your code here
Solution:
practice_data %>%
clean_names() %>%
mutate(full_name = str_trim(full_name)) %>%
mutate(membership_clean = str_to_lower(membership)) %>%
drop_na(customer_id) %>%
distinct()
| customer_id | full_name | membership | membership_clean |
|---|---|---|---|
| 1001 | Alice Smith | Gold | gold |
| 1002 | Bob Jones | gold | gold |
| 1002 | Bob Jones | GOLD | gold |
| 1003 | Carol Lee | Silver | silver |