I used to spend 30 minutes manually cleaning each dataset. Then I found these 5 functions. Now I spend 30 seconds.

Data cleaning doesn’t have to be painful. After cleaning 50+ datasets for work and side projects, I’ve discovered that 80% of cleaning tasks can be solved with just 5 functions. Here they are, with real examples you can copy-paste today.

clean_names() from janitor

Problem: Column names with spaces, capitals, or special characters

# Messy column names
messy_df <- data.frame(
  `Employee ID` = 1:5,
  `First Name` = c("John", "Jane", "Bob", "Martha", "Eli"),
  `Salary (USD)` = c(50000, 60000, 55000, 45000, 65000),
  check.names = FALSE
)

names(messy_df)
[1] "Employee ID"  "First Name"   "Salary (USD)"

Solution:

library(janitor)
clean_df <- messy_df %>% 
  clean_names()

names(clean_df)
[1] "employee_id" "first_name"  "salary_usd" 

Why it works: Converts to snake_case automatically (lowercase, underscores instead of spaces)

drop_na() from tidyr

Problem: Rows with missing values breaking your calculations

# Data with missing values
df_with_na <- data.frame(
  id = 1:5,
  value = c(10, NA, 30, NA, 50)
)

df_with_na
id value
1 10
2 NA
3 30
4 NA
5 50
library(tidyr)
df_clean <- df_with_na %>% 
  drop_na()

df_clean
id value
1 10
3 30
5 50

Pro tip: Use drop_na(value) to only remove NAs from specific columns

str_trim() from stringr

Problem: Invisible spaces breaking text matches

# text with extra spaces
messy_text <- data.frame(
  name = c("John ", " Jane", " Bob ", "Sarah")
)

messy_text
name
John
Jane
Bob
Sarah
df_trimmed <- messy_text %>% 
  mutate(name = str_trim(name))

df_trimmed
name
John
Jane
Bob
Sarah

case_when() from dplyr

Problem: Converting messy categories into clean groups

# Inconsistent categories
df_status <- data.frame(
  status = c("Active", "active", "ACTIVE", "Inactive", "inactive", "Term")
)

df_status
status
Active
active
ACTIVE
Inactive
inactive
Term

Solution:

df_clean_status <- df_status %>% 
  mutate(status_clean = case_when(
    str_to_lower(status) %in% c("active", "act") ~ "Active",
    str_to_lower(status) %in% c("inactive", "inact") ~ "Inactive",
    str_to_lower(status) == "term" ~ "Terminated",
    TRUE ~ "Unknown"
  ))

df_clean_status
status status_clean
Active Active
active Active
ACTIVE Active
Inactive Inactive
inactive Inactive
Term Terminated

distinct from dplyr

Problem: Duplicate rows skewing your analysis

# Data with duplicates
df_dupes <- data.frame(
  id = c(1,2,2,3,3,3),
  name = c("A", "B", "B", "C", "C", "D")
)

df_dupes
id name
1 A
2 B
2 B
3 C
3 C
3 D

Solution:

df_unique <- df_dupes %>% 
  distinct

df_unique
id name
1 A
2 B
3 C
3 D

Pro tip: distinct(id, .keep_all = TRUE) keeps first occurrence of each unique ID

The Complete 30-Second Workflow

Here’s how I use ALL 5 functions together:

# start with ANY messy dataset
raw_data <- data.frame(
  `Employee ID` = c(101, 102, 102, 103, NA),
  `First Name` = c("John  ", "  Jane  ", "  Jane", "Bob", "Sarah"),
  `Status` = c("Active", "active", "ACTIVE", "Inactive", "pending"),
  `Salary` = c(50000, 60000, 60000, NA, 55000),
  check.names = FALSE
)

# One pipeline to clean everything
cleaned_data <- raw_data %>% 
  clean_names() %>%                                   # Fix column names
  mutate(first_name = str_trim(first_name)) %>%       # Remove spaces
  mutate(status_clean = case_when(                    # Standardized categories
    str_to_lower(status) %in% c("Active", "act") ~ "Active",
    str_to_lower(status) %in% c("inactive", "inact") ~ "Inactive",
    str_to_lower(status) == "pending" ~ "Pending",
    TRUE ~ "Unknown"
    
  )) %>% 
  drop_na(employee_id) %>%                            # Remove rows missing critical data
  distinct()                                          # Removes duplicates

# Before and After
cat("BEFORE:", nrow(raw_data), "rows,", ncol(raw_data), "columns\n")
BEFORE: 5 rows, 4 columns
cat("AFTER:", nrow(cleaned_data), "rows,", ncol(cleaned_data), "columns\n")
AFTER: 4 rows, 5 columns
cleaned_data
employee_id first_name status salary status_clean
101 John Active 50000 Unknown
102 Jane active 60000 Unknown
102 Jane ACTIVE 60000 Unknown
103 Bob Inactive NA Inactive

Your Turn: Practice Exercise

Try cleaning this dataset using the 5 functions:

practice_data <- data.frame(
  `Customer ID` = c(1001, 1002, 1002, 1003, NA),
  `Full Name` = c("Alice Smith ", "  Bob Jones", "Bob Jones", "Carol Lee", "David Kim"),
  `Membership` = c("Gold", "gold", "GOLD", "Silver", "Bronze"),
  check.names = FALSE
)

# Your code here

Solution:

practice_data %>% 
  clean_names() %>% 
  mutate(full_name = str_trim(full_name)) %>% 
  mutate(membership_clean = str_to_lower(membership)) %>% 
  drop_na(customer_id) %>% 
  distinct()
customer_id full_name membership membership_clean
1001 Alice Smith Gold gold
1002 Bob Jones gold gold
1002 Bob Jones GOLD gold
1003 Carol Lee Silver silver