本文以模拟数据集为例,演示从缺失值、异常值、类型转换、合并拆分、标准化、性能优化到正则表达式的 7 大清洗技巧。所有代码均可直接运行。

0.1 0. 准备工作

# 可选:一次性安装缺失的包
need <- c("data.table","dplyr","stringr")
for (p in need) if (!requireNamespace(p, quietly = TRUE)) install.packages(p)

library(data.table)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

# 读取示例数据(请把工作目录设到文件所在位置,或用绝对路径)
main_path <- "messy_data.csv"
cust_path <- "customers.csv"
ord_path  <- "orders.csv"

raw  <- fread(main_path, na.strings = c("", "NA"))
cust <- fread(cust_path, na.strings = c("", "NA"))
ord  <- fread(ord_path,  na.strings = c("", "NA"))

dim(raw); head(raw)
## [1] 120  11

数据字段简介:


0.2 1. 缺失值处理:智能填补

要点:数值用 mean/median,类别用众数;尽量分组填补;保留“是否缺失”的指示变量

df <- copy(raw)

# 1) 缺失指示(便于后续追踪)
df <- df %>% mutate(
  miss_age = is.na(age),
  miss_pc  = is.na(purchase_count)
)

# 2) 数值列:用中位数或均值填补(演示:age 用中位数)
med_age <- median(df$age, na.rm = TRUE)
df$age[is.na(df$age)] <- med_age

# purchase_count 用均值
mean_pc <- mean(df$purchase_count, na.rm = TRUE)
df$purchase_count[is.na(df$purchase_count)] <- mean_pc

# 3) 类别列:众数填补工具函数
Mode <- function(x) {
  ux <- na.omit(x)
  if (length(ux) == 0) return(NA)
  tab <- sort(table(ux), decreasing = TRUE)
  names(tab)[1]
}

# gender 用众数
mode_gender <- Mode(df$gender)
df$gender[is.na(df$gender)] <- mode_gender

# 检查
sapply(df[, .(age, purchase_count, gender)], function(x) sum(is.na(x)))
##            age purchase_count         gender 
##              0              0              0

0.3 2. 异常值检测与处理:Z-score / 箱线图 / Winsorize

# 以 amount_last_purchase 演示
x <- df$amount_last_purchase

# Z-score 检测(阈值 |z| > 3)
z <- scale(x)
out_z <- which(abs(z) > 3)

# 箱线图法(1.5*IQR)
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lo <- Q1 - 1.5*IQR
hi <- Q3 + 1.5*IQR
out_box <- which(x < lo | x > hi)

length(out_z); length(out_box)
## [1] 1
## [1] 2
# 简单修正:用分位数 Winsorize 到 [lo, hi]
x_wins <- pmax(pmin(x, hi), lo)
df$amount_last_purchase_clean <- x_wins

# 可视化(在交互里运行)
# boxplot(list(raw=x, cleaned=x_wins), names=c("raw","wins"))

0.4 3. 数据类型转换:字符串→数值/日期/因子

# 3.1 收入字符 → 数值(去掉 $ 和逗号)
df$income_num <- as.numeric(gsub("[\\$,]", "", df$income_str))

# 3.2 “数值字符串”中含逗号 → 数值
df$numeric_string_clean <- as.numeric(gsub(",", "", df$numeric_string_column))

# 3.3 日期清洗:统一到 Date 类型
# 尝试多种格式;失败的记为 NA
parse_multi <- function(x) {
  out <- suppressWarnings(as.Date(x, format="%Y-%m-%d"))
  idx <- is.na(out)
  out[idx] <- suppressWarnings(as.Date(x[idx], format="%d/%m/%Y"))
  idx <- is.na(out)
  out[idx] <- suppressWarnings(as.Date(x[idx], format="%m-%d-%Y"))
  out
}
df$signup_date <- parse_multi(df$signup_date_raw)

# 3.4 因子化示例
df$region <- factor(df$region, levels=c("North","South","East","West"))
str(df[, .(income_str, income_num, signup_date_raw, signup_date, region)])
## Classes 'data.table' and 'data.frame':   120 obs. of  5 variables:
##  $ income_str     : chr  "557.91" "$553.37" NA "511.65" ...
##  $ income_num     : num  558 553 NA 512 NA ...
##  $ signup_date_raw: chr  "21/04/2024" "07/08/2023" "2024-06-15" "27/06/2023" ...
##  $ signup_date    : Date, format: "2024-04-21" "2023-08-07" ...
##  $ region         : Factor w/ 4 levels "North","South",..: 1 1 4 3 4 3 1 3 1 4 ...
##  - attr(*, ".internal.selfref")=<externalptr>

0.5 4. 数据合并与拆分:merge() / split()

# 合并:客户与订单(左连接保留全部客户)
merged <- merge(cust, ord, by="customer_id", all.x = TRUE)

# 拆分:按 region 拆分列表
splitted <- split(df, df$region)
length(splitted); sapply(splitted, nrow)[1:4]
## [1] 4
## North South  East  West 
##    39    25    25    31

0.6 5. 标准化与文本规范化

# 数值标准化(z-score 标准化)
num_cols <- c("age","income_num","purchase_count","amount_last_purchase_clean")
df_std <- copy(df)
df_std[, paste0(num_cols, "_z") := lapply(.SD, scale), .SDcols = num_cols]

# 文本规范化:小写 + 去非字母数字
df_std$text_norm <- tolower(gsub("[^a-zA-Z0-9]", "", df$notes))
head(df_std[, .(notes, text_norm)])

0.7 6. 性能优化:向量化与 data.table

DT <- as.data.table(df)

# 向量化派生变量(避免 for 循环)
DT[, avg_amount_per_purchase := amount_last_purchase_clean / pmax(1, purchase_count)]

# 分组聚合(data.table 语法)
by_region <- DT[, .(
  n = .N,
  mean_age = mean(age, na.rm=TRUE),
  med_income = median(income_num, na.rm=TRUE)
), by = region][order(region)]
by_region

0.8 7. 正则表达式:复杂模式提取

# 7.1 提取有效邮箱
df$email
##   [1] "user0@example.com"     "user1@data.cn"         "user2@mail.org"       
##   [4] "user3@example.com"     "user4@mail.org"        "user5@mail.org"       
##   [7] "user6@example.com"     NA                      "user8@data.cn"        
##  [10] "user9@data.cn"         "user10@mail.org"       "user11 at bad_domain" 
##  [13] "user12@example.com"    "user13@example.com"    "user14@example.com"   
##  [16] "user15 at bad_domain"  "user16@data.cn"        "user17@example.com"   
##  [19] "user18@example.com"    "user19@data.cn"        "user20@data.cn"       
##  [22] "user21 at example"     "user22 at bad_domain"  "user23@example.com"   
##  [25] "user24@data.cn"        "user25 at example"     "user26@data.cn"       
##  [28] "user27 at bad_domain"  "user28@data.cn"        "user29 at bad_domain" 
##  [31] "user30@example.com"    "user31@mail.org"       "user32@example.com"   
##  [34] "user33@data.cn"        "user34@example.com"    "user35@example.com"   
##  [37] "user36@example.com"    "user37@data.cn"        "user38@example.com"   
##  [40] NA                      "user40@data.cn"        "user41@mail.org"      
##  [43] "user42@mail.org"       "user43@mail.org"       "user44@example.com"   
##  [46] "user45@data.cn"        "user46@example.com"    "user47@example.com"   
##  [49] NA                      "user49@example.com"    "user50@example.com"   
##  [52] "user51 at example"     NA                      "user53@mail.org"      
##  [55] "user54@example.com"    "user55@mail.org"       "user56 at bad_domain" 
##  [58] "user57@data.cn"        "user58 at example"     "user59@example.com"   
##  [61] "user60@mail.org"       NA                      "user62@mail.org"      
##  [64] "user63@mail.org"       "user64@example.com"    "user65@example.com"   
##  [67] "user66 at bad_domain"  "user67@mail.org"       "user68@example.com"   
##  [70] "user69@mail.org"       "user70@example.com"    "user71@example.com"   
##  [73] "user72@data.cn"        "user73 at example"     "user74@data.cn"       
##  [76] "user75@mail.org"       "user76@data.cn"        "user77@mail.org"      
##  [79] "user78@example.com"    "user79@example.com"    "user80 at example"    
##  [82] "user81@example.com"    "user82@mail.org"       "user83@data.cn"       
##  [85] "user84@example.com"    "user85@example.com"    "user86@example.com"   
##  [88] "user87@data.cn"        "user88@example.com"    "user89 at example"    
##  [91] NA                      "user91@example.com"    "user92@example.com"   
##  [94] NA                      "user94@example.com"    "user95@data.cn"       
##  [97] "user96 at bad_domain"  "user97@mail.org"       NA                     
## [100] "user99@mail.org"       "user100@mail.org"      "user101 at example"   
## [103] NA                      "user103@data.cn"       "user104@mail.org"     
## [106] "user105@mail.org"      "user106 at bad_domain" "user107@mail.org"     
## [109] "user108@mail.org"      "user109 at bad_domain" "user110 at bad_domain"
## [112] "user111@mail.org"      "user112@example.com"   "user113@example.com"  
## [115] "user114@example.com"   "user115@mail.org"      "user116@example.com"  
## [118] "user117@mail.org"      "user118 at example"    "user119 at example"
is_email <- grepl("^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}$", df$email)
emails <- df$email[is_email]
head(emails)
## [1] "user0@example.com" "user1@data.cn"     "user2@mail.org"   
## [4] "user3@example.com" "user4@mail.org"    "user5@mail.org"
# 7.2 从 notes 中提取“推荐码/优惠码”(英文/中文混排示例)
# 匹配 ABC-123 或 SPRING2025 等
promo <- stringr::str_extract_all(
  df$notes,
  "(?i)(?:ref\\s*code\\s*[::]\\s*)?[A-Z]{3,}-?\\d{2,}"
)
head(unlist(promo)[unlist(promo) != ""])
## [1] "SPRING2025"        "Ref code: ABC-123" "Ref code: ABC-123"
## [4] "Ref code: ABC-123" "Ref code: ABC-123" "Ref code: ABC-123"
# 7.3 提取中国手机号(简单示例:以 1 开头的 11 位数字)
phone <- stringr::str_extract(df$notes, "1[3-9]\\d[ -]?\\d{4}[ -]?\\d{4}")
phone <- gsub("[ -]", "", phone)
head(na.omit(phone))
## [1] "13800138000" "13800138000" "13800138000" "13800138000" "13800138000"
## [6] "13800138000"

0.9 小结与复用清单