dplyr包使用

黄利东

2025-11-05

0 鸢尾花(iris)数据集简介

iris 是 R 自带的经典多变量数据集,包含 150 条观测,来自 3 个物种(各 50 条):setosa、versicolor、virginica。每条记录测量了 4 个连续变量,单位均为厘米(cm)。

历史上该数据由 Edgar Anderson 采集,R. A. Fisher (1936) 在判别分析(LDA)论文中广泛使用,成为统计与机器学习教学中的入门示例。

典型教学目标:数值型特征的分布对比、变量之间的相关性、按物种分组的差异、用于线性/广义线性/判别模型的快速演示等。

library(dplyr)
library(tibble)

# 快速结构与前几行
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
as_tibble(iris) |> slice_head(n = 6)
## # A tibble: 6 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa 
## 4          4.6         3.1          1.5         0.2 setosa 
## 5          5           3.6          1.4         0.2 setosa 
## 6          5.4         3.9          1.7         0.4 setosa

0.1 基本描述与按物种计数

iris |> 
  as_tibble() |>
  count(Species, name = "n")
## # A tibble: 3 × 2
##   Species        n
##   <fct>      <int>
## 1 setosa        50
## 2 versicolor    50
## 3 virginica     50

0.2 变量摘要统计(整体与分组)

library(tidyr)

# 整体摘要
summary(iris[, 1:4])
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500
# 分组均值与标准差
iris |>
  as_tibble() |>
  summarise(
    across(where(is.numeric),
           list(mean = ~mean(.x), sd = ~sd(.x))),
    .by = Species
  )
## # A tibble: 3 × 9
##   Species    Sepal.Length_mean Sepal.Length_sd Sepal.Width_mean Sepal.Width_sd
##   <fct>                  <dbl>           <dbl>            <dbl>          <dbl>
## 1 setosa                  5.01           0.352             3.43          0.379
## 2 versicolor              5.94           0.516             2.77          0.314
## 3 virginica               6.59           0.636             2.97          0.322
## # ℹ 4 more variables: Petal.Length_mean <dbl>, Petal.Length_sd <dbl>,
## #   Petal.Width_mean <dbl>, Petal.Width_sd <dbl>
# .by = Species:只在本次 summarise 里“临时分组”;
# 
# across(where(is.numeric), ...):选中所有数值列;
# 
# list(mean = ~..., sd = ~...):为每个被选列同时计算均值与标准差;
# 
# 输出:每个物种一行,多列统计量,列名遵循 {列名}_{函数名}(可用 .names 自定义)

0.3 可视化概览

0.3.1 单变量分布(直方图 + 密度)

library(ggplot2)

iris |>
  as_tibble() |>
  pivot_longer(cols = Sepal.Length:Petal.Width,
               names_to = "Measure", values_to = "Value") |>
  ggplot(aes(Value, fill = Species)) +
  geom_histogram(alpha = .6, bins = 20, position = "identity") +
  facet_wrap(~ Measure, scales = "free", ncol = 2) +
  labs(title = "Iris:四个测量变量的分布",
       x = "值(cm)", y = "频数", fill = "物种") 

0.3.2 二元关系(散点 + 回归线,分物种)

ggplot(iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "花萼长度 vs 花瓣长度(按物种)",
       x = "Sepal.Length (cm)", y = "Petal.Length (cm)")

0.4 相关性矩阵(整体)

num <- iris |> select(where(is.numeric))
round(cor(num), 3)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length        1.000      -0.118        0.872       0.818
## Sepal.Width        -0.118       1.000       -0.428      -0.366
## Petal.Length        0.872      -0.428        1.000       0.963
## Petal.Width         0.818      -0.366        0.963       1.000

0.5 典型建模前准备(可选)

set.seed(123)
# 70/30 train/test split. Round the training size explicitly instead of
# passing a fractional `size` to sample(): the fractional value is coerced to
# an integer, so floating-point error in 0.7 * n could silently drop a row.
n_train <- round(0.7 * nrow(iris))
idx  <- sample(seq_len(nrow(iris)), size = n_train)
train <- iris[idx, ]
test  <- iris[-idx, ]

# Example: quick linear model (demonstration only)
m_lm <- lm(Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width, data = train)
summary(m_lm)

# Predict on the held-out rows and evaluate
pred <- predict(m_lm, newdata = test)
mean((pred - test$Petal.Length)^2)  # MSE

0.6 引用

1 简介

本课围绕 dplyr 1.1+ 的核心能力,全程以内置数据集 iris 为例,覆盖:

# iris 数据预览
as_tibble(iris) |> slice_head(n = 6)
## # A tibble: 6 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa 
## 4          4.6         3.1          1.5         0.2 setosa 
## 5          5           3.6          1.4         0.2 setosa 
## 6          5.4         3.9          1.7         0.4 setosa

2 管道与 tibble

2.1 管道:|>(base R)与 %>%(magrittr)

# Base R 管道 |> 推荐
1:5 |> mean()
## [1] 3
# 在数据清洗中的典型用法(以 iris 为例)
iris |>
  as_tibble() |>
  select(Species, Sepal.Length, Sepal.Width) |>
  arrange(desc(Sepal.Length)) |>
  slice_head(n = 3)
## # A tibble: 3 × 3
##   Species   Sepal.Length Sepal.Width
##   <fct>            <dbl>       <dbl>
## 1 virginica          7.9         3.8
## 2 virginica          7.7         3.8
## 3 virginica          7.7         2.6

2.2 tibble 打印更友好

tibble(x = 1:5, y = letters[1:5])
## # A tibble: 5 × 2
##       x y    
##   <int> <chr>
## 1     1 a    
## 2     2 b    
## 3     3 c    
## 4     4 d    
## 5     5 e

3 选择与重命名列:select() / rename() / relocate() / rename_with()

# 选择关键列
iris |>
  as_tibble() |>
  select(Species, starts_with("Sepal")) |>
  slice_head(n = 5)
## # A tibble: 5 × 3
##   Species Sepal.Length Sepal.Width
##   <fct>          <dbl>       <dbl>
## 1 setosa           5.1         3.5
## 2 setosa           4.9         3  
## 3 setosa           4.7         3.2
## 4 setosa           4.6         3.1
## 5 setosa           5           3.6
# 中文重命名(不改变其它列)
iris |>
  as_tibble() |>
  rename(花萼长 = Sepal.Length, 花萼宽 = Sepal.Width) |>
  select(花萼长, 花萼宽, Species) |>
  slice_head(n = 3)
## # A tibble: 3 × 3
##   花萼长 花萼宽 Species
##    <dbl>  <dbl> <fct>  
## 1    5.1    3.5 setosa 
## 2    4.9    3   setosa 
## 3    4.7    3.2 setosa
# 批量重命名(大写)
iris |>
  as_tibble() |>
  rename_with(toupper, .cols = ends_with("Length")) |>
  select(Species, SEPAL.LENGTH, PETAL.LENGTH) |>
  slice_head(n = 3)
## # A tibble: 3 × 3
##   Species SEPAL.LENGTH PETAL.LENGTH
##   <fct>          <dbl>        <dbl>
## 1 setosa           5.1          1.4
## 2 setosa           4.9          1.4
## 3 setosa           4.7          1.3
# 调整列顺序
iris |>
  as_tibble() |>
  relocate(Species, .before = 1) |>
  slice_head(n = 3)
## # A tibble: 3 × 5
##   Species Sepal.Length Sepal.Width Petal.Length Petal.Width
##   <fct>          <dbl>       <dbl>        <dbl>       <dbl>
## 1 setosa           5.1         3.5          1.4         0.2
## 2 setosa           4.9         3            1.4         0.2
## 3 setosa           4.7         3.2          1.3         0.2

4 行筛选与取样:filter() / slice_*() / distinct() / between()

# 筛选:山鸢尾(setosa),且花萼长度 >= 5
iris |>
  as_tibble() |>
  filter(Species == "setosa", Sepal.Length >= 5) |>
  slice_head(n = 5)
## # A tibble: 5 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          5           3.6          1.4         0.2 setosa 
## 3          5.4         3.9          1.7         0.4 setosa 
## 4          5           3.4          1.5         0.2 setosa 
## 5          5.4         3.7          1.5         0.2 setosa
# 最长的花瓣前 5 行
iris |>
  as_tibble() |>
  slice_max(order_by = Petal.Length, n = 5) |>
  select(Species, Petal.Length, Petal.Width)
## # A tibble: 5 × 3
##   Species   Petal.Length Petal.Width
##   <fct>            <dbl>       <dbl>
## 1 virginica          6.9         2.3
## 2 virginica          6.7         2.2
## 3 virginica          6.7         2  
## 4 virginica          6.6         2.1
## 5 virginica          6.4         2
# 区间筛选:花萼宽在 [3, 3.5] 之间
iris |>
  as_tibble() |>
  filter(between(Sepal.Width, 3, 3.5)) |>
  summarise(n = n())
## # A tibble: 1 × 1
##       n
##   <int>
## 1    74
# 去重:有哪些物种
iris |>
  as_tibble() |>
  distinct(Species)
## # A tibble: 3 × 1
##   Species   
##   <fct>     
## 1 setosa    
## 2 versicolor
## 3 virginica

5 创建与变换列:mutate() / transmute() / case_when() / if_else() / across()

# 常见派生变量:比值、面积近似(单位为平方厘米近似量)
iris |>
  as_tibble() |>
  transmute(
    Species,
    Sepal.Ratio = Sepal.Length / Sepal.Width,
    Petal.Area  = Petal.Length * Petal.Width
  ) |>
  slice_head(n = 6)
## # A tibble: 6 × 3
##   Species Sepal.Ratio Petal.Area
##   <fct>         <dbl>      <dbl>
## 1 setosa         1.46       0.28
## 2 setosa         1.63       0.28
## 3 setosa         1.47       0.26
## 4 setosa         1.48       0.3 
## 5 setosa         1.39       0.28
## 6 setosa         1.38       0.68
# Categorical binning: derive a coarse class from petal length
iris |>
  as_tibble() |>
  mutate(
    PetalClass = case_when(
      Petal.Length < 2 ~ "short",
      Petal.Length < 5 ~ "medium",
      # dplyr 1.1+ fallback; replaces the legacy `TRUE ~ "long"` idiom
      .default = "long"
    ),
    # if_else() is shown for illustration; here it is equivalent to the bare
    # comparison `Sepal.Width >= 3`, which already yields a logical vector.
    WideSepal = if_else(Sepal.Width >= 3, TRUE, FALSE)
  ) |>
  count(Species, PetalClass, WideSepal)
## # A tibble: 10 × 4
##    Species    PetalClass WideSepal     n
##    <fct>      <chr>      <lgl>     <int>
##  1 setosa     short      FALSE         2
##  2 setosa     short      TRUE         48
##  3 versicolor long       FALSE         1
##  4 versicolor long       TRUE          1
##  5 versicolor medium     FALSE        33
##  6 versicolor medium     TRUE         15
##  7 virginica  long       FALSE        17
##  8 virginica  long       TRUE         27
##  9 virginica  medium     FALSE         4
## 10 virginica  medium     TRUE          2
# Standardise every numeric column (z-scores); non-numeric columns untouched.
# scale() returns a one-column matrix, which would be stored as matrix columns
# printed as `Sepal.Length[,1]`; as.numeric() drops the matrix structure so the
# results stay plain <dbl> columns under their original names.
iris |>
  as_tibble() |>
  mutate(across(where(is.numeric), \(x) as.numeric(scale(x)))) |>
  slice_head(n = 3)
## # A tibble: 3 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1       -0.898       1.02         -1.34       -1.31 setosa 
## 2       -1.14       -0.132        -1.34       -1.31 setosa 
## 3       -1.38        0.327        -1.39       -1.31 setosa

6 排序与排名:arrange() / desc() / row_number() / ntile() / lead() / lag()

# 按花瓣宽降序排列
iris |>
  as_tibble() |>
  arrange(desc(Petal.Width)) |>
  slice_head(n = 5) |>
  select(Species, Petal.Width)
## # A tibble: 5 × 2
##   Species   Petal.Width
##   <fct>           <dbl>
## 1 virginica         2.5
## 2 virginica         2.5
## 3 virginica         2.5
## 4 virginica         2.4
## 5 virginica         2.4
# 对花萼长度分位数分箱(四等分)
iris |>
  as_tibble() |>
  mutate(SepalLenQuartile = ntile(Sepal.Length, 4)) |>
  count(SepalLenQuartile)
## # A tibble: 4 × 2
##   SepalLenQuartile     n
##              <int> <int>
## 1                1    38
## 2                2    38
## 3                3    37
## 4                4    37
# 使用滞后/超前(示例:按物种与花萼长度排序后比较相邻差异)
iris |>
  as_tibble() |>
  arrange(Species, Sepal.Length) |>
  group_by(Species) |>
  mutate(
    Sepal.Length.Lag1  = lag(Sepal.Length),
    Sepal.Length.Lead1 = lead(Sepal.Length),
    Diff_to_Lag1       = Sepal.Length - Sepal.Length.Lag1
  ) |>
  slice_head(n = 6)
## # A tibble: 18 × 8
## # Groups:   Species [3]
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species   Sepal.Length.Lag1
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>                 <dbl>
##  1          4.3         3            1.1         0.1 setosa                 NA  
##  2          4.4         2.9          1.4         0.2 setosa                  4.3
##  3          4.4         3            1.3         0.2 setosa                  4.4
##  4          4.4         3.2          1.3         0.2 setosa                  4.4
##  5          4.5         2.3          1.3         0.3 setosa                  4.4
##  6          4.6         3.1          1.5         0.2 setosa                  4.5
##  7          4.9         2.4          3.3         1   versicol…              NA  
##  8          5           2            3.5         1   versicol…               4.9
##  9          5           2.3          3.3         1   versicol…               5  
## 10          5.1         2.5          3           1.1 versicol…               5  
## 11          5.2         2.7          3.9         1.4 versicol…               5.1
## 12          5.4         3            4.5         1.5 versicol…               5.2
## 13          4.9         2.5          4.5         1.7 virginica              NA  
## 14          5.6         2.8          4.9         2   virginica               4.9
## 15          5.7         2.5          5           2   virginica               5.6
## 16          5.8         2.7          5.1         1.9 virginica               5.7
## 17          5.8         2.8          5.1         2.4 virginica               5.8
## 18          5.8         2.7          5.1         1.9 virginica               5.8
## # ℹ 2 more variables: Sepal.Length.Lead1 <dbl>, Diff_to_Lag1 <dbl>

7 汇总与分组:summarise() / group_by() / .by / count() / add_count() / tally() / ungroup()

7.1 经典 group_by() + summarise()

iris |>
  as_tibble() |>
  group_by(Species) |>
  summarise(
    n = n(),
    Sepal.Length.mean = mean(Sepal.Length, na.rm = TRUE),
    Sepal.Width.mean  = mean(Sepal.Width, na.rm = TRUE),
    Petal.Length.sd   = sd(Petal.Length,   na.rm = TRUE),
    Petal.Width.sd    = sd(Petal.Width,    na.rm = TRUE)
  )
## # A tibble: 3 × 6
##   Species        n Sepal.Length.mean Sepal.Width.mean Petal.Length.sd
##   <fct>      <int>             <dbl>            <dbl>           <dbl>
## 1 setosa        50              5.01             3.43           0.174
## 2 versicolor    50              5.94             2.77           0.470
## 3 virginica     50              6.59             2.97           0.552
## # ℹ 1 more variable: Petal.Width.sd <dbl>

7.2 新特性 .by:无需显式 group_by()

# 计算花瓣面积的均值与中位数(按物种分组)
iris |>
  as_tibble() |>
  mutate(Petal.Area = Petal.Length * Petal.Width) |>
  summarise(
    n = n(),
    mean_area = mean(Petal.Area),
    median_area = median(Petal.Area),
    .by = Species
  ) |>
  arrange(desc(mean_area))
## # A tibble: 3 × 4
##   Species        n mean_area median_area
##   <fct>      <int>     <dbl>       <dbl>
## 1 virginica     50    11.3         11.4 
## 2 versicolor    50     5.72         5.62
## 3 setosa        50     0.366        0.3

7.3 count() / add_count() / tally()

# Count petal-length bins per species.
# cut() builds right-closed intervals (a, b] by default, which would put
# Petal.Length == 5 into "medium". `right = FALSE` makes the bins [a, b) so
# they agree with the `< 2` / `< 5` thresholds of the case_when() example.
iris |>
  as_tibble() |>
  mutate(Bin = cut(Petal.Length, breaks = c(-Inf, 2, 5, Inf),
                   labels = c("short","medium","long"),
                   right = FALSE)) |>
  count(Species, Bin)
## # A tibble: 5 × 3
##   Species    Bin        n
##   <fct>      <fct>  <int>
## 1 setosa     short     50
## 2 versicolor medium    48
## 3 versicolor long       2
## 4 virginica  medium     6
## 5 virginica  long      44
# 在原表临时添加组内计数列
iris |>
  as_tibble() |>
  add_count(Species, name = "n_species") |>
  select(Species, n_species) |>
  distinct()
## # A tibble: 3 × 2
##   Species    n_species
##   <fct>          <int>
## 1 setosa            50
## 2 versicolor        50
## 3 virginica         50

8 行级操作:rowwise()(谨慎使用)

当每行需要聚合多列计算时可用;大数据上较慢,能向量化尽量向量化。

iris |>
  as_tibble() |>
  rowwise() |>
  mutate(
    # 行内最小/最大(示例:四个测量值的最小与最大)
    min_measure = min(c_across(Sepal.Length:Petal.Width)),
    max_measure = max(c_across(Sepal.Length:Petal.Width))
  ) |>
  ungroup() |>
  slice_head(n = 4) |>
  select(Species, min_measure, max_measure)
## # A tibble: 4 × 3
##   Species min_measure max_measure
##   <fct>         <dbl>       <dbl>
## 1 setosa          0.2         5.1
## 2 setosa          0.2         4.9
## 3 setosa          0.2         4.7
## 4 setosa          0.2         4.6

9 连接(Join)——构造与 iris 匹配的查找表(物种信息)

为演示连接,这里构造一张物种信息表,包含物种中文名与是否“长花瓣物种”(按物种的 Petal.Length 均值是否 ≥ 4 cm 判断)。

# 物种级统计
species_stats <- iris |>
  as_tibble() |>
  summarise(mean_petal_len = mean(Petal.Length), .by = Species)

# 物种信息查找表
species_info <- species_stats |>
  mutate(
    Species_CN = case_match(
      Species,
      "setosa"     ~ "山鸢尾",
      "versicolor" ~ "变色鸢尾",
      "virginica"  ~ "弗吉尼亚鸢尾",
      .default = "未知"
    ),
    LongPetalSp = mean_petal_len >= 4
  ) |>
  select(Species, Species_CN, LongPetalSp)

species_info
## # A tibble: 3 × 3
##   Species    Species_CN   LongPetalSp
##   <fct>      <chr>        <lgl>      
## 1 setosa     山鸢尾       FALSE      
## 2 versicolor 变色鸢尾     TRUE       
## 3 virginica  弗吉尼亚鸢尾 TRUE
# left_join:把中文名与长花瓣标识并回到样本级
iris |>
  as_tibble() |>
  left_join(species_info, by = "Species") |>
  select(Species, Species_CN, LongPetalSp, Petal.Length, Petal.Width) |>
  slice_head(n = 6)
## # A tibble: 6 × 5
##   Species Species_CN LongPetalSp Petal.Length Petal.Width
##   <fct>   <chr>      <lgl>              <dbl>       <dbl>
## 1 setosa  山鸢尾     FALSE                1.4         0.2
## 2 setosa  山鸢尾     FALSE                1.4         0.2
## 3 setosa  山鸢尾     FALSE                1.3         0.2
## 4 setosa  山鸢尾     FALSE                1.5         0.2
## 5 setosa  山鸢尾     FALSE                1.4         0.2
## 6 setosa  山鸢尾     FALSE                1.7         0.4
# semi_join:只保留 species_info 中存在的物种(这里等价于原表)
iris |>
  as_tibble() |>
  semi_join(species_info, by = "Species") |>
  distinct(Species)
## # A tibble: 3 × 1
##   Species   
##   <fct>     
## 1 setosa    
## 2 versicolor
## 3 virginica
# anti_join:找出 iris 中 species 不在 species_info 的(此处应为空)
iris |>
  as_tibble() |>
  anti_join(species_info, by = "Species")
## # A tibble: 0 × 5
## # ℹ 5 variables: Sepal.Length <dbl>, Sepal.Width <dbl>, Petal.Length <dbl>,
## #   Petal.Width <dbl>, Species <fct>

10 绑定与集合操作:bind_rows() / bind_cols() / union() / intersect() / setdiff()

# 构造两个子集
setosa  <- iris |> as_tibble() |> filter(Species == "setosa")  |> slice_head(n = 3)
versi   <- iris |> as_tibble() |> filter(Species == "versicolor") |> slice_head(n = 3)

# 行绑定
bind_rows(setosa, versi)
## # A tibble: 6 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>     
## 1          5.1         3.5          1.4         0.2 setosa    
## 2          4.9         3            1.4         0.2 setosa    
## 3          4.7         3.2          1.3         0.2 setosa    
## 4          7           3.2          4.7         1.4 versicolor
## 5          6.4         3.2          4.5         1.5 versicolor
## 6          6.9         3.1          4.9         1.5 versicolor
# 集合(基于整行相等)
union(setosa, versi)
## # A tibble: 6 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species   
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>     
## 1          5.1         3.5          1.4         0.2 setosa    
## 2          4.9         3            1.4         0.2 setosa    
## 3          4.7         3.2          1.3         0.2 setosa    
## 4          7           3.2          4.7         1.4 versicolor
## 5          6.4         3.2          4.5         1.5 versicolor
## 6          6.9         3.1          4.9         1.5 versicolor
intersect(setosa, versi)   # 此处通常为空
## # A tibble: 0 × 5
## # ℹ 5 variables: Sepal.Length <dbl>, Sepal.Width <dbl>, Petal.Length <dbl>,
## #   Petal.Width <dbl>, Species <fct>
setdiff(setosa, versi)     # setosa 中有而 versi 中没有
## # A tibble: 3 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1          5.1         3.5          1.4         0.2 setosa 
## 2          4.9         3            1.4         0.2 setosa 
## 3          4.7         3.2          1.3         0.2 setosa

11 缺失值处理与宽长变换(tidyr 协同)

为演示缺失,这里先构造一些 NA。

iris_na <- iris |>
  as_tibble() |>
  mutate(
    Sepal.Length = replace(Sepal.Length, 1:3, NA_real_),
    Petal.Width  = replace(Petal.Width,  5:6, NA_real_)
  )
iris_na |> slice_head(n = 6)
## # A tibble: 6 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1         NA           3.5          1.4         0.2 setosa 
## 2         NA           3            1.4         0.2 setosa 
## 3         NA           3.2          1.3         0.2 setosa 
## 4          4.6         3.1          1.5         0.2 setosa 
## 5          5           3.6          1.4        NA   setosa 
## 6          5.4         3.9          1.7        NA   setosa
# 用 0 或物种组均值填充(两种策略示例)
filled0 <- iris_na |>
  mutate(across(where(is.numeric), ~replace_na(.x, 0)))
filled_by_grp <- iris_na |>
  group_by(Species) |>
  mutate(across(where(is.numeric), ~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x))) |>
  ungroup()

filled0 |> slice_head(n = 3)
## # A tibble: 3 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1            0         3.5          1.4         0.2 setosa 
## 2            0         3            1.4         0.2 setosa 
## 3            0         3.2          1.3         0.2 setosa
filled_by_grp |> slice_head(n = 3)
## # A tibble: 3 × 5
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
## 1         5.01         3.5          1.4         0.2 setosa 
## 2         5.01         3            1.4         0.2 setosa 
## 3         5.01         3.2          1.3         0.2 setosa
# 宽转长:把四个测量变量堆叠到两列
iris_long <- iris |>
  as_tibble() |>
  pivot_longer(cols = Sepal.Length:Petal.Width,
               names_to = "Measure",
               values_to = "Value")
iris_long |> slice_head(n = 6)
## # A tibble: 6 × 3
##   Species Measure      Value
##   <fct>   <chr>        <dbl>
## 1 setosa  Sepal.Length   5.1
## 2 setosa  Sepal.Width    3.5
## 3 setosa  Petal.Length   1.4
## 4 setosa  Petal.Width    0.2
## 5 setosa  Sepal.Length   4.9
## 6 setosa  Sepal.Width    3
# 长转宽:每个物种的均值展开为列
iris_wide <- iris |>
  as_tibble() |>
  summarise(across(where(is.numeric), mean), .by = Species) |>
  pivot_wider(names_from = Species, values_from = c(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width))
iris_wide
## # A tibble: 1 × 12
##   Sepal.Length_setosa Sepal.Length_versicolor Sepal.Length_virginica
##                 <dbl>                   <dbl>                  <dbl>
## 1                5.01                    5.94                   6.59
## # ℹ 9 more variables: Sepal.Width_setosa <dbl>, Sepal.Width_versicolor <dbl>,
## #   Sepal.Width_virginica <dbl>, Petal.Length_setosa <dbl>,
## #   Petal.Length_versicolor <dbl>, Petal.Length_virginica <dbl>,
## #   Petal.Width_setosa <dbl>, Petal.Width_versicolor <dbl>,
## #   Petal.Width_virginica <dbl>

12 编程化 dplyr:across() 与 tidy evaluation({{ }})

# 批量统计:对数值列同时计算 mean 与 sd
iris |>
  as_tibble() |>
  summarise(
    across(where(is.numeric),
           list(mean = ~mean(.x, na.rm = TRUE),
                sd   = ~sd(.x,   na.rm = TRUE)))
  ) |>
  glimpse()
## Rows: 1
## Columns: 8
## $ Sepal.Length_mean <dbl> 5.843333
## $ Sepal.Length_sd   <dbl> 0.8280661
## $ Sepal.Width_mean  <dbl> 3.057333
## $ Sepal.Width_sd    <dbl> 0.4358663
## $ Petal.Length_mean <dbl> 3.758
## $ Petal.Length_sd   <dbl> 1.765298
## $ Petal.Width_mean  <dbl> 1.199333
## $ Petal.Width_sd    <dbl> 0.7622377
#' Reusable grouped summary: group size and mean of one variable.
#'
#' Returns one row per group, sorted by the mean in descending order.
#'
#' @param data A data frame or tibble.
#' @param grp Unquoted grouping column(s), forwarded to `.by`.
#' @param var Unquoted column whose mean is computed (NAs are removed).
#' @return `data` summarised to the grouping column(s) followed by `n`
#'   (rows per group) and `mean` (mean of `var`).
my_summary <- function(data, grp, var) {
  data |>
    summarise(
      # Compute the mean before creating `n`: summarise() evaluates its
      # arguments in order, so an earlier `n = n()` would mask a data column
      # named `n` if the caller passed it as `var`.
      mean = mean({{ var }}, na.rm = TRUE),
      n = n(),
      .by = {{ grp }}
    ) |>
    # Restore the original column order: group column(s), n, mean.
    relocate(n, .before = mean) |>
    arrange(desc(mean))
}
# 用法示例:
my_summary(iris, Species, Petal.Length)

13 与 ggplot2 的衔接:从汇总到可视化

# 1) 物种级均值条形图
iris |>
  as_tibble() |>
  summarise(
    Sepal.Length = mean(Sepal.Length),
    Sepal.Width  = mean(Sepal.Width),
    Petal.Length = mean(Petal.Length),
    Petal.Width  = mean(Petal.Width),
    .by = Species
  ) |>
  pivot_longer(-Species, names_to = "Measure", values_to = "Mean") |>
  ggplot(aes(Measure, Mean, fill = Species)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Iris:各物种四项测量的均值", x = NULL, y = "均值")

# 2) 散点 + 拟合线(按物种着色)
iris |>
  as_tibble() |>
  ggplot(aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "花萼长度与花瓣长度关系(按物种)", x = "Sepal.Length", y = "Petal.Length")

14 常见坑与最佳实践

sessionInfo()
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: Asia/Shanghai
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] tibble_3.2.1  purrr_1.0.2   stringr_1.5.1 ggplot2_3.5.1 tidyr_1.3.1  
## [6] dplyr_1.1.4  
## 
## loaded via a namespace (and not attached):
##  [1] Matrix_1.7-1      gtable_0.3.6      jsonlite_1.8.8    compiler_4.4.1   
##  [5] tidyselect_1.2.1  jquerylib_0.1.4   splines_4.4.1     scales_1.3.0     
##  [9] yaml_2.3.10       fastmap_1.2.0     lattice_0.22-6    R6_2.5.1         
## [13] labeling_0.4.3    generics_0.1.3    knitr_1.49        munsell_0.5.1    
## [17] bslib_0.8.0       pillar_1.11.0     rlang_1.1.4       utf8_1.2.4       
## [21] cachem_1.1.0      stringi_1.8.4     xfun_0.49         sass_0.4.9       
## [25] cli_3.6.3         mgcv_1.9-1        withr_3.0.2       magrittr_2.0.3   
## [29] digest_0.6.37     grid_4.4.1        rstudioapi_0.17.1 nlme_3.1-166     
## [33] lifecycle_1.0.4   vctrs_0.6.5       evaluate_1.0.1    glue_1.7.0       
## [37] farver_2.1.2      colorspace_2.1-1  rmarkdown_2.29    tools_4.4.1      
## [41] pkgconfig_2.0.3   htmltools_0.5.8.1

15 练习与参考答案(基于 iris)

练习 1(基础筛选与变换)

iris |>
  as_tibble() |>
  filter(Petal.Width >= 1.5) |>
  mutate(Petal.Area = Petal.Length * Petal.Width) |>
  summarise(mean_area = mean(Petal.Area), .by = Species) |>
  arrange(desc(mean_area))
## # A tibble: 2 × 2
##   Species    mean_area
##   <fct>          <dbl>
## 1 virginica      11.4 
## 2 versicolor      7.25

练习 2(连接与质量检查)

# 复用/重建 species_info
species_info <- iris |>
  as_tibble() |>
  summarise(mean_petal_len = mean(Petal.Length), .by = Species) |>
  mutate(
    Species_CN = case_match(
      Species,
      "setosa"     ~ "山鸢尾",
      "versicolor" ~ "变色鸢尾",
      "virginica"  ~ "弗吉尼亚鸢尾",
      .default = "未知"
    )
  ) |>
  select(Species, Species_CN)

merged <- iris |>
  as_tibble() |>
  left_join(species_info, by = "Species")

# 质量检查(应无缺失)
merged |>
  filter(is.na(Species_CN)) |>
  nrow()
## [1] 0

练习 3(分组汇总与图形)

iris |>
  as_tibble() |>
  summarise(
    across(where(is.numeric), sd),
    .by = Species
  ) |>
  pivot_longer(-Species, names_to = "Measure", values_to = "SD") |>
  ggplot(aes(Measure, SD, fill = Species)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Iris:各物种四项测量的标准差", x = NULL, y = "SD")

16 参考资料