黄利东
2025-11-05
iris 是 R 自带的经典多变量数据集,包含 150 条观测,来自 3 个物种(各 50 条):setosa、versicolor、virginica。每条记录测量了 4 个连续变量,单位均为厘米(cm):
- Sepal.Length(花萼长度)
- Sepal.Width(花萼宽度)
- Petal.Length(花瓣长度)
- Petal.Width(花瓣宽度)
- Species 为物种因子变量(3 水平)
历史上该数据由 Edgar Anderson 采集,R. A. Fisher (1936) 在判别分析(LDA)论文中广泛使用,成为统计与机器学习教学中的入门示例。
典型教学目标:数值型特征的分布对比、变量之间的相关性、按物种分组的差异、用于线性/广义线性/判别模型的快速演示等。
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## # A tibble: 3 × 2
## Species n
## <fct> <int>
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
# Per-species mean and standard deviation of every numeric column.
as_tibble(iris) |>
  summarise(
    across(
      where(is.numeric),
      list(mean = \(x) mean(x), sd = \(x) sd(x))
    ),
    .by = Species
  )
## # A tibble: 3 × 9
## Species Sepal.Length_mean Sepal.Length_sd Sepal.Width_mean Sepal.Width_sd
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.01 0.352 3.43 0.379
## 2 versicolor 5.94 0.516 2.77 0.314
## 3 virginica 6.59 0.636 2.97 0.322
## # ℹ 4 more variables: Petal.Length_mean <dbl>, Petal.Length_sd <dbl>,
## # Petal.Width_mean <dbl>, Petal.Width_sd <dbl>
# Notes on the grouped summarise() call shown before this point:
# - `.by = Species` groups only for that single summarise() call.
# - `across(where(is.numeric), ...)` selects every numeric column.
# - `list(mean = ~..., sd = ~...)` computes both the mean and the standard
#   deviation for each selected column.
# - Output: one row per species with several statistic columns, named
#   {column}_{function} (customisable through `.names`).

# ggplot2 must be attached before the plots below; it sits on its own line
# so that it is executed rather than being part of the comment above.
library(ggplot2)

# Distribution of the four measurements: overlaid histograms per species,
# one facet per measurement.
iris |>
  as_tibble() |>
  pivot_longer(
    cols = Sepal.Length:Petal.Width,
    names_to = "Measure",
    values_to = "Value"
  ) |>
  ggplot(aes(Value, fill = Species)) +
  geom_histogram(alpha = .6, bins = 20, position = "identity") +
  facet_wrap(~ Measure, scales = "free", ncol = 2) +
  labs(
    title = "Iris:四个测量变量的分布",
    x = "值(cm)", y = "频数", fill = "物种"
  )

# Sepal length against petal length, with one linear fit per species.
# This is a separate statement from the histogram above.
ggplot(iris, aes(Sepal.Length, Petal.Length, color = Species)) +
  geom_point(alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "花萼长度 vs 花瓣长度(按物种)",
    x = "Sepal.Length (cm)", y = "Petal.Length (cm)"
  )
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.000 -0.118 0.872 0.818
## Sepal.Width -0.118 1.000 -0.428 -0.366
## Petal.Length 0.872 -0.428 1.000 0.963
## Petal.Width 0.818 -0.366 0.963 1.000
# Reproducible 70/30 train/test split of the iris rows.
set.seed(123)
idx <- sample.int(nrow(iris), size = 0.7 * nrow(iris))
train <- iris[idx, ]
test <- iris[-idx, ]

# Quick linear model (demo): petal length explained by the other three
# measurements, fitted on the training rows only.
m_lm <- lm(
  Petal.Length ~ Sepal.Length + Sepal.Width + Petal.Width,
  data = train
)
summary(m_lm)

# Prediction on the held-out rows; the error metric is computed right after.
pred <- predict(m_lm, newdata = test)
mean((pred - test$Petal.Length)^2) # MSE
本课围绕 dplyr 1.1+ 的核心能力,全程以内置数据集 iris 为例,覆盖:
.by、计数;iris 匹配的查找表)、绑定与集合运算;dplyr;ggplot2 的管道衔接,以及练习题。
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
|>(base R)与 %>%(magrittr)
## [1] 3
# Typical data-cleaning pipeline (iris as the example): keep Species and the
# two sepal columns, sort by sepal length from largest to smallest, and show
# the top three rows.
as_tibble(iris) |>
  select(Species, starts_with("Sepal")) |>
  arrange(desc(Sepal.Length)) |>
  slice_head(n = 3)
## # A tibble: 3 × 3
## Species Sepal.Length Sepal.Width
## <fct> <dbl> <dbl>
## 1 virginica 7.9 3.8
## 2 virginica 7.7 3.8
## 3 virginica 7.7 2.6
## # A tibble: 5 × 2
## x y
## <int> <chr>
## 1 1 a
## 2 2 b
## 3 3 c
## 4 4 d
## 5 5 e
select() / rename() / relocate() / rename_with()
## # A tibble: 5 × 3
## Species Sepal.Length Sepal.Width
## <fct> <dbl> <dbl>
## 1 setosa 5.1 3.5
## 2 setosa 4.9 3
## 3 setosa 4.7 3.2
## 4 setosa 4.6 3.1
## 5 setosa 5 3.6
# Rename two columns to Chinese names (all other columns are left as they
# are), then show the renamed columns next to Species.
as_tibble(iris) |>
  rename(
    花萼长 = Sepal.Length,
    花萼宽 = Sepal.Width
  ) |>
  select(花萼长, 花萼宽, Species) |>
  slice_head(n = 3)
## # A tibble: 3 × 3
## 花萼长 花萼宽 Species
## <dbl> <dbl> <fct>
## 1 5.1 3.5 setosa
## 2 4.9 3 setosa
## 3 4.7 3.2 setosa
# Bulk rename: upper-case every column whose name ends with "Length".
as_tibble(iris) |>
  rename_with(\(nm) toupper(nm), .cols = ends_with("Length")) |>
  select(Species, SEPAL.LENGTH, PETAL.LENGTH) |>
  slice_head(n = 3)
## # A tibble: 3 × 3
## Species SEPAL.LENGTH PETAL.LENGTH
## <fct> <dbl> <dbl>
## 1 setosa 5.1 1.4
## 2 setosa 4.9 1.4
## 3 setosa 4.7 1.3
## # A tibble: 3 × 5
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3 1.4 0.2
## 3 setosa 4.7 3.2 1.3 0.2
filter() / slice_*() / distinct() / between()
# Filter: setosa rows whose sepal length is at least 5; show the first five.
as_tibble(iris) |>
  filter(
    Species == "setosa",
    Sepal.Length >= 5
  ) |>
  slice_head(n = 5)
## # A tibble: 5 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 5 3.6 1.4 0.2 setosa
## 3 5.4 3.9 1.7 0.4 setosa
## 4 5 3.4 1.5 0.2 setosa
## 5 5.4 3.7 1.5 0.2 setosa
# The five rows with the longest petals, reduced to Species and the two
# petal measurements.
as_tibble(iris) |>
  slice_max(Petal.Length, n = 5) |>
  select(Species, starts_with("Petal"))
## # A tibble: 5 × 3
## Species Petal.Length Petal.Width
## <fct> <dbl> <dbl>
## 1 virginica 6.9 2.3
## 2 virginica 6.7 2.2
## 3 virginica 6.7 2
## 4 virginica 6.6 2.1
## 5 virginica 6.4 2
# Range filter: count the rows whose sepal width lies in the closed
# interval [3, 3.5].
as_tibble(iris) |>
  filter(between(Sepal.Width, left = 3, right = 3.5)) |>
  count()
## # A tibble: 1 × 1
## n
## <int>
## 1 74
## # A tibble: 3 × 1
## Species
## <fct>
## 1 setosa
## 2 versicolor
## 3 virginica
mutate() / transmute() / case_when() / if_else() / across()
# Common derived variables: a length/width ratio and an approximate area
# (length times width, roughly in square centimetres).
as_tibble(iris) |>
  transmute(
    Species,
    Sepal.Ratio = Sepal.Length / Sepal.Width,
    Petal.Area = Petal.Length * Petal.Width
  ) |>
  slice_head(n = 6)
## # A tibble: 6 × 3
## Species Sepal.Ratio Petal.Area
## <fct> <dbl> <dbl>
## 1 setosa 1.46 0.28
## 2 setosa 1.63 0.28
## 3 setosa 1.47 0.26
## 4 setosa 1.48 0.3
## 5 setosa 1.39 0.28
## 6 setosa 1.38 0.68
# Categorical variables (binning): coarse petal-length classes plus a logical
# flag for wide sepals, then a count of every combination per species.
as_tibble(iris) |>
  mutate(
    PetalClass = case_when(
      Petal.Length < 2 ~ "short",
      Petal.Length < 5 ~ "medium",
      .default = "long"
    ),
    WideSepal = if_else(Sepal.Width >= 3, TRUE, FALSE)
  ) |>
  count(Species, PetalClass, WideSepal)
## # A tibble: 10 × 4
## Species PetalClass WideSepal n
## <fct> <chr> <lgl> <int>
## 1 setosa short FALSE 2
## 2 setosa short TRUE 48
## 3 versicolor long FALSE 1
## 4 versicolor long TRUE 1
## 5 versicolor medium FALSE 33
## 6 versicolor medium TRUE 15
## 7 virginica long FALSE 17
## 8 virginica long TRUE 27
## 9 virginica medium FALSE 4
## 10 virginica medium TRUE 2
# Standardise (z-score) every numeric column; non-numeric columns are left
# untouched.
# scale() returns a one-column matrix, which would create matrix columns that
# print as `Sepal.Length[,1]`. as.numeric() drops the matrix shape and the
# scaling attributes so each result is an ordinary numeric column.
iris |>
  as_tibble() |>
  mutate(across(where(is.numeric), \(x) as.numeric(scale(x)))) |>
  slice_head(n = 3)
## # A tibble: 3 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 -0.898 1.02 -1.34 -1.31 setosa
## 2 -1.14 -0.132 -1.34 -1.31 setosa
## 3 -1.38 0.327 -1.39 -1.31 setosa
arrange() / desc() / row_number() / ntile() / lead() / lag()
# Sort by petal width in descending order and show the five widest rows.
as_tibble(iris) |>
  select(Species, Petal.Width) |>
  arrange(desc(Petal.Width)) |>
  slice_head(n = 5)
## # A tibble: 5 × 2
## Species Petal.Width
## <fct> <dbl>
## 1 virginica 2.5
## 2 virginica 2.5
## 3 virginica 2.5
## 4 virginica 2.4
## 5 virginica 2.4
# Bin sepal length into four rank-based groups (quartiles) and count the
# rows that fall into each group.
as_tibble(iris) |>
  count(SepalLenQuartile = ntile(Sepal.Length, 4))
## # A tibble: 4 × 2
## SepalLenQuartile n
## <int> <int>
## 1 1 38
## 2 2 38
## 3 3 37
## 4 4 37
# Lag / lead example: within each species, sort by sepal length and compare
# every row with its neighbours.
as_tibble(iris) |>
  group_by(Species) |>
  arrange(Sepal.Length, .by_group = TRUE) |>
  mutate(
    Sepal.Length.Lag1 = lag(Sepal.Length),
    Sepal.Length.Lead1 = lead(Sepal.Length),
    Diff_to_Lag1 = Sepal.Length - Sepal.Length.Lag1
  ) |>
  # The data is still grouped here, so this keeps six rows per species.
  slice_head(n = 6)
## # A tibble: 18 × 8
## # Groups: Species [3]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species Sepal.Length.Lag1
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 4.3 3 1.1 0.1 setosa NA
## 2 4.4 2.9 1.4 0.2 setosa 4.3
## 3 4.4 3 1.3 0.2 setosa 4.4
## 4 4.4 3.2 1.3 0.2 setosa 4.4
## 5 4.5 2.3 1.3 0.3 setosa 4.4
## 6 4.6 3.1 1.5 0.2 setosa 4.5
## 7 4.9 2.4 3.3 1 versicol… NA
## 8 5 2 3.5 1 versicol… 4.9
## 9 5 2.3 3.3 1 versicol… 5
## 10 5.1 2.5 3 1.1 versicol… 5
## 11 5.2 2.7 3.9 1.4 versicol… 5.1
## 12 5.4 3 4.5 1.5 versicol… 5.2
## 13 4.9 2.5 4.5 1.7 virginica NA
## 14 5.6 2.8 4.9 2 virginica 4.9
## 15 5.7 2.5 5 2 virginica 5.6
## 16 5.8 2.7 5.1 1.9 virginica 5.7
## 17 5.8 2.8 5.1 2.4 virginica 5.8
## 18 5.8 2.7 5.1 1.9 virginica 5.8
## # ℹ 2 more variables: Sepal.Length.Lead1 <dbl>, Diff_to_Lag1 <dbl>
summarise() / group_by() / .by / count() / add_count() / tally() / ungroup()
group_by() + summarise()
# Classic grouped summary: row count, sepal means and petal standard
# deviations for each species.
as_tibble(iris) |>
  group_by(Species) |>
  summarise(
    n                 = n(),
    Sepal.Length.mean = mean(Sepal.Length, na.rm = TRUE),
    Sepal.Width.mean  = mean(Sepal.Width, na.rm = TRUE),
    Petal.Length.sd   = sd(Petal.Length, na.rm = TRUE),
    Petal.Width.sd    = sd(Petal.Width, na.rm = TRUE)
  )
## # A tibble: 3 × 6
## Species n Sepal.Length.mean Sepal.Width.mean Petal.Length.sd
## <fct> <int> <dbl> <dbl> <dbl>
## 1 setosa 50 5.01 3.43 0.174
## 2 versicolor 50 5.94 2.77 0.470
## 3 virginica 50 6.59 2.97 0.552
## # ℹ 1 more variable: Petal.Width.sd <dbl>
.by:无需显式 group_by()
# Mean and median petal area per species, largest mean first.
as_tibble(iris) |>
  mutate(Petal.Area = Petal.Length * Petal.Width) |>
  summarise(
    n = n(),
    mean_area = mean(Petal.Area),
    median_area = median(Petal.Area),
    .by = Species
  ) |>
  arrange(desc(mean_area))
## # A tibble: 3 × 4
## Species n mean_area median_area
## <fct> <int> <dbl> <dbl>
## 1 virginica 50 11.3 11.4
## 2 versicolor 50 5.72 5.62
## 3 setosa 50 0.366 0.3
count() / add_count() / tally()
# Count petal-length bins per species.
# The intervals are right-closed (`right = TRUE`, the cut() default): the bins
# are (-Inf, 2], (2, 5] and (5, Inf], so a petal length of exactly 5 is
# counted as "medium".
as_tibble(iris) |>
  mutate(
    Bin = cut(
      Petal.Length,
      breaks = c(-Inf, 2, 5, Inf),
      labels = c("short", "medium", "long"),
      right = TRUE
    )
  ) |>
  count(Species, Bin)
## # A tibble: 5 × 3
## Species Bin n
## <fct> <fct> <int>
## 1 setosa short 50
## 2 versicolor medium 49
## 3 versicolor long 1
## 4 virginica medium 9
## 5 virginica long 41
# Add a per-species row count as a column of the original table, then
# collapse to one row per species to display it.
as_tibble(iris) |>
  add_count(Species, name = "n_species") |>
  distinct(Species, n_species)
## # A tibble: 3 × 2
## Species n_species
## <fct> <int>
## 1 setosa 50
## 2 versicolor 50
## 3 virginica 50
rowwise()(谨慎使用)
当每行需要聚合多列计算时可用;大数据上较慢,能向量化尽量向量化。
# Row-wise minimum and maximum across the four measurements.
# Only the first four rows are displayed, so they are selected up front and
# the row-wise computation runs on just those rows.
as_tibble(iris) |>
  slice_head(n = 4) |>
  rowwise() |>
  mutate(
    min_measure = min(c_across(Sepal.Length:Petal.Width)),
    max_measure = max(c_across(Sepal.Length:Petal.Width))
  ) |>
  ungroup() |>
  select(Species, min_measure, max_measure)
## # A tibble: 4 × 3
## Species min_measure max_measure
## <fct> <dbl> <dbl>
## 1 setosa 0.2 5.1
## 2 setosa 0.2 4.9
## 3 setosa 0.2 4.7
## 4 setosa 0.2 4.6
为演示连接,这里构造一张物种信息表,包含物种中文名与是否“长花瓣物种”(按物种的
Petal.Length 均值是否 ≥ 4 cm 判断)。
# Species-level statistic: mean petal length per species.
species_stats <- as_tibble(iris) |>
  summarise(mean_petal_len = mean(Petal.Length), .by = Species)

# Species lookup table: Chinese name plus a flag that is TRUE when the
# species' mean petal length is at least 4.
species_info <- species_stats |>
  mutate(
    Species_CN = case_match(
      Species,
      "setosa" ~ "山鸢尾",
      "versicolor" ~ "变色鸢尾",
      "virginica" ~ "弗吉尼亚鸢尾",
      .default = "未知"
    ),
    LongPetalSp = mean_petal_len >= 4
  ) |>
  select(!mean_petal_len)

species_info
## # A tibble: 3 × 3
## Species Species_CN LongPetalSp
## <fct> <chr> <lgl>
## 1 setosa 山鸢尾 FALSE
## 2 versicolor 变色鸢尾 TRUE
## 3 virginica 弗吉尼亚鸢尾 TRUE
# left_join: attach the Chinese name and the long-petal flag to every sample.
# `relationship = "many-to-one"` makes the join fail loudly if species_info
# ever holds duplicate Species keys, instead of silently duplicating samples.
iris |>
  as_tibble() |>
  left_join(species_info, by = "Species", relationship = "many-to-one") |>
  select(Species, Species_CN, LongPetalSp, Petal.Length, Petal.Width) |>
  slice_head(n = 6)
## # A tibble: 6 × 5
## Species Species_CN LongPetalSp Petal.Length Petal.Width
## <fct> <chr> <lgl> <dbl> <dbl>
## 1 setosa 山鸢尾 FALSE 1.4 0.2
## 2 setosa 山鸢尾 FALSE 1.4 0.2
## 3 setosa 山鸢尾 FALSE 1.3 0.2
## 4 setosa 山鸢尾 FALSE 1.5 0.2
## 5 setosa 山鸢尾 FALSE 1.4 0.2
## 6 setosa 山鸢尾 FALSE 1.7 0.4
# semi_join: keep only the species that also occur in species_info (here
# that is every species in iris).
as_tibble(iris) |>
  distinct(Species) |>
  semi_join(species_info, by = "Species")
## # A tibble: 3 × 1
## Species
## <fct>
## 1 setosa
## 2 versicolor
## 3 virginica
# anti_join: rows of iris whose Species has no match in species_info
# (expected to be empty here).
anti_join(as_tibble(iris), species_info, by = "Species")
## # A tibble: 0 × 5
## # ℹ 5 variables: Sepal.Length <dbl>, Sepal.Width <dbl>, Petal.Length <dbl>,
## # Petal.Width <dbl>, Species <fct>
bind_rows() / bind_cols() / union() / intersect() / setdiff()
# Build two small subsets: the first three rows of two species.
setosa <- as_tibble(iris) |>
  filter(Species == "setosa") |>
  slice_head(n = 3)
versi <- as_tibble(iris) |>
  filter(Species == "versicolor") |>
  slice_head(n = 3)

# Row-bind: stack the two subsets on top of each other.
bind_rows(setosa, versi)
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 7 3.2 4.7 1.4 versicolor
## 5 6.4 3.2 4.5 1.5 versicolor
## 6 6.9 3.1 4.9 1.5 versicolor
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 7 3.2 4.7 1.4 versicolor
## 5 6.4 3.2 4.5 1.5 versicolor
## 6 6.9 3.1 4.9 1.5 versicolor
## # A tibble: 0 × 5
## # ℹ 5 variables: Sepal.Length <dbl>, Sepal.Width <dbl>, Petal.Length <dbl>,
## # Petal.Width <dbl>, Species <fct>
## # A tibble: 3 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
为演示缺失,这里先构造一些 NA。
# Copy of iris with missing values injected: rows 1-3 of Sepal.Length and
# rows 5-6 of Petal.Width become NA.
iris_na <- as_tibble(iris) |>
  mutate(
    Sepal.Length = replace(Sepal.Length, seq_len(3), NA_real_),
    Petal.Width = replace(Petal.Width, 5:6, NA_real_)
  )

slice_head(iris_na, n = 6)
## # A tibble: 6 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 NA 3.5 1.4 0.2 setosa
## 2 NA 3 1.4 0.2 setosa
## 3 NA 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 NA setosa
## 6 5.4 3.9 1.7 NA setosa
# Two fill strategies for the missing values.
# Strategy 1: replace every NA in a numeric column with 0.
filled0 <- iris_na |>
  mutate(across(where(is.numeric), \(x) replace_na(x, 0)))

# Strategy 2: replace every NA with the mean of the non-missing values of the
# same column within the same species.
filled_by_grp <- iris_na |>
  mutate(
    across(where(is.numeric), \(x) replace_na(x, mean(x, na.rm = TRUE))),
    .by = Species
  )

slice_head(filled0, n = 3)
## # A tibble: 3 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 0 3.5 1.4 0.2 setosa
## 2 0 3 1.4 0.2 setosa
## 3 0 3.2 1.3 0.2 setosa
## # A tibble: 3 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.01 3.5 1.4 0.2 setosa
## 2 5.01 3 1.4 0.2 setosa
## 3 5.01 3.2 1.3 0.2 setosa
# Wide to long: stack the four measurement columns into a name column
# ("Measure") and a value column ("Value"); Species is kept as the id.
iris_long <- as_tibble(iris) |>
  pivot_longer(
    cols = !Species,
    names_to = "Measure",
    values_to = "Value"
  )

slice_head(iris_long, n = 6)
## # A tibble: 6 × 3
## Species Measure Value
## <fct> <chr> <dbl>
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Width 3.5
## 3 setosa Petal.Length 1.4
## 4 setosa Petal.Width 0.2
## 5 setosa Sepal.Length 4.9
## 6 setosa Sepal.Width 3
# Long to wide: compute the per-species mean of each measurement, then spread
# the species into columns, giving one {measurement}_{species} column each.
iris_wide <- as_tibble(iris) |>
  summarise(across(where(is.numeric), mean), .by = Species) |>
  pivot_wider(
    names_from = Species,
    values_from = Sepal.Length:Petal.Width
  )

iris_wide
## # A tibble: 1 × 12
## Sepal.Length_setosa Sepal.Length_versicolor Sepal.Length_virginica
## <dbl> <dbl> <dbl>
## 1 5.01 5.94 6.59
## # ℹ 9 more variables: Sepal.Width_setosa <dbl>, Sepal.Width_versicolor <dbl>,
## # Sepal.Width_virginica <dbl>, Petal.Length_setosa <dbl>,
## # Petal.Length_versicolor <dbl>, Petal.Length_virginica <dbl>,
## # Petal.Width_setosa <dbl>, Petal.Width_versicolor <dbl>,
## # Petal.Width_virginica <dbl>
across() 与 tidy evaluation({{ }})
# Bulk statistics: mean and standard deviation of every numeric column over
# the whole data set, shown one column per line with glimpse().
as_tibble(iris) |>
  summarise(
    across(
      where(is.numeric),
      list(
        mean = \(x) mean(x, na.rm = TRUE),
        sd = \(x) sd(x, na.rm = TRUE)
      )
    )
  ) |>
  glimpse()
## Rows: 1
## Columns: 8
## $ Sepal.Length_mean <dbl> 5.843333
## $ Sepal.Length_sd <dbl> 0.8280661
## $ Sepal.Width_mean <dbl> 3.057333
## $ Sepal.Width_sd <dbl> 0.4358663
## $ Petal.Length_mean <dbl> 3.758
## $ Petal.Length_sd <dbl> 1.765298
## $ Petal.Width_mean <dbl> 1.199333
## $ Petal.Width_sd <dbl> 0.7622377
# 1) Bar chart of the per-species mean of each measurement.
as_tibble(iris) |>
  summarise(across(where(is.numeric), mean), .by = Species) |>
  pivot_longer(!Species, names_to = "Measure", values_to = "Mean") |>
  ggplot(aes(Measure, Mean, fill = Species)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Iris:各物种四项测量的均值", x = NULL, y = "均值")

# 2) Scatter plot with a linear fit, coloured by species.
ggplot(
  as_tibble(iris),
  aes(Sepal.Length, Petal.Length, color = Species)
) +
  geom_point(alpha = .7) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "花萼长度与花瓣长度关系(按物种)",
    x = "Sepal.Length",
    y = "Petal.Length"
  )
count(key)
检查是否一对一,避免重复行膨胀。
na.rm = TRUE;replace_na() 或组内均值填充。
across() 取代旧的 *_at/_if/_all,与 where()/选择器搭配。
.by 简化一次性分组;多步分析仍建议 group_by()/ungroup() 显式控制。
rowwise() + c_across()。
sessionInfo()、脚本版本管理。
## R version 4.4.1 (2024-06-14 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26200)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: Asia/Shanghai
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tibble_3.2.1 purrr_1.0.2 stringr_1.5.1 ggplot2_3.5.1 tidyr_1.3.1
## [6] dplyr_1.1.4
##
## loaded via a namespace (and not attached):
## [1] Matrix_1.7-1 gtable_0.3.6 jsonlite_1.8.8 compiler_4.4.1
## [5] tidyselect_1.2.1 jquerylib_0.1.4 splines_4.4.1 scales_1.3.0
## [9] yaml_2.3.10 fastmap_1.2.0 lattice_0.22-6 R6_2.5.1
## [13] labeling_0.4.3 generics_0.1.3 knitr_1.49 munsell_0.5.1
## [17] bslib_0.8.0 pillar_1.11.0 rlang_1.1.4 utf8_1.2.4
## [21] cachem_1.1.0 stringi_1.8.4 xfun_0.49 sass_0.4.9
## [25] cli_3.6.3 mgcv_1.9-1 withr_3.0.2 magrittr_2.0.3
## [29] digest_0.6.37 grid_4.4.1 rstudioapi_0.17.1 nlme_3.1-166
## [33] lifecycle_1.0.4 vctrs_0.6.5 evaluate_1.0.1 glue_1.7.0
## [37] farver_2.1.2 colorspace_2.1-1 rmarkdown_2.29 tools_4.4.1
## [41] pkgconfig_2.0.3 htmltools_0.5.8.1
Petal.Width >= 1.5 的样本,创建 Petal.Area = Petal.Length * Petal.Width,按物种计算 mean(Petal.Area) 并从大到小排序。
# Solution: keep wide-petal samples, derive the petal area, average it per
# species and list the species from largest to smallest mean area.
as_tibble(iris) |>
  filter(Petal.Width >= 1.5) |>
  mutate(Petal.Area = Petal.Length * Petal.Width) |>
  summarise(
    mean_area = mean(Petal.Area),
    .by = Species
  ) |>
  arrange(desc(mean_area))
## # A tibble: 2 × 2
## Species mean_area
## <fct> <dbl>
## 1 virginica 11.4
## 2 versicolor 7.25
species_info,把中文名并回到样本级,检查是否有未匹配的物种。
# Rebuild the species lookup table: one row per species plus its Chinese
# name. Only the distinct species are needed, so no per-species statistic is
# computed here.
species_info <- iris |>
  as_tibble() |>
  distinct(Species) |>
  mutate(
    Species_CN = case_match(
      Species,
      "setosa" ~ "山鸢尾",
      "versicolor" ~ "变色鸢尾",
      "virginica" ~ "弗吉尼亚鸢尾",
      .default = "未知"
    )
  )

# Attach the Chinese name to every sample. `relationship = "many-to-one"`
# makes the join fail if the lookup table ever holds duplicate Species keys,
# instead of silently duplicating samples.
merged <- iris |>
  as_tibble() |>
  left_join(species_info, by = "Species", relationship = "many-to-one")

# Quality check: number of samples without a Chinese name (expected: 0).
merged |>
  filter(is.na(Species_CN)) |>
  nrow()
## [1] 0
# Bar chart of the per-species standard deviation of each measurement.
as_tibble(iris) |>
  summarise(
    across(where(is.numeric), \(x) sd(x)),
    .by = Species
  ) |>
  pivot_longer(!Species, names_to = "Measure", values_to = "SD") |>
  ggplot(aes(Measure, SD, fill = Species)) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Iris:各物种四项测量的标准差", x = NULL, y = "SD")
?dplyr、vignette("dplyr")、?across、?summarise、?join、?pivot_longer。