第四章 数据分布可视化

Author

221527132凌文翊

1 解释原始数据

  • faithful是R语言中自带的一个经典数据集,它记录了美国黄石国家公园老忠实间歇泉(Old Faithful geyser)的喷发数据。这个数据集经常被用于统计教学和数据分析示例。

  • faithful数据集包含两个变量,共有272个观测值。

    data = faithful
    datatable(data,rownames = FALSE)
  • eruptions: 喷发持续时间,连续数值变量,以分钟为单位,范围:1.6分钟到5.1分钟。

  • waiting: 两次喷发之间的等待时间,连续数值变量,以分钟为单位,范围:43分钟到96分钟。

2 单变量直方图

2.1 绘图要求

  • 利用geom_histogram(aes(y=..density..))绘制eruptions的直方图,使用预设主题:mytheme;

  • 利用geom_rug()为直方图添加地毯图;

  • 利用geom_density()为直方图添加核密度曲线;

  • 利用annotate()在直方图标注峰度和偏度信息;

  • 利用geom_vline() 为直方图添加一条垂直的均值参考线;

  • 利用geom_point()在横轴上添加一个中位数参考点,并在点上方添加文字注释

2.2 作图代码

mytheme <- theme_bw() + 
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid = element_blank(),
    legend.position = "none"
  )

df<-data

skew <- skewness(df$eruptions)
kurt <- kurtosis(df$eruptions)
mean_val <- mean(df$eruptions)
median_val <- median(df$eruptions)

ggplot(faithful, aes(x = eruptions)) +
  geom_histogram(aes(y = ..density..), 
                 bins = 30, 
                 fill = "lightblue", 
                 color = "black") +  #密度直方图
  geom_rug(sides = "b", color = "steelblue") +  #底部的地毯图
  geom_density(color = "red", linewidth = 1) +  #红色核密度曲线
  geom_vline(xintercept = mean_val, 
             linetype = "dashed", 
             color = "blue", 
             linewidth = 1) +  #蓝色虚线表示均值线
  geom_point(aes(x = median_val, y = 0), 
             shape = 17, 
             size = 3, 
             color = "darkgreen") + #绿色三角形表示中位数点
  annotate("text", 
           x = max(faithful$eruptions)*0.7, 
           y = max(hist(faithful$eruptions, plot = FALSE)$density)*0.9,
           label = paste("偏度:", round(skew, 2), "\n峰度:", round(kurt, 2)),
           color = "darkred") +  #右上角标注偏度和峰度值
  annotate("text", 
           x = median_val, 
           y = 0.05,
           label = "中位数", 
           color = "darkgreen",
           vjust = 0) +
  labs(title = "Old Faithful 间歇泉喷发持续时间分布",
       x = "喷发持续时间 (分钟)", 
       y = "密度") +
  mytheme

2.3 图形观察和代码编写的心得体会

3 叠加直方图和镜像直方图

3.1 绘图要求

  • 绘制eruptionswaiting两个变量的叠加直方图和镜像直方图,使用预设主题:mytheme。

  • 将数据转化为长型数据再作叠加直方图,利用scale_fill_brewer()将叠加直方图配色方案改为set3

  • 镜像直方图中eruptions在正方向,waiting在负方向,直方数bins=30,并添加文字标签作标签。

  • 两种图都需要针对原始数据作图和标准标准化数据作图,可以使用scale()函数对变量标准化,分类标准化可以使用plyr::ddply()函数。

3.2 叠加直方图代码

mytheme <- theme_bw() + 
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    panel.grid = element_blank(),
    legend.position = "top",
    legend.title = element_blank()
  )
df_long <- df %>%
  pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "variable",
    values_to = "value"
  )
df_scaled <- ddply(df_long, .(variable), transform, scaled_value = scale(value))
p1 <- ggplot(df_long, aes(x = value, fill = variable)) +
  geom_histogram(
    aes(y = ..density..),
    position = "identity",
    alpha = 0.6,
    bins = 30,
    color = "black"
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "原始数据叠加直方图",
    x = "数值",
    y = "密度"
  ) +
  mytheme



p2 <- ggplot(df_scaled, aes(x = scaled_value, fill = variable)) +
  geom_histogram(
    aes(y = ..density..),
    position = "identity",
    alpha = 0.6,
    bins = 30,
    color = "black"
  ) +
  scale_fill_brewer(palette = "Set3") +
  labs(
    title = "标准化数据叠加直方图",
    x = "标准化值 (均值为0, 标准差为1)",
    y = "密度"
  ) +
  mytheme

gridExtra::grid.arrange(p1,p2,ncol=2)        # 组合图形

3.3 镜像直方图代码

mytheme <- theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    panel.grid = element_blank(),
    legend.position = "none",
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )
# 使用 scale() 标准化
df_scaled <- as.data.frame(scale(df))
names(df_scaled) <- c("eruptions_scaled", "waiting_scaled")

# 合并原始数据和标准化数据
df_combined <- cbind(df, df_scaled)

# 原始数据镜像直方图
p1 <- ggplot(df_combined) +
  
  # eruptions (正方向)
  geom_histogram(
    aes(x = eruptions, y = ..count..),
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  # waiting (负方向,用 -..count.. 翻转)
  geom_histogram(
    aes(x = waiting, y = -..count..),
    bins = 30,
    fill = "salmon",
    color = "white"
  ) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) +
  # 添加标签
  annotate("text", x = 25, y = 50, label = "Eruptions", color = "steelblue", size = 4) +
  annotate("text", x = 80, y = -50, label = "Waiting", color = "salmon", size = 4) +
  # 调整坐标轴
  scale_y_continuous(
    name = "Count",
    labels = abs,  # 负值显示为正值
    breaks = seq(-100, 100, by = 20)
  ) +
  labs(title = "(a)原始数据镜像直方图",x = '指标值') +
  mytheme

# 标准化数据镜像直方图
p2 <- ggplot(df_combined) +
  
  # eruptions_scaled (正方向)
  geom_histogram(
    aes(x = eruptions_scaled, y = ..count..),
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  # waiting_scaled (负方向)
  geom_histogram(
    aes(x = waiting_scaled, y = -..count..),
    bins = 30,
    fill = "salmon",
    color = "white"
  ) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) +
  # 添加标签
  annotate("text", x = 0, y = 50, label = "Eruptions (Scaled)", color = "steelblue", size = 4) +
  annotate("text", x = 0, y = -50, label = "Waiting (Scaled)", color = "salmon", size = 4) +
  # 调整坐标轴
  scale_y_continuous(
    name = "Count",
    labels = abs,
    breaks = seq(-100, 100, by = 20)
  ) +
  labs(title = "(b)标准化数据镜像直方图",x = '指标值') +
  mytheme

gridExtra::grid.arrange(p1,p2,ncol=2)  

3.4 图形观察和代码编写的心得体会

4 核密度图

4.1 绘图要求

  • 绘制eruptions和 waiting两个变量的分组核密度图、分面核密度图和镜像核密度图。

  • 分组核密度图,采用geom_density(position="identity")

  • 分面核密度图,采用geom_density()+facet_wrap(~xx,scale="free")

  • 镜像核密度图中eruptions在正方向,waiting在负方向,直方数bins=30,并添加文字标签作标签。

  • 分组核密度图和镜像核密度图需要针对原始数据作图和标准标准化数据作图。

4.2 分组核密度图

library(reshape2)
mytheme <- theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    panel.grid = element_blank(),
    legend.position = "top",
    legend.title = element_blank(),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )

df_scaled <- as.data.frame(scale(df))  # 标准化数据
names(df_scaled) <- c("eruptions_scaled", "waiting_scaled")

# 合并原始和标准化数据(长格式转换)
df_long <- pivot_longer(
  cbind(df, df_scaled),
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

# 原始数据核密度图
p1 <- ggplot(df, aes(x = eruptions, fill = "Eruptions")) +
  geom_density(alpha = 0.5, position = "identity") +
  geom_density(aes(x = waiting, fill = "Waiting"), alpha = 0.5, position = "identity") +
  scale_fill_manual(
    name = "Variable",
    values = c("Eruptions" = "steelblue", "Waiting" = "salmon")
  ) +
  labs(
    title = "(a)原始数据分组核密度图",
    x = "Value",
    y = "Density"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "top"
  )
p2 <- ggplot(df_scaled, aes(x = eruptions_scaled, fill = "Eruptions (Scaled)")) +
  geom_density(alpha = 0.5, position = "identity") +
  geom_density(aes(x = waiting_scaled, fill = "Waiting (Scaled)"), alpha = 0.5, position = "identity") +
  scale_fill_manual(
    name = "Variable",
    values = c("Eruptions (Scaled)" = "steelblue", "Waiting (Scaled)" = "salmon")
  ) +
  labs(
    title = "(b)标准化数据分组核密度图",
    x = "Standardized Value",
    y = "Density"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "top"
  )

gridExtra::grid.arrange(p1,p2,ncol=2)  

4.3 分面核密度图

df_long <- pivot_longer(
  df,
  cols = c(eruptions, waiting),
  names_to = "variable",
  values_to = "value"
)

# 绘制分面核密度图
ggplot(df_long, aes(x = value, fill = variable)) +
  geom_density(alpha = 0.6, color = NA) +
  facet_wrap(~variable, scales = "free") +
  scale_fill_manual(
    values = c("eruptions" = "#1f77b4", "waiting" = "#ff7f0e"),  # 确保变量名拼写一致
    guide = "none"
  ) +
  labs(
    title = "分面核密度图",
    x = "值",
    y = "密度"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    strip.background = element_rect(fill = "lightgray"),
    strip.text = element_text(size = 12)
  )

4.4 镜像核密度图

mytheme <- theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    panel.grid = element_blank(),
    legend.position = "none",
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12)
  )


# 合并原始数据和标准化数据
df_combined <- cbind(df, df_scaled)
# 原始数据镜像核密度图
p1 <- ggplot(df_combined) +
  # eruptions (正方向)
  geom_density(
    aes(x = eruptions, y = ..density..),
    fill = "steelblue",
    alpha = 0.5,
    color = "steelblue",
    linewidth = 1
  ) +
  # waiting (负方向,用 -..density.. 翻转)
  geom_density(
    aes(x = waiting, y = -..density..),
    fill = "salmon",
    alpha = 0.5,
    color = "salmon",
    linewidth = 1
  ) +
  # 添加 y=0 处的黑色水平线
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) +
  # 添加标签
  annotate("text", x = 50, y = 0.2, label = "Eruptions", color = "steelblue", size = 4) +
  annotate("text", x = 50, y = -0.2, label = "Waiting", color = "salmon", size = 4) +
  # 调整坐标轴
  scale_y_continuous(
    name = "Density",
    labels = abs,  # 负值显示为正值
    breaks = seq(-0.5, 0.5, by = 0.1)
  ) +
  labs(title = "(a)标准化数据镜像核密度图", x = "指标值") +
  mytheme

# 标准化数据镜像核密度图
p2 <- ggplot(df_combined) +
  # eruptions_scaled (正方向)
  geom_density(
    aes(x = eruptions_scaled, y = ..density..),
    fill = "steelblue",
    alpha = 0.5,
    color = "steelblue",
    linewidth = 1
  ) +
  # waiting_scaled (负方向)
  geom_density(
    aes(x = waiting_scaled, y = -..density..),
    fill = "salmon",
    alpha = 0.5,
    color = "salmon",
    linewidth = 1
  ) +
  # 添加 y=0 处的黑色水平线
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) +
  # 添加标签
  annotate("text", x = 0, y = 0.2, label = "Eruptions (Scaled)", color = "steelblue", size = 4) +
  annotate("text", x = 0, y = -0.2, label = "Waiting (Scaled)", color = "salmon", size = 4) +
  # 调整坐标轴
  scale_y_continuous(
    name = "Density",
    labels = abs,
    breaks = seq(-0.5, 0.5, by = 0.1)
  ) +
  labs(title = "(b)标准化数据镜像核密度图", x = "指标值") +
  mytheme

gridExtra::grid.arrange(p1,p2,ncol=2)  

4.5 图形观察和代码编写的心得体会

5 箱线图和小提琴图

5.1 绘图要求

  • 根据实际数据和标准化后的数据绘制eruptionswaiting两个变量的箱线图geom_boxplot和小提琴图geom_violin

  • 采用stat_summary(fun="mean",geom="point")在箱线图和均值图中要添加均值点。

  • 小提琴图中要加入点图和箱线图

  • 采用调色板前两种颜色,brewer.pal(6,"Set2")[1:2] ,作为箱体填充颜色。

"#66C2A5" "#FC8D62" "#8DA0CB" "#E78AC3" "#A6D854" "#FFD92F"

5.2 箱线图代码

df_scaled <- as.data.frame(scale(df))  # 标准化数据
names(df_scaled) <- c("eruptions_scaled", "waiting_scaled")

# 合并数据(长格式)
df_long <- pivot_longer(
  df,
  cols = c(eruptions, waiting),
  names_to = "variable",
  values_to = "value"
)

df_scaled_long <- pivot_longer(
  df_scaled,
  cols = c(eruptions_scaled, waiting_scaled),
  names_to = "variable",
  values_to = "value"
)
# 定义颜色
colors <- brewer.pal(6, "Set2")[1:2]

p1_box <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.7, width = 0.5) +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = colors) +
  labs(
    title = "(a)箱线图(原始数据)",
    x = "变量",
    y = "值"
  ) +
  theme_bw() +
  theme(legend.position = "none")

p2_box <- ggplot(df_scaled_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(alpha = 0.7, width = 0.5) +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = colors) +
  labs(
    title = "(b)箱线图(标准化数据)",
    x = "变量",
    y = "标准化值"
  ) +
  theme_bw() +
  theme(legend.position = "none")
gridExtra::grid.arrange(p1_box,p2_box,ncol=2)  

5.3 小提琴图代码

  • 通过d3r::d3_nest将数据框转化为层次数据“d3.js”作为绘图输入
p1_violin <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_violin(alpha = 0.7, trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white", alpha = 0.5) +
  geom_jitter(width = 0.1, alpha = 0.3, size = 1) +  # 添加数据点
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = colors) +
  labs(
    title = "(a)小提琴图(原始数据)",
    x = "变量",
    y = "值"
  ) +
  theme_bw() +
  theme(legend.position = "none")



p2_violin <- ggplot(df_scaled_long, aes(x = variable, y = value, fill = variable)) +
  geom_violin(alpha = 0.7, trim = FALSE) +
  geom_boxplot(width = 0.1, fill = "white", alpha = 0.5) +
  geom_jitter(width = 0.1, alpha = 0.3, size = 1) +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = colors) +
  labs(
    title = "(b)小提琴图(标准化数据)",
    x = "变量",
    y = "标准化值"
  ) +
  theme_bw() +
  theme(legend.position = "none")
gridExtra::grid.arrange(p1_violin,p2_violin,ncol=2)  

5.4 图形观察和代码编写的心得体会

6 威尔金森点图、蜂群图和云雨图

6.1 绘图要求

  • 绘制eruptionswaiting 两个变量的威尔金森点图、蜂群图和云雨图。

  • 三种图形均采用标准化数据作图

  • 威尔金森点图采用geom_dotplot(binaxis="y",bins=30,dotsize = 0.3) ,要求作出居中堆叠和向上堆叠两种情况的图。

  • 蜂群图采用geom_beeswarm(cex=0.8,shape=21,size=0.8),要求作出不带箱线图和带有箱线图两种情况的图。

  • 云雨图采用geom_violindot(dots_size=0.7,binwidth=0.07) ,要求作出横向和纵向图两种情况的图。

# 标准化数据
df <- faithful
df_scaled <- as.data.frame(scale(df))
names(df_scaled) <- c("eruptions_scaled", "waiting_scaled")

# 转为长格式
df_long <- pivot_longer(
  df_scaled,
  cols = everything(),
  names_to = "variable",
  values_to = "value"
)

6.2 威尔金森点图代码

分别作矩形热图和极坐标热图

mytheme<-theme_bw()+theme(legend.position="none")
p1_dot_center <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",  # 居中堆叠
    bins = 30,
    dotsize = 0.3,
    alpha = 0.7
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "威尔金森点图(居中堆叠)", x = "变量", y = "标准化值") +
  theme_bw() +
  theme(legend.position = "none")+
  mytheme

p1_dot_up <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_dotplot(
    binaxis = "y",
    stackdir = "up",      # 向上堆叠
    bins = 30,
    dotsize = 0.3,
    alpha = 0.7
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "威尔金森点图(向上堆叠)", x = "变量", y = "标准化值") +
  theme_bw() +
  theme(legend.position = "none")+
  mytheme

gridExtra::grid.arrange(p1_dot_center,p1_dot_up,ncol=2)  

6.3 蜂群图代码

mytheme<-theme_bw()+theme(legend.position="none")
p2_beeswarm <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_beeswarm(
    cex = 0.8,          # 点间距
    shape = 21,          # 带边框的点
    size = 0.8,
    alpha = 0.6
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "蜂群图(无箱线图)", x = "变量", y = "标准化值") +
  theme_bw() +
  theme(legend.position = "none")+
  mytheme

p2_beeswarm_box <- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot(
    width = 0.2,
    alpha = 0.5,
    outlier.shape = NA  # 隐藏箱线图的异常值点
  ) +
  geom_beeswarm(
    cex = 0.8,
    shape = 21,
    size = 0.8,
    alpha = 0.6
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "蜂群图(带箱线图)", x = "变量", y = "标准化值") +
  theme_bw() +
  theme(legend.position = "none")+
  mytheme

gridExtra::grid.arrange(p2_beeswarm,p2_beeswarm_box,ncol=2)  

6.4 云雨图代码

library(see)  # 提供主题函数theme_modern
library(ggdist)
mytheme<-theme_modern()+
         theme(legend.position="none",
               plot.title=element_text(size=14,hjust=0.5))   # 调整标题位置


p3_raincloud_vertical<- ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  # 小提琴图 + 点图
  geom_violindot(
    dots_size = 0.7, 
    binwidth = 0.07,
    alpha = 0.7,
    color = NA
  ) +
  # 添加均值点
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = colors) +
  labs(
    title = "纵向云雨图(标准化数据)",
    x = "变量",
    y = "标准化值"
  ) +
  theme_bw() +
  theme(legend.position = "none")+
  mytheme

p3_raincloud_horizontal <- ggplot(df_long, aes(y = variable, x = value, fill = variable)) +
  geom_violindot(
    dots_size = 0.7,
    binwidth = 0.07,
    alpha = 0.7,
    color = NA
  ) +
  stat_summary(
    fun = "mean",
    geom = "point",
    shape = 18,
    size = 3,
    color = "black"
  ) +
  scale_fill_manual(values = colors) +
  labs(
    title = "横向云雨图(标准化数据)",
    y = "变量",
    x = "标准化值"
  ) +
  theme_bw() +
  theme(legend.position = "none")+
  mytheme

gridExtra::grid.arrange(p3_raincloud_vertical,p3_raincloud_horizontal,ncol=2)  

6.5 图形观察和代码编写的心得体会