第四章数据分布可视化

Author

221572216程子桃

1 解释原始数据

faithful是R语言中自带的一个经典数据集，它记录了美国黄石国家公园老忠实间歇泉(Old Faithful geyser)的喷发数据。这个数据集经常被用于统计教学和数据分析示例。
faithful数据集包含两个变量，共有272个观测值。
```
# Keep a working copy of R's built-in `faithful` data set; the later sections
# all start from `data`.
data <- faithful
# Display the data as a table without the row-name column.
datatable(data, rownames = FALSE)
```
eruptions: 喷发持续时间，连续数值变量，以分钟为单位，范围：1.6分钟到5.1分钟。
waiting: 两次喷发之间的等待时间，连续数值变量，以分钟为单位，范围：43分钟到96分钟。

2 单变量直方图

2.1 绘图要求

利用geom_histogram(aes(y=..density..))绘制eruptions的直方图，使用预设主题：mytheme；
利用geom_rug()为直方图添加地毯图；
利用geom_density()为直方图添加核密度曲线；
利用annotate()在直方图标注峰度和偏度信息；
利用geom_vline() 为直方图添加一条垂直的均值参考线；
利用geom_point()在横轴上添加一个中位数参考点，并在点上方添加文字注释

2.2 作图代码

library(e1071)  # provides skewness() and kurtosis()
df <- data

# Compute the summary statistics once so every layer uses the same values.
x_mean <- mean(df$eruptions)
x_median <- median(df$eruptions)
skew_label <- paste0("偏度系数 =", round(skewness(df$eruptions), 4))
kurt_label <- paste0("峰度系数 =", round(kurtosis(df$eruptions), 4))
mean_label <- paste0("均值线=", round(x_mean, 3))

# Density-scaled histogram of eruptions with rug, kernel density curve,
# skewness/kurtosis annotations, a mean reference line and a median marker.
ggplot(data = df, aes(x = eruptions)) +
  mytheme +
  # after_stat(density) replaces the deprecated ..density.. notation;
  # bins = 30 is the stat_bin() default, stated explicitly to avoid the message.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "lightgreen", color = "gray50"
  ) +
  geom_rug(size = 0.2, color = "blue3") +  # rug marks, line width 0.2
  geom_density(color = "blue2", size = 0.7) +
  annotate("text", x = 2.5, y = 0.7, label = skew_label, size = 3) +
  annotate("text", x = 2.5, y = 0.65, label = kurt_label, size = 3) +
  # Vertical reference line at the mean, with its text label.
  geom_vline(
    xintercept = x_mean,
    linetype = "twodash", size = 0.6, color = "red"
  ) +
  annotate("text", x = x_mean, y = 0.7, label = mean_label, size = 3) +
  # annotate() draws exactly one median point; geom_point() with constant
  # x/y would inherit the data and draw one overlapping point per row.
  annotate(
    "point",
    x = x_median, y = 0, shape = 21, size = 4, fill = "yellow"
  ) +
  annotate(
    "text",
    x = x_median, y = 0.05, label = "中位数", size = 3, color = "red3"
  )

2.3 图形观察和代码编写的心得体会

该直方图通过叠加多个图层（核密度曲线、地毯图、统计量标注），并动态计算偏度、峰度、均值和中位数等关键指标，清晰呈现了 eruptions 数据的双峰分布特征，是单变量探索性分析的常用可视化方式。

3 叠加直方图和镜像直方图

3.1 绘图要求

绘制eruptions和 waiting两个变量的叠加直方图和镜像直方图，使用预设主题：mytheme。
将数据转化为长型数据再作叠加直方图，利用scale_fill_brewer()将叠加直方图配色方案改为Set3。
镜像直方图中eruptions在正方向，waiting在负方向，直方数bins=30，并添加文字标签作标签。
两种图都需要分别针对原始数据和标准化数据作图，可以使用scale()函数对变量标准化，分组标准化可以使用plyr::ddply()函数。

3.2 叠加直方图代码

# Reshape to long format (one row per observation and variable), then
# standardize the values within each variable. as.numeric() flattens the
# one-column matrix returned by scale() into a plain numeric column.
df <- data |>
  tidyr::pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) |>
  plyr::ddply("指标", transform, 标准化值 = as.numeric(scale(指标值)))

# (a) Overlaid density-scaled histograms of the raw values.
# bins = 30 is the stat_bin() default, stated explicitly to avoid the message.
p1 <- ggplot(df, aes(x = 指标值, y = after_stat(density), fill = 指标)) +
  geom_histogram(
    position = "identity", bins = 30, color = "gray60", alpha = 0.5
  ) +
  scale_fill_brewer(palette = "Set3") +
  # Preset theme first, so the legend settings below take precedence.
  mytheme +
  theme(
    legend.position = c(0.8, 0.8),  # legend placed inside the panel
    legend.background = element_rect(fill = "grey90", color = "grey")
  ) +
  ggtitle("(a) 原始数据的叠加直方图")

# (b) The same plot for the standardized values.
p2 <- ggplot(df, aes(x = 标准化值, y = after_stat(density), fill = 指标)) +
  geom_histogram(
    position = "identity", bins = 30, color = "gray60", alpha = 0.5
  ) +
  scale_fill_brewer(palette = "Set3") +
  mytheme +
  theme(
    legend.position = c(0.8, 0.8),
    legend.background = element_rect(fill = "grey90", color = "grey")
  ) +
  ggtitle("(b) 标准化数据的叠加直方图")

gridExtra::grid.arrange(p1, p2, ncol = 2)

3.3 镜像直方图代码

# Add standardized copies of both variables. as.numeric() flattens the
# one-column matrix returned by scale() into a plain numeric column.
df <- data |>
  mutate(
    std.eruptions = as.numeric(scale(eruptions)),
    std.waiting = as.numeric(scale(waiting))
  )

# p1: mirror histogram of the raw data, eruptions above the axis and
# waiting below it (negated density). Labels use annotate() so each is
# drawn once; geom_label() with constant aes() draws one copy per data row.
p1 <- ggplot(df) +
  geom_histogram(
    aes(x = eruptions, y = after_stat(density)),
    fill = "red", alpha = 0.3, bins = 30
  ) +
  annotate(
    "label",
    x = 20, y = 0.1, label = "eruptions", color = "red", vjust = 0
  ) +
  geom_histogram(
    aes(x = waiting, y = -after_stat(density)),
    fill = "blue", alpha = 0.3, bins = 30
  ) +
  annotate(
    "label",
    x = 60, y = -0.05, label = "waiting", color = "blue", vjust = 1
  ) +
  labs(
    x = "原始值",
    y = "密度",
    title = "(a) 原始数据的镜像直方图"
  ) +
  mytheme

# p2: mirror histogram of the standardized data.
p2 <- ggplot(df) +
  geom_histogram(
    aes(x = std.eruptions, y = after_stat(density)),
    fill = "red", alpha = 0.3, bins = 30
  ) +
  annotate(
    "label",
    x = -1.5, y = 0.4, label = "eruptions", color = "red", vjust = 0
  ) +
  geom_histogram(
    aes(x = std.waiting, y = -after_stat(density)),
    fill = "blue", alpha = 0.3, bins = 30
  ) +
  annotate(
    "label",
    x = -1.5, y = -0.4, label = "waiting", color = "blue", vjust = 1
  ) +
  labs(
    x = "标准化值",
    y = "密度",
    title = "(b) 标准化数据的镜像直方图"
  ) +
  mytheme

# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, ncol = 2)

3.4 图形观察和代码编写的心得体会

通过镜像直方图可直观对比eruptions和waiting的分布形态（两者均呈双峰分布），标准化后消除了量纲差异，更凸显形状特征；代码编写需注意数据长宽格式转换、图层叠加顺序和坐标轴比例协调，合理使用分面与配色能提升可视化效果。

4 核密度图

4.1 绘图要求

绘制eruptions和 waiting两个变量的分组核密度图、分面核密度图和镜像核密度图。
分组核密度图，采用geom_density(position="identity") 。
分面核密度图，采用geom_density()+facet_wrap(~xx,scale="free") 。
镜像核密度图中eruptions在正方向，waiting在负方向，并添加文字标签标注变量名（核密度图无需设置直方数bins）。
分组核密度图和镜像核密度图需要分别针对原始数据和标准化数据作图。

4.2 分组核密度图

# Reshape to long format and standardize the values within each variable.
# as.numeric() flattens the one-column matrix returned by scale().
df <- data |>
  tidyr::pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) |>
  plyr::ddply("指标", transform, 标准化值 = as.numeric(scale(指标值)))

# (a) Overlapping kernel density curves of the raw values, one per variable.
p1 <- ggplot(df, aes(x = 指标值, y = after_stat(density), fill = 指标)) +
  geom_density(position = "identity", color = "gray50", alpha = 0.5) +
  scale_fill_brewer(palette = "Set3") +
  mytheme +
  theme(
    legend.position = c(0.8, 0.7),  # legend placed inside the panel
    legend.background = element_rect(fill = "grey90", color = "grey")
  ) +
  ggtitle("(a) 原始数据的分组核密度")

# (b) The same plot for the standardized values.
p2 <- ggplot(df, aes(x = 标准化值, y = after_stat(density), fill = 指标)) +
  geom_density(position = "identity", color = "gray50", alpha = 0.5) +
  scale_fill_brewer(palette = "Set3") +
  mytheme +
  theme(
    legend.position = c(0.8, 0.7),
    legend.background = element_rect(fill = "grey90", color = "grey")
  ) +
  ggtitle("(b) 标准化值的分组核密度")

gridExtra::grid.arrange(p1, p2, ncol = 2)

4.3 分面核密度图

# Faceted kernel density plot of the raw values, one panel per variable.
# Uses the long-format `df` (columns 指标 and 指标值) built for the grouped
# density plot. scales = "free" gives each panel its own axes.
p1 <- ggplot(df, aes(x = 指标值, y = after_stat(density), fill = 指标)) +
  geom_density(position = "identity", color = "gray50", alpha = 0.5) +
  scale_fill_brewer(palette = "Set3") +
  facet_wrap(~指标, scales = "free") +
  # The facet strips already name each variable, so the fill legend is
  # removed; no legend position or background settings are needed.
  guides(fill = "none")
p1

4.4 镜像核密度图

# Keep the two variables and add standardized copies. as.numeric() flattens
# the one-column matrix returned by scale() into a plain numeric column.
df <- data %>%
  select(eruptions, waiting) %>%
  mutate(
    std_eruptions = as.numeric(scale(eruptions)),
    std_waiting = as.numeric(scale(waiting))
  )

# p1: mirror kernel density plot of the raw data, eruptions above the axis
# and waiting below it (negated density). Labels use annotate() so each is
# drawn once; geom_label() with constant aes() draws one copy per data row.
p1 <- ggplot(df) +
  geom_density(
    aes(x = eruptions, y = after_stat(density)),
    fill = "red", alpha = 0.3, color = "grey50"
  ) +
  annotate(
    "label",
    x = 20, y = 0.1, label = "eruptions", color = "red", vjust = 0
  ) +
  geom_density(
    aes(x = waiting, y = -after_stat(density)),
    fill = "blue", alpha = 0.3, color = "grey50"
  ) +
  annotate(
    "label",
    x = 25, y = -0.02, label = "waiting", color = "blue", vjust = 1
  ) +
  labs(
    x = "原始值",
    y = "密度",
    title = "(a) 原始数据的镜像核密度图"
  )

# p2: mirror kernel density plot of the standardized data.
p2 <- ggplot(df) +
  geom_density(
    aes(x = std_eruptions, y = after_stat(density)),
    fill = "red", alpha = 0.3, color = "grey50"
  ) +
  annotate(
    "label",
    x = -0.5, y = 0.4, label = "eruptions", color = "red", vjust = 0
  ) +
  geom_density(
    aes(x = std_waiting, y = -after_stat(density)),
    fill = "blue", alpha = 0.3, color = "grey50"
  ) +
  annotate(
    "label",
    x = -0.5, y = -0.02, label = "waiting", color = "blue", vjust = 1
  ) +
  labs(
    x = "标准化值",
    y = "密度",
    title = "(b) 标准化数据的镜像核密度图"
  )

# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, ncol = 2)

4.5 图形观察和代码编写的心得体会

核密度图通过平滑曲线展现变量分布特征，分组图用重叠曲线对比形状，分面图独立展示各组细节，镜像图通过正负轴实现直观对比。

5 箱线图和小提琴图

5.1 绘图要求

根据实际数据和标准化后的数据绘制eruptions和waiting两个变量的箱线图geom_boxplot和小提琴图geom_violin。
采用stat_summary(fun="mean",geom="point")在箱线图和均值图中要添加均值点。
小提琴图中要加入点图和箱线图
采用调色板前两种颜色，brewer.pal(6,"Set2")[1:2] ，作为箱体填充颜色。

"#66C2A5" "#FC8D62" "#8DA0CB" "#E78AC3" "#A6D854" "#FFD92F"

5.2 箱线图代码

# Reshape to long format and standardize within each variable.
# The dplyr verbs are namespace-qualified because plyr is also used in this
# file: if plyr masks mutate(), the grouping would be silently ignored.
df <- data %>%
  pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) %>%
  dplyr::group_by(指标) %>%
  dplyr::mutate(标准化值 = as.numeric(scale(指标值))) %>%
  dplyr::ungroup()

# First two colours of the Set2 palette. brewer.pal() needs n >= 3, so
# request 6 and subset instead of asking for 2 (which triggers a warning).
palette <- RColorBrewer::brewer.pal(6, "Set2")[1:2]

# (a) Box plots of the raw values, with the mean marked as a white diamond.
p1 <- ggplot(df, aes(x = 指标, y = 指标值, fill = 指标)) +
  geom_boxplot(outlier.size = 0.8, width = 0.6) +
  stat_summary(
    fun = mean, geom = "point", shape = 23, size = 3, fill = "white"
  ) +
  scale_fill_manual(values = palette) +
  labs(y = "原始值", title = "(a) 原始数据箱线图") +
  theme(legend.position = "none")

# (b) Box plots of the standardized values.
p2 <- ggplot(df, aes(x = 指标, y = 标准化值, fill = 指标)) +
  geom_boxplot(outlier.size = 0.8, width = 0.6) +
  stat_summary(
    fun = mean, geom = "point", shape = 23, size = 3, fill = "white"
  ) +
  scale_fill_manual(values = palette) +
  # Fix the visible y range by zooming; scale limits would instead drop
  # out-of-range observations before the box statistics are computed.
  coord_cartesian(ylim = c(-3, 3)) +
  labs(y = "标准化值", title = "(b) 标准化数据箱线图") +
  theme(legend.position = "none")

# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, ncol = 2)

5.3 小提琴图代码

先将数据转化为长型数据并按指标分组标准化，再绘制叠加了点图和箱线图的小提琴图。

# Reshape to long format and standardize the values within each variable.
# as.numeric() flattens the one-column matrix returned by scale().
df <- data |>
  tidyr::pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) |>
  plyr::ddply("指标", transform, 标准化值 = as.numeric(scale(指标值)))

# First two colours of the Set2 palette, used as the violin fill colours.
palette <- RColorBrewer::brewer.pal(6, "Set2")[1:2]

# (a) Violin plot of the raw values with the observations, a narrow white
# box plot and the group mean drawn on top.
p1 <- ggplot(df, aes(x = 指标, y = 指标值, fill = 指标)) +
  geom_violin(scale = "width", trim = FALSE) +
  geom_point(color = "black", size = 0.8) +
  geom_boxplot(
    outlier.size = 0.7, outlier.color = "white", size = 0.3,
    width = 0.2, fill = "white"
  ) +
  scale_fill_manual(values = palette) +
  stat_summary(fun = mean, geom = "point", shape = 21, size = 2) +
  guides(fill = "none") +
  ggtitle("(a) 原始数据小提琴图")

# (b) Violin plot of the standardized values with a narrow white box plot.
# NOTE(review): unlike panel (a), this panel has no observation points or
# mean marker; confirm whether that difference is intended.
p2 <- ggplot(df, aes(x = 指标, y = 标准化值, fill = 指标)) +
  geom_violin(scale = "width") +
  geom_boxplot(
    outlier.size = 0.7, outlier.color = "black", size = 0.3,
    width = 0.2, fill = "white"
  ) +
  scale_fill_manual(values = palette) +
  guides(fill = "none") +
  ggtitle("(b) 标准化小提琴图")

# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, ncol = 2)

5.4 图形观察和代码编写的心得体会

箱线图清晰展示五数概括和离群点，小提琴图结合核密度展示分布细节，添加均值点增强中心趋势对比。

6 威尔金森点图、蜂群图和云雨图

6.1 绘图要求

绘制eruptions和 waiting 两个变量的威尔金森点图、蜂群图和云雨图。
三种图形均采用标准化数据作图
威尔金森点图采用geom_dotplot(binaxis="y",bins=30,dotsize = 0.3) ，要求作出居中堆叠和向上堆叠两种情况的图。
蜂群图采用geom_beeswarm(cex=0.8,shape=21,size=0.8)，要求作出不带箱线图和带有箱线图两种情况的图。
云雨图采用geom_violindot(dots_size=0.7,binwidth=0.07) ，要求作出横向和纵向图两种情况的图。

6.2 威尔金森点图代码

分别作居中堆叠和向上堆叠两种情况的威尔金森点图

# Black-and-white theme without a legend, shared by both panels.
mytheme <- theme_bw() + theme(legend.position = "none")

# Reshape to long format and standardize the values within each variable.
# as.numeric() flattens the one-column matrix returned by scale().
df <- data |>
  tidyr::pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) |>
  plyr::ddply("指标", transform, 标准化值 = as.numeric(scale(指标值)))

# Common base: standardized values on the y axis, one column per variable.
p <- ggplot(df, aes(x = 指标, y = 标准化值, fill = 指标))

# geom_dotplot() has no `bins` argument (it is ignored with a warning);
# its default binwidth is already 1/30 of the data range.
# (a) Dots stacked symmetrically around each column's centre line.
p1 <- p +
  geom_dotplot(binaxis = "y", dotsize = 0.3, stackdir = "center") +
  mytheme +
  ggtitle("(a) 居中堆叠")

# (b) Dots stacked in one direction ("up" is the geom_dotplot default).
p2 <- p +
  geom_dotplot(binaxis = "y", dotsize = 0.3, stackdir = "up") +
  mytheme +
  ggtitle("(b) 向上堆叠")

gridExtra::grid.arrange(p1, p2, ncol = 2)

6.3 蜂群图代码

library(ggbeeswarm)  # provides geom_beeswarm()

# Black-and-white theme without a legend, shared by both panels.
mytheme <- theme_bw() + theme(legend.position = "none")

# Reshape to long format and standardize the values within each variable.
# as.numeric() flattens the one-column matrix returned by scale().
df <- data |>
  tidyr::pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) |>
  plyr::ddply("指标", transform, 标准化值 = as.numeric(scale(指标值)))

# Common base: standardized values of the two variables.
p <- ggplot(df, aes(x = 指标, y = 标准化值))

# (a) Beeswarm plot; cex sets the point spacing, shape 21 with size 0.8
# draws small open circles.
p1 <- p +
  geom_beeswarm(cex = 0.8, shape = 21, size = 0.8) +
  mytheme +
  ggtitle("(a) 蜂群图")

# (b) Box plots (outline coloured by variable) with the beeswarm on top.
p2 <- p +
  geom_boxplot(size = 0.5, outlier.size = 0.8, aes(color = 指标)) +
  geom_beeswarm(cex = 0.8, shape = 21, size = 0.8) +
  mytheme +
  ggtitle("(b) 箱线图+蜂群图")

gridExtra::grid.arrange(p1, p2, ncol = 2)

6.4 云雨图代码

library(see)  # provides theme_modern() and geom_violindot()

# Modern theme without a legend and with a centred 14 pt title.
mytheme <- theme_modern() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 14, hjust = 0.5)
  )

# Reshape to long format and standardize the values within each variable.
# as.numeric() flattens the one-column matrix returned by scale().
df1 <- data |>
  tidyr::pivot_longer(
    cols = c(eruptions, waiting),
    names_to = "指标",
    values_to = "指标值"
  ) |>
  plyr::ddply("指标", transform, 标准化值 = as.numeric(scale(指标值)))

# Both panels use the dot size and bin width given in the plotting
# requirements (dots_size = 0.7, binwidth = 0.07), so they differ only in
# orientation.
# (a) Default orientation.
p1 <- ggplot(df1, aes(x = 指标, y = 标准化值, fill = 指标)) +
  geom_violindot(dots_size = 0.7, binwidth = 0.07) +
  mytheme +
  ggtitle("(a) 垂直排列(默认)")

# (b) Same plot with the axes flipped.
p2 <- ggplot(df1, aes(x = 指标, y = 标准化值, fill = 指标)) +
  geom_violindot(dots_size = 0.7, binwidth = 0.07) +
  coord_flip() +
  mytheme +
  ggtitle("(b) 水平排列")

# Show both panels side by side.
gridExtra::grid.arrange(p1, p2, ncol = 2)

6.5 图形观察和代码编写的心得体会

威尔金森点图通过堆叠点展示数据密度分布，蜂群图避免重叠同时保留原始数据位置，云雨图结合核密度与点阵实现双重可视化。