数据可视化期末报告

Author

221527123丁成裕

1 报告要求

期末实验报告由5章节5个图形组成，每个章节需要作一个图形。
每个章节选择作什么图自主选择，作图前补充完整图形标题名称，例如：图形1——多变量条形图。
案例数据自主收集，不同章节可以共用一个数据集，但同学间不允许使用相同数据集。
每个章节的数据集合需要通过datatable 函数展示，并简要解释数据来源和变量意义。
每个输出图形后需要对图形作简要解读，最少需针对图形提出一个观点。
渲染html文件保留代码展示，6月22日前将发布网址提交至共享文档“8、期末报告” 列中。
评分标准：
- 每章节图形各20分
- 能有效输出图形和合理解释75%
- 数据独特性强10%
- 图形个性化强15%

2 类别数据可视化

2.1 案例数据解释与展示

# 加载必要的包
library(tidyverse)
library(readxl)
library(DT)
library(ggrepel)
library(cluster)
library(factoextra)
library(patchwork)

# Read the GDP workbook (its first two rows are skipped) and label the
# columns as Province followed by one column per year from 2000 to 2024.
gdp_data <- read_excel("2000-2024年中国各省GDP数据.xlsx", skip = 2)
names(gdp_data) <- c("Province", as.character(2000:2024))

# Reshape to long format: one row per Province/Year pair, with Year and GDP
# both coerced to numeric.
gdp_long <- pivot_longer(
  gdp_data,
  cols = !Province,
  names_to = "Year",
  values_to = "GDP"
) %>%
  mutate(across(c(Year, GDP), as.numeric))

# 2024 cross-section without the rows labelled "中国" (presumably the national
# total -- confirm against the workbook), largest GDP first, at most 31 rows.
section1_data <- gdp_long %>%
  filter(Province != "中国", Year == 2024) %>%
  arrange(desc(GDP)) %>%
  slice_head(n = 31)

# Show the data as an interactive table.
DT::datatable(
  section1_data,
  caption = "2024年各省GDP数据（共31个地区）",
  options = list(
    pagingType = "simple",  # compact pager
    dom = 'tip',            # table body, info line and pager only
    pageLength = 5          # five rows per page
  )
)

2.2 图形1——2024年中国各省排名条形图

# Figure 1: horizontal bar chart of 2024 GDP per province, ordered by GDP.
# Bar height and label both use GDP/10000; the fill colour tracks raw GDP.
p1 <- ggplot(
  section1_data,
  aes(x = reorder(Province, GDP), y = GDP/10000, fill = GDP)
) +
  geom_col(width = 0.8) +
  geom_text(
    aes(label = sprintf("%.1f万亿", GDP/10000)),
    hjust = -0.1,
    size = 5
  ) +
  # Stretch the value axis to 125% of the largest bar so the labels drawn
  # beyond the bar ends have room.
  coord_flip(ylim = c(0, max(section1_data$GDP/10000) * 1.25)) +
  scale_fill_gradient(low = "#5ab4ac", high = "#01665e") +
  labs(
    title = "图形1-2024年中国各省GDP排名",
    x = "省份",
    y = "GDP(万亿元)"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.y = element_text(face = "bold"),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "none"
  )

print(p1)

图形解读：2024年GDP排名显示区域经济差异显著：广东以14.16万亿元位居榜首，江苏(13.70万亿)、山东(9.86万亿)紧随其后，前三名均为东部沿海省份。长三角地区(江苏、浙江)和珠三角(广东)的经济大省占前十名GDP总和的52%。前十名中西部省份仅有四川，反映我国经济发展仍存在明显的区域不平衡现象。

3 数据分布可视化

3.1 案例数据解释与展示

# 2024 rows for every region except the rows labelled "中国".
section2_data <- filter(gdp_long, Year == 2024, Province != "中国")

# Show the data as an interactive table, sorted by GDP in descending order.
section2_data %>%
  arrange(desc(GDP)) %>%
  DT::datatable(
    caption = "2024年各省GDP数据（共31个地区）",
    options = list(
      pagingType = "simple",  # compact pager
      dom = 'tip',            # table body, info line and pager only
      pageLength = 5          # five rows per page
    )
  )

# Summary statistics used for the reference lines in the histogram.
median_gdp <- median(section2_data$GDP)
mean_gdp <- mean(section2_data$GDP)

3.2 图形2——2024年GDP分布直方图

# Figure 2: density-scaled histogram of 2024 GDP (GDP/10000 on the x axis)
# with a kernel density curve on top, plus dashed vertical reference lines
# and text labels for the mean (red) and the median (blue).
p2 <- ggplot(section2_data, aes(x = GDP/10000)) +
  # after_stat(density) replaces the deprecated ..density.. notation, which
  # triggers a deprecation warning from ggplot2 3.4.0 onwards.
  geom_histogram(aes(y = after_stat(density)), bins = 15, fill = "#80cdc1", alpha = 0.8) +
  geom_density(color = "#01665e", linewidth = 1.2) +
  geom_vline(xintercept = mean_gdp/10000, linetype = "dashed", color = "#d73027", linewidth = 1) +
  geom_vline(xintercept = median_gdp/10000, linetype = "dashed", color = "#4575b4", linewidth = 1) +
  labs(title = "图形2-2024年中国各省GDP分布直方图",
       x = "GDP(万亿元)", y = "密度") +
  # The two labels sit at fixed heights (0.15 and 0.13 on the density axis),
  # just to the right of their lines, so they do not overlap each other.
  annotate("text", x = mean_gdp/10000, y = 0.15, 
           label = sprintf("均值=%.1f万亿", mean_gdp/10000), 
           color = "#d73027", hjust = -0.1) +
  annotate("text", x = median_gdp/10000, y = 0.13, 
           label = sprintf("中位数=%.1f万亿", median_gdp/10000), 
           color = "#4575b4", hjust = -0.1) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

print(p2)

图形解读：GDP分布呈现明显右偏：均值(3.7万亿)大于中位数(1.8万亿)，表明少数经济强省拉高了整体水平。75%省份GDP低于2.5万亿元，而广东(14.16万亿)、江苏(13.70万亿)等省份构成长尾分布。这种偏态分布验证了“二八定律”在经济领域的表现，即20%的省份贡献了超过60%的全国GDP。

4 变量关系可视化

4.1 案例数据解释与展示

# One row per province with its 2000 and 2024 GDP side by side, plus the
# growth multiple (2024 GDP divided by 2000 GDP). Rows labelled "中国" are
# excluded.
section3_data <- gdp_long %>%
  filter(Province != "中国", Year %in% c(2000, 2024)) %>%
  # names_prefix yields the GDP_2000 / GDP_2024 column names directly.
  pivot_wider(names_from = Year, values_from = GDP, names_prefix = "GDP_") %>%
  mutate(Growth = GDP_2024 / GDP_2000)

# Show the data as an interactive table.
DT::datatable(
  section3_data,
  caption = "2000-2024年各省GDP增长数据（共31个地区）",
  options = list(
    pagingType = "simple",  # compact pager
    dom = 'tip',            # table body, info line and pager only
    pageLength = 5          # five rows per page
  )
)

4.2 图形3——GDP增长倍数与初始GDP关系图

# Figure 3: scatter plot of the 2000 GDP level (x) against the 2000-to-2024
# growth multiple (y). Point size encodes 2024 GDP, point colour encodes the
# growth multiple, and a straight least-squares line is drawn without a
# confidence band.
p3 <- ggplot(section3_data, aes(x = GDP_2000/10000, y = Growth)) +
  geom_point(aes(size = GDP_2024/10000, color = Growth), alpha = 0.8) +
  geom_smooth(method = "lm", se = FALSE, color = "#e66101", linewidth = 1) +
  geom_text_repel(aes(label = Province), size = 3.5, max.overlaps = 20) +
  # Print the Pearson correlation between 2000 GDP and the growth multiple
  # near the upper-right corner of the panel.
  annotate(
    "text",
    x = max(section3_data$GDP_2000)/10000*0.8,
    y = max(section3_data$Growth)*0.95,
    label = paste("相关系数 =", round(cor(section3_data$GDP_2000, section3_data$Growth), 3)),
    size = 5,
    color = "#b2182b",
    fontface = "bold"
  ) +
  scale_size_continuous(range = c(3, 12), name = "2024年GDP(万亿)") +
  scale_color_gradientn(
    colors = c("#2166ac", "#67a9cf", "#d1e5f0", "#fddbc7", "#ef8a62", "#b2182b"),
    name = "增长倍数"
  ) +
  labs(
    title = "图形3-初始GDP与增长倍数关系(2000-2024)",
    x = "2000年GDP(万亿元)",
    y = "2024年/2000年增长倍数"
  ) +
  theme_bw(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )

print(p3)

图形解读：初始GDP与增长倍数呈负相关关系(r=-0.52)：2000年GDP较低的省份增长更快。贵州(22倍)、西藏(23倍)等西部省份增速领先，但绝对增量仍较小。经济大省中，广东(13倍)、江苏(16倍)保持稳定增长。值得注意的是安徽(16倍)、陕西(20倍)等中西部省份实现“弯道超车”，反映出中西部地区发展战略成效显著。这种“追赶效应”是区域均衡发展的重要指标。

5 样本相似性可视化

5.1 案例数据解释与展示

# Prepare the heatmap data: province-to-region lookup table.
#
# Each region lists its own provinces explicitly. The earlier version paired a
# province vector with rep() run-lengths (10, 6, 12, 3) that did not line up
# with the province order, so most provinces were tagged with the wrong region.
# Grouping follows the four-region division (东部 / 中部 / 西部 / 东北).
region_groups <- list(
  "东部" = c("北京", "天津", "河北", "上海", "江苏", "浙江", "福建", "山东",
             "广东", "海南"),
  "中部" = c("山西", "安徽", "江西", "河南", "湖北", "湖南"),
  "西部" = c("内蒙古", "广西", "重庆", "四川", "贵州", "云南", "西藏", "陕西",
             "甘肃", "青海", "宁夏", "新疆"),
  "东北" = c("辽宁", "吉林", "黑龙江")
)

# Flatten to one row per province; each region name is repeated once for
# every province it contains, so the two columns always stay aligned.
region_mapping <- data.frame(
  Province = unlist(region_groups, use.names = FALSE),
  Region = rep(names(region_groups), times = lengths(region_groups))
)

# Province x year matrix of log(GDP), sampled every fourth year from 2000 to
# 2024. Rows labelled "中国" are excluded; province names become row names.
heatmap_data <- gdp_long %>%
  filter(Province != "中国", Year %in% seq(2000, 2024, by = 4)) %>%
  transmute(Province, Year, GDP_log = log(GDP)) %>%
  pivot_wider(names_from = Year, values_from = GDP_log) %>%
  column_to_rownames("Province") %>%
  as.matrix()

# Region lookup keyed by province name, and one colour per region.
region_info <- setNames(region_mapping$Region, region_mapping$Province)
region_colors <- c(
  "东部" = "#d73027",
  "中部" = "#fc8d59",
  "西部" = "#fee090",
  "东北" = "#4575b4"
)

# Show the matrix as an interactive table, with the row names restored as a
# Province column.
DT::datatable(
  rownames_to_column(as.data.frame(heatmap_data), "Province"),
  caption = "2000-2024年各省GDP对数变换数据（用于热力图分析）",
  options = list(scrollX = TRUE, pageLength = 5)
)

5.2 图形4——中国区域经济发展热力图

library(gplots)  # provides heatmap.2
library(RColorBrewer)

# 100-step colour ramp interpolated from the 9-class "YlOrRd" palette.
heatmap_colors <- colorRampPalette(brewer.pal(9, "YlOrRd"))(100)

# Side-bar colour for each matrix row: look up the row's region by province
# name, then the region's colour.
row_annotation <- region_info[rownames(heatmap_data)]
row_colors <- region_colors[row_annotation]

# Figure 4: clustered heatmap drawn with the namespace-qualified heatmap.2.
gplots::heatmap.2(
  heatmap_data,
  main = "图形4-中国区域经济发展热力图(2000-2024)",
  # Colours and scaling
  col = heatmap_colors,
  scale = "row",                    # standardise within each row
  # Clustering: rows only, columns stay in their existing (year) order
  Rowv = TRUE,
  Colv = FALSE,
  dendrogram = "row",
  # Switch off the trace lines and the density plot inside the key
  trace = "none",
  density.info = "none",
  # Colour key
  key = TRUE,
  keysize = 1.5,
  key.title = "GDP(对数标准化)",
  key.xlab = "标准化值",
  # Axis labels
  labRow = rownames(heatmap_data),
  labCol = colnames(heatmap_data),
  cexRow = 0.9,
  cexCol = 1.1,
  srtCol = 45,                      # rotate column labels by 45 degrees
  adjCol = c(1, 0.5),
  margins = c(8, 10),
  # Region colour strip next to the rows
  RowSideColors = row_colors,
  # Thin separators after every column
  colsep = seq_len(ncol(heatmap_data)),
  sepcolor = "gray90",
  sepwidth = c(0.01, 0.01)
)

# Legend explaining the region colours of the side strip.
legend(
  "topright",
  title = "区域划分",
  legend = names(region_colors),
  fill = region_colors,
  border = FALSE,
  bty = "n",
  cex = 0.9,
  inset = c(0, 0.05)
)

图形解读：
发展梯度显著：
- 东部沿海省份（红色）始终处于深红色高温区，代表经济持续高位运行
- 中部省份（橙色）呈现由浅到深的渐变，反映稳步上升趋势
- 西部省份（黄色）多数保持浅色调，但四川、陕西等省近年明显升温
增长模式分化：
- 广东、江苏形成“全深红”板块，展示持续领跑优势
- 安徽、河南呈现“阶梯式升温”，2012年后颜色显著加深
- 东北三省（蓝色）出现“降温带”，2016年后颜色变浅
关键转折期：
- 2008年全球金融危机后，东部省份普遍短暂“降温”（颜色变浅）
- 2016年供给侧改革时期，资源型省份（山西、内蒙古）明显“退热”
- 2020年后，数字经济强省（浙江、福建）保持高热，传统工业省（辽宁、黑龙江）持续降温

6 时间序列可视化

6.1 案例数据解释与展示

# Names of the ten provinces with the largest 2024 GDP, in descending order.
top10_provinces <- gdp_long %>%
  filter(Province != "中国", Year == 2024) %>%
  arrange(desc(GDP)) %>%
  slice_head(n = 10) %>%
  pull(Province)

# Full yearly series for those provinces, with GDP/10000 added as
# GDP_trillion for plotting.
ts_data_top10 <- mutate(
  filter(gdp_long, Province %in% top10_provinces),
  GDP_trillion = GDP / 10000
)

# Show the data as an interactive table.
DT::datatable(
  ts_data_top10,
  caption = "2000-2024年GDP前十省份时间序列数据",
  options = list(scrollX = TRUE, pageLength = 5)
)

6.2 图形5——中国GDP前十省份2000年来的GDP折线图

# Figure 5: yearly GDP lines for the ten provinces in top10_provinces, one
# colour per province, with each line labelled at its 2024 end point.
p5 <- ggplot(ts_data_top10, aes(x = Year, y = GDP_trillion, group = Province)) +
  geom_line(aes(color = Province), linewidth = 1.2, alpha = 0.8) +
  geom_point(aes(color = Province), size = 2) +
  # Only the 2024 rows are labelled, pushed to the right of the last point.
  geom_text_repel(data = ts_data_top10 %>% filter(Year == 2024),
                  aes(label = paste0(Province, " (", round(GDP_trillion, 1), "万亿)"), 
                      color = Province),
                  hjust = 0, nudge_x = 0.5, size = 4, fontface = "bold") +
  # The axis runs to 2028 to leave room for the end-of-line labels; tick
  # marks stop at 2024.
  scale_x_continuous(limits = c(2000, 2028), breaks = seq(2000, 2024, by = 4)) +
  labs(title = "图形5-中国GDP前十省份发展趋势（2000-2024）",
       x = "年份", y = "GDP（万亿元）",
       # Build the caption from top10_provinces so it always names the same
       # provinces that are plotted, instead of a hard-coded list that can
       # disagree with the data.
       caption = paste0("2024年GDP排名前十省份：",
                        paste(top10_provinces, collapse = "、"))) +
  scale_color_brewer(palette = "Set3") +
  theme_minimal(base_size = 14) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
        legend.position = "none",
        panel.grid.major = element_line(color = "grey90"),
        panel.grid.minor = element_blank(),
        axis.text = element_text(size = 12),
        axis.title = element_text(size = 14),
        plot.caption = element_text(size = 12, color = "grey30"))

print(p5)

图形解读：2000-2024年中国经济强省发展呈现三大特征：
1. 梯度分化明显：广东、江苏形成第一梯队(2024年均超10万亿)，山东、浙江组成第二梯队(6-9万亿)，其余省份为第三梯队(4-6万亿)
2. 增长模式差异：广东、江苏呈指数型增长，2010年后加速明显；山东、浙江为线性增长；安徽、福建等后期发力明显
3. 关键转折点：2010年后江苏加速追赶广东，2020年差距最小(仅差0.3万亿)；山东2018年后增速放缓，被浙江于2022年超越