library(data.table)
library(ggplot2)
library(DT)
# Build the example data set: one row per department x age-group combination.
departments <- c("市场部", "技术部", "财务部", "人力资源部")
age_groups <- c("18-25", "26-35", "36-45", "46+")
satisfaction_data <- data.table(
  # Departments vary slowest, age groups fastest.
  Department = rep(departments, each = length(age_groups)),
  AgeGroup = rep(age_groups, times = length(departments)),
  # One score per department/age-group cell, listed department by department.
  Satisfaction = c(
    4.2, 4.5, 3.8, 3.5,
    3.9, 4.1, 3.7, 3.2,
    4.0, 4.3, 4.1, 3.9,
    3.7, 3.9, 3.5, 3.0
  ),
  # Number of responses for each cell, in the same order as the scores.
  ResponseCount = c(
    45, 62, 38, 28,
    52, 78, 45, 32,
    38, 55, 42, 36,
    41, 58, 39, 25
  )
)
# Show the data set as an interactive table, five rows per page, with a
# localized label for the search box.
datatable(
  satisfaction_data,
  options = list(
    pageLength = 5,
    language = list(search = "搜索:")
  )
)
数据可视化期末报告
1 报告要求
期末实验报告由5章节5个图形组成,每个章节需要作一个图形。
每个章节选择作什么图自主选择,作图前补充完整图形标题名称,例如:图形1——多变量条形图。
案例数据自主收集,不同章节可以共用一个数据集。但同学间不允许使用相同数据集。
每个章节的数据集需要通过datatable函数展示,并简要解释数据来源和变量意义。每个输出图形后需要对图形作简要解读,最少需针对图形提出一个观点。
渲染html文件保留代码展示,6月22日前将发布网址提交至共享文档“8、期末报告”列中。
评分标准:
每章节图形各20分
能有效输出图形和合理解释75%
数据独特性强10%
图形个性化强15%
2 类别数据可视化
2.1 案例数据解释与展示
数据来源: 模拟生成的客户满意度调查数据
变量解释:
- Department: 公司部门(市场部、技术部、财务部、人力资源部)
- AgeGroup: 年龄分组(18-25、26-35、36-45、46+)
- Satisfaction: 满意度评分(1-5分,1为非常不满意,5为非常满意)
- ResponseCount: 反馈人数
2.2 图形1——堆叠条形图
# Figure 1: stacked bars of the satisfaction score per department, with one
# segment per age group.
ggplot(satisfaction_data, aes(x = Department, y = Satisfaction, fill = AgeGroup)) +
  # geom_col() draws the y values as given, i.e. geom_bar(stat = "identity").
  geom_col(position = "stack") +
  # Print each segment's score at the vertical middle of that segment.
  geom_text(
    aes(label = round(Satisfaction, 1)),
    position = position_stack(vjust = 0.5),
    color = "white", size = 4
  ) +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "图形1——各部门各年龄段满意度比较",
    x = "部门",
    y = "平均满意度评分",
    fill = "年龄组"
  ) +
  theme_minimal() +
  theme(
    # Centered bold title; x labels tilted 45 degrees.
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
图形解读:
该堆叠条形图展示了不同部门各年龄段的满意度评分情况。从图中可以看出:
财务部整体满意度最高(四个年龄组评分合计16.3),市场部次之(合计16.0);单项评分最高的是市场部26-35岁年龄组(4.5分)
人力资源部的46岁以上员工满意度最低
各年龄段在财务部的满意度差异最小(3.9-4.3分,极差0.4),表明财务部的服务较为均衡
3 数据分布可视化
3.1 案例数据解释与展示
电商用户行为数据
变量解释:
- UserID: 用户唯一标识符
- Age: 用户年龄
- Gender: 用户性别(Male/Female)
- PurchaseAmount: 购买金额(元)
- SessionDuration: 会话时长(分钟)
- Device: 使用设备(Mobile/Desktop/Tablet)
library(data.table)
library(ggplot2)
library(DT)
# Fix the RNG seed so the simulated data is reproducible.
set.seed(123)
# Simulate 200 e-commerce users. The random columns are drawn in the order
# they are listed, so the seed determines every value.
user_behavior_data <- data.table(
  UserID = seq_len(200),
  # Ages are drawn from N(35, 10), rounded, then clamped to the 18-70 range.
  Age = pmax(18, pmin(70, round(rnorm(200, mean = 35, sd = 10)))),
  Gender = sample(
    c("Male", "Female"), 200,
    replace = TRUE, prob = c(0.55, 0.45)
  ),
  # abs() keeps the simulated amounts and durations non-negative.
  PurchaseAmount = round(abs(rnorm(200, mean = 500, sd = 300))),
  SessionDuration = round(abs(rnorm(200, mean = 15, sd = 8)), 1),
  Device = sample(
    c("Mobile", "Desktop", "Tablet"), 200,
    replace = TRUE, prob = c(0.6, 0.3, 0.1)
  )
)
# Show the first 10 rows. dom = "t" renders only the table body (no paging
# controls), so pageLength must cover all 10 rows or the rest stay hidden.
datatable(
  head(user_behavior_data, 10),
  options = list(
    pageLength = 10,
    dom = "t",
    scrollX = TRUE
  ),
  caption = "表1:电商用户行为数据示例(前10行)"
)
3.2 图形2——箱线图
# Figure 2: distribution of purchase amount by gender as box plots.
ggplot(user_behavior_data, aes(x = Gender, y = PurchaseAmount, fill = Gender)) +
  geom_boxplot(alpha = 0.7, outlier.color = "red") +
  # Mark each group's mean with a red diamond on top of the box.
  stat_summary(fun = mean, geom = "point", shape = 18, size = 3, color = "red") +
  # Text box between the two groups with the rounded group means. sprintf()
  # is used so the second line does not start with a stray space.
  annotate(
    "text",
    x = 1.5,
    y = max(user_behavior_data$PurchaseAmount) * 0.9,
    label = sprintf(
      "女性平均: %.0f 元\n男性平均: %.0f 元",
      round(user_behavior_data[Gender == "Female", mean(PurchaseAmount)]),
      round(user_behavior_data[Gender == "Male", mean(PurchaseAmount)])
    ),
    color = "black", size = 4
  ) +
  # Colors are named so they stay attached to the right gender.
  scale_fill_manual(values = c(Female = "#D16103", Male = "#4E84C4")) +
  labs(
    title = "图形2——不同性别购买金额分布",
    x = "性别",
    y = "购买金额(元)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    # The x axis already names the groups, so the legend is dropped.
    legend.position = "none"
  )
图形解读:
女性用户的购买金额中位数略高于男性用户
男性用户的购买金额分布范围更广,存在更多高消费异常值
两性用户的购买金额分布都呈现右偏态,说明大多数用户消费金额集中在较低区间
4 变量关系可视化
4.1 案例数据解释与展示
模拟生成的房屋销售数据
变量解释:
- Price: 房屋售价(万元)
- Area: 房屋面积(平方米)
- Bedrooms: 卧室数量
- Age: 房龄(年)
- Location: 地段等级(1-3级,1为最佳)
- HasParking: 是否有停车位(是/否)
library(ggplot2)
library(GGally) # 用于绘制变量关系矩阵图
library(DT)
set.seed(456) # fix the RNG seed so the simulated data is reproducible
# Simulate 200 house sales. Random draws happen in column order, followed by
# the noise term, so the seed determines every value.
house_data <- local({
  n_houses <- 200
  base_price <- round(abs(rnorm(n_houses, mean = 500, sd = 150)))
  area <- round(runif(n_houses, 50, 150))
  bedrooms <- sample(1:4, n_houses, replace = TRUE, prob = c(0.1, 0.4, 0.3, 0.2))
  age <- round(runif(n_houses, 1, 30))
  location <- sample(1:3, n_houses, replace = TRUE, prob = c(0.3, 0.5, 0.2))
  has_parking <- sample(c("是", "否"), n_houses, replace = TRUE, prob = c(0.6, 0.4))

  # Tie price to the other variables: larger area, more bedrooms, a better
  # (lower-numbered) location and a parking space raise it; age lowers it.
  parking_premium <- ifelse(has_parking == "是", 50, 0)
  noise <- rnorm(n_houses, mean = 0, sd = 50)
  raw_price <- base_price +
    area * 0.8 +
    bedrooms * 20 -
    age * 2 +
    (4 - location) * 30 +
    parking_premium +
    noise

  data.frame(
    # abs() keeps the price non-negative; round() keeps it a whole number.
    Price = round(abs(raw_price)),
    Area = area,
    Bedrooms = bedrooms,
    Age = age,
    Location = location,
    HasParking = has_parking
  )
})
# Show the first 10 rows as an interactive table, five rows per page, with
# horizontal scrolling enabled.
datatable(
  head(house_data, 10),
  options = list(
    pageLength = 5,
    scrollX = TRUE
  ),
  caption = "表1:房屋销售数据示例(前10行)"
)
4.2 图形3——分组散点图
# Figure 3: price against area, colored by whether the house has parking,
# with one linear fit per group.
ggplot(house_data, aes(x = Area, y = Price, color = HasParking)) +
  geom_point(alpha = 0.7, size = 2.5) +
  # The formula is spelled out to avoid geom_smooth()'s informational message.
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  # Text box with the rounded mean price of each group. sprintf() is used so
  # the second line does not start with a stray space.
  annotate(
    "text",
    x = 120,
    y = max(house_data$Price) * 0.9,
    label = sprintf(
      "有停车位: 平均价格 %.0f 万元\n无停车位: 平均价格 %.0f 万元",
      round(mean(house_data$Price[house_data$HasParking == "是"])),
      round(mean(house_data$Price[house_data$HasParking == "否"]))
    ),
    size = 4
  ) +
  scale_color_manual(values = c("#E69F00", "#56B4E9")) +
  labs(
    # This is the third figure of the report (section 4.2).
    title = "图形3——房屋价格与面积关系(按停车位分组)",
    x = "面积(平方米)",
    y = "价格(万元)",
    color = "有停车位"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "top"
  )
图形解读:
有停车位的房屋整体价格高于无停车位的房屋
两条回归线斜率相似,表明停车位对价格的增值效应在不同面积房屋中相对稳定
大面积房屋中,有无停车位的价格差异更为明显
5 样本相似性可视化
5.1 案例数据解释与展示
模拟生成的顾客特征数据
变量解释:
- CustomerID: 顾客唯一标识
- Age: 年龄
- Income: 年收入(万元)
- SpendingScore: 消费评分(1-100)
- FamilySize: 家庭成员数
- Membership: 会员类型(Gold/Silver/Bronze)
library(ggplot2)
library(ggdendro) # 用于绘制树状图
library(heatmaply) # 用于交互式热图
library(DT)
library(cluster) # 用于聚类分析
set.seed(789)
# Simulate 200 customers. Random columns are drawn in the order they appear
# in the data frame, so the seed determines every value.
customer_data <- local({
  n_customers <- 200
  raw_age <- round(rnorm(n_customers, mean = 40, sd = 10))
  income <- round(abs(rnorm(n_customers, mean = 25, sd = 8)), 1)
  spending_score <- sample(1:100, n_customers, replace = TRUE)
  family_size <- sample(
    1:5, n_customers,
    replace = TRUE, prob = c(0.1, 0.3, 0.3, 0.2, 0.1)
  )
  membership <- sample(
    c("Gold", "Silver", "Bronze"), n_customers,
    replace = TRUE, prob = c(0.2, 0.3, 0.5)
  )

  data.frame(
    CustomerID = paste0("C", 1001:1200),
    # Clamp ages to the 18-70 range.
    Age = pmax(18, pmin(70, raw_age)),
    Income = income,
    SpendingScore = spending_score,
    FamilySize = family_size,
    Membership = membership
  )
})
# Show the first 10 rows as an interactive table, five rows per page, with
# horizontal scrolling enabled.
datatable(
  head(customer_data, 10),
  options = list(
    pageLength = 5,
    scrollX = TRUE
  ),
  caption = "表1:顾客特征数据示例(前10行)"
)
5.2 图形4——层次聚类树状图
# Pick the numeric features and standardize them (center and scale each
# column) so they are on a comparable scale for the distance computation.
num_data <- scale(customer_data[, c("Age", "Income", "SpendingScore", "FamilySize")])
# Pairwise Euclidean distances between customers.
dist_matrix <- dist(num_data, method = "euclidean")
# Agglomerative hierarchical clustering with Ward's criterion.
hc <- hclust(dist_matrix, method = "ward.D2")
# Figure 4: dendrogram of the clustering, drawn without leaf labels.
ggdendrogram(hc, segments = TRUE, labels = FALSE, theme_dendro = FALSE) +
  labs(
    # This is the fourth figure of the report (section 5.2).
    title = "图形4——顾客特征层次聚类树状图",
    subtitle = "基于年龄、收入、消费评分和家庭规模"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    axis.text.y = element_blank()
  )
图形解读:
树状图显示了200名顾客的聚类过程,纵轴高度表示聚类距离
可以观察到3-5个明显的顾客群体划分
右侧分支的顾客群体特征差异较大(连接高度较高)
左侧分支的顾客群体特征较为相似(连接高度较低)
6 时间序列可视化
6.1 案例数据解释与展示
模拟生成的零售销售数据(2020-2023年)
变量解释:
- Date: 日期(日级别)
- Sales: 每日销售额(万元)
- ProductCategory: 产品类别(电子产品/家居用品/服装)
- StoreID: 门店编号(A001-A010)
- Promotion: 是否有促销活动(是/否)
library(ggplot2)
library(dplyr)
library(lubridate)
library(DT)
library(plotly) # 用于交互式时间序列图
set.seed(123)
# Daily calendar from 2020-01-01 through 2023-12-31.
dates <- seq.Date(
  from = as.Date("2020-01-01"),
  to = as.Date("2023-12-31"),
  by = "day"
)
# Simulated sales: 30 rows per day, one for each of the 3 product categories
# in each of the 10 stores.
sales_data <- local({
  categories <- c("电子产品", "家居用品", "服装")
  store_ids <- paste0("A", sprintf("%03d", 1:10))
  rows_per_day <- 30
  n_rows <- length(dates) * rows_per_day

  data.frame(
    Date = rep(dates, each = rows_per_day),
    # Each category fills a run of 10 rows while the store ids cycle 1..10,
    # so every 30-row day holds each category/store pair once.
    ProductCategory = rep(rep(categories, each = 10), length.out = n_rows),
    StoreID = rep(store_ids, length.out = n_rows),
    # Baseline sales before trend and seasonality; abs() keeps them non-negative.
    Sales = round(abs(rnorm(n_rows, mean = 5, sd = 2)), 1),
    Promotion = sample(
      c("是", "否"), n_rows,
      replace = TRUE, prob = c(0.2, 0.8)
    )
  )
})
# Layer trend, seasonality and business effects onto the baseline sales.
# Within one mutate() each `Sales =` sees the result of the previous one.
sales_data <- sales_data %>%
  mutate(
    # Yearly growth trend: +30% of the 2020 level per year (x1.0 in 2020,
    # x1.3 in 2021, x1.6 in 2022, x1.9 in 2023). The "1 +" must sit outside
    # the 0.3 factor; `(1 + year - 2020) * 0.3` would cut 2020 sales to 30%.
    Sales = Sales * (1 + (year(Date) - 2020) * 0.3),
    # Seasonality: one sine cycle per year with an amplitude of +/-50%.
    Sales = Sales * (1 + sin(yday(Date) / 365 * 2 * pi) * 0.5),
    # Promotion days sell 80% more.
    Sales = ifelse(Promotion == "是", Sales * 1.8, Sales),
    # Weekend days (wday 1 and 7) sell 30% more.
    Sales = ifelse(wday(Date) %in% c(1, 7), Sales * 1.3, Sales),
    # Category adjustment; categories not listed keep their value.
    Sales = case_when(
      ProductCategory == "电子产品" ~ Sales * 1.2,
      ProductCategory == "服装" ~ Sales * 0.9,
      TRUE ~ Sales
    ),
    # Keep sales non-negative and rounded to one decimal.
    Sales = round(abs(Sales), 1)
  )
# Show the first week of January 2023 as a sample of the sales data. The
# bounds are explicit Date values instead of strings compared against Dates.
datatable(
  sales_data %>%
    filter(
      Date >= as.Date("2023-01-01"),
      Date <= as.Date("2023-01-07")
    ) %>%
    arrange(Date),
  options = list(
    pageLength = 10,
    scrollX = TRUE
  ),
  caption = "表1:2023年1月第一周销售数据示例"
)
6.2 图形5——折线图
# 加载必要的包
library(ggplot2)
library(scales) # 用于日期格式化
# Small example series: ten consecutive days starting 2023-01-01, with a
# value, a temperature and an optional event label per day.
df <- local({
  obs_dates <- seq(as.Date("2023-01-01"), by = "day", length.out = 10)
  obs_values <- c(25, 30, 22, 28, 35, 40, 18, 27, 33, 45)
  obs_temps <- c(12.3, 14.1, 10.5, 16.0, 18.2, 20.5, 8.7, 15.3, 17.8, 22.1)
  # NA marks days without an event.
  obs_events <- c(NA, "促销", NA, "节日", NA, "促销", "雨天", NA, NA, "节日")

  data.frame(
    date = obs_dates,
    value = obs_values,
    temperature = obs_temps,
    event = obs_events
  )
})
# Figure 5: value and temperature over time on a shared date axis.
ggplot(df, aes(x = date)) +
  geom_line(aes(y = value, color = "数值"), linewidth = 1) +
  geom_point(aes(y = value, color = "数值"), size = 3) +
  # Temperature is doubled so both series fit the primary axis range.
  geom_line(aes(y = temperature * 2, color = "温度 (×2)"), linewidth = 1) +
  geom_point(aes(y = temperature * 2, color = "温度 (×2)"), size = 3) +
  # Label the days that have an event; NA labels are dropped silently.
  geom_text(aes(y = value, label = event), vjust = -1.5, na.rm = TRUE) +
  scale_y_continuous(
    name = "数值",
    # The secondary axis divides by 2 to undo the doubling above, so it
    # reads in original temperature units.
    sec.axis = sec_axis(~ . / 2, name = "温度")
  ) +
  scale_color_manual(values = c("数值" = "#1f77b4", "温度 (×2)" = "#ff7f0e")) +
  labs(
    # Prefixed with the figure number, like the other figures in the report.
    title = "图形5——时间序列数据可视化(数值 vs 温度)",
    x = "日期",
    color = "变量"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "top"
  )
图形解读:
该折线图揭示了数值受事件(促销/节日)和温度的共同影响,但极端天气(雨天)可能成为主要风险点。下一步可结合散点图分析数值与温度的统计相关性,或使用时间序列模型(如ARIMA)预测未来趋势。