##下载与载入所需的R包
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.9
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(modelr)
options(na.action = na.warn)
##模型可视化
为了探究钻石的切工(cut),颜色(color),纯净度(clarity)和其价格(price)的关系,我们绘制箱形图来可视化其关系,发现质量差(切工差,颜色差,纯净度低)的钻石反而价格更高。其中最差的钻石颜色为J(微黄),最差的钻石纯净度为II(肉眼可见内含物)。
plot1 <- ggplot(data=diamonds, aes(x=cut,y=price,,fill=as.factor((cut)))) +
geom_boxplot(
alpha=0.7,
color="#1380A1")+
scale_fill_brewer(palette = "Greens")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot1
plot2 <- ggplot(data=diamonds, aes(x=color,y=price,fill=as.factor((color)))) +
geom_boxplot(
alpha=0.7,
color="#1380A1")+
scale_fill_brewer(palette = "Greens")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot2
plot3 <- ggplot(data=diamonds, aes(x=clarity,y=price,fill=as.factor((clarity)))) +
geom_boxplot(
alpha=0.7,
color="#1380A1")+
scale_fill_brewer(palette = "Greens")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot3
这一现象的造成是因为一个重要混淆变量,钻石的重量(carat)。通过绘制六边形图来可视化钻石重量与钻石价格的关系,可以看出两个变量间存在强烈的线性模式。
plot4 <- ggplot(data=diamonds, aes(x=carat,y=price)) +
geom_hex(bins=50,
color="white")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")+
guides(fill = guide_legend(title="count",reverse = T))
plot4
为了将钻石重量和钻石价格的关系转化为线性模式,我们重点关注小于2.5克拉的钻石,并对其重量和价格变量进行对数变换。
diamonds2 <- diamonds %>%
filter(carat <= 2.5) %>%
mutate(lprice=log2(price),lcarat=log2(carat))
plot5 <- ggplot(data=diamonds2, aes(x=lcarat,y=lprice)) +
geom_hex(bins=50,
color="white")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")+
guides(fill = guide_legend(title="count",reverse = T))
plot5
分离重量变量的作用,移除钻石重量与价格强烈的线性模式,以便看见钻石其他特性对价格的影响。为此,我们拟合线性模型来让钻石重量和价格间的模式成为显式的,检查模型,利用模型得到预测值,并对其进行反向变换,还原对数变换。以钻石重量为x变量,价格的price预测值为y变量,绘制带线的六边形图。
mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
grid <- diamonds2 %>%
data_grid(carat = seq_range(carat, 20)) %>%
mutate(lcarat = log2(carat)) %>%
add_predictions(mod_diamond, "lprice") %>%
mutate(price = 2 ^ lprice)
grid
## # A tibble: 20 × 4
## carat lcarat lprice price
## <dbl> <dbl> <dbl> <dbl>
## 1 0.2 -2.32 8.29 313.
## 2 0.321 -1.64 9.44 694.
## 3 0.442 -1.18 10.2 1188.
## 4 0.563 -0.828 10.8 1784.
## 5 0.684 -0.547 11.3 2475.
## 6 0.805 -0.312 11.7 3255.
## 7 0.926 -0.110 12.0 4119.
## 8 1.05 0.0668 12.3 5064.
## 9 1.17 0.225 12.6 6087.
## 10 1.29 0.367 12.8 7184.
## 11 1.41 0.496 13.0 8354.
## 12 1.53 0.615 13.2 9594.
## 13 1.65 0.725 13.4 10903.
## 14 1.77 0.827 13.6 12279.
## 15 1.89 0.922 13.7 13721.
## 16 2.02 1.01 13.9 15227.
## 17 2.14 1.10 14.0 16795.
## 18 2.26 1.17 14.2 18426.
## 19 2.38 1.25 14.3 20117.
## 20 2.5 1.32 14.4 21868.
plot6 <- ggplot(data=diamonds2, aes(x=carat,y=price)) +
geom_hex(bins=50,
color="white")+
geom_line(data = grid, color = "red", size = 1)+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")+
guides(fill = guide_legend(title="count",reverse = T))
plot6
检查模型的残差以验证是否移除了钻石重量与价格强烈的线性模式。可以看出残差平均分布,线性模式已被移除。
diamonds2 <- diamonds2 %>%
add_residuals (mod_diamond,"lresid")
plot7 <- ggplot(diamonds2, aes(x=lcarat, y=lresid))+
geom_hex(bins = 50,
color="white")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")+
guides(fill = guide_legend(title="count",reverse = T))
plot7
再次绘制boxplot箱形图来可视化钻石的切工,颜色,纯净度和其价格的关系。这次用残差代替价格绘图。可以看见,移除线性模式后,我们得到了期望中的钻石质量和价格的关系。质量更差的钻石,价格也更低。
plot8 <- ggplot(data=diamonds2, aes(x=cut,y=lresid,,fill=as.factor((cut)))) +
geom_boxplot(
alpha=0.7,
color="#1380A1")+
scale_fill_brewer(palette = "Greens")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot8
plot9 <- ggplot(data=diamonds2, aes(x=color,y=lresid,fill=as.factor((color)))) +
geom_boxplot(
alpha=0.7,
color="#1380A1")+
scale_fill_brewer(palette = "Greens")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot9
plot10 <- ggplot(data=diamonds2, aes(x=clarity,y=lresid,fill=as.factor((clarity)))) +
geom_boxplot(
alpha=0.7,
color="#1380A1")+
scale_fill_brewer(palette = "Greens")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot10
在模型中加入切工,颜色,纯净度变量,以显现它们对价格的影响。以钻石切工cut作为data_grid的变量,得到价格price的预测值,绘制散点图以可视化切工与价格变量之间的模式。
mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity,data = diamonds2)
grid <- diamonds2 %>%
data_grid(cut, .model = mod_diamond2) %>%
add_predictions (mod_diamond2)
grid
## # A tibble: 5 × 5
## cut lcarat color clarity pred
## <ord> <dbl> <chr> <chr> <dbl>
## 1 Fair -0.515 G VS2 11.2
## 2 Good -0.515 G VS2 11.3
## 3 Very Good -0.515 G VS2 11.4
## 4 Premium -0.515 G VS2 11.4
## 5 Ideal -0.515 G VS2 11.4
plot11 <- ggplot(data=grid,aes(x=cut,y=pred))+
geom_point(color="#2E8B57",size=3,alpha=0.7)+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")
plot11
检查新模型的残差,发现一些钻石有非常大的残差。
diamonds2 <- diamonds2 %>%
add_residuals (mod_diamond2,"lresid2")
plot12 <- ggplot(diamonds2, aes(x=lcarat, y=lresid2))+
geom_hex(bins = 50,
color="white")+
theme(plot.title = element_text(size = 25, face = "bold"),
plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
axis.ticks = element_blank(),axis.text = element_text(size = 14),
panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(),
panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
legend.position = "none")+
guides(fill = guide_legend(title="count",reverse = T))
plot12