##下载与载入所需的R包

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.9
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(modelr)
options(na.action = na.warn)

##模型可视化

为了探究钻石的切工(cut),颜色(color),纯净度(clarity)和其价格(price)的关系,我们绘制箱形图来可视化其关系,发现质量差(切工差,颜色差,纯净度低)的钻石反而价格更高。其中最差的钻石颜色为J(微黄),最差的钻石纯净度为II(肉眼可见内含物)。

plot1 <- ggplot(data=diamonds, aes(x=cut,y=price,,fill=as.factor((cut)))) +
  geom_boxplot(
           alpha=0.7,
           color="#1380A1")+
  scale_fill_brewer(palette = "Greens")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot1

plot2 <- ggplot(data=diamonds, aes(x=color,y=price,fill=as.factor((color)))) +
  geom_boxplot(
    alpha=0.7,
    color="#1380A1")+
  scale_fill_brewer(palette = "Greens")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot2

plot3 <- ggplot(data=diamonds, aes(x=clarity,y=price,fill=as.factor((clarity)))) +
  geom_boxplot(
    alpha=0.7,
    color="#1380A1")+
  scale_fill_brewer(palette = "Greens")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot3

这一现象的造成是因为一个重要混淆变量,钻石的重量(carat)。通过绘制六边形图来可视化钻石重量与钻石价格的关系,可以看出两个变量间存在强烈的线性模式。

plot4 <- ggplot(data=diamonds, aes(x=carat,y=price)) +
  geom_hex(bins=50,
           color="white")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")+
  guides(fill = guide_legend(title="count",reverse = T))
plot4

为了将钻石重量和钻石价格的关系转化为线性模式,我们重点关注小于2.5克拉的钻石,并对其重量和价格变量进行对数变换。

diamonds2 <- diamonds %>%
  filter(carat <= 2.5) %>%
  mutate(lprice=log2(price),lcarat=log2(carat))
plot5 <- ggplot(data=diamonds2, aes(x=lcarat,y=lprice)) +
  geom_hex(bins=50,
           color="white")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")+
  guides(fill = guide_legend(title="count",reverse = T))
plot5

分离重量变量的作用,移除钻石重量与价格强烈的线性模式,以便看见钻石其他特性对价格的影响。为此,我们拟合线性模型来让钻石重量和价格间的模式成为显式的,检查模型,利用模型得到预测值,并对其进行反向变换,还原对数变换。以钻石重量为x变量,价格的price预测值为y变量,绘制带线的六边形图。

mod_diamond <- lm(lprice ~ lcarat, data = diamonds2)
grid <- diamonds2 %>%
  data_grid(carat = seq_range(carat, 20)) %>%
  mutate(lcarat = log2(carat)) %>%
  add_predictions(mod_diamond, "lprice") %>%
  mutate(price = 2 ^ lprice)
grid
## # A tibble: 20 × 4
##    carat  lcarat lprice  price
##    <dbl>   <dbl>  <dbl>  <dbl>
##  1 0.2   -2.32     8.29   313.
##  2 0.321 -1.64     9.44   694.
##  3 0.442 -1.18    10.2   1188.
##  4 0.563 -0.828   10.8   1784.
##  5 0.684 -0.547   11.3   2475.
##  6 0.805 -0.312   11.7   3255.
##  7 0.926 -0.110   12.0   4119.
##  8 1.05   0.0668  12.3   5064.
##  9 1.17   0.225   12.6   6087.
## 10 1.29   0.367   12.8   7184.
## 11 1.41   0.496   13.0   8354.
## 12 1.53   0.615   13.2   9594.
## 13 1.65   0.725   13.4  10903.
## 14 1.77   0.827   13.6  12279.
## 15 1.89   0.922   13.7  13721.
## 16 2.02   1.01    13.9  15227.
## 17 2.14   1.10    14.0  16795.
## 18 2.26   1.17    14.2  18426.
## 19 2.38   1.25    14.3  20117.
## 20 2.5    1.32    14.4  21868.
plot6 <- ggplot(data=diamonds2, aes(x=carat,y=price)) +
  geom_hex(bins=50,
           color="white")+
  geom_line(data = grid, color = "red", size = 1)+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")+
  guides(fill = guide_legend(title="count",reverse = T))
plot6

检查模型的残差以验证是否移除了钻石重量与价格强烈的线性模式。可以看出残差平均分布,线性模式已被移除。

diamonds2 <- diamonds2 %>%
  add_residuals (mod_diamond,"lresid")
plot7 <- ggplot(diamonds2, aes(x=lcarat, y=lresid))+
geom_hex(bins = 50,
         color="white")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")+
  guides(fill = guide_legend(title="count",reverse = T))
plot7

再次绘制boxplot箱形图来可视化钻石的切工,颜色,纯净度和其价格的关系。这次用残差代替价格绘图。可以看见,移除线性模式后,我们得到了期望中的钻石质量和价格的关系。质量更差的钻石,价格也更低。

plot8 <- ggplot(data=diamonds2, aes(x=cut,y=lresid,,fill=as.factor((cut)))) +
  geom_boxplot(
    alpha=0.7,
    color="#1380A1")+
  scale_fill_brewer(palette = "Greens")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot8

plot9 <- ggplot(data=diamonds2, aes(x=color,y=lresid,fill=as.factor((color)))) +
  geom_boxplot(
    alpha=0.7,
    color="#1380A1")+
  scale_fill_brewer(palette = "Greens")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot9

plot10 <- ggplot(data=diamonds2, aes(x=clarity,y=lresid,fill=as.factor((clarity)))) +
  geom_boxplot(
    alpha=0.7,
    color="#1380A1")+
  scale_fill_brewer(palette = "Greens")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot10

在模型中加入切工,颜色,纯净度变量,以显现它们对价格的影响。以钻石切工cut作为data_grid的变量,得到价格price的预测值,绘制散点图以可视化切工与价格变量之间的模式。

mod_diamond2 <- lm(lprice ~ lcarat + color + cut + clarity,data = diamonds2)
grid <- diamonds2 %>%
  data_grid(cut, .model = mod_diamond2) %>%
  add_predictions (mod_diamond2)
grid
## # A tibble: 5 × 5
##   cut       lcarat color clarity  pred
##   <ord>      <dbl> <chr> <chr>   <dbl>
## 1 Fair      -0.515 G     VS2      11.2
## 2 Good      -0.515 G     VS2      11.3
## 3 Very Good -0.515 G     VS2      11.4
## 4 Premium   -0.515 G     VS2      11.4
## 5 Ideal     -0.515 G     VS2      11.4
plot11 <- ggplot(data=grid,aes(x=cut,y=pred))+
  geom_point(color="#2E8B57",size=3,alpha=0.7)+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")
plot11

检查新模型的残差,发现一些钻石有非常大的残差。

diamonds2 <- diamonds2 %>%
  add_residuals (mod_diamond2,"lresid2")
plot12 <- ggplot(diamonds2, aes(x=lcarat, y=lresid2))+
  geom_hex(bins = 50,
           color="white")+
  theme(plot.title = element_text(size = 25, face = "bold"),
        plot.subtitle = element_text(size = 18),axis.title = element_text(size = 16,color = "#222222",face = "bold"),
        axis.ticks = element_blank(),axis.text = element_text(size = 14),
        panel.grid.major.y = element_line(color = "#cbcbcb"), panel.grid.major.x = element_blank(), 
        panel.background = element_blank(), legend.title = element_text(size = 18,face = "bold"),
        legend.position = "none")+
  guides(fill = guide_legend(title="count",reverse = T))
plot12