完整的ggplot2教程-第3部分(1)|相关性

针对哪种类型的问题使用哪种类型的可视化？本教程可帮助您为特定目标选择正确的图表类型，以及如何在R中使用ggplot2实现图表。

这是ggplot2三部分教程第3部分第一小节，主要介绍涉及相关性的图表。

有效的图表是：

传达正确的信息而不会扭曲事实。
简单但优雅。读者不必费力思考就能看懂。
美观应服务于信息，而不是掩盖信息。
没有过多的信息。

下面的列表根据其主要目的对可视化进行分类。大体上，绘图的目的可以分为8类。因此，在实际绘制图表之前，请先弄清楚您想通过可视化传达或检查哪些发现和关系。它很有可能属于这8个类别中的一个（有时不止一个）。

相关性：以下图表有助于检查两个变量之间的相关程度。

1 散点图

数据分析中最常用的图无疑是散点图。每当您想了解两个变量之间关系的性质时，首选始终是散点图。

可以使用geom_point()绘制散点图。另外，geom_smooth()默认情况下会绘制一条平滑线（基于loess），通过设置method = "lm"可以改为绘制最佳拟合直线。

library(tidyverse)

## -- Attaching packages ------------------------ tidyverse 1.3.0 --

## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0

## -- Conflicts --------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Load the midwest data set that ships with ggplot2 and print its structure
# (one line per column: name, type and the first few values).
data("midwest", package = "ggplot2")
str(midwest)

## Classes 'tbl_df', 'tbl' and 'data.frame':    437 obs. of  28 variables:
##  $ PID                 : int  561 562 563 564 565 566 567 568 569 570 ...
##  $ county              : chr  "ADAMS" "ALEXANDER" "BOND" "BOONE" ...
##  $ state               : chr  "IL" "IL" "IL" "IL" ...
##  $ area                : num  0.052 0.014 0.022 0.017 0.018 0.05 0.017 0.027 0.024 0.058 ...
##  $ poptotal            : int  66090 10626 14991 30806 5836 35688 5322 16805 13437 173025 ...
##  $ popdensity          : num  1271 759 681 1812 324 ...
##  $ popwhite            : int  63917 7054 14477 29344 5264 35157 5298 16519 13384 146506 ...
##  $ popblack            : int  1702 3496 429 127 547 50 1 111 16 16559 ...
##  $ popamerindian       : int  98 19 35 46 14 65 8 30 8 331 ...
##  $ popasian            : int  249 48 16 150 5 195 15 61 23 8033 ...
##  $ popother            : int  124 9 34 1139 6 221 0 84 6 1596 ...
##  $ percwhite           : num  96.7 66.4 96.6 95.3 90.2 ...
##  $ percblack           : num  2.575 32.9 2.862 0.412 9.373 ...
##  $ percamerindan       : num  0.148 0.179 0.233 0.149 0.24 ...
##  $ percasian           : num  0.3768 0.4517 0.1067 0.4869 0.0857 ...
##  $ percother           : num  0.1876 0.0847 0.2268 3.6973 0.1028 ...
##  $ popadults           : int  43298 6724 9669 19272 3979 23444 3583 11323 8825 95971 ...
##  $ perchsd             : num  75.1 59.7 69.3 75.5 68.9 ...
##  $ percollege          : num  19.6 11.2 17 17.3 14.5 ...
##  $ percprof            : num  4.36 2.87 4.49 4.2 3.37 ...
##  $ poppovertyknown     : int  63628 10529 14235 30337 4815 35107 5241 16455 13081 154934 ...
##  $ percpovertyknown    : num  96.3 99.1 95 98.5 82.5 ...
##  $ percbelowpoverty    : num  13.15 32.24 12.07 7.21 13.52 ...
##  $ percchildbelowpovert: num  18 45.8 14 11.2 13 ...
##  $ percadultpoverty    : num  11.01 27.39 10.85 5.54 11.14 ...
##  $ percelderlypoverty  : num  12.44 25.23 12.7 6.22 19.2 ...
##  $ inmetro             : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ category            : chr  "AAR" "LHR" "AAR" "ALU" ...

# Total number of missing cells across the whole data frame.
sum(is.na(midwest))

## [1] 0

# Number of rows that have no missing value in any column.
sum(complete.cases(midwest))

## [1] 437

# Dimensions of the data frame as c(rows, columns).
dim(midwest)

## [1] 437  28

# Scatterplot of county area against total population for the midwest data.
# Points are coloured by state and sized by population density; a loess
# smoother (without confidence band) shows the overall trend.
# xlim()/ylim() discard the observations that fall outside the limits, which
# is what triggers the "Removed ... rows" warnings when the plot is drawn.
gg <- ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) +
  geom_smooth(method = "loess", se = FALSE) +
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  labs(
    title = "Scatterplot",
    subtitle = "Area Vs Population",
    x = "Area",
    y = "Population",
    caption = "Source: midwest"
  )

# theme_economist() is a complete theme: adding it replaces every theme()
# setting added earlier. The text-size tweaks are therefore applied once,
# after the complete theme, instead of being duplicated on `gg` where they
# would be thrown away.
gg +
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    plot.subtitle = element_text(size = 15),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

2 带有区域的散点图

在介绍结果时，有时我会在图表中加上某些特殊的点或区域组，以便引起人们对那些特殊情况的注意。使用geom_encircle()可以方便地完成此操作。

在geom_encircle()中，将data设置为仅包含感兴趣的点（行）的新数据框。此外，您可以通过expand让曲线向外扩展，使其从这些点的外侧绕过。曲线的color和size（粗细）也可以修改。请参见以下示例。

# Make the Economist theme from ggthemes the default theme for later plots.
theme_set(ggthemes::theme_economist())
# ggalt supplies geom_encircle(); if it is not installed, it can be obtained with:
# devtools::install_github("hrbrmstr/ggalt")
library(ggalt)

## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2

# Counties to highlight: total population in (350000, 500000] and area in
# (0.01, 0.1). Base R equivalent of the dplyr call below:
# midwest_select <- midwest[midwest$poptotal > 350000 &
#                             midwest$poptotal <= 500000 &
#                             midwest$area > 0.01 &
#                             midwest$area < 0.1, ]
midwest_select <- midwest %>%
  filter(
    poptotal > 350000, poptotal <= 500000,
    area > 0.01, area < 0.1
  )

# Plot: the full scatterplot plus a red outline around the selected counties.
ggplot(midwest, aes(x = area, y = poptotal)) +
  geom_point(aes(col = state, size = popdensity)) + # draw points
  geom_smooth(method = "loess", se = FALSE) + # draw smoothing line
  xlim(c(0, 0.1)) +
  ylim(c(0, 500000)) +
  geom_encircle(
    aes(x = area, y = poptotal),
    data = midwest_select, # only the highlighted rows are encircled
    color = "red",
    size = 2,
    expand = 0.08
  ) + # encircle
  labs(
    title = "Scatterplot + Encircle",
    subtitle = "Area Vs Population",
    x = "Area",
    y = "Population",
    caption = "Source: midwest"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    plot.subtitle = element_text(size = 15),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

## Warning: Removed 15 rows containing non-finite values (stat_smooth).

## Warning: Removed 15 rows containing missing values (geom_point).

3 抖动图

让我们看一下绘制散点图的新数据。这次，我将使用mpg数据集绘制城市里程（cty）与高速公路里程（hwy）的关系。

# City mileage against highway mileage from the mpg data, with a straight
# line fit (no confidence band). Many observations share the same integer
# coordinates, so fewer points are visible than there are rows.
mpg %>%
  ggplot(aes(x = cty, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Scatterplot with overlapping points",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy",
    caption = "Source: mpg" # the data plotted here is mpg, not midwest
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    plot.subtitle = element_text(size = 15),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

这里是mpg数据集中城市里程和公路里程的散点图。我们已经看过类似的散点图：它看起来很整洁，并且清楚地展示了城市里程（cty）和公路里程（hwy）之间的相关性。

但是，这张看似无害的图隐藏了一些东西。你能发现吗？

# Dimensions of mpg as c(rows, columns), to compare the row count with the
# number of points visible in the scatterplot.
dim(mpg)

## [1] 234  11

原始数据有234个数据点，但该图表似乎显示较少的点。发生了什么事？这是因为有许多重叠的点显示为单个点。因此，下次您使用整数创建散点图时，请格外小心。

那么如何处理呢？有几种选择。我们可以使用geom_jitter()绘制抖动图。顾名思义，重叠的点会在其原始位置周围随机抖动，抖动的幅度由width参数控制。

# Jittered version of the cty/hwy scatterplot: each point is moved by a small
# random amount so that observations sharing the same coordinates no longer
# hide each other.
g <- ggplot(data = mpg, mapping = aes(x = cty, y = hwy))
g +
  geom_jitter(size = 1, width = 0.5) +
  labs(
    title = "Jittered Points",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 15),
    plot.subtitle = element_text(size = 15),
    plot.title = element_text(size = 20, hjust = 0.5)
  )

width越大，点被移动得越远，重叠的点就越容易区分开。

4 计数图

解决数据点重叠问题的第二种方法是使用所谓的计数图。同一位置重叠的点越多，圆就越大。

# Counts plot: one circle per distinct (cty, hwy) pair, with the circle size
# showing how many observations fall on that pair. The size legend is hidden.
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_count(show.legend = FALSE, colour = "tomato3") +
  labs(
    title = "Counts Plot",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 15),
    plot.subtitle = element_text(size = 15),
    plot.title = element_text(size = 20, hjust = 0.5)
  )

# The same counts plot, this time with the size legend displayed so the
# circle sizes can be read off as observation counts.
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_count(show.legend = TRUE, colour = "tomato3") +
  labs(
    title = "Counts Plot",
    subtitle = "mpg: city vs highway mileage",
    x = "cty",
    y = "hwy"
  ) +
  theme(
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 15),
    plot.subtitle = element_text(size = 15),
    plot.title = element_text(size = 20, hjust = 0.5)
  )

5 气泡图

尽管散点图可让您比较2个连续变量之间的关系，但如果您想基于以下内容理解基础组内的关系，则气泡图非常有用。

分类变量（通过更改颜色）
另一个连续变量（通过更改点的大小）。

用简单的话来说，如果您有4维数据，其中两个是数字（X和Y），另一个是分类（颜色），另一个是数字变量（大小），则气泡图更适合。

气泡图清楚地区分了制造商之间的范围以及最佳拟合线的斜率如何变化，从而提供了组之间更好的可视比较。

# Keep only four manufacturers so the per-group comparison stays readable.
mpg_select <- mpg %>%
  filter(manufacturer %in% c("audi", "ford", "honda", "hyundai"))

# Bubble chart: engine displacement against city mileage, coloured by
# manufacturer and sized by highway mileage, with one linear fit per
# manufacturer (no confidence band).
mpg_select %>%
  ggplot(aes(x = displ, y = cty)) +
  geom_jitter(aes(col = manufacturer, size = hwy)) +
  geom_smooth(aes(col = manufacturer), method = "lm", se = FALSE) +
  labs(
    title = "Bubble chart",
    subtitle = "mpg: Displacement vs City Mileage"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    plot.subtitle = element_text(size = 15),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

6 动画气泡图

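可以使用gganimate程序包实现动画气泡图。它与气泡图相同，只是还需要展示这些值如何随第五个维度（通常是时间）变化。

关键是在aes()中把frame设置为要据以生成动画的那一列（即aes(frame = ...)）。其余与绘图有关的步骤不变。绘制完图后，调用gganimate()并设置合适的interval（每帧的时间间隔）即可生成动画。（注：frame美学映射和gganimate()函数属于gganimate 1.0之前的旧接口；1.0及之后的版本改用transition_*()系列函数配合animate()。）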

# install.packages("gganimate")
library(gganimate)
library(gapminder)

# Browse the gapminder data as an interactive table.
gapminder %>% DT::datatable()

# Static plot: GDP per capita (log scale) against life expectancy, point size
# by population, one panel per continent, and one linear fit per year.
g <- ggplot(gapminder, aes(gdpPercap, lifeExp, size = pop)) +
  geom_jitter() +
  geom_smooth(
    aes(group = year),
    method = "lm",
    show.legend = FALSE
  ) +
  facet_wrap(~continent, scales = "free") +
  scale_x_log10() # convert to log scale
g

# gganimate >= 1.0 (what install.packages("gganimate") provides) no longer has
# the `frame` aesthetic or the gganimate() function. transition_manual() maps
# each distinct value of `year` to one frame without tweening, and animate()
# renders the result; fps = 5 corresponds to the former interval of 0.2 s.
anim <- g + transition_manual(year)
animate(anim, fps = 5)

7 边际直方图/箱线图

如果要在同一图表中显示关系和分布，请使用边际直方图。它在散点图的边缘处具有X和Y变量的直方图。

可以使用ggExtra包中的ggMarginal()函数来实现。除了histogram之外，您还可以通过设置type选项来绘制边际boxplot或density图。

library(ggExtra)
data(mpg, package = "ggplot2")

# Scatterplot
# NOTE(review): mpg_select is not used by any plot in this section; it is kept
# in case code outside this view relies on it -- confirm before removing.
mpg_select <- mpg[mpg$hwy >= 35 & mpg$cty > 27, ]

# Base plot: counts of coincident (cty, hwy) points plus a linear fit.
g <- ggplot(mpg, aes(cty, hwy)) +
  geom_count(show.legend = FALSE) +
  geom_smooth(method = "lm", se = FALSE)
g

# Add the marginal distributions of cty and hwy around the base plot, once
# for each supported marginal type.
ggMarginal(g, type = "histogram", fill = "transparent")
ggMarginal(g, type = "boxplot", fill = "transparent")
ggMarginal(g, type = "density", fill = "transparent")

8 相关图

library(ggcorrplot)

# Correlation matrix
# Pairwise correlations between all mtcars columns, rounded to one decimal.
data(mtcars)
corr <- round(cor(mtcars), digits = 1)

# Correlogram: lower triangle only, circles as glyphs, coefficients printed
# as labels, using the Economist theme.
ggcorrplot(
  corr,
  method = "circle",
  type = "lower",
  hc.order = TRUE,
  lab = TRUE,
  lab_size = 3,
  colors = c("tomato2", "white", "springgreen3"),
  title = "Correlogram of mtcars",
  ggtheme = ggthemes::theme_economist()
)