1 排名

用于比较多个项目彼此之间的位置或性能。此时实际值不如排名本身重要。

1.1 有序条形图

有序条形图是按Y轴变量排序的条形图。仅按感兴趣的变量对数据框进行排序,不足以得到有序的条形图。为了让条形保持数据行的顺序,必须将X轴变量(即类别)转换为因子,并以排序后的顺序作为因子的水平(levels)。

让我们用mpg数据集绘制每个制造商的平均城市里程。首先汇总数据并对其排序,然后将X变量转换为因子,最后再绘图。

让我们看看如何完成。

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Count the number of rows in mpg for each manufacturer
mpg$manufacturer %>% table()
## .
##       audi  chevrolet      dodge       ford      honda    hyundai       jeep 
##         18         19         37         25          9         14          8 
## land rover    lincoln    mercury     nissan    pontiac     subaru     toyota 
##          4          3          4         13          5         14         34 
## volkswagen 
##         27
# Mean city mileage per manufacturer, sorted from lowest to highest.
# Base R alternative: aggregate(mpg$cty, by = list(mpg$manufacturer), FUN = mean)
cty_mpg <- mpg %>%
  group_by(manufacturer) %>%
  summarise(mileage = mean(cty)) %>%
  rename(make = manufacturer) %>%
  arrange(mileage)

# Use the sorted row order as the factor levels so the plot keeps that order.
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make)
head(cty_mpg, 15)
## # A tibble: 15 x 2
##    make       mileage
##    <fct>        <dbl>
##  1 lincoln       11.3
##  2 land rover    11.5
##  3 dodge         13.1
##  4 mercury       13.2
##  5 jeep          13.5
##  6 ford          14  
##  7 chevrolet     15  
##  8 pontiac       17  
##  9 audi          17.6
## 10 nissan        18.1
## 11 toyota        18.5
## 12 hyundai       18.6
## 13 subaru        19.3
## 14 volkswagen    20.9
## 15 honda         24.4

X变量现在是一个因子(factor),让我们绘图。

factor(cty_mpg$make) # make is already a factor, so its current level order is kept (a character vector would get alphabetically sorted levels)
##  [1] lincoln    land rover dodge      mercury    jeep       ford      
##  [7] chevrolet  pontiac    audi       nissan     toyota     hyundai   
## [13] subaru     volkswagen honda     
## 15 Levels: lincoln land rover dodge mercury jeep ford chevrolet ... honda
factor(cty_mpg$make,levels = cty_mpg$make)  # levels follow the row order of make
##  [1] lincoln    land rover dodge      mercury    jeep       ford      
##  [7] chevrolet  pontiac    audi       nissan     toyota     hyundai   
## [13] subaru     volkswagen honda     
## 15 Levels: lincoln land rover dodge mercury jeep ford chevrolet ... honda
factor(cty_mpg$make,levels = cty_mpg$make,labels = 1:15)  # relabel the levels as 1..15 (the result is still a factor)
##  [1] 1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Ordered bar chart: the bars follow the factor levels of make.
cty_mpg %>%
  ggplot(aes(x = make, y = mileage)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),  # tilt the make names
    plot.title = element_text(size = 20, hjust = 0.5),    # centred title
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

# Counter-example: what happens when X is not a factor.
# make is turned back into a plain character vector before plotting.
cty_mpg$make <- as.character(cty_mpg$make)
str(cty_mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame':    15 obs. of  2 variables:
##  $ make   : chr  "lincoln" "land rover" "dodge" "mercury" ...
##  $ mileage: num  11.3 11.5 13.1 13.2 13.5 ...
ggplot(cty_mpg, aes(x = make, y = mileage)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

1.2 棒棒糖图

棒棒糖图表传达的信息与条形图相同。通过将粗条减少为细线,可以减少混乱,并更加重视该值。看起来不错,很现代。

# Preview the data; make is a character column at this point.
head(cty_mpg)
## # A tibble: 6 x 2
##   make       mileage
##   <chr>        <dbl>
## 1 lincoln       11.3
## 2 land rover    11.5
## 3 dodge         13.1
## 4 mercury       13.2
## 5 jeep          13.5
## 6 ford          14
# Lollipop chart: a point at each value plus a thin stem down to zero.
# make is converted to a factor inline so the row order is kept on the axis.
cty_mpg %>%
  ggplot(aes(x = factor(make, levels = make), y = mileage)) +
  geom_point(size = 3) +
  # x is inherited from the plot-level mapping; only the stem ends are added
  geom_segment(
    aes(
      xend = factor(make, levels = make),
      y = 0,
      yend = mileage
    )
  ) +
  labs(
    title = "Lollipop Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg",
    x = "make"
  ) +
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, vjust = 0.6)
  )

1.3 点图

点图与棒棒糖非常相似,但是没有线条,而是翻转到水平位置。它更加强调了项目相对于实际值的等级排序以及实体之间的距离。

还是先将cty_mpg转换为合适格式

# Turn make back into a factor whose levels follow the current row order,
# so the dots are drawn in that order.
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make)

# Dot plot: one point per make, with a dashed guide line spanning the full
# mileage range, flipped so the makes run along the vertical axis.
cty_mpg %>%
  ggplot(aes(make, mileage)) +
  geom_point(col = "tomato2", size = 3) +
  geom_segment(
    aes(
      x = make,
      xend = make,
      y = min(mileage),
      yend = max(mileage)
    ),
    linetype = "dashed",
    size = 0.1
  ) +   # Draw dashed lines
  labs(
    title = "Dot Plot",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  coord_flip() +
  # The complete theme must come first: adding it after theme() would replace
  # the title/axis tweaks below.
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, vjust = 0.6)
  )

1.4 坡度图

# prep data: one row per continent with a value for each of the two years
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
colnames(df) <- c("continent", "1952", "1957")
left_label <- paste(df$continent, round(df$`1952`), sep = ", ")
right_label <- paste(df$continent, round(df$`1957`), sep = ", ")
# "red" marks a decrease from 1952 to 1957, "green" everything else
df$class <- ifelse((df$`1957` - df$`1952`) < 0, "red", "green")
# Upper y limit, also used as the height of the "Time 1"/"Time 2" headers
y_top <- 1.1 * max(df$`1952`, df$`1957`)

# Plot: one segment per continent joining its 1952 value (x = 1) to its
# 1957 value (x = 2)
p <- ggplot(df) +
  geom_segment(
    aes(x = 1, xend = 2, y = `1952`, yend = `1957`, col = class),
    size = .75,
    show.legend = FALSE
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", size = .1) +
  geom_vline(xintercept = 2, linetype = "dashed", size = .1) +
  scale_color_manual(
    labels = c("Up", "Down"),
    values = c("green" = "#00ba38", "red" = "#f8766d")
  ) +  # color of lines
  labs(x = "", y = "Mean GdpPerCap") +  # Axis labels
  xlim(.5, 2.5) +
  ylim(0, y_top)  # X and Y axis limits

# Add texts: per-continent labels next to each end of the segments
p <- p + geom_text(label = left_label, y = df$`1952`, x = rep(1, NROW(df)), hjust = 1.1, size = 3.5)
p <- p + geom_text(label = right_label, y = df$`1957`, x = rep(2, NROW(df)), hjust = -0.1, size = 3.5)
# Column headers are single annotations; annotate() draws each one once
# instead of once per data row
p <- p + annotate("text", label = "Time 1", x = 1, y = y_top, hjust = 1.2, size = 5)  # title
p <- p + annotate("text", label = "Time 2", x = 2, y = y_top, hjust = -0.1, size = 5)  # title

# Minify theme
p + theme(
  panel.background = element_blank(),
  panel.grid = element_blank(),
  axis.ticks = element_blank(),
  axis.text.x = element_blank(),
  panel.border = element_blank(),
  plot.margin = unit(c(1, 2, 1, 2), "cm")
)

1.5 哑铃图

如果您希望:

  1. 可视化两个时间点之间的相对位置(如增长和下降)。
  2. 比较两个类别之间的距离。

哑铃图是一个很好的工具,为了获得正确的哑铃排序,应将Y变量作为一个因子,并且该因子变量的级别应与在图中显示的顺序相同。

# Download the health dataset (columns Area, pct_2014, pct_2013) and print it
health <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/health.csv")
health
##                 Area pct_2014 pct_2013
## 1            Houston     0.19     0.22
## 2              Miami     0.19     0.24
## 3             Dallas     0.18     0.21
## 4        San Antonio     0.15     0.19
## 5            Atlanta     0.15     0.18
## 6        Los Angeles     0.14     0.20
## 7              Tampa     0.14     0.17
## 8  Riverside, Calif.     0.14     0.19
## 9            Phoenix     0.13     0.17
## 10         Charlotte     0.13     0.15
## 11         San Diego     0.12     0.16
## 12   All Metro Areas     0.11     0.14
## 13           Chicago     0.11     0.14
## 14          New York     0.10     0.12
## 15            Denver     0.10     0.14
## 16  Washington, D.C.     0.09     0.11
## 17          Portland     0.09     0.13
## 18         St. Louis     0.09     0.10
## 19           Detroit     0.09     0.11
## 20      Philadelphia     0.08     0.10
## 21           Seattle     0.08     0.12
## 22     San Francisco     0.08     0.11
## 23         Baltimore     0.06     0.09
## 24        Pittsburgh     0.06     0.07
## 25       Minneapolis     0.06     0.08
## 26            Boston     0.04     0.04
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
##   method                  from   
##   grid.draw.absoluteGrob  ggplot2
##   grobHeight.absoluteGrob ggplot2
##   grobWidth.absoluteGrob  ggplot2
##   grobX.absoluteGrob      ggplot2
##   grobY.absoluteGrob      ggplot2
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Use the row order of Area as the factor levels so the dumbbells are drawn
# in the same order as the rows of the data.
health$Area <- factor(health$Area, levels = as.character(health$Area))  # for right ordering of the dumbells

# health$Area <- factor(health$Area)
# Dumbbell chart: each Area gets a bar from its 2013 share (x) to its 2014
# share (xend).
gg <- ggplot(health, aes(x = pct_2013, xend = pct_2014, y = Area, group = Area)) +
  geom_dumbbell(
    color = "#a3c4dc",
    size = 0.75,
    # colour_x colours the x (pct_2013) point; the former `point.colour.l`
    # argument is not recognised and was ignored with a warning
    colour_x = "#0e668b"
  ) +
  scale_x_continuous(labels = percent) +  # full argument name, no partial matching
  labs(
    x = NULL,
    y = NULL,
    title = "Dumbbell Chart",
    subtitle = "Pct Change: 2013 vs 2014",
    caption = "Source: https://github.com/hrbrmstr/ggalt"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.background = element_rect(fill = "#f7f7f7"),
    panel.background = element_rect(fill = "#f7f7f7"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_line(),
    axis.ticks = element_blank(),
    legend.position = "top",
    panel.border = element_blank()
  )
plot(gg)

2 分布图

当您有很多数据点并且想要研究数据点的位置和分布方式时。

2.1 连续变量的直方图

默认情况下,如果只提供一个变量,geom_bar()会尝试对其计数。若要让它像普通条形图那样直接使用数据中的值,必须设置stat="identity"选项,并同时提供x和y。

可以使用geom_bar()或geom_histogram()来绘制连续变量的直方图。使用geom_histogram()时,可以通过bins选项控制条形(分箱)的数量;也可以通过binwidth设置每个分箱覆盖的范围。binwidth的单位与用于绘制直方图的连续变量相同。

# Shared base plot: engine displacement on x, Spectral palette for the fill
g <- ggplot(mpg, aes(displ)) +
  scale_fill_brewer(palette = "Spectral")

# Histogram with the bin width set explicitly (0.1), stacked by vehicle class
g +
  geom_histogram(
    aes(fill = class),
    binwidth = .1,  # change binwidth
    col = "black",
    size = .1
  ) +
  labs(
    title = "Histogram with Auto Binning",
    subtitle = "Engine Displacement across Vehicle Classes"
  )

# Histogram with the number of bins set explicitly (10)
g +
  geom_histogram(
    aes(fill = class),
    bins = 10,  # change number of bins
    col = "black",
    size = .1
  ) +
  labs(
    title = "Histogram with Fixed Bins",
    subtitle = "Engine Displacement across Vehicle Classes"
  )

2.2 类别变量的直方图

对类别变量绘制直方图,会得到一张频数图,每个类别对应一个条形。通过调整width,可以改变条形的宽度。

# Frequency chart on a categorical variable: one bar per manufacturer,
# stacked by vehicle class
g <- ggplot(mpg, aes(manufacturer))
g +
  geom_bar(aes(fill = class), width = 0.5) +
  labs(
    title = "Histogram on Categorical Variable",
    subtitle = "Manufacturer across Vehicle Classes"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

2.3 密度图

# Show the structure (column names and types) of mpg
mpg %>% str()
## Classes 'tbl_df', 'tbl' and 'data.frame':    234 obs. of  11 variables:
##  $ manufacturer: chr  "audi" "audi" "audi" "audi" ...
##  $ model       : chr  "a4" "a4" "a4" "a4" ...
##  $ displ       : num  1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int  1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int  4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr  "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr  "f" "f" "f" "f" ...
##  $ cty         : int  18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int  29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr  "p" "p" "p" "p" ...
##  $ class       : chr  "compact" "compact" "compact" "compact" ...
# Density of city mileage, one semi-transparent filled curve per cylinder count.
ggplot(mpg, aes(cty)) +
  geom_density(aes(fill = factor(cyl)), alpha = 0.7) +
  labs(
    title = "Density plot",
    subtitle = "City Mileage Grouped by Number of cylinders",
    caption = "Source: mpg",
    x = "City Mileage",
    # Legend title is set in one place only. Previously the scale name "Cyl"
    # and a second, conflicting fill label were both given, and only the
    # scale name was shown.
    fill = "Cyl"
  ) +
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

2.4 箱形图

箱形图是研究分布的绝佳工具。它还可以显示多个组内的分布,以及中位数,范围和离群值(如果有)。

箱体内的黑线代表中位数。箱体的顶部是75%分位数,底部是25%分位数。须线(whisker)的端点最远延伸到距箱体1.5 * IQR处,其中IQR(四分位距)是第25与第75百分位数之间的距离。须线之外的观测值以圆点标出,通常被视为离群值。

设置varwidth=T可将框的宽度调整为与其中包含的观察值成比例。

library(forcats)
# Box plot of city mileage per class; varwidth = TRUE lets the box width
# vary with the number of observations in each class.
ggplot(mpg, aes(class, cty)) +
  geom_boxplot(varwidth = TRUE)

# Equal-width boxes, filled by class, with titles and larger text.
# NOTE(review): the rows are sorted by descending cty first, which does not
# appear to change the boxes — confirm whether reordering class was intended.
mpg %>%
  arrange(desc(cty)) %>%
  ggplot(aes(class, cty)) +
  geom_boxplot(aes(fill = class), varwidth = FALSE) +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

# Boxes split by cylinder count within each class
g <- ggplot(mpg, aes(class, cty))
g +
  geom_boxplot(aes(fill = factor(cyl))) +
  scale_fill_discrete("cyl") +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

2.5 点+箱形图

在箱形图(box plot)提供的信息之上,点图(dot plot)还能以汇总统计的形式为每个组提供更清晰的信息。这些点交错排列,每个点代表一个观察值。因此,在下图中,每个制造商对应的点数与源数据中该制造商的行数一致。

# Box plot of city mileage per manufacturer with the individual observations
# overlaid as centred, stacked dots (one dot per row of mpg).
ggplot(mpg, aes(manufacturer, cty)) +
  geom_boxplot() +
  geom_dotplot(
    binaxis = "y",        # bin along the mileage axis
    stackdir = "center",  # stack dots symmetrically around each box
    dotsize = .5,
    fill = "red"
  ) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6)) +
  labs(
    title = "Box plot + Dot plot",
    # x is mapped to manufacturer, so the labels name the manufacturer
    # rather than the vehicle class
    subtitle = "City Mileage vs Manufacturer: Each dot represents 1 row in source data",
    caption = "Source: mpg",
    x = "Manufacturer",
    y = "City Mileage"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

2.6 塔夫特箱线图

Tufte box plot,由ggthemes package提供,灵感来自爱德华·塔夫特(Edward Tufte)的作品。Tufte的箱形图只是一个极小的箱形图,在视觉上很有吸引力。

library(ggthemes)
# Tufte-style minimal box plot of city mileage per manufacturer.
ggplot(mpg, aes(manufacturer, cty)) +
  geom_tufteboxplot() +
  labs(
    title = "Tufte Styled Boxplot",
    # x is mapped to manufacturer, so the labels name the manufacturer
    # rather than the vehicle class
    subtitle = "City Mileage grouped by Manufacturer",
    caption = "Source: mpg",
    x = "Manufacturer",
    y = "City Mileage"
  ) +
  # Complete theme first; the tweaks below (including the rotated x labels)
  # would be replaced if they were added before it.
  theme_economist() +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

2.7 小提琴图

小提琴图类似于箱形图,但显示的是组内的密度分布;它本身提供的汇总信息不如箱形图多。可以使用geom_violin()来绘制它。

# Violin plot of city mileage per class with a semi-transparent box plot
# drawn on top; box-plot outliers are shown in red.
ggplot(mpg, aes(class, cty)) +
  geom_violin(aes(fill = class)) +
  geom_boxplot(
    aes(col = class, fill = class),
    alpha = 0.7,
    outlier.colour = "red"
  ) +
  labs(
    title = "Violin plot and box plot",
    subtitle = "City Mileage vs Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )

2.8 人口金字塔(表示人口分布的统计图表)

人口金字塔提供了一种独特的方式来可视化有多少人口或人口的百分比属于某一类别。下面的金字塔是一个很好的例子,说明了在电子邮件营销活动漏斗的每个阶段有多少用户被保留。

# Read data: download the funnel dataset (columns Stage, Gender, Users)
# and print it
email_campaign_funnel <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv")
email_campaign_funnel
##                                     Stage Gender         Users
## 1                      Stage 01: Browsers   Male -1.492762e+07
## 2               Stage 02: Unbounced Users   Male -1.286266e+07
## 3                 Stage 03: Email Signups   Male -1.136190e+07
## 4               Stage 04: Email Confirmed   Male -9.411708e+06
## 5          Stage 05: Campaign-Email Opens   Male -8.074317e+06
## 6  Stage 06: Campaign-Email Clickthroughs   Male -6.958512e+06
## 7               Stage 07: Buy Button Page   Male -6.045363e+06
## 8           Stage 08: Buy Button Clickers   Male -5.029954e+06
## 9        Stage 09: Cart Confirmation Page   Male -4.008034e+06
## 10    Stage 10: Address Verification Page   Male -3.172555e+06
## 11            Stage 11: Submit Order Page   Male -2.484808e+06
## 12                      Stage 12: Payment   Male -1.903727e+06
## 13           Stage 13: Payment Successful   Male -1.490277e+06
## 14      Stage 14: 1st Successful Purchase   Male -1.152004e+06
## 15                 Stage 15: 2nd Purchase   Male -7.707481e+05
## 16                 Stage 16: 3rd Purchase   Male -4.344300e+05
## 17                 Stage 17: 4th Purchase   Male -1.950319e+05
## 18                 Stage 18: 5th Purchase   Male -5.857022e+04
## 19                 Stage 18: 5th Purchase   Male -1.221545e+04
## 20                 Stage 18: 5th Purchase   Male  8.884954e+02
## 21                 Stage 18: 5th Purchase   Male -4.436191e+02
## 22                     Stage 01: Browsers Female  1.422643e+07
## 23              Stage 02: Unbounced Users Female  1.227604e+07
## 24                Stage 03: Email Signups Female  1.085039e+07
## 25              Stage 04: Email Confirmed Female  8.999932e+06
## 26         Stage 05: Campaign-Email Opens Female  7.732693e+06
## 27 Stage 06: Campaign-Email Clickthroughs Female  6.666394e+06
## 28              Stage 07: Buy Button Page Female  5.743260e+06
## 29          Stage 08: Buy Button Clickers Female  4.723255e+06
## 30       Stage 09: Cart Confirmation Page Female  3.680879e+06
## 31    Stage 10: Address Verification Page Female  3.002641e+06
## 32            Stage 11: Submit Order Page Female  2.467805e+06
## 33                      Stage 12: Payment Female  1.977278e+06
## 34           Stage 13: Payment Successful Female  1.593650e+06
## 35      Stage 14: 1st Successful Purchase Female  1.229651e+06
## 36                 Stage 15: 2nd Purchase Female  8.284969e+05
## 37                 Stage 16: 3rd Purchase Female  4.866220e+05
## 38                 Stage 17: 4th Purchase Female  2.271061e+05
## 39                 Stage 18: 5th Purchase Female  7.346678e+04
## 40                 Stage 18: 5th Purchase Female  1.265455e+04
## 41                 Stage 18: 5th Purchase Female  3.425495e+03
## 42                 Stage 18: 5th Purchase Female -3.066191e+02
# Axis breaks every 5 million from -15 million to 15 million, labelled by
# their absolute value in millions ("15m", "10m", ..., "0m", ..., "15m")
brks <- seq(-15000000, 15000000, by = 5000000)
lbls <- paste0(abs(brks) / 1000000, "m")

# Plot: mirrored horizontal bars per funnel stage, one colour per gender
email_campaign_funnel %>%
  ggplot(aes(x = Stage, y = Users, fill = Gender)) +  # fill by Gender
  geom_bar(stat = "identity", width = .6) +           # draw the bars
  scale_y_continuous(breaks = brks, labels = lbls) +  # custom breaks and labels
  coord_flip() +                                      # flip axes
  labs(title = "Email Campaign Funnel") +
  theme_tufte() +                                     # Tufte theme from ggthemes
  theme(
    plot.title = element_text(hjust = .5),            # centre plot title
    axis.ticks = element_blank()
  ) +
  scale_fill_brewer(palette = "Set1")                 # colour palette