用于比较多个项目彼此之间的位置或性能。实际值比排名重要。
有序条形图是按Y轴变量排序的条形图。仅按感兴趣的变量对数据框进行排序不足以绘制需要的条形图。为了使条形图保持行的顺序,必须将X轴变量(即类别)转换为一个因子。
让我们从mpg数据集中绘制每个制造商的平均城市里程。首先,汇总数据并对其进行排序;然后,将X变量转换为因子;最后再绘制图形。
让我们看看如何完成。
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Count how many rows of mpg belong to each manufacturer
mpg$manufacturer %>% table()
## .
## audi chevrolet dodge ford honda hyundai jeep
## 18 19 37 25 9 14 8
## land rover lincoln mercury nissan pontiac subaru toyota
## 4 3 4 13 5 14 34
## volkswagen
## 27
# Average city mileage (cty) per manufacturer, sorted from lowest to highest.
# Base R alternative:
#   aggregate(mpg$cty, by = list(mpg$manufacturer), FUN = mean)
cty_mpg <- mpg %>%
  group_by(manufacturer) %>%
  summarise(mileage = mean(cty)) %>%
  ungroup() %>%
  rename(make = manufacturer) %>%
  arrange(mileage) %>%
  # Use the sorted row order as the factor levels so the plot keeps this
  # order on the axis instead of re-sorting the categories.
  mutate(make = factor(make, levels = make))
head(cty_mpg, 15)
## # A tibble: 15 x 2
## make mileage
## <fct> <dbl>
## 1 lincoln 11.3
## 2 land rover 11.5
## 3 dodge 13.1
## 4 mercury 13.2
## 5 jeep 13.5
## 6 ford 14
## 7 chevrolet 15
## 8 pontiac 17
## 9 audi 17.6
## 10 nissan 18.1
## 11 toyota 18.5
## 12 hyundai 18.6
## 13 subaru 19.3
## 14 volkswagen 20.9
## 15 honda 24.4
X变量现在是一个因子(factor),让我们绘图。
# make is already a factor at this point, so factor() keeps its existing
# level order (the mileage order shown in the output below); the levels are
# NOT re-sorted alphabetically here
factor(cty_mpg$make)
## [1] lincoln land rover dodge mercury jeep ford
## [7] chevrolet pontiac audi nissan toyota hyundai
## [13] subaru volkswagen honda
## 15 Levels: lincoln land rover dodge mercury jeep ford chevrolet ... honda
factor(cty_mpg$make,levels = cty_mpg$make) # levels follow the order in which the values appear in make
## [1] lincoln land rover dodge mercury jeep ford
## [7] chevrolet pontiac audi nissan toyota hyundai
## [13] subaru volkswagen honda
## 15 Levels: lincoln land rover dodge mercury jeep ford chevrolet ... honda
factor(cty_mpg$make,levels = cty_mpg$make,labels = 1:15) # relabel the ordered levels as the numbers 1 to 15
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Ordered bar chart: one bar per make, drawn in the order of the factor levels
cty_mpg %>%
  ggplot(mapping = aes(x = make, y = mileage)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
# What happens when x is NOT a factor: turn make back into a character vector
cty_mpg$make <- as.character(cty_mpg$make)
str(cty_mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15 obs. of 2 variables:
## $ make : chr "lincoln" "land rover" "dodge" "mercury" ...
## $ mileage: num 11.3 11.5 13.1 13.2 13.5 ...
# Same bar chart as before, but with make supplied as a character column
ggplot(data = cty_mpg, mapping = aes(x = make, y = mileage)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  labs(
    title = "Ordered Bar Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
棒棒糖图表传达的信息与条形图相同。通过将粗条减少为细线,可以减少混乱,并更加重视该值。看起来不错,很现代。
# Preview the first rows of the data used for the lollipop chart
head(cty_mpg)
## # A tibble: 6 x 2
## make mileage
## <chr> <dbl>
## 1 lincoln 11.3
## 2 land rover 11.5
## 3 dodge 13.1
## 4 mercury 13.2
## 5 jeep 13.5
## 6 ford 14
# Lollipop chart: a point per make with a thin stem from 0 up to its mileage.
# make is converted to an ordered-by-row factor once, inside the pipeline,
# so cty_mpg itself is left unchanged.
cty_mpg %>%
  mutate(make = factor(make, levels = make)) %>%
  ggplot(aes(x = make, y = mileage)) +
  geom_point(size = 3) +
  geom_segment(aes(x = make, xend = make, y = 0, yend = mileage)) +
  labs(
    title = "Lollipop Chart",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg",
    x = "make"
  ) +
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, vjust = 0.6)
  )
点图与棒棒糖非常相似,但是没有线条,而是翻转到水平位置。它更加强调了项目相对于实际值的等级排序以及实体之间的距离。
还是先将cty_mpg转换为合适格式
# Dot plot: one point per make on a horizontal layout, with a dashed guide
# line spanning the observed mileage range for every make.

# Fix the level order to the current (sorted) row order so the plot keeps it.
cty_mpg$make <- factor(cty_mpg$make, levels = cty_mpg$make)

cty_mpg %>%
  ggplot(aes(make, mileage)) +
  geom_point(col = "tomato2", size = 3) +
  geom_segment(
    aes(
      x = make,
      xend = make,
      y = min(mileage),
      yend = max(mileage)
    ),
    linetype = "dashed",
    size = 0.1
  ) + # Draw dashed lines
  labs(
    title = "Dot Plot",
    subtitle = "Make Vs Avg. Mileage",
    caption = "source: mpg"
  ) +
  coord_flip() +
  # The complete theme must come BEFORE the theme() tweaks: adding a complete
  # theme replaces every theme setting made earlier in the chain, which is
  # why the title/axis sizes were silently lost in the previous ordering.
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 45, vjust = 0.6)
  )
# Slope chart: one line per row of df joining its 1952 value (left) to its
# 1957 value (right), coloured by whether the value went up or down.

# prep data
df <- read.csv("https://raw.githubusercontent.com/selva86/datasets/master/gdppercap.csv")
colnames(df) <- c("continent", "1952", "1957")
left_label <- paste(df$continent, round(df$`1952`), sep = ", ")
right_label <- paste(df$continent, round(df$`1957`), sep = ", ")
df$class <- ifelse((df$`1957` - df$`1952`) < 0, "red", "green")

# Upper y limit, also used as the height of the two column titles
y_top <- 1.1 * max(df$`1952`, df$`1957`)

# Plot
p <- ggplot(df) +
  geom_segment(
    aes(x = 1, xend = 2, y = `1952`, yend = `1957`, col = class),
    size = .75,
    show.legend = FALSE
  ) +
  geom_vline(xintercept = 1, linetype = "dashed", size = .1) +
  geom_vline(xintercept = 2, linetype = "dashed", size = .1) +
  scale_color_manual(
    labels = c("Up", "Down"),
    values = c("green" = "#00ba38", "red" = "#f8766d")
  ) + # color of lines
  labs(x = "", y = "Mean GdpPerCap") + # Axis labels
  xlim(.5, 2.5) +
  ylim(0, y_top) # X and Y axis limits

# Add texts: one label per row on each side
p <- p + geom_text(label = left_label, y = df$`1952`, x = rep(1, NROW(df)), hjust = 1.1, size = 3.5)
p <- p + geom_text(label = right_label, y = df$`1957`, x = rep(2, NROW(df)), hjust = -0.1, size = 3.5)

# Column titles: annotate() draws each title exactly once. geom_text() would
# inherit df and draw the same string once per row, stacked on top of itself.
p <- p + annotate("text", label = "Time 1", x = 1, y = y_top, hjust = 1.2, size = 5) # title
p <- p + annotate("text", label = "Time 2", x = 2, y = y_top, hjust = -0.1, size = 5) # title

# Minify theme
p + theme(
  panel.background = element_blank(),
  panel.grid = element_blank(),
  axis.ticks = element_blank(),
  axis.text.x = element_blank(),
  panel.border = element_blank(),
  plot.margin = unit(c(1, 2, 1, 2), "cm")
)
如果您希望:
哑铃图是一个很好的工具。为了获得正确的哑铃排序,应将Y变量设为因子,并且该因子的水平(levels)顺序应与希望在图中显示的顺序相同。
# Download the health data set used for the dumbbell chart and print it
health <- read.csv(
  file = "https://raw.githubusercontent.com/selva86/datasets/master/health.csv"
)
print(health)
## Area pct_2014 pct_2013
## 1 Houston 0.19 0.22
## 2 Miami 0.19 0.24
## 3 Dallas 0.18 0.21
## 4 San Antonio 0.15 0.19
## 5 Atlanta 0.15 0.18
## 6 Los Angeles 0.14 0.20
## 7 Tampa 0.14 0.17
## 8 Riverside, Calif. 0.14 0.19
## 9 Phoenix 0.13 0.17
## 10 Charlotte 0.13 0.15
## 11 San Diego 0.12 0.16
## 12 All Metro Areas 0.11 0.14
## 13 Chicago 0.11 0.14
## 14 New York 0.10 0.12
## 15 Denver 0.10 0.14
## 16 Washington, D.C. 0.09 0.11
## 17 Portland 0.09 0.13
## 18 St. Louis 0.09 0.10
## 19 Detroit 0.09 0.11
## 20 Philadelphia 0.08 0.10
## 21 Seattle 0.08 0.12
## 22 San Francisco 0.08 0.11
## 23 Baltimore 0.06 0.09
## 24 Pittsburgh 0.06 0.07
## 25 Minneapolis 0.06 0.08
## 26 Boston 0.04 0.04
library(ggalt)
## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Dumbbell chart: for every Area, a bar joining its 2013 and 2014 percentages.

# Use the row order of the data as the level order, so the dumbbells appear
# in that order on the y axis.
health$Area <- factor(health$Area, levels = as.character(health$Area))
# health$Area <- factor(health$Area)

gg <- ggplot(health, aes(x = pct_2013, xend = pct_2014, y = Area, group = Area)) +
  geom_dumbbell(
    color = "#a3c4dc",
    size = 0.75,
    # `point.colour.l` is not recognised by the installed ggalt (it was
    # reported as an unknown parameter and ignored); `colour_x` colours the
    # point at the `x` end of each dumbbell.
    colour_x = "#0e668b"
  ) +
  # Spell out `labels`; `label` only worked through partial argument matching
  scale_x_continuous(labels = percent) +
  labs(
    x = NULL,
    y = NULL,
    title = "Dumbbell Chart",
    subtitle = "Pct Change: 2013 vs 2014",
    caption = "Source: https://github.com/hrbrmstr/ggalt"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.background = element_rect(fill = "#f7f7f7"),
    panel.background = element_rect(fill = "#f7f7f7"),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_line(),
    axis.ticks = element_blank(),
    legend.position = "top",
    panel.border = element_blank()
  )
plot(gg)
当您有很多数据点并且想要研究数据点的位置和分布方式时。
默认情况下,如果仅提供一个变量,geom_bar()会尝试计算计数。若要让它直接按给定的数值绘制条形图,必须设置stat="identity",并同时提供x和y的值。
可以使用geom_bar()或geom_histogram()来绘制连续变量的直方图。使用geom_histogram()时,可以通过bins选项控制条形(箱)的数量;或者,可以通过binwidth设置每个箱所覆盖的范围。binwidth的取值单位与用来建立直方图的连续变量相同。
# Shared base for both histograms: engine displacement with a Spectral fill
g <- ggplot(data = mpg, mapping = aes(x = displ)) +
  scale_fill_brewer(palette = "Spectral")

# Histogram whose bins are defined by their width
g +
  geom_histogram(
    mapping = aes(fill = class),
    binwidth = 0.1, # change binwidth
    colour = "black",
    size = 0.1
  ) +
  labs(
    title = "Histogram with Auto Binning",
    subtitle = "Engine Displacement across Vehicle Classes"
  )

# Histogram whose bins are defined by their count
g +
  geom_histogram(
    mapping = aes(fill = class),
    bins = 10, # change number of bins
    colour = "black",
    size = 0.1
  ) +
  labs(
    title = "Histogram with Fixed Bins",
    subtitle = "Engine Displacement across Vehicle Classes"
  )
类别变量的直方图将得到一张频数图,为每个类别显示一个条形。通过调整width,您可以调整条形的宽度。
# Frequency bar chart of a categorical variable: row counts per manufacturer,
# with each bar split by vehicle class
g <- ggplot(data = mpg, mapping = aes(x = manufacturer))
g +
  geom_bar(mapping = aes(fill = class), width = 0.5) +
  labs(
    title = "Histogram on Categorical Variable",
    subtitle = "Manufacturer across Vehicle Classes"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
# Show the column names and types of mpg
str(mpg)
## Classes 'tbl_df', 'tbl' and 'data.frame': 234 obs. of 11 variables:
## $ manufacturer: chr "audi" "audi" "audi" "audi" ...
## $ model : chr "a4" "a4" "a4" "a4" ...
## $ displ : num 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr "f" "f" "f" "f" ...
## $ cty : int 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr "p" "p" "p" "p" ...
## $ class : chr "compact" "compact" "compact" "compact" ...
# Density of city mileage, one semi-transparent curve per cylinder count
mpg %>%
  ggplot(mapping = aes(x = cty)) +
  geom_density(mapping = aes(fill = as.factor(cyl)), alpha = 0.7) +
  # NOTE(review): a legend title is given both here ("Cyl") and in
  # labs(fill = ...) below; presumably only one of them is shown - confirm
  # which one is intended.
  scale_fill_discrete(name = "Cyl") +
  labs(
    title = "Density plot",
    subtitle = "City Mileage Grouped by Number of cylinders",
    caption = "Source: mpg",
    x = "City Mileage",
    fill = "# Cylinders"
  ) +
  ggthemes::theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
箱形图是研究分布的绝佳工具。它还可以显示多个组内的分布,以及中位数,范围和离群值(如果有)。
框内的黑线代表中位数。盒子的顶部是75%分位数,盒子的底部是25%分位数。线的端点(也称为晶须)的距离为1.5 * IQR,其中IQR或四分位范围是25到75个百分位之间的距离。晶须外的点被标记为点,通常被认为是极值点。
设置varwidth=T可将框的宽度调整为与其中包含的观察值成比例。
library(forcats)
# Box plot of city mileage per class; box widths vary with the group size
ggplot(data = mpg, mapping = aes(x = class, y = cty)) +
  geom_boxplot(varwidth = TRUE)

# Same plot with fixed-width boxes, filled by class, plus titles and sizes
mpg %>%
  arrange(desc(cty)) %>%
  ggplot(mapping = aes(x = class, y = cty)) +
  geom_boxplot(mapping = aes(fill = class), varwidth = FALSE) +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
# Box plot of city mileage per class, with one box per cylinder count
g <- ggplot(data = mpg, mapping = aes(x = class, y = cty))
g +
  geom_boxplot(mapping = aes(fill = factor(cyl))) +
  labs(
    title = "Box plot",
    subtitle = "City Mileage grouped by Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  ) +
  scale_fill_discrete(name = "cyl")
在box plot提供的信息之上,dot plot可以以汇总统计的形式为每个组提供更清晰的信息。这些点是交错排列的,每个点代表一个观察结果。因此,在下图中,给定制造商的点的数量将与源数据中该制造商的行数匹配。
# Box plot with a dot plot on top: one box per manufacturer, and one red dot
# per row of mpg stacked around the centre of each box.
ggplot(mpg, aes(manufacturer, cty)) +
  geom_boxplot() +
  geom_dotplot(
    binaxis = "y",
    stackdir = "center",
    dotsize = .5,
    fill = "red"
  ) +
  # The x variable is manufacturer, so the labels say "Manufacturer"
  # (they previously read "Class", which did not match the plotted data).
  labs(
    title = "Box plot + Dot plot",
    subtitle = "City Mileage vs Manufacturer: Each dot represents 1 row in source data",
    caption = "Source: mpg",
    x = "Manufacturer",
    y = "City Mileage"
  ) +
  theme(
    axis.text.x = element_text(angle = 65, vjust = 0.6),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
Tufte box plot,由ggthemes package提供,灵感来自爱德华·塔夫特(Edward Tufte)的作品。Tufte的箱形图只是一个极小的箱形图,在视觉上很有吸引力。
library(ggthemes)
# Tufte-style (minimal) box plot of city mileage per manufacturer.
ggplot(mpg, aes(manufacturer, cty)) +
  geom_tufteboxplot() +
  # The x variable is manufacturer, so the labels say "Manufacturer"
  # (they previously read "Class of vehicle").
  labs(
    title = "Tufte Styled Boxplot",
    subtitle = "City Mileage grouped by Manufacturer",
    caption = "Source: mpg",
    x = "Manufacturer",
    y = "City Mileage"
  ) +
  # Complete theme first: adding it after theme() would replace the
  # axis-text rotation set there, so all tweaks are applied afterwards.
  theme_economist() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10),
    axis.text.x = element_text(angle = 65, vjust = 0.6)
  )
小提琴图类似于箱形图,但显示了组内的密度。它提供的汇总信息不如箱线图多。可以使用geom_violin()来绘制它。
# Violin plot of city mileage per class with a semi-transparent box plot
# drawn on top; box-plot outliers are shown in red
ggplot(data = mpg, mapping = aes(x = class, y = cty)) +
  geom_violin(mapping = aes(fill = class)) +
  geom_boxplot(
    mapping = aes(colour = class, fill = class),
    alpha = 0.7,
    outlier.colour = "red"
  ) +
  labs(
    title = "Violin plot and box plot",
    subtitle = "City Mileage vs Class of vehicle",
    caption = "Source: mpg",
    x = "Class of Vehicle",
    y = "City Mileage"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.title = element_text(size = 15),
    axis.text = element_text(size = 10)
  )
人口金字塔提供了一种独特的方式来可视化有多少人口或人口的百分比属于某一类别。下面的金字塔是一个很好的例子,说明了在电子邮件营销活动漏斗的每个阶段有多少用户被保留。
# Download the email campaign funnel data (Stage, Gender, Users) and print it
email_campaign_funnel <- read.csv(
  file = "https://raw.githubusercontent.com/selva86/datasets/master/email_campaign_funnel.csv"
)
print(email_campaign_funnel)
## Stage Gender Users
## 1 Stage 01: Browsers Male -1.492762e+07
## 2 Stage 02: Unbounced Users Male -1.286266e+07
## 3 Stage 03: Email Signups Male -1.136190e+07
## 4 Stage 04: Email Confirmed Male -9.411708e+06
## 5 Stage 05: Campaign-Email Opens Male -8.074317e+06
## 6 Stage 06: Campaign-Email Clickthroughs Male -6.958512e+06
## 7 Stage 07: Buy Button Page Male -6.045363e+06
## 8 Stage 08: Buy Button Clickers Male -5.029954e+06
## 9 Stage 09: Cart Confirmation Page Male -4.008034e+06
## 10 Stage 10: Address Verification Page Male -3.172555e+06
## 11 Stage 11: Submit Order Page Male -2.484808e+06
## 12 Stage 12: Payment Male -1.903727e+06
## 13 Stage 13: Payment Successful Male -1.490277e+06
## 14 Stage 14: 1st Successful Purchase Male -1.152004e+06
## 15 Stage 15: 2nd Purchase Male -7.707481e+05
## 16 Stage 16: 3rd Purchase Male -4.344300e+05
## 17 Stage 17: 4th Purchase Male -1.950319e+05
## 18 Stage 18: 5th Purchase Male -5.857022e+04
## 19 Stage 18: 5th Purchase Male -1.221545e+04
## 20 Stage 18: 5th Purchase Male 8.884954e+02
## 21 Stage 18: 5th Purchase Male -4.436191e+02
## 22 Stage 01: Browsers Female 1.422643e+07
## 23 Stage 02: Unbounced Users Female 1.227604e+07
## 24 Stage 03: Email Signups Female 1.085039e+07
## 25 Stage 04: Email Confirmed Female 8.999932e+06
## 26 Stage 05: Campaign-Email Opens Female 7.732693e+06
## 27 Stage 06: Campaign-Email Clickthroughs Female 6.666394e+06
## 28 Stage 07: Buy Button Page Female 5.743260e+06
## 29 Stage 08: Buy Button Clickers Female 4.723255e+06
## 30 Stage 09: Cart Confirmation Page Female 3.680879e+06
## 31 Stage 10: Address Verification Page Female 3.002641e+06
## 32 Stage 11: Submit Order Page Female 2.467805e+06
## 33 Stage 12: Payment Female 1.977278e+06
## 34 Stage 13: Payment Successful Female 1.593650e+06
## 35 Stage 14: 1st Successful Purchase Female 1.229651e+06
## 36 Stage 15: 2nd Purchase Female 8.284969e+05
## 37 Stage 16: 3rd Purchase Female 4.866220e+05
## 38 Stage 17: 4th Purchase Female 2.271061e+05
## 39 Stage 18: 5th Purchase Female 7.346678e+04
## 40 Stage 18: 5th Purchase Female 1.265455e+04
## 41 Stage 18: 5th Purchase Female 3.425495e+03
## 42 Stage 18: 5th Purchase Female -3.066191e+02
# Axis breaks every 5 million from -15m to 15m, and their labels: the
# magnitude in millions with an "m" suffix, without a sign
brks <- seq(from = -15000000, to = 15000000, by = 5000000)
lbls <- paste0(abs(brks) / 1000000, "m")
# Pyramid plot: one horizontal bar per funnel stage and gender
email_campaign_funnel %>%
  ggplot(mapping = aes(x = Stage, y = Users, fill = Gender)) + # fill by Gender
  geom_bar(stat = "identity", width = 0.6) + # draw the bars
  scale_y_continuous(
    breaks = brks, # break positions
    labels = lbls # text shown at each break
  ) +
  coord_flip() + # flip axes so the bars run horizontally
  labs(title = "Email Campaign Funnel") +
  theme_tufte() + # Tufte theme (ggthemes is attached earlier in this file)
  theme(
    plot.title = element_text(hjust = 0.5), # centre the plot title
    axis.ticks = element_blank() # hide axis ticks
  ) +
  scale_fill_brewer(palette = "Set1") # colour palette