使用 geom_boxplot() 创建箱线图,并了解如何通过其各项参数以多种方式自定义箱线图。

Boxplot theory

library(ggplot2); library(dplyr);
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(viridis); library(ggExtra)
## Loading required package: viridisLite
# Simulate three groups whose densities and boxplots are compared below:
# "red" is a Beta(0.3, 2) sample scaled by 4, "green" and "orange" are normal
# samples with different means and standard deviations.
# Seed the generator first so the simulated data is the same on every run,
# matching the other simulations in this document.
set.seed(1)
red <- rbeta(200, shape1 = 0.3, shape2 = 2) * 4
green <- rnorm(1000, mean = 2)
orange <- rnorm(1000, mean = 3.5, sd = 0.5)

# Long-format table with one row per draw. `box` is a per-group constant that
# the plot below uses as the vertical position of that group's boxplot.
group_sizes <- c(red = 200, green = 1000, orange = 1000)
xx <- data.frame(
  group = rep(names(group_sizes), times = group_sizes),
  value = c(red, green, orange),
  box = rep(c(0.4, 0.7, 1.0), times = group_sizes)
)

# One colour per group; manual scales assign them to the groups in sorted
# order (green, orange, red).
col_vector <- c("#72F281", "#357BF0", "#F0624D")

# Overlay each group's density curve with a boxplot of the same values.
# The boxplots are positioned vertically at the per-group constant `box`.
xx %>%
  ggplot(aes(x = value, color = group, fill = group)) +
  geom_density(alpha = 0.4) +
  # alpha is a fixed setting, so it belongs outside aes(); inside aes() it
  # would be mapped through an alpha scale instead of being applied as 0.8.
  geom_boxplot(aes(y = box), alpha = 0.8, color = "black") +
  theme_void() +
  theme(legend.position = "none") +
  labs(x = "", y = "") +
  scale_color_manual(values = col_vector) +
  scale_fill_manual(values = col_vector)

# Draw 1000 standard-normal values (seeded for reproducibility) and show them
# side by side as an index plot and as a boxplot.
set.seed(3)
rnorm_1000 <- rnorm(n = 1000, mean = 0, sd = 1)

# par() returns the previous settings; keep them so the two-panel layout does
# not leak into the base-graphics plots drawn later.
old_par <- par(mfrow = c(1, 2))
plot(rnorm_1000, las = 1)
boxplot(rnorm_1000, las = 1)
par(old_par)

# Minimum, quartiles, median, mean and maximum of the simulated sample.
summary(rnorm_1000)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -3.056328 -0.684539  0.032338  0.006397  0.676673  3.519299
# Count the observations strictly between the (rounded) first and third
# quartiles, i.e. inside the box: 496 are inside, 504 are not.
table(rnorm_1000 > -0.68 & rnorm_1000 < 0.67)
## 
## FALSE  TRUE 
##   504   496
# The element names of the summary; positions 2 and 5 are the two quartiles.
names(summary(rnorm_1000))
## [1] "Min."    "1st Qu." "Median"  "Mean"    "3rd Qu." "Max."
# Box height (interquartile range): 3rd quartile minus 1st quartile. The outer
# parentheses print the assigned value. The printed name "3rd Qu." is just the
# name diff() carries over. NOTE: this variable shares its name with the
# stats::IQR() function.
(IQR <- diff(summary(rnorm_1000)[c(2, 5)]))
##  3rd Qu. 
## 1.361212
# Extract the third quartile (75th percentile).
summary(rnorm_1000)["3rd Qu."]
##   3rd Qu. 
## 0.6766734
# Third quartile + 1.5 times the box height (IQR); observations beyond this
# limit are treated as outliers.
summary(rnorm_1000)["3rd Qu."] + 1.5 * IQR # 2.718491
##  3rd Qu. 
## 2.718491
# The six largest observations; the last two lie above the upper fence
# (2.718491) computed above.
tail(sort(rnorm_1000))
## [1] 2.536236 2.595481 2.635045 2.676632 3.200590 3.519299

# Compute both reference lines from the data instead of pasting rounded
# numbers, so the lines stay correct if the sample changes.
# upper_fence: third quartile + 1.5 * interquartile range.
# upper_whisker: the largest observation that does not exceed the fence.
upper_fence <- unname(quantile(rnorm_1000, 0.75)) + 1.5 * stats::IQR(rnorm_1000)
upper_whisker <- max(rnorm_1000[rnorm_1000 <= upper_fence])

# Zoom the boxplot in on the upper tail.
boxplot(rnorm_1000, las = 1, ylim = c(2.2, 3.6))
abline(h = upper_fence, lty = 2, lwd = 1, col = "red")
abline(h = upper_whisker, lty = 2, lwd = 1, col = "black")
# Overlay every observation, jittered horizontally around x = 1, so the
# individual points near the fence can be told apart.
points(
  x = jitter(rep(1, length(rnorm_1000)), factor = 4),
  y = rnorm_1000,
  col = "red"
)

# The six smallest observations; the first two lie below the lower fence.
head(sort(rnorm_1000))
## [1] -3.056328 -3.053300 -2.665698 -2.545858 -2.403664 -2.398453
# Lower fence: first quartile minus 1.5 times the box height.
summary(rnorm_1000)["1st Qu."] - 1.5 * IQR
##   1st Qu. 
## -2.726356

# Compute both reference lines from the data instead of pasting rounded
# numbers, so the lines stay correct if the sample changes.
# lower_fence: first quartile - 1.5 * interquartile range.
# lower_whisker: the smallest observation that is not below the fence.
lower_fence <- unname(quantile(rnorm_1000, 0.25)) - 1.5 * stats::IQR(rnorm_1000)
lower_whisker <- min(rnorm_1000[rnorm_1000 >= lower_fence])

# Zoom the boxplot in on the lower tail.
boxplot(rnorm_1000, las = 1, ylim = c(-3.1, -1.7))
abline(h = lower_fence, lty = 2, lwd = 1, col = "red")
abline(h = lower_whisker, lty = 2, lwd = 1, col = "black")
# Overlay every observation, jittered horizontally around x = 1.
points(
  x = jitter(rep(1, length(rnorm_1000)), factor = 4),
  y = rnorm_1000,
  col = "red"
)

Basic ggplot2 boxplot

# Peek at the built-in mtcars data before plotting.
head(mtcars)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
# Default boxplot of mpg per cylinder count. cyl is stored as a number, so it
# is converted to a factor to get one box per distinct value.
mtcars %>%
  ggplot(aes(x = as.factor(cyl), y = mpg)) +
  geom_boxplot()

# Same boxes with one translucent fill colour and a plain x-axis title.
ggplot(data = mtcars, mapping = aes(x = as.factor(cyl), y = mpg)) +
  geom_boxplot(fill = "navyblue", alpha = 0.4) +
  labs(x = "cyl")

# Fill each box according to its cylinder group instead of a fixed colour.
ggplot(
  data = mtcars,
  mapping = aes(x = as.factor(cyl), y = mpg, fill = as.factor(cyl))
) +
  geom_boxplot(alpha = 0.4) +
  labs(x = "cyl")

geom_boxplot() parameters

# ?geom_boxplot()

# Peek at the mpg data set that ships with ggplot2.
head(mpg)
## # A tibble: 6 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…
# Shared canvas for this section: highway mileage by vehicle class. Each plot
# below only varies the geom_boxplot() arguments.
class_hwy <- ggplot(mpg, aes(x = class, y = hwy))

# Fixed outline colour, fill colour and transparency.
class_hwy +
  geom_boxplot(color = "blue", fill = "yellow", alpha = 0.5)

# Add notches around the median and set their relative width.
class_hwy +
  geom_boxplot(
    color = "blue",
    fill = "yellow",
    alpha = 0.5,
    notch = TRUE,
    notchwidth = 0.5
  )
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

# Restyle the outlier points: red, solid circles (shape 19), larger size.
class_hwy +
  geom_boxplot(
    color = "blue",
    fill = "yellow",
    alpha = 0.5,
    notch = TRUE,
    notchwidth = 0.5,
    outlier.colour = "red",
    outlier.shape = 19,
    outlier.size = 3
  )
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

# outlier.shape = NA hides the outlier points altogether.
class_hwy +
  geom_boxplot(
    color = "blue",
    fill = "yellow",
    alpha = 0.5,
    notch = TRUE,
    notchwidth = 0.5,
    outlier.colour = "red",
    outlier.shape = NA
  )
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?
## Notch went outside hinges
## ℹ Do you want `notch = FALSE`?

Width for sample size

# Four groups of very different sizes (20, 5, 30 and 100 observations) with
# integer values sampled from group-specific ranges.
set.seed(10)
names <- c(rep("A", 20), rep("B", 5), rep("C", 30), rep("D", 100))
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(4:10, 5, replace = TRUE),
  sample(1:7, 30, replace = TRUE),
  sample(3:8, 100, replace = TRUE)
)
mydata <- data.frame(names, value)

# Axis labels of the form "<group>\n(N=<count>)".
# The group names are taken from the table itself: `mydata$names` is a
# character column, so levels() would return NULL and the names would be
# silently dropped from the labels. Reading names and counts from the same
# table also guarantees they are paired correctly.
# (The call names(...) still resolves to the base function even though a
# variable called `names` exists.)
group_counts <- table(mydata$names)
my_xlab <- paste0(names(group_counts), "\n(N=", group_counts, ")")
my_xlab
## [1] "A\n(N=20)"  "B\n(N=5)"   "C\n(N=30)"  "D\n(N=100)"
# varwidth = TRUE varies each box's width with the number of observations in
# its group.
sized_boxes <- ggplot(mydata, aes(x = names, y = value, fill = names)) +
  geom_boxplot(varwidth = TRUE, alpha = 0.2)
sized_boxes

# Drop the legend, which only repeats the x axis.
sized_boxes_no_legend <- sized_boxes + theme(legend.position = "none")
sized_boxes_no_legend

# Replace the x-axis tick labels with the labels stored in my_xlab.
sized_boxes_no_legend + scale_x_discrete(labels = my_xlab)

Control ggplot2 boxplot colors

# p1: default fill palette, one colour per vehicle class, legend hidden.
p1 <- ggplot(mpg, aes(x = class, y = hwy, fill = class)) +
  geom_boxplot() +
  theme(legend.position = "none")
p1

# p2: the same fixed outline and fill colour for every box.
p2 <- ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot(color = "red", fill = "orange", alpha = 0.3) +
  theme(legend.position = "none")
p2

# p3 and p4: p1 with a ColorBrewer palette applied to the fill.
p3 <- p1 + scale_fill_brewer(palette = "BuPu")
p3

p4 <- p1 + scale_fill_brewer(palette = "Dark2")
p4

# Arrange the four variants in a 2 x 2 grid: `|` places plots side by side,
# `/` stacks rows.
library(patchwork)
top_row <- p1 | p2
bottom_row <- p3 | p4
top_row / bottom_row

Highlight one boxplot

# Flag the class to emphasise; every other class is labelled "Normal".
mpg_flagged <- mpg %>%
  mutate(type = ifelse(class == "subcompact", "Highlighted", "Normal"))

# Fill by the flag, using the default colours and legend.
ggplot(mpg_flagged, aes(x = class, y = hwy, fill = type)) +
  geom_boxplot()

# Map both fill and transparency to the flag, choose the values by hand and
# hide the legend.
ggplot(mpg_flagged, aes(x = class, y = hwy, fill = type, alpha = type)) +
  geom_boxplot() +
  scale_fill_manual(values = c("#69b3a2", "grey")) +
  scale_alpha_manual(values = c(0.8, 0.2)) +
  theme(legend.position = "none")

Grouping boxplot

# Six varieties (A-F) with 40 rows each.
variety <- rep(LETTERS[1:6], each = 40)
# 20 "high" followed by 20 "low"; this 40-element pattern is recycled by
# data.frame() to cover all 240 rows.
treatment <- rep(c("high", "low"), each = 20)
# Row index plus a random integer between 1 and 150.
value <- seq_len(240) + sample(1:150, 240, replace = TRUE)
mydata <- data.frame(variety, treatment, value)
mydata
##     variety treatment value
## 1         A      high   103
## 2         A      high    22
## 3         A      high    69
## 4         A      high    33
## 5         A      high    54
## 6         A      high    34
## 7         A      high    56
## 8         A      high   148
## 9         A      high   113
## 10        A      high   127
## 11        A      high    38
## 12        A      high    72
## 13        A      high    87
## 14        A      high    78
## 15        A      high    87
## 16        A      high   149
## 17        A      high   122
## 18        A      high   110
## 19        A      high   164
## 20        A      high    75
## 21        A       low    35
## 22        A       low    91
## 23        A       low    93
## 24        A       low   119
## 25        A       low   174
## 26        A       low   109
## 27        A       low    58
## 28        A       low    36
## 29        A       low    67
## 30        A       low    60
## 31        A       low    60
## 32        A       low   141
## 33        A       low    55
## 34        A       low   138
## 35        A       low   167
## 36        A       low   107
## 37        A       low    64
## 38        A       low   144
## 39        A       low   173
## 40        A       low    44
## 41        B      high    93
## 42        B      high   132
## 43        B      high    96
## 44        B      high    90
## 45        B      high   141
## 46        B      high   131
## 47        B      high   153
## 48        B      high   150
## 49        B      high    81
## 50        B      high   161
## 51        B      high    65
## 52        B      high    64
## 53        B      high   117
## 54        B      high   154
## 55        B      high   199
## 56        B      high   165
## 57        B      high    95
## 58        B      high    80
## 59        B      high   198
## 60        B      high    95
## 61        B       low   184
## 62        B       low   155
## 63        B       low   161
## 64        B       low   129
## 65        B       low   137
## 66        B       low   178
## 67        B       low   132
## 68        B       low   197
## 69        B       low   216
## 70        B       low   132
## 71        B       low   118
## 72        B       low   143
## 73        B       low   166
## 74        B       low   172
## 75        B       low   111
## 76        B       low   191
## 77        B       low   120
## 78        B       low   106
## 79        B       low   198
## 80        B       low   178
## 81        C      high   133
## 82        C      high   106
## 83        C      high   113
## 84        C      high   147
## 85        C      high   214
## 86        C      high   179
## 87        C      high   187
## 88        C      high   135
## 89        C      high   162
## 90        C      high   206
## 91        C      high   109
## 92        C      high   153
## 93        C      high   230
## 94        C      high   204
## 95        C      high   105
## 96        C      high   111
## 97        C      high   157
## 98        C      high   175
## 99        C      high   232
## 100       C      high   190
## 101       C       low   225
## 102       C       low   195
## 103       C       low   236
## 104       C       low   129
## 105       C       low   223
## 106       C       low   119
## 107       C       low   150
## 108       C       low   194
## 109       C       low   238
## 110       C       low   170
## 111       C       low   194
## 112       C       low   247
## 113       C       low   192
## 114       C       low   240
## 115       C       low   163
## 116       C       low   171
## 117       C       low   163
## 118       C       low   266
## 119       C       low   226
## 120       C       low   243
## 121       D      high   264
## 122       D      high   170
## 123       D      high   228
## 124       D      high   213
## 125       D      high   133
## 126       D      high   239
## 127       D      high   241
## 128       D      high   181
## 129       D      high   257
## 130       D      high   197
## 131       D      high   259
## 132       D      high   202
## 133       D      high   268
## 134       D      high   178
## 135       D      high   184
## 136       D      high   154
## 137       D      high   168
## 138       D      high   171
## 139       D      high   267
## 140       D      high   200
## 141       D       low   160
## 142       D       low   169
## 143       D       low   200
## 144       D       low   250
## 145       D       low   203
## 146       D       low   247
## 147       D       low   204
## 148       D       low   274
## 149       D       low   174
## 150       D       low   220
## 151       D       low   285
## 152       D       low   172
## 153       D       low   273
## 154       D       low   223
## 155       D       low   304
## 156       D       low   287
## 157       D       low   206
## 158       D       low   232
## 159       D       low   164
## 160       D       low   285
## 161       E      high   191
## 162       E      high   304
## 163       E      high   228
## 164       E      high   249
## 165       E      high   225
## 166       E      high   256
## 167       E      high   299
## 168       E      high   190
## 169       E      high   283
## 170       E      high   310
## 171       E      high   227
## 172       E      high   184
## 173       E      high   206
## 174       E      high   180
## 175       E      high   179
## 176       E      high   280
## 177       E      high   290
## 178       E      high   198
## 179       E      high   314
## 180       E      high   200
## 181       E       low   226
## 182       E       low   238
## 183       E       low   324
## 184       E       low   273
## 185       E       low   203
## 186       E       low   195
## 187       E       low   204
## 188       E       low   225
## 189       E       low   327
## 190       E       low   329
## 191       E       low   231
## 192       E       low   253
## 193       E       low   241
## 194       E       low   230
## 195       E       low   238
## 196       E       low   285
## 197       E       low   292
## 198       E       low   303
## 199       E       low   242
## 200       E       low   343
## 201       F      high   345
## 202       F      high   223
## 203       F      high   221
## 204       F      high   350
## 205       F      high   353
## 206       F      high   277
## 207       F      high   329
## 208       F      high   223
## 209       F      high   275
## 210       F      high   227
## 211       F      high   344
## 212       F      high   323
## 213       F      high   360
## 214       F      high   357
## 215       F      high   273
## 216       F      high   219
## 217       F      high   338
## 218       F      high   220
## 219       F      high   344
## 220       F      high   255
## 221       F       low   292
## 222       F       low   232
## 223       F       low   246
## 224       F       low   303
## 225       F       low   317
## 226       F       low   336
## 227       F       low   347
## 228       F       low   362
## 229       F       low   303
## 230       F       low   248
## 231       F       low   364
## 232       F       low   338
## 233       F       low   308
## 234       F       low   293
## 235       F       low   247
## 236       F       low   326
## 237       F       low   354
## 238       F       low   306
## 239       F       low   290
## 240       F       low   255
# One box per variety/treatment combination; the fill mapping makes ggplot2
# place the two treatments side by side within each variety.
by_variety <- ggplot(mydata, aes(x = variety, y = value, fill = treatment)) +
  geom_boxplot()
by_variety

# One panel per treatment.
by_variety + facet_wrap(~treatment)

# One panel per variety, all panels sharing the same axes.
by_variety + facet_wrap(~variety)

# Free x axis per panel.
by_variety + facet_wrap(~variety, scales = "free_x")

# Free x and y axes per panel.
by_variety + facet_wrap(~variety, scales = "free")

Adding the mean with stat_summary()

# Four groups of unequal size (20, 8, 30 and 80 observations).
names <- rep(c("A", "B", "C", "D"), times = c(20, 8, 30, 80))
# Integer values sampled with replacement from a group-specific range; the
# four draws are made in group order A, B, C, D.
value <- c(
  sample(2:5, 20, replace = TRUE),
  sample(4:10, 8, replace = TRUE),
  sample(1:7, 30, replace = TRUE),
  sample(3:8, 80, replace = TRUE)
)

mydata <- data.frame(names, value)

# Plain boxplot, one fill colour per group.
group_boxes <- ggplot(mydata, aes(x = names, y = value, fill = names)) +
  geom_boxplot(alpha = 0.7)
group_boxes

# Mark each group's mean with a large yellow point.
with_mean <- group_boxes +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 20,
    size = 10,
    color = "yellow"
  )
with_mean

# Hide the legend and switch to the ColorBrewer "Set1" palette.
with_mean_styled <- with_mean +
  theme(legend.position = "none") +
  scale_fill_brewer(palette = "Set1")
with_mean_styled

# Add error-bar style caps computed from the boxplot statistics.
with_mean_styled +
  stat_boxplot(geom = "errorbar", width = 0.5)

Adding geom_point, geom_jitter, geom_dotplot

# Four normally distributed groups with very different sizes and spreads.
mydata <- data.frame(
  name = c(rep("A", 500), rep("B", 500), rep("C", 20), rep("D", 100)),
  value = c(
    rnorm(500, 10, 5),
    rnorm(500, 13, 1),
    rnorm(20, 25, 4),
    rnorm(100, 12, 1)
  )
)

# Raw observations drawn with geom_point(): all points of a group sit on one
# vertical line.
mydata %>%
  ggplot(aes(x = name, y = value, fill = name)) +
  geom_boxplot() +
  geom_point() +
  theme(legend.position = "none", plot.title = element_text(size = 11)) +
  ggtitle("A boxplot with geom_point") +
  xlab("")

# geom_jitter() spreads the points horizontally so they no longer overlap.
# The title now names the geom actually used; it previously repeated the
# geom_point title from the plot above.
mydata %>%
  ggplot(aes(x = name, y = value, fill = name)) +
  geom_boxplot() +
  geom_jitter(color = "black", size = 0.4, alpha = 0.9) +
  theme(legend.position = "none", plot.title = element_text(size = 11)) +
  ggtitle("A boxplot with geom_jitter") +
  xlab("")

# Draw the jittered points first and a semi-transparent boxplot on top.
mydata %>%
  ggplot(aes(x = name, y = value, fill = name)) +
  geom_jitter(color = "black", size = 0.8, alpha = 0.6) +
  geom_boxplot(alpha = 0.5) +
  theme(legend.position = "none", plot.title = element_text(size = 11)) +
  ggtitle("A boxplot drawn over geom_jitter") +
  xlab("")

Boxplot with dot plot

# dose is used as a continuous x value, so the boxes are grouped explicitly by
# factor(dose) to get one box per dose.
tooth_boxes <- ggplot(ToothGrowth, aes(x = dose, y = len, group = factor(dose))) +
  geom_boxplot()
tooth_boxes

# Overlay the raw observations with random jitter.
tooth_boxes + geom_jitter()

# Overlay a dot plot instead: dots are binned along y and stacked
# symmetrically around each box's centre.
tooth_boxes +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Boxplots in the margins with ggMarginal

library(ggExtra)
# Scatter plot of weight against mileage, coloured by cylinder count.
# The point size is a fixed setting, so it is passed to geom_point() directly;
# inside aes() the constant 2 would be mapped through a size scale instead of
# being used as the literal size.
p <- ggplot(mtcars, aes(x = wt, y = mpg, color = cyl)) +
  geom_point(size = 2) +
  theme(legend.position = "none")

# ggMarginal() adds a marginal distribution plot for x and y around the
# scatter plot; `type` selects the kind of marginal plot.
ggMarginal(p, type = "boxplot")
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
ggMarginal(p, type = "density")
ggMarginal(p, type = "histogram")
ggMarginal(p, type = "violin")
ggMarginal(p, type = "densigram")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## ℹ The deprecated feature was likely used in the ggExtra package.
##   Please report the issue at <https://github.com/daattali/ggExtra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Putting it all together

# Four normally distributed groups, deliberately listed out of alphabetical
# order (A, D, C, B) and with very different sizes.
set.seed(1)
names <- c(rep("A", 40), rep("D", 20), rep("C", 60), rep("B", 200))
value <- c(rnorm(40, 3.5), rnorm(20, 7), rnorm(60, 4, sd = 2), rnorm(200, 6))
mydata <- data.frame(names, value)

# Axis labels of the form "<group>\n(N=<count>)".
# table() reports its counts in sorted group order (A, B, C, D), whereas
# unique() returns the groups in order of appearance (A, D, C, B). Pairing
# the two would attach counts to the wrong names, so both the names and the
# counts are read from the same table. The sorted order is also the order in
# which the discrete x axis lays out the groups.
# (The call names(...) still resolves to the base function even though a
# variable called `names` exists.)
group_counts <- table(mydata$names)
my_xlab <- paste0(names(group_counts), "\n(N=", group_counts, ")")

# Flag group "B" so it can be filled with a different colour from the rest.
highlighted <- mydata %>%
  mutate(type = ifelse(names == "B", "Highlighted", "Normal"))

# Jittered raw points underneath notched, variable-width boxes with restyled
# outliers.
notched_boxes <- ggplot(highlighted, aes(x = names, y = value, fill = type)) +
  geom_jitter(size = 1, color = "black", alpha = 0.3, width = 0.2) +
  geom_boxplot(
    alpha = 0.7,
    varwidth = TRUE,
    notch = TRUE,
    notchwidth = 0.4,
    outlier.color = "red",
    outlier.fill = "orange",
    outlier.size = 2
  )
notched_boxes

# Final version: manual fill colours, no legend, the labels from my_xlab on
# the x axis, empty axis titles, and a diamond (shape 23) marking each
# group's mean.
notched_boxes +
  scale_fill_manual(values = c("#DE6B45", "#45DEBA")) +
  theme(legend.position = "none") +
  scale_x_discrete(labels = my_xlab) +
  labs(x = "", y = "") +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 23,
    size = 3,
    color = "darkgreen",
    fill = "orange"
  )