Introduction

Needless to say, ggplot2 is a great package for data visualization. However, a potential downside of the package is its syntax which is viewed by some as complicated. It is actually not that complicated; it is just hard to remember all the options within the ‘ggplot’ functions. I have used ggplot2 for more than 5 years now, and I still cannot remember all options for different graphs.

In this Rmarkdown document, I provide a cheat sheet for common graphs that are often used in scientific research. I found that in order to use ggplot2 for effectively, I need to load a number of packages (in addition to ggplot2, of course):

In this demo, I am going to use a dataset called ‘Obesity’ which contains a number of important data in my study. The main purpose is to study the relationship between percent body fat and body mass index.

Loading libraries and data

suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'tidyverse' was built under R version 4.0.2
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'tibble' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## Warning: package 'stringr' was built under R version 4.0.2
## Warning: package 'forcats' was built under R version 4.0.2
suppressPackageStartupMessages(library(magrittr))
## Warning: package 'magrittr' was built under R version 4.0.2
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggExtra))
## Warning: package 'ggExtra' was built under R version 4.0.2
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(readxl))

Reading the obesity data

ob = read.csv("~/Dropbox/UTS/Teaching/Statistical Modelling/Datasets/Obesity data.csv")
ob$group[ob$bmi<18.5] = "Underweight"
ob$group[ob$bmi>=18.5 & ob$bmi<25] = "Normal"
ob$group[ob$bmi>=25.0 & ob$bmi<30.0] = "Overweight"
ob$group[ob$bmi>=30.0] = "Obese"
ob$group = factor(ob$group, levels=c("Underweight", "Normal", "Overweight", "Obese"))
head(ob)
##   id gender height weight  bmi age  bmc  bmd   fat  lean pcfat  group
## 1  1      F    150     49 21.8  53 1312 0.88 17802 28600  37.3 Normal
## 2  2      M    165     52 19.1  65 1309 0.84  8381 40229  16.8 Normal
## 3  3      F    157     57 23.1  64 1230 0.84 19221 36057  34.0 Normal
## 4  4      F    156     53 21.8  56 1171 0.80 17472 33094  33.8 Normal
## 5  5      M    160     51 19.9  54 1681 0.98  7336 40621  14.8 Normal
## 6  6      F    153     47 20.1  52 1358 0.91 14904 30068  32.2 Normal

Specifying x and y axes

p = ggplot(data=ob, aes(x=age, y=pcfat, col=gender)) 
p = p + geom_point() 
p = p + scale_x_continuous(breaks = seq(from=15, to=90, by=5)) + scale_y_continuous(breaks = seq(from=10, to=50, by=10))
p = p + labs(x="Age", y="Percent body fat", title="Percent body fat and age")
# putting title at the centre 
p + theme(plot.title = element_text(hjust = 0.5))

Time variables

df = read_excel("~/Dropbox/Bao chi/Khoa hoc & Y te/Mortality analysis VN/HCM cases.xlsx")

df$date = as.Date(df$Date)

p = ggplot(data=df, aes(x=date, y=Cases, col=Lockdown)) +  geom_point() 
p = p +  geom_label_repel(aes(label = Cases)) 
p = p + scale_x_date(date_breaks = "1 week")+ theme(axis.text.x = element_text(angle = 45, hjust = 1))

p + labs(x="Ngày", y="Số ca")  + theme(legend.position="none") 

Barplot

p1 = ggplot(data=ob, aes(x=group, fill=group)) + geom_bar() + theme(legend.position="none")
# By gender
p2 = ggplot(data=ob, aes(x=group, fill=gender)) + geom_bar() + theme(legend.position="none")

p3 = ggplot(data=ob, aes(x=group, fill=gender)) + geom_bar(position = position_dodge())

grid.arrange(p1, p2, p3, nrow=3)

# Bar plot of means

temp = ob %>% group_by(group) %>% summarize(mean.pcfat = round(mean(pcfat), 2))

ggplot(temp, aes(x=group, y=mean.pcfat, fill=group, label=mean.pcfat)) + geom_bar(stat="identity", width=0.5) + geom_text(size=3, col="white", position=position_stack(vjust=0.9)) + coord_flip() + theme(legend.position="none")

# Using facet
# Calculating means by group and gender 
temp = ob %>% group_by(group, gender) %>% summarize(mean.pcfat = round(mean(pcfat), 2))
## `summarise()` has grouped output by 'group'. You can override using the `.groups` argument.
ggplot(temp, aes(x=group, y=mean.pcfat, fill=group, label=mean.pcfat)) + geom_bar(stat = "identity", position="dodge") + geom_text(size=3, col="white", position=position_stack(vjust=0.9)) + facet_wrap(~gender) + theme(legend.position="none")

Histogram

p1 = ggplot(data=ob, aes(x=pcfat)) + geom_histogram(aes(y=..density..), col="white", fill="blue") + labs(x="Percent body fat", y="Number of individuals") + geom_density(col="red")

p2 = ggplot(data=ob, aes(x=pcfat, fill=gender, col=gender)) + geom_histogram(position="identity", alpha=0.3) + labs(x="Percent body fat", y="Number of individuals")

grid.arrange(p1, p2, nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggridges

In this demo I use the iris dataset to show the distribution of Sepal.Length by Species.

p = ggplot(data=iris, aes(x = Sepal.Length, y = Species, fill=Species)) 

p + geom_density_ridges(aes(point_color = Species, point_fill=Species, point_shape=Species), alpha=0.2, point_alpha=1, jittered_points = TRUE) + theme(legend.position="none")
## Picking joint bandwidth of 0.181

ggplot(diamonds, aes(x=price, y=cut, fill=cut)) + geom_density_ridges(scale = 4) + scale_fill_cyclical(values=c("blue", "green"))
## Picking joint bandwidth of 458

Boxplot

p1 = ggplot(data=ob, aes(x=gender, y=pcfat, col=gender)) + geom_boxplot() + labs(x="Gender", y="Percent body fat") + theme(legend.position="none")

p2 = ggplot(data=ob, aes(x=gender, y=pcfat, col=gender)) + geom_boxplot() + geom_jitter(alpha=0.2) + labs(x="Gender", y="Percent body fat") + theme_economist() + theme(legend.position="none")

grid.arrange(p1, p2, ncol=2)

Scatterplot

p1 = ggplot(data=ob, aes(x=bmi, y=pcfat)) + geom_point() + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

p2 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

grid.arrange(p1, p2, ncol=2)

Add marginal dist

p1 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + geom_smooth(method="lm", formula=y~x+I(x^2)) + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

p2 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + geom_rug(col=rgb(.5,0,0,alpha=.2))+ geom_smooth(method="lm", formula=y~x+I(x^2)) + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

p2 = ggMarginal(p2, groupColour = TRUE, groupFill = TRUE)

grid.arrange(p1, p2, ncol=2)

Spaghetti plot

In this demo, I use the dataset ChickWeight from the datasets package

library(datasets)
cw = as.data.frame(ChickWeight)
head(cw)
##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1
ggplot(data=cw, aes(x=Time, y=weight, group=Chick, col=factor(Chick))) + geom_line() + stat_smooth(aes(group = 1)) + facet_wrap(~Diet) + theme(legend.position="none")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Matrix plot of correlations

ob1 = ob[, c("gender", "bmi", "pcfat", "lean", "fat")]
ggpairs(ob1, ggplot2::aes(colour=gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Survival plot

fit = survfit(Surv(time, status) ~ sex, data = lung)

ggsurvplot(fit, data = lung)

ggsurvplot(fit, risk.table=T, pval=T, conf.int=T, data=lung)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

Logistic regression

# Simulating data
dat = data.frame(y = sample(0:1, 1000, replace=TRUE))
dat$x1 = rnorm(1000, 5, 2) * (dat$y+1)
head(dat)
##   y        x1
## 1 1 12.572413
## 2 0  1.214889
## 3 1  8.985215
## 4 1 14.167258
## 5 1 13.195851
## 6 0  5.959922
library(broom)
## Warning: package 'broom' was built under R version 4.0.2
m = glm(y ~ x1, family=binomial, data=dat)

plot.dat = augment(m, type.predict = "response")

head(plot.dat)
## # A tibble: 6 x 8
##       y    x1 .fitted .resid .std.resid    .hat .sigma   .cooksd
##   <int> <dbl>   <dbl>  <dbl>      <dbl>   <dbl>  <dbl>     <dbl>
## 1     1 12.6   0.926   0.392      0.392 0.00267  0.964 0.000107 
## 2     0  1.21  0.0481 -0.314     -0.314 0.00178  0.964 0.0000452
## 3     1  8.99  0.687   0.866      0.867 0.00236  0.964 0.000540 
## 4     1 14.2   0.965   0.269      0.269 0.00204  0.964 0.0000377
## 5     1 13.2   0.944   0.338      0.339 0.00244  0.964 0.0000723
## 6     0  5.96  0.336  -0.905     -0.906 0.00166  0.964 0.000422
ggplot(plot.dat, aes(x = x1)) + geom_line(aes(y = .fitted), color = "blue") + labs(x = "Size", y = "Survival") + geom_point(aes(y = y), alpha = 0.2)

p = ggplot(plot.dat, aes(x = x1)) + geom_line(aes(y = .fitted), color = "blue") + labs(x = "Size", y = "Survival") + geom_point(aes(y = y), alpha = 0.2)

p + stat_summary_bin(geom="point", fun=mean, aes(y=y), bins=60)

p + geom_histogram(aes(x=x1, y = stat(count)/1000), bins = 30, na.rm = TRUE, col="white", fill="blue") + geom_histogram(aes(x=x1, y = -1*stat(count/1000)), bins = 30, na.rm = TRUE, position = position_nudge(y = 1), col="white", fill="blue")

library(popbio)
## Warning: package 'popbio' was built under R version 4.0.2
logi.hist.plot(dat$x1, dat$y, boxp = FALSE, type = "hist", col = "gray")

Forest plot

# Forest plot
dv = c("Y (N = 100)","Y (N = 100)","Y (N = 100)")
iv = c("X1","X2","X3")
es = c(.29,.11,.01)
lci = c(.12,.01,-.07) 
uci = c(.46,.21,.09)

temp = data.frame(dv, iv, es, lci, uci)

p = ggplot(data=temp, aes(x=iv, y=es, ymin=lci, ymax=uci)) 
p = p + geom_pointrange()
p = p + geom_hline(yintercept=0, lty=2, size =1)
p = p + geom_errorbar(aes(ymin=lci, ymax=uci), width=0.5, cex=1)

p + facet_wrap(~dv) + coord_flip() + geom_point(shape=15, size=5, col="red")