Introduction

Needless to say, ggplot2 is a great package for data visualization. However, a potential downside of the package is its syntax which is viewed by some as complicated. It is actually not that complicated; it is just hard to remember all the options within the ‘ggplot’ functions. I have used ggplot2 for more than 5 years now, and I still cannot remember all options for different graphs.

In this Rmarkdown document, I provide a cheat sheet for common graphs that are often used in scientific research. I found that in order to use ggplot2 for effectively, I need to load a number of packages (in addition to ggplot2, of course):

tidyverse for data management and manipulation
GGally for multiple variables visualization
ggExtra for enhancing ggplot2
ggridges for visualizing changes in distribution over time and space
ggthemes for selecting nice themes
gridExtra for arranging plots
ggrepel for text arrangement

In this demo, I am going to use a dataset called ‘Obesity’ which contains a number of important data in my study. The main purpose is to study the relationship between percent body fat and body mass index.

Loading libraries and data

suppressPackageStartupMessages(library(tidyverse))

## Warning: package 'tidyverse' was built under R version 4.0.2

## Warning: package 'ggplot2' was built under R version 4.0.2

## Warning: package 'tibble' was built under R version 4.0.2

## Warning: package 'tidyr' was built under R version 4.0.2

## Warning: package 'readr' was built under R version 4.0.2

## Warning: package 'dplyr' was built under R version 4.0.2

## Warning: package 'stringr' was built under R version 4.0.2

## Warning: package 'forcats' was built under R version 4.0.2

suppressPackageStartupMessages(library(magrittr))

## Warning: package 'magrittr' was built under R version 4.0.2

suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggExtra))

## Warning: package 'ggExtra' was built under R version 4.0.2

suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(readxl))

Reading the obesity data

ob = read.csv("~/Dropbox/UTS/Teaching/Statistical Modelling/Datasets/Obesity data.csv")
ob$group[ob$bmi<18.5] = "Underweight"
ob$group[ob$bmi>=18.5 & ob$bmi<25] = "Normal"
ob$group[ob$bmi>=25.0 & ob$bmi<30.0] = "Overweight"
ob$group[ob$bmi>=30.0] = "Obese"
ob$group = factor(ob$group, levels=c("Underweight", "Normal", "Overweight", "Obese"))
head(ob)

##   id gender height weight  bmi age  bmc  bmd   fat  lean pcfat  group
## 1  1      F    150     49 21.8  53 1312 0.88 17802 28600  37.3 Normal
## 2  2      M    165     52 19.1  65 1309 0.84  8381 40229  16.8 Normal
## 3  3      F    157     57 23.1  64 1230 0.84 19221 36057  34.0 Normal
## 4  4      F    156     53 21.8  56 1171 0.80 17472 33094  33.8 Normal
## 5  5      M    160     51 19.9  54 1681 0.98  7336 40621  14.8 Normal
## 6  6      F    153     47 20.1  52 1358 0.91 14904 30068  32.2 Normal

Specifying x and y axes

p = ggplot(data=ob, aes(x=age, y=pcfat, col=gender)) 
p = p + geom_point() 
p = p + scale_x_continuous(breaks = seq(from=15, to=90, by=5)) + scale_y_continuous(breaks = seq(from=10, to=50, by=10))
p = p + labs(x="Age", y="Percent body fat", title="Percent body fat and age")
# putting title at the centre 
p + theme(plot.title = element_text(hjust = 0.5))

Time variables

df = read_excel("~/Dropbox/Bao chi/Khoa hoc & Y te/Mortality analysis VN/HCM cases.xlsx")

df$date = as.Date(df$Date)

p = ggplot(data=df, aes(x=date, y=Cases, col=Lockdown)) +  geom_point() 
p = p +  geom_label_repel(aes(label = Cases)) 
p = p + scale_x_date(date_breaks = "1 week")+ theme(axis.text.x = element_text(angle = 45, hjust = 1))

p + labs(x="Ngày", y="Số ca")  + theme(legend.position="none")

Barplot

p1 = ggplot(data=ob, aes(x=group, fill=group)) + geom_bar() + theme(legend.position="none")
# By gender
p2 = ggplot(data=ob, aes(x=group, fill=gender)) + geom_bar() + theme(legend.position="none")

p3 = ggplot(data=ob, aes(x=group, fill=gender)) + geom_bar(position = position_dodge())

grid.arrange(p1, p2, p3, nrow=3)

# Bar plot of means

temp = ob %>% group_by(group) %>% summarize(mean.pcfat = round(mean(pcfat), 2))

ggplot(temp, aes(x=group, y=mean.pcfat, fill=group, label=mean.pcfat)) + geom_bar(stat="identity", width=0.5) + geom_text(size=3, col="white", position=position_stack(vjust=0.9)) + coord_flip() + theme(legend.position="none")

# Using facet
# Calculating means by group and gender 
temp = ob %>% group_by(group, gender) %>% summarize(mean.pcfat = round(mean(pcfat), 2))

## `summarise()` has grouped output by 'group'. You can override using the `.groups` argument.

ggplot(temp, aes(x=group, y=mean.pcfat, fill=group, label=mean.pcfat)) + geom_bar(stat = "identity", position="dodge") + geom_text(size=3, col="white", position=position_stack(vjust=0.9)) + facet_wrap(~gender) + theme(legend.position="none")

Histogram

p1 = ggplot(data=ob, aes(x=pcfat)) + geom_histogram(aes(y=..density..), col="white", fill="blue") + labs(x="Percent body fat", y="Number of individuals") + geom_density(col="red")

p2 = ggplot(data=ob, aes(x=pcfat, fill=gender, col=gender)) + geom_histogram(position="identity", alpha=0.3) + labs(x="Percent body fat", y="Number of individuals")

grid.arrange(p1, p2, nrow=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggridges

In this demo I use the iris dataset to show the distribution of Sepal.Length by Species.

p = ggplot(data=iris, aes(x = Sepal.Length, y = Species, fill=Species)) 

p + geom_density_ridges(aes(point_color = Species, point_fill=Species, point_shape=Species), alpha=0.2, point_alpha=1, jittered_points = TRUE) + theme(legend.position="none")

## Picking joint bandwidth of 0.181

ggplot(diamonds, aes(x=price, y=cut, fill=cut)) + geom_density_ridges(scale = 4) + scale_fill_cyclical(values=c("blue", "green"))

## Picking joint bandwidth of 458

Boxplot

p1 = ggplot(data=ob, aes(x=gender, y=pcfat, col=gender)) + geom_boxplot() + labs(x="Gender", y="Percent body fat") + theme(legend.position="none")

p2 = ggplot(data=ob, aes(x=gender, y=pcfat, col=gender)) + geom_boxplot() + geom_jitter(alpha=0.2) + labs(x="Gender", y="Percent body fat") + theme_economist() + theme(legend.position="none")

grid.arrange(p1, p2, ncol=2)

Scatterplot

p1 = ggplot(data=ob, aes(x=bmi, y=pcfat)) + geom_point() + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

p2 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

grid.arrange(p1, p2, ncol=2)

Add marginal dist

p1 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + geom_smooth(method="lm", formula=y~x+I(x^2)) + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

p2 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + geom_rug(col=rgb(.5,0,0,alpha=.2))+ geom_smooth(method="lm", formula=y~x+I(x^2)) + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")

p2 = ggMarginal(p2, groupColour = TRUE, groupFill = TRUE)

grid.arrange(p1, p2, ncol=2)

Spaghetti plot

In this demo, I use the dataset ChickWeight from the datasets package

library(datasets)
cw = as.data.frame(ChickWeight)
head(cw)

##   weight Time Chick Diet
## 1     42    0     1    1
## 2     51    2     1    1
## 3     59    4     1    1
## 4     64    6     1    1
## 5     76    8     1    1
## 6     93   10     1    1

ggplot(data=cw, aes(x=Time, y=weight, group=Chick, col=factor(Chick))) + geom_line() + stat_smooth(aes(group = 1)) + facet_wrap(~Diet) + theme(legend.position="none")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Matrix plot of correlations

ob1 = ob[, c("gender", "bmi", "pcfat", "lean", "fat")]
ggpairs(ob1, ggplot2::aes(colour=gender))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Survival plot

fit = survfit(Surv(time, status) ~ sex, data = lung)

ggsurvplot(fit, data = lung)

ggsurvplot(fit, risk.table=T, pval=T, conf.int=T, data=lung)

## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.

Logistic regression

# Simulating data
dat = data.frame(y = sample(0:1, 1000, replace=TRUE))
dat$x1 = rnorm(1000, 5, 2) * (dat$y+1)
head(dat)

##   y        x1
## 1 1 12.572413
## 2 0  1.214889
## 3 1  8.985215
## 4 1 14.167258
## 5 1 13.195851
## 6 0  5.959922

library(broom)

## Warning: package 'broom' was built under R version 4.0.2

m = glm(y ~ x1, family=binomial, data=dat)

plot.dat = augment(m, type.predict = "response")

head(plot.dat)

## # A tibble: 6 x 8
##       y    x1 .fitted .resid .std.resid    .hat .sigma   .cooksd
##   <int> <dbl>   <dbl>  <dbl>      <dbl>   <dbl>  <dbl>     <dbl>
## 1     1 12.6   0.926   0.392      0.392 0.00267  0.964 0.000107 
## 2     0  1.21  0.0481 -0.314     -0.314 0.00178  0.964 0.0000452
## 3     1  8.99  0.687   0.866      0.867 0.00236  0.964 0.000540 
## 4     1 14.2   0.965   0.269      0.269 0.00204  0.964 0.0000377
## 5     1 13.2   0.944   0.338      0.339 0.00244  0.964 0.0000723
## 6     0  5.96  0.336  -0.905     -0.906 0.00166  0.964 0.000422

ggplot(plot.dat, aes(x = x1)) + geom_line(aes(y = .fitted), color = "blue") + labs(x = "Size", y = "Survival") + geom_point(aes(y = y), alpha = 0.2)

p = ggplot(plot.dat, aes(x = x1)) + geom_line(aes(y = .fitted), color = "blue") + labs(x = "Size", y = "Survival") + geom_point(aes(y = y), alpha = 0.2)

p + stat_summary_bin(geom="point", fun=mean, aes(y=y), bins=60)

p + geom_histogram(aes(x=x1, y = stat(count)/1000), bins = 30, na.rm = TRUE, col="white", fill="blue") + geom_histogram(aes(x=x1, y = -1*stat(count/1000)), bins = 30, na.rm = TRUE, position = position_nudge(y = 1), col="white", fill="blue")

library(popbio)

## Warning: package 'popbio' was built under R version 4.0.2

logi.hist.plot(dat$x1, dat$y, boxp = FALSE, type = "hist", col = "gray")

Forest plot

# Forest plot
dv = c("Y (N = 100)","Y (N = 100)","Y (N = 100)")
iv = c("X1","X2","X3")
es = c(.29,.11,.01)
lci = c(.12,.01,-.07) 
uci = c(.46,.21,.09)

temp = data.frame(dv, iv, es, lci, uci)

p = ggplot(data=temp, aes(x=iv, y=es, ymin=lci, ymax=uci)) 
p = p + geom_pointrange()
p = p + geom_hline(yintercept=0, lty=2, size =1)
p = p + geom_errorbar(aes(ymin=lci, ymax=uci), width=0.5, cex=1)

p + facet_wrap(~dv) + coord_flip() + geom_point(shape=15, size=5, col="red")

Common plots using R

T Nguyen

01/10/2021