Needless to say, ggplot2 is a great package for data visualization. However, a potential downside of the package is its syntax which is viewed by some as complicated. It is actually not that complicated; it is just hard to remember all the options within the ‘ggplot’ functions. I have used ggplot2 for more than 5 years now, and I still cannot remember all options for different graphs.
In this Rmarkdown document, I provide a cheat sheet for common graphs that are often used in scientific research. I found that in order to use ggplot2 for effectively, I need to load a number of packages (in addition to ggplot2, of course):
In this demo, I am going to use a dataset called ‘Obesity’ which contains a number of important data in my study. The main purpose is to study the relationship between percent body fat and body mass index.
suppressPackageStartupMessages(library(tidyverse))
## Warning: package 'tidyverse' was built under R version 4.0.2
## Warning: package 'ggplot2' was built under R version 4.0.2
## Warning: package 'tibble' was built under R version 4.0.2
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'readr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## Warning: package 'stringr' was built under R version 4.0.2
## Warning: package 'forcats' was built under R version 4.0.2
suppressPackageStartupMessages(library(magrittr))
## Warning: package 'magrittr' was built under R version 4.0.2
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(GGally))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggExtra))
## Warning: package 'ggExtra' was built under R version 4.0.2
suppressPackageStartupMessages(library(ggthemes))
suppressPackageStartupMessages(library(survival))
suppressPackageStartupMessages(library(survminer))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(readxl))
ob = read.csv("~/Dropbox/UTS/Teaching/Statistical Modelling/Datasets/Obesity data.csv")
ob$group[ob$bmi<18.5] = "Underweight"
ob$group[ob$bmi>=18.5 & ob$bmi<25] = "Normal"
ob$group[ob$bmi>=25.0 & ob$bmi<30.0] = "Overweight"
ob$group[ob$bmi>=30.0] = "Obese"
ob$group = factor(ob$group, levels=c("Underweight", "Normal", "Overweight", "Obese"))
head(ob)
## id gender height weight bmi age bmc bmd fat lean pcfat group
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3 Normal
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8 Normal
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0 Normal
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8 Normal
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8 Normal
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2 Normal
p = ggplot(data=ob, aes(x=age, y=pcfat, col=gender))
p = p + geom_point()
p = p + scale_x_continuous(breaks = seq(from=15, to=90, by=5)) + scale_y_continuous(breaks = seq(from=10, to=50, by=10))
p = p + labs(x="Age", y="Percent body fat", title="Percent body fat and age")
# putting title at the centre
p + theme(plot.title = element_text(hjust = 0.5))
df = read_excel("~/Dropbox/Bao chi/Khoa hoc & Y te/Mortality analysis VN/HCM cases.xlsx")
df$date = as.Date(df$Date)
p = ggplot(data=df, aes(x=date, y=Cases, col=Lockdown)) + geom_point()
p = p + geom_label_repel(aes(label = Cases))
p = p + scale_x_date(date_breaks = "1 week")+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
p + labs(x="Ngà y", y="Số ca") + theme(legend.position="none")
p1 = ggplot(data=ob, aes(x=group, fill=group)) + geom_bar() + theme(legend.position="none")
# By gender
p2 = ggplot(data=ob, aes(x=group, fill=gender)) + geom_bar() + theme(legend.position="none")
p3 = ggplot(data=ob, aes(x=group, fill=gender)) + geom_bar(position = position_dodge())
grid.arrange(p1, p2, p3, nrow=3)
# Bar plot of means
temp = ob %>% group_by(group) %>% summarize(mean.pcfat = round(mean(pcfat), 2))
ggplot(temp, aes(x=group, y=mean.pcfat, fill=group, label=mean.pcfat)) + geom_bar(stat="identity", width=0.5) + geom_text(size=3, col="white", position=position_stack(vjust=0.9)) + coord_flip() + theme(legend.position="none")
# Using facet
# Calculating means by group and gender
temp = ob %>% group_by(group, gender) %>% summarize(mean.pcfat = round(mean(pcfat), 2))
## `summarise()` has grouped output by 'group'. You can override using the `.groups` argument.
ggplot(temp, aes(x=group, y=mean.pcfat, fill=group, label=mean.pcfat)) + geom_bar(stat = "identity", position="dodge") + geom_text(size=3, col="white", position=position_stack(vjust=0.9)) + facet_wrap(~gender) + theme(legend.position="none")
p1 = ggplot(data=ob, aes(x=pcfat)) + geom_histogram(aes(y=..density..), col="white", fill="blue") + labs(x="Percent body fat", y="Number of individuals") + geom_density(col="red")
p2 = ggplot(data=ob, aes(x=pcfat, fill=gender, col=gender)) + geom_histogram(position="identity", alpha=0.3) + labs(x="Percent body fat", y="Number of individuals")
grid.arrange(p1, p2, nrow=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
In this demo I use the iris dataset to show the distribution of Sepal.Length by Species.
p = ggplot(data=iris, aes(x = Sepal.Length, y = Species, fill=Species))
p + geom_density_ridges(aes(point_color = Species, point_fill=Species, point_shape=Species), alpha=0.2, point_alpha=1, jittered_points = TRUE) + theme(legend.position="none")
## Picking joint bandwidth of 0.181
ggplot(diamonds, aes(x=price, y=cut, fill=cut)) + geom_density_ridges(scale = 4) + scale_fill_cyclical(values=c("blue", "green"))
## Picking joint bandwidth of 458
p1 = ggplot(data=ob, aes(x=gender, y=pcfat, col=gender)) + geom_boxplot() + labs(x="Gender", y="Percent body fat") + theme(legend.position="none")
p2 = ggplot(data=ob, aes(x=gender, y=pcfat, col=gender)) + geom_boxplot() + geom_jitter(alpha=0.2) + labs(x="Gender", y="Percent body fat") + theme_economist() + theme(legend.position="none")
grid.arrange(p1, p2, ncol=2)
p1 = ggplot(data=ob, aes(x=bmi, y=pcfat)) + geom_point() + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")
p2 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")
grid.arrange(p1, p2, ncol=2)
p1 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + geom_smooth(method="lm", formula=y~x+I(x^2)) + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")
p2 = ggplot(data=ob, aes(x=bmi, y=pcfat, col=gender)) + geom_point() + geom_rug(col=rgb(.5,0,0,alpha=.2))+ geom_smooth(method="lm", formula=y~x+I(x^2)) + labs(x="Body mass index", y="Percent body fat") + theme(legend.position="none")
p2 = ggMarginal(p2, groupColour = TRUE, groupFill = TRUE)
grid.arrange(p1, p2, ncol=2)
In this demo, I use the dataset ChickWeight from the datasets package
library(datasets)
cw = as.data.frame(ChickWeight)
head(cw)
## weight Time Chick Diet
## 1 42 0 1 1
## 2 51 2 1 1
## 3 59 4 1 1
## 4 64 6 1 1
## 5 76 8 1 1
## 6 93 10 1 1
ggplot(data=cw, aes(x=Time, y=weight, group=Chick, col=factor(Chick))) + geom_line() + stat_smooth(aes(group = 1)) + facet_wrap(~Diet) + theme(legend.position="none")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ob1 = ob[, c("gender", "bmi", "pcfat", "lean", "fat")]
ggpairs(ob1, ggplot2::aes(colour=gender))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
fit = survfit(Surv(time, status) ~ sex, data = lung)
ggsurvplot(fit, data = lung)
ggsurvplot(fit, risk.table=T, pval=T, conf.int=T, data=lung)
## Warning: Vectorized input to `element_text()` is not officially supported.
## Results may be unexpected or may change in future versions of ggplot2.
# Simulating data
dat = data.frame(y = sample(0:1, 1000, replace=TRUE))
dat$x1 = rnorm(1000, 5, 2) * (dat$y+1)
head(dat)
## y x1
## 1 1 12.572413
## 2 0 1.214889
## 3 1 8.985215
## 4 1 14.167258
## 5 1 13.195851
## 6 0 5.959922
library(broom)
## Warning: package 'broom' was built under R version 4.0.2
m = glm(y ~ x1, family=binomial, data=dat)
plot.dat = augment(m, type.predict = "response")
head(plot.dat)
## # A tibble: 6 x 8
## y x1 .fitted .resid .std.resid .hat .sigma .cooksd
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 12.6 0.926 0.392 0.392 0.00267 0.964 0.000107
## 2 0 1.21 0.0481 -0.314 -0.314 0.00178 0.964 0.0000452
## 3 1 8.99 0.687 0.866 0.867 0.00236 0.964 0.000540
## 4 1 14.2 0.965 0.269 0.269 0.00204 0.964 0.0000377
## 5 1 13.2 0.944 0.338 0.339 0.00244 0.964 0.0000723
## 6 0 5.96 0.336 -0.905 -0.906 0.00166 0.964 0.000422
ggplot(plot.dat, aes(x = x1)) + geom_line(aes(y = .fitted), color = "blue") + labs(x = "Size", y = "Survival") + geom_point(aes(y = y), alpha = 0.2)
p = ggplot(plot.dat, aes(x = x1)) + geom_line(aes(y = .fitted), color = "blue") + labs(x = "Size", y = "Survival") + geom_point(aes(y = y), alpha = 0.2)
p + stat_summary_bin(geom="point", fun=mean, aes(y=y), bins=60)
p + geom_histogram(aes(x=x1, y = stat(count)/1000), bins = 30, na.rm = TRUE, col="white", fill="blue") + geom_histogram(aes(x=x1, y = -1*stat(count/1000)), bins = 30, na.rm = TRUE, position = position_nudge(y = 1), col="white", fill="blue")
library(popbio)
## Warning: package 'popbio' was built under R version 4.0.2
logi.hist.plot(dat$x1, dat$y, boxp = FALSE, type = "hist", col = "gray")
# Forest plot
dv = c("Y (N = 100)","Y (N = 100)","Y (N = 100)")
iv = c("X1","X2","X3")
es = c(.29,.11,.01)
lci = c(.12,.01,-.07)
uci = c(.46,.21,.09)
temp = data.frame(dv, iv, es, lci, uci)
p = ggplot(data=temp, aes(x=iv, y=es, ymin=lci, ymax=uci))
p = p + geom_pointrange()
p = p + geom_hline(yintercept=0, lty=2, size =1)
p = p + geom_errorbar(aes(ymin=lci, ymax=uci), width=0.5, cex=1)
p + facet_wrap(~dv) + coord_flip() + geom_point(shape=15, size=5, col="red")