Resource from: https://homepage.stat.uiowa.edu/~luke/classes/STAT4580-2024/ggplot.html
ggplot(data = <DATA>) +
<GEOM>(mapping = aes(<MAPPINGS>), # Geometric objects
stat = <STAT>, # Statistical Transformations
position = <POSITION>) + # Position Adjustments
< ... MORE GEOMS ... > +
<COORDINATE_ADJUSTMENT> + #coordinate systems
<SCALE_ADJUSTMENT> + #change the scales of x-axis, y-axis, legend/fill, _manual options...
## arguments:
## 1. `name`: for title of axis & legend
## 2. `breaks`
## 3. `labels`: for tick/legend label
## 4. `values`: manual inputs
## source: https://ggplot2-book.org/scales-guides#:~:text=It%20is%20useful%20to%20note,discrete_scale()%20and%20binned_scale()%20.
<FACETING> +
<THEME_ADJUSTMENT> #e.g., tilt for x-ticks, remove legends
Load Packages
library(ggplot2)
theme_set(theme_bw(base_size = 14)) # change the global setting of ggplot2 bw theme
library(ggrepel)
library(see)
library(ratdat)
library(dplyr)
Import Data
## Data 1
complete_old #individual rodents captured at the Portal Project and the plots they were captured in.
colnames(complete_old)
## [1] "record_id" "month" "day" "year"
## [5] "plot_id" "species_id" "sex" "hindfoot_length"
## [9] "weight" "genus" "species" "taxa"
## [13] "plot_type"
## Data 2
# Download files within the WebR environment
# root_srcfile <- "https://raw.githubusercontent.com/DanielEWeeks/HuGen2071/main/"
# download.file(paste0(root_srcfile,"data/study1.csv"),
# "study1.csv")
## setup study1 -> stored in b data
a = read.csv(file="./11_ggplot2/data/study1.csv")
a$ind <- seq_along(a$t)
b <- a[-c(1001:1004),]
b$g.f <- factor(b$g)
b$geno <- paste(b$all1,b$all2,sep="/")
dim(b)
## [1] 1000 10
head(b)
## another data for representation
cancer_genes = read.csv(file = "11_ggplot2/data/microarray_cancer.csv", header = TRUE)
scale_color_viridis_d()
: color blind and gray-scale
friendlyggplot(data=complete_old, aes(x=weight,
y=hindfoot_length,
color=plot_type, shape=sex)) +
geom_point(alpha=0.2) +
scale_color_viridis_d() #color blind and gray-scale friendly
## Warning: Removed 3081 rows containing missing values or values outside the scale range
## (`geom_point()`).
scale_x_log10()
ggplot(data=complete_old, aes(x=weight,
y=hindfoot_length,
color=plot_type, shape=sex)) + #global in the plot, or will need to map again in the geom function
geom_point(alpha=0.2) +
scale_x_log10()
## Warning: Removed 3081 rows containing missing values or values outside the scale range
## (`geom_point()`).
Plot the genetic data - Cases have higher trait value
ggplot(data=b, aes(x=ind, y=t, color=trait)) +
geom_point()
scale_x_discrete(labels = label_wrap_gen(width = 10))
:
label_wrap_gen() will wrap the text of the labels to make them more
legible.
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length, fill=plot_type)) +
geom_boxplot(alpha=0.8) # can see overlapping fonts
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length, fill=plot_type)) +
geom_boxplot(alpha=0.8) + # can see overlapping fonts
scale_x_discrete(labels=label_wrap_gen(width = 10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
geom_point(position = "jitter")
= geom_jitter
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
geom_boxplot(outlier.shape = NA) + #remove outliers because they are plotted twice by boxplot and geom_jitter
geom_point( aes(color=plot_type), alpha = 0.1,
position = "jitter") + #same as geom_jitter
scale_x_discrete( labels=label_wrap_gen(width = 10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
geom_point( aes(color=plot_type), alpha = 0.1,
position = "jitter") +
geom_boxplot(fill=NA, outlier.shape = NA) +
scale_x_discrete( labels=label_wrap_gen(width = 10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).
Change the ploting themes in theme()
theme(axis.title = element_text(size = 14) )
: change
axis font size,
theme(panel.grid.major.x = element_blank() )
: change
grid type,
theme(legend.position = "none")
: change
legend
theme(plot.title = element_text(face = "bold", size=20) )
:
change font size for specific section of text
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length, color=plot_type)) +
geom_jitter(alpha=0.1) +
geom_violin(fill="white") +
theme_bw()
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).
# change the font size on the axis
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
geom_jitter(aes(color=plot_type), alpha=0.1) +
geom_violin(aes(color=plot_type), fill="white") + # this combination is better than the sina plot
geom_boxplot(width=0.05, outlier.shape = NA)+ #add boxplot additional to the violin plot
### these three combinations = rainclouds plot ###
labs(title="Rodent Hindfoot Length by Plot Type", subtitle = "Violin Plot") +
theme_bw() +
theme(axis.title = element_text(size = 14),
panel.grid.major.x = element_blank(), #major grid are columns grids
legend.position = "none",
plot.title = element_text(face = "bold", size=18) ) +
scale_x_discrete(labels=label_wrap_gen(width=10)) + #wrap long text
scale_color_brewer(palette = "Set2") # change the palate
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).
reference: https://github.com/njudd/ggrain
library(ggrain)
## Registered S3 methods overwritten by 'ggpp':
## method from
## heightDetails.titleGrob ggplot2
## widthDetails.titleGrob ggplot2
p = ggplot(data=complete_old, aes(x=plot_type, y=hindfoot_length)) +
geom_rain(violin.args = list(alpha = 0.3, aes(fill = plot_type)),
boxplot.args = list(alpha = 0.3, aes(fill = plot_type), outlier.shape=NA),
point.args = list(alpha=0.2) )+ #not plot the points using the same color
# cov= "plot_type" ) + #much easier
labs(title="Rodent Hindfoot Length by Plot Type") +
theme_bw() +
theme(legend.position = "none",
panel.grid.major.x = element_blank()) +
scale_x_discrete(labels=label_wrap_gen(width=10))
suppressWarnings(print(p))
p2 = ggplot(data=complete_old, aes(x=1, y=hindfoot_length, fill=plot_type)) +
geom_rain(alpha=0.3) +
# cov= "plot_type" ) + #much easier
labs(title="Merge in 1 Raincloud Plot") +
theme_bw() +
theme(legend.position = "none",
panel.grid.major.x = element_blank()) +
scale_x_discrete(labels=label_wrap_gen(width=10))
suppressWarnings(print(p2))
facet_wrap(vars( <colnames> ),
nrow, ncol) #seperate by how many rows and columns
facet_wrap(~)
notation, it is
not functional in functions and loopsggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
geom_jitter(aes(color=plot_type), alpha=0.1) +
geom_violin(aes(color=plot_type), fill="white") +
theme_bw() +
facet_wrap(vars(sex), nrow=3) + # how many rows/columns of the facet
scale_x_discrete(label=label_wrap_gen(10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).
Another example (genetic data) - if we want to see the trend, separating by case control, and further in different genotype groups, it is hard to see in one graph
ggplot(data=b, aes(x=ind, y=t, color=geno, shape=trait) ) +
geom_point(alpha=0.8)+
geom_smooth(aes(linetype=trait), method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(data=b, aes(x=ind, y=t, color=trait) ) +
geom_point(alpha=0.8)+
# geom_smooth(aes(linetype=trait), method = "lm") +
facet_wrap(vars(geno))
fn_histogram = function(df, target_var, color_var, binwidth, facet_by) {
df %>%
ggplot(aes(x={{ target_var }}, fill= {{ color_var }})) + ##{{data variables}}; no ""; usually used in the context of tidyverse
geom_histogram(binwidth = binwidth, alpha=0.8, color="black") +
facet_wrap( vars( {{facet_by}} ) ) #ensure that it can be group by multiple variables
}
fn_histogram(df=b, target_var=t, color_var=geno, binwidth=0.5, facet_by=geno)
fn_histogram(df=complete_old,
target_var=hindfoot_length,
color_var=plot_type, binwidth=0.5, facet_by=plot_type)
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_bin()`).
Notice that there is no fill!
fn_histogram(df=complete_old,
target_var=hindfoot_length,
color_var=plot_type, binwidth=2, facet_by=plot_type)
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_bin()`).
scale_?_?()
Data: cancer_genes
Purpose: ANOVA, comparing (cancer) group means if the same
Note: calculating the total/population mean
\[ \mu_{\ population} = \frac{n_{group1} * \mu_{\ group1} + n_{group2} * \mu_{\ group2} + \dots + n_{group\ k} * \mu_{\ group\ k}} {N_{\ population}} \]
\[ \mu_{\ population} = \frac{ \sum_{i=1}^{i=k}\ n_{\ group_{i}} * \mu_{\ group_{i}} } {N_{\ population} } \] - Where \(N_{\ population} = \sum n_{\ group_{i}}\)
Which is \[\mu_{\ population} = \sum\ p_{\ group_{i}} * \mu_{\ group_{i}} \]
# calculate numerical summary
summary = cancer_genes %>% group_by(cancer) %>%
summarise(
N = n(),
mean = mean(gene1),
median = median(gene1),
std = sd(gene1),
variance = var(gene1) )
summary
# calculate the total/population mean
## method1
mean(cancer_genes$gene1)
## [1] 11.0101
## method2
# mu = sum(group_n * group_mean) / total_n
# mu = sum(proportion_group_n * group_mean)
sum(summary$N * summary$mean) / nrow(cancer_genes)
## [1] 11.0101
Visualization
library(ggrepel)
boxplot =
cancer_genes %>% ggplot( aes(x=cancer, y=gene1) ) +
#boxplot
geom_boxplot( aes(fill=cancer) ) +
#add a line for total mean
geom_hline(
aes(yintercept=mean(gene1), linetype="Population Mean"), #change the legend name here
color="blue", linewidth=0.8) + #not inside `aes` if no defining purpose
#setup legend for a dashed hline
scale_linetype_manual(name ="", values = c("dashed")) + #no legend title
#re-scale y-axis to include a label for the total mean
scale_y_continuous(breaks = round( c( seq(-90,90,by=40), 11.0101), 2), #annotate population mean
name = "Gene1 Expression") + #y-axis title
#setup x-axis title & wrapping
scale_x_discrete(name = "Cancer Types",
labels = label_wrap_gen(width = 2) ) + # change x-axis labels wrapping
## this wont work because no spaces!!
#setup legend (fill of the boxplots = cancer types)
scale_fill_discrete(name = "Cancer Types") +
theme_bw() +
#tilt the x-axis ticks
theme(axis.text.x = element_text(angle = 30, hjust = 1))
print(boxplot)
Switch to a rain-cloud plot to better show the distribution of the groups expression of gene1
library(RColorBrewer)
# Define the number of colors you want
nb.cols <- length(unique(cancer_genes$cancer))
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
plt_rainCloud =
cancer_genes %>% ggplot( aes(x=cancer, y=gene1, fill=cancer) ) +
### make boxplot the middle ###
geom_jitter( aes(color=cancer), size=3,
alpha=0.5,
position = position_nudge(x=-0.2)) + #left 0.2 units
geom_boxplot(width=0.15,
alpha=0.8, outlier.shape = NA) + #middle
see::geom_violinhalf(width=1,
alpha=0.8,
position = position_nudge(x=0.2)) + #right 0.2 units
### add a line for total mean ###
geom_hline(
aes(yintercept=mean(gene1), linetype="Population Mean"), #change the legend name here
color="blue", linewidth=0.8) + #not inside `aes` if no defining purpose
#setup legend for a dashed hline
scale_linetype_manual(name ="", values = c("dashed")) + #no legend title
#re-scale y-axis to include a label for the total mean
scale_y_continuous(breaks = round( c( seq(-90,90,by=40), 11.0101), 2), #annotate population mean
name = "Gene1 Expression") + #y-axis title
#setup x-axis title & wrapping
scale_x_discrete(name = "Cancer Types",
labels = label_wrap_gen(width = 2) ) + # change x-axis labels wrapping
## this wont work because no spaces!!
theme_bw() +
theme(axis.text.x = element_text(angle = 30, hjust = 1, #tilt the x-axis ticks
size=14), #resize x-ticks
axis.title.x = element_text(size=16),# x-title-name
axis.text.y = element_text(size=14), #resize y-ticks
axis.title.y = element_text(size=16),
legend.position = "none")+ #no legends
#color palate manual (pre-defined so short)
scale_color_manual(values = mycolors) + scale_fill_manual(values = mycolors, name="Cancer Types")
print(plt_rainCloud)
Plot the same way, but using goem_rain
(can add jitter
& nudge)
library(RColorBrewer)
# Define the number of colors you want
nb.cols <- length(unique(cancer_genes$cancer))
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
plt_rainCloud =
cancer_genes %>% ggplot( aes(x=cancer, y=gene1, fill=cancer) ) +
##use the package -same setting above
ggrain::geom_rain(point.args = list(size=3, alpha=0.5, aes(color=cancer)),
point.args.pos = rlang::list2(position = position_jitter(width = 0.02, height = 0)),
boxplot.args = list(alpha=0.8, outlier.shape = NA),
boxplot.args.pos = rlang::list2(width = 0.15, position = position_nudge(x = 0.2) ),
violin.args = list(alpha=0.8),
violin.args.pos = rlang::list2(
side = "r", width = 1, position = position_nudge(x = 0.35)),
) +
### add a line for total mean ###
geom_hline(
aes(yintercept=mean(gene1), linetype="Population Mean"), #change the legend name here
color="blue", linewidth=0.8) + #not inside `aes` if no defining purpose
#setup legend for a dashed hline
scale_linetype_manual(name ="", values = c("dashed")) + #no legend title
#re-scale y-axis to include a label for the total mean
scale_y_continuous(breaks = round( c( seq(-90,90,by=40), 11.0101), 2), #annotate population mean
name = "Gene1 Expression") + #y-axis title
#setup x-axis title & wrapping
scale_x_discrete(name = "Cancer Types",
labels = label_wrap_gen(width = 2) ) + # change x-axis labels wrapping
## this wont work because no spaces!!
theme_bw() +
theme(axis.text.x = element_text(angle = -30, vjust=1, hjust = 0.2, #tilt and move the x-axis ticks
size=14, #resize x-ticks
margin=margin(b=10) ), #change bottom margin=10pt
axis.title.x = element_text(size=16),# x-title-name
axis.text.y = element_text(size=14), #resize y-ticks
axis.title.y = element_text(size=16),
legend.position = "none")+ #no legends
#color palate manual (pre-defined so short)
scale_color_manual(values = mycolors) + scale_fill_manual(values = mycolors, name="Cancer Types")
print(plt_rainCloud)