1 Important notes

Resource from: https://homepage.stat.uiowa.edu/~luke/classes/STAT4580-2024/ggplot.html

ggplot(data = <DATA>) +
    <GEOM>(mapping = aes(<MAPPINGS>), # Geometric objects
           stat = <STAT>, # Statistical Transformations
           position = <POSITION>) + # Position Adjustments
    < ... MORE GEOMS ... > +
  
    <COORDINATE_ADJUSTMENT> + #coordinate systems
  
    <SCALE_ADJUSTMENT> + #change the scales of x-axis, y-axis, legend/fill, _manual options...
    ## arguments: 
    ## 1. `name`: for title of axis & legend
    ## 2. `breaks`
    ## 3. `labels`: for tick/legend label
    ## 4. `values`: manual inputs
  
    ## source: https://ggplot2-book.org/scales-guides#:~:text=It%20is%20useful%20to%20note,discrete_scale()%20and%20binned_scale()%20.
  
    <FACETING> +
  
    <THEME_ADJUSTMENT> #e.g., tilt for x-ticks, remove legends

2 Preperation

Load Packages

library(ggplot2)
theme_set(theme_bw(base_size = 14)) # change the global setting of ggplot2 bw theme
library(ggrepel)
library(see)

library(ratdat)

library(dplyr)

Import Data

## Data 1
complete_old #individual rodents captured at the Portal Project and the plots they were captured in.
colnames(complete_old)
##  [1] "record_id"       "month"           "day"             "year"           
##  [5] "plot_id"         "species_id"      "sex"             "hindfoot_length"
##  [9] "weight"          "genus"           "species"         "taxa"           
## [13] "plot_type"
## Data 2
# Download files within the WebR environment
# root_srcfile <- "https://raw.githubusercontent.com/DanielEWeeks/HuGen2071/main/"
# download.file(paste0(root_srcfile,"data/study1.csv"),
#               "study1.csv")

## setup study1 -> stored in b data
a = read.csv(file="./11_ggplot2/data/study1.csv")
a$ind <- seq_along(a$t)
b <- a[-c(1001:1004),]
b$g.f <- factor(b$g)
b$geno <- paste(b$all1,b$all2,sep="/")
dim(b)
## [1] 1000   10
head(b)
## another data for representation
cancer_genes = read.csv(file = "11_ggplot2/data/microarray_cancer.csv", header = TRUE)

3 ggplot commands

3.1 basic aesthetics and scale

  1. scale_color_viridis_d(): color blind and gray-scale friendly
ggplot(data=complete_old, aes(x=weight, 
                              y=hindfoot_length, 
                              color=plot_type, shape=sex)) + 
  geom_point(alpha=0.2) +
  scale_color_viridis_d() #color blind and gray-scale friendly
## Warning: Removed 3081 rows containing missing values or values outside the scale range
## (`geom_point()`).

  1. scale_x_log10()
ggplot(data=complete_old, aes(x=weight, 
                              y=hindfoot_length, 
                              color=plot_type, shape=sex)) + #global in the plot, or will need to map again in the geom function
  geom_point(alpha=0.2) +
  scale_x_log10() 
## Warning: Removed 3081 rows containing missing values or values outside the scale range
## (`geom_point()`).


Plot the genetic data - Cases have higher trait value

ggplot(data=b, aes(x=ind, y=t, color=trait)) +
  geom_point()

3.2 boxplot - better use Violin plots or Raincloud plots (include additional density plot & true data points)

  1. scale_x_discrete(labels = label_wrap_gen(width = 10)): label_wrap_gen() will wrap the text of the labels to make them more legible.

ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length, fill=plot_type)) +
  geom_boxplot(alpha=0.8) # can see overlapping fonts
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length, fill=plot_type)) +
  geom_boxplot(alpha=0.8) + # can see overlapping fonts
  scale_x_discrete(labels=label_wrap_gen(width = 10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

  1. geom_point(position = "jitter") = geom_jitter

ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
  geom_boxplot(outlier.shape = NA) + #remove outliers because they are plotted twice by boxplot and geom_jitter
  
  geom_point( aes(color=plot_type), alpha = 0.1, 
              position = "jitter") + #same as geom_jitter
  
  scale_x_discrete( labels=label_wrap_gen(width = 10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).

3.3 Sequence of the commands matters (which geom is later in sequence -> will be on the upper layer)

ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +

  geom_point( aes(color=plot_type), alpha = 0.1, 
              position = "jitter") +
  geom_boxplot(fill=NA, outlier.shape = NA) +
  
  scale_x_discrete( labels=label_wrap_gen(width = 10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).

3.4 Violin plot (density plot)

Change the ploting themes in theme()

  1. theme(axis.title = element_text(size = 14) ): change axis font size,

  2. theme(panel.grid.major.x = element_blank() ): change grid type,

  3. theme(legend.position = "none"): change legend

  4. theme(plot.title = element_text(face = "bold", size=20) ): change font size for specific section of text

ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length, color=plot_type)) +
  geom_jitter(alpha=0.1) +
  geom_violin(fill="white") +
  theme_bw()
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).

# change the font size on the axis
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
  
  geom_jitter(aes(color=plot_type), alpha=0.1) +
  
  geom_violin(aes(color=plot_type), fill="white") + # this combination is better than the sina plot
  
  geom_boxplot(width=0.05, outlier.shape = NA)+ #add boxplot additional to the violin plot
  
  ### these three combinations = rainclouds plot ###
  
  labs(title="Rodent Hindfoot Length by Plot Type", subtitle = "Violin Plot") +
  
  theme_bw() +
  theme(axis.title = element_text(size = 14),
        panel.grid.major.x = element_blank(), #major grid are columns grids
        legend.position = "none",
        plot.title = element_text(face = "bold", size=18) ) + 
  
  scale_x_discrete(labels=label_wrap_gen(width=10)) + #wrap long text
  scale_color_brewer(palette = "Set2") # change the palate
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).

3.5 Raincloud plot

reference: https://github.com/njudd/ggrain

library(ggrain)
## Registered S3 methods overwritten by 'ggpp':
##   method                  from   
##   heightDetails.titleGrob ggplot2
##   widthDetails.titleGrob  ggplot2
p = ggplot(data=complete_old, aes(x=plot_type, y=hindfoot_length)) +
  geom_rain(violin.args = list(alpha = 0.3, aes(fill = plot_type)), 
            boxplot.args = list(alpha = 0.3, aes(fill = plot_type), outlier.shape=NA),
            point.args = list(alpha=0.2) )+ #not plot the points using the same color
            # cov= "plot_type" ) + #much easier
  labs(title="Rodent Hindfoot Length by Plot Type") +

  theme_bw() +
  theme(legend.position = "none",
        panel.grid.major.x = element_blank()) +
  
  scale_x_discrete(labels=label_wrap_gen(width=10))

suppressWarnings(print(p))

p2 = ggplot(data=complete_old, aes(x=1, y=hindfoot_length, fill=plot_type)) +
  geom_rain(alpha=0.3) +
            # cov= "plot_type" ) + #much easier
  labs(title="Merge in 1 Raincloud Plot") +

  theme_bw() +
  theme(legend.position = "none",
        panel.grid.major.x = element_blank()) +
  
  scale_x_discrete(labels=label_wrap_gen(width=10))

suppressWarnings(print(p2))

3.6 Faceting

facet_wrap(vars( <colnames> ),
           nrow, ncol) #seperate by how many rows and columns
  • Note: Although can use facet_wrap(~) notation, it is not functional in functions and loops
ggplot(data = complete_old, aes(x=plot_type, y=hindfoot_length)) +
  geom_jitter(aes(color=plot_type), alpha=0.1) +
  geom_violin(aes(color=plot_type), fill="white") +
  theme_bw() + 
  facet_wrap(vars(sex), nrow=3) + # how many rows/columns of the facet
  scale_x_discrete(label=label_wrap_gen(10))
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: Removed 2733 rows containing missing values or values outside the scale range
## (`geom_point()`).


Another example (genetic data) - if we want to see the trend, separating by case control, and further in different genotype groups, it is hard to see in one graph

  • TRY FACET
ggplot(data=b, aes(x=ind, y=t, color=geno, shape=trait) ) +
  geom_point(alpha=0.8)+
  geom_smooth(aes(linetype=trait), method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data=b, aes(x=ind, y=t, color=trait) ) +
  geom_point(alpha=0.8)+
  # geom_smooth(aes(linetype=trait), method = "lm") + 
  facet_wrap(vars(geno))

3.7 Plotting in Loop and function

fn_histogram = function(df, target_var, color_var, binwidth, facet_by) {
  df %>% 
    ggplot(aes(x={{ target_var }}, fill= {{ color_var }})) + ##{{data variables}}; no ""; usually used in the context of tidyverse
    
    geom_histogram(binwidth = binwidth, alpha=0.8, color="black") + 
    facet_wrap( vars( {{facet_by}} ) ) #ensure that it can be group by multiple variables
}

fn_histogram(df=b, target_var=t, color_var=geno, binwidth=0.5, facet_by=geno)

fn_histogram(df=complete_old, 
             target_var=hindfoot_length, 
             color_var=plot_type, binwidth=0.5, facet_by=plot_type)
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_bin()`).


Notice that there is no fill!

  • This is because the bin is TOO small for proper fill
  • Let’s change the bin-width
fn_histogram(df=complete_old, 
             target_var=hindfoot_length, 
             color_var=plot_type, binwidth=2, facet_by=plot_type)
## Warning: Removed 2733 rows containing non-finite outside the scale range
## (`stat_bin()`).

3.8 Usage of scale_?_?()

Data: cancer_genes

Purpose: ANOVA, comparing (cancer) group means if the same

Note: calculating the total/population mean

\[ \mu_{\ population} = \frac{n_{group1} * \mu_{\ group1} + n_{group2} * \mu_{\ group2} + \dots + n_{group\ k} * \mu_{\ group\ k}} {N_{\ population}} \]

\[ \mu_{\ population} = \frac{ \sum_{i=1}^{i=k}\ n_{\ group_{i}} * \mu_{\ group_{i}} } {N_{\ population} } \] - Where \(N_{\ population} = \sum n_{\ group_{i}}\)

Which is \[\mu_{\ population} = \sum\ p_{\ group_{i}} * \mu_{\ group_{i}} \]

# calculate numerical summary
summary = cancer_genes %>% group_by(cancer) %>%
  summarise(
    N = n(),
    mean = mean(gene1),
    median = median(gene1),
    std = sd(gene1),
    variance = var(gene1) )

summary
# calculate the total/population mean
## method1
mean(cancer_genes$gene1)
## [1] 11.0101
## method2
# mu = sum(group_n * group_mean) / total_n
# mu = sum(proportion_group_n * group_mean)
sum(summary$N * summary$mean) / nrow(cancer_genes)
## [1] 11.0101

Visualization

  • For each cancer type -> compare the group means (side-by-side boxplot)
library(ggrepel)

boxplot = 
  cancer_genes %>% ggplot( aes(x=cancer, y=gene1) ) +
  #boxplot
    geom_boxplot( aes(fill=cancer) ) +
    
  #add a line for total mean
    geom_hline(
      aes(yintercept=mean(gene1), linetype="Population Mean"), #change the legend name here 
          color="blue", linewidth=0.8) + #not inside `aes` if no defining purpose
  #setup legend for a dashed hline
    scale_linetype_manual(name ="", values = c("dashed")) + #no legend title
  #re-scale y-axis to include a label for the total mean
    scale_y_continuous(breaks = round( c( seq(-90,90,by=40), 11.0101), 2), #annotate population mean
                       name = "Gene1 Expression") + #y-axis title
  
  #setup x-axis title & wrapping
    scale_x_discrete(name = "Cancer Types",
                     labels = label_wrap_gen(width = 2) ) +  # change x-axis labels wrapping
                                                             ## this wont work because no spaces!!
  #setup legend (fill of the boxplots = cancer types)
    scale_fill_discrete(name = "Cancer Types") + 
    theme_bw() + 
  #tilt the x-axis ticks
    theme(axis.text.x = element_text(angle = 30, hjust = 1))

print(boxplot)

Switch to a rain-cloud plot to better show the distribution of the groups expression of gene1

library(RColorBrewer)
# Define the number of colors you want
nb.cols <- length(unique(cancer_genes$cancer))
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)

plt_rainCloud = 
  cancer_genes %>% ggplot( aes(x=cancer, y=gene1, fill=cancer) ) +
  
  ### make boxplot the middle ###
    geom_jitter( aes(color=cancer), size=3, 
                 alpha=0.5, 
                 position = position_nudge(x=-0.2)) + #left 0.2 units
  
    geom_boxplot(width=0.15, 
                 alpha=0.8, outlier.shape = NA) + #middle
  
    see::geom_violinhalf(width=1, 
                         alpha=0.8, 
                         position = position_nudge(x=0.2)) + #right 0.2 units
    
  
  ### add a line for total mean ###
    geom_hline(
      aes(yintercept=mean(gene1), linetype="Population Mean"), #change the legend name here 
          color="blue", linewidth=0.8) + #not inside `aes` if no defining purpose
  #setup legend for a dashed hline
    scale_linetype_manual(name ="", values = c("dashed")) + #no legend title
  
  #re-scale y-axis to include a label for the total mean
    scale_y_continuous(breaks = round( c( seq(-90,90,by=40), 11.0101), 2), #annotate population mean
                       name = "Gene1 Expression") + #y-axis title
  
  #setup x-axis title & wrapping
    scale_x_discrete(name = "Cancer Types",
                     labels = label_wrap_gen(width = 2) ) +  # change x-axis labels wrapping
                                                             ## this wont work because no spaces!!
    theme_bw() + 
    theme(axis.text.x = element_text(angle = 30, hjust = 1, #tilt the x-axis ticks
                                     size=14), #resize x-ticks
          axis.title.x = element_text(size=16),# x-title-name
          
          axis.text.y = element_text(size=14), #resize y-ticks
          axis.title.y = element_text(size=16),
          
          legend.position = "none")+ #no legends

  #color palate manual (pre-defined so short)
    scale_color_manual(values = mycolors) + scale_fill_manual(values = mycolors, name="Cancer Types")

print(plt_rainCloud)

Plot the same way, but using goem_rain (can add jitter & nudge)

library(RColorBrewer)
# Define the number of colors you want
nb.cols <- length(unique(cancer_genes$cancer))
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)

plt_rainCloud = 
  cancer_genes %>% ggplot( aes(x=cancer, y=gene1, fill=cancer) ) +
  
  ##use the package -same setting above
  ggrain::geom_rain(point.args = list(size=3, alpha=0.5, aes(color=cancer)), 
                    point.args.pos = rlang::list2(position = position_jitter(width = 0.02, height = 0)),
    
                    boxplot.args = list(alpha=0.8, outlier.shape = NA),
                    boxplot.args.pos = rlang::list2(width = 0.15, position = position_nudge(x = 0.2) ),
                    
                    violin.args = list(alpha=0.8), 
                    violin.args.pos = rlang::list2(
                      side = "r", width = 1, position = position_nudge(x = 0.35)), 
                    ) +

  
  ### add a line for total mean ###
    geom_hline(
      aes(yintercept=mean(gene1), linetype="Population Mean"), #change the legend name here 
          color="blue", linewidth=0.8) + #not inside `aes` if no defining purpose
  #setup legend for a dashed hline
    scale_linetype_manual(name ="", values = c("dashed")) + #no legend title
  
  #re-scale y-axis to include a label for the total mean
    scale_y_continuous(breaks = round( c( seq(-90,90,by=40), 11.0101), 2), #annotate population mean
                       name = "Gene1 Expression") + #y-axis title
  
  #setup x-axis title & wrapping
    scale_x_discrete(name = "Cancer Types",
                     labels = label_wrap_gen(width = 2) ) +  # change x-axis labels wrapping
                                                             ## this wont work because no spaces!!
    theme_bw() + 
    theme(axis.text.x = element_text(angle = -30, vjust=1, hjust = 0.2, #tilt and move the x-axis ticks
                                     size=14, #resize x-ticks
                                     margin=margin(b=10) ), #change bottom margin=10pt
          axis.title.x = element_text(size=16),# x-title-name
          
          axis.text.y = element_text(size=14), #resize y-ticks
          axis.title.y = element_text(size=16),
          
          legend.position = "none")+ #no legends

  #color palate manual (pre-defined so short)
    scale_color_manual(values = mycolors) + scale_fill_manual(values = mycolors, name="Cancer Types")

print(plt_rainCloud)