Module 15.a

Author

Vivian Strange

Intro to ggpubr

06/19/2025

Vivian Strange - GEOG 5680

Deliverable: code + notes

Set up workspace

setwd(“~/Desktop/geog5680/Module Deliverables/Module 15”)

list.files()
[1] "gapminderData5.csv"           "m15_script.R"                
[3] "Strange_m15_quarto_files"     "Strange_m15_quarto.html"     
[5] "Strange_m15_quarto.qmd"       "Strange_m15_quarto.rmarkdown"

ggpubr figures

library(ggplot2)
library(ggpubr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggExtra)

Load, read, and examine data

gap = read.csv("gapminderData5.csv")
str(gap)
'data.frame':   1704 obs. of  6 variables:
 $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
 $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
 $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
 $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
 $ gdpPercap: num  779 821 853 836 740 ...

Extract a subset for 2007 data

gap07 = gap |> 
  filter(year == 2007 & continent != "Oceania")

Scatter plots

GDP Per-Capita vs Life Expectancy (2007)

ggplot version

ggplot(gap07, aes(x = gdpPercap, y = lifeExp, col = continent)) + 
  geom_point() + scale_x_log10("GDP per capita ($)") + 
  scale_y_continuous("Life Expectancy (yrs)") + ggtitle("GapMinder Data 2007")

ggpubr version (using ggscatter())

unlike ggplot, the variable names need to be enclosed in quotation marks.
ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007") + 
  xscale("log10", .format = TRUE)

Add labels to each point

ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007", label = "country", repel = TRUE) + 
  xscale("log10", .format = TRUE)

Select a handful of points to label

sel_countries = c("United States", "China", "Germany")
ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007", label = "country", 
          label.select = sel_countries, repel = TRUE) + 
  xscale("log10", .format = TRUE)

Show the distribution of points using a ‘rug’ (position of each observation)

ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007") + 
  xscale("log10", .format = TRUE)

Marginal histogram

library(ggExtra)
p <- ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
               xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
               main = "GapMinder Data 2007") + 
  xscale("log10", .format = TRUE)
ggMarginal(p, type = "histogram")

Add a regression line (note: it automatically fits one per color group)

ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007", add = "reg.line", conf.int = TRUE) + 
  xscale("log10", .format = TRUE)

Add correlations

ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007", add = "reg.line", conf.int = TRUE) + 
  xscale("log10", .format = TRUE) +
  stat_cor(aes(color = continent), method = "spearman")

Add regression line equations

ggscatter(gap07, x = "gdpPercap", y = "lifeExp", col = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)", 
          main = "GapMinder Data 2007", add = "reg.line", conf.int = TRUE) + 
  xscale("log10", .format = TRUE) +
  stat_regline_equation(aes(color = continent))

Histograms (generated by gghistogram())

Distribution of Life Expectancy

gghistogram(gap07, x = "lifeExp", main = "GapMinder Life Expectancy")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

Add a ‘fill’ argument to separate the continents
gghistogram(gap07, x = "lifeExp", fill = "continent", 
            main = "GapMinder Life Expectancy")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

Palettes

15 extra palettes on ggsci

Default Nature Publishing Group (npg) palette:

gghistogram(gap07, x = "lifeExp", fill = "continent", 
            main = "GapMinder Life Expectancy", palette = "npg")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

Try some of the others:

lancet: The Lancet
gghistogram(gap07, x = "lifeExp", fill = "continent", 
            main = "GapMinder Life Expectancy", palette = "lancet")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

jco: Journal of Clinical Oncology
gghistogram(gap07, x = "lifeExp", fill = "continent", 
            main = "GapMinder Life Expectancy", palette = "jco")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

aaas: American Association for the Advancement of Science
gghistogram(gap07, x = "lifeExp", fill = "continent", 
            main = "GapMinder Life Expectancy", palette = "aaas")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

rickandmorty: Self explanatory…
gghistogram(gap07, x = "lifeExp", fill = "continent", 
            main = "GapMinder Life Expectancy", palette = "rickandmorty")
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

See the ggsci help pages for the full list

Density plots

Alternative to histograms

Automatically add transparency to plots using the ggdensity() function.

ggdensity(gap07, x = "lifeExp", fill = "continent", 
          main = "GapMinder Life Expectancy", palette = "jco")

ggpubr is compatible with the facet.by() function

ggdensity(gap07, x = "lifeExp", fill = "continent", 
          main = "GapMinder Life Expectancy", palette = "jco",
          facet.by = "continent")

Useful additions

add a reference line (here the median life expectancy by continent)
add a rug plot (adds ticks below the axis to show where the actual observed values fall)
ggdensity(gap07, x = "lifeExp", fill = "continent", 
          main = "GapMinder Life Expectancy", palette = "jco",
          facet.by = "continent",
          add = "median", rug = TRUE)

Violin plots

Alternative to density plots

plot a mirrored density curve

ggviolin(gap07, x = "continent", y = "lifeExp")

Improvements

Use the jco palette to add a fill by continent.

Overlay a box and whisker plot for each continent to illustrate where the median and quartiles lie.

Add a jitter function which adds the observed values.

ggviolin(gap07, x = "continent", y = "lifeExp", 
         fill = "continent", palette = "jco",
         add = c("boxplot", "jitter"),
         ylab = "Life expectancy (yrs)")

Make the violins horizontal by adding the rotate argument

ggviolin(gap07, x = "continent", y = "lifeExp", 
         fill = "continent", palette = "jco",
         add = c("boxplot", "jitter"),
         ylab = "Life expectancy (yrs)",
         rotate = TRUE)

Bar plots

Bar plot by country using ggbar(): bar height = life expectancy

ggbarplot(gap07,
          x = "country",
          y = "lifeExp")

Modify it:

Add a fill by continent.

Rotate the country labels and reduce the font size.

Add axis labels.

ggbarplot(gap07,
          x = "country",
          y = "lifeExp",
          fill = "continent",
          palette = "jco",
          x.text.angle = 90,
          ylab = "Life expectancy (yrs)",
          xlab = "Country") +
  font("x.text", size = 4)

Order the bars by value using sort.val()

Note: if you set the sort.by.groups argument to true, this will first group all countries from a continent together, then sort them.

ggbarplot(gap07,
          x = "country",
          y = "lifeExp",
          fill = "continent",
          palette = "jco",
          sort.val = "desc",
          sort.by.groups = FALSE,
          x.text.angle = 90,
          ylab = "Life expectancy (yrs)",
          xlab = "Country") +
  font("x.text", size = 4)

Dot Plots and Cleveland Plots

An alternative to bar plots

Note: adding segments forces the origin to zero

ggdotchart(gap07,
           x = "country",
           y = "lifeExp",
           color = "continent",
           palette = "jco",
           sorting = "descending",
           rotate = TRUE,
           group = "continent",
           add = "segments",
           ylab = "Life expectancy (yrs)",
           xlab = "Country") +
  font("y.text", size = 4)

Adding comparisons to plots

Add the results of statistical tests to plots, using functions

Make a new subset of the gap data, including only African and Asian countries for three of the years.

gap_sub = gap %>% 
  filter(continent %in% c("Asia", "Africa"),
         year %in% c(1957, 1982, 2007))
Boxplot of the life expectancy values for the two continents, colored by continent, and with jittered observations overlaid
ggboxplot(gap_sub, x = "continent", y = "lifeExp", 
          ylab = "Years", col = "continent", add = "jitter")

Use stat_compare_means to test if the life expectancy differs between the two continents.

The height at which the comparison results will be placed is specified by the label.y argument

ggboxplot(gap_sub, x = "continent", y = "lifeExp", 
          ylab = "Years", col = "continent", add = "jitter") + 
  stat_compare_means(label.y = 90)

The Wilcoxon rank sum test is the default test for two groups.

Low P-value: the means are different between the two continents.

Elect t-test: specified by argument, method = “t.test”

ggboxplot(gap_sub, x = "continent", y = "lifeExp", 
          ylab = "Years", col = "continent", add = "jitter") + 
  stat_compare_means(method = "t.test", label.y = 90)

Facet boxplots by year: 3 separate p-values allow for comparison.

ggboxplot(gap_sub, x = "continent", y = "lifeExp", 
          ylab = "Years", col = "continent", add = "jitter", facet.by = "year") + 
  stat_compare_means(method = "t.test", label.y = 90)

Multiple Groups

Make boxplots by year (instead of continent), and use the same function to compare the mean by year using ANOVA.

ggboxplot(gap_sub, x = "year", y = "lifeExp") + 
  stat_compare_means(label.y = 80, method = "anova")

Multiple groups = ability to make pairwise & inter-group comparisons.

Make a list of desired comparisons:
- 1957 and 1982
- 1957 and 2007
- 1982 and 2007
comps = list( c('1957', '1982'), 
              c('1957', '2007'), 
              c('1982', '2007'))
comps
[[1]]
[1] "1957" "1982"

[[2]]
[1] "1957" "2007"

[[3]]
[1] "1982" "2007"
Make the boxplot again. Add the stat_compare_mean function (ask for t-tests between all the comparisons in the list comps)
ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years") + 
  stat_compare_means(method = "t.test", comparisons = comps, 
                     bracket.size = .6, size = 4)

include the original comparison based on all three groups using another stat_compare_means function.
ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years") + 
  stat_compare_means(method = "t.test", comparisons = comps, 
                     bracket.size = .6, size = 4) + 
  stat_compare_means(label.y = 110, method = "anova")

Get the same pairwise and grouped comparison, but individually for the two continents

Add a facet.by argument to the ggboxplot function

Note: between 1982 and 2007, there has not been a significant increase in African life expectancy

ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years", facet.by = "continent") + 
  stat_compare_means(method = "t.test", comparisons = comps, 
                     bracket.size = .6, size = 4) + 
  stat_compare_means(label.y = 110, method = "anova")

Choose one of the groups as the reference (instead of listing all the pairwise comparisons), and compare the other groups to this.

Replace the list of comparisons in the first stat_compare_means() function with a reference groups (here, the year 1957):

ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years", facet.by = "continent") + 
  stat_compare_means(method = "t.test", ref.group = "1957") + 
  stat_compare_means(label.y = 110, method = "anova")

The test results overlap, but we can replace this with a simple scheme that uses *’s to represent different levels of significance:
ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years", facet.by = "continent") + 
  stat_compare_means(label = "p.signif", method = "t.test",
                     ref.group = "1957") + 
  stat_compare_means(label.y = 110, method = "anova")