exercise15

Author

Hangu Lee

0. Setting

# Load the packages
library(ggplot2)
library(ggpubr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggExtra)

# Read the GapMinder CSV (path is relative to the working directory)
# and print a compact overview of its columns.
gap <- read.csv("./data/gapminderData5.csv")
str(gap)
'data.frame':   1704 obs. of  6 variables:
 $ country  : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
 $ year     : int  1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
 $ pop      : num  8425333 9240934 10267083 11537966 13079460 ...
 $ continent: chr  "Asia" "Asia" "Asia" "Asia" ...
 $ lifeExp  : num  28.8 30.3 32 34 36.1 ...
 $ gdpPercap: num  779 821 853 836 740 ...

1. Data Manipulation

# Keep only the 2007 rows and drop Oceania.
# Comma-separated conditions inside filter() are combined with AND.
gap07 <- gap %>%
  filter(year == 2007, continent != "Oceania")

2. Visualizing Bivariate Data

Scatter Plots

# Plain ggplot2 version: life expectancy against GDP per capita on a
# log10 x-axis, with one colour per continent.
ggplot(gap07, aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  geom_point() +
  scale_x_log10(name = "GDP per capita ($)") +
  scale_y_continuous(name = "Life Expectancy (yrs)") +
  ggtitle("GapMinder Data 2007")

# Basic ggpubr scatter plot. ggscatter() takes column names as strings;
# xscale() then switches the x-axis to log10.
ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)

# Label every point with its country name.
# repel = TRUE requests label placement that avoids overlaps.
ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  label = "country",
  repel = TRUE,
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)

# Label only a hand-picked set of countries: label.select restricts the
# "country" labels to the names listed in sel_countries.
sel_countries <- c("United States", "China", "Germany")
ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  label = "country",
  label.select = sel_countries,
  repel = TRUE,
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)

# The distribution of points can be shown using a 'rug': marks along the
# axes showing the position of each observation.
# rug = TRUE is what turns the rug on; without it this chunk only repeats
# the basic scatter plot.
ggscatter(gap07, x = "gdpPercap", y = "lifeExp", color = "continent",
          xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
          main = "GapMinder Data 2007", rug = TRUE) +
  xscale("log10", .format = TRUE)

# Marginal histograms: build the scatter plot first, store it in `p`,
# then let ggExtra::ggMarginal() attach a histogram to each axis.
library(ggExtra)
p <- ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)
ggMarginal(p, type = "histogram")

# Add a regression line (add = "reg.line") together with its
# confidence band (conf.int = TRUE).
ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  add = "reg.line",
  conf.int = TRUE,
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)

# Regression lines with confidence bands, plus a Spearman correlation
# annotation from stat_cor(), coloured by continent.
ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  add = "reg.line",
  conf.int = TRUE,
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE) +
  stat_cor(aes(color = continent), method = "spearman")

# Regression lines with confidence bands, annotated with the fitted
# equations via stat_regline_equation(), coloured by continent.
ggscatter(
  gap07,
  x = "gdpPercap",
  y = "lifeExp",
  color = "continent",
  add = "reg.line",
  conf.int = TRUE,
  xlab = "GDP per capita ($)",
  ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE) +
  stat_regline_equation(aes(color = continent))

Histograms & Density Plots

# gghistogram(): histogram of 2007 life expectancy for all countries.
# No `bins` value is given, so the default is used.
gghistogram(
  gap07,
  x = "lifeExp",
  main = "GapMinder Life Expectancy"
)
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# fill = "continent" colours the bars by continent.
gghistogram(
  gap07,
  x = "lifeExp",
  fill = "continent",
  main = "GapMinder Life Expectancy"
)
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# Grouped histogram drawn with the "npg" colour palette.
gghistogram(
  gap07,
  x = "lifeExp",
  fill = "continent",
  palette = "npg",
  main = "GapMinder Life Expectancy"
)
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.

# ggdensity(): density curves of life expectancy, filled by continent
# with the "jco" palette.
ggdensity(
  gap07,
  x = "lifeExp",
  fill = "continent",
  palette = "jco",
  main = "GapMinder Life Expectancy"
)

# facet.by = "continent" gives each continent its own panel.
ggdensity(
  gap07,
  x = "lifeExp",
  fill = "continent",
  palette = "jco",
  facet.by = "continent",
  main = "GapMinder Life Expectancy"
)

# Faceted density plot with a median reference line (add = "median")
# and a rug of the individual observations (rug = TRUE).
ggdensity(
  gap07,
  x = "lifeExp",
  fill = "continent",
  palette = "jco",
  facet.by = "continent",
  add = "median",
  rug = TRUE,
  main = "GapMinder Life Expectancy"
)

Violin Plots

# ggviolin(): one violin of life expectancy per continent.
ggviolin(
  gap07,
  x = "continent",
  y = "lifeExp"
)

# Violins filled by continent ("jco" palette), with a boxplot and
# jittered points added to each one.
ggviolin(
  gap07,
  x = "continent",
  y = "lifeExp",
  fill = "continent",
  palette = "jco",
  add = c("boxplot", "jitter"),
  ylab = "Life expectancy (yrs)"
)

# Same violin plot drawn horizontally (rotate = TRUE), still with the
# boxplot and jittered points.
ggviolin(
  gap07,
  x = "continent",
  y = "lifeExp",
  fill = "continent",
  palette = "jco",
  add = c("boxplot", "jitter"),
  rotate = TRUE,
  ylab = "Life expectancy (yrs)"
)

3. Categorical Comparisons

Bar Plots & Cleveland Dot Charts

# ggbarplot(): one bar per country showing its 2007 life expectancy.
ggbarplot(gap07, x = "country", y = "lifeExp")

# Styled bar plot: fill by continent, vertical country labels
# (x.text.angle = 90) and a small font so the names fit on the axis.
ggbarplot(gap07, x = "country", y = "lifeExp",
          fill = "continent", palette = "jco",
          x.text.angle = 90,
          xlab = "Country", ylab = "Life expectancy (yrs)") +
  font("x.text", size = 4)

# Bars sorted by descending life expectancy. sort.by.groups = FALSE
# sorts across all countries rather than within each continent.
ggbarplot(gap07, x = "country", y = "lifeExp",
          fill = "continent", palette = "jco",
          sort.val = "desc", sort.by.groups = FALSE,
          x.text.angle = 90,
          xlab = "Country", ylab = "Life expectancy (yrs)") +
  font("x.text", size = 4)

# Alternative to the bar plot: a Cleveland dot chart, sorted in
# descending order, grouped and coloured by continent, with a segment
# drawn to each dot. rotate = TRUE puts the countries on the vertical
# axis, hence the small "y.text" font.
ggdotchart(gap07, x = "country", y = "lifeExp",
           color = "continent", palette = "jco",
           group = "continent", sorting = "descending",
           add = "segments", rotate = TRUE,
           xlab = "Country", ylab = "Life expectancy (yrs)") +
  font("y.text", size = 4)

4. Statistical Comparisons on Plots

Two Groups & Multiple Groups

# Subset used for the statistical comparisons: Asian and African
# countries in the years 1957, 1982 and 2007.
gap_sub <- gap %>%
  filter(
    continent %in% c("Asia", "Africa") &
      year %in% c(1957, 1982, 2007)
  )

# Boxplot of life expectancy per continent, with the individual
# observations overlaid as jittered points.
ggboxplot(
  gap_sub,
  x = "continent",
  y = "lifeExp",
  color = "continent",
  add = "jitter",
  ylab = "Years"
)

# stat_compare_means() with its default test; label.y = 90 sets the
# y position of the p-value label.
ggboxplot(
  gap_sub,
  x = "continent",
  y = "lifeExp",
  color = "continent",
  add = "jitter",
  ylab = "Years"
) +
  stat_compare_means(label.y = 90)

# Same comparison, but with method = "t.test" requested explicitly.
ggboxplot(
  gap_sub,
  x = "continent",
  y = "lifeExp",
  color = "continent",
  add = "jitter",
  ylab = "Years"
) +
  stat_compare_means(method = "t.test", label.y = 90)

# One panel per year (facet.by = "year"), each with its own
# Asia-vs-Africa t-test result.
ggboxplot(
  gap_sub,
  x = "continent",
  y = "lifeExp",
  color = "continent",
  add = "jitter",
  facet.by = "year",
  ylab = "Years"
) +
  stat_compare_means(method = "t.test", label.y = 90)

# More than two groups: compare life expectancy across the three years
# with a global ANOVA test.
ggboxplot(
  gap_sub,
  x = "year",
  y = "lifeExp"
) +
  stat_compare_means(method = "anova", label.y = 80)

# Every pairwise combination of the three years, as a list of
# two-element character vectors:
#   1957 vs 1982, 1957 vs 2007, 1982 vs 2007
comps <- combn(c("1957", "1982", "2007"), 2, simplify = FALSE)

# Adding stat_compare_means() with `comparisons`: draws a bracket with a
# t-test result for each pair of years listed in comps.
ggboxplot(
  gap_sub,
  x = "year",
  y = "lifeExp",
  ylab = "Years"
) +
  stat_compare_means(
    method = "t.test",
    comparisons = comps,
    bracket.size = .6,
    size = 4
  )

# Adding a second stat_compare_means(): pairwise t-test brackets plus a
# global ANOVA label placed at y = 110.
ggboxplot(
  gap_sub,
  x = "year",
  y = "lifeExp",
  ylab = "Years"
) +
  stat_compare_means(
    method = "t.test",
    comparisons = comps,
    bracket.size = .6,
    size = 4
  ) +
  stat_compare_means(method = "anova", label.y = 110)

# Multi-group boxplot, one panel per continent, with pairwise t-test
# brackets and a global ANOVA label.
ggboxplot(
  gap_sub,
  x = "year",
  y = "lifeExp",
  facet.by = "continent",
  ylab = "Years"
) +
  stat_compare_means(
    method = "t.test",
    comparisons = comps,
    bracket.size = .6,
    size = 4
  ) +
  stat_compare_means(method = "anova", label.y = 110)

# stat_compare_means() with a reference group: ref.group = "1957" is
# passed to the t-test layer, and a global ANOVA label is added on top.
ggboxplot(
  gap_sub,
  x = "year",
  y = "lifeExp",
  facet.by = "continent",
  ylab = "Years"
) +
  stat_compare_means(method = "t.test", ref.group = "1957") +
  stat_compare_means(method = "anova", label.y = 110)

# Reference-group comparison against 1957, labelled with significance
# symbols (*, **, ns) via label = "p.signif" instead of p-values.
ggboxplot(
  gap_sub,
  x = "year",
  y = "lifeExp",
  facet.by = "continent",
  ylab = "Years"
) +
  stat_compare_means(
    method = "t.test",
    ref.group = "1957",
    label = "p.signif"
  ) +
  stat_compare_means(method = "anova", label.y = 110)