# Load the packages
library(ggplot2)
library(ggpubr)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(ggExtra)

# Load the dataset and check structure
gap <- read.csv("./data/gapminderData5.csv")
str(gap)
'data.frame': 1704 obs. of 6 variables:
$ country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
$ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
$ pop : num 8425333 9240934 10267083 11537966 13079460 ...
$ continent: chr "Asia" "Asia" "Asia" "Asia" ...
$ lifeExp : num 28.8 30.3 32 34 36.1 ...
$ gdpPercap: num 779 821 853 836 740 ...
1. Data Manipulation
# Extract a subset for the year 2007 excluding Oceania
gap07 <- gap %>%
  filter(year == 2007 & continent != "Oceania")
2. Visualizing Bivariate Data
Scatter Plots
# ggplot2: life expectancy against GDP per capita (log10 x-axis),
# coloured by continent
ggplot(gap07, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point() +
  scale_x_log10("GDP per capita ($)") +
  scale_y_continuous("Life Expectancy (yrs)") +
  ggtitle("GapMinder Data 2007")
# Basic ggpubr scatter plot with log-scale x-axis
ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)
# Labeling: annotate every point with its country name; repel = TRUE is
# passed so labels avoid overlapping each other
ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007",
  label = "country", repel = TRUE
) +
  xscale("log10", .format = TRUE)
# Scatter plot with specific country labels
sel_countries <- c("United States", "China", "Germany")

ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007",
  label = "country", label.select = sel_countries, repel = TRUE
) +
  xscale("log10", .format = TRUE)
# The distribution of points can be shown using a "rug" that marks the
# position of each observation along the axes.
# NOTE(review): the original call omitted rug = TRUE even though the comment
# describes a rug; it is enabled here so the plot matches the description.
ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007",
  rug = TRUE
) +
  xscale("log10", .format = TRUE)

# Marginal histogram: build the scatter plot first, then hand it to
# ggMarginal() from ggExtra
library(ggExtra)

p <- ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007"
) +
  xscale("log10", .format = TRUE)

ggMarginal(p, type = "histogram")
# Regression line with confidence interval
ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007",
  add = "reg.line", conf.int = TRUE
) +
  xscale("log10", .format = TRUE)
# Scatter plot with regression lines, confidence intervals, and Spearman
# correlation (one stat_cor() label per continent via the color aesthetic)
ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007",
  add = "reg.line", conf.int = TRUE
) +
  xscale("log10", .format = TRUE) +
  stat_cor(aes(color = continent), method = "spearman")
# Scatter plot with regression equations (one equation per continent via
# the color aesthetic)
ggscatter(
  gap07,
  x = "gdpPercap", y = "lifeExp", color = "continent",
  xlab = "GDP per capita ($)", ylab = "Life expectancy (yrs)",
  main = "GapMinder Data 2007",
  add = "reg.line", conf.int = TRUE
) +
  xscale("log10", .format = TRUE) +
  stat_regline_equation(aes(color = continent))
Histograms & Density Plots
# gghistogram(): bins is set explicitly to the value the function otherwise
# falls back to (30), which avoids the "Using `bins = 30` by default" warning
gghistogram(
  gap07,
  x = "lifeExp", bins = 30,
  main = "GapMinder Life Expectancy"
)
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
# Fill argument: colour the bars by continent. bins is set explicitly to the
# fallback value (30) to avoid the default-bins warning.
gghistogram(
  gap07,
  x = "lifeExp", fill = "continent", bins = 30,
  main = "GapMinder Life Expectancy"
)
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
# Grouped histogram with an academic palette ("npg"). bins is set explicitly
# to the fallback value (30) to avoid the default-bins warning.
gghistogram(
  gap07,
  x = "lifeExp", fill = "continent", bins = 30,
  main = "GapMinder Life Expectancy", palette = "npg"
)
Warning: Using `bins = 30` by default. Pick better value with the argument
`bins`.
# ggdensity(): life-expectancy density curves filled by continent
ggdensity(
  gap07,
  x = "lifeExp", fill = "continent",
  main = "GapMinder Life Expectancy", palette = "jco"
)
# facet.by: one density panel per continent
ggdensity(
  gap07,
  x = "lifeExp", fill = "continent",
  main = "GapMinder Life Expectancy", palette = "jco",
  facet.by = "continent"
)
# Faceted density plot with median reference lines and rug plots
ggdensity(
  gap07,
  x = "lifeExp", fill = "continent",
  main = "GapMinder Life Expectancy", palette = "jco",
  facet.by = "continent",
  add = "median", rug = TRUE
)
Violin Plots
# ggviolin(): life expectancy by continent
ggviolin(gap07, x = "continent", y = "lifeExp")
# Using the jco palette, with a boxplot and jittered points added
ggviolin(
  gap07,
  x = "continent", y = "lifeExp",
  fill = "continent", palette = "jco",
  add = c("boxplot", "jitter"),
  ylab = "Life expectancy (yrs)"
)
# Horizontal violin plot with overlaid boxplot and jittered points
ggviolin(
  gap07,
  x = "continent", y = "lifeExp",
  fill = "continent", palette = "jco",
  add = c("boxplot", "jitter"),
  ylab = "Life expectancy (yrs)",
  rotate = TRUE
)
# Sorted bar plot by descending value. sort.by.groups = FALSE is passed so
# the ordering is across all countries rather than within each continent;
# the x-axis text is shrunk to fit the country names.
ggbarplot(
  gap07,
  x = "country", y = "lifeExp",
  fill = "continent", palette = "jco",
  sort.val = "desc", sort.by.groups = FALSE,
  x.text.angle = 90,
  ylab = "Life expectancy (yrs)", xlab = "Country"
) +
  font("x.text", size = 4)
# Alternative: Cleveland dot chart grouped by continent
ggdotchart(
  gap07,
  x = "country", y = "lifeExp",
  color = "continent", palette = "jco",
  sorting = "descending", rotate = TRUE,
  group = "continent", add = "segments",
  ylab = "Life expectancy (yrs)", xlab = "Country"
) +
  font("y.text", size = 4)
4. Statistical Comparisons on Plots
Two Groups & Multiple Groups
# Data subsetting for comparisons: Asia and Africa in 1957, 1982 and 2007
gap_sub <- gap %>%
  filter(
    continent %in% c("Asia", "Africa"),
    year %in% c(1957, 1982, 2007)
  )

# Boxplot
ggboxplot(
  gap_sub,
  x = "continent", y = "lifeExp", ylab = "Years",
  color = "continent", add = "jitter"
)
# stat_compare_means(): add the comparison label (default method) at y = 90
ggboxplot(
  gap_sub,
  x = "continent", y = "lifeExp", ylab = "Years",
  color = "continent", add = "jitter"
) +
  stat_compare_means(label.y = 90)
# method = "t.test"
ggboxplot(
  gap_sub,
  x = "continent", y = "lifeExp", ylab = "Years",
  color = "continent", add = "jitter"
) +
  stat_compare_means(method = "t.test", label.y = 90)
# Faceted boxplot with group-wise t-test results (one panel per year)
ggboxplot(
  gap_sub,
  x = "continent", y = "lifeExp", ylab = "Years",
  color = "continent", add = "jitter",
  facet.by = "year"
) +
  stat_compare_means(method = "t.test", label.y = 90)
# Using ANOVA to compare life expectancy across the three years
ggboxplot(gap_sub, x = "year", y = "lifeExp") +
  stat_compare_means(label.y = 80, method = "anova")
# Pairwise comparisons list definition
comps <- list(
  c("1957", "1982"),
  c("1957", "2007"),
  c("1982", "2007")
)

# Adding stat_compare_means() with pairwise brackets
ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years") +
  stat_compare_means(
    method = "t.test", comparisons = comps,
    bracket.size = 0.6, size = 4
  )
# Adding a second stat_compare_means(): pairwise t-test brackets plus a
# global ANOVA label at y = 110
ggboxplot(gap_sub, x = "year", y = "lifeExp", ylab = "Years") +
  stat_compare_means(
    method = "t.test", comparisons = comps,
    bracket.size = 0.6, size = 4
  ) +
  stat_compare_means(label.y = 110, method = "anova")
# Multi-group boxplot with pairwise brackets (t-test) and global test
# (ANOVA), faceted by continent
ggboxplot(
  gap_sub,
  x = "year", y = "lifeExp", ylab = "Years",
  facet.by = "continent"
) +
  stat_compare_means(
    method = "t.test", comparisons = comps,
    bracket.size = 0.6, size = 4
  ) +
  stat_compare_means(label.y = 110, method = "anova")
# stat_compare_means() with a reference group: t-tests use ref.group = "1957",
# plus a global ANOVA label at y = 110
ggboxplot(
  gap_sub,
  x = "year", y = "lifeExp", ylab = "Years",
  facet.by = "continent"
) +
  stat_compare_means(method = "t.test", ref.group = "1957") +
  stat_compare_means(label.y = 110, method = "anova")
# Pairwise comparison with reference group using significance symbols
# (*, **, ns) instead of numeric p-values
ggboxplot(
  gap_sub,
  x = "year", y = "lifeExp", ylab = "Years",
  facet.by = "continent"
) +
  stat_compare_means(
    label = "p.signif", method = "t.test", ref.group = "1957"
  ) +
  stat_compare_means(label.y = 110, method = "anova")