In this exercise, we learn more functions to help us tidy-transform and visualize our data using the gapminder data set.

Brief overview

Some of the verbs we will encounter:

  • select() chooses variables to retain or remove
  • filter() keeps the subset of the data that satisfies the specified condition
  • pivot_wider() reshapes the data to wide format, increasing the number of columns and decreasing number of rows
  • summarize() collapses each group defined by group_by() into a single row holding the requested summary statistic(s)
  • mutate() generates new variables
  • slice() subsets rows using their positions, lets you index rows by their (integer) locations
  • arrange() sorts the data. Default is ascending
# Call in libraries 
library(tidyverse)
library(gapminder)
library(ggplot2)
library(scales) # to suppress scientific notation on the x axis in plots 
# Read in data
data(gapminder)

Data preview

head(gapminder)
## # A tibble: 6 x 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
tail(gapminder)
## # A tibble: 6 x 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.

Data structure and summary

str(gapminder)
## tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

More descriptives

  • There are 142 countries in the data set
# Count the distinct countries present in the data
summarise(gapminder, n = n_distinct(country))
## # A tibble: 1 x 1
##       n
##   <int>
## 1   142
  • These are the years found in the data set
# distinct() returns one row per unique year. summarise() is meant to return
# one row per group; multi-row results from it are deprecated in dplyr 1.1.0.
gapminder %>% distinct(year)
## # A tibble: 12 x 1
##     year
##    <int>
##  1  1952
##  2  1957
##  3  1962
##  4  1967
##  5  1972
##  6  1977
##  7  1982
##  8  1987
##  9  1992
## 10  1997
## 11  2002
## 12  2007

Population

  • Calculate each country’s average population from 1952 to 2007
# Mean population per country across all years in the data
summarize(
  group_by(gapminder, country),
  mean_pop = mean(pop)
)
## # A tibble: 142 x 2
##    country      mean_pop
##    <fct>           <dbl>
##  1 Afghanistan 15823715.
##  2 Albania      2580249.
##  3 Algeria     19875406.
##  4 Angola       7309390.
##  5 Argentina   28602240.
##  6 Australia   14649312.
##  7 Austria      7583298.
##  8 Bahrain       373913.
##  9 Bangladesh  90755395.
## 10 Belgium      9725119.
## # ... with 132 more rows
  • Find out each country’s smallest population size
# Smallest recorded population per country (one row per country)
summarize(
  group_by(gapminder, country),
  min_pop = min(pop)
)
## # A tibble: 142 x 2
##    country      min_pop
##    <fct>          <int>
##  1 Afghanistan  8425333
##  2 Albania      1282697
##  3 Algeria      9279525
##  4 Angola       4232095
##  5 Argentina   17876956
##  6 Australia    8691212
##  7 Austria      6927772
##  8 Bahrain       120447
##  9 Bangladesh  46886859
## 10 Belgium      8730405
## # ... with 132 more rows
  • Note what happens when you use mutate() to generate the summary statistic
# mutate() keeps every original row and repeats the country's minimum
# population on each of them, instead of collapsing to one row per country
mutate(
  group_by(gapminder, country),
  min_pop = min(pop)
)
## # A tibble: 1,704 x 7
## # Groups:   country [142]
##    country     continent  year lifeExp      pop gdpPercap min_pop
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>   <int>
##  1 Afghanistan Asia       1952    28.8  8425333      779. 8425333
##  2 Afghanistan Asia       1957    30.3  9240934      821. 8425333
##  3 Afghanistan Asia       1962    32.0 10267083      853. 8425333
##  4 Afghanistan Asia       1967    34.0 11537966      836. 8425333
##  5 Afghanistan Asia       1972    36.1 13079460      740. 8425333
##  6 Afghanistan Asia       1977    38.4 14880372      786. 8425333
##  7 Afghanistan Asia       1982    39.9 12881816      978. 8425333
##  8 Afghanistan Asia       1987    40.8 13867957      852. 8425333
##  9 Afghanistan Asia       1992    41.7 16317921      649. 8425333
## 10 Afghanistan Asia       1997    41.8 22227415      635. 8425333
## # ... with 1,694 more rows
  • The above does not tell us too much.
  • Say we wish to know in which year a country’s population was the smallest
  • The code below lets us browse all columns for only the rows containing each country’s smallest population size
# For each country keep the single row holding its smallest population;
# with_ties = FALSE returns exactly one row per group, like which.min()
gapminder %>%
  group_by(country) %>%
  slice_min(pop, n = 1, with_ties = FALSE)
## # A tibble: 142 x 6
## # Groups:   country [142]
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Albania     Europe     1952    55.2  1282697     1601.
##  3 Algeria     Africa     1952    43.1  9279525     2449.
##  4 Angola      Africa     1952    30.0  4232095     3521.
##  5 Argentina   Americas   1952    62.5 17876956     5911.
##  6 Australia   Oceania    1952    69.1  8691212    10040.
##  7 Austria     Europe     1952    66.8  6927772     6137.
##  8 Bahrain     Asia       1952    50.9   120447     9867.
##  9 Bangladesh  Asia       1952    37.5 46886859      684.
## 10 Belgium     Europe     1952    68    8730405     8343.
## # ... with 132 more rows
  • Let’s say we don’t want to generate a new variable. We can choose to sort the data as desired and browse
  • Be sure to specify .by_group = TRUE to sort rows within each country rather than across the entire data frame
# Sort by population within each country. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
gapminder %>%
  group_by(country) %>%
  arrange(pop, .by_group = TRUE)
## # A tibble: 1,704 x 6
## # Groups:   country [142]
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1982    39.9 12881816      978.
##  6 Afghanistan Asia       1972    36.1 13079460      740.
##  7 Afghanistan Asia       1987    40.8 13867957      852.
##  8 Afghanistan Asia       1977    38.4 14880372      786.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # ... with 1,694 more rows

Life expectancy

  • Find and plot the average life expectancy by continent and year
  • Here we save the output that we can use to create a plot with
# Average life expectancy per continent and year, highest values first.
# .groups = "drop_last" states the default regrouping (by continent) explicitly,
# so the result is unchanged and summarize() no longer has to announce it.
out <- gapminder %>%
  group_by(continent, year) %>%
  summarize(avglife = mean(lifeExp), .groups = "drop_last") %>%
  arrange(desc(avglife)) # sort in descending order
out
## # A tibble: 60 x 3
## # Groups:   continent [5]
##    continent  year avglife
##    <fct>     <int>   <dbl>
##  1 Oceania    2007    80.7
##  2 Oceania    2002    79.7
##  3 Oceania    1997    78.2
##  4 Europe     2007    77.6
##  5 Oceania    1992    76.9
##  6 Europe     2002    76.7
##  7 Europe     1997    75.5
##  8 Oceania    1987    75.3
##  9 Europe     1992    74.4
## 10 Oceania    1982    74.3
## # ... with 50 more rows
# One line per continent tracing its average life expectancy across years
ggplot(data = out, mapping = aes(x = year, y = avglife, colour = continent)) +
  geom_line(size = 2) +
  labs(title = "Average life expectancy over time")

  • Look closely at Oceania, which seems to have the highest average life expectancy
# Keep only the Oceania rows for a closer look
out2 <- gapminder %>% filter(continent == "Oceania")
head(out2)
## # A tibble: 6 x 6
##   country   continent  year lifeExp      pop gdpPercap
##   <fct>     <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Australia Oceania    1952    69.1  8691212    10040.
## 2 Australia Oceania    1957    70.3  9712569    10950.
## 3 Australia Oceania    1962    70.9 10794968    12217.
## 4 Australia Oceania    1967    71.1 11872264    14526.
## 5 Australia Oceania    1972    71.9 13177000    16789.
## 6 Australia Oceania    1977    73.5 14074100    18334.
tail(out2)
## # A tibble: 6 x 6
##   country     continent  year lifeExp     pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>   <int>     <dbl>
## 1 New Zealand Oceania    1982    73.8 3210650    17632.
## 2 New Zealand Oceania    1987    74.3 3317166    19007.
## 3 New Zealand Oceania    1992    76.3 3437674    18363.
## 4 New Zealand Oceania    1997    77.6 3676187    21050.
## 5 New Zealand Oceania    2002    79.1 3908037    23190.
## 6 New Zealand Oceania    2007    80.2 4115771    25185.
  • Oceania contains only two countries in this data set (Australia and New Zealand), so we plot each one separately
# Life expectancy of each Oceania country over time. These are the raw
# per-country values from out2, not averages, so the title says so.
ggplot(out2, aes(x = year, y = lifeExp, color = country)) +
  geom_line(size = 1.2) +
  ggtitle("Life expectancy in Oceania over time, by country")

  • In any given year, find out which country had the highest life expectancy in its continent
# For every year/continent pair keep the row with the highest life expectancy,
# then order the result by continent
one <- arrange(
  slice(
    group_by(gapminder, year, continent),
    which.max(lifeExp)
  ),
  continent
)
one
## # A tibble: 60 x 6
## # Groups:   year, continent [60]
##    country   continent  year lifeExp    pop gdpPercap
##    <fct>     <fct>     <int>   <dbl>  <int>     <dbl>
##  1 Reunion   Africa     1952    52.7 257700     2719.
##  2 Mauritius Africa     1957    58.1 609816     2034.
##  3 Mauritius Africa     1962    60.2 701016     2529.
##  4 Mauritius Africa     1967    61.6 789309     2475.
##  5 Reunion   Africa     1972    64.3 461633     5048.
##  6 Reunion   Africa     1977    67.1 492095     4320.
##  7 Reunion   Africa     1982    69.9 517810     5267.
##  8 Reunion   Africa     1987    71.9 562035     5303.
##  9 Reunion   Africa     1992    73.6 622191     6101.
## 10 Reunion   Africa     1997    74.8 684810     6072.
## # ... with 50 more rows
  • Then count up the number of times this country held the highest life expectancy
# Tally, within each continent, how many times each country appears in `one`
count(
  group_by(one, continent),
  country
)
## # A tibble: 11 x 3
## # Groups:   continent [5]
##    continent country         n
##    <fct>     <fct>       <int>
##  1 Africa    Mauritius       3
##  2 Africa    Reunion         9
##  3 Americas  Canada         12
##  4 Asia      Israel          3
##  5 Asia      Japan           9
##  6 Europe    Iceland         6
##  7 Europe    Norway          1
##  8 Europe    Sweden          3
##  9 Europe    Switzerland     2
## 10 Oceania   Australia       9
## 11 Oceania   New Zealand     3
  • The above tells us that within Europe, for instance, Iceland maintained the highest life expectancy for 6 out of the 12 years in the data set.

  • Note - The previous two code chunks can be combined, as shown below, but this might be too unwieldy to keep track of what’s being tabulated

gapminder %>%
  # step 1: row with the highest life expectancy per year and continent
  group_by(year, continent) %>%
  slice(which.max(lifeExp)) %>%
  arrange(continent) %>%
  # step 2: how often each country holds that top spot within its continent
  group_by(continent) %>%
  count(country)

2007: Deep dive

  • Keep only the rows for 2007 and save them. We use the saved data for plots
# Subset of the data restricted to the year 2007; reused by the plots below
gapminder_2007 <- filter(gapminder, year == 2007)

Population and GDP per capita

# Scatter plot of population against GDP per capita, coloured by continent
ggplot(
  data = gapminder_2007,
  mapping = aes(x = gdpPercap, y = pop, colour = continent)
) +
  geom_point() +
  scale_y_continuous(labels = comma) + # comma-formatted y-axis labels
  labs(title = "Population and GDP per capita in 2007")

  • Redo the plot after removing outliers (the three countries with the largest populations) using filter()
# Drop the three most populous countries so the remaining points are readable
gapminder_2007_a <- gapminder_2007 %>%
  filter(!(country %in% c("China", "India", "United States")))
# Same plot as before; the subtitle records which countries were excluded so
# this figure can be told apart from the unfiltered one
ggplot(gapminder_2007_a, aes(x = gdpPercap, y = pop, color = continent)) +
  geom_point() +
  scale_y_continuous(labels = comma) +
  ggtitle("Population and GDP per capita in 2007",
          subtitle = "Excluding China, India and the United States")

  • Log-transform both axes to reduce the skew
  • Let color represent continent and add a size aesthetic to represent a country’s population
# Population (x) against GDP per capita (y), both on log10 scales;
# point colour encodes continent and point size encodes population
ggplot(
  data = gapminder_2007,
  mapping = aes(x = pop, y = gdpPercap, colour = continent, size = pop)
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Population and GDP per capita in 2007",
    x = "Population (log scale)",
    y = "GDP per capita (log scale)"
  )

Per capita GDP and life expectancy

# GDP per capita (log10 x-axis) against life expectancy;
# colour encodes continent and point size encodes population
ggplot(
  data = gapminder_2007,
  mapping = aes(x = gdpPercap, y = lifeExp, colour = continent, size = pop)
) +
  geom_point(alpha = 0.5) +
  scale_x_log10() +
  labs(
    title = "Per capita GDP and life expectancy, 2007",
    x = "GDP per capita (log scale)",
    y = "Life expectancy"
  )

Population and life expectancy

# Population (log10 x-axis) against life expectancy;
# colour encodes continent and point size encodes population
ggplot(
  data = gapminder_2007,
  mapping = aes(x = pop, y = lifeExp, colour = continent, size = pop)
) +
  geom_point() +
  scale_x_log10() +
  labs(
    title = "Population and life expectancy, 2007",
    x = "Population (log scale)",
    y = "Life expectancy"
  )

  • Alternatively, plot panels by continent
# Same relationship, drawn as one panel per continent
ggplot(data = gapminder_2007, mapping = aes(x = pop, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(vars(continent))

Correlations

library(ggcorrplot) # to plot a correlation matrix 
  • Produce the correlation matrix to visualize
# NOTE(review): `pct_change` is not defined in the visible part of this file.
# Presumably it holds one row per country with the percent changes in
# lifeExp, pop and gdpPercap in columns 2-4 -- confirm where it is created.
# Pairwise correlations of those three columns, rounded to two decimals.
corr <- pct_change[, 2:4] %>%
  cor() %>%
  round(2)
corr
##           lifeExp   pop gdpPercap
## lifeExp      1.00  0.39     -0.01
## pop          0.39  1.00     -0.20
## gdpPercap   -0.01 -0.20      1.00
  • Compute the matrix of p-values for the correlations
# p-values for the same three columns that were used to build `corr`
p.mat <- pct_change[, 2:4] %>% cor_pmat()
p.mat
##                lifeExp          pop gdpPercap
## lifeExp   0.000000e+00 1.952492e-06 0.8953042
## pop       1.952492e-06 0.000000e+00 0.0152992
## gdpPercap 8.953042e-01 1.529920e-02 0.0000000
  • Produce a graphical display of a correlation matrix
# Lower-triangle correlation plot; cells whose p-value is not below
# sig.level are left blank
ggcorrplot(
  corr,
  type = "lower",
  p.mat = p.mat,
  sig.level = 0.05,
  insig = "blank", # insignificant coefficients are displayed as blank
  lab = TRUE, # adds correlation coefficient to the plot
  colors = c("#6D9EC1", "white", "#E46726"),
  outline.col = "white",
  ggtheme = ggplot2::theme_gray,
  title = "Correlation matrix of percent changes in \n population, life expectancy and per capita GDP "
)

  • A table can just as meaningfully convey correlations when there are few variables
  • Here we introduce the corstarsl() function, which is adapted from one posted on this forum
#' Lower-triangle correlation table with significance stars
#'
#' Computes pairwise correlations with Hmisc::rcorr() and returns them as text
#' rounded to two decimals, each followed by a significance marker:
#' "***" for p < .001, "**" for p < .01, "*" for p < .05.
#'
#' @param x A numeric matrix or data frame; it is coerced with as.matrix().
#' @return A data frame of character cells. Only the strictly lower triangle
#'   is filled; the diagonal and upper triangle are empty strings, and the
#'   last (entirely empty) column is dropped. Row names are the column names
#'   of `x`.
corstarsl <- function(x) {
  # Call Hmisc through its namespace instead of attaching it: attaching Hmisc
  # from inside a function would mask dplyr::summarize() for the rest of the
  # session, and require() would only warn if the package were missing.
  if (!requireNamespace("Hmisc", quietly = TRUE)) {
    stop("Package 'Hmisc' is required by corstarsl()", call. = FALSE)
  }
  x <- as.matrix(x)

  # Run rcorr() once and reuse both the coefficients and the p-values
  fit <- Hmisc::rcorr(x)
  R <- fit$r
  p <- fit$P

  ## define notations for significance levels; spacing is important.
  mystars <- ifelse(p < .001, "***", ifelse(p < .01, "** ", ifelse(p < .05, "* ", " ")))

  ## round the correlations to two decimals and format them as text; the
  ## temporary -1.11 column is formatted together with R and then dropped
  R <- format(round(cbind(rep(-1.11, ncol(x)), R), 2))[, -1]

  ## build a new matrix that pairs each correlation with its stars
  Rnew <- matrix(paste0(R, mystars), ncol = ncol(x))
  diag(Rnew) <- paste0(diag(R), " ")
  rownames(Rnew) <- colnames(x)
  colnames(Rnew) <- colnames(x)

  ## blank out the diagonal and the upper triangle
  Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
  Rnew <- as.data.frame(Rnew)

  ## drop the last column (empty after blanking) and return the data frame.
  ## seq_len() replaces `1:length(Rnew)-1`, which parses as
  ## `(1:length(Rnew)) - 1` and only worked because index 0 is ignored.
  Rnew[seq_len(ncol(Rnew) - 1L)]
}
  • Correlations between population, life expectancy and per capita GDP in 1952
# Restrict to 1952, then tabulate correlations among columns 4-6
# (lifeExp, pop, gdpPercap)
gapminder_1952 <- filter(gapminder, year == 1952)
corstarsl(gapminder_1952[, 4:6])
##            lifeExp    pop
## lifeExp                  
## pop          0.00        
## gdpPercap  0.28*** -0.03
  • Correlations between population, life expectancy and per capita GDP in 2007
# Same table for 2007; columns 4-6 of gapminder are lifeExp, pop and gdpPercap
corstarsl(gapminder_2007[, c("lifeExp", "pop", "gdpPercap")])
##            lifeExp    pop
## lifeExp                  
## pop          0.05        
## gdpPercap  0.68*** -0.06