In this exercise, we learn more functions to help us tidy-transform and visualize our data using the gapminder data set.
Some of the verbs we will encounter:
- select() chooses variables to retain or remove
- filter() keeps the subset of the data that meets the specified condition
- pivot_wider() reshapes the data to wide format, increasing the number of columns and decreasing the number of rows
- summarize() produces a single-row summary for each group/unit/entity specified in group_by() for which we wish to calculate the summary statistic
- mutate() generates new variables
- slice() subsets rows using their positions; it lets you index rows by their (integer) locations
- arrange() sorts the data; the default order is ascending
# Call in libraries
library(tidyverse)
library(gapminder)
library(ggplot2)
library(scales) # provides comma(), used to suppress scientific notation on axis labels in plots
# Read in data
data(gapminder)
Data preview
head(gapminder)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
tail(gapminder)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1982 60.4 7636524 789.
## 2 Zimbabwe Africa 1987 62.4 9216418 706.
## 3 Zimbabwe Africa 1992 60.4 10704340 693.
## 4 Zimbabwe Africa 1997 46.8 11404948 792.
## 5 Zimbabwe Africa 2002 40.0 11926563 672.
## 6 Zimbabwe Africa 2007 43.5 12311143 470.
Data structure and summary
str(gapminder)
## tibble [1,704 x 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
summary(gapminder)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
gapminder %>% summarise(n = n_distinct(country))
## # A tibble: 1 x 1
## n
## <int>
## 1 142
# List the distinct years covered by the data.
# distinct() is used rather than summarise(year = unique(year)): summarise()
# is meant to return one row per group, and returning several rows from it is
# deprecated as of dplyr 1.1.0.
gapminder %>% distinct(year)
## # A tibble: 12 x 1
## year
## <int>
## 1 1952
## 2 1957
## 3 1962
## 4 1967
## 5 1972
## 6 1977
## 7 1982
## 8 1987
## 9 1992
## 10 1997
## 11 2002
## 12 2007
gapminder %>%
group_by(country) %>%
summarize(mean_pop = mean(pop))
## # A tibble: 142 x 2
## country mean_pop
## <fct> <dbl>
## 1 Afghanistan 15823715.
## 2 Albania 2580249.
## 3 Algeria 19875406.
## 4 Angola 7309390.
## 5 Argentina 28602240.
## 6 Australia 14649312.
## 7 Austria 7583298.
## 8 Bahrain 373913.
## 9 Bangladesh 90755395.
## 10 Belgium 9725119.
## # ... with 132 more rows
gapminder %>%
group_by(country) %>%
summarize(min_pop = min(pop))
## # A tibble: 142 x 2
## country min_pop
## <fct> <int>
## 1 Afghanistan 8425333
## 2 Albania 1282697
## 3 Algeria 9279525
## 4 Angola 4232095
## 5 Argentina 17876956
## 6 Australia 8691212
## 7 Austria 6927772
## 8 Bahrain 120447
## 9 Bangladesh 46886859
## 10 Belgium 8730405
## # ... with 132 more rows
# Use mutate() to generate the summary statistic: every row is kept and each
# country's minimum population is added as the new column min_pop.
gapminder %>%
  group_by(country) %>%
  mutate(min_pop = min(pop))
## # A tibble: 1,704 x 7
## # Groups: country [142]
## country continent year lifeExp pop gdpPercap min_pop
## <fct> <fct> <int> <dbl> <int> <dbl> <int>
## 1 Afghanistan Asia 1952 28.8 8425333 779. 8425333
## 2 Afghanistan Asia 1957 30.3 9240934 821. 8425333
## 3 Afghanistan Asia 1962 32.0 10267083 853. 8425333
## 4 Afghanistan Asia 1967 34.0 11537966 836. 8425333
## 5 Afghanistan Asia 1972 36.1 13079460 740. 8425333
## 6 Afghanistan Asia 1977 38.4 14880372 786. 8425333
## 7 Afghanistan Asia 1982 39.9 12881816 978. 8425333
## 8 Afghanistan Asia 1987 40.8 13867957 852. 8425333
## 9 Afghanistan Asia 1992 41.7 16317921 649. 8425333
## 10 Afghanistan Asia 1997 41.8 22227415 635. 8425333
## # ... with 1,694 more rows
gapminder %>%
group_by(country) %>%
slice(which.min(pop))
## # A tibble: 142 x 6
## # Groups: country [142]
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Albania Europe 1952 55.2 1282697 1601.
## 3 Algeria Africa 1952 43.1 9279525 2449.
## 4 Angola Africa 1952 30.0 4232095 3521.
## 5 Argentina Americas 1952 62.5 17876956 5911.
## 6 Australia Oceania 1952 69.1 8691212 10040.
## 7 Austria Europe 1952 66.8 6927772 6137.
## 8 Bahrain Asia 1952 50.9 120447 9867.
## 9 Bangladesh Asia 1952 37.5 46886859 684.
## 10 Belgium Europe 1952 68 8730405 8343.
## # ... with 132 more rows
# Use .by_group = TRUE to sort rows within country and not the entire data frame
gapminder %>%
  group_by(country) %>%
  arrange(pop, .by_group = TRUE)
## # A tibble: 1,704 x 6
## # Groups: country [142]
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1982 39.9 12881816 978.
## 6 Afghanistan Asia 1972 36.1 13079460 740.
## 7 Afghanistan Asia 1987 40.8 13867957 852.
## 8 Afghanistan Asia 1977 38.4 14880372 786.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # ... with 1,694 more rows
# Average life expectancy by continent and year
out <- gapminder %>%
  group_by(continent, year) %>%
  # .groups = "drop_last" spells out summarize()'s default regrouping (the
  # result stays grouped by continent) and silences the regrouping message
  summarize(avglife = mean(lifeExp), .groups = "drop_last") %>%
  arrange(desc(avglife)) # sort in descending order
out
## # A tibble: 60 x 3
## # Groups: continent [5]
## continent year avglife
## <fct> <int> <dbl>
## 1 Oceania 2007 80.7
## 2 Oceania 2002 79.7
## 3 Oceania 1997 78.2
## 4 Europe 2007 77.6
## 5 Oceania 1992 76.9
## 6 Europe 2002 76.7
## 7 Europe 1997 75.5
## 8 Oceania 1987 75.3
## 9 Europe 1992 74.4
## 10 Oceania 1982 74.3
## # ... with 50 more rows
ggplot(out, aes(x=year, y=avglife, color=continent)) +
geom_line(size=2) +
ggtitle("Average life expectancy over time")
# Keep only the rows whose continent is Oceania, then preview the result
out2 <- gapminder %>%
  filter(continent == "Oceania")
head(out2)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Australia Oceania 1952 69.1 8691212 10040.
## 2 Australia Oceania 1957 70.3 9712569 10950.
## 3 Australia Oceania 1962 70.9 10794968 12217.
## 4 Australia Oceania 1967 71.1 11872264 14526.
## 5 Australia Oceania 1972 71.9 13177000 16789.
## 6 Australia Oceania 1977 73.5 14074100 18334.
tail(out2)
## # A tibble: 6 x 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 New Zealand Oceania 1982 73.8 3210650 17632.
## 2 New Zealand Oceania 1987 74.3 3317166 19007.
## 3 New Zealand Oceania 1992 76.3 3437674 18363.
## 4 New Zealand Oceania 1997 77.6 3676187 21050.
## 5 New Zealand Oceania 2002 79.1 3908037 23190.
## 6 New Zealand Oceania 2007 80.2 4115771 25185.
ggplot(out2, aes(x=year, y=lifeExp, color=country)) +
geom_line(size=1.2) +
ggtitle("Average life expectancy in Oceania over time")
one <- gapminder %>%
group_by(year, continent) %>%
slice(which.max(lifeExp)) %>%
arrange(continent)
one
## # A tibble: 60 x 6
## # Groups: year, continent [60]
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Reunion Africa 1952 52.7 257700 2719.
## 2 Mauritius Africa 1957 58.1 609816 2034.
## 3 Mauritius Africa 1962 60.2 701016 2529.
## 4 Mauritius Africa 1967 61.6 789309 2475.
## 5 Reunion Africa 1972 64.3 461633 5048.
## 6 Reunion Africa 1977 67.1 492095 4320.
## 7 Reunion Africa 1982 69.9 517810 5267.
## 8 Reunion Africa 1987 71.9 562035 5303.
## 9 Reunion Africa 1992 73.6 622191 6101.
## 10 Reunion Africa 1997 74.8 684810 6072.
## # ... with 50 more rows
one %>%
group_by(continent) %>%
count(country)
## # A tibble: 11 x 3
## # Groups: continent [5]
## continent country n
## <fct> <fct> <int>
## 1 Africa Mauritius 3
## 2 Africa Reunion 9
## 3 Americas Canada 12
## 4 Asia Israel 3
## 5 Asia Japan 9
## 6 Europe Iceland 6
## 7 Europe Norway 1
## 8 Europe Sweden 3
## 9 Europe Switzerland 2
## 10 Oceania Australia 9
## 11 Oceania New Zealand 3
The above tells us that within Europe, for instance, Iceland maintained the highest life expectancy for 6 out of the 12 years in the data set.
Note - The previous two code chunks can be combined, as shown below, but this might be too unwieldy to keep track of what’s being tabulated
gapminder %>%
group_by(year, continent) %>%
slice(which.max(lifeExp)) %>%
arrange(continent) %>%
group_by(continent) %>%
count(country)
gapminder_2007 <- gapminder %>%
filter(year == 2007)
ggplot(gapminder_2007, aes(x =gdpPercap, y = pop, color=continent)) +
geom_point() +
scale_y_continuous(labels = comma) +
ggtitle("Population and GDP per capita in 2007")
# Use filter() to exclude China, India and the United States from the 2007
# data, then redraw the population vs. GDP per capita scatter plot
gapminder_2007_a <- gapminder_2007 %>%
  filter(!(country %in% c("China", "India", "United States")))
ggplot(gapminder_2007_a, aes(x = gdpPercap, y = pop, color = continent)) +
  geom_point() +
  scale_y_continuous(labels = comma) +
  ggtitle("Population and GDP per capita in 2007")
ggplot(gapminder_2007, aes(x = pop, y = gdpPercap, color = continent, size = pop)) +
geom_point(alpha = 0.5) +
scale_x_log10() +
scale_y_log10() +
labs(title = "Population and GDP per capita in 2007",
x = "Population (log scale)",
y = "GDP per capita (log scale)")
ggplot(gapminder_2007, aes(x = gdpPercap, y = lifeExp, color = continent, size = pop)) +
geom_point(alpha = 0.5) +
scale_x_log10() +
labs(title = "Per capita GDP and life expectancy, 2007",
x = "GDP per capita (log scale)",
y = "Life expectancy")
ggplot(gapminder_2007, aes(x = pop, y = lifeExp, color = continent, size = pop)) +
geom_point() +
scale_x_log10() +
labs(title = "Population and life expectancy, 2007",
x = "Population (log scale)",
y = "Life expectancy")
ggplot(gapminder_2007, aes(x=pop, y=lifeExp)) +
geom_point() +
scale_x_log10() +
facet_wrap(~continent)
by_year <- gapminder %>%
group_by(year) %>%
summarise(medianLifeExp = median(lifeExp))
ggplot(by_year, aes(x = year, y = medianLifeExp)) +
geom_point() +
expand_limits(y=0) +
ggtitle("Global median life expectancy over time")
by_year_continent <- gapminder %>%
group_by(continent, year) %>%
summarise(medianGdpPercap = median(gdpPercap))
by_year_continent
## # A tibble: 60 x 3
## # Groups: continent [5]
## continent year medianGdpPercap
## <fct> <int> <dbl>
## 1 Africa 1952 987.
## 2 Africa 1957 1024.
## 3 Africa 1962 1134.
## 4 Africa 1967 1210.
## 5 Africa 1972 1443.
## 6 Africa 1977 1400.
## 7 Africa 1982 1324.
## 8 Africa 1987 1220.
## 9 Africa 1992 1162.
## 10 Africa 1997 1180.
## # ... with 50 more rows
ggplot(by_year_continent, aes(x = year, y = medianGdpPercap, color = continent)) +
geom_point() +
expand_limits(y=0) +
ggtitle("Median per capita GDP by continent over time")
# Population change between 1952 and 2007 for each country, in absolute and percentage terms
pop_change <- gapminder %>%
# select variables of interest
select(country, year, pop) %>%
# only keep the years 1952 and 2007 for each country
filter(year==1952|year==2007) %>%
# reshape to wide, creates two cols `1952` and `2007` and stores pop values in the created year cols
pivot_wider(names_from=year, values_from=pop) %>%
# calculate the absolute (`diffPop`) and percentage (`diffPop_pct`) population difference
mutate(diffPop = `2007`-`1952`,
diffPop_pct = ((`2007`-`1952`)/`1952`)*100) %>% #Notice the backticks in the col names
# sort data by `diffPop_pct` in ascending order
arrange(diffPop_pct)
# Percent change between 1952 and 2007 in life expectancy, population and per capita GDP for each country
pct_change <- gapminder %>%
# select variables of interest
select(country, year, lifeExp, pop, gdpPercap) %>%
# only keep the years 1952 and 2007 for each country
filter(year==1952|year==2007) %>%
# reshape to wide, creates a `<variable>_1952` and a `<variable>_2007` col for each of lifeExp, pop and gdpPercap
pivot_wider(names_from=year, values_from=c(lifeExp, pop, gdpPercap)) %>%
# calculate the percent change from 1952 to 2007 for each variable
mutate(lifeExp = ((lifeExp_2007 - lifeExp_1952)/lifeExp_1952)*100,
pop = ((pop_2007 - pop_1952)/pop_1952)*100,
gdpPercap = ((gdpPercap_2007 - gdpPercap_1952)/gdpPercap_1952)*100) %>%
# remove the interim variables created (the six `_1952`/`_2007` cols), leaving country and the three percent changes
select(-(lifeExp_1952:gdpPercap_2007))
library(ggcorrplot) # to plot a correlation matrix
corr <- round(cor(pct_change[,2:4]),2)
corr
## lifeExp pop gdpPercap
## lifeExp 1.00 0.39 -0.01
## pop 0.39 1.00 -0.20
## gdpPercap -0.01 -0.20 1.00
p.mat <- cor_pmat(pct_change[,2:4])
p.mat
## lifeExp pop gdpPercap
## lifeExp 0.000000e+00 1.952492e-06 0.8953042
## pop 1.952492e-06 0.000000e+00 0.0152992
## gdpPercap 8.953042e-01 1.529920e-02 0.0000000
ggcorrplot(corr,
p.mat = p.mat,
type = "lower",
lab = TRUE, #adds correlation coefficient to the plot
sig.level = 0.05,
insig = "blank", #insignificant coefficients are displayed as blank
title = "Correlation matrix of percent changes in \n population, life expectancy and per capita GDP ",
outline.col = "white",
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
# corstarsl() function which is adapted from the one posted on this forum
#
# Build a lower-triangular correlation table with significance stars.
#
# x: a numeric matrix or data frame; each column is one variable.
#
# Returns a data frame of character strings with one row per variable and one
# column per variable except the last. Cells below the diagonal hold the
# correlation rounded to two decimals followed by "***" (p < .001),
# "**" (p < .01) or "*" (p < .05); all other cells are empty strings.
corstarsl <- function(x) {
  # Call rcorr() through the Hmisc namespace instead of require(Hmisc):
  # attaching Hmisc masks dplyr::summarize() for the rest of the session, and
  # require() only returns FALSE (no error) when the package is missing.
  if (!requireNamespace("Hmisc", quietly = TRUE)) {
    stop("corstarsl() needs the 'Hmisc' package to be installed", call. = FALSE)
  }
  x <- as.matrix(x)
  # compute the correlations and p-values once and reuse both pieces
  fit <- Hmisc::rcorr(x)
  R <- fit$r
  p <- fit$P
  ## define notions for significance levels; spacing is important.
  ## NOTE(review): the padding inside these literals may have been collapsed
  ## when this page was extracted -- confirm against the original source.
  mystars <- ifelse(p < .001, "***", ifelse(p < .01, "** ", ifelse(p < .05, "* ", " ")))
  ## truncate the matrix that holds the correlations to two decimals; the
  ## -1.11 helper column is formatted together with R and then dropped again
  R <- format(round(cbind(rep(-1.11, ncol(x)), R), 2))[, -1]
  ## build a new matrix that includes the correlations with their appropriate stars
  Rnew <- matrix(paste0(R, mystars), ncol = ncol(x))
  diag(Rnew) <- paste0(diag(R), " ")
  rownames(Rnew) <- colnames(x)
  # assign the names directly: pasting "" onto a NULL colnames(x) would yield a
  # length-one name vector and make the assignment fail for unnamed input
  colnames(Rnew) <- colnames(x)
  ## remove upper triangle (and the diagonal)
  Rnew[upper.tri(Rnew, diag = TRUE)] <- ""
  Rnew <- as.data.frame(Rnew, stringsAsFactors = FALSE)
  ## remove last column and return the matrix (which is now a data frame).
  ## seq_len() replaces `1:length(Rnew)-1`, which parses as
  ## (1:length(Rnew)) - 1 and only worked because index 0 is silently ignored.
  Rnew[seq_len(ncol(Rnew) - 1)]
}
gapminder_1952 <- gapminder %>% filter(year == 1952)
corstarsl(gapminder_1952[,4:6])
## lifeExp pop
## lifeExp
## pop 0.00
## gdpPercap 0.28*** -0.03
corstarsl(gapminder_2007[,4:6])
## lifeExp pop
## lifeExp
## pop 0.05
## gdpPercap 0.68*** -0.06