Visualisations with Gapminder using {ggplot} and {plotly}
Gapminder data recap
The gapminder package provides the gapminder dataset, which records life expectancy, population and GDP per capita by country and year.
The data has 1704 rows and 6 columns:
- country
- continent
- year
- lifeExp - life expectancy at birth
- pop - total population
- gdpPercap - per-capita GDP (Gross domestic product given in units of international dollars)
## print the first 6 rows of the data with head() (6 is the default; use head(gapminder, 10) for 10 rows)
head(gapminder)## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## print a summary of the data with summary()
summary(gapminder)## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
## print the structure of the data with str()
str(gapminder)## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num [1:1704] 28.8 30.3 32 34 36.1 ...
## $ pop : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num [1:1704] 779 821 853 836 740 ...
Introducing ggplot
Bar Charts
There are two types of bar charts:
- geom_bar()
- geom_col()
geom_bar() makes the height of each bar proportional to the number of cases in each group (..count..), or to the sum of the weights if the weight aesthetic is supplied.
geom_col() instead has the heights of the bars representing the values in the data.
## plot a geom_col and a geom_bar to see the differences
## geom_bar() tallies the rows in each continent itself, so only x is mapped
ggp_bar <- gapminder %>%
  ggplot(mapping = aes(x = continent)) +
  geom_bar()
## what is 'count' in this graph?
ggp_bar## geom_col
## keep only the Oceania rows for 2007 and pipe the filtered data straight
## into ggplot(); the wrangling can sit in the same pipeline as the plot.
ggp_col <- gapminder %>%
  filter(continent == "Oceania" & year == 2007) %>%
  ggplot(mapping = aes(x = country, y = pop)) +
  geom_col()
## here, y axis is the value for the variable you pass in the aesthetic.
ggp_col## Now add in more of the layers to one of the plots above
## same Oceania 2007 bars, now with a fill colour, custom breaks, labels and a theme
ggp_col2 <- gapminder %>%
  filter(continent == "Oceania" & year == 2007) %>%
  ggplot(mapping = aes(x = country, y = pop)) +
  geom_col(mapping = aes(fill = country)) + # a legend for fill is added automatically
  scale_y_continuous(breaks = seq(from = 0, to = 21000000, by = 2000000)) + # try changing the breaks
  labs(
    title = "Population of Oceania by country in 2007",
    x = "Country",
    y = "Population",
    fill = "Country"
  ) +
  theme_bw()
ggp_col2
Scatter plots using geom_point()
## plot a scatter plot with geom_point
## work through this a bit at a time to understand what each section does
## life expectancy against GDP per capita, one panel per continent
ggp_scatter <- ggplot(
  data = gapminder, # data
  mapping = aes(x = gdpPercap, y = lifeExp, colour = continent) # aesthetics
) +
  geom_point(alpha = 0.5) + # geometries
  # method = NULL lets geom_smooth() pick the smoother; se = TRUE draws the
  # confidence band. TRUE is spelled out because T can be reassigned.
  geom_smooth(method = NULL, se = TRUE) +
  facet_wrap(~continent, scales = "free") + # facets
  labs(
    title = "Life expectancy at birth vs. Per capita GDP",
    x = "gdpPercap",
    y = "lifeExp",
    colour = "Continent"
  ) + # labels
  theme_light() + # themes
  theme(
    axis.text.x = element_text(angle = 45),
    axis.text = element_text(size = 7),
    legend.position = "bottom"
  ) +
  NULL # good to include a NULL because of the + convention.
ggp_scatter## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## repeat these plots using different variables
Histograms using geom_histogram()
Histograms are useful to show distributions.
## plot a histogram showing distribution of life expectancy
## histogram of life expectancy on the density scale, with one density curve
## per continent layered on top
ggp_hist <- ggplot(data = gapminder, mapping = aes(x = lifeExp)) +
  # after_stat(density) maps y to the density computed by the histogram stat,
  # so the bars share a scale with geom_density(); it replaces the deprecated
  # ..density.. notation.
  geom_histogram(
    aes(y = after_stat(density)),
    binwidth = 2, # change binwidth or substitute out for bins =
    colour = "black",
    fill = "lightblue"
  ) +
  geom_density(aes(colour = continent, fill = continent), alpha = 0.1) +
  labs(title = "Life Expectancy", x = "Life Expectancy", y = "Density") +
  theme_light() +
  NULL
## instead of having different density plots overlaying the same graph, try facet_wrap().
ggp_hist# ggp_hist2 <- ggplot() +
# geom_histogram() +
# geom_density() +
# NULL
#ggp_hist2
## make another histogram with different variables
# ggp_hist3 <- ggplot() +
# geom_histogram() +
# NULL
#ggp_hist3
Box plots using geom_boxplot()
Boxplots are useful for visualising summary statistics and identifying outliers in the data.
## plot a boxplot
## horizontal boxplot of life expectancy per continent with the raw points on top
ggp_box <- ggplot(
  data = gapminder,
  # useful reordering function supplied to the x aesthetic. Remove it and see the effects.
  mapping = aes(x = fct_reorder(continent, lifeExp), y = lifeExp, fill = continent)
) +
  # outliers are hidden because the data points are layered on separately; try
  # commenting out geom_jitter() and removing outlier.color = NA to see how
  # outliers are presented by default.
  geom_boxplot(outlier.color = NA) +
  geom_jitter(alpha = 0.2) + # geom_jitter is a variation on geom_point(). Swap out for geom_point() to see the effects.
  scale_y_continuous(
    breaks = round(seq(min(gapminder$lifeExp), max(gapminder$lifeExp), by = 10), 0)
  ) + # seq() is for sequence, change the by = 10 to a different value to see the effects.
  coord_flip() + # comment coord_flip() out to see the effects.
  # labs() names aesthetics, not screen positions: x is still continent and y is
  # still lifeExp even though coord_flip() draws x on the vertical axis.
  labs(title = "Boxplot of life expectancy by continent", x = "Continent", y = "Life Expectancy") +
  theme_light() +
  theme(legend.position = "none") + # removed the legend as not visually necessary in this plot.
  NULL
ggp_box## wrangle gapminder data to find top 5 countries in the year 2007 with the highest life expectancy and create a new boxplot.
## exercise skeleton: fill in the empty filter() and arrange() calls below
top5 <- gapminder %>%
filter() %>% # filter for the year 2007
arrange() %>% # arrange by life expectancy in descending order (see desc()) so the highest values come first
head(5)
Additional features and visualisations
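Add additional columns that calculate the percentage change in life expectancy, population and GDP per capita between consecutive observations for each country using lag(). Note that the data is recorded every 5 years, so each change covers a 5-year step rather than a single year.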
## wrangle some features
## calculate the percentage change from the previous observation per country
df_rate_change <- gapminder %>%
  janitor::clean_names() %>% # good package and function to make column names more coding friendly.
  group_by(country) %>% # lag() below must look back within one country only
  arrange(year) %>%
  mutate(
    # (current / previous - 1) * 100 gives the change in %, rounded to 1 decimal place.
    # The first observation of each country has no previous value, so it is NA.
    pc_lifeexp_change = round(((life_exp / lag(life_exp)) - 1) * 100, 1),
    pc_pop_change = round(((pop / lag(pop)) - 1) * 100, 1),
    pc_gdp_cap_change = round(((gdp_percap / lag(gdp_percap)) - 1) * 100, 1)
  ) %>%
  ungroup() %>% # drop the country grouping so later steps start from an ungrouped table
  arrange(country)
## add in any other additional features you would like to visualise with mutate()
head(df_rate_change)## # A tibble: 6 × 9
## # Groups: country [1]
## country continent year life_exp pop gdp_pe…¹ pc_li…² pc_po…³ pc_gd…⁴
## <fct> <fct> <int> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779. NA NA NA
## 2 Afghanistan Asia 1957 30.3 9240934 821. 5.3 9.7 5.3
## 3 Afghanistan Asia 1962 32.0 10267083 853. 5.5 11.1 3.9
## 4 Afghanistan Asia 1967 34.0 11537966 836. 6.3 12.4 -2
## 5 Afghanistan Asia 1972 36.1 13079460 740. 6.1 13.4 -11.5
## 6 Afghanistan Asia 1977 38.4 14880372 786. 6.5 13.8 6.2
## # … with abbreviated variable names ¹gdp_percap, ²pc_lifeexp_change,
## # ³pc_pop_change, ⁴pc_gdp_cap_change
Here we will wrangle the dataframe we have just created (df_rate_change) to find the percent change in population for the country with the highest life expectancy and the country with the lowest life expectancy per continent using 2007 as an example year to filter on.
## choose the countries to plot: for each continent, the country with the lowest and
## the country with the highest life expectancy in 2007 (5 continents, so 5 + 5 = 10 countries)
## NOTE: despite the "_pop_" in the variable names below, both selections rank by life_exp, not pop.
## country with the lowest life expectancy in 2007 per continent
low_pop_2007 <- df_rate_change %>%
filter(year == 2007) %>%
group_by(continent) %>%
arrange(life_exp) %>%
slice(1) %>% # slice(1) takes the first row per group, which after arrange(life_exp) is the lowest life expectancy in each continent - the previous steps must be correct for this to hold. head() would only take the 1st row of the whole dataframe.
ungroup() %>%
pull(country) # pull() returns the country column as a vector (a factor here, not a list); use class(low_pop_2007) to see the structure of the outcome.
## country with the highest life expectancy in 2007 per continent
high_pop_2007 <- df_rate_change %>%
filter(year == 2007) %>%
group_by(continent) %>%
arrange(desc(life_exp)) %>% # nesting desc() within arrange() will arrange in descending order.
slice(1) %>%
ungroup() %>%
pull(country)
countries_to_plot <- c(as.character(low_pop_2007), as.character(high_pop_2007)) # have to change to character type from factors. See what happens if you remove the as.character() calls.
countries_to_plot## [1] "Swaziland" "Haiti" "Afghanistan" "Turkey" "New Zealand"
## [6] "Reunion" "Canada" "Japan" "Iceland" "Australia"
## percent change in GDP per capita (dashed) and population (solid) for the
## selected countries, one row of panels per continent
ggp_pc_rate <- ggplot(
  data = df_rate_change %>%
    filter(country %in% countries_to_plot),
  mapping = aes(x = year, colour = country)
) +
  geom_point(aes(y = pc_gdp_cap_change)) +
  geom_line(aes(y = pc_gdp_cap_change), linetype = "dashed") + # difficult to add in the legend of linetypes when layering graphs like this.
  geom_point(aes(y = pc_pop_change)) +
  geom_line(aes(y = pc_pop_change)) +
  facet_grid(rows = vars(continent)) +
  scale_x_continuous(breaks = seq(min(df_rate_change$year), max(df_rate_change$year), by = 5)) +
  scale_y_continuous(breaks = seq(-50, 50, by = 25)) +
  # zoom to the window of interest; ylim is written out in full rather than
  # relying on partial matching of `y`.
  coord_cartesian(xlim = c(1957, 2007), ylim = c(-50, 50)) +
  labs(
    title = "Change in GDP per capita and population over time",
    x = "Year",
    y = "Percent change (%)"
  ) +
  theme_light() +
  NULL
ggp_pc_rate## ggplot loves long data. Let's wrangle the data to be in long format.
## keep the identifying columns plus the two change measures, then stack the
## measures so every row holds one (change type, change value) pair
df_rate_change_pivot <- df_rate_change %>%
  filter(country %in% countries_to_plot) %>%
  select(country, continent, year, pc_pop_change, pc_gdp_cap_change) %>%
  pivot_longer(
    cols = c(pc_pop_change, pc_gdp_cap_change),
    names_to = "pc_change_type",
    values_to = "pc_change"
  )
## now make the graph: with long data the change type can be mapped to linetype,
## so a single geom_line() draws both measures and the legend comes for free
ggp_pc_rate_pivot <- ggplot(
  data = df_rate_change_pivot,
  mapping = aes(x = year, y = pc_change, colour = country)
) +
  geom_point() +
  geom_line(aes(linetype = pc_change_type)) +
  facet_grid(rows = vars(continent)) +
  scale_x_continuous(breaks = seq(min(df_rate_change$year), max(df_rate_change$year), by = 5)) +
  scale_y_continuous(breaks = seq(-50, 50, by = 25)) +
  # zoom to the window of interest; ylim is written out in full rather than
  # relying on partial matching of `y`.
  coord_cartesian(xlim = c(1957, 2007), ylim = c(-50, 50)) +
  labs(
    title = "Change in GDP per capita and population over time",
    x = "Year",
    y = "Percent change (%)",
    colour = "Country",
    linetype = "% Change Variable"
  ) +
  theme_light() +
  NULL
ggp_pc_rate_pivot
Interactive graphs using {plotly}
A ggplot object can be passed in to ggplotly().
## build the static ggplot first; the text aesthetic assembled with paste0() is
## what ggplotly() is asked to show as the tooltip (tooltip = "text")
ggp_for_plotly <- df_rate_change %>%
  filter(country %in% countries_to_plot) %>%
  ggplot(mapping = aes(x = year, y = life_exp, colour = country)) +
  geom_point(
    mapping = aes(
      text = paste0(
        "Country: ", country,
        "<br>Year: ", year,
        "<br>Life Expectancy: ", life_exp,
        "<br>Life Expectancy (% change): ", pc_lifeexp_change
      )
    )
  ) +
  geom_line() +
  scale_x_continuous(
    breaks = seq(min(df_rate_change$year), max(df_rate_change$year), by = 5)
  ) +
  labs(
    title = "Change in life expectancy over time",
    x = "Year",
    y = "Life expectancy (years)",
    colour = "Country"
  ) +
  theme_light() +
  NULL

## convert the ggplot object into an interactive plotly graph and display it
interactive_graph <- plotly::ggplotly(ggp_for_plotly, tooltip = "text")
interactive_graph