Data Visualization with ggplot2

Author

Eric Mulaa

Question 1

(A) Explore the gapminder dataset using glimpse() and ?gapminder to learn about the variables. You may need to install ggthemes as well.

Load the necessary packages (install any that are missing first with install.packages())

library(gapminder)
library(ggthemes)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(skimr)
library(ggplot2)
# Make the gapminder data frame available and preview its structure
data("gapminder")
gapminder |>
  glimpse()
Rows: 1,704
Columns: 6
$ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …

Quantitative variables

  • year
  • lifeExp
  • pop
  • gdpPercap

Categorical variables

  • country
  • continent
Variable Description Type
country Name of the country (142 total levels) Categorical (factor)
continent Continent where the country is located (5 levels: Africa, Americas, Asia, Europe, Oceania) Categorical (factor)
year Year of observation (from 1952 to 2007, in 5-year increments) Quantitative (integer)
lifeExp Life expectancy at birth, measured in years Quantitative (continuous, double)
pop Total population of the country Quantitative (integer)
gdpPercap GDP per capita (US$, inflation-adjusted) Quantitative (continuous, double)

(B) Use skim() to further explore the data set and any missing data patterns. How many missing values are there for this data set?

# Per-variable summary of gapminder, including the n_missing column
gapminder |>
  skim()
Data summary
Name gapminder
Number of rows 1704
Number of columns 6
_______________________
Column type frequency:
factor 2
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
country 0 1 FALSE 142 Afg: 12, Alb: 12, Alg: 12, Ang: 12
continent 0 1 FALSE 5 Afr: 624, Asi: 396, Eur: 360, Ame: 300

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 1979.50 17.27 1952.00 1965.75 1979.50 1993.25 2007.0 ▇▅▅▅▇
lifeExp 0 1 59.47 12.92 23.60 48.20 60.71 70.85 82.6 ▁▆▇▇▇
pop 0 1 29601212.32 106157896.74 60011.00 2793664.00 7023595.50 19585221.75 1318683096.0 ▇▁▁▁▁
gdpPercap 0 1 7215.33 9857.45 241.17 1202.06 3531.85 9325.46 113523.1 ▇▁▁▁▁

There are no missing values in this data set: n_missing is 0 and complete_rate is 1 for all six variables.

(C) Using ggplot2, create a scatterplot showing life expectancy across time, adding descriptive labels of the axes and overall plot. What trend do you notice?

# Scatterplot of life expectancy against year, one point per row of gapminder
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  labs(
    x = "Year",
    y = "Life Expectancy (years)",
    title = "Life Expectancy Across Time"
  )

Life expectancy generally increased over the years between 1952 and 2007.

(D) Recreate the plot of life expectancy across time, this time adding an additional smooth curve through the data. Suppress the standard error bands around the smooth curves.

# Same scatterplot with a smooth trend curve drawn over the points;
# se = FALSE hides the standard-error band around the curve.
ggplot(data = gapminder, mapping = aes(x = year, y = lifeExp)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(
    x = "Year",
    y = "Life Expectancy (years)",
    title = "Life Expectancy Across Time"
  )
`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

(E) Color the points based on which continent the points are representing, making sure the lines are colored by continent as well. Which continent / region has the highest life expectancy on average?

# Mapping color in the top-level aes() applies it to both the points and the
# smooth curves, so each continent gets its own colored curve.
ggplot(
  data = gapminder,
  mapping = aes(x = year, y = lifeExp, color = continent)
) +
  geom_point() +
  geom_smooth(se = FALSE) +
  labs(
    x = "Year",
    y = "Life Expectancy (years)",
    title = "Life Expectancy Across Time"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Oceania has the highest life expectancy on average

(F) Extend the plot from the previous part by faceting by the continent associated with each point so that each continent has its own column.

# Facet the colored scatterplot by continent.
# nrow = 1 forces a single row of panels so that every continent sits in its
# own column; the facet_wrap() default would wrap the five panels into a grid.
gapminder |>
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~continent, nrow = 1) +
  labs(
    title = "Life Expectancy Across Time",
    x = "Year",
    y = "Life Expectancy (years)"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

(G) Modify the colors for each continent to be color-blind friendly, and separately customize the overall theme of the plot by specifying a complete theme of your choice.

# Faceted plot with a color-blind friendly discrete palette (viridis) and the
# complete theme_bw() theme.
# nrow = 1 keeps one column per continent, as required for the faceted plot.
gapminder |>
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~continent, nrow = 1) +
  labs(
    title = "Life Expectancy Across Time",
    x = "Year",
    y = "Life Expectancy (years)"
  ) +
  scale_color_viridis_d() +
  theme_bw()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

(H) Rotate the labels on the x-axis 45 degrees by adding a theme() layer with the appropriate option.

# Faceted plot with the x-axis tick labels rotated 45 degrees.
# nrow = 1 keeps one column per continent; the narrow panels are what make
# the rotated year labels necessary.
gapminder |>
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~continent, nrow = 1) +
  labs(
    title = "Life Expectancy Across Time",
    x = "Year",
    y = "Life Expectancy (years)"
  ) +
  scale_color_viridis_d() +
  theme_bw() +
  # theme() must come after theme_bw(), which would otherwise reset it
  theme(
    # hjust = 1 right-aligns the rotated labels against their tick marks
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

(I) Specify an argument in the theme() function to suppress the legend.

# Final faceted plot: rotated x-axis labels and no legend (the facet strips
# already name each continent).
# nrow = 1 keeps one column per continent.
gapminder |>
  ggplot(aes(x = year, y = lifeExp, color = continent)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  facet_wrap(~continent, nrow = 1) +
  labs(
    title = "Life Expectancy Across Time",
    x = "Year",
    y = "Life Expectancy (years)"
  ) +
  scale_color_viridis_d() +
  theme_bw() +
  # theme() must come after theme_bw(), which would otherwise reset it
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

(J) Which country (not continent) had the lowest life expectancy in any given year, and which year was this in?

# Per-year minimum: group the rows by survey year, keep the row(s) with the
# smallest life expectancy in each group, then drop the grouping.
# The overall minimum is the smallest lifeExp value in the resulting table.
gapminder |>
  group_by(year) |>
  slice_min(order_by = lifeExp, n = 1) |>
  ungroup()
# A tibble: 12 × 6
   country      continent  year lifeExp      pop gdpPercap
   <fct>        <fct>     <int>   <dbl>    <int>     <dbl>
 1 Afghanistan  Asia       1952    28.8  8425333      779.
 2 Afghanistan  Asia       1957    30.3  9240934      821.
 3 Afghanistan  Asia       1962    32.0 10267083      853.
 4 Afghanistan  Asia       1967    34.0 11537966      836.
 5 Sierra Leone Africa     1972    35.4  2879013     1354.
 6 Cambodia     Asia       1977    31.2  6978607      525.
 7 Sierra Leone Africa     1982    38.4  3464522     1465.
 8 Angola       Africa     1987    39.9  7874230     2430.
 9 Rwanda       Africa     1992    23.6  7290203      737.
10 Rwanda       Africa     1997    36.1  7212583      590.
11 Zambia       Africa     2002    39.2 10595811     1072.
12 Swaziland    Africa     2007    39.6  1133066     4513.

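Country: Rwanda Year: 1992 Life Expectancy: 23.599 (the smallest value in the table above; it matches the overall minimum, p0 = 23.60, reported by skim())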

Question 2

# Build the plotting data: 2007 observations only, restricted to the
# 20 rows with the largest population
gapminder2007 <- gapminder |>
  dplyr::filter(year == 2007) |>
  dplyr::slice_max(order_by = pop, n = 20)

(A) First, create a bar plot displaying the population of each country using the gapminder2007 dataset. Note that the country names will be overlapping, but we will fix this later.

# Bar plot of population by country; geom_col() uses pop as the bar height
ggplot(data = gapminder2007, mapping = aes(x = country, y = pop)) +
  geom_col()

(B) In a new code chunk, modify the plot in the previous part so that the bars are sorted based on height to match the plot provided

# Order the country axis by population so the bars are sorted by height
ggplot(
  data = gapminder2007,
  mapping = aes(x = reorder(country, pop), y = pop)
) +
  geom_col()

(C) In another new code chunk, modify the plot so that the color inside of the bars displays which continent each bar represents as well, and change the outline of all bars in the plot to be black.

# Fill each bar by continent (mapped aesthetic) and outline every bar in
# black (fixed aesthetic set outside aes())
ggplot(
  data = gapminder2007,
  mapping = aes(x = reorder(country, pop), y = pop, fill = continent)
) +
  geom_col(color = "black")

(D) Make the barchart a horizontal bar chart rather than a vertical one to fix the issue of the country names overlapping.

# Flip the coordinates so the bars run horizontally and the country names
# no longer overlap
ggplot(
  data = gapminder2007,
  mapping = aes(x = reorder(country, pop), y = pop, fill = continent)
) +
  geom_col(color = "black") +
  coord_flip()

(E) Move the legend below the plot (to the “bottom”).

# Horizontal bar chart with the legend placed underneath the plot
ggplot(
  data = gapminder2007,
  mapping = aes(x = reorder(country, pop), y = pop, fill = continent)
) +
  geom_col(color = "black") +
  coord_flip() +
  theme(legend.position = "bottom")

(F) Add descriptive labels for the axes, title, and a caption below the plot.

# Horizontal bar chart with descriptive axis labels, a title, and a caption.
# The x/y labels follow the aesthetics, not the flipped display: x (country)
# is drawn on the vertical axis after coord_flip().
gapminder2007 |>
  ggplot(aes(x = reorder(country, pop), y = pop, fill = continent)) +
  geom_col(color = "black") +
  coord_flip() +
  labs(
    title = "World's Most Populated Countries, 2007",
    x = "Country",
    y = "Population",
    # caption is rendered below the plot panel
    caption = "Source: Gapminder (gapminder R package)"
  ) +
  theme(legend.position = "bottom")

(G) Remove the legend title.

# Labelled horizontal bar chart with the legend title removed.
gapminder2007 |>
  ggplot(aes(x = reorder(country, pop), y = pop, fill = continent)) +
  geom_col(color = "black") +
  coord_flip() +
  labs(
    title = "World's Most Populated Countries, 2007",
    x = "Country",
    y = "Population",
    # caption carried over from the labelling step; rendered below the plot
    caption = "Source: Gapminder (gapminder R package)",
    # a NULL label for the fill aesthetic drops the legend title
    fill = NULL
  ) +
  theme(legend.position = "bottom")

(H) Use color-blind friendly colors by using the colors below:

c("#D55E00", "#009E73", "#56B4E9", "#CC79A7")

# Labelled horizontal bar chart using the supplied color-blind friendly
# fill colors.
gapminder2007 |>
  ggplot(aes(x = reorder(country, pop), y = pop, fill = continent)) +
  geom_col(color = "black") +
  coord_flip() +
  labs(
    title = "World's Most Populated Countries, 2007",
    x = "Country",
    y = "Population",
    # caption carried over from the labelling step; rendered below the plot
    caption = "Source: Gapminder (gapminder R package)",
    # a NULL label for the fill aesthetic drops the legend title
    fill = NULL
  ) +
  # unnamed values are assigned to the continents in scale order
  scale_fill_manual(
    values = c("#D55E00", "#009E73", "#56B4E9", "#CC79A7")
  ) +
  theme(legend.position = "bottom")

(I) Display commas in the population numbers rather than scientific notation by using a scale_y_continuous(labels = scales::comma) layer.

# Labelled horizontal bar chart with population tick labels written with
# thousands separators instead of scientific notation.
gapminder2007 |>
  ggplot(aes(x = reorder(country, pop), y = pop, fill = continent)) +
  geom_col(color = "black") +
  # the scale is attached to the y aesthetic (pop), which coord_flip()
  # displays along the horizontal axis
  scale_y_continuous(labels = scales::comma) +
  coord_flip() +
  labs(
    title = "World's Most Populated Countries, 2007",
    x = "Country",
    y = "Population",
    # caption carried over from the labelling step; rendered below the plot
    caption = "Source: Gapminder (gapminder R package)",
    # a NULL label for the fill aesthetic drops the legend title
    fill = NULL
  ) +
  scale_fill_manual(
    values = c("#D55E00", "#009E73", "#56B4E9", "#CC79A7")
  ) +
  theme(legend.position = "bottom")

(J) Remove the excess space between the bars and the axis.

# Labelled horizontal bar chart with no padding between the bars and the
# country axis.
gapminder2007 |>
  ggplot(aes(x = reorder(country, pop), y = pop, fill = continent)) +
  geom_col(color = "black") +
  coord_flip() +
  labs(
    title = "World's Most Populated Countries, 2007",
    x = "Country",
    y = "Population",
    # caption carried over from the labelling step; rendered below the plot
    caption = "Source: Gapminder (gapminder R package)",
    # a NULL label for the fill aesthetic drops the legend title
    fill = NULL
  ) +
  scale_fill_manual(
    values = c("#D55E00", "#009E73", "#56B4E9", "#CC79A7")
  ) +
  # expand = c(0, 0) removes the default padding at both ends of the
  # population scale, so the bars start directly at the axis
  scale_y_continuous(labels = scales::comma, expand = c(0, 0)) +
  theme(legend.position = "bottom")

(K) Modify the previous plot by specifying a theme from the ggthemes package

# Final bar chart styled with theme_igray() from the ggthemes package.
gapminder2007 |>
  ggplot(aes(x = reorder(country, pop), y = pop, fill = continent)) +
  geom_col(color = "black") +
  coord_flip() +
  labs(
    title = "World's Most Populated Countries, 2007",
    x = "Country",
    y = "Population",
    # caption carried over from the labelling step; rendered below the plot
    caption = "Source: Gapminder (gapminder R package)",
    # a NULL label for the fill aesthetic drops the legend title
    fill = NULL
  ) +
  scale_fill_manual(
    values = c("#D55E00", "#009E73", "#56B4E9", "#CC79A7")
  ) +
  # expand = c(0, 0) removes the default padding at both ends of the
  # population scale, so the bars start directly at the axis
  scale_y_continuous(labels = scales::comma, expand = c(0, 0)) +
  theme_igray() +
  # the legend tweak must follow the complete theme, which would otherwise
  # reset legend.position
  theme(legend.position = "bottom")