What are we going to do here?

Load the tidyverse and gapminder packages.

Note: If you haven’t installed these packages already, do that first. Run install.packages("gapminder") and install.packages("tidyverse").

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(gapminder)

Inspect the gapminder dataset.

# head() prints the first six rows of the dataset.
head(gapminder)
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
# tail() prints the last six rows of the dataset.
tail(gapminder)
## # A tibble: 6 × 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.
# glimpse() lists every column with its type and its first few values.
glimpse(gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# summary() reports level counts for the factor columns and the minimum,
# quartiles, mean and maximum for the numeric columns.
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

Create a new dataframe

Create a new dataframe called df1. Assign gapminder to it. (This means we are creating a copy of the gapminder dataset.)

<- is called the assignment operator.

# The following steps work with df1 (and dataframes derived from it)
# instead of using gapminder directly.
df1 <- gapminder

Inspect the new dataframe.

# Prints the same six rows as head(gapminder), because df1 is a copy.
head(df1)
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
# Prints the same six rows as tail(gapminder).
tail(df1)
## # A tibble: 6 × 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.
# df1 has the same 1,704 rows and 6 columns as gapminder.
glimpse(df1)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", …
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, …
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, …
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8…
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12…
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, …
# The summary statistics are identical to those of gapminder.
summary(df1)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 

Create a new variable.

Use the mutate() function to create a new variable equal to log(gdpPercap), the natural logarithm of GDP per capita. Call it log_income_per_capita.

We will also use the pipe operator %>% to link commands. Read the following command as: Take df1 and then … do something …

# Nothing is assigned here, so the result is only printed.
df1 %>%
  mutate(
    log_income_per_capita = log(gdpPercap)
  )
## # A tibble: 1,704 × 7
##    country     continent  year lifeExp      pop gdpPercap log_income_per_capita
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>                 <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.                  6.66
##  2 Afghanistan Asia       1957    30.3  9240934      821.                  6.71
##  3 Afghanistan Asia       1962    32.0 10267083      853.                  6.75
##  4 Afghanistan Asia       1967    34.0 11537966      836.                  6.73
##  5 Afghanistan Asia       1972    36.1 13079460      740.                  6.61
##  6 Afghanistan Asia       1977    38.4 14880372      786.                  6.67
##  7 Afghanistan Asia       1982    39.9 12881816      978.                  6.89
##  8 Afghanistan Asia       1987    40.8 13867957      852.                  6.75
##  9 Afghanistan Asia       1992    41.7 16317921      649.                  6.48
## 10 Afghanistan Asia       1997    41.8 22227415      635.                  6.45
## # ℹ 1,694 more rows
# Look at the last column!

# Keep the result this time: assign it to a new dataframe called df2.

df2 <- df1 %>%
  mutate(
    log_income_per_capita = log(gdpPercap)
  )

# Check that df2 has a seventh column, log_income_per_capita.
head(df2)
## # A tibble: 6 × 7
##   country     continent  year lifeExp      pop gdpPercap log_income_per_capita
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>                 <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.                  6.66
## 2 Afghanistan Asia       1957    30.3  9240934      821.                  6.71
## 3 Afghanistan Asia       1962    32.0 10267083      853.                  6.75
## 4 Afghanistan Asia       1967    34.0 11537966      836.                  6.73
## 5 Afghanistan Asia       1972    36.1 13079460      740.                  6.61
## 6 Afghanistan Asia       1977    38.4 14880372      786.                  6.67

Draw plots using ggplot()

# Scatter plot: income per capita on the X-axis, life expectancy on the Y-axis.
ggplot(df2, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

# Add a title with labs()

ggplot(df2, mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  labs(title = "Life expectancy vs income")

# Give each continent its own color.
# "color = continent" goes inside aes() because the color depends on the data.
# A legend is created automatically.

ggplot(df2, mapping = aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point()

# Change the X-axis variable to log_income_per_capita.
# Here the color is a fixed value ("red"), so it is set outside aes().

ggplot(df2, mapping = aes(x = log_income_per_capita, y = lifeExp)) +
  geom_point(color = "red")

# Add a regression line (a straight line fitted through the data points).
# method = "lm" fits a linear model; se = FALSE hides the confidence band.

ggplot(df2, mapping = aes(x = log_income_per_capita, y = lifeExp)) +
  geom_point(color = "red") +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# Plot a separate regression line for each continent.
# "color = continent" is inside aes() in ggplot(), so it applies to both
# the points and the fitted lines.

ggplot(
  df2,
  mapping = aes(x = log_income_per_capita, y = lifeExp, color = continent)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

Look at data for 3 countries.

Note: You must use quotation marks around strings (in this case, the name of each country).

# Keep only the rows whose country is one of the three listed names.
df2 %>%
  filter(
    country %in% c("Argentina", "Brazil", "India")
  )
## # A tibble: 36 × 7
##    country   continent  year lifeExp      pop gdpPercap log_income_per_capita
##    <fct>     <fct>     <int>   <dbl>    <int>     <dbl>                 <dbl>
##  1 Argentina Americas   1952    62.5 17876956     5911.                  8.68
##  2 Argentina Americas   1957    64.4 19610538     6857.                  8.83
##  3 Argentina Americas   1962    65.1 21283783     7133.                  8.87
##  4 Argentina Americas   1967    65.6 22934225     8053.                  8.99
##  5 Argentina Americas   1972    67.1 24779799     9443.                  9.15
##  6 Argentina Americas   1977    68.5 26983828    10079.                  9.22
##  7 Argentina Americas   1982    69.9 29341374     8998.                  9.10
##  8 Argentina Americas   1987    70.8 31620918     9140.                  9.12
##  9 Argentina Americas   1992    71.9 33958947     9308.                  9.14
## 10 Argentina Americas   1997    73.3 36203463    10967.                  9.30
## # ℹ 26 more rows
# Store the filtered rows in a new dataframe called df3.

df3 <- df2 %>%
  filter(
    country %in% c("Argentina", "Brazil", "India")
  )

# The first rows of df3 belong to Argentina.
head(df3)
## # A tibble: 6 × 7
##   country   continent  year lifeExp      pop gdpPercap log_income_per_capita
##   <fct>     <fct>     <int>   <dbl>    <int>     <dbl>                 <dbl>
## 1 Argentina Americas   1952    62.5 17876956     5911.                  8.68
## 2 Argentina Americas   1957    64.4 19610538     6857.                  8.83
## 3 Argentina Americas   1962    65.1 21283783     7133.                  8.87
## 4 Argentina Americas   1967    65.6 22934225     8053.                  8.99
## 5 Argentina Americas   1972    67.1 24779799     9443.                  9.15
## 6 Argentina Americas   1977    68.5 26983828    10079.                  9.22
# The last rows of df3 belong to India.
tail(df3)
## # A tibble: 6 × 7
##   country continent  year lifeExp        pop gdpPercap log_income_per_capita
##   <fct>   <fct>     <int>   <dbl>      <int>     <dbl>                 <dbl>
## 1 India   Asia       1982    56.6  708000000      856.                  6.75
## 2 India   Asia       1987    58.6  788000000      977.                  6.88
## 3 India   Asia       1992    60.2  872000000     1164.                  7.06
## 4 India   Asia       1997    61.8  959000000     1459.                  7.29
## 5 India   Asia       2002    62.9 1034172547     1747.                  7.47
## 6 India   Asia       2007    64.7 1110396331     2452.                  7.80
# df3 has 36 rows: 12 observations for each of the 3 countries.
glimpse(df3)
## Rows: 36
## Columns: 7
## $ country               <fct> "Argentina", "Argentina", "Argentina", "Argentin…
## $ continent             <fct> Americas, Americas, Americas, Americas, Americas…
## $ year                  <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, …
## $ lifeExp               <dbl> 62.485, 64.399, 65.142, 65.634, 67.065, 68.481, …
## $ pop                   <int> 17876956, 19610538, 21283783, 22934225, 24779799…
## $ gdpPercap             <dbl> 5911.3151, 6856.8562, 7133.1660, 8052.9530, 9443…
## $ log_income_per_capita <dbl> 8.684624, 8.833004, 8.872510, 8.993794, 9.153033…

Draw plots for the 3 countries. Use a different color for each country.

Note: “color = country” is inside aes()!

# One line per country: income per capita over time.
ggplot(df3, mapping = aes(x = year, y = gdpPercap, color = country)) +
  geom_line()

# The same line plot using the log of income per capita, with custom labels.
ggplot(
  df3,
  mapping = aes(x = year, y = log_income_per_capita, color = country)
) +
  geom_line() +
  labs(
    x = "",                        # remove "year" on the X-axis
    y = "log(income per capita)",  # specify a new label for the Y-axis
    title = "Log of income per capita over time"
  )

Look at data for all countries, but for a certain year.

# year is a numeric variable, so 2007 is written without quotation marks.
df2 %>%
  filter(year == 2007)
## # A tibble: 142 × 7
##    country     continent  year lifeExp       pop gdpPercap log_income_per_capita
##    <fct>       <fct>     <int>   <dbl>     <int>     <dbl>                 <dbl>
##  1 Afghanistan Asia       2007    43.8  31889923      975.                  6.88
##  2 Albania     Europe     2007    76.4   3600523     5937.                  8.69
##  3 Algeria     Africa     2007    72.3  33333216     6223.                  8.74
##  4 Angola      Africa     2007    42.7  12420476     4797.                  8.48
##  5 Argentina   Americas   2007    75.3  40301927    12779.                  9.46
##  6 Australia   Oceania    2007    81.2  20434176    34435.                 10.4 
##  7 Austria     Europe     2007    79.8   8199783    36126.                 10.5 
##  8 Bahrain     Asia       2007    75.6    708573    29796.                 10.3 
##  9 Bangladesh  Asia       2007    64.1 150448339     1391.                  7.24
## 10 Belgium     Europe     2007    79.4  10392226    33693.                 10.4 
## # ℹ 132 more rows
# Save the 2007 rows in a new dataframe called df4.

df4 <- df2 %>%
  filter(year == 2007)

# Every row of df4 has year equal to 2007.
head(df4)
## # A tibble: 6 × 7
##   country     continent  year lifeExp      pop gdpPercap log_income_per_capita
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>                 <dbl>
## 1 Afghanistan Asia       2007    43.8 31889923      975.                  6.88
## 2 Albania     Europe     2007    76.4  3600523     5937.                  8.69
## 3 Algeria     Africa     2007    72.3 33333216     6223.                  8.74
## 4 Angola      Africa     2007    42.7 12420476     4797.                  8.48
## 5 Argentina   Americas   2007    75.3 40301927    12779.                  9.46
## 6 Australia   Oceania    2007    81.2 20434176    34435.                 10.4
# df4 has 142 rows and the same 7 columns as df2.
glimpse(df4)
## Rows: 142
## Columns: 7
## $ country               <fct> "Afghanistan", "Albania", "Algeria", "Angola", "…
## $ continent             <fct> Asia, Europe, Africa, Africa, Americas, Oceania,…
## $ year                  <int> 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, …
## $ lifeExp               <dbl> 43.828, 76.423, 72.301, 42.731, 75.320, 81.235, …
## $ pop                   <int> 31889923, 3600523, 33333216, 12420476, 40301927,…
## $ gdpPercap             <dbl> 974.5803, 5937.0295, 6223.3675, 4797.2313, 12779…
## $ log_income_per_capita <dbl> 6.882007, 8.688964, 8.736066, 8.475794, 9.455588…

The end