1 Packages and data

# install the wpp2019 package with install.packages(), but only when it is
# not already installed, so re-running this script does not download it again
# the repository is addressed over https rather than plain http
if (!requireNamespace("wpp2019", quietly = TRUE)) {
  install.packages("wpp2019", repos = "https://cloud.r-project.org")
}
# load the tidyverse package in your current R session
library(tidyverse)

# load the wpp2019 package in your current R session
library(wpp2019)

Read in dataset

# read the fish dataset from its CSV file into a data frame, then print it
fish.tibble <- read.csv("data/fishtibble.csv")
fish.tibble
##   whitefish bluefish greenfish yellowfish         location
## 1         5        5         3          1        Australia
## 2         5        2         5          3        Indonesia
## 3         4        6         5          3      Philippines
## 4         4        5         8          1             Fiji
## 5         4        4         9          4         Solomons
## 6         4        5         5          3 Papua New Guinea

2 Introduction

Overview of tidyverse

3 Five functions

  1. filter(): keep/remove rows based on criteria
  2. select(): keep/remove columns by name/number/sequence
  3. mutate(): add new variables
  4. summarize(): reduce variables to summarized values
  5. arrange(): reorder rows

3.1 Filter

# pipe "fish.tibble" into filter() and store the result as a new object
# keep only the rows whose location is Australia
fish.tbl.filtered <- fish.tibble %>% filter(location == "Australia")
fish.tbl.filtered
##   whitefish bluefish greenfish yellowfish  location
## 1         5        5         3          1 Australia
# match a column against several values using %in% together with c()
# (`==` would recycle c("Australia", "Indonesia") down the rows and compare
# element by element, so it only works when the rows happen to line up)
# get rows for Australia and Indonesia
fish.tbl.filtered2 <- fish.tibble %>%
  filter(location %in% c("Australia", "Indonesia"))
fish.tbl.filtered2
##   whitefish bluefish greenfish yellowfish  location
## 1         5        5         3          1 Australia
## 2         5        2         5          3 Indonesia
# apply two conditions to the same column
# conditions separated by a comma inside filter() are combined with AND
fish.tbl.filtered3 <- fish.tibble %>%
  filter(whitefish > 3, whitefish < 8)
fish.tbl.filtered3
##   whitefish bluefish greenfish yellowfish         location
## 1         5        5         3          1        Australia
## 2         5        2         5          3        Indonesia
## 3         4        6         5          3      Philippines
## 4         4        5         8          1             Fiji
## 5         4        4         9          4         Solomons
## 6         4        5         5          3 Papua New Guinea
# combine conditions on different columns, one character and one numeric
# keep rows where the location is Fiji OR bluefish is greater than 4
fish.tbl.filtered4 <- fish.tibble %>%
  filter(location == "Fiji" | bluefish > 4)
fish.tbl.filtered4
##   whitefish bluefish greenfish yellowfish         location
## 1         5        5         3          1        Australia
## 2         4        6         5          3      Philippines
## 3         4        5         8          1             Fiji
## 4         4        5         5          3 Papua New Guinea

3.2 Select

# pick columns by listing their names
# keep only the whitefish and location columns
fish.tbl.select <- fish.tibble %>% select(whitefish, location)
fish.tbl.select
##   whitefish         location
## 1         5        Australia
## 2         5        Indonesia
## 3         4      Philippines
## 4         4             Fiji
## 5         4         Solomons
## 6         4 Papua New Guinea
# columns can be selected by name, by position, or as a range
# select the last three columns (positions 3 to 5) as a range with the
# x:y operator, written here with column names
fish.tbl.select2 <- fish.tibble %>%
  select(greenfish:location)
fish.tbl.select2
##   greenfish yellowfish         location
## 1         3          1        Australia
## 2         5          3        Indonesia
## 3         5          3      Philippines
## 4         8          1             Fiji
## 5         9          4         Solomons
## 6         5          3 Papua New Guinea
# a minus sign inside select() drops columns instead of keeping them
# drop the yellowfish and greenfish columns
fish.tbl.select3 <- fish.tibble %>%
  select(-c(yellowfish, greenfish))
fish.tbl.select3
##   whitefish bluefish         location
## 1         5        5        Australia
## 2         5        2        Indonesia
## 3         4        6      Philippines
## 4         4        5             Fiji
## 5         4        4         Solomons
## 6         4        5 Papua New Guinea
# selection helpers pick columns by a pattern in their names
# keep every column whose name ends with "fish"
fish.tbl.select4 <- fish.tibble %>% select(ends_with("fish"))
fish.tbl.select4
##   whitefish bluefish greenfish yellowfish
## 1         5        5         3          1
## 2         5        2         5          3
## 3         4        6         5          3
## 4         4        5         8          1
## 5         4        4         9          4
## 6         4        5         5          3

3.3 Mutate

# mutate() adds new columns to a dataset
# silverfish: values for another fish species (0, 2, ..., 10)
# type: a categorical variable with one label per row
fish.tbl.mutate <- fish.tibble %>%
  mutate(
    silverfish = seq(0, 10, by = 2),
    type = c(
      "continental", "continental", "continental",
      "oceanic", "oceanic", "continental"
    )
  )
fish.tbl.mutate
##   whitefish bluefish greenfish yellowfish         location silverfish
## 1         5        5         3          1        Australia          0
## 2         5        2         5          3        Indonesia          2
## 3         4        6         5          3      Philippines          4
## 4         4        5         8          1             Fiji          6
## 5         4        4         9          4         Solomons          8
## 6         4        5         5          3 Papua New Guinea         10
##          type
## 1 continental
## 2 continental
## 3 continental
## 4     oceanic
## 5     oceanic
## 6 continental
# mutate() can compute a new column from existing numeric columns
# totalfish: row-wise sum over the five fish species columns
# relocate() then moves location and type to the front of the dataset
fish.tbl.mutate2 <- fish.tbl.mutate %>%
  mutate(
    totalfish = whitefish + bluefish + greenfish + yellowfish + silverfish
  ) %>%
  relocate(location, type)
fish.tbl.mutate2
##           location        type whitefish bluefish greenfish yellowfish
## 1        Australia continental         5        5         3          1
## 2        Indonesia continental         5        2         5          3
## 3      Philippines continental         4        6         5          3
## 4             Fiji     oceanic         4        5         8          1
## 5         Solomons     oceanic         4        4         9          4
## 6 Papua New Guinea continental         4        5         5          3
##   silverfish totalfish
## 1          0        14
## 2          2        17
## 3          4        22
## 4          6        24
## 5          8        29
## 6         10        27
# mutate() can also combine character columns
# loc_type: location and type pasted together with a period in between
fish.tbl.mutate3 <- fish.tbl.mutate2 %>%
  mutate(loc_type = paste0(location, ".", type))
fish.tbl.mutate3
##           location        type whitefish bluefish greenfish yellowfish
## 1        Australia continental         5        5         3          1
## 2        Indonesia continental         5        2         5          3
## 3      Philippines continental         4        6         5          3
## 4             Fiji     oceanic         4        5         8          1
## 5         Solomons     oceanic         4        4         9          4
## 6 Papua New Guinea continental         4        5         5          3
##   silverfish totalfish                     loc_type
## 1          0        14        Australia.continental
## 2          2        17        Indonesia.continental
## 3          4        22      Philippines.continental
## 4          6        24                 Fiji.oceanic
## 5          8        29             Solomons.oceanic
## 6         10        27 Papua New Guinea.continental
# recode() inside mutate() swaps selected values for new ones
# type.recode: copy of type in which "continental" is replaced by "coastal";
# values that are not listed (here "oceanic") are left as they are
fish.tbl.mutate4 <- fish.tbl.mutate3 %>%
  mutate(
    type.recode = recode(type, "continental" = "coastal")
  )
fish.tbl.mutate4
##           location        type whitefish bluefish greenfish yellowfish
## 1        Australia continental         5        5         3          1
## 2        Indonesia continental         5        2         5          3
## 3      Philippines continental         4        6         5          3
## 4             Fiji     oceanic         4        5         8          1
## 5         Solomons     oceanic         4        4         9          4
## 6 Papua New Guinea continental         4        5         5          3
##   silverfish totalfish                     loc_type type.recode
## 1          0        14        Australia.continental     coastal
## 2          2        17        Indonesia.continental     coastal
## 3          4        22      Philippines.continental     coastal
## 4          6        24                 Fiji.oceanic     oceanic
## 5          8        29             Solomons.oceanic     oceanic
## 6         10        27 Papua New Guinea.continental     coastal
# mutate() can apply a transformation to an existing column
# log_totalfish: natural log of totalfish
# built from fish.tbl.mutate3, so the type.recode column is not carried over
new.fish.tbl <- fish.tbl.mutate3 %>% mutate(log_totalfish = log(totalfish))
new.fish.tbl
##           location        type whitefish bluefish greenfish yellowfish
## 1        Australia continental         5        5         3          1
## 2        Indonesia continental         5        2         5          3
## 3      Philippines continental         4        6         5          3
## 4             Fiji     oceanic         4        5         8          1
## 5         Solomons     oceanic         4        4         9          4
## 6 Papua New Guinea continental         4        5         5          3
##   silverfish totalfish                     loc_type log_totalfish
## 1          0        14        Australia.continental      2.639057
## 2          2        17        Indonesia.continental      2.833213
## 3          4        22      Philippines.continental      3.091042
## 4          6        24                 Fiji.oceanic      3.178054
## 5          8        29             Solomons.oceanic      3.367296
## 6         10        27 Papua New Guinea.continental      3.295837

3.4 Summarize

# summarize() collapses all rows into a single row of summary values
# compute the mean, standard deviation, minimum and maximum of whitefish
sum.whitefish <- new.fish.tbl %>%
  summarize(
    mean.whitefish = mean(whitefish),
    sd.whitefish = sd(whitefish),
    min.whitefish = min(whitefish),
    max.whitefish = max(whitefish)
  )
sum.whitefish
##   mean.whitefish sd.whitefish min.whitefish max.whitefish
## 1       4.333333    0.5163978             4             5
# one summarize() call can summarise several columns at once
# compute the mean of whitefish and the mean of bluefish
sum.whiteblue <- new.fish.tbl %>%
  summarize(mean.whitefish = mean(whitefish), mean.bluefish = mean(bluefish))
sum.whiteblue
##   mean.whitefish mean.bluefish
## 1       4.333333           4.5
# create summaries of the range and quantiles for totalfish
# range() returns two values (minimum, maximum) and quantile() is asked for
# two probabilities (5% and 95%), so the result has two rows instead of one
# NOTE(review): newer dplyr releases (1.1.0 and later) deprecate returning more
# than one row from summarize() in favour of reframe() - confirm against the
# installed dplyr version before changing this call
range.total <- new.fish.tbl %>%
  summarize(range.total = range(totalfish), 
            quant.total = quantile(totalfish, c(0.05, 0.95)))
range.total
##   range.total quant.total
## 1          14       14.75
## 2          29       28.50

3.5 Arrange

# arrange() reorders the rows of a dataset, much like sorting in base R
# order the rows alphabetically by location
fish.tbl.order <- new.fish.tbl %>% arrange(location)
fish.tbl.order
##           location        type whitefish bluefish greenfish yellowfish
## 1        Australia continental         5        5         3          1
## 2             Fiji     oceanic         4        5         8          1
## 3        Indonesia continental         5        2         5          3
## 4 Papua New Guinea continental         4        5         5          3
## 5      Philippines continental         4        6         5          3
## 6         Solomons     oceanic         4        4         9          4
##   silverfish totalfish                     loc_type log_totalfish
## 1          0        14        Australia.continental      2.639057
## 2          6        24                 Fiji.oceanic      3.178054
## 3          2        17        Indonesia.continental      2.833213
## 4         10        27 Papua New Guinea.continental      3.295837
## 5          4        22      Philippines.continental      3.091042
## 6          8        29             Solomons.oceanic      3.367296
# arrange() sorts ascending by default; wrap a column in desc() to reverse it
# order the rows by totalfish, largest first
fish.tbl.order.total <- new.fish.tbl %>%
  arrange(desc(totalfish))
fish.tbl.order.total
##           location        type whitefish bluefish greenfish yellowfish
## 1         Solomons     oceanic         4        4         9          4
## 2 Papua New Guinea continental         4        5         5          3
## 3             Fiji     oceanic         4        5         8          1
## 4      Philippines continental         4        6         5          3
## 5        Indonesia continental         5        2         5          3
## 6        Australia continental         5        5         3          1
##   silverfish totalfish                     loc_type log_totalfish
## 1          8        29             Solomons.oceanic      3.367296
## 2         10        27 Papua New Guinea.continental      3.295837
## 3          6        24                 Fiji.oceanic      3.178054
## 4          4        22      Philippines.continental      3.091042
## 5          2        17        Indonesia.continental      2.833213
## 6          0        14        Australia.continental      2.639057
# arrange() always places NA values last, even when sorting in descending order
# na.fish: new column that contains one NA (fourth row)
# order the rows by na.fish, largest first, then use relocate() to move
# na.fish to the first column of the dataset
fish.tbl.order.NA <- new.fish.tbl %>%
  mutate(na.fish = c(1, 2, 3, NA, 5, 6)) %>%
  arrange(desc(na.fish)) %>%
  relocate(na.fish)
fish.tbl.order.NA
##   na.fish         location        type whitefish bluefish greenfish yellowfish
## 1       6 Papua New Guinea continental         4        5         5          3
## 2       5         Solomons     oceanic         4        4         9          4
## 3       3      Philippines continental         4        6         5          3
## 4       2        Indonesia continental         5        2         5          3
## 5       1        Australia continental         5        5         3          1
## 6      NA             Fiji     oceanic         4        5         8          1
##   silverfish totalfish                     loc_type log_totalfish
## 1         10        27 Papua New Guinea.continental      3.295837
## 2          8        29             Solomons.oceanic      3.367296
## 3          4        22      Philippines.continental      3.091042
## 4          2        17        Indonesia.continental      2.833213
## 5          0        14        Australia.continental      2.639057
## 6          6        24                 Fiji.oceanic      3.178054

4 Other functions

  1. group_by(): groups rows by column values in the data frame
  2. *_join(): join two data frames by matching values in shared key columns
  3. gather(): collates a set of columns into a single column
  4. spread(): creates additional columns from a single column

4.1 Group_by

# print new.fish.tbl again to recall its columns and rows before grouping it
new.fish.tbl
##           location        type whitefish bluefish greenfish yellowfish
## 1        Australia continental         5        5         3          1
## 2        Indonesia continental         5        2         5          3
## 3      Philippines continental         4        6         5          3
## 4             Fiji     oceanic         4        5         8          1
## 5         Solomons     oceanic         4        4         9          4
## 6 Papua New Guinea continental         4        5         5          3
##   silverfish totalfish                     loc_type log_totalfish
## 1          0        14        Australia.continental      2.639057
## 2          2        17        Indonesia.continental      2.833213
## 3          4        22      Philippines.continental      3.091042
## 4          6        24                 Fiji.oceanic      3.178054
## 5          8        29             Solomons.oceanic      3.367296
## 6         10        27 Papua New Guinea.continental      3.295837
# group the rows by the values of type
# the data are unchanged, but the result is a grouped tibble (note the
# "Groups: type [2]" line when printed), so later verbs work within each group
fish.tbl.grouped <- new.fish.tbl %>% group_by(type)
fish.tbl.grouped
## # A tibble: 6 × 10
## # Groups:   type [2]
##   location type  white…¹ bluef…² green…³ yello…⁴ silve…⁵ total…⁶ loc_t…⁷ log_t…⁸
##   <chr>    <chr>   <int>   <int>   <int>   <int>   <dbl>   <dbl> <chr>     <dbl>
## 1 Austral… cont…       5       5       3       1       0      14 Austra…    2.64
## 2 Indones… cont…       5       2       5       3       2      17 Indone…    2.83
## 3 Philipp… cont…       4       6       5       3       4      22 Philip…    3.09
## 4 Fiji     ocea…       4       5       8       1       6      24 Fiji.o…    3.18
## 5 Solomons ocea…       4       4       9       4       8      29 Solomo…    3.37
## 6 Papua N… cont…       4       5       5       3      10      27 Papua …    3.30
## # … with abbreviated variable names ¹​whitefish, ²​bluefish, ³​greenfish,
## #   ⁴​yellowfish, ⁵​silverfish, ⁶​totalfish, ⁷​loc_type, ⁸​log_totalfish
# add summarize() to see new behavior: one summary row per group of type
# name the new column with `=`; with `<-` the whole expression
# `mean.fish <- mean(totalfish)` would be used as the column name
fish.tbl.sum <- fish.tbl.grouped %>%
  summarize(mean.fish = mean(totalfish))
fish.tbl.sum
## # A tibble: 2 × 2
##   type        mean.fish
##   <chr>           <dbl>
## 1 continental      20  
## 2 oceanic          26.5

4.2 Join

Four versions

  1. left_join(): retains all rows from the left-hand data frame
  2. right_join(): retains all rows from the right-hand data frame
  3. inner_join(): retains only rows whose key matches in both data frames
  4. full_join(): retains all rows from both data frames
# the wpp2019 package includes a dataset called "pop" with population sizes
# for countries and for aggregates such as "World"
# data(pop) loads it into the workspace; str() shows its structure:
# a country code, a name, and one numeric column per five-year step (1950-2020)
data(pop)
str(pop)
## 'data.frame':    249 obs. of  17 variables:
##  $ country_code: int  900 947 1833 921 1832 1830 927 1835 1829 903 ...
##  $ name        : chr  "World" "Sub-Saharan Africa" "Northern Africa and Western Asia" "Central and Southern Asia" ...
##  $ 1950        : num  2536431 179007 100239 510788 842669 ...
##  $ 1955        : num  2773020 197490 113425 558666 932210 ...
##  $ 1960        : num  3034950 220138 129302 619068 1019895 ...
##  $ 1965        : num  3339584 247831 147822 691687 1127782 ...
##  $ 1970        : num  3700437 280908 168730 775437 1280853 ...
##  $ 1975        : num  4079480 321201 192351 870180 1432114 ...
##  $ 1980        : num  4458003 369614 220224 980359 1555768 ...
##  $ 1985        : num  4870922 425841 253469 1105791 1684698 ...
##  $ 1990        : num  5327231 490605 288060 1239984 1837799 ...
##  $ 1995        : num  5744213 560759 323178 1376200 1950220 ...
##  $ 2000        : num  6143494 639661 355882 1511915 2044789 ...
##  $ 2005        : num  6541907 729733 391986 1647074 2125348 ...
##  $ 2010        : num  6956824 836364 435367 1775361 2201807 ...
##  $ 2015        : num  7379797 958577 481520 1896327 2279490 ...
##  $ 2020        : num  7794799 1094366 525869 2014709 2346709 ...
# joining pop onto fish.tibble lets us add population size to the fish data
# first keep only the country name column and the 2020 column
# 2020 is not a syntactic R name, so it is wrapped in backticks
# rename name to location so the key column matches the fish.tibble dataset
# rename 2020 to population
pop.2020 <- pop %>%
  select(name, `2020`) %>%
  rename(location = name, population = `2020`)
head(pop.2020)
##                           location population
## 1                            World  7794798.7
## 2               Sub-Saharan Africa  1094365.6
## 3 Northern Africa and Western Asia   525869.3
## 4        Central and Southern Asia  2014708.5
## 5   Eastern and South-Eastern Asia  2346709.5
## 6  Latin America and the Caribbean   653962.3
# use left_join() to keep every row of fish.tibble (the left-hand table) and
# add the matching population from pop.2020
# the key column is named explicitly with `by`, so the join cannot silently
# pick up any other column the two tables might share
# Solomons does not exist in the pop dataset, so it gives "NA" for population size
fish.tibble.left.join <- fish.tibble %>%
  left_join(pop.2020, by = "location")
fish.tibble.left.join
##   whitefish bluefish greenfish yellowfish         location population
## 1         5        5         3          1        Australia  25499.881
## 2         5        2         5          3        Indonesia 273523.621
## 3         4        6         5          3      Philippines 109581.085
## 4         4        5         8          1             Fiji    896.444
## 5         4        4         9          4         Solomons         NA
## 6         4        5         5          3 Papua New Guinea   8947.027
# use inner_join() to keep only the locations found in both fish.tibble and
# pop.2020
# the key column is named explicitly with `by`, so the join cannot silently
# pick up any other column the two tables might share
# Solomons is now excluded from the dataset
fish.tibble.inner.join <- fish.tibble %>%
  inner_join(pop.2020, by = "location")
fish.tibble.inner.join
##   whitefish bluefish greenfish yellowfish         location population
## 1         5        5         3          1        Australia  25499.881
## 2         5        2         5          3        Indonesia 273523.621
## 3         4        6         5          3      Philippines 109581.085
## 4         4        5         8          1             Fiji    896.444
## 5         4        5         5          3 Papua New Guinea   8947.027

4.3 Gather

# gather() reshapes wide data into long format, which many analyses expect
# name the new key column (fish_species) and value column (number), then give
# the columns to gather: the four fish columns, whitefish through yellowfish
# note: tidyr now lists gather() as superseded by pivot_longer()
fish.gathered <- fish.tibble.inner.join %>%
  gather(key = "fish_species", value = "number", whitefish:yellowfish)
fish.gathered
##            location population fish_species number
## 1         Australia  25499.881    whitefish      5
## 2         Indonesia 273523.621    whitefish      5
## 3       Philippines 109581.085    whitefish      4
## 4              Fiji    896.444    whitefish      4
## 5  Papua New Guinea   8947.027    whitefish      4
## 6         Australia  25499.881     bluefish      5
## 7         Indonesia 273523.621     bluefish      2
## 8       Philippines 109581.085     bluefish      6
## 9              Fiji    896.444     bluefish      5
## 10 Papua New Guinea   8947.027     bluefish      5
## 11        Australia  25499.881    greenfish      3
## 12        Indonesia 273523.621    greenfish      5
## 13      Philippines 109581.085    greenfish      5
## 14             Fiji    896.444    greenfish      8
## 15 Papua New Guinea   8947.027    greenfish      5
## 16        Australia  25499.881   yellowfish      1
## 17        Indonesia 273523.621   yellowfish      3
## 18      Philippines 109581.085   yellowfish      3
## 19             Fiji    896.444   yellowfish      1
## 20 Papua New Guinea   8947.027   yellowfish      3
# summarise each species: group the long data by the newly created
# fish_species column, then take the mean and standard deviation of number
# within each group
fish.means <- fish.gathered %>%
  group_by(fish_species) %>%
  summarize(
    mean.fish = mean(number),
    sd.fish = sd(number)
  )
fish.means
## # A tibble: 4 × 3
##   fish_species mean.fish sd.fish
##   <chr>            <dbl>   <dbl>
## 1 bluefish           4.6   1.52 
## 2 greenfish          5.2   1.79 
## 3 whitefish          4.4   0.548
## 4 yellowfish         2.2   1.10

4.4 Spread

# spread() reverses gather(): it turns long data back into wide format
# each value of fish_species becomes its own column, filled from number
# note: tidyr now lists spread() as superseded by pivot_wider()
fish.spread <- fish.gathered %>% spread(fish_species, number)
fish.spread
##           location population bluefish greenfish whitefish yellowfish
## 1        Australia  25499.881        5         3         5          1
## 2             Fiji    896.444        5         8         4          1
## 3        Indonesia 273523.621        2         5         5          3
## 4 Papua New Guinea   8947.027        5         5         4          3
## 5      Philippines 109581.085        6         5         4          3