# install the wpp2019 package only if it is not already available, so that
# re-running this script does not reinstall it (or need network access) each time
# the package is fetched from a CRAN mirror over HTTPS
if (!requireNamespace("wpp2019", quietly = TRUE)) {
  install.packages("wpp2019", repos = "https://cloud.r-project.org")
}
# load the tidyverse package in your current R session
library(tidyverse)
# load the wpp2019 package in your current R session
library(wpp2019)
# Read in dataset ----
# import the fish counts from the CSV file (read.csv() returns a base data frame)
fish.tibble <- read.csv("data/fishtibble.csv")
# show the full dataset
fish.tibble
## whitefish bluefish greenfish yellowfish location
## 1 5 5 3 1 Australia
## 2 5 2 5 3 Indonesia
## 3 4 6 5 3 Philippines
## 4 4 5 8 1 Fiji
## 5 4 4 9 4 Solomons
## 6 4 5 5 3 Papua New Guinea
# Overview of tidyverse ----
# the pipe (%>%) passes "fish.tibble" to filter() as its first argument
# keep only the row whose location is Australia and store it in a new object
fish.tbl.filtered <- fish.tibble %>%
  filter(location == "Australia")
# show the filtered rows
fish.tbl.filtered
## whitefish bluefish greenfish yellowfish location
## 1 5 5 3 1 Australia
# match one column against several values using %in% with c()
# get rows for Australia and Indonesia
# note: `location == c(...)` would recycle the two values down the rows and
# only match where a value happens to line up with its row position, so it
# can silently drop rows; %in% tests each row against the whole set
fish.tbl.filtered2 <- fish.tibble %>%
  filter(location %in% c("Australia", "Indonesia"))
fish.tbl.filtered2
## whitefish bluefish greenfish yellowfish location
## 1 5 5 3 1 Australia
## 2 5 2 5 3 Indonesia
# keep rows where whitefish is greater than 3 and less than 8
# conditions separated by commas inside filter() are combined with AND
fish.tbl.filtered3 <- fish.tibble %>%
  filter(whitefish > 3, whitefish < 8)
fish.tbl.filtered3
## whitefish bluefish greenfish yellowfish location
## 1 5 5 3 1 Australia
## 2 5 2 5 3 Indonesia
## 3 4 6 5 3 Philippines
## 4 4 5 8 1 Fiji
## 5 4 4 9 4 Solomons
## 6 4 5 5 3 Papua New Guinea
# combine a character condition and a numeric condition with OR (|)
# keep rows that are Fiji or that have more than 4 bluefish
fish.tbl.filtered4 <- fish.tibble %>%
  filter(location == "Fiji" | bluefish > 4)
fish.tbl.filtered4
## whitefish bluefish greenfish yellowfish location
## 1 5 5 3 1 Australia
## 2 4 6 5 3 Philippines
## 3 4 5 8 1 Fiji
## 4 4 5 5 3 Papua New Guinea
# pick columns by name with select()
# keep only the whitefish and location columns
fish.tbl.select <- fish.tibble %>%
  select(c(whitefish, location))
fish.tbl.select
## whitefish location
## 1 5 Australia
## 2 5 Indonesia
## 3 4 Philippines
## 4 4 Fiji
## 5 4 Solomons
## 6 4 Papua New Guinea
# select() also accepts column positions and ranges, not just names
# the x:y operator picks a range: 3:5 keeps columns 3 through 5 (the last three)
fish.tbl.select2 <- select(fish.tibble, 3:5)
fish.tbl.select2
## greenfish yellowfish location
## 1 3 1 Australia
## 2 5 3 Indonesia
## 3 5 3 Philippines
## 4 8 1 Fiji
## 5 9 4 Solomons
## 6 5 3 Papua New Guinea
# a minus sign inside select() drops columns instead of keeping them
# drop the yellowfish and greenfish columns
fish.tbl.select3 <- fish.tibble %>%
  select(-c(yellowfish, greenfish))
fish.tbl.select3
## whitefish bluefish location
## 1 5 5 Australia
## 2 5 2 Indonesia
## 3 4 6 Philippines
## 4 4 5 Fiji
## 5 4 4 Solomons
## 6 4 5 Papua New Guinea
# selection helpers choose columns by a pattern in their names
# ends_with("fish") keeps every column whose name ends with "fish"
fish.tbl.select4 <- select(fish.tibble, ends_with("fish"))
fish.tbl.select4
## whitefish bluefish greenfish yellowfish
## 1 5 5 3 1
## 2 5 2 5 3
## 3 4 6 5 3
## 4 4 5 8 1
## 5 4 4 9 4
## 6 4 5 5 3
# mutate() appends new columns to the dataset
# add a numeric column (silverfish counts) and a categorical column (type);
# each vector supplies one value per row, in row order
fish.tbl.mutate <- fish.tibble %>%
  mutate(
    silverfish = c(0, 2, 4, 6, 8, 10),
    type = c(rep("continental", 3), rep("oceanic", 2), "continental")
  )
fish.tbl.mutate
## whitefish bluefish greenfish yellowfish location silverfish
## 1 5 5 3 1 Australia 0
## 2 5 2 5 3 Indonesia 2
## 3 4 6 5 3 Philippines 4
## 4 4 5 8 1 Fiji 6
## 5 4 4 9 4 Solomons 8
## 6 4 5 5 3 Papua New Guinea 10
## type
## 1 continental
## 2 continental
## 3 continental
## 4 oceanic
## 5 oceanic
## 6 continental
# mutate() can compute a new column from existing numeric columns
# totalfish is the row-wise sum of the five fish species counts
# relocate() with no position argument moves the named columns to the front,
# so location and type become the first two columns
fish.tbl.mutate2 <- fish.tbl.mutate %>%
  mutate(
    totalfish = whitefish + bluefish + greenfish + yellowfish + silverfish
  ) %>%
  relocate(c(location, type))
fish.tbl.mutate2
## location type whitefish bluefish greenfish yellowfish
## 1 Australia continental 5 5 3 1
## 2 Indonesia continental 5 2 5 3
## 3 Philippines continental 4 6 5 3
## 4 Fiji oceanic 4 5 8 1
## 5 Solomons oceanic 4 4 9 4
## 6 Papua New Guinea continental 4 5 5 3
## silverfish totalfish
## 1 0 14
## 2 2 17
## 3 4 22
## 4 6 24
## 5 8 29
## 6 10 27
# mutate() can also combine character variables
# loc_type joins the location and type values with a period between them
fish.tbl.mutate3 <- fish.tbl.mutate2 %>%
  mutate(loc_type = paste0(location, ".", type))
fish.tbl.mutate3
## location type whitefish bluefish greenfish yellowfish
## 1 Australia continental 5 5 3 1
## 2 Indonesia continental 5 2 5 3
## 3 Philippines continental 4 6 5 3
## 4 Fiji oceanic 4 5 8 1
## 5 Solomons oceanic 4 4 9 4
## 6 Papua New Guinea continental 4 5 5 3
## silverfish totalfish loc_type
## 1 0 14 Australia.continental
## 2 2 17 Indonesia.continental
## 3 4 22 Philippines.continental
## 4 6 24 Fiji.oceanic
## 5 8 29 Solomons.oceanic
## 6 10 27 Papua New Guinea.continental
# mutate() together with recode() replaces values
# recode() takes old = new pairs; values without a pair are left unchanged
# here "continental" becomes "coastal" in a new column, type.recode
fish.tbl.mutate4 <- fish.tbl.mutate3 %>%
  mutate(type.recode = recode(type, "continental" = "coastal"))
fish.tbl.mutate4
## location type whitefish bluefish greenfish yellowfish
## 1 Australia continental 5 5 3 1
## 2 Indonesia continental 5 2 5 3
## 3 Philippines continental 4 6 5 3
## 4 Fiji oceanic 4 5 8 1
## 5 Solomons oceanic 4 4 9 4
## 6 Papua New Guinea continental 4 5 5 3
## silverfish totalfish loc_type type.recode
## 1 0 14 Australia.continental coastal
## 2 2 17 Indonesia.continental coastal
## 3 4 22 Philippines.continental coastal
## 4 6 24 Fiji.oceanic oceanic
## 5 8 29 Solomons.oceanic oceanic
## 6 10 27 Papua New Guinea.continental coastal
# mutate() can apply a transformation to a column
# log_totalfish holds the natural log of totalfish
# (built from fish.tbl.mutate3, so the type.recode column is not carried over)
new.fish.tbl <- mutate(fish.tbl.mutate3, log_totalfish = log(totalfish))
new.fish.tbl
## location type whitefish bluefish greenfish yellowfish
## 1 Australia continental 5 5 3 1
## 2 Indonesia continental 5 2 5 3
## 3 Philippines continental 4 6 5 3
## 4 Fiji oceanic 4 5 8 1
## 5 Solomons oceanic 4 4 9 4
## 6 Papua New Guinea continental 4 5 5 3
## silverfish totalfish loc_type log_totalfish
## 1 0 14 Australia.continental 2.639057
## 2 2 17 Indonesia.continental 2.833213
## 3 4 22 Philippines.continental 3.091042
## 4 6 24 Fiji.oceanic 3.178054
## 5 8 29 Solomons.oceanic 3.367296
## 6 10 27 Papua New Guinea.continental 3.295837
# summarise() collapses all rows into a single row of summary statistics
# compute the mean, standard deviation, minimum and maximum of whitefish
sum.whitefish <- new.fish.tbl %>%
  summarise(
    mean.whitefish = mean(whitefish),
    sd.whitefish = sd(whitefish),
    min.whitefish = min(whitefish),
    max.whitefish = max(whitefish)
  )
sum.whitefish
## mean.whitefish sd.whitefish min.whitefish max.whitefish
## 1 4.333333 0.5163978 4 5
# summarise several columns at once
# across() applies mean() to each listed column; .names builds the output
# column names mean.whitefish and mean.bluefish
sum.whiteblue <- new.fish.tbl %>%
  summarise(across(c(whitefish, bluefish), mean, .names = "mean.{.col}"))
sum.whiteblue
## mean.whitefish mean.bluefish
## 1 4.333333 4.5
# summarise the spread of totalfish
# range() returns two values (min, max) and quantile() returns one value per
# requested probability, so the result has two rows rather than one
# NOTE: dplyr >= 1.1.0 deprecates multi-row results from summarize() in favour
# of reframe(); kept as summarize() to match the dplyr version used here
range.total <- new.fish.tbl %>%
  summarize(
    range.total = range(totalfish),
    quant.total = quantile(totalfish, probs = c(0.05, 0.95))
  )
range.total
## range.total quant.total
## 1 14 14.75
## 2 29 28.50
# arrange() is the dplyr counterpart of sorting in base R
# order the rows by location (ascending, i.e. alphabetical for text)
fish.tbl.order <- arrange(new.fish.tbl, location)
fish.tbl.order
## location type whitefish bluefish greenfish yellowfish
## 1 Australia continental 5 5 3 1
## 2 Fiji oceanic 4 5 8 1
## 3 Indonesia continental 5 2 5 3
## 4 Papua New Guinea continental 4 5 5 3
## 5 Philippines continental 4 6 5 3
## 6 Solomons oceanic 4 4 9 4
## silverfish totalfish loc_type log_totalfish
## 1 0 14 Australia.continental 2.639057
## 2 6 24 Fiji.oceanic 3.178054
## 3 2 17 Indonesia.continental 2.833213
## 4 10 27 Papua New Guinea.continental 3.295837
## 5 4 22 Philippines.continental 3.091042
## 6 8 29 Solomons.oceanic 3.367296
# arrange() sorts ascending by default; wrap a column in desc() to reverse it
# order the rows by totalfish, largest first
fish.tbl.order.total <- new.fish.tbl %>%
  arrange(desc(totalfish))
fish.tbl.order.total
## location type whitefish bluefish greenfish yellowfish
## 1 Solomons oceanic 4 4 9 4
## 2 Papua New Guinea continental 4 5 5 3
## 3 Fiji oceanic 4 5 8 1
## 4 Philippines continental 4 6 5 3
## 5 Indonesia continental 5 2 5 3
## 6 Australia continental 5 5 3 1
## silverfish totalfish loc_type log_totalfish
## 1 8 29 Solomons.oceanic 3.367296
## 2 10 27 Papua New Guinea.continental 3.295837
## 3 6 24 Fiji.oceanic 3.178054
## 4 4 22 Philippines.continental 3.091042
## 5 2 17 Indonesia.continental 2.833213
## 6 0 14 Australia.continental 2.639057
# missing values (NA) are always placed last by arrange(), even when descending
# add a column na.fish that contains one NA, then sort by it in descending order
# relocate() moves na.fish to the first column so the ordering is easy to see
fish.tbl.order.NA <- new.fish.tbl %>%
  mutate(na.fish = c(1, 2, 3, NA, 5, 6)) %>%
  arrange(desc(na.fish)) %>%
  relocate(na.fish)
fish.tbl.order.NA
## na.fish location type whitefish bluefish greenfish yellowfish
## 1 6 Papua New Guinea continental 4 5 5 3
## 2 5 Solomons oceanic 4 4 9 4
## 3 3 Philippines continental 4 6 5 3
## 4 2 Indonesia continental 5 2 5 3
## 5 1 Australia continental 5 5 3 1
## 6 NA Fiji oceanic 4 5 8 1
## silverfish totalfish loc_type log_totalfish
## 1 10 27 Papua New Guinea.continental 3.295837
## 2 8 29 Solomons.oceanic 3.367296
## 3 4 22 Philippines.continental 3.091042
## 4 2 17 Indonesia.continental 2.833213
## 5 0 14 Australia.continental 2.639057
## 6 6 24 Fiji.oceanic 3.178054
# re-familiarize with format of new.fish.tbl
# (print the ungrouped dataset as a reference point before it is grouped below)
new.fish.tbl
## location type whitefish bluefish greenfish yellowfish
## 1 Australia continental 5 5 3 1
## 2 Indonesia continental 5 2 5 3
## 3 Philippines continental 4 6 5 3
## 4 Fiji oceanic 4 5 8 1
## 5 Solomons oceanic 4 4 9 4
## 6 Papua New Guinea continental 4 5 5 3
## silverfish totalfish loc_type log_totalfish
## 1 0 14 Australia.continental 2.639057
## 2 2 17 Indonesia.continental 2.833213
## 3 4 22 Philippines.continental 3.091042
## 4 6 24 Fiji.oceanic 3.178054
## 5 8 29 Solomons.oceanic 3.367296
## 6 10 27 Papua New Guinea.continental 3.295837
# group the rows by type
# the values are unchanged, but the result carries grouping information, so
# later verbs such as summarize() operate once per group
fish.tbl.grouped <- group_by(new.fish.tbl, type)
fish.tbl.grouped
## # A tibble: 6 × 10
## # Groups: type [2]
## location type white…¹ bluef…² green…³ yello…⁴ silve…⁵ total…⁶ loc_t…⁷ log_t…⁸
## <chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <chr> <dbl>
## 1 Austral… cont… 5 5 3 1 0 14 Austra… 2.64
## 2 Indones… cont… 5 2 5 3 2 17 Indone… 2.83
## 3 Philipp… cont… 4 6 5 3 4 22 Philip… 3.09
## 4 Fiji ocea… 4 5 8 1 6 24 Fiji.o… 3.18
## 5 Solomons ocea… 4 4 9 4 8 29 Solomo… 3.37
## 6 Papua N… cont… 4 5 5 3 10 27 Papua … 3.30
## # … with abbreviated variable names ¹whitefish, ²bluefish, ³greenfish,
## # ⁴yellowfish, ⁵silverfish, ⁶totalfish, ⁷loc_type, ⁸log_totalfish
# add summarize() to see new behavior
# on grouped data, summarize() returns one row per group (here: per type)
# name the summary column with `=`; using `<-` inside summarize() does not name
# the column and instead labels it with the whole expression text
fish.tbl.sum <- fish.tbl.grouped %>%
  summarize(mean.fish = mean(totalfish))
fish.tbl.sum
## # A tibble: 2 × 2
## type mean.fish
## <chr> <dbl>
## 1 continental 20
## 2 oceanic 26.5
# Four versions ----
# wpp2019 ships a dataset named "pop" holding population sizes by country
# load it into the workspace, then inspect its structure
data("pop", package = "wpp2019")
str(pop)
## 'data.frame': 249 obs. of 17 variables:
## $ country_code: int 900 947 1833 921 1832 1830 927 1835 1829 903 ...
## $ name : chr "World" "Sub-Saharan Africa" "Northern Africa and Western Asia" "Central and Southern Asia" ...
## $ 1950 : num 2536431 179007 100239 510788 842669 ...
## $ 1955 : num 2773020 197490 113425 558666 932210 ...
## $ 1960 : num 3034950 220138 129302 619068 1019895 ...
## $ 1965 : num 3339584 247831 147822 691687 1127782 ...
## $ 1970 : num 3700437 280908 168730 775437 1280853 ...
## $ 1975 : num 4079480 321201 192351 870180 1432114 ...
## $ 1980 : num 4458003 369614 220224 980359 1555768 ...
## $ 1985 : num 4870922 425841 253469 1105791 1684698 ...
## $ 1990 : num 5327231 490605 288060 1239984 1837799 ...
## $ 1995 : num 5744213 560759 323178 1376200 1950220 ...
## $ 2000 : num 6143494 639661 355882 1511915 2044789 ...
## $ 2005 : num 6541907 729733 391986 1647074 2125348 ...
## $ 2010 : num 6956824 836364 435367 1775361 2201807 ...
## $ 2015 : num 7379797 958577 481520 1896327 2279490 ...
## $ 2020 : num 7794799 1094366 525869 2014709 2346709 ...
# joining pop with fish.tibble lets us add population size to the fish data
# keep only the country name and the 2020 population from pop
# select() can rename while selecting (new_name = old_name):
#   name   -> location   (to match the key column in fish.tibble)
#   2020   -> population
# a column name that is a number must be quoted; backticks are used here
pop.2020 <- pop %>%
  select(location = name, population = `2020`)
head(pop.2020)
## location population
## 1 World 7794798.7
## 2 Sub-Saharan Africa 1094365.6
## 3 Northern Africa and Western Asia 525869.3
## 4 Central and Southern Asia 2014708.5
## 5 Eastern and South-Eastern Asia 2346709.5
## 6 Latin America and the Caribbean 653962.3
# use left_join() to keep every row of fish.tibble (the left-hand table) and
# add population from pop.2020 where the location matches
# the key is named explicitly with `by`, so the join does not depend on
# whichever column names the two tables happen to share
# Solomons does not exist in the pop dataset, so it gives "NA" for population size
fish.tibble.left.join <- fish.tibble %>%
  left_join(pop.2020, by = "location")
fish.tibble.left.join
## whitefish bluefish greenfish yellowfish location population
## 1 5 5 3 1 Australia 25499.881
## 2 5 2 5 3 Indonesia 273523.621
## 3 4 6 5 3 Philippines 109581.085
## 4 4 5 8 1 Fiji 896.444
## 5 4 4 9 4 Solomons NA
## 6 4 5 5 3 Papua New Guinea 8947.027
# use inner_join() to join pop and fish.tibble datasets, keeping only the
# locations that appear in both
# the key is named explicitly with `by`, so the join does not depend on
# whichever column names the two tables happen to share
# Solomons is now excluded from the dataset
fish.tibble.inner.join <- fish.tibble %>%
  inner_join(pop.2020, by = "location")
fish.tibble.inner.join
## whitefish bluefish greenfish yellowfish location population
## 1 5 5 3 1 Australia 25499.881
## 2 5 2 5 3 Indonesia 273523.621
## 3 4 6 5 3 Philippines 109581.085
## 4 4 5 8 1 Fiji 896.444
## 5 4 5 5 3 Papua New Guinea 8947.027
# gather() reshapes wide data into long format, which may be useful for analyses
# name the new key column (holds the old column names) and the new value
# column (holds the counts), then give the columns to gather: 1:4, the four
# fish species columns
# NOTE: tidyr marks gather() as superseded by pivot_longer(); it still works
fish.gathered <- fish.tibble.inner.join %>%
  gather(key = "fish_species", value = "number", 1:4)
fish.gathered
## location population fish_species number
## 1 Australia 25499.881 whitefish 5
## 2 Indonesia 273523.621 whitefish 5
## 3 Philippines 109581.085 whitefish 4
## 4 Fiji 896.444 whitefish 4
## 5 Papua New Guinea 8947.027 whitefish 4
## 6 Australia 25499.881 bluefish 5
## 7 Indonesia 273523.621 bluefish 2
## 8 Philippines 109581.085 bluefish 6
## 9 Fiji 896.444 bluefish 5
## 10 Papua New Guinea 8947.027 bluefish 5
## 11 Australia 25499.881 greenfish 3
## 12 Indonesia 273523.621 greenfish 5
## 13 Philippines 109581.085 greenfish 5
## 14 Fiji 896.444 greenfish 8
## 15 Papua New Guinea 8947.027 greenfish 5
## 16 Australia 25499.881 yellowfish 1
## 17 Indonesia 273523.621 yellowfish 3
## 18 Philippines 109581.085 yellowfish 3
## 19 Fiji 896.444 yellowfish 1
## 20 Papua New Guinea 8947.027 yellowfish 3
# compute the mean and standard deviation of the counts for each species
# group_by() uses fish_species, the key column created by gather() above
fish.means <- fish.gathered %>%
  group_by(fish_species) %>%
  summarise(
    mean.fish = mean(number),
    sd.fish = sd(number)
  )
fish.means
## # A tibble: 4 × 3
## fish_species mean.fish sd.fish
## <chr> <dbl> <dbl>
## 1 bluefish 4.6 1.52
## 2 greenfish 5.2 1.79
## 3 whitefish 4.4 0.548
## 4 yellowfish 2.2 1.10
# spread() reverses gather(): it turns long data back into a condensed, wide format
# the first argument after the data is the key column (becomes the new column
# names) and the second is the value column (fills those columns)
fish.spread <- fish.gathered %>%
  spread(fish_species, number)
fish.spread
## location population bluefish greenfish whitefish yellowfish
## 1 Australia 25499.881 5 3 5 1
## 2 Fiji 896.444 5 8 4 1
## 3 Indonesia 273523.621 2 5 5 3
## 4 Papua New Guinea 8947.027 5 5 4 3
## 5 Philippines 109581.085 6 5 4 3