An example with dplyr

The data set records the species and weight of animals caught in plots in a study area in Arizona over time.

Each row holds information for a single animal, and the columns represent:

record_id: Unique id for the observation
month: month of observation
day: day of observation
year: year of observation
plot_id: ID of a particular plot
species_id: 2-letter species code
sex: sex of animal (“M”, “F”)
hindfoot_length: length of the hindfoot in mm
weight: weight of the animal in grams
genus: genus of animal
species: species of animal
taxa: e.g. Rodent, Reptile, Bird, Rabbit
plot_type: type of plot

# Load the tidyverse; pacman::p_load() installs the package first if it is missing
pacman::p_load(tidyverse)

# Read the joined Portal survey data from its URL into a tibble; readr
# guesses the type of every column and reports the specification it used
dta <- readr::read_csv(
  file = "http://kbroman.org/datacarp/portal_data_joined.csv"
)

## Parsed with column specification:
## cols(
##   record_id = col_double(),
##   month = col_double(),
##   day = col_double(),
##   year = col_double(),
##   plot_id = col_double(),
##   species_id = col_character(),
##   sex = col_character(),
##   hindfoot_length = col_double(),
##   weight = col_double(),
##   genus = col_character(),
##   species = col_character(),
##   taxa = col_character(),
##   plot_type = col_character()
## )

# Compact overview of the data: one line per column showing its type and
# its first few values
dplyr::glimpse(dta)

## Observations: 34,786
## Variables: 13
## $ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, …
## $ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9…
## $ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9,…
## $ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978,…
## $ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,…
## $ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL",…
## $ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, …
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32,…
## $ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 20…
## $ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Ne…
## $ species         <chr> "albigula", "albigula", "albigula", "albigula",…
## $ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent…
## $ plot_type       <chr> "Control", "Control", "Control", "Control", "Co…

# Dimensions of the data frame: number of rows, then number of columns
dim(dta)

## [1] 34786    13

# Keep only the plot_id, species_id and weight columns and show the first rows
dta %>%
  dplyr::select(plot_id, species_id, weight) %>%
  head()

# Drop the record_id and species_id columns and show the first rows
dta %>%
  dplyr::select(-c(record_id, species_id)) %>%
  head()

# Keep only the observations made in 1995 and show the first rows
dta %>%
  dplyr::filter(year == 1995) %>%
  head()

# For animals weighing at most 5 g, show species_id, sex and weight for the
# first rows. Written as nested function calls, read from the inside out.
# filter() keeps only rows where the condition is TRUE, so rows with a
# missing weight are dropped as well.
head(
  dplyr::select(
    dplyr::filter(dta, weight <= 5),
    species_id, sex, weight
  )
)

# The same query written as a pipeline, read from top to bottom
dta %>%
  dplyr::filter(weight <= 5) %>%
  dplyr::select(species_id, sex, weight) %>%
  head()

# Add the weight in kilograms (weight_kg) and in pounds (weight_lb); the
# pounds column is derived from the new kilograms column. Show the first rows.
dta %>%
  mutate(weight_kg = weight / 1000) %>%
  mutate(weight_lb = weight_kg * 2.2) %>%
  head()

# Mean weight for every sex / species_id combination, heaviest first.
# Rows without a recorded weight are removed beforehand so that mean() never
# sees an NA. Show the first rows.
dta %>%
  dplyr::filter(!is.na(weight)) %>%
  dplyr::group_by(sex, species_id) %>%
  dplyr::summarise(mean_weight = mean(weight)) %>%
  dplyr::arrange(dplyr::desc(mean_weight)) %>%
  head()

# Number of observations for each value of sex: group, then tally the rows
dta %>%
  group_by(sex) %>%
  tally()

# Same counts; count() does the grouping and the tallying in one step
count(dta, sex)

# Same counts computed with summarise(); n() is the size of each group and
# the result column is named `count` instead of `n`
dta %>%
  group_by(sex) %>%
  summarise(count = n())

# Again per sex, but counting only the rows whose year is not NA
dta %>%
  group_by(sex) %>%
  summarise(count = sum(!is.na(year)))

# Build dta_gw: the mean weight for every genus / plot_id combination,
# ignoring animals whose weight was not recorded. The summary stays grouped
# by genus.
dta_gw <- dta %>%
  dplyr::filter(!is.na(weight)) %>%
  dplyr::group_by(genus, plot_id) %>%
  dplyr::summarise(mean_weight = mean(weight))

# Inspect the structure of dta_gw
dplyr::glimpse(dta_gw)

## Observations: 196
## Variables: 3
## Groups: genus [10]
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomy…
## $ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8,…
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9…

# Reshape dta_gw from long to wide: one row per plot_id and one column per
# genus, holding mean_weight. Genus / plot combinations that do not occur in
# dta_gw become NA.
# pivot_wider() replaces the superseded spread(). The grouping by genus is
# dropped first because genus stops being a column, and the rows are sorted
# by plot_id to reproduce the row order spread() returned.
dta_w <- dta_gw %>%
  ungroup() %>%
  pivot_wider(names_from = genus, values_from = mean_weight) %>%
  arrange(plot_id)

# Inspect the structure of dta_w
glimpse(dta_w)

## Observations: 24
## Variables: 11
## $ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, …
## $ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA,…
## $ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.9827…
## $ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.1135…
## $ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.037…
## $ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.0169…
## $ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.65853…
## $ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.2317…
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.…
## $ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.…
## $ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

# Reshape dta_gw from long to wide (one column per genus holding
# mean_weight), writing 0 instead of NA for genus / plot combinations that do
# not occur in dta_gw; show the first rows.
# pivot_wider() replaces the superseded spread(); values_fill is given as a
# named list, the form every tidyr release that has pivot_wider() accepts.
# Sorting by plot_id keeps the rows head() shows the same as with spread().
dta_gw %>%
  ungroup() %>%
  pivot_wider(
    names_from = genus,
    values_from = mean_weight,
    values_fill = list(mean_weight = 0)
  ) %>%
  arrange(plot_id) %>%
  head()

# Reshape dta_w back to long format: every column except plot_id is stacked
# into a genus / mean_weight pair.
# pivot_longer() replaces the superseded gather(); sorting by genus and then
# plot_id reproduces the column-by-column row order gather() returned.
dta_l <- dta_w %>%
  pivot_longer(
    cols = -plot_id,
    names_to = "genus",
    values_to = "mean_weight"
  ) %>%
  arrange(genus, plot_id)

# Inspect the structure of dta_l
glimpse(dta_l)

## Observations: 240
## Variables: 3
## $ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomy…
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA,…

# Wide-to-long reshape of dta_w, selecting the genus columns as the range
# Baiomys:Spermophilus instead of excluding plot_id; show the first rows.
# pivot_longer() replaces the superseded gather(); sorting by genus and then
# plot_id reproduces the column-by-column row order gather() returned.
dta_w %>%
  pivot_longer(
    cols = Baiomys:Spermophilus,
    names_to = "genus",
    values_to = "mean_weight"
  ) %>%
  arrange(genus, plot_id) %>%
  head()

# Keep only the complete records: rows with a missing weight,
# hindfoot_length or sex are dropped
dta_complete <- dta %>%
  drop_na(weight, hindfoot_length, sex)
dta_complete

# Count the complete records per species_id and keep the species that were
# observed at least 50 times
species_counts <- dta_complete %>%
  dplyr::count(species_id) %>%
  dplyr::filter(n >= 50)
species_counts

# Restrict dta_complete to the species that are listed in species_counts
dta_complete <- dplyr::filter(
  dta_complete,
  species_id %in% species_counts[["species_id"]]
)
dta_complete