In-class exercise 3

pacman::p_load(tidyverse)

# Read portal_data_joined.csv from the course website into the tibble `dta`;
# every later step in this script works from this object.
dta <- readr::read_csv(
  file = "http://kbroman.org/datacarp/portal_data_joined.csv"
)

## Parsed with column specification:
## cols(
##   record_id = col_double(),
##   month = col_double(),
##   day = col_double(),
##   year = col_double(),
##   plot_id = col_double(),
##   species_id = col_character(),
##   sex = col_character(),
##   hindfoot_length = col_double(),
##   weight = col_double(),
##   genus = col_character(),
##   species = col_character(),
##   taxa = col_character(),
##   plot_type = col_character()
## )

# Compact overview of `dta`: one line per column with its type and first values.
dta %>%
  glimpse()

## Observations: 34,786
## Variables: 13
## $ record_id       <dbl> 1, 72, 224, 266, 349, 363, 435, 506, 588, 661, 748, …
## $ month           <dbl> 7, 8, 9, 10, 11, 11, 12, 1, 2, 3, 4, 5, 6, 8, 9, 10,…
## $ day             <dbl> 16, 19, 13, 16, 12, 12, 10, 8, 18, 11, 8, 6, 9, 5, 4…
## $ year            <dbl> 1977, 1977, 1977, 1977, 1977, 1977, 1977, 1978, 1978…
## $ plot_id         <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ species_id      <chr> "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL", "NL"…
## $ sex             <chr> "M", "M", NA, NA, NA, NA, NA, NA, "M", NA, NA, "M", …
## $ hindfoot_length <dbl> 32, 31, NA, NA, NA, NA, NA, NA, NA, NA, NA, 32, NA, …
## $ weight          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, 218, NA, NA, 204, 20…
## $ genus           <chr> "Neotoma", "Neotoma", "Neotoma", "Neotoma", "Neotoma…
## $ species         <chr> "albigula", "albigula", "albigula", "albigula", "alb…
## $ taxa            <chr> "Rodent", "Rodent", "Rodent", "Rodent", "Rodent", "R…
## $ plot_type       <chr> "Control", "Control", "Control", "Control", "Control…

# Size of the table as (number of rows, number of columns).
c(nrow(dta), ncol(dta))

## [1] 34786    13

# Keep only three columns and preview the first six rows.
dta %>%
  dplyr::select(plot_id, species_id, weight) %>%
  head()

## # A tibble: 6 x 3
##   plot_id species_id weight
##     <dbl> <chr>       <dbl>
## 1       2 NL             NA
## 2       2 NL             NA
## 3       2 NL             NA
## 4       2 NL             NA
## 5       2 NL             NA
## 6       2 NL             NA

# Drop the two identifier columns, keep everything else, and preview the result.
dta %>%
  dplyr::select(-c(record_id, species_id)) %>%
  head()

## # A tibble: 6 x 11
##   month   day  year plot_id sex   hindfoot_length weight genus species taxa 
##   <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr> <chr>   <chr>
## 1     7    16  1977       2 M                  32     NA Neot… albigu… Rode…
## 2     8    19  1977       2 M                  31     NA Neot… albigu… Rode…
## 3     9    13  1977       2 <NA>               NA     NA Neot… albigu… Rode…
## 4    10    16  1977       2 <NA>               NA     NA Neot… albigu… Rode…
## 5    11    12  1977       2 <NA>               NA     NA Neot… albigu… Rode…
## 6    11    12  1977       2 <NA>               NA     NA Neot… albigu… Rode…
## # … with 1 more variable: plot_type <chr>

# Rows recorded in 1995 only; preview the first six.
dta %>%
  dplyr::filter(year == 1995) %>%
  head()

## # A tibble: 6 x 13
##   record_id month   day  year plot_id species_id sex   hindfoot_length weight
##       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
## 1     22314     6     7  1995       2 NL         M                  34     NA
## 2     22728     9    23  1995       2 NL         F                  32    165
## 3     22899    10    28  1995       2 NL         F                  32    171
## 4     23032    12     2  1995       2 NL         F                  33     NA
## 5     22003     1    11  1995       2 DM         M                  37     41
## 6     22042     2     4  1995       2 DM         F                  36     45
## # … with 4 more variables: genus <chr>, species <chr>, taxa <chr>,
## #   plot_type <chr>

# Nested-call form (read inside-out): keep rows with weight <= 5, keep three
# columns, then show the first six rows.
head(
  dplyr::select(
    dplyr::filter(dta, weight <= 5),
    species_id, sex, weight
  ),
  n = 6L
)

## # A tibble: 6 x 3
##   species_id sex   weight
##   <chr>      <chr>  <dbl>
## 1 PF         M          5
## 2 PF         F          5
## 3 PF         F          5
## 4 PF         F          4
## 5 PF         F          5
## 6 PF         F          4

# Pipe form of the light-animal query: narrow to the three columns of
# interest, keep rows with weight <= 5, then show the first six rows.
dta %>%
  dplyr::select(species_id, sex, weight) %>%
  dplyr::filter(weight <= 5) %>%
  head()

## # A tibble: 6 x 3
##   species_id sex   weight
##   <chr>      <chr>  <dbl>
## 1 PF         M          5
## 2 PF         F          5
## 3 PF         F          5
## 4 PF         F          4
## 5 PF         F          5
## 6 PF         F          4

# Add two derived columns: weight_kg (weight / 1000) and weight_lb
# (weight_kg * 2.2). Presumably `weight` is in grams -- confirm with the
# data documentation. The second step depends on the first.
dta %>%
  dplyr::mutate(weight_kg = weight / 1000) %>%
  dplyr::mutate(weight_lb = weight_kg * 2.2) %>%
  head()

## # A tibble: 6 x 15
##   record_id month   day  year plot_id species_id sex   hindfoot_length weight
##       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
## 1         1     7    16  1977       2 NL         M                  32     NA
## 2        72     8    19  1977       2 NL         M                  31     NA
## 3       224     9    13  1977       2 NL         <NA>               NA     NA
## 4       266    10    16  1977       2 NL         <NA>               NA     NA
## 5       349    11    12  1977       2 NL         <NA>               NA     NA
## 6       363    11    12  1977       2 NL         <NA>               NA     NA
## # … with 6 more variables: genus <chr>, species <chr>, taxa <chr>,
## #   plot_type <chr>, weight_kg <dbl>, weight_lb <dbl>

# Mean weight for every sex x species_id combination, heaviest first.
# Rows with a missing weight are removed up front so mean() never sees an NA.
# After summarise() the result is still grouped by `sex`.
dta %>%
  dplyr::filter(!is.na(weight)) %>%
  dplyr::group_by(sex, species_id) %>%
  dplyr::summarise(mean_weight = mean(weight)) %>%
  dplyr::arrange(-mean_weight) %>%
  head(n = 6L)

## # A tibble: 6 x 3
## # Groups:   sex [3]
##   sex   species_id mean_weight
##   <chr> <chr>            <dbl>
## 1 <NA>  NL                168.
## 2 M     NL                166.
## 3 F     NL                154.
## 4 M     SS                130 
## 5 <NA>  SH                130 
## 6 M     DS                122.

# Row count per sex: group first, then tally the rows in each group.
dta %>%
  dplyr::group_by(sex) %>%
  dplyr::tally()

## # A tibble: 3 x 2
##   sex       n
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748

# Same per-sex row count in a single step with count().
dplyr::count(dta, sex)

## # A tibble: 3 x 2
##   sex       n
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748

# Per-sex row count again, this time via summarise() with n() and a custom
# column name.
dta %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(count = dplyr::n())

## # A tibble: 3 x 2
##   sex   count
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748

# Per-sex count of rows whose `year` is not missing.
dta %>%
  dplyr::group_by(sex) %>%
  dplyr::summarise(count = length(year[!is.na(year)]))

## # A tibble: 3 x 2
##   sex   count
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748

# Long summary table: mean weight for each genus within each plot.
# Missing weights are removed first; the result stays grouped by `genus`
# because summarise() only peels off the last grouping variable.
dta_gw <- dta %>%
  dplyr::filter(!is.na(weight)) %>%
  dplyr::group_by(genus, plot_id) %>%
  dplyr::summarise(mean_weight = mean(weight))

# Inspect the summary table.
dta_gw %>%
  glimpse()

## Observations: 196
## Variables: 3
## Groups: genus [10]
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "…
## $ plot_id     <dbl> 1, 2, 3, 5, 18, 19, 20, 21, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1…
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, 7.750000, 9.500000, 9.5333…

# Reshape the genus-by-plot summary from long to wide: one row per plot_id and
# one column per genus holding that genus's mean weight (NA where a genus was
# never weighed in a plot).
#
# pivot_wider() replaces the superseded spread(). Two extra steps keep the
# result identical to what spread() produced:
#   * ungroup() drops the leftover `genus` grouping, which cannot survive once
#     `genus` becomes column names;
#   * arrange(plot_id) restores the sorted row order, because pivot_wider()
#     keeps rows in order of first appearance rather than sorting them.
# Genus columns come out alphabetically because dta_gw is already ordered by
# genus.
dta_w <- dta_gw %>%
  ungroup() %>%
  pivot_wider(names_from = genus, values_from = mean_weight) %>%
  arrange(plot_id)

# Inspect the wide table.
glimpse(dta_w)

## Observations: 24
## Variables: 11
## $ plot_id         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1…
## $ Baiomys         <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, …
## $ Chaetodipus     <dbl> 22.19939, 25.11014, 24.63636, 23.02381, 17.98276, 24…
## $ Dipodomys       <dbl> 60.23214, 55.68259, 52.04688, 57.52454, 51.11356, 58…
## $ Neotoma         <dbl> 156.2222, 169.1436, 158.2414, 164.1667, 190.0370, 17…
## $ Onychomys       <dbl> 27.67550, 26.87302, 26.03241, 28.09375, 27.01695, 25…
## $ Perognathus     <dbl> 9.625000, 6.947368, 7.507812, 7.824427, 8.658537, 7.…
## $ Peromyscus      <dbl> 22.22222, 22.26966, 21.37037, 22.60000, 21.23171, 21…
## $ Reithrodontomys <dbl> 11.375000, 10.680556, 10.516588, 10.263158, 11.15454…
## $ Sigmodon        <dbl> NA, 70.85714, 65.61404, 82.00000, 82.66667, 68.77778…
## $ Spermophilus    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

# Same long-to-wide reshape, but genus/plot combinations that do not occur are
# filled with 0 instead of NA.
#
# pivot_wider() replaces the superseded spread(). values_fill is given as a
# named list so it works on every tidyr release that has pivot_wider().
# ungroup() and arrange(plot_id) reproduce spread()'s ungrouped, plot-sorted
# output (pivot_wider() keeps rows in order of first appearance).
dta_gw %>%
  ungroup() %>%
  pivot_wider(
    names_from = genus,
    values_from = mean_weight,
    values_fill = list(mean_weight = 0)
  ) %>%
  arrange(plot_id) %>%
  head()

## # A tibble: 6 x 11
##   plot_id Baiomys Chaetodipus Dipodomys Neotoma Onychomys Perognathus Peromyscus
##     <dbl>   <dbl>       <dbl>     <dbl>   <dbl>     <dbl>       <dbl>      <dbl>
## 1       1    7           22.2      60.2    156.      27.7        9.62       22.2
## 2       2    6           25.1      55.7    169.      26.9        6.95       22.3
## 3       3    8.61        24.6      52.0    158.      26.0        7.51       21.4
## 4       4    0           23.0      57.5    164.      28.1        7.82       22.6
## 5       5    7.75        18.0      51.1    190.      27.0        8.66       21.2
## 6       6    0           24.9      58.6    180.      25.9        7.81       21.8
## # … with 3 more variables: Reithrodontomys <dbl>, Sigmodon <dbl>,
## #   Spermophilus <dbl>

# Reshape the wide table back to long form: every column except plot_id is
# stacked into a `genus` / `mean_weight` pair of columns. NA cells are kept.
#
# pivot_longer() replaces the superseded gather(). gather() stacked the data
# one genus column at a time, whereas pivot_longer() emits it one plot row at
# a time, so arrange(genus, plot_id) restores the original row order.
dta_l <- dta_w %>%
  pivot_longer(
    cols = -plot_id,
    names_to = "genus",
    values_to = "mean_weight"
  ) %>%
  arrange(genus, plot_id)

# Inspect the long table.
glimpse(dta_l)

## Observations: 240
## Variables: 3
## $ plot_id     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1…
## $ genus       <chr> "Baiomys", "Baiomys", "Baiomys", "Baiomys", "Baiomys", "…
## $ mean_weight <dbl> 7.000000, 6.000000, 8.611111, NA, 7.750000, NA, NA, NA, …

# Same wide-to-long reshape, selecting the genus columns with an explicit
# first:last column range instead of excluding plot_id.
#
# pivot_longer() replaces the superseded gather(); arrange(genus, plot_id)
# keeps the genus-by-genus row order that gather() produced.
dta_w %>%
  pivot_longer(
    cols = Baiomys:Spermophilus,
    names_to = "genus",
    values_to = "mean_weight"
  ) %>%
  arrange(genus, plot_id) %>%
  head()

## # A tibble: 6 x 3
##   plot_id genus   mean_weight
##     <dbl> <chr>         <dbl>
## 1       1 Baiomys        7   
## 2       2 Baiomys        6   
## 3       3 Baiomys        8.61
## 4       4 Baiomys       NA   
## 5       5 Baiomys        7.75
## 6       6 Baiomys       NA

# Keep only records in which weight, hindfoot length and sex are all present
# (a row is dropped as soon as any one of the three is missing).
dta_complete <- dta %>%
  dplyr::filter(!(is.na(weight) | is.na(hindfoot_length) | is.na(sex)))

# Number of complete records per species, restricted to species that have at
# least 50 of them.
species_counts <- dta_complete %>%
  dplyr::group_by(species_id) %>%
  dplyr::tally() %>%
  dplyr::filter(n >= 50)

# Retain the complete records belonging to those species. The filtering join
# keeps the rows and columns of dta_complete only.
dta_complete <- dta_complete %>%
  dplyr::semi_join(species_counts, by = "species_id")

In-class exercise 3

Hao-Lun Fu

2020-04-13