What are dplyr and tidyr?

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
surveys <- read_csv("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Tutorial4Project1/data_raw/portal_data_joined.csv")
## Rows: 34786 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## inspect the data
str(surveys)
## spc_tbl_ [34,786 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ record_id      : num [1:34786] 1 72 224 266 349 363 435 506 588 661 ...
##  $ month          : num [1:34786] 7 8 9 10 11 11 12 1 2 3 ...
##  $ day            : num [1:34786] 16 19 13 16 12 12 10 8 18 11 ...
##  $ year           : num [1:34786] 1977 1977 1977 1977 1977 ...
##  $ plot_id        : num [1:34786] 2 2 2 2 2 2 2 2 2 2 ...
##  $ species_id     : chr [1:34786] "NL" "NL" "NL" "NL" ...
##  $ sex            : chr [1:34786] "M" "M" NA NA ...
##  $ hindfoot_length: num [1:34786] 32 31 NA NA NA NA NA NA NA NA ...
##  $ weight         : num [1:34786] NA NA NA NA NA NA NA NA 218 NA ...
##  $ genus          : chr [1:34786] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
##  $ species        : chr [1:34786] "albigula" "albigula" "albigula" "albigula" ...
##  $ taxa           : chr [1:34786] "Rodent" "Rodent" "Rodent" "Rodent" ...
##  $ plot_type      : chr [1:34786] "Control" "Control" "Control" "Control" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   record_id = col_double(),
##   ..   month = col_double(),
##   ..   day = col_double(),
##   ..   year = col_double(),
##   ..   plot_id = col_double(),
##   ..   species_id = col_character(),
##   ..   sex = col_character(),
##   ..   hindfoot_length = col_double(),
##   ..   weight = col_double(),
##   ..   genus = col_character(),
##   ..   species = col_character(),
##   ..   taxa = col_character(),
##   ..   plot_type = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
## preview the data
view(surveys)

Selecting columns and filtering rows

select(surveys, plot_id, species_id, weight)
## # A tibble: 34,786 × 3
##    plot_id species_id weight
##      <dbl> <chr>       <dbl>
##  1       2 NL             NA
##  2       2 NL             NA
##  3       2 NL             NA
##  4       2 NL             NA
##  5       2 NL             NA
##  6       2 NL             NA
##  7       2 NL             NA
##  8       2 NL             NA
##  9       2 NL            218
## 10       2 NL             NA
## # ℹ 34,776 more rows
select(surveys, -record_id, -species_id)
## # A tibble: 34,786 × 11
##    month   day  year plot_id sex   hindfoot_length weight genus   species  taxa 
##    <dbl> <dbl> <dbl>   <dbl> <chr>           <dbl>  <dbl> <chr>   <chr>    <chr>
##  1     7    16  1977       2 M                  32     NA Neotoma albigula Rode…
##  2     8    19  1977       2 M                  31     NA Neotoma albigula Rode…
##  3     9    13  1977       2 <NA>               NA     NA Neotoma albigula Rode…
##  4    10    16  1977       2 <NA>               NA     NA Neotoma albigula Rode…
##  5    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rode…
##  6    11    12  1977       2 <NA>               NA     NA Neotoma albigula Rode…
##  7    12    10  1977       2 <NA>               NA     NA Neotoma albigula Rode…
##  8     1     8  1978       2 <NA>               NA     NA Neotoma albigula Rode…
##  9     2    18  1978       2 M                  NA    218 Neotoma albigula Rode…
## 10     3    11  1978       2 <NA>               NA     NA Neotoma albigula Rode…
## # ℹ 34,776 more rows
## # ℹ 1 more variable: plot_type <chr>
filter(surveys, year == 1995)
## # A tibble: 1,180 × 13
##    record_id month   day  year plot_id species_id sex   hindfoot_length weight
##        <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
##  1     22314     6     7  1995       2 NL         M                  34     NA
##  2     22728     9    23  1995       2 NL         F                  32    165
##  3     22899    10    28  1995       2 NL         F                  32    171
##  4     23032    12     2  1995       2 NL         F                  33     NA
##  5     22003     1    11  1995       2 DM         M                  37     41
##  6     22042     2     4  1995       2 DM         F                  36     45
##  7     22044     2     4  1995       2 DM         M                  37     46
##  8     22105     3     4  1995       2 DM         F                  37     49
##  9     22109     3     4  1995       2 DM         M                  37     46
## 10     22168     4     1  1995       2 DM         M                  36     48
## # ℹ 1,170 more rows
## # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>

Pipes

surveys2 <- filter(surveys, weight < 5)
surveys_sml <- select(surveys2, species_id, sex, weight)
surveys_sml <- select(filter(surveys, weight < 5), species_id, sex, weight)
surveys_sml <- surveys %>%
  filter(weight < 5) %>%
  select(species_id, sex, weight)

surveys_sml
## # A tibble: 17 × 3
##    species_id sex   weight
##    <chr>      <chr>  <dbl>
##  1 PF         F          4
##  2 PF         F          4
##  3 PF         M          4
##  4 RM         F          4
##  5 RM         M          4
##  6 PF         <NA>       4
##  7 PP         M          4
##  8 RM         M          4
##  9 RM         M          4
## 10 RM         M          4
## 11 PF         M          4
## 12 PF         F          4
## 13 RM         M          4
## 14 RM         M          4
## 15 RM         F          4
## 16 RM         M          4
## 17 RM         M          4

Challenge

surveys %>% filter(year < 1995) %>% select(year, sex, weight)
## # A tibble: 21,486 × 3
##     year sex   weight
##    <dbl> <chr>  <dbl>
##  1  1977 M         NA
##  2  1977 M         NA
##  3  1977 <NA>      NA
##  4  1977 <NA>      NA
##  5  1977 <NA>      NA
##  6  1977 <NA>      NA
##  7  1977 <NA>      NA
##  8  1978 <NA>      NA
##  9  1978 M        218
## 10  1978 <NA>      NA
## # ℹ 21,476 more rows

Mutate

surveys %>%
  mutate(weight_kg = weight / 1000)
## # A tibble: 34,786 × 14
##    record_id month   day  year plot_id species_id sex   hindfoot_length weight
##        <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
##  1         1     7    16  1977       2 NL         M                  32     NA
##  2        72     8    19  1977       2 NL         M                  31     NA
##  3       224     9    13  1977       2 NL         <NA>               NA     NA
##  4       266    10    16  1977       2 NL         <NA>               NA     NA
##  5       349    11    12  1977       2 NL         <NA>               NA     NA
##  6       363    11    12  1977       2 NL         <NA>               NA     NA
##  7       435    12    10  1977       2 NL         <NA>               NA     NA
##  8       506     1     8  1978       2 NL         <NA>               NA     NA
##  9       588     2    18  1978       2 NL         M                  NA    218
## 10       661     3    11  1978       2 NL         <NA>               NA     NA
## # ℹ 34,776 more rows
## # ℹ 5 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## #   weight_kg <dbl>
surveys %>%
  mutate(weight_kg = weight / 1000,
         weight_lb = weight_kg * 2.2)
## # A tibble: 34,786 × 15
##    record_id month   day  year plot_id species_id sex   hindfoot_length weight
##        <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
##  1         1     7    16  1977       2 NL         M                  32     NA
##  2        72     8    19  1977       2 NL         M                  31     NA
##  3       224     9    13  1977       2 NL         <NA>               NA     NA
##  4       266    10    16  1977       2 NL         <NA>               NA     NA
##  5       349    11    12  1977       2 NL         <NA>               NA     NA
##  6       363    11    12  1977       2 NL         <NA>               NA     NA
##  7       435    12    10  1977       2 NL         <NA>               NA     NA
##  8       506     1     8  1978       2 NL         <NA>               NA     NA
##  9       588     2    18  1978       2 NL         M                  NA    218
## 10       661     3    11  1978       2 NL         <NA>               NA     NA
## # ℹ 34,776 more rows
## # ℹ 6 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## #   weight_kg <dbl>, weight_lb <dbl>
surveys %>%
  mutate(weight_kg = weight / 1000) %>%
  head()
## # A tibble: 6 × 14
##   record_id month   day  year plot_id species_id sex   hindfoot_length weight
##       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
## 1         1     7    16  1977       2 NL         M                  32     NA
## 2        72     8    19  1977       2 NL         M                  31     NA
## 3       224     9    13  1977       2 NL         <NA>               NA     NA
## 4       266    10    16  1977       2 NL         <NA>               NA     NA
## 5       349    11    12  1977       2 NL         <NA>               NA     NA
## 6       363    11    12  1977       2 NL         <NA>               NA     NA
## # ℹ 5 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## #   weight_kg <dbl>
surveys %>%
  filter(!is.na(weight)) %>%
  mutate(weight_kg = weight / 1000) %>%
  head()
## # A tibble: 6 × 14
##   record_id month   day  year plot_id species_id sex   hindfoot_length weight
##       <dbl> <dbl> <dbl> <dbl>   <dbl> <chr>      <chr>           <dbl>  <dbl>
## 1       588     2    18  1978       2 NL         M                  NA    218
## 2       845     5     6  1978       2 NL         M                  32    204
## 3       990     6     9  1978       2 NL         M                  NA    200
## 4      1164     8     5  1978       2 NL         M                  34    199
## 5      1261     9     4  1978       2 NL         M                  32    197
## 6      1453    11     5  1978       2 NL         M                  NA    218
## # ℹ 5 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## #   weight_kg <dbl>

Challenge

surveys %>% 
   mutate(hindfoot_cm = hindfoot_length / 100) %>% 
  filter(!is.na(hindfoot_cm) & hindfoot_cm < 3) %>% 
  select(species_id, hindfoot_cm)
## # A tibble: 31,438 × 2
##    species_id hindfoot_cm
##    <chr>            <dbl>
##  1 NL                0.32
##  2 NL                0.31
##  3 NL                0.32
##  4 NL                0.34
##  5 NL                0.32
##  6 NL                0.33
##  7 NL                0.32
##  8 NL                0.32
##  9 NL                0.33
## 10 NL                0.3 
## # ℹ 31,428 more rows

Split-apply-combine data analysis and the summarize() function

surveys %>%
  group_by(sex) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE))
## # A tibble: 3 × 2
##   sex   mean_weight
##   <chr>       <dbl>
## 1 F            42.2
## 2 M            43.0
## 3 <NA>         64.7
surveys %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
  tail()
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 6 × 3
## # Groups:   sex [1]
##   sex   species_id mean_weight
##   <chr> <chr>            <dbl>
## 1 <NA>  SU                 NaN
## 2 <NA>  UL                 NaN
## 3 <NA>  UP                 NaN
## 4 <NA>  UR                 NaN
## 5 <NA>  US                 NaN
## 6 <NA>  ZL                 NaN
# The resulting mean_weight column does not contain NA but NaN (which refers to “Not a Number”) because mean() was called on a vector of NA values while at the same time setting na.rm = TRUE. To avoid this, we can remove the missing values for weight before we attempt to calculate the summary statistics on weight. 

surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight)) %>%
  print(n = 15)
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 3
## # Groups:   sex [3]
##    sex   species_id mean_weight
##    <chr> <chr>            <dbl>
##  1 F     BA                9.16
##  2 F     DM               41.6 
##  3 F     DO               48.5 
##  4 F     DS              118.  
##  5 F     NL              154.  
##  6 F     OL               31.1 
##  7 F     OT               24.8 
##  8 F     OX               21   
##  9 F     PB               30.2 
## 10 F     PE               22.8 
## 11 F     PF                7.97
## 12 F     PH               30.8 
## 13 F     PL               19.3 
## 14 F     PM               22.1 
## 15 F     PP               17.2 
## # ℹ 49 more rows
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight))
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 4
## # Groups:   sex [3]
##    sex   species_id mean_weight min_weight
##    <chr> <chr>            <dbl>      <dbl>
##  1 F     BA                9.16          6
##  2 F     DM               41.6          10
##  3 F     DO               48.5          12
##  4 F     DS              118.           45
##  5 F     NL              154.           32
##  6 F     OL               31.1          10
##  7 F     OT               24.8           5
##  8 F     OX               21            20
##  9 F     PB               30.2          12
## 10 F     PE               22.8          11
## # ℹ 54 more rows
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight)) %>%
  arrange(min_weight)
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 4
## # Groups:   sex [3]
##    sex   species_id mean_weight min_weight
##    <chr> <chr>            <dbl>      <dbl>
##  1 F     PF                7.97          4
##  2 F     RM               11.1           4
##  3 M     PF                7.89          4
##  4 M     PP               17.2           4
##  5 M     RM               10.1           4
##  6 <NA>  PF                6             4
##  7 F     OT               24.8           5
##  8 F     PP               17.2           5
##  9 F     BA                9.16          6
## 10 M     BA                7.36          6
## # ℹ 54 more rows
surveys %>%
  filter(!is.na(weight)) %>%
  group_by(sex, species_id) %>%
  summarize(mean_weight = mean(weight),
            min_weight = min(weight)) %>%
  arrange(desc(mean_weight))
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 4
## # Groups:   sex [3]
##    sex   species_id mean_weight min_weight
##    <chr> <chr>            <dbl>      <dbl>
##  1 <NA>  NL               168.          83
##  2 M     NL               166.          30
##  3 F     NL               154.          32
##  4 M     SS               130          130
##  5 <NA>  SH               130          130
##  6 M     DS               122.          12
##  7 <NA>  DS               120           78
##  8 F     DS               118.          45
##  9 F     SH                78.8         30
## 10 F     SF                69           46
## # ℹ 54 more rows

Counting

surveys %>%
    count(sex)
## # A tibble: 3 × 2
##   sex       n
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748
# =
surveys %>%
    group_by(sex) %>%
    summarize(count = n())
## # A tibble: 3 × 2
##   sex   count
##   <chr> <int>
## 1 F     15690
## 2 M     17348
## 3 <NA>   1748
surveys %>%
    count(sex, sort = TRUE)
## # A tibble: 3 × 2
##   sex       n
##   <chr> <int>
## 1 M     17348
## 2 F     15690
## 3 <NA>   1748
surveys %>%
  count(sex, species)
## # A tibble: 81 × 3
##    sex   species         n
##    <chr> <chr>       <int>
##  1 F     albigula      675
##  2 F     baileyi      1646
##  3 F     eremicus      579
##  4 F     flavus        757
##  5 F     fulvescens     57
##  6 F     fulviventer    17
##  7 F     hispidus       99
##  8 F     leucogaster   475
##  9 F     leucopus       16
## 10 F     maniculatus   382
## # ℹ 71 more rows
surveys %>%
  count(sex, species) %>%
  arrange(species, desc(n))
## # A tibble: 81 × 3
##    sex   species             n
##    <chr> <chr>           <int>
##  1 F     albigula          675
##  2 M     albigula          502
##  3 <NA>  albigula           75
##  4 <NA>  audubonii          75
##  5 F     baileyi          1646
##  6 M     baileyi          1216
##  7 <NA>  baileyi            29
##  8 <NA>  bilineata         303
##  9 <NA>  brunneicapillus    50
## 10 <NA>  chlorurus          39
## # ℹ 71 more rows

Challenge

# 1
surveys %>% 
  count(plot_type, species) %>% 
  count(plot_type)
## # A tibble: 5 × 2
##   plot_type                     n
##   <chr>                     <int>
## 1 Control                      33
## 2 Long-term Krat Exclosure     34
## 3 Rodent Exclosure             32
## 4 Short-term Krat Exclosure    32
## 5 Spectab exclosure            25
# 2
surveys %>% 
  group_by(species_id) %>% 
  filter(!is.na(hindfoot_length)) %>% 
  summarise(mean_length = mean(hindfoot_length),
            min_length = min(hindfoot_length),
            max_length = max(hindfoot_length),
            n = n()) 
## # A tibble: 25 × 5
##    species_id mean_length min_length max_length     n
##    <chr>            <dbl>      <dbl>      <dbl> <int>
##  1 AH                33           31         35     2
##  2 BA                13            6         16    45
##  3 DM                36.0         16         50  9972
##  4 DO                35.6         26         64  2887
##  5 DS                49.9         39         58  2132
##  6 NL                32.3         21         70  1074
##  7 OL                20.5         12         39   920
##  8 OT                20.3         13         50  2139
##  9 OX                19.1         13         21     8
## 10 PB                26.1          2         47  2864
## # ℹ 15 more rows
# 3
surveys %>%
  filter(!is.na(weight)) %>% 
  select(weight, year, genus, species_id) %>% 
  group_by(year) %>% 
  filter(weight == max(weight)) %>% 
  arrange(year)
## # A tibble: 27 × 4
## # Groups:   year [26]
##    weight  year genus     species_id
##     <dbl> <dbl> <chr>     <chr>     
##  1    149  1977 Dipodomys DS        
##  2    232  1978 Neotoma   NL        
##  3    232  1978 Neotoma   NL        
##  4    274  1979 Neotoma   NL        
##  5    243  1980 Neotoma   NL        
##  6    264  1981 Neotoma   NL        
##  7    252  1982 Neotoma   NL        
##  8    256  1983 Neotoma   NL        
##  9    259  1984 Neotoma   NL        
## 10    225  1985 Neotoma   NL        
## # ℹ 17 more rows

Combining datasets using merge()

gene_expression <- data.frame(
  GeneID = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5"),
  Condition_A = c(10.5, 8.2, 15.7, 9.1, 7.8),
  Condition_B = c(9.3, 7.5, 14.2, 8.9, 6.7)
)

gene_annotations <- data.frame(
  GeneID = c("Gene1", "Gene3", "Gene4", "Gene6"),
  Function = c("Transporter", "Kinase", "Receptor", "Unknown")
)

merged_genes <- left_join(gene_expression, gene_annotations, by = "GeneID")
# left join = join to the left

Exporting data

surveys_complete <- surveys %>%
  filter(!is.na(weight),           # remove missing weight
         !is.na(hindfoot_length),  # remove missing hindfoot_length
         !is.na(sex))                # remove missing sex

## Extract the most common species_id
species_counts <- surveys_complete %>%
    count(species_id) %>%
    filter(n >= 50)

## Only keep the most common species
surveys_complete <- surveys_complete %>%
  filter(species_id %in% species_counts$species_id)

write_csv(surveys_complete, file = "data_processed/surveys_complete.csv")