What are dplyr and tidyr?
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
surveys <- read_csv("C:/Users/User/Desktop/S's Digital lab/courseware yr 3/0310 bioinfo/Tutorial4Project1/data_raw/portal_data_joined.csv")
## Rows: 34786 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): species_id, sex, genus, species, taxa, plot_type
## dbl (7): record_id, month, day, year, plot_id, hindfoot_length, weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## inspect the data
str(surveys)
## spc_tbl_ [34,786 × 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ record_id : num [1:34786] 1 72 224 266 349 363 435 506 588 661 ...
## $ month : num [1:34786] 7 8 9 10 11 11 12 1 2 3 ...
## $ day : num [1:34786] 16 19 13 16 12 12 10 8 18 11 ...
## $ year : num [1:34786] 1977 1977 1977 1977 1977 ...
## $ plot_id : num [1:34786] 2 2 2 2 2 2 2 2 2 2 ...
## $ species_id : chr [1:34786] "NL" "NL" "NL" "NL" ...
## $ sex : chr [1:34786] "M" "M" NA NA ...
## $ hindfoot_length: num [1:34786] 32 31 NA NA NA NA NA NA NA NA ...
## $ weight : num [1:34786] NA NA NA NA NA NA NA NA 218 NA ...
## $ genus : chr [1:34786] "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
## $ species : chr [1:34786] "albigula" "albigula" "albigula" "albigula" ...
## $ taxa : chr [1:34786] "Rodent" "Rodent" "Rodent" "Rodent" ...
## $ plot_type : chr [1:34786] "Control" "Control" "Control" "Control" ...
## - attr(*, "spec")=
## .. cols(
## .. record_id = col_double(),
## .. month = col_double(),
## .. day = col_double(),
## .. year = col_double(),
## .. plot_id = col_double(),
## .. species_id = col_character(),
## .. sex = col_character(),
## .. hindfoot_length = col_double(),
## .. weight = col_double(),
## .. genus = col_character(),
## .. species = col_character(),
## .. taxa = col_character(),
## .. plot_type = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
## preview the data
view(surveys)
Selecting columns and filtering rows
select(surveys, plot_id, species_id, weight)
## # A tibble: 34,786 × 3
## plot_id species_id weight
## <dbl> <chr> <dbl>
## 1 2 NL NA
## 2 2 NL NA
## 3 2 NL NA
## 4 2 NL NA
## 5 2 NL NA
## 6 2 NL NA
## 7 2 NL NA
## 8 2 NL NA
## 9 2 NL 218
## 10 2 NL NA
## # ℹ 34,776 more rows
select(surveys, -record_id, -species_id)
## # A tibble: 34,786 × 11
## month day year plot_id sex hindfoot_length weight genus species taxa
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 7 16 1977 2 M 32 NA Neotoma albigula Rode…
## 2 8 19 1977 2 M 31 NA Neotoma albigula Rode…
## 3 9 13 1977 2 <NA> NA NA Neotoma albigula Rode…
## 4 10 16 1977 2 <NA> NA NA Neotoma albigula Rode…
## 5 11 12 1977 2 <NA> NA NA Neotoma albigula Rode…
## 6 11 12 1977 2 <NA> NA NA Neotoma albigula Rode…
## 7 12 10 1977 2 <NA> NA NA Neotoma albigula Rode…
## 8 1 8 1978 2 <NA> NA NA Neotoma albigula Rode…
## 9 2 18 1978 2 M NA 218 Neotoma albigula Rode…
## 10 3 11 1978 2 <NA> NA NA Neotoma albigula Rode…
## # ℹ 34,776 more rows
## # ℹ 1 more variable: plot_type <chr>
filter(surveys, year == 1995)
## # A tibble: 1,180 × 13
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 22314 6 7 1995 2 NL M 34 NA
## 2 22728 9 23 1995 2 NL F 32 165
## 3 22899 10 28 1995 2 NL F 32 171
## 4 23032 12 2 1995 2 NL F 33 NA
## 5 22003 1 11 1995 2 DM M 37 41
## 6 22042 2 4 1995 2 DM F 36 45
## 7 22044 2 4 1995 2 DM M 37 46
## 8 22105 3 4 1995 2 DM F 37 49
## 9 22109 3 4 1995 2 DM M 37 46
## 10 22168 4 1 1995 2 DM M 36 48
## # ℹ 1,170 more rows
## # ℹ 4 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>
Pipes
surveys2 <- filter(surveys, weight < 5)
surveys_sml <- select(surveys2, species_id, sex, weight)
surveys_sml <- select(filter(surveys, weight < 5), species_id, sex, weight)
surveys_sml <- surveys %>%
filter(weight < 5) %>%
select(species_id, sex, weight)
surveys_sml
## # A tibble: 17 × 3
## species_id sex weight
## <chr> <chr> <dbl>
## 1 PF F 4
## 2 PF F 4
## 3 PF M 4
## 4 RM F 4
## 5 RM M 4
## 6 PF <NA> 4
## 7 PP M 4
## 8 RM M 4
## 9 RM M 4
## 10 RM M 4
## 11 PF M 4
## 12 PF F 4
## 13 RM M 4
## 14 RM M 4
## 15 RM F 4
## 16 RM M 4
## 17 RM M 4
Challenge
surveys %>% filter(year < 1995) %>% select(year, sex, weight)
## # A tibble: 21,486 × 3
## year sex weight
## <dbl> <chr> <dbl>
## 1 1977 M NA
## 2 1977 M NA
## 3 1977 <NA> NA
## 4 1977 <NA> NA
## 5 1977 <NA> NA
## 6 1977 <NA> NA
## 7 1977 <NA> NA
## 8 1978 <NA> NA
## 9 1978 M 218
## 10 1978 <NA> NA
## # ℹ 21,476 more rows
Mutate
surveys %>%
mutate(weight_kg = weight / 1000)
## # A tibble: 34,786 × 14
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## 7 435 12 10 1977 2 NL <NA> NA NA
## 8 506 1 8 1978 2 NL <NA> NA NA
## 9 588 2 18 1978 2 NL M NA 218
## 10 661 3 11 1978 2 NL <NA> NA NA
## # ℹ 34,776 more rows
## # ℹ 5 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## # weight_kg <dbl>
surveys %>%
mutate(weight_kg = weight / 1000,
weight_lb = weight_kg * 2.2)
## # A tibble: 34,786 × 15
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## 7 435 12 10 1977 2 NL <NA> NA NA
## 8 506 1 8 1978 2 NL <NA> NA NA
## 9 588 2 18 1978 2 NL M NA 218
## 10 661 3 11 1978 2 NL <NA> NA NA
## # ℹ 34,776 more rows
## # ℹ 6 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## # weight_kg <dbl>, weight_lb <dbl>
surveys %>%
mutate(weight_kg = weight / 1000) %>%
head()
## # A tibble: 6 × 14
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL <NA> NA NA
## 4 266 10 16 1977 2 NL <NA> NA NA
## 5 349 11 12 1977 2 NL <NA> NA NA
## 6 363 11 12 1977 2 NL <NA> NA NA
## # ℹ 5 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## # weight_kg <dbl>
surveys %>%
filter(!is.na(weight)) %>%
mutate(weight_kg = weight / 1000) %>%
head()
## # A tibble: 6 × 14
## record_id month day year plot_id species_id sex hindfoot_length weight
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 588 2 18 1978 2 NL M NA 218
## 2 845 5 6 1978 2 NL M 32 204
## 3 990 6 9 1978 2 NL M NA 200
## 4 1164 8 5 1978 2 NL M 34 199
## 5 1261 9 4 1978 2 NL M 32 197
## 6 1453 11 5 1978 2 NL M NA 218
## # ℹ 5 more variables: genus <chr>, species <chr>, taxa <chr>, plot_type <chr>,
## # weight_kg <dbl>
Challenge
surveys %>%
mutate(hindfoot_cm = hindfoot_length / 100) %>%
filter(!is.na(hindfoot_cm) & hindfoot_cm < 3) %>%
select(species_id, hindfoot_cm)
## # A tibble: 31,438 × 2
## species_id hindfoot_cm
## <chr> <dbl>
## 1 NL 0.32
## 2 NL 0.31
## 3 NL 0.32
## 4 NL 0.34
## 5 NL 0.32
## 6 NL 0.33
## 7 NL 0.32
## 8 NL 0.32
## 9 NL 0.33
## 10 NL 0.3
## # ℹ 31,428 more rows
Split-apply-combine data analysis and the summarize() function
surveys %>%
group_by(sex) %>%
summarize(mean_weight = mean(weight, na.rm = TRUE))
## # A tibble: 3 × 2
## sex mean_weight
## <chr> <dbl>
## 1 F 42.2
## 2 M 43.0
## 3 <NA> 64.7
surveys %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight, na.rm = TRUE)) %>%
tail()
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 6 × 3
## # Groups: sex [1]
## sex species_id mean_weight
## <chr> <chr> <dbl>
## 1 <NA> SU NaN
## 2 <NA> UL NaN
## 3 <NA> UP NaN
## 4 <NA> UR NaN
## 5 <NA> US NaN
## 6 <NA> ZL NaN
# The resulting mean_weight column does not contain NA but NaN (which refers to “Not a Number”) because mean() was called on a vector of NA values while at the same time setting na.rm = TRUE. To avoid this, we can remove the missing values for weight before we attempt to calculate the summary statistics on weight.
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight)) %>%
print(n = 15)
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 3
## # Groups: sex [3]
## sex species_id mean_weight
## <chr> <chr> <dbl>
## 1 F BA 9.16
## 2 F DM 41.6
## 3 F DO 48.5
## 4 F DS 118.
## 5 F NL 154.
## 6 F OL 31.1
## 7 F OT 24.8
## 8 F OX 21
## 9 F PB 30.2
## 10 F PE 22.8
## 11 F PF 7.97
## 12 F PH 30.8
## 13 F PL 19.3
## 14 F PM 22.1
## 15 F PP 17.2
## # ℹ 49 more rows
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight),
min_weight = min(weight))
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 4
## # Groups: sex [3]
## sex species_id mean_weight min_weight
## <chr> <chr> <dbl> <dbl>
## 1 F BA 9.16 6
## 2 F DM 41.6 10
## 3 F DO 48.5 12
## 4 F DS 118. 45
## 5 F NL 154. 32
## 6 F OL 31.1 10
## 7 F OT 24.8 5
## 8 F OX 21 20
## 9 F PB 30.2 12
## 10 F PE 22.8 11
## # ℹ 54 more rows
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight),
min_weight = min(weight)) %>%
arrange(min_weight)
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 4
## # Groups: sex [3]
## sex species_id mean_weight min_weight
## <chr> <chr> <dbl> <dbl>
## 1 F PF 7.97 4
## 2 F RM 11.1 4
## 3 M PF 7.89 4
## 4 M PP 17.2 4
## 5 M RM 10.1 4
## 6 <NA> PF 6 4
## 7 F OT 24.8 5
## 8 F PP 17.2 5
## 9 F BA 9.16 6
## 10 M BA 7.36 6
## # ℹ 54 more rows
surveys %>%
filter(!is.na(weight)) %>%
group_by(sex, species_id) %>%
summarize(mean_weight = mean(weight),
min_weight = min(weight)) %>%
arrange(desc(mean_weight))
## `summarise()` has grouped output by 'sex'. You can override using the `.groups`
## argument.
## # A tibble: 64 × 4
## # Groups: sex [3]
## sex species_id mean_weight min_weight
## <chr> <chr> <dbl> <dbl>
## 1 <NA> NL 168. 83
## 2 M NL 166. 30
## 3 F NL 154. 32
## 4 M SS 130 130
## 5 <NA> SH 130 130
## 6 M DS 122. 12
## 7 <NA> DS 120 78
## 8 F DS 118. 45
## 9 F SH 78.8 30
## 10 F SF 69 46
## # ℹ 54 more rows
Counting
surveys %>%
count(sex)
## # A tibble: 3 × 2
## sex n
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
# =
surveys %>%
group_by(sex) %>%
summarize(count = n())
## # A tibble: 3 × 2
## sex count
## <chr> <int>
## 1 F 15690
## 2 M 17348
## 3 <NA> 1748
surveys %>%
count(sex, sort = TRUE)
## # A tibble: 3 × 2
## sex n
## <chr> <int>
## 1 M 17348
## 2 F 15690
## 3 <NA> 1748
surveys %>%
count(sex, species)
## # A tibble: 81 × 3
## sex species n
## <chr> <chr> <int>
## 1 F albigula 675
## 2 F baileyi 1646
## 3 F eremicus 579
## 4 F flavus 757
## 5 F fulvescens 57
## 6 F fulviventer 17
## 7 F hispidus 99
## 8 F leucogaster 475
## 9 F leucopus 16
## 10 F maniculatus 382
## # ℹ 71 more rows
surveys %>%
count(sex, species) %>%
arrange(species, desc(n))
## # A tibble: 81 × 3
## sex species n
## <chr> <chr> <int>
## 1 F albigula 675
## 2 M albigula 502
## 3 <NA> albigula 75
## 4 <NA> audubonii 75
## 5 F baileyi 1646
## 6 M baileyi 1216
## 7 <NA> baileyi 29
## 8 <NA> bilineata 303
## 9 <NA> brunneicapillus 50
## 10 <NA> chlorurus 39
## # ℹ 71 more rows
Challenge
# 1
surveys %>%
count(plot_type, species) %>%
count(plot_type)
## # A tibble: 5 × 2
## plot_type n
## <chr> <int>
## 1 Control 33
## 2 Long-term Krat Exclosure 34
## 3 Rodent Exclosure 32
## 4 Short-term Krat Exclosure 32
## 5 Spectab exclosure 25
# 2
surveys %>%
group_by(species_id) %>%
filter(!is.na(hindfoot_length)) %>%
summarise(mean_length = mean(hindfoot_length),
min_length = min(hindfoot_length),
max_length = max(hindfoot_length),
n = n())
## # A tibble: 25 × 5
## species_id mean_length min_length max_length n
## <chr> <dbl> <dbl> <dbl> <int>
## 1 AH 33 31 35 2
## 2 BA 13 6 16 45
## 3 DM 36.0 16 50 9972
## 4 DO 35.6 26 64 2887
## 5 DS 49.9 39 58 2132
## 6 NL 32.3 21 70 1074
## 7 OL 20.5 12 39 920
## 8 OT 20.3 13 50 2139
## 9 OX 19.1 13 21 8
## 10 PB 26.1 2 47 2864
## # ℹ 15 more rows
# 3
surveys %>%
filter(!is.na(weight)) %>%
select(weight, year, genus, species_id) %>%
group_by(year) %>%
filter(weight == max(weight)) %>%
arrange(year)
## # A tibble: 27 × 4
## # Groups: year [26]
## weight year genus species_id
## <dbl> <dbl> <chr> <chr>
## 1 149 1977 Dipodomys DS
## 2 232 1978 Neotoma NL
## 3 232 1978 Neotoma NL
## 4 274 1979 Neotoma NL
## 5 243 1980 Neotoma NL
## 6 264 1981 Neotoma NL
## 7 252 1982 Neotoma NL
## 8 256 1983 Neotoma NL
## 9 259 1984 Neotoma NL
## 10 225 1985 Neotoma NL
## # ℹ 17 more rows
Combining datasets using merge()
gene_expression <- data.frame(
GeneID = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5"),
Condition_A = c(10.5, 8.2, 15.7, 9.1, 7.8),
Condition_B = c(9.3, 7.5, 14.2, 8.9, 6.7)
)
gene_annotations <- data.frame(
GeneID = c("Gene1", "Gene3", "Gene4", "Gene6"),
Function = c("Transporter", "Kinase", "Receptor", "Unknown")
)
merged_genes <- left_join(gene_expression, gene_annotations, by = "GeneID")
# left join = join to the left