December 2023

HealthyR Clinic

*_join() demo

From Ewen Harrison - Sarah Elliot - The University of Edinburgh

[https://media.ed.ac.uk/media/HealthyR+demoA+joining+datasets/1_wd239kz6]

Day 08 of HealthyR demo

join() demo

library(tidyverse)
library(lubridate)

The data sets being used

df1 = tibble(subjid = 1:10,
             dob = sample(seq(ymd('1950-01-01'), ymd('2002-01-01'), by = "day"), 
                          size = 10, replace=TRUE),
             country = sample(c("England", "Scotland", "Wales", "N. Ireland"), 
                              size=10, replace=TRUE, prob=c(0.7, 0.16, 0.09, 0.05)),
             sex = c("female", "male", "male", "male", "female", "other", "female",
                     "female", "male", "male"))

df2 = tibble(id = 3:12,
             height = round(runif(10, 150, 190)),
             weight = round(runif(10, 50, 100)),
             test_result = round(runif(10, 40, 80)),
             gender = c("male", "male", "female", "other", "female", "female", 
                        "male", "male", "male", "female"))

df3 = tibble(subjid = 3:12,
             test_result = round(runif(10, 60, 100)),
             job = c("doctor", "lawyer", "police officer", "teacher", "accountant",
                     "scientist", "lecturer", "artist", "author", "engineer"),
             fav_animal = c("dog", "penguin", "giraffe", "armadillo", "elephant",
                            "gorilla", "Guinea pig", "sloth", "lion", "parrot"))


extra_patient = tibble(subjid = 13,
                       dob = ymd("1997-03-11"),
                       country = "Scotland",
                       sex = "Female",
                       height = 173,
                       weight = 70,
                       test_result_1 = 65,
                       test_result_2 = 89,
                       job = "data scientist")

Q1 - Join df1 and df2 using the subject id column

Only rows in common

df1 %>% 
  inner_join(df2, by = c("subjid" = "id"))
## # A tibble: 8 × 8
##   subjid dob        country    sex    height weight test_result gender
##    <int> <date>     <chr>      <chr>   <dbl>  <dbl>       <dbl> <chr> 
## 1      3 1990-12-04 England    male      184     52          80 male  
## 2      4 1991-08-29 Scotland   male      159     98          64 male  
## 3      5 1964-03-23 N. Ireland female    177     70          56 female
## 4      6 1957-07-14 Wales      other     177     80          77 other 
## 5      7 1988-11-19 England    female    161     65          79 female
## 6      8 1951-08-18 England    female    177     51          73 female
## 7      9 1974-12-04 England    male      170     81          67 male  
## 8     10 1980-11-22 England    male      176     63          71 male

Keep all of df1

df1 %>% 
  left_join(df2, by = c("subjid" = "id"))
## # A tibble: 10 × 8
##    subjid dob        country    sex    height weight test_result gender
##     <int> <date>     <chr>      <chr>   <dbl>  <dbl>       <dbl> <chr> 
##  1      1 1974-12-08 Wales      female     NA     NA          NA <NA>  
##  2      2 1971-09-27 England    male       NA     NA          NA <NA>  
##  3      3 1990-12-04 England    male      184     52          80 male  
##  4      4 1991-08-29 Scotland   male      159     98          64 male  
##  5      5 1964-03-23 N. Ireland female    177     70          56 female
##  6      6 1957-07-14 Wales      other     177     80          77 other 
##  7      7 1988-11-19 England    female    161     65          79 female
##  8      8 1951-08-18 England    female    177     51          73 female
##  9      9 1974-12-04 England    male      170     81          67 male  
## 10     10 1980-11-22 England    male      176     63          71 male

Full join

tmp = df1 %>% 
  full_join(df2, by = c("subjid" = "id"))

How to use a base R function with %$%

library(magrittr)
tmp %>% 
  drop_na() %$% 
  identical(sex, gender)
## [1] TRUE

Q2 - Join on multiple columns

Full join

df1 %>% 
  full_join(df2, by = c("subjid" = "id",
                        "sex" = "gender"))
## # A tibble: 12 × 7
##    subjid dob        country    sex    height weight test_result
##     <int> <date>     <chr>      <chr>   <dbl>  <dbl>       <dbl>
##  1      1 1974-12-08 Wales      female     NA     NA          NA
##  2      2 1971-09-27 England    male       NA     NA          NA
##  3      3 1990-12-04 England    male      184     52          80
##  4      4 1991-08-29 Scotland   male      159     98          64
##  5      5 1964-03-23 N. Ireland female    177     70          56
##  6      6 1957-07-14 Wales      other     177     80          77
##  7      7 1988-11-19 England    female    161     65          79
##  8      8 1951-08-18 England    female    177     51          73
##  9      9 1974-12-04 England    male      170     81          67
## 10     10 1980-11-22 England    male      176     63          71
## 11     11 NA         <NA>       male      170     59          58
## 12     12 NA         <NA>       female    170     59          56

Q3 - Join multiple datasets

df1 %>% 
  full_join(df2, by = c("subjid" = "id")) %>% 
  full_join(df3, by = c("subjid"), suffix = c(".day1", ".discharge")) 
## # A tibble: 12 × 11
##    subjid dob        country    sex    height weight test_result.day1 gender
##     <int> <date>     <chr>      <chr>   <dbl>  <dbl>            <dbl> <chr> 
##  1      1 1974-12-08 Wales      female     NA     NA               NA <NA>  
##  2      2 1971-09-27 England    male       NA     NA               NA <NA>  
##  3      3 1990-12-04 England    male      184     52               80 male  
##  4      4 1991-08-29 Scotland   male      159     98               64 male  
##  5      5 1964-03-23 N. Ireland female    177     70               56 female
##  6      6 1957-07-14 Wales      other     177     80               77 other 
##  7      7 1988-11-19 England    female    161     65               79 female
##  8      8 1951-08-18 England    female    177     51               73 female
##  9      9 1974-12-04 England    male      170     81               67 male  
## 10     10 1980-11-22 England    male      176     63               71 male  
## 11     11 NA         <NA>       <NA>      170     59               58 male  
## 12     12 NA         <NA>       <NA>      170     59               56 female
## # ℹ 3 more variables: test_result.discharge <dbl>, job <chr>, fav_animal <chr>

Q4 - What would you do about the column that has the same name?

Q5 - how would you join everything except the favourite animal column?

df1 %>% 
  full_join(df2, by = c("subjid" = "id")) %>% 
  full_join(df3, by = c("subjid"), suffix = c(".day1", ".discharge")) %>% 
  select(-fav_animal)
## # A tibble: 12 × 10
##    subjid dob        country    sex    height weight test_result.day1 gender
##     <int> <date>     <chr>      <chr>   <dbl>  <dbl>            <dbl> <chr> 
##  1      1 1974-12-08 Wales      female     NA     NA               NA <NA>  
##  2      2 1971-09-27 England    male       NA     NA               NA <NA>  
##  3      3 1990-12-04 England    male      184     52               80 male  
##  4      4 1991-08-29 Scotland   male      159     98               64 male  
##  5      5 1964-03-23 N. Ireland female    177     70               56 female
##  6      6 1957-07-14 Wales      other     177     80               77 other 
##  7      7 1988-11-19 England    female    161     65               79 female
##  8      8 1951-08-18 England    female    177     51               73 female
##  9      9 1974-12-04 England    male      170     81               67 male  
## 10     10 1980-11-22 England    male      176     63               71 male  
## 11     11 NA         <NA>       <NA>      170     59               58 male  
## 12     12 NA         <NA>       <NA>      170     59               56 female
## # ℹ 2 more variables: test_result.discharge <dbl>, job <chr>

Q6 - Filtering joins - which individuals are not in the third dataset?

df1 %>% 
  anti_join(df2, by = c("subjid" = "id"))
## # A tibble: 2 × 4
##   subjid dob        country sex   
##    <int> <date>     <chr>   <chr> 
## 1      1 1974-12-08 Wales   female
## 2      2 1971-09-27 England male

Q7 - What would you do if you wanted to add on an extra row of data

df4 = df1 %>% 
  slice(1:4)
df5 = df1 %>% 
  slice(5:10)

df4 %>% 
  select(-sex) %>% 
  bind_rows(df5)
## # A tibble: 10 × 4
##    subjid dob        country    sex   
##     <int> <date>     <chr>      <chr> 
##  1      1 1974-12-08 Wales      <NA>  
##  2      2 1971-09-27 England    <NA>  
##  3      3 1990-12-04 England    <NA>  
##  4      4 1991-08-29 Scotland   <NA>  
##  5      5 1964-03-23 N. Ireland female
##  6      6 1957-07-14 Wales      other 
##  7      7 1988-11-19 England    female
##  8      8 1951-08-18 England    female
##  9      9 1974-12-04 England    male  
## 10     10 1980-11-22 England    male
#df4 = df4 %>% 
#  select(-sex)
#rbind(df4, df5) # error

Q8 watch when joining factors

df4 = df1 %>% 
  slice(1:4) %>% 
  mutate(sex = factor(sex))

df5 = df1 %>% 
  slice(5:10) %>% 
  mutate(sex = factor(sex))

df4$sex %>% levels()
## [1] "female" "male"
df5$sex %>% levels()
## [1] "female" "male"   "other"
df4 %>% 
  full_join(df5) %$% 
  levels(sex)
## Joining with `by = join_by(subjid, dob, country, sex)`
## [1] "female" "male"   "other"
df6 = tibble(subjid = 3:12,
             test_result = round(runif(10, 60, 100)),
             job = c("doctor", "lawyer", "police officer", "teacher", "accountant",
                     "scientist", "lecturer", "artist", "author", "engineer"),
             fav_animal2 = c("doggy", "penguin", "giraffe", "armadillo", "elephant",
                            "gorilla", "Guinea pig", "sloth", "lion", "parrot"))

df3 %>% 
  full_join(df6, by = c("fav_animal" = "fav_animal2"))
## # A tibble: 11 × 7
##    subjid.x test_result.x job.x          fav_animal subjid.y test_result.y job.y
##       <int>         <dbl> <chr>          <chr>         <int>         <dbl> <chr>
##  1        3            63 doctor         dog              NA            NA <NA> 
##  2        4            98 lawyer         penguin           4            91 lawy…
##  3        5            83 police officer giraffe           5            99 poli…
##  4        6            92 teacher        armadillo         6            67 teac…
##  5        7            71 accountant     elephant          7            88 acco…
##  6        8            88 scientist      gorilla           8            89 scie…
##  7        9            62 lecturer       Guinea pig        9            76 lect…
##  8       10            83 artist         sloth            10            90 arti…
##  9       11            72 author         lion             11            93 auth…
## 10       12            66 engineer       parrot           12            92 engi…
## 11       NA            NA <NA>           doggy             3            94 doct…