[https://media.ed.ac.uk/media/HealthyR+demoA+joining+datasets/1_wd239kz6]
Day 08 of HealthyR demo
join() demo
library(tidyverse)
library(lubridate)
df1 = tibble(subjid = 1:10,
dob = sample(seq(ymd('1950-01-01'), ymd('2002-01-01'), by = "day"),
size = 10, replace=TRUE),
country = sample(c("England", "Scotland", "Wales", "N. Ireland"),
size=10, replace=TRUE, prob=c(0.7, 0.16, 0.09, 0.05)),
sex = c("female", "male", "male", "male", "female", "other", "female",
"female", "male", "male"))
df2 = tibble(id = 3:12,
height = round(runif(10, 150, 190)),
weight = round(runif(10, 50, 100)),
test_result = round(runif(10, 40, 80)),
gender = c("male", "male", "female", "other", "female", "female",
"male", "male", "male", "female"))
df3 = tibble(subjid = 3:12,
test_result = round(runif(10, 60, 100)),
job = c("doctor", "lawyer", "police officer", "teacher", "accountant",
"scientist", "lecturer", "artist", "author", "engineer"),
fav_animal = c("dog", "penguin", "giraffe", "armadillo", "elephant",
"gorilla", "Guinea pig", "sloth", "lion", "parrot"))
extra_patient = tibble(subjid = 13,
dob = ymd("1997-03-11"),
country = "Scotland",
sex = "Female",
height = 173,
weight = 70,
test_result_1 = 65,
test_result_2 = 89,
job = "data scientist")
df1 %>%
inner_join(df2, by = c("subjid" = "id"))
## # A tibble: 8 × 8
## subjid dob country sex height weight test_result gender
## <int> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 3 1990-12-04 England male 184 52 80 male
## 2 4 1991-08-29 Scotland male 159 98 64 male
## 3 5 1964-03-23 N. Ireland female 177 70 56 female
## 4 6 1957-07-14 Wales other 177 80 77 other
## 5 7 1988-11-19 England female 161 65 79 female
## 6 8 1951-08-18 England female 177 51 73 female
## 7 9 1974-12-04 England male 170 81 67 male
## 8 10 1980-11-22 England male 176 63 71 male
df1 %>%
left_join(df2, by = c("subjid" = "id"))
## # A tibble: 10 × 8
## subjid dob country sex height weight test_result gender
## <int> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 1 1974-12-08 Wales female NA NA NA <NA>
## 2 2 1971-09-27 England male NA NA NA <NA>
## 3 3 1990-12-04 England male 184 52 80 male
## 4 4 1991-08-29 Scotland male 159 98 64 male
## 5 5 1964-03-23 N. Ireland female 177 70 56 female
## 6 6 1957-07-14 Wales other 177 80 77 other
## 7 7 1988-11-19 England female 161 65 79 female
## 8 8 1951-08-18 England female 177 51 73 female
## 9 9 1974-12-04 England male 170 81 67 male
## 10 10 1980-11-22 England male 176 63 71 male
tmp = df1 %>%
full_join(df2, by = c("subjid" = "id"))
library(magrittr)
tmp %>%
drop_na() %$%
identical(sex, gender)
## [1] TRUE
df1 %>%
full_join(df2, by = c("subjid" = "id",
"sex" = "gender"))
## # A tibble: 12 × 7
## subjid dob country sex height weight test_result
## <int> <date> <chr> <chr> <dbl> <dbl> <dbl>
## 1 1 1974-12-08 Wales female NA NA NA
## 2 2 1971-09-27 England male NA NA NA
## 3 3 1990-12-04 England male 184 52 80
## 4 4 1991-08-29 Scotland male 159 98 64
## 5 5 1964-03-23 N. Ireland female 177 70 56
## 6 6 1957-07-14 Wales other 177 80 77
## 7 7 1988-11-19 England female 161 65 79
## 8 8 1951-08-18 England female 177 51 73
## 9 9 1974-12-04 England male 170 81 67
## 10 10 1980-11-22 England male 176 63 71
## 11 11 NA <NA> male 170 59 58
## 12 12 NA <NA> female 170 59 56
df1 %>%
full_join(df2, by = c("subjid" = "id")) %>%
full_join(df3, by = c("subjid"), suffix = c(".day1", ".discharge"))
## # A tibble: 12 × 11
## subjid dob country sex height weight test_result.day1 gender
## <int> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 1 1974-12-08 Wales female NA NA NA <NA>
## 2 2 1971-09-27 England male NA NA NA <NA>
## 3 3 1990-12-04 England male 184 52 80 male
## 4 4 1991-08-29 Scotland male 159 98 64 male
## 5 5 1964-03-23 N. Ireland female 177 70 56 female
## 6 6 1957-07-14 Wales other 177 80 77 other
## 7 7 1988-11-19 England female 161 65 79 female
## 8 8 1951-08-18 England female 177 51 73 female
## 9 9 1974-12-04 England male 170 81 67 male
## 10 10 1980-11-22 England male 176 63 71 male
## 11 11 NA <NA> <NA> 170 59 58 male
## 12 12 NA <NA> <NA> 170 59 56 female
## # ℹ 3 more variables: test_result.discharge <dbl>, job <chr>, fav_animal <chr>
df1 %>%
full_join(df2, by = c("subjid" = "id")) %>%
full_join(df3, by = c("subjid"), suffix = c(".day1", ".discharge")) %>%
select(-fav_animal)
## # A tibble: 12 × 10
## subjid dob country sex height weight test_result.day1 gender
## <int> <date> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 1 1974-12-08 Wales female NA NA NA <NA>
## 2 2 1971-09-27 England male NA NA NA <NA>
## 3 3 1990-12-04 England male 184 52 80 male
## 4 4 1991-08-29 Scotland male 159 98 64 male
## 5 5 1964-03-23 N. Ireland female 177 70 56 female
## 6 6 1957-07-14 Wales other 177 80 77 other
## 7 7 1988-11-19 England female 161 65 79 female
## 8 8 1951-08-18 England female 177 51 73 female
## 9 9 1974-12-04 England male 170 81 67 male
## 10 10 1980-11-22 England male 176 63 71 male
## 11 11 NA <NA> <NA> 170 59 58 male
## 12 12 NA <NA> <NA> 170 59 56 female
## # ℹ 2 more variables: test_result.discharge <dbl>, job <chr>
df1 %>%
anti_join(df2, by = c("subjid" = "id"))
## # A tibble: 2 × 4
## subjid dob country sex
## <int> <date> <chr> <chr>
## 1 1 1974-12-08 Wales female
## 2 2 1971-09-27 England male
df4 = df1 %>%
slice(1:4)
df5 = df1 %>%
slice(5:10)
df4 %>%
select(-sex) %>%
bind_rows(df5)
## # A tibble: 10 × 4
## subjid dob country sex
## <int> <date> <chr> <chr>
## 1 1 1974-12-08 Wales <NA>
## 2 2 1971-09-27 England <NA>
## 3 3 1990-12-04 England <NA>
## 4 4 1991-08-29 Scotland <NA>
## 5 5 1964-03-23 N. Ireland female
## 6 6 1957-07-14 Wales other
## 7 7 1988-11-19 England female
## 8 8 1951-08-18 England female
## 9 9 1974-12-04 England male
## 10 10 1980-11-22 England male
#df4 = df4 %>%
# select(-sex)
#rbind(df4, df5) # error
df4 = df1 %>%
slice(1:4) %>%
mutate(sex = factor(sex))
df5 = df1 %>%
slice(5:10) %>%
mutate(sex = factor(sex))
df4$sex %>% levels()
## [1] "female" "male"
df5$sex %>% levels()
## [1] "female" "male" "other"
df4 %>%
full_join(df5) %$%
levels(sex)
## Joining with `by = join_by(subjid, dob, country, sex)`
## [1] "female" "male" "other"
df6 = tibble(subjid = 3:12,
test_result = round(runif(10, 60, 100)),
job = c("doctor", "lawyer", "police officer", "teacher", "accountant",
"scientist", "lecturer", "artist", "author", "engineer"),
fav_animal2 = c("doggy", "penguin", "giraffe", "armadillo", "elephant",
"gorilla", "Guinea pig", "sloth", "lion", "parrot"))
df3 %>%
full_join(df6, by = c("fav_animal" = "fav_animal2"))
## # A tibble: 11 × 7
## subjid.x test_result.x job.x fav_animal subjid.y test_result.y job.y
## <int> <dbl> <chr> <chr> <int> <dbl> <chr>
## 1 3 63 doctor dog NA NA <NA>
## 2 4 98 lawyer penguin 4 91 lawy…
## 3 5 83 police officer giraffe 5 99 poli…
## 4 6 92 teacher armadillo 6 67 teac…
## 5 7 71 accountant elephant 7 88 acco…
## 6 8 88 scientist gorilla 8 89 scie…
## 7 9 62 lecturer Guinea pig 9 76 lect…
## 8 10 83 artist sloth 10 90 arti…
## 9 11 72 author lion 11 93 auth…
## 10 12 66 engineer parrot 12 92 engi…
## 11 NA NA <NA> doggy 3 94 doct…