Exercises for merge

#pacman 是一個管理 R 套件的工具。載入後可用 p_load() 一次完成安裝與載入,例如 p_load(ggplot2, EBImage, jpeg, ggpubr, plotly);也就是以 pacman::p_load(A, B) 同時取代 install.packages(c("A", "B")) 以及 library(A)、library(B)。

男生作法

# pacman is called through `pacman::`, so it does not need to be attached.
# p_load() makes the mice package available.
pacman::p_load(mice)
# Bring the potthoffroy data set into the workspace.
data("potthoffroy")
# Show only the rows whose sex is "M".
subset(potthoffroy, subset = sex == "M")
##    id sex   d8  d10  d12  d14
## 12 12   M 26.0 25.0 29.0 31.0
## 13 13   M 21.5 22.5 23.0 26.5
## 14 14   M 23.0 22.5 24.0 27.5
## 15 15   M 25.5 27.5 26.5 27.0
## 16 16   M 20.0 23.5 22.5 26.0
## 17 17   M 24.5 25.5 27.0 28.5
## 18 18   M 22.0 22.0 24.5 26.5
## 19 19   M 24.0 21.5 24.5 25.5
## 20 20   M 23.0 20.5 31.0 26.0
## 21 21   M 27.5 28.0 31.0 31.5
## 22 22   M 23.0 23.0 23.5 25.0
## 23 23   M 21.5 23.5 24.0 28.0
## 24 24   M 17.0 24.5 26.0 29.5
## 25 25   M 22.5 25.5 25.5 26.0
## 26 26   M 23.0 24.5 26.0 30.0
## 27 27   M 22.0 21.5 23.5 25.0

#dir.create() 建立新的資料夾(不會改變工作目錄;改變工作目錄要用 setwd()), getwd() 檢視目前工作目錄

? lapply的用法 其實不太理解

這段的意思是把 M(男生)的資料寫入檔名以 M_ 開頭的檔案嗎?[, c(1, i)] 是什麼意思?(它是取出資料框的第 1 欄 id 與第 i 欄。)

#folder叫tmp_data, 檔名M_

# List the files in ./tmp_data/ whose names match the pattern "M_".
dir(path = "./tmp_data/", pattern = "M_")
## [1] "M_0.csv" "M_1.csv" "M_2.csv" "M_3.csv" "M_4.csv"

? 如何留下性別這個欄位

# Peek at one of the per-column files written earlier.
read.csv(file = "./tmp_data/M_1.csv")
##    id   d8
## 1  12 26.0
## 2  13 21.5
## 3  14 23.0
## 4  15 25.5
## 5  16 20.0
## 6  17 24.5
## 7  18 22.0
## 8  19 24.0
## 9  20 23.0
## 10 21 27.5
## 11 22 23.0
## 12 23 21.5
## 13 24 17.0
## 14 25 22.5
## 15 26 23.0
## 16 27 22.0
# Names of the files in ./tmp_data that match "M_".
# NOTE(review): the listing printed just below includes M_0.csv next to
# M_1..M_4; it looks like a leftover file carrying the same d8 column as
# M_1.csv -- confirm, and consider an anchored pattern if it should be excluded.
mls <- dir(path = "./tmp_data", pattern = "M_")
mls
## [1] "M_0.csv" "M_1.csv" "M_2.csv" "M_3.csv" "M_4.csv"
# Prefix every file name with its folder so it can be read from the
# working directory.
mL <- file.path("./tmp_data", mls)
mL
## [1] "./tmp_data/M_0.csv" "./tmp_data/M_1.csv" "./tmp_data/M_2.csv"
## [4] "./tmp_data/M_3.csv" "./tmp_data/M_4.csv"

Input multiple files

Input these files as a list of data frames

# Read every path in mL; the result is a list with one data frame per file.
mm <- lapply(mL, \(csv_path) read.csv(csv_path))

Merge

We can merge two files by id.

# mm is a list: mm[1] is still a (one-element) list, whereas mm[[1]] is the
# data frame stored inside it, which is what merge() is meant to receive.
# No `by` is given, so merge() joins on every column name the two data frames
# have in common.
merge(mm[[1]], mm[[2]])
##    id   d8
## 1  12 26.0
## 2  13 21.5
## 3  14 23.0
## 4  15 25.5
## 5  16 20.0
## 6  17 24.5
## 7  18 22.0
## 8  19 24.0
## 9  20 23.0
## 10 21 27.5
## 11 22 23.0
## 12 23 21.5
## 13 24 17.0
## 14 25 22.5
## 15 26 23.0
## 16 27 22.0

#這是用 Reduce() 逐一(類似迴圈)合併檔案

# Pairwise merge helper: join two data frames on their "id" column.
#   m1, m2  data frames that each contain an "id" column.
# Returns the result of merge(): only ids present in both inputs are kept, and
# non-key columns sharing a name receive the suffixes .x / .y.
mrg2 <- function(m1, m2) {
  key <- "id"
  merge(x = m1, y = m2, by = key)
}
# Fold the list from left to right: merge the first two data frames, merge that
# result with the third, and so on until one data frame remains.
Reduce(f = mrg2, x = mm)
##    id d8.x d8.y  d10  d12  d14
## 1  12 26.0 26.0 25.0 29.0 31.0
## 2  13 21.5 21.5 22.5 23.0 26.5
## 3  14 23.0 23.0 22.5 24.0 27.5
## 4  15 25.5 25.5 27.5 26.5 27.0
## 5  16 20.0 20.0 23.5 22.5 26.0
## 6  17 24.5 24.5 25.5 27.0 28.5
## 7  18 22.0 22.0 22.0 24.5 26.5
## 8  19 24.0 24.0 21.5 24.5 25.5
## 9  20 23.0 23.0 20.5 31.0 26.0
## 10 21 27.5 27.5 28.0 31.0 31.5
## 11 22 23.0 23.0 23.0 23.5 25.0
## 12 23 21.5 21.5 23.5 24.0 28.0
## 13 24 17.0 17.0 24.5 26.0 29.5
## 14 25 22.5 22.5 25.5 25.5 26.0
## 15 26 23.0 23.0 24.5 26.0 30.0
## 16 27 22.0 22.0 21.5 23.5 25.0

? 不太明白為何要用 tidyverse

我原本的理解是:把整理出來的 M 資料取一個名稱(這裡叫 mm)。查資料後知道 tidyverse 包含 dplyr、ggplot2、stringr 等常用的資料處理套件;這裡只用到其中 purrr 的 reduce() 與 dplyr 的 inner_join(),其他功能沒有用到。

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks mice::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
# Tidyverse counterpart of Reduce() + merge(): purrr's reduce() folds the list
# with dplyr's inner_join(), matching rows on "id".
purrr::reduce(mm, dplyr::inner_join, by = "id")
##    id d8.x d8.y  d10  d12  d14
## 1  12 26.0 26.0 25.0 29.0 31.0
## 2  13 21.5 21.5 22.5 23.0 26.5
## 3  14 23.0 23.0 22.5 24.0 27.5
## 4  15 25.5 25.5 27.5 26.5 27.0
## 5  16 20.0 20.0 23.5 22.5 26.0
## 6  17 24.5 24.5 25.5 27.0 28.5
## 7  18 22.0 22.0 22.0 24.5 26.5
## 8  19 24.0 24.0 21.5 24.5 25.5
## 9  20 23.0 23.0 20.5 31.0 26.0
## 10 21 27.5 27.5 28.0 31.0 31.5
## 11 22 23.0 23.0 23.0 23.5 25.0
## 12 23 21.5 21.5 23.5 24.0 28.0
## 13 24 17.0 17.0 24.5 26.0 29.5
## 14 25 22.5 22.5 25.5 25.5 26.0
## 15 26 23.0 23.0 24.5 26.0 30.0
## 16 27 22.0 22.0 21.5 23.5 25.0

女生作法

# pacman is called through `pacman::`, so it does not need to be attached.
# p_load() makes the mice package available.
pacman::p_load(mice)
# Bring the potthoffroy data set into the workspace.
data("potthoffroy")
# Show only the rows whose sex is "F".
subset(potthoffroy, subset = sex == "F")
##    id sex   d8  d10  d12  d14
## 1   1   F 21.0 20.0 21.5 23.0
## 2   2   F 21.0 21.5 24.0 25.5
## 3   3   F 20.5 24.0 24.5 26.0
## 4   4   F 23.5 24.5 25.0 26.5
## 5   5   F 21.5 23.0 22.5 23.5
## 6   6   F 20.0 21.0 21.0 22.5
## 7   7   F 21.5 22.5 23.0 25.0
## 8   8   F 23.0 23.0 23.5 24.0
## 9   9   F 20.0 21.0 22.0 21.5
## 10 10   F 16.5 19.0 19.0 19.5
## 11 11   F 24.5 25.0 28.0 28.0
# Make sure the output folder exists under the working directory; stay quiet if
# it is already there.
dir.create("./tmp_data", showWarnings = FALSE)
# Write one CSV per measurement: column 1 (id) together with column i.
# Columns 3:6 are d8, d10, d12 and d14, so i - 2 numbers the files 1..4.
# The prefix is lower-case "f_" because every later step lists and reads
# "f_*.csv"; an upper-case "F_" prefix is only found again on case-insensitive
# file systems.
# write.csv() returns NULL, so lapply() yields a list of four NULLs.
lapply(3:6, function(i) {
  write.csv(
    subset(potthoffroy, sex == "F")[, c(1, i)],
    file = paste0("./tmp_data/f_", i - 2, ".csv"),
    row.names = FALSE
  )
})
## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

#Files in a folder

# List the files in ./tmp_data/ whose names match the pattern "f_".
dir(path = "./tmp_data/", pattern = "f_")
## [1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"
# Peek at the first of them.
read.csv(file = "./tmp_data/f_1.csv")
##    id   d8
## 1   1 21.0
## 2   2 21.0
## 3   3 20.5
## 4   4 23.5
## 5   5 21.5
## 6   6 20.0
## 7   7 21.5
## 8   8 23.0
## 9   9 20.0
## 10 10 16.5
## 11 11 24.5
# Names of the files in ./tmp_data that match "f_".
fls <- dir(path = "./tmp_data", pattern = "f_")
fls
## [1] "f_1.csv" "f_2.csv" "f_3.csv" "f_4.csv"
# Prefix every file name with its folder so it can be read from the
# working directory.
fL <- file.path("./tmp_data", fls)
fL
## [1] "./tmp_data/f_1.csv" "./tmp_data/f_2.csv" "./tmp_data/f_3.csv"
## [4] "./tmp_data/f_4.csv"

#Input multiple files

Input these files as a list of data frames

# Read every path in fL; the result is a list with one data frame per file.
ff <- lapply(fL, \(csv_path) read.csv(csv_path))

#Merge

We can merge two files by id.

# ff is a list: ff[1] is still a (one-element) list, whereas ff[[1]] is the
# data frame stored inside it, which is what merge() is meant to receive.
# No `by` is given, so merge() joins on every column name the two data frames
# have in common (here only "id").
merge(ff[[1]], ff[[2]])
##    id   d8  d10
## 1   1 21.0 20.0
## 2   2 21.0 21.5
## 3   3 20.5 24.0
## 4   4 23.5 24.5
## 5   5 21.5 23.0
## 6   6 20.0 21.0
## 7   7 21.5 22.5
## 8   8 23.0 23.0
## 9   9 20.0 21.0
## 10 10 16.5 19.0
## 11 11 24.5 25.0

#Reduce

前面是 mrg2,這裡改成 mrg3

# Pairwise merge helper: join two data frames on their "id" column.
#   f1, f2  data frames that each contain an "id" column.
# Returns the result of merge(): only ids present in both inputs are kept, and
# non-key columns sharing a name receive the suffixes .x / .y.
mrg3 <- function(f1, f2) {
  join_key <- "id"
  merge(x = f1, y = f2, by = join_key)
}
# Fold the list from left to right: merge the first two data frames, merge that
# result with the third, and so on until one data frame remains.
Reduce(f = mrg3, x = ff)
##    id   d8  d10  d12  d14
## 1   1 21.0 20.0 21.5 23.0
## 2   2 21.0 21.5 24.0 25.5
## 3   3 20.5 24.0 24.5 26.0
## 4   4 23.5 24.5 25.0 26.5
## 5   5 21.5 23.0 22.5 23.5
## 6   6 20.0 21.0 21.0 22.5
## 7   7 21.5 22.5 23.0 25.0
## 8   8 23.0 23.0 23.5 24.0
## 9   9 20.0 21.0 22.0 21.5
## 10 10 16.5 19.0 19.0 19.5
## 11 11 24.5 25.0 28.0 28.0
library(tidyverse)
# Tidyverse counterpart of Reduce() + merge(): purrr's reduce() folds the list
# with dplyr's inner_join(), matching rows on "id".
purrr::reduce(ff, dplyr::inner_join, by = "id")
##    id   d8  d10  d12  d14
## 1   1 21.0 20.0 21.5 23.0
## 2   2 21.0 21.5 24.0 25.5
## 3   3 20.5 24.0 24.5 26.0
## 4   4 23.5 24.5 25.0 26.5
## 5   5 21.5 23.0 22.5 23.5
## 6   6 20.0 21.0 21.0 22.5
## 7   7 21.5 22.5 23.0 25.0
## 8   8 23.0 23.0 23.5 24.0
## 9   9 20.0 21.0 22.0 21.5
## 10 10 16.5 19.0 19.0 19.5
## 11 11 24.5 25.0 28.0 28.0

?Merge M&F

為何我會留下id.1 ~id.4

We can merge two files by id.

# Why the original call `merge(ff, mm, all = TRUE)` left id.1 ... id.4 behind:
# ff and mm are *lists* of data frames, not data frames. merge() has no method
# for lists, so it falls back to as.data.frame() on each list, which places all
# the per-file data frames side by side. Every file brings its own id column,
# and the repeats are renamed id.1, id.2, ...
#
# Fix: collapse each list into one data frame first, then stack the rows.
# mm also contains M_0.csv, which appears to duplicate M_1.csv (both supply a
# d8 column); duplicated() drops such repeated list elements so that d8 is not
# joined twice.
f_all <- Reduce(mrg3, ff)
m_all <- Reduce(mrg2, mm[!duplicated(mm)])
# The girls (ids 1-11) and boys (ids 12-27) share the same columns, so the two
# tables are combined by row.
rbind(f_all, m_all)
# Expected result: 27 rows and a single id column.
##    id   d8  d10  d12  d14
## 1   1 21.0 20.0 21.5 23.0
## 2   2 21.0 21.5 24.0 25.5
## 3   3 20.5 24.0 24.5 26.0
## 4   4 23.5 24.5 25.0 26.5
## 5   5 21.5 23.0 22.5 23.5
## 6   6 20.0 21.0 21.0 22.5
## 7   7 21.5 22.5 23.0 25.0
## 8   8 23.0 23.0 23.5 24.0
## 9   9 20.0 21.0 22.0 21.5
## 10 10 16.5 19.0 19.0 19.5
## 11 11 24.5 25.0 28.0 28.0
## 12 12 26.0 25.0 29.0 31.0
## 13 13 21.5 22.5 23.0 26.5
## 14 14 23.0 22.5 24.0 27.5
## 15 15 25.5 27.5 26.5 27.0
## 16 16 20.0 23.5 22.5 26.0
## 17 17 24.5 25.5 27.0 28.5
## 18 18 22.0 22.0 24.5 26.5
## 19 19 24.0 21.5 24.5 25.5
## 20 20 23.0 20.5 31.0 26.0
## 21 21 27.5 28.0 31.0 31.5
## 22 22 23.0 23.0 23.5 25.0
## 23 23 21.5 23.5 24.0 28.0
## 24 24 17.0 24.5 26.0 29.5
## 25 25 22.5 25.5 25.5 26.0
## 26 26 23.0 24.5 26.0 30.0
## 27 27 22.0 21.5 23.5 25.0