selectmutate, transform, tibble::add_columnmutate_at, mutate_allstarts_with,end_with, contains, matches, etc.mutate_ifmutateでsweepと同等の事をする.top_nfirstrename, rename_all, rename_if, setNamesrow_number()、行数nrow(), n() を取得group_by.summarise, summarize, tapplytidyr::gather, tidyr::spreadreshape2::melt, as.data.framemerge, *_joinbind_cols, bind_rowsdplyr, magrittr, tibble, tidyr その他を使ってデータフレーム操作する練習
%>%%>%は, dplyr を読み込むと自動的に読み込まれる.lhs %>% rhsのようにして使う. この場合、左項の結果を右項の最初の引数として配置する. つまり、x %>% f(y)はf(x, y)と等価.{}中括弧は、パイプが関数内の最初の引数を使用しないようにする# lhsをrhsの第一引数に渡す. それ以外の引数で呼び出す時は .(ドット)で行う
3 %>% c(., 3 + .)
3 %>% c(3 + .)
1:10 %>% mean(.)
1:10 %>% plot(x = ., y = seq(10, 100, 10))
1:10 %>% plot(x = seq(10, 100, 10), y = .)
# rhsの第一引数は省略可能
3 %>% c(3 + .)
1:10 %>% mean
1:10 %>% plot(y = seq(10, 100, 10))
# 中括弧は、パイプが関数内の最初の引数を使用しないようにする
3 %>% c(3 + .)
3 %>% { c(3 + .) }
mtcars %>% {cor(.$mpg, .$cyl)}
mtcars %>% {cor(t(.))} %>% .[1:6,1:6]%<>%, %T>%, %$%selectselect(.data = iris, Sepal.Length, Species) %>% head
iris %>% select(., Sepal.Length, Species) %>% head
iris %>% select(Sepal.Length, Species) %>% head
select(iris, Sepal.Length:Petal.Width) %>% head
select(iris, -Species) %>% head
select(iris, -Sepal.Length:-Petal.Width) %>% head
select(iris, -(Sepal.Length:Petal.Width)) %>% headselect_, group_by_(非推奨).select(iris, c(Sepal.Length, Species)) %>% head
select(iris, Sepal.Length, Species) %>% head
select(iris, c("Sepal.Length", "Species")) %>% head
select(iris, "Sepal.Length", "Species") %>% head
select(iris, .data$Sepal.Length, .data$Species) %>% head
# select(iris, .$SepalLength, .$Species) %>% head # エラ-
# select_(iris, "Sepal.Length", "Species") %>% head # 非推奨
iris %>% select(Sepal.Length, Species) %>% head
v <- c("Sepal.Length", "Species")
# select(iris, v) %>% head
select(iris, all_of(v)) %>% head?tidyselect::select_helpersmatchは正規表現による一致, one_ofは完全一致ignore.case = TRUE)iris %>% select(starts_with("Sepal")) %>% head
iris %>% select(ends_with("Length")) %>% head
iris %>% select(-starts_with("Petal")) %>% head
iris %>% select(matches("^sp|Length$")) %>% head # 正規表現による一致,
iris %>% select(contains(".")) %>% head
v <- c("Sepal.Length", "Species")
iris %>% select(one_of(v)) %>% head # 文字列ベクトルとマッチするもの
iris %>% select(v) %>% head
# num_range 列名の
df <- data.frame(matrix(rpois(30, 5), ncol=10))
df %>% select(num_range(prefix="X", range = 2:4, width = 1))
df_1 <- df %>% setNames(., paste0("X", sprintf("%03d", 1:10)))
df_1 %>% select(num_range(prefix = "X", range = 5:10, width = 3))select, [selectmutate, transform, tibble::add_columntibble::add_column(.data, ..., .after, .before)書き換えはできない.# 列追加・編集 mutate
iris %>%
mutate(Species = paste0(substr(Species, 1, 3), row_number())) %>% tail
iris %>% group_by(Species) %>%
mutate(Sp = paste0(substr(Species, 1, 3), row_number())) %>% tail
# 列追加・編集 transform
iris %>%
transform(Species = paste0(substr(Species, 1, 3), 1:nrow(.))) %>% tail
# 指定の位置に列追加 tibble::add_column
iris %$%
tibble::add_column(.,
sp = paste0(substr(Species, 1, 3), 1:nrow(.)),
rn = 1:nrow(.),
.after = 4) %>% head
iris %>% tibble::add_column(., rn = 1:nrow(.), .before = "Species") %>% headmutate_at, mutate_allmutate_at(.tbl , .vars, .funs), mutate_all(.tbl, .funs)iris %>% mutate_at(.vars = vars(1:4), .funs = list(~ . * 10)) %>% head
# iris %>% mutate_at(.vars = vars(1:4), .funs = funs(. * 10)) %>% head
iris %>% mutate_at(1:4, list(~ . * 10)) %>% head
iris %>% mutate_at(1:4, ~. * 10) %>% head
iris %>% mutate_at(1:4, list(~ . * rep(c(1, 10, 100), each = 50))) %>%
.[c(1:3, 51:53, 101:103),]
# 列を追加
iris %>% mutate_at(1:4, list(tenX = ~ . * 10)) %>% head
# 型変換
iris %>% mutate_at(vars(Sepal.Length, Sepal.Width), as.integer) %>% head
iris %>% mutate_at(vars("Sepal.Length", "Sepal.Width"), as.integer) %>% head
iris %>% mutate_at(c("Sepal.Length", "Sepal.Width"), as.integer) %>% head
# mutate_all
iris %>% mutate_all(as.integer) %>% headstarts_with,end_with, contains, matches, etc.varsヘルパー関数. mutate_atやsummarise_atで使う.?tidyselect::select_helpersで見ることができる.dplyr::if_elseはmissingも指定できる.funs()は廃止予定らしい.代わりにlist(~f(.)), list(name = ~f(.))のようにする.# データ
df <- tibble::as_tibble(data.frame(P1 = c(NA,NA,1,1,1), P2 = c(2,NA,NA,2,2),
Q1 = c(3,3,NA,NA,3), Q2 = c(4,4,4,NA,NA)))
# ヘルパー関数を用いて列指定し, NAを0に置換
df %>% mutate_at(.tbl = .,
.vars = vars(matches("1",.)),
.funs = list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(starts_with("P")), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(ends_with("2")), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(contains("Q")), list(~ifelse(is.na(.),0,.)))
# # funs()は廃止予定
# df %>% mutate_at(vars(starts_with("P")), funs(ifelse(is.na(.),0,.)))
# df %>% mutate_at(vars(ends_with("2")), funs(ifelse(is.na(.),0,.)))
# df %>% mutate_at(vars(contains("Q")), funs(ifelse(is.na(.),0,.)))
df %>% mutate_at(.tbl = ., .vars = -1, .funs = list(~ifelse(is.na(.), 0, .)))
df %>% mutate_at(vars(3:4), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(-1:-2), list(~ifelse(is.na(.),0,.)))
# varsでラップする必要無し
df %>% mutate_at(3:4, list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(-1:-2, list(~ifelse(is.na(.), 0, .)))
# # lapply, grepl, ifelseを組み合わせて
# f_1 <- function(dat, nm=names(dat)){
# ifelse(grepl("1$", nm), is.na(df[[i]])), )
# }
# lapply(seq_along(df), function(i){
# ifelse(grepl("1$", names(df)[i], is.na(df[[i]])), )
# })mutate_ifmutate_if(.tbl, .predicate, .funs)tibble::glimpse データセットの変数の型を把握する(strより見やすい?).iris %>% mutate_if(.tbl = ., .predicate = is.factor, .funs = as.character) %>% glimpse
iris %>% mutate_if(is.factor, as.character) %>% glimpse
# 複数の変換
iris %>% mutate_if(is.factor, list(as.character, abbreviate)) %>% glimpse
# lapplyを使う場合
dplyr::glimpse(
as.data.frame(
lapply(iris, function(x){
if (is.factor(x)) { as.character(x) } else {x}
})
)
) mutateでsweepと同等の事をする.rowSums(select())tidyr::gather, tidyr::spreadでやる.iris %>% mutate(row_sum = rowSums(select(., 1:4))) %>% head
iris %>% mutate(row_sum = apply(.[-5], 1, sum)) %>% head
iris %>% mutate(row_sum = Reduce("+", select(., 1:4))) %>% head
iris %>% mutate(sep_sum = rowSums(select(., matches("Sepal")))) %>% head
iris %>% mutate(prd_sep = Sepal.Length*Sepal.Width) %>% head
# 元の列を置き換える
# iris %>%
# mutate(total = rowSums(select(., 1:4))) %>%
# mutate_at(1:4, funs(./total)) %>%
# head
iris %>%
mutate(total = rowSums(select(., 1:4))) %>%
mutate_at(1:4, list(~./total)) %>%
head
head(sweep(iris[-5], 1, apply(iris[-5], 1, sum), "/"))
# # 元の列を書き換えない
# iris %>%
# mutate(total = rowSums(select(., 1:4))) %>%
# mutate_at(1:4, funs(rate = ./total)) %>%
# head
iris %>%
mutate(total = rowSums(select(., 1:4))) %>%
mutate_at(1:4, list(rate = ~./total)) %>%
headrow_number(), min_rank(), dense_rank(), percent_rank(), percent_rank(), cume_dist(), ntile(x, n)ntile(n = 3)で均等に分割# mutate(ntile(), min_rank(), dense_rank(), percent_rank())
head(iris) %>%
select(1) %>%
mutate(min_rank = min_rank(Sepal.Length),
`row_number()` = row_number(),
row_number = row_number(Sepal.Length),
dense_rank = dense_rank(Sepal.Length),
percent_rank = percent_rank(Sepal.Length),
cume_dist = cume_dist(Sepal.Length),
ntile = ntile(Sepal.Length, 2)
) -> rank_dat
# ntileをつかって均等に分割
discretized <- head(iris) %>%
mutate_at(1:4, list(~ntile(., n = 3)))| Sepal.Length | min_rank | row_number() | row_number | dense_rank | percent_rank | cume_dist | ntile |
|---|---|---|---|---|---|---|---|
| 5.1 | 5 | 1 | 5 | 5 | 0.8 | 0.83 | 2 |
| 4.9 | 3 | 2 | 3 | 3 | 0.4 | 0.50 | 1 |
| 4.7 | 2 | 3 | 2 | 2 | 0.2 | 0.33 | 1 |
| 4.6 | 1 | 4 | 1 | 1 | 0.0 | 0.17 | 1 |
| 5.0 | 4 | 5 | 4 | 4 | 0.6 | 0.67 | 2 |
| 5.4 | 6 | 6 | 6 | 6 | 1.0 | 1.00 | 2 |
| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
|---|---|---|---|---|
| 3 | 2 | 1 | 1 | setosa |
| 2 | 1 | 2 | 1 | setosa |
| 1 | 2 | 1 | 2 | setosa |
| 1 | 1 | 3 | 2 | setosa |
| 2 | 3 | 2 | 3 | setosa |
| 3 | 3 | 3 | 3 | setosa |
top_ntop_n(x, n, wt)SL_top3 <- iris %>%
group_by(Species) %>%
top_n(3, Sepal.Length) %>%
arrange(desc(Sepal.Length), .by_group = T) %>%
select(1,5) | Sepal.Length | Species |
|---|---|
| 5.8 | setosa |
| 5.7 | setosa |
| 5.7 | setosa |
| 7.0 | versicolor |
| 6.9 | versicolor |
| 6.8 | versicolor |
| 7.9 | virginica |
| 7.7 | virginica |
| 7.7 | virginica |
| 7.7 | virginica |
| 7.7 | virginica |
rename, rename_all, rename_if, setNames# rename
iris %>%
rename(SL = Sepal.Length, SW = Sepal.Width, SP = Species) %>% head
iris %>%
rename(SL = "Sepal.Length", SW = "Sepal.Width", SP = "Species") %>% head
# rename_all
rn <- c("SL","SW","PL","PW","SP")
iris %>%
rename_all(list(~c(rn))) %>% head
# setNames
rn <- c("SL","SW","PL","PW","SP")
head(setNames(iris, nm = rn ))row_number()、行数nrow(), n() を取得group_by(x) %>% summarise(n())で集計.tibble::rownames_to_column()tibble::column_to_rownames()filter, subset,で繋げる ORは|で繋げるsubset(x, subset, select)slice, [slice(.data, MARGIN)# 行番号で取り出し
iris %>% .[2:5,]
iris %>% .[c(-1, -6:-nrow(.)),]
# slice
iris %>% mutate(rn = row_number()) %>%
dplyr::slice(.data = ., MARGIN = 2:5)
iris %>% mutate(rn = row_number()) %>%
dplyr::slice(c(-1, -6:-n()))
iris %>% group_by(Species) %>% dplyr::slice(1)
iris %>% group_by(Species) %>% arrange(Sepal.Length) %>% dplyr::slice(1)sample_n(), sample_frac()distinct, duplicatedgroup_by.group_by, group_by_at, arrangearrange(x)はデフォルトで昇順 arrange(desc(x))で降順になるgroup_byしていてもarrangeではグループごとに並び替えない. .by_group =Tでグループごとに並び替えsw_top5 <- iris %>% group_by(Species) %>% top_n(5, Sepal.Width)
sw_top5 %>% group_by(Species) %>% arrange(Sepal.Width, .by_group = T) %>% select(2,5)
sw_top5 %>% group_by(Species) %>% arrange(Sepal.Width) %>% select(2,5)
# group_by_at(group内で並び替え)
sw_top5 %>% group_by_at(5) %>% arrange(desc(Sepal.Width), .by_group = T) %>% select(c(2,5))
# groupを無視する
sw_top5 %>% group_by_at(5) %>% arrange(desc(Sepal.Width)) %>% select(c(2,5))
# baseの場合
Reduce(rbind, lapply(split(sw_top5, f=sw_top5$Species),
function(x) x[order(x$Sepal.Width, decreasing = T), c(2,5)]))group_indices(非推奨)# data
d <- data.frame(id = rep(c("A","B","C"), times = c(2,1,3)),
v = c(1, 2, 1, 1, 2, 3), stringsAsFactors = F)
# group_indicesを使う
d %>% mutate(idx = group_indices(., id))
# groupをfactorに変換する.
didx <- d %>% mutate(idx = as.integer(factor(id, levels = unique(id))))| id | v | idx |
|---|---|---|
| A | 1 | 1 |
| A | 2 | 1 |
| B | 1 | 2 |
| C | 1 | 3 |
| C | 2 | 3 |
| C | 3 | 3 |
summarise, summarize, tapplysummarise_at, summarise_allsummarise_at(.tbl, .vars, .funs), summarise_all(.tbl, .funs)vars()は省略不可tapply(X, INDEX, FUN)でやる場合.funsを指定する場合# summarise_at 列名(ベクトル)で指定 vars(Species)
iris %>%
group_by(Species) %>%
summarise_at(.tbl = ., .vars = vars(1:4), .funs = mean)
iris %>% group_by(Species) %>% summarise_at(1:4, mean)
iris %>% group_by(Species) %>% summarise_at(1:4, list(men = mean, med = median))
iris %>% group_by(Species) %>%
summarise_at(vars(Sepal.Length, Sepal.Width), list(mean = mean))
iris %>% group_by(Species) %>%
summarise_at(c("Sepal.Length", "Sepal.Width"), mean)
# summarise_all
iris %>% group_by(Species) %>% summarise_all(mean)
# tapplyの場合
data.frame(lapply(iris[-5], function(v){tapply(X = v, INDEX = iris$Species, FUN = function(x)mean(x))}))
# 複数のfuns
iris %>% group_by(Species) %>%
summarize_at(vars(Sepal.Length), list(men = mean , med = median,
mx = max, mn = min))
iris %>% group_by(Species) %>%
summarize_at(vars(Sepal.Length), list(mean, median, max, min))
#iris %>% group_by(Species) %>% summarize_at(1:4, list(mean, median, max, min))
#iris %>% group_by(Species) %>% summarize_at(1:4, funs(mean, median, max, min))summarise, n(), prop.table()mutateprop.table# data
d <- data.frame(x = c(3, 2, 1, 2, 1, 3, 3, 1, 2),
y = c(3, 1, 3, 1, 2, 2, 2, 3, 1))
# グループのメンバー数 group_by() %>% summarise(n())
sum_d <- d %>% group_by(x, y) %>% summarise(count = n())## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# sum_d <- d %>% group_by(x, y) %>% mutate(count = n()) %>% slice(1)
# グループメンバーの割合 prop.table()
sum_d_prop <- d %>%
group_by(x, y) %>%
summarise(count = n()) %>%
mutate(gprop = round(prop.table(count), 3)) %>%
ungroup() %>%
mutate(prop = round(prop.table(count), 3))## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# 要約しても、行を維持したい場合はmutate
sum_d_al <- d %>% group_by(x, y) %>% mutate(count = n()) %>% arrange(x, y)
sum_d_al <- d %>% group_by(x, y) %>% mutate(gp = group_indices()) %>% arrange(gp)## Warning: `group_indices()` was deprecated in dplyr 1.0.0.
## Warning: Please use `cur_group_id()` instead.
| x | y |
|---|---|
| 3 | 3 |
| 2 | 1 |
| 1 | 3 |
| 2 | 1 |
| 1 | 2 |
| 3 | 2 |
| 3 | 2 |
| 1 | 3 |
| 2 | 1 |
| x | y | count |
|---|---|---|
| 1 | 2 | 1 |
| 1 | 3 | 2 |
| 2 | 1 | 3 |
| 3 | 2 | 2 |
| 3 | 3 | 1 |
| x | y | count | gprop | prop |
|---|---|---|---|---|
| 1 | 2 | 1 | 0.333 | 0.111 |
| 1 | 3 | 2 | 0.667 | 0.222 |
| 2 | 1 | 3 | 1.000 | 0.333 |
| 3 | 2 | 2 | 0.667 | 0.222 |
| 3 | 3 | 1 | 0.333 | 0.111 |
| x | y | gp |
|---|---|---|
| 1 | 2 | 1 |
| 1 | 3 | 2 |
| 1 | 3 | 2 |
| 2 | 1 | 3 |
| 2 | 1 | 3 |
| 2 | 1 | 3 |
| 3 | 2 | 4 |
| 3 | 2 | 4 |
| 3 | 3 | 5 |
d <- data.frame(k = c("a","b","c","a","c","c"), v = c("xxx","xyx","xxy","yyy", "yyx","xxz"))
sumd1 <- d %>% group_by(k) %>% summarise(sumtxt = toString(v), by = "k")
sumd2 <- d %>% group_by(k) %>% summarise(sumtxt = paste(v, collapse = ";")) | k | v |
|---|---|
| a | xxx |
| b | xyx |
| c | xxy |
| a | yyy |
| c | yyx |
| c | xxz |
| k | sumtxt | by |
|---|---|---|
| a | xxx, yyy | k |
| b | xyx | k |
| c | xxy, yyx, xxz | k |
| k | sumtxt |
|---|---|
| a | xxx;yyy |
| b | xyx |
| c | xxy;yyx;xxz |
reshape2::melt, stack# 重複レコードを結合した文字列をstrsplitしてリストに変換keyとなっている列を名前として付ける
(nl <- sapply(strsplit(sumd2$sumtxt, ";"), "[") %>% setNames(sumd2$k))## $a
## [1] "xxx" "yyy"
##
## $b
## [1] "xyx"
##
## $c
## [1] "xxy" "yyx" "xxz"
# 全く同じ形にすることにこだわらなければ, meltもしくはstackが良い
nl_to_df1 <- data.frame(k = rep(names(nl), v = sapply(nl, length)), v = unlist(nl))
nl_to_df2 <- reshape2::melt(nl) %>% select(2,1) %>% setNames(c("k","v"))
nl_to_df3 <- data.frame(k = substr(names(unlist(nl)), 1,1), v = unlist(nl))
nl_to_df4 <- stack(nl) %>% select(2,1) %>% setNames(c("k", "v"))| k | v |
|---|---|
| a | xxx |
| a | yyy |
| b | xyx |
| c | xxy |
| c | yyx |
| c | xxz |
# data
df <- data.frame(gene = paste("g", 1:5, sep = "_"),
matrix(rpois(15, 5), ncol = 3),
matrix(rpois(15, 20), ncol = 3)) %>%
setNames(., c("gene", "s.1.1","s.1.2", "s.1.3", "s.2.1", "s.2.2", "s.2.3"))
# 列ラベルのindexを基にして, apply
idx <- split(names(df)[-1], rep(c(1,2), each = 3))
mdf1 <- lapply(idx, function(x) apply(df[x], 1, mean)) %>%
{data.frame(gene = df$gene, do.call(cbind, .))} %>%
setNames(., c("gene", "s.1", "s.2"))
# 列持ちデータにしてから
mdf2 <- tidyr::gather(df, "smp", "value", -1) %>%
{data.frame(gene = .$gene,
smp = .$smp,
do.call(rbind, strsplit(.$smp, "\\.")),
value = .$value)} %>%
group_by(gene, X1, X2) %>%
summarise(meanv = mean(value)) %>%
ungroup() %>%
mutate(smp = paste(X1, X2, sep = ".")) %>%
select(gene, smp, meanv) %>%
tidyr::spread(., "smp", "meanv")## `summarise()` has grouped output by 'gene', 'X1'. You can override using the
## `.groups` argument.
| gene | s.1.1 | s.1.2 | s.1.3 | s.2.1 | s.2.2 | s.2.3 |
|---|---|---|---|---|---|---|
| g_1 | 6 | 3 | 6 | 20 | 22 | 24 |
| g_2 | 5 | 4 | 4 | 23 | 20 | 26 |
| g_3 | 5 | 7 | 6 | 18 | 18 | 19 |
| g_4 | 8 | 7 | 4 | 17 | 16 | 28 |
| g_5 | 2 | 7 | 9 | 19 | 19 | 17 |
| gene | s.1 | s.2 |
|---|---|---|
| g_1 | 5.000000 | 22.00000 |
| g_2 | 4.333333 | 23.00000 |
| g_3 | 6.000000 | 18.33333 |
| g_4 | 6.333333 | 20.33333 |
| g_5 | 6.000000 | 18.33333 |
| gene | s.1 | s.2 |
|---|---|---|
| g_1 | 5.000000 | 22.00000 |
| g_2 | 4.333333 | 23.00000 |
| g_3 | 6.000000 | 18.33333 |
| g_4 | 6.333333 | 20.33333 |
| g_5 | 6.000000 | 18.33333 |
tidyr::gather, tidyr::spreadspreadで元に戻せない.factor_key = Tにすることもできるがlevelsの調整が必要な場合があるので、後でmutateで変更するほうが無難.reshape2::melt, as.data.framedf_hec <- reshape2::melt(HairEyeColor, c('Hair', 'Eye', 'Sex'))
df_hec <- as.data.frame(HairEyeColor) | Hair | Eye | Sex | Freq |
|---|---|---|---|
| Black | Brown | Male | 32 |
| Brown | Brown | Male | 53 |
| Red | Brown | Male | 10 |
| Blond | Brown | Male | 3 |
| Black | Blue | Male | 11 |
| Brown | Blue | Male | 50 |
merge, *_joinset.seed(123)
df1 <- data.frame(cbind(key = LETTERS[1:5], v = round(runif(5, 1, 10), 1)),
stringsAsFactors = F)
df2 <- data.frame(cbind(key = LETTERS[c(1,3,5)], v = round(runif(3, 1, 10), 1)),
stringsAsFactors = F)
df3 <- data.frame(cbind(key = LETTERS[c(2,6)], v = round(runif(2, 1, 10), 1)),
stringsAsFactors = F)
df4 <- data.frame(cbind(lab = LETTERS[c(1,1:4)], v = round(runif(5, 1, 10), 1)),
stringsAsFactors = F)
dfl <- list(df1, df2, df3)| key | v |
|---|---|
| A | 3.6 |
| B | 8.1 |
| C | 4.7 |
| D | 8.9 |
| E | 9.5 |
| key | v |
|---|---|
| A | 1.4 |
| C | 5.8 |
| E | 9 |
| key | v |
|---|---|
| B | 6 |
| F | 5.1 |
| lab | v |
|---|---|
| A | 9.6 |
| A | 5.1 |
| B | 7.1 |
| C | 6.2 |
| D | 1.9 |
merge(x, y, by, all.x, suffixes)# merge 左結合,右結合, 完全結合 "all.x", "all.y", "all"
mdf_1.3 <- merge(df1, df3, by = "key", all = T, suffixes = c("_1", "_3"))
mdf_1.2_left <- merge(df1, df2, by = "key", all.x = T, suffixes = c("_1", "_2"))
mdf_1.2_right <- merge(df1, df2, by = "key", all.y = T, suffixes = c("_1", "_2"))
# keyが異なる場合
mdf_1.4 <- merge(df1, df4, by.x = "key", by.y = "lab", all = T, suffixes = c("_1", "_4"))
# 複数dfをマージ(データが巨大になると時間がかかる)
mdf_al <- Reduce(function(x,y) merge(x, y, by = "key", all = T), dfl) %>%
setNames(., c("k", "V_1", "V_2", "V_3"))| key | v_1 | v_3 |
|---|---|---|
| A | 3.6 | NA |
| B | 8.1 | 6 |
| C | 4.7 | NA |
| D | 8.9 | NA |
| E | 9.5 | NA |
| F | NA | 5.1 |
| key | v_1 | v_2 |
|---|---|---|
| A | 3.6 | 1.4 |
| B | 8.1 | NA |
| C | 4.7 | 5.8 |
| D | 8.9 | NA |
| E | 9.5 | 9 |
| key | v_1 | v_2 |
|---|---|---|
| A | 3.6 | 1.4 |
| C | 4.7 | 5.8 |
| E | 9.5 | 9 |
| key | v_1 | v_4 |
|---|---|---|
| A | 3.6 | 9.6 |
| A | 3.6 | 5.1 |
| B | 8.1 | 7.1 |
| C | 4.7 | 6.2 |
| D | 8.9 | 1.9 |
| E | 9.5 | NA |
| k | V_1 | V_2 | V_3 |
|---|---|---|---|
| A | 3.6 | 1.4 | NA |
| B | 8.1 | NA | 6 |
| C | 4.7 | 5.8 | NA |
| D | 8.9 | NA | NA |
| E | 9.5 | 9 | NA |
| F | NA | NA | 5.1 |
*_joinfull_join(x, y, by = c("key_x" = "key_y"))# full_join, left_join, right_join, inner_join
full_join(df1, df3, by = "key", suffix = c("_1","_3"))
left_join(df1, df2, by = "key", suffix = c("_1","_2"))
right_join(df1, df2, by = "key", suffix = c("_1","_2"))
full_join(df1, df4, by = c("key" = "lab"))
inner_join(df1, df2, by = "key", suffix = c("_1", "_2"))full_join(x, y, by = c("key_1", "key_2"))# 複数列をkeyとしてjoin
df_l <- data.frame(k1 = rep(c("A","B","C"), each = 3), stringsAsFactors = F) %>%
group_by(k1) %>% mutate(k2 = row_number()) %>% ungroup()
df_r <- df_l[c(-1,-4,-5,-7),] %>% mutate(., v = rpois(5, 20))
mdf_mk <- left_join(df_l, df_r, by = c("k1", "k2"))
mdf_sk <- left_join(df_l, df_r, by = "k1")| k1 | k2 |
|---|---|
| A | 1 |
| A | 2 |
| A | 3 |
| B | 1 |
| B | 2 |
| B | 3 |
| C | 1 |
| C | 2 |
| C | 3 |
| k1 | k2 | v |
|---|---|---|
| A | 2 | 25 |
| A | 3 | 12 |
| B | 3 | 25 |
| C | 2 | 21 |
| C | 3 | 21 |
| k1 | k2 | v |
|---|---|---|
| A | 1 | NA |
| A | 2 | 25 |
| A | 3 | 12 |
| B | 1 | NA |
| B | 2 | NA |
| B | 3 | 25 |
| C | 1 | NA |
| C | 2 | 21 |
| C | 3 | 21 |
bind_cols, bind_rowsdfl <- split(iris, iris$Species)
bind_rows(dfl) %>% glimpse()
# bind_cols( or bind_rows)はdata.frameに対して適応される.
mats <- lapply(1:3, function(i) {set.seed(i); matrix(rnorm(6), ncol=2)})
dats <- lapply(1:3, function(i) {set.seed(i); data.frame(matrix(rnorm(6), ncol=2))})
Reduce(cbind, mats)
bind_cols(dats)
# rownamesが保持されない
dats <- lapply(dats, function(x) {rownames(x) <- letters[1:3]; return(x)})
bind_cols(dats)## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_1.4.0 tidyr_1.2.0 tibble_3.1.6 magrittr_2.0.3 dplyr_1.0.8
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8.3 pillar_1.8.0 bslib_0.3.0 compiler_4.0.3
## [5] jquerylib_0.1.4 highr_0.9 plyr_1.8.7 tools_4.0.3
## [9] digest_0.6.29 viridisLite_0.4.0 jsonlite_1.7.2 evaluate_0.14
## [13] lifecycle_1.0.3 pkgconfig_2.0.3 rlang_1.0.6 cli_3.5.0
## [17] DBI_1.1.1 rstudioapi_0.13 yaml_2.2.1 xfun_0.25
## [21] rsko_0.1.0 fastmap_1.1.0 kableExtra_1.3.4 xml2_1.3.2
## [25] httr_1.4.2 knitr_1.33 systemfonts_1.0.2 generics_0.1.3
## [29] vctrs_0.5.1 sass_0.4.0 webshot_0.5.2 tidyselect_1.1.2
## [33] svglite_2.0.0 glue_1.6.2 R6_2.5.1 fansi_1.0.3
## [37] rmarkdown_2.10 reshape2_1.4.4 purrr_0.3.4 scales_1.2.0
## [41] htmltools_0.5.2 ellipsis_0.3.2 rvest_1.0.1 assertthat_0.2.1
## [45] colorspace_2.0-3 utf8_1.2.2 stringi_1.7.6 munsell_0.5.0