select
mutate
, transform
, tibble::add_column
mutate_at
, mutate_all
starts_with
,end_with
, contains
, matches
, etc.mutate_if
mutate
でsweep
と同等の事をする.top_n
first
rename
, rename_all
, rename_if
, setNames
row_number()
、行数nrow()
, n()
を取得group_by
.summarise
, summarize
, tapply
tidyr::gather
, tidyr::spread
reshape2::melt
, as.data.frame
merge
, *_join
bind_cols
, bind_rows
dplyr, magrittr, tibble, tidyr その他を使ってデータフレーム操作する練習
%>%
%>%
は, dplyr を読み込むと自動的に読み込まれる.lhs %>% rhs
のようにして使う. この場合、左項の結果を右項の最初の引数として配置する. つまり、x %>% f(y)
はf(x, y)
と等価.{}
中括弧は、パイプが関数内の最初の引数を使用しないようにする# lhsをrhsの第一引数に渡す. それ以外の引数で呼び出す時は .(ドット)で行う
3 %>% c(., 3 + .)
3 %>% c(3 + .)
1:10 %>% mean(.)
1:10 %>% plot(x = ., y = seq(10, 100, 10))
1:10 %>% plot(x = seq(10, 100, 10), y = .)
# rhsの第一引数は省略可能
3 %>% c(3 + .)
1:10 %>% mean
1:10 %>% plot(y = seq(10, 100, 10))
# 中括弧は、パイプが関数内の最初の引数を使用しないようにする
3 %>% c(3 + .)
3 %>% { c(3 + .) }
mtcars %>% {cor(.$mpg, .$cyl)}
mtcars %>% {cor(t(.))} %>% .[1:6,1:6]
%<>%
, %T>%
, %$%
select
select(.data = iris, Sepal.Length, Species) %>% head
iris %>% select(., Sepal.Length, Species) %>% head
iris %>% select(Sepal.Length, Species) %>% head
select(iris, Sepal.Length:Petal.Width) %>% head
select(iris, -Species) %>% head
select(iris, -Sepal.Length:-Petal.Width) %>% head
select(iris, -(Sepal.Length:Petal.Width)) %>% head
select_
, group_by_
(非推奨).select(iris, c(Sepal.Length, Species)) %>% head
select(iris, Sepal.Length, Species) %>% head
select(iris, c("Sepal.Length", "Species")) %>% head
select(iris, "Sepal.Length", "Species") %>% head
select(iris, .data$Sepal.Length, .data$Species) %>% head
# select(iris, .$SepalLength, .$Species) %>% head # エラ-
# select_(iris, "Sepal.Length", "Species") %>% head # 非推奨
iris %>% select(Sepal.Length, Species) %>% head
v <- c("Sepal.Length", "Species")
# select(iris, v) %>% head
select(iris, all_of(v)) %>% head
?tidyselect::select_helpers
match
は正規表現による一致, one_of
は完全一致ignore.case = TRUE
)iris %>% select(starts_with("Sepal")) %>% head
iris %>% select(ends_with("Length")) %>% head
iris %>% select(-starts_with("Petal")) %>% head
iris %>% select(matches("^sp|Length$")) %>% head # 正規表現による一致,
iris %>% select(contains(".")) %>% head
v <- c("Sepal.Length", "Species")
iris %>% select(one_of(v)) %>% head # 文字列ベクトルとマッチするもの
iris %>% select(v) %>% head
# num_range 列名の
df <- data.frame(matrix(rpois(30, 5), ncol=10))
df %>% select(num_range(prefix="X", range = 2:4, width = 1))
df_1 <- df %>% setNames(., paste0("X", sprintf("%03d", 1:10)))
df_1 %>% select(num_range(prefix = "X", range = 5:10, width = 3))
select
, [
select
mutate
, transform
, tibble::add_column
tibble::add_column(.data, ..., .after, .before)
書き換えはできない.# 列追加・編集 mutate
iris %>%
mutate(Species = paste0(substr(Species, 1, 3), row_number())) %>% tail
iris %>% group_by(Species) %>%
mutate(Sp = paste0(substr(Species, 1, 3), row_number())) %>% tail
# 列追加・編集 transform
iris %>%
transform(Species = paste0(substr(Species, 1, 3), 1:nrow(.))) %>% tail
# 指定の位置に列追加 tibble::add_column
iris %$%
tibble::add_column(.,
sp = paste0(substr(Species, 1, 3), 1:nrow(.)),
rn = 1:nrow(.),
.after = 4) %>% head
iris %>% tibble::add_column(., rn = 1:nrow(.), .before = "Species") %>% head
mutate_at
, mutate_all
mutate_at(.tbl , .vars, .funs)
, mutate_all(.tbl, .funs)
iris %>% mutate_at(.vars = vars(1:4), .funs = list(~ . * 10)) %>% head
# iris %>% mutate_at(.vars = vars(1:4), .funs = funs(. * 10)) %>% head
iris %>% mutate_at(1:4, list(~ . * 10)) %>% head
iris %>% mutate_at(1:4, ~. * 10) %>% head
iris %>% mutate_at(1:4, list(~ . * rep(c(1, 10, 100), each = 50))) %>%
.[c(1:3, 51:53, 101:103),]
# 列を追加
iris %>% mutate_at(1:4, list(tenX = ~ . * 10)) %>% head
# 型変換
iris %>% mutate_at(vars(Sepal.Length, Sepal.Width), as.integer) %>% head
iris %>% mutate_at(vars("Sepal.Length", "Sepal.Width"), as.integer) %>% head
iris %>% mutate_at(c("Sepal.Length", "Sepal.Width"), as.integer) %>% head
# mutate_all
iris %>% mutate_all(as.integer) %>% head
starts_with
,end_with
, contains
, matches
, etc.vars
ヘルパー関数. mutate_at
やsummarise_at
で使う.?tidyselect::select_helpers
で見ることができる.dplyr::if_else
はmissingも指定できる.funs()
は廃止予定らしい.代わりにlist(~f(.))
, list(name = ~f(.))
のようにする.# データ
df <- tibble::as_tibble(data.frame(P1 = c(NA,NA,1,1,1), P2 = c(2,NA,NA,2,2),
Q1 = c(3,3,NA,NA,3), Q2 = c(4,4,4,NA,NA)))
# ヘルパー関数を用いて列指定し, NAを0に置換
df %>% mutate_at(.tbl = .,
.vars = vars(matches("1",.)),
.funs = list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(starts_with("P")), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(ends_with("2")), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(contains("Q")), list(~ifelse(is.na(.),0,.)))
# # funs()は廃止予定
# df %>% mutate_at(vars(starts_with("P")), funs(ifelse(is.na(.),0,.)))
# df %>% mutate_at(vars(ends_with("2")), funs(ifelse(is.na(.),0,.)))
# df %>% mutate_at(vars(contains("Q")), funs(ifelse(is.na(.),0,.)))
df %>% mutate_at(.tbl = ., .vars = -1, .funs = list(~ifelse(is.na(.), 0, .)))
df %>% mutate_at(vars(3:4), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(-1:-2), list(~ifelse(is.na(.),0,.)))
# varsでラップする必要無し
df %>% mutate_at(3:4, list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(-1:-2, list(~ifelse(is.na(.), 0, .)))
# # lapply, grepl, ifelseを組み合わせて
# f_1 <- function(dat, nm=names(dat)){
# ifelse(grepl("1$", nm), is.na(df[[i]])), )
# }
# lapply(seq_along(df), function(i){
# ifelse(grepl("1$", names(df)[i], is.na(df[[i]])), )
# })
mutate_if
mutate_if(.tbl, .predicate, .funs)
tibble::glimpse
データセットの変数の型を把握する(strより見やすい?).iris %>% mutate_if(.tbl = ., .predicate = is.factor, .funs = as.character) %>% glimpse
iris %>% mutate_if(is.factor, as.character) %>% glimpse
# 複数の変換
iris %>% mutate_if(is.factor, list(as.character, abbreviate)) %>% glimpse
# lapplyを使う場合
dplyr::glimpse(
as.data.frame(
lapply(iris, function(x){
if (is.factor(x)) { as.character(x) } else {x}
})
)
)
mutate
でsweep
と同等の事をする.rowSums(select())
tidyr::gather
, tidyr::spread
でやる.iris %>% mutate(row_sum = rowSums(select(., 1:4))) %>% head
iris %>% mutate(row_sum = apply(.[-5], 1, sum)) %>% head
iris %>% mutate(row_sum = Reduce("+", select(., 1:4))) %>% head
iris %>% mutate(sep_sum = rowSums(select(., matches("Sepal")))) %>% head
iris %>% mutate(prd_sep = Sepal.Length*Sepal.Width) %>% head
# 元の列を置き換える
# iris %>%
# mutate(total = rowSums(select(., 1:4))) %>%
# mutate_at(1:4, funs(./total)) %>%
# head
iris %>%
mutate(total = rowSums(select(., 1:4))) %>%
mutate_at(1:4, list(~./total)) %>%
head
head(sweep(iris[-5], 1, apply(iris[-5], 1, sum), "/"))
# # 元の列を書き換えない
# iris %>%
# mutate(total = rowSums(select(., 1:4))) %>%
# mutate_at(1:4, funs(rate = ./total)) %>%
# head
iris %>%
mutate(total = rowSums(select(., 1:4))) %>%
mutate_at(1:4, list(rate = ~./total)) %>%
head
row_number()
, min_rank()
, dense_rank()
, percent_rank()
, percent_rank()
, cume_dist()
, ntile(x, n)
ntile(n = 3)
で均等に分割# mutate(ntile(), min_rank(), dense_rank(), percent_rank())
head(iris) %>%
select(1) %>%
mutate(min_rank = min_rank(Sepal.Length),
`row_number()` = row_number(),
row_number = row_number(Sepal.Length),
dense_rank = dense_rank(Sepal.Length),
percent_rank = percent_rank(Sepal.Length),
cume_dist = cume_dist(Sepal.Length),
ntile = ntile(Sepal.Length, 2)
) -> rank_dat
# ntileをつかって均等に分割
discretized <- head(iris) %>%
mutate_at(1:4, list(~ntile(., n = 3)))
Sepal.Length | min_rank | row_number() | row_number | dense_rank | percent_rank | cume_dist | ntile |
---|---|---|---|---|---|---|---|
5.1 | 5 | 1 | 5 | 5 | 0.8 | 0.83 | 2 |
4.9 | 3 | 2 | 3 | 3 | 0.4 | 0.50 | 1 |
4.7 | 2 | 3 | 2 | 2 | 0.2 | 0.33 | 1 |
4.6 | 1 | 4 | 1 | 1 | 0.0 | 0.17 | 1 |
5.0 | 4 | 5 | 4 | 4 | 0.6 | 0.67 | 2 |
5.4 | 6 | 6 | 6 | 6 | 1.0 | 1.00 | 2 |
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
3 | 2 | 1 | 1 | setosa |
2 | 1 | 2 | 1 | setosa |
1 | 2 | 1 | 2 | setosa |
1 | 1 | 3 | 2 | setosa |
2 | 3 | 2 | 3 | setosa |
3 | 3 | 3 | 3 | setosa |
top_n
top_n(x, n, wt)
SL_top3 <- iris %>%
group_by(Species) %>%
top_n(3, Sepal.Length) %>%
arrange(desc(Sepal.Length), .by_group = T) %>%
select(1,5)
Sepal.Length | Species |
---|---|
5.8 | setosa |
5.7 | setosa |
5.7 | setosa |
7.0 | versicolor |
6.9 | versicolor |
6.8 | versicolor |
7.9 | virginica |
7.7 | virginica |
7.7 | virginica |
7.7 | virginica |
7.7 | virginica |
rename
, rename_all
, rename_if
, setNames
# rename
iris %>%
rename(SL = Sepal.Length, SW = Sepal.Width, SP = Species) %>% head
iris %>%
rename(SL = "Sepal.Length", SW = "Sepal.Width", SP = "Species") %>% head
# rename_all
rn <- c("SL","SW","PL","PW","SP")
iris %>%
rename_all(list(~c(rn))) %>% head
# setNames
rn <- c("SL","SW","PL","PW","SP")
head(setNames(iris, nm = rn ))
row_number()
、行数nrow()
, n()
を取得group_by(x) %>% summarise(n())
で集計.tibble::rownames_to_column()
tibble::column_to_rownames()
filter
, subset
,
で繋げる ORは|
で繋げるsubset(x, subset, select)
slice
, [
slice(.data, MARGIN)
# 行番号で取り出し
iris %>% .[2:5,]
iris %>% .[c(-1, -6:-nrow(.)),]
# slice
iris %>% mutate(rn = row_number()) %>%
dplyr::slice(.data = ., MARGIN = 2:5)
iris %>% mutate(rn = row_number()) %>%
dplyr::slice(c(-1, -6:-n()))
iris %>% group_by(Species) %>% dplyr::slice(1)
iris %>% group_by(Species) %>% arrange(Sepal.Length) %>% dplyr::slice(1)
sample_n()
, sample_frac()
distinct
, duplicated
group_by
.group_by
, group_by_at
, arrange
arrange(x)
はデフォルトで昇順 arrange(desc(x))
で降順になるgroup_by
していてもarrange
ではグループごとに並び替えない. .by_group =T
でグループごとに並び替えsw_top5 <- iris %>% group_by(Species) %>% top_n(5, Sepal.Width)
sw_top5 %>% group_by(Species) %>% arrange(Sepal.Width, .by_group = T) %>% select(2,5)
sw_top5 %>% group_by(Species) %>% arrange(Sepal.Width) %>% select(2,5)
# group_by_at(group内で並び替え)
sw_top5 %>% group_by_at(5) %>% arrange(desc(Sepal.Width), .by_group = T) %>% select(c(2,5))
# groupを無視する
sw_top5 %>% group_by_at(5) %>% arrange(desc(Sepal.Width)) %>% select(c(2,5))
# baseの場合
Reduce(rbind, lapply(split(sw_top5, f=sw_top5$Species),
function(x) x[order(x$Sepal.Width, decreasing = T), c(2,5)]))
group_indices
(非推奨)# data
d <- data.frame(id = rep(c("A","B","C"), times = c(2,1,3)),
v = c(1, 2, 1, 1, 2, 3), stringsAsFactors = F)
# group_indicesを使う
d %>% mutate(idx = group_indices(., id))
# groupをfactorに変換する.
didx <- d %>% mutate(idx = as.integer(factor(id, levels = unique(id))))
id | v | idx |
---|---|---|
A | 1 | 1 |
A | 2 | 1 |
B | 1 | 2 |
C | 1 | 3 |
C | 2 | 3 |
C | 3 | 3 |
summarise
, summarize
, tapply
summarise_at
, summarise_all
summarise_at(.tbl, .vars, .funs)
, summarise_all(.tbl, .funs)
vars()
は省略不可tapply(X, INDEX, FUN)
でやる場合.funs
を指定する場合# summarise_at 列名(ベクトル)で指定 vars(Species)
iris %>%
group_by(Species) %>%
summarise_at(.tbl = ., .vars = vars(1:4), .funs = mean)
iris %>% group_by(Species) %>% summarise_at(1:4, mean)
iris %>% group_by(Species) %>% summarise_at(1:4, list(men = mean, med = median))
iris %>% group_by(Species) %>%
summarise_at(vars(Sepal.Length, Sepal.Width), list(mean = mean))
iris %>% group_by(Species) %>%
summarise_at(c("Sepal.Length", "Sepal.Width"), mean)
# summarise_all
iris %>% group_by(Species) %>% summarise_all(mean)
# tapplyの場合
data.frame(lapply(iris[-5], function(v){tapply(X = v, INDEX = iris$Species, FUN = function(x)mean(x))}))
# 複数のfuns
iris %>% group_by(Species) %>%
summarize_at(vars(Sepal.Length), list(men = mean , med = median,
mx = max, mn = min))
iris %>% group_by(Species) %>%
summarize_at(vars(Sepal.Length), list(mean, median, max, min))
#iris %>% group_by(Species) %>% summarize_at(1:4, list(mean, median, max, min))
#iris %>% group_by(Species) %>% summarize_at(1:4, funs(mean, median, max, min))
summarise
, n()
, prop.table()
mutate
prop.table
# data
d <- data.frame(x = c(3, 2, 1, 2, 1, 3, 3, 1, 2),
y = c(3, 1, 3, 1, 2, 2, 2, 3, 1))
# グループのメンバー数 group_by() %>% summarise(n())
sum_d <- d %>% group_by(x, y) %>% summarise(count = n())
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# sum_d <- d %>% group_by(x, y) %>% mutate(count = n()) %>% slice(1)
# グループメンバーの割合 prop.table()
sum_d_prop <- d %>%
group_by(x, y) %>%
summarise(count = n()) %>%
mutate(gprop = round(prop.table(count), 3)) %>%
ungroup() %>%
mutate(prop = round(prop.table(count), 3))
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# 要約しても、行を維持したい場合はmutate
sum_d_al <- d %>% group_by(x, y) %>% mutate(count = n()) %>% arrange(x, y)
sum_d_al <- d %>% group_by(x, y) %>% mutate(gp = group_indices()) %>% arrange(gp)
## Warning: `group_indices()` was deprecated in dplyr 1.0.0.
## Warning: Please use `cur_group_id()` instead.
x | y |
---|---|
3 | 3 |
2 | 1 |
1 | 3 |
2 | 1 |
1 | 2 |
3 | 2 |
3 | 2 |
1 | 3 |
2 | 1 |
x | y | count |
---|---|---|
1 | 2 | 1 |
1 | 3 | 2 |
2 | 1 | 3 |
3 | 2 | 2 |
3 | 3 | 1 |
x | y | count | gprop | prop |
---|---|---|---|---|
1 | 2 | 1 | 0.333 | 0.111 |
1 | 3 | 2 | 0.667 | 0.222 |
2 | 1 | 3 | 1.000 | 0.333 |
3 | 2 | 2 | 0.667 | 0.222 |
3 | 3 | 1 | 0.333 | 0.111 |
x | y | gp |
---|---|---|
1 | 2 | 1 |
1 | 3 | 2 |
1 | 3 | 2 |
2 | 1 | 3 |
2 | 1 | 3 |
2 | 1 | 3 |
3 | 2 | 4 |
3 | 2 | 4 |
3 | 3 | 5 |
d <- data.frame(k = c("a","b","c","a","c","c"), v = c("xxx","xyx","xxy","yyy", "yyx","xxz"))
sumd1 <- d %>% group_by(k) %>% summarise(sumtxt = toString(v), by = "k")
sumd2 <- d %>% group_by(k) %>% summarise(sumtxt = paste(v, collapse = ";"))
k | v |
---|---|
a | xxx |
b | xyx |
c | xxy |
a | yyy |
c | yyx |
c | xxz |
k | sumtxt | by |
---|---|---|
a | xxx, yyy | k |
b | xyx | k |
c | xxy, yyx, xxz | k |
k | sumtxt |
---|---|
a | xxx;yyy |
b | xyx |
c | xxy;yyx;xxz |
reshape2::melt
, stack
# 重複レコードを結合した文字列をstrsplitしてリストに変換keyとなっている列を名前として付ける
(nl <- sapply(strsplit(sumd2$sumtxt, ";"), "[") %>% setNames(sumd2$k))
## $a
## [1] "xxx" "yyy"
##
## $b
## [1] "xyx"
##
## $c
## [1] "xxy" "yyx" "xxz"
# 全く同じ形にすることにこだわらなければ, meltもしくはstackが良い
nl_to_df1 <- data.frame(k = rep(names(nl), v = sapply(nl, length)), v = unlist(nl))
nl_to_df2 <- reshape2::melt(nl) %>% select(2,1) %>% setNames(c("k","v"))
nl_to_df3 <- data.frame(k = substr(names(unlist(nl)), 1,1), v = unlist(nl))
nl_to_df4 <- stack(nl) %>% select(2,1) %>% setNames(c("k", "v"))
k | v |
---|---|
a | xxx |
a | yyy |
b | xyx |
c | xxy |
c | yyx |
c | xxz |
# data
df <- data.frame(gene = paste("g", 1:5, sep = "_"),
matrix(rpois(15, 5), ncol = 3),
matrix(rpois(15, 20), ncol = 3)) %>%
setNames(., c("gene", "s.1.1","s.1.2", "s.1.3", "s.2.1", "s.2.2", "s.2.3"))
# 列ラベルのindexを基にして, apply
idx <- split(names(df)[-1], rep(c(1,2), each = 3))
mdf1 <- lapply(idx, function(x) apply(df[x], 1, mean)) %>%
{data.frame(gene = df$gene, do.call(cbind, .))} %>%
setNames(., c("gene", "s.1", "s.2"))
# 列持ちデータにしてから
mdf2 <- tidyr::gather(df, "smp", "value", -1) %>%
{data.frame(gene = .$gene,
smp = .$smp,
do.call(rbind, strsplit(.$smp, "\\.")),
value = .$value)} %>%
group_by(gene, X1, X2) %>%
summarise(meanv = mean(value)) %>%
ungroup() %>%
mutate(smp = paste(X1, X2, sep = ".")) %>%
select(gene, smp, meanv) %>%
tidyr::spread(., "smp", "meanv")
## `summarise()` has grouped output by 'gene', 'X1'. You can override using the
## `.groups` argument.
gene | s.1.1 | s.1.2 | s.1.3 | s.2.1 | s.2.2 | s.2.3 |
---|---|---|---|---|---|---|
g_1 | 6 | 3 | 6 | 20 | 22 | 24 |
g_2 | 5 | 4 | 4 | 23 | 20 | 26 |
g_3 | 5 | 7 | 6 | 18 | 18 | 19 |
g_4 | 8 | 7 | 4 | 17 | 16 | 28 |
g_5 | 2 | 7 | 9 | 19 | 19 | 17 |
gene | s.1 | s.2 |
---|---|---|
g_1 | 5.000000 | 22.00000 |
g_2 | 4.333333 | 23.00000 |
g_3 | 6.000000 | 18.33333 |
g_4 | 6.333333 | 20.33333 |
g_5 | 6.000000 | 18.33333 |
gene | s.1 | s.2 |
---|---|---|
g_1 | 5.000000 | 22.00000 |
g_2 | 4.333333 | 23.00000 |
g_3 | 6.000000 | 18.33333 |
g_4 | 6.333333 | 20.33333 |
g_5 | 6.000000 | 18.33333 |
tidyr::gather
, tidyr::spread
spread
で元に戻せない.factor_key = T
にすることもできるがlevelsの調整が必要な場合があるので、後でmutateで変更するほうが無難.reshape2::melt
, as.data.frame
df_hec <- reshape2::melt(HairEyeColor, c('Hair', 'Eye', 'Sex'))
df_hec <- as.data.frame(HairEyeColor)
Hair | Eye | Sex | Freq |
---|---|---|---|
Black | Brown | Male | 32 |
Brown | Brown | Male | 53 |
Red | Brown | Male | 10 |
Blond | Brown | Male | 3 |
Black | Blue | Male | 11 |
Brown | Blue | Male | 50 |
merge
, *_join
set.seed(123)
df1 <- data.frame(cbind(key = LETTERS[1:5], v = round(runif(5, 1, 10), 1)),
stringsAsFactors = F)
df2 <- data.frame(cbind(key = LETTERS[c(1,3,5)], v = round(runif(3, 1, 10), 1)),
stringsAsFactors = F)
df3 <- data.frame(cbind(key = LETTERS[c(2,6)], v = round(runif(2, 1, 10), 1)),
stringsAsFactors = F)
df4 <- data.frame(cbind(lab = LETTERS[c(1,1:4)], v = round(runif(5, 1, 10), 1)),
stringsAsFactors = F)
dfl <- list(df1, df2, df3)
key | v |
---|---|
A | 3.6 |
B | 8.1 |
C | 4.7 |
D | 8.9 |
E | 9.5 |
key | v |
---|---|
A | 1.4 |
C | 5.8 |
E | 9 |
key | v |
---|---|
B | 6 |
F | 5.1 |
lab | v |
---|---|
A | 9.6 |
A | 5.1 |
B | 7.1 |
C | 6.2 |
D | 1.9 |
merge(x, y, by, all.x, suffixes)
# merge 左結合,右結合, 完全結合 "all.x", "all.y", "all"
mdf_1.3 <- merge(df1, df3, by = "key", all = T, suffixes = c("_1", "_3"))
mdf_1.2_left <- merge(df1, df2, by = "key", all.x = T, suffixes = c("_1", "_2"))
mdf_1.2_right <- merge(df1, df2, by = "key", all.y = T, suffixes = c("_1", "_2"))
# keyが異なる場合
mdf_1.4 <- merge(df1, df4, by.x = "key", by.y = "lab", all = T, suffixes = c("_1", "_4"))
# 複数dfをマージ(データが巨大になると時間がかかる)
mdf_al <- Reduce(function(x,y) merge(x, y, by = "key", all = T), dfl) %>%
setNames(., c("k", "V_1", "V_2", "V_3"))
key | v_1 | v_3 |
---|---|---|
A | 3.6 | NA |
B | 8.1 | 6 |
C | 4.7 | NA |
D | 8.9 | NA |
E | 9.5 | NA |
F | NA | 5.1 |
key | v_1 | v_2 |
---|---|---|
A | 3.6 | 1.4 |
B | 8.1 | NA |
C | 4.7 | 5.8 |
D | 8.9 | NA |
E | 9.5 | 9 |
key | v_1 | v_2 |
---|---|---|
A | 3.6 | 1.4 |
C | 4.7 | 5.8 |
E | 9.5 | 9 |
key | v_1 | v_4 |
---|---|---|
A | 3.6 | 9.6 |
A | 3.6 | 5.1 |
B | 8.1 | 7.1 |
C | 4.7 | 6.2 |
D | 8.9 | 1.9 |
E | 9.5 | NA |
k | V_1 | V_2 | V_3 |
---|---|---|---|
A | 3.6 | 1.4 | NA |
B | 8.1 | NA | 6 |
C | 4.7 | 5.8 | NA |
D | 8.9 | NA | NA |
E | 9.5 | 9 | NA |
F | NA | NA | 5.1 |
*_join
full_join(x, y, by = c("key_x" = "key_y"))
# full_join, left_join, right_join, inner_join
full_join(df1, df3, by = "key", suffix = c("_1","_3"))
left_join(df1, df2, by = "key", suffix = c("_1","_2"))
right_join(df1, df2, by = "key", suffix = c("_1","_2"))
full_join(df1, df4, by = c("key" = "lab"))
inner_join(df1, df2, by = "key", suffix = c("_1", "_2"))
full_join(x, y, by = c("key_1", "key_2"))
# 複数列をkeyとしてjoin
df_l <- data.frame(k1 = rep(c("A","B","C"), each = 3), stringsAsFactors = F) %>%
group_by(k1) %>% mutate(k2 = row_number()) %>% ungroup()
df_r <- df_l[c(-1,-4,-5,-7),] %>% mutate(., v = rpois(5, 20))
mdf_mk <- left_join(df_l, df_r, by = c("k1", "k2"))
mdf_sk <- left_join(df_l, df_r, by = "k1")
k1 | k2 |
---|---|
A | 1 |
A | 2 |
A | 3 |
B | 1 |
B | 2 |
B | 3 |
C | 1 |
C | 2 |
C | 3 |
k1 | k2 | v |
---|---|---|
A | 2 | 25 |
A | 3 | 12 |
B | 3 | 25 |
C | 2 | 21 |
C | 3 | 21 |
k1 | k2 | v |
---|---|---|
A | 1 | NA |
A | 2 | 25 |
A | 3 | 12 |
B | 1 | NA |
B | 2 | NA |
B | 3 | 25 |
C | 1 | NA |
C | 2 | 21 |
C | 3 | 21 |
bind_cols
, bind_rows
dfl <- split(iris, iris$Species)
bind_rows(dfl) %>% glimpse()
# bind_cols( or bind_rows)はdata.frameに対して適応される.
mats <- lapply(1:3, function(i) {set.seed(i); matrix(rnorm(6), ncol=2)})
dats <- lapply(1:3, function(i) {set.seed(i); data.frame(matrix(rnorm(6), ncol=2))})
Reduce(cbind, mats)
bind_cols(dats)
# rownamesが保持されない
dats <- lapply(dats, function(x) {rownames(x) <- letters[1:3]; return(x)})
bind_cols(dats)
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] stringr_1.4.0 tidyr_1.2.0 tibble_3.1.6 magrittr_2.0.3 dplyr_1.0.8
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.8.3 pillar_1.8.0 bslib_0.3.0 compiler_4.0.3
## [5] jquerylib_0.1.4 highr_0.9 plyr_1.8.7 tools_4.0.3
## [9] digest_0.6.29 viridisLite_0.4.0 jsonlite_1.7.2 evaluate_0.14
## [13] lifecycle_1.0.3 pkgconfig_2.0.3 rlang_1.0.6 cli_3.5.0
## [17] DBI_1.1.1 rstudioapi_0.13 yaml_2.2.1 xfun_0.25
## [21] rsko_0.1.0 fastmap_1.1.0 kableExtra_1.3.4 xml2_1.3.2
## [25] httr_1.4.2 knitr_1.33 systemfonts_1.0.2 generics_0.1.3
## [29] vctrs_0.5.1 sass_0.4.0 webshot_0.5.2 tidyselect_1.1.2
## [33] svglite_2.0.0 glue_1.6.2 R6_2.5.1 fansi_1.0.3
## [37] rmarkdown_2.10 reshape2_1.4.4 purrr_0.3.4 scales_1.2.0
## [41] htmltools_0.5.2 ellipsis_0.3.2 rvest_1.0.1 assertthat_0.2.1
## [45] colorspace_2.0-3 utf8_1.2.2 stringi_1.7.6 munsell_0.5.0