dplyr, magrittr, tibble, tidyr その他を使ってデータフレーム操作する練習


1 ライブラリ.


library(dplyr)
library(magrittr)
library(tibble)
library(tidyr)
library(stringr)

2 パイプ演算子.


2.1 %>%

  • magrittr のパイプ演算子%>%は, dplyr を読み込むと自動的に読み込まれる.
  • lhs %>% rhsのようにして使う. この場合、左項の結果を右項の最初の引数として配置する. つまり、x %>% f(y)f(x, y)と等価.
  • rhsで第一引数以外の場所で、lhsを呼び出す時は .(ドット)で行う
  • {}中括弧は、パイプが関数内の最初の引数を使用しないようにする
# lhsをrhsの第一引数に渡す. それ以外の引数で呼び出す時は .(ドット)で行う
3 %>% c(., 3 + .) 
3 %>% c(3 + .) 

1:10 %>% mean(.) 
1:10 %>% plot(x = ., y = seq(10, 100, 10))
1:10 %>% plot(x = seq(10, 100, 10), y = .)

# rhsの第一引数は省略可能
3 %>% c(3 + .)
1:10 %>% mean
1:10 %>% plot(y = seq(10, 100, 10))

# 中括弧は、パイプが関数内の最初の引数を使用しないようにする 
3 %>% c(3 + .)
3 %>% { c(3 + .) }
mtcars %>% {cor(.$mpg, .$cyl)}
mtcars %>% {cor(t(.))} %>% .[1:6,1:6]

2.2 %<>%, %T>%, %$%

  • Tee pipe
  • Assignment pipe
  • Exposition pipe
# %T>%  (Tパイプ)
x <- rnorm(100) %>% matrix(ncol=2) %T>% plot() %T>% str() 

# %$% (exposition pipe) 左辺のデータの名前を右辺でも利用できる
mtcars %$% cor(mpg, cyl) 
mtcars %>% {cor(.$mpg, .$cyl)} 

#  %<>% 元の変数を置き換える 
x <- " a b c "
(x %<>% gsub("^ | $","", .) %<>% gsub(" ", ",", .))

3 列取り出しselect


3.1 列ラベルで取り出す.

select(.data = iris, Sepal.Length, Species) %>% head

iris %>% select(., Sepal.Length,  Species) %>% head
iris %>% select(Sepal.Length,  Species) %>% head

select(iris, Sepal.Length:Petal.Width) %>% head
select(iris, -Species) %>% head

select(iris, -Sepal.Length:-Petal.Width) %>% head
select(iris, -(Sepal.Length:Petal.Width)) %>% head

3.2 標準評価(SE)と非標準評価(NSE)

  • ほどんどのdplyrの関数はNSE(Non-Standard Evaluation:非標準評価)
  • 明示的にSE(Standard evaluation: 標準評価)を使う場合は, select_, group_by_(非推奨).
  • select(data, v)のように外部ベクトルを使うのは非推奨
select(iris, c(Sepal.Length, Species)) %>% head
select(iris, Sepal.Length, Species) %>% head
select(iris, c("Sepal.Length", "Species")) %>% head
select(iris, "Sepal.Length", "Species") %>% head
select(iris, .data$Sepal.Length, .data$Species) %>% head
# select(iris, .$SepalLength, .$Species) %>% head # エラ-
# select_(iris, "Sepal.Length", "Species") %>% head # 非推奨


iris %>% select(Sepal.Length, Species) %>% head

v <- c("Sepal.Length", "Species")
# select(iris, v) %>% head
select(iris, all_of(v)) %>% head

3.3 選択ユーティリティー関数を使って取り出す.

  • 選択ユーティリティー関数は?tidyselect::select_helpers
  • matchは正規表現による一致, one_ofは完全一致
  • 大/小文字はデフォルトで区別しない(ignore.case = TRUE)
iris %>% select(starts_with("Sepal")) %>% head
iris %>% select(ends_with("Length")) %>% head
iris %>% select(-starts_with("Petal")) %>% head

iris %>% select(matches("^sp|Length$")) %>% head # 正規表現による一致, 
iris %>% select(contains(".")) %>% head

v <- c("Sepal.Length", "Species")
iris %>% select(one_of(v)) %>% head # 文字列ベクトルとマッチするもの
iris %>% select(v) %>% head

# num_range 列名の
df <- data.frame(matrix(rpois(30, 5), ncol=10))
df %>% select(num_range(prefix="X", range = 2:4, width = 1))

df_1 <- df %>% setNames(., paste0("X", sprintf("%03d", 1:10)))
df_1 %>% select(num_range(prefix = "X", range = 5:10, width = 3))

3.4 列番号で取り出す select, [

# 1,5列目を取り出す select 
iris %>% select(1, 5) %>% head
iris %>% select(c(1, 5)) %>% head
iris %>% select(-2:-4) %>% head

# 1,5列目を取り出す [ 
iris %>% .[c(1,5)] %>% head
iris %>% .[, c(1,5)] %>% head
iris %>% "["(., c(1,5)) %>% head
iris %>% `[`(., c(1,5)) %>% head

3.5 列並び替えselect

  • 列名と列番号(文字列)一緒でもいける. logicalはだめ
iris %>% select(sort(names(iris))) %>% head
iris %>% select(c("Species", names(iris)[-5])) %>% head
iris %>% select(c("Species", 1:4)) %>% head
iris %>% select(c("Species", 1, 3, 2, 4)) %>% head
iris %>% select(c(Species, 1, 3, 2, 4)) %>% head

3.6 指定列のみのdata.frameを返すtransmute. ベクトルとして取り出す pull

iris %>% pull(Species) %>% head
iris %>% transmute(petal_area=Petal.Length * Petal.Width * 0.5) %>% head

4 列追加・編集.


4.1 mutate, transform, tibble::add_column

  • group_byしてあると書き換えられない. 列追加する.
  • 指定の場所に列を挿入tibble::add_column(.data, ..., .after, .before)書き換えはできない.
# 列追加・編集 mutate
iris %>% 
  mutate(Species = paste0(substr(Species, 1, 3), row_number())) %>% tail

iris %>% group_by(Species) %>%
  mutate(Sp = paste0(substr(Species, 1, 3), row_number())) %>% tail

# 列追加・編集 transform
iris %>% 
  transform(Species = paste0(substr(Species, 1, 3), 1:nrow(.))) %>% tail

# 指定の位置に列追加 tibble::add_column
iris %$% 
  tibble::add_column(., 
                     sp = paste0(substr(Species, 1, 3), 1:nrow(.)), 
                     rn = 1:nrow(.),
                     .after = 4) %>% head

iris %>% tibble::add_column(., rn = 1:nrow(.), .before = "Species") %>% head

4.2 mutate_at, mutate_all

  • mutate_at(.tbl , .vars, .funs), mutate_all(.tbl, .funs)
iris %>% mutate_at(.vars = vars(1:4), .funs = list(~ . * 10)) %>% head
# iris %>% mutate_at(.vars = vars(1:4), .funs = funs(. * 10)) %>% head

iris %>% mutate_at(1:4, list(~ . * 10)) %>% head
iris %>% mutate_at(1:4, ~. * 10) %>% head

iris %>% mutate_at(1:4, list(~ . * rep(c(1, 10, 100), each = 50))) %>% 
  .[c(1:3, 51:53, 101:103),]

# 列を追加
iris %>% mutate_at(1:4, list(tenX = ~ . * 10)) %>% head

# 型変換
iris %>% mutate_at(vars(Sepal.Length, Sepal.Width), as.integer) %>% head
iris %>% mutate_at(vars("Sepal.Length", "Sepal.Width"), as.integer) %>% head
iris %>% mutate_at(c("Sepal.Length", "Sepal.Width"), as.integer) %>% head

# mutate_all
iris %>% mutate_all(as.integer) %>% head

4.3 ヘルパー関数starts_with,end_with, contains, matches, etc.

  • varsヘルパー関数. mutate_atsummarise_atで使う.
  • ?tidyselect::select_helpersで見ることができる.
  • dplyr::if_elseはmissingも指定できる.
  • funs()は廃止予定らしい.代わりにlist(~f(.)), list(name = ~f(.))のようにする.
# データ
df <- tibble::as_tibble(data.frame(P1 = c(NA,NA,1,1,1), P2 = c(2,NA,NA,2,2),
                                   Q1 = c(3,3,NA,NA,3), Q2 = c(4,4,4,NA,NA)))

# ヘルパー関数を用いて列指定し, NAを0に置換
df %>% mutate_at(.tbl = ., 
                 .vars = vars(matches("1",.)), 
                 .funs = list(~ifelse(is.na(.),0,.)))

df %>% mutate_at(vars(starts_with("P")), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(ends_with("2")), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(contains("Q")), list(~ifelse(is.na(.),0,.)))

# # funs()は廃止予定
# df %>% mutate_at(vars(starts_with("P")), funs(ifelse(is.na(.),0,.)))
# df %>% mutate_at(vars(ends_with("2")), funs(ifelse(is.na(.),0,.)))
# df %>% mutate_at(vars(contains("Q")), funs(ifelse(is.na(.),0,.)))

df %>% mutate_at(.tbl = ., .vars = -1, .funs = list(~ifelse(is.na(.), 0, .)))

df %>% mutate_at(vars(3:4), list(~ifelse(is.na(.),0,.)))
df %>% mutate_at(vars(-1:-2), list(~ifelse(is.na(.),0,.)))

# varsでラップする必要無し
df %>% mutate_at(3:4, list(~ifelse(is.na(.),0,.))) 
df %>% mutate_at(-1:-2, list(~ifelse(is.na(.), 0, .)))


# # lapply, grepl, ifelseを組み合わせて
# f_1 <- function(dat, nm=names(dat)){
#   ifelse(grepl("1$", nm), is.na(df[[i]])), )
# }


# lapply(seq_along(df), function(i){
#   ifelse(grepl("1$", names(df)[i], is.na(df[[i]])), )
#   })

4.4 mutate_if

  • 変数の型を変換するときなど. mutate_if(.tbl, .predicate, .funs)
  • tibble::glimpse データセットの変数の型を把握する(strより見やすい?).
iris %>% mutate_if(.tbl = ., .predicate = is.factor, .funs = as.character) %>% glimpse
iris %>% mutate_if(is.factor, as.character) %>% glimpse 

# 複数の変換
iris %>% mutate_if(is.factor, list(as.character, abbreviate)) %>% glimpse

# lapplyを使う場合
dplyr::glimpse(
  as.data.frame(
      lapply(iris, function(x){ 
        if (is.factor(x)) { as.character(x) } else {x}
      }) 
    )
)    

4.5 mutatesweepと同等の事をする.

  • rowSums(select())
  • 行方向にもっと複雑にする場合はtidyr::gather, tidyr::spreadでやる.
iris %>% mutate(row_sum = rowSums(select(., 1:4))) %>% head
iris %>% mutate(row_sum = apply(.[-5], 1, sum)) %>% head
iris %>% mutate(row_sum = Reduce("+", select(., 1:4))) %>% head
iris %>% mutate(sep_sum = rowSums(select(., matches("Sepal")))) %>% head
iris %>% mutate(prd_sep = Sepal.Length*Sepal.Width) %>% head

# 元の列を置き換える
# iris %>% 
#   mutate(total = rowSums(select(., 1:4))) %>% 
#   mutate_at(1:4, funs(./total)) %>% 
#   head

iris %>% 
  mutate(total = rowSums(select(., 1:4))) %>% 
  mutate_at(1:4, list(~./total)) %>% 
  head

head(sweep(iris[-5], 1, apply(iris[-5], 1, sum), "/"))

# # 元の列を書き換えない 
# iris %>% 
#   mutate(total = rowSums(select(., 1:4))) %>% 
#   mutate_at(1:4, funs(rate = ./total)) %>% 
#   head

iris %>% 
  mutate(total = rowSums(select(., 1:4))) %>% 
  mutate_at(1:4, list(rate = ~./total)) %>% 
  head

4.6 ランキング関数.

  • row_number(), min_rank(), dense_rank(), percent_rank(), percent_rank(), cume_dist(), ntile(x, n)
  • ntile(n = 3)で均等に分割
# mutate(ntile(), min_rank(), dense_rank(), percent_rank())
head(iris) %>% 
  select(1) %>%
  mutate(min_rank = min_rank(Sepal.Length),
         `row_number()` = row_number(),
         row_number = row_number(Sepal.Length),
         dense_rank = dense_rank(Sepal.Length),
         percent_rank = percent_rank(Sepal.Length),
         cume_dist = cume_dist(Sepal.Length),
         ntile = ntile(Sepal.Length, 2)
         ) -> rank_dat

# ntileをつかって均等に分割
discretized <- head(iris) %>% 
  mutate_at(1:4, list(~ntile(., n = 3)))
ranking function
Sepal.Length min_rank row_number() row_number dense_rank percent_rank cume_dist ntile
5.1 5 1 5 5 0.8 0.83 2
4.9 3 2 3 3 0.4 0.50 1
4.7 2 3 2 2 0.2 0.33 1
4.6 1 4 1 1 0.0 0.17 1
5.0 4 5 4 4 0.6 0.67 2
5.4 6 6 6 6 1.0 1.00 2
discretized
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
3 2 1 1 setosa
2 1 2 1 setosa
1 2 1 2 setosa
1 1 3 2 setosa
2 3 2 3 setosa
3 3 3 3 setosa

4.7 top_n

  • top_n(x, n, wt)
SL_top3 <- iris %>% 
  group_by(Species) %>% 
  top_n(3, Sepal.Length) %>%
  arrange(desc(Sepal.Length), .by_group = T) %>%
  select(1,5) 
S.Length top 3
Sepal.Length Species
5.8 setosa
5.7 setosa
5.7 setosa
7.0 versicolor
6.9 versicolor
6.8 versicolor
7.9 virginica
7.7 virginica
7.7 virginica
7.7 virginica
7.7 virginica

4.8 ベクトルの最初の要素を取り出すfirst

lapply(iris, dplyr::first)
lapply(iris, function(x)x[1])

4.9 列ラベル変更 rename, rename_all, rename_if, setNames

# rename
iris %>% 
  rename(SL = Sepal.Length, SW = Sepal.Width, SP = Species) %>% head

iris %>% 
  rename(SL = "Sepal.Length", SW = "Sepal.Width", SP = "Species") %>% head

# rename_all
rn <- c("SL","SW","PL","PW","SP")
iris %>% 
  rename_all(list(~c(rn))) %>% head

# setNames
rn <- c("SL","SW","PL","PW","SP")
head(setNames(iris, nm =  rn ))

4.10 行番号row_number()、行数nrow(), n() を取得

  • group_by(x) %>% summarise(n())で集計.
iris %>% group_by(Species) %>% 
  mutate(rank = row_number()) %>% 
  filter(Species == "versicolor") 

iris %>% 
  filter(Species == "setosa") %>% 
  nrow(.) 

iris %>% 
  group_by(Species) %>% 
  summarise(n = n()) 

# グループごとに行番号をつける
data.frame(gp = rep(1:3, c(3,2,4))) %>% 
  group_by(gp) %>% 
  mutate(rank = row_number())

4.11 行ラベルをカラムに加える, 指定列を行ラベルにする.

  • tibble::rownames_to_column()
  • tibble::column_to_rownames()
# rownames_to_column  
head(tibble::rownames_to_column(iris)) 
iris %>% tibble::rownames_to_column("id") %>% head 
  
# column_to_rownames  
iris %>% 
  mutate(Species = paste(Species, 1:nrow(.), sep = ".")) %>% 
  tibble::column_to_rownames("Species") %>% head

5 行抽出.


5.1 条件式で行抽出filter, subset

  • 条件式の書式: ANDは,で繋げる  ORは|で繋げる
  • baseの場合subset(x, subset, select)
iris %>% 
  filter(Species == "setosa" | Species == 'virginica' ) %>% 
  {table(select(., 5))}

iris %>% 
  filter(Species != "setosa", Species != 'virginica' ) %>% 
  {table(select(., 5))}

iris %>% 
  filter(row_number() <= 4)

# base 
subset(x = iris, subset = Species == "versicolor", select = c(1,5)) 

5.2 行番号で取り出しslice, [

  • slice(.data, MARGIN)
# 行番号で取り出し 
iris %>% .[2:5,]
iris %>% .[c(-1, -6:-nrow(.)),]

# slice 
iris %>% mutate(rn = row_number()) %>%
  dplyr::slice(.data = ., MARGIN = 2:5)
iris %>% mutate(rn = row_number()) %>%
  dplyr::slice(c(-1, -6:-n()))

iris %>% group_by(Species) %>% dplyr::slice(1)
iris %>% group_by(Species) %>% arrange(Sepal.Length) %>% dplyr::slice(1)

5.3 ランダム行抽出 sample_n(), sample_frac()

iris %>% group_by(Species) %>% sample_n(size=3, replace=FALSE, weight=NULL)
sample_n(tbl = iris, size = 8)
sample_frac(tbl = iris, size = 0.05)

5.4 指定列の一意な行のみを取り出すdistinct, duplicated

iris %>% distinct(Species)  
iris %>% distinct(Species, .keep_all = T)
iris[!duplicated(iris$Species),]

5.5 filterなどで取り出したfactorの水準を調節するdroplevels()

iris %>% filter(Species == "setosa") %>% pull(Species)
iris %>% filter(Species == "setosa") %>% droplevels() %>% pull(Species)

6 グループ化group_by.


6.1 並び替え group_by, group_by_at, arrange

  • arrange(x)はデフォルトで昇順 arrange(desc(x))で降順になる
  • group_byしていてもarrangeではグループごとに並び替えない. .by_group =Tでグループごとに並び替え
sw_top5 <- iris %>% group_by(Species) %>% top_n(5, Sepal.Width) 
sw_top5 %>% group_by(Species) %>% arrange(Sepal.Width, .by_group = T) %>% select(2,5)
sw_top5 %>% group_by(Species) %>% arrange(Sepal.Width) %>% select(2,5)

# group_by_at(group内で並び替え)
sw_top5 %>% group_by_at(5) %>% arrange(desc(Sepal.Width), .by_group = T) %>% select(c(2,5)) 

# groupを無視する
sw_top5 %>% group_by_at(5) %>% arrange(desc(Sepal.Width)) %>% select(c(2,5)) 

# baseの場合
Reduce(rbind, lapply(split(sw_top5, f=sw_top5$Species), 
                     function(x) x[order(x$Sepal.Width, decreasing = T), c(2,5)]))

6.2 グループにユニークなidを割り振るgroup_indices(非推奨)

# data
d <- data.frame(id = rep(c("A","B","C"), times = c(2,1,3)), 
                v = c(1, 2, 1, 1, 2, 3), stringsAsFactors = F)

# group_indicesを使う
d %>% mutate(idx = group_indices(., id))

# groupをfactorに変換する.
didx <- d %>% mutate(idx = as.integer(factor(id, levels = unique(id))))
group_indices
id v idx
A 1 1
A 2 1
B 1 2
C 1 3
C 2 3
C 3 3

7 要約 summarise, summarize, tapply


7.1 複数指定列を要約summarise_at, summarise_all

  • summarise_at(.tbl, .vars, .funs), summarise_all(.tbl, .funs)
  • カラム名(NSE)で指定する場合vars()は省略不可
  • tapply(X, INDEX, FUN)でやる場合
  • 複数の.funsを指定する場合
# summarise_at  列名(ベクトル)で指定 vars(Species)
iris %>% 
  group_by(Species) %>% 
  summarise_at(.tbl = ., .vars = vars(1:4), .funs = mean)

iris %>% group_by(Species) %>% summarise_at(1:4, mean)
iris %>% group_by(Species) %>% summarise_at(1:4, list(men = mean, med = median))

iris %>% group_by(Species) %>% 
  summarise_at(vars(Sepal.Length, Sepal.Width), list(mean = mean))

iris %>% group_by(Species) %>% 
  summarise_at(c("Sepal.Length", "Sepal.Width"), mean)

# summarise_all
iris %>% group_by(Species) %>% summarise_all(mean) 
 
# tapplyの場合
data.frame(lapply(iris[-5], function(v){tapply(X = v, INDEX = iris$Species, FUN = function(x)mean(x))}))

# 複数のfuns
iris %>% group_by(Species) %>% 
  summarize_at(vars(Sepal.Length), list(men = mean , med = median, 
                                        mx = max, mn = min))
iris %>% group_by(Species) %>% 
  summarize_at(vars(Sepal.Length), list(mean, median, max, min))

#iris %>% group_by(Species) %>% summarize_at(1:4, list(mean, median, max, min))
#iris %>% group_by(Species) %>% summarize_at(1:4, funs(mean, median, max, min))

7.2 集計summarise, n(), prop.table()

  • 要約しても、行を維持したい場合はmutate
  • 割合prop.table
# data
d <- data.frame(x = c(3, 2, 1, 2, 1, 3, 3, 1, 2),
                y = c(3, 1, 3, 1, 2, 2, 2, 3, 1))

# グループのメンバー数 group_by() %>% summarise(n())
sum_d <- d %>% group_by(x, y) %>% summarise(count = n())
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# sum_d <- d %>% group_by(x, y) %>% mutate(count = n()) %>% slice(1)

# グループメンバーの割合 prop.table()
sum_d_prop <- d %>% 
  group_by(x, y) %>% 
  summarise(count = n()) %>% 
  mutate(gprop = round(prop.table(count), 3)) %>%
  ungroup() %>%
  mutate(prop = round(prop.table(count), 3))
## `summarise()` has grouped output by 'x'. You can override using the `.groups`
## argument.
# 要約しても、行を維持したい場合はmutate
sum_d_al <- d %>% group_by(x, y) %>% mutate(count = n()) %>% arrange(x, y)
sum_d_al <- d %>% group_by(x, y) %>% mutate(gp = group_indices()) %>% arrange(gp)
## Warning: `group_indices()` was deprecated in dplyr 1.0.0.

## Warning: Please use `cur_group_id()` instead.
base
x y
3 3
2 1
1 3
2 1
1 2
3 2
3 2
1 3
2 1
summarise(n())
x y count
1 2 1
1 3 2
2 1 3
3 2 2
3 3 1
summarise(n())
x y count gprop prop
1 2 1 0.333 0.111
1 3 2 0.667 0.222
2 1 3 1.000 0.333
3 2 2 0.667 0.222
3 3 1 0.333 0.111
mutate(n())
x y gp
1 2 1
1 3 2
1 3 2
2 1 3
2 1 3
2 1 3
3 2 4
3 2 4
3 3 5

7.3 重複レコードの値(文字列)を結合.

d <- data.frame(k = c("a","b","c","a","c","c"), v = c("xxx","xyx","xxy","yyy", "yyx","xxz")) 
sumd1 <- d %>% group_by(k) %>% summarise(sumtxt = toString(v), by = "k")
sumd2 <- d %>% group_by(k) %>% summarise(sumtxt = paste(v, collapse = ";")) 
multiple_key
k v
a xxx
b xyx
c xxy
a yyy
c yyx
c xxz
summarise(toString(v),by=‘k’)
k sumtxt by
a xxx, yyy k
b xyx k
c xxy, yyx, xxz k
summarise(paste(v,collapse=‘;’)
k sumtxt
a xxx;yyy
b xyx
c xxy;yyx;xxz

7.4 要素数の異なるリストをデータフレームに変換.

  • 上記の短くしたデータを元に戻している(名前付きのリストをデータフレームに変換する).
  • reshape2::melt, stack
# 重複レコードを結合した文字列をstrsplitしてリストに変換keyとなっている列を名前として付ける
(nl <- sapply(strsplit(sumd2$sumtxt, ";"), "[") %>% setNames(sumd2$k))
## $a
## [1] "xxx" "yyy"
## 
## $b
## [1] "xyx"
## 
## $c
## [1] "xxy" "yyx" "xxz"
# 全く同じ形にすることにこだわらなければ, meltもしくはstackが良い 
nl_to_df1 <- data.frame(k = rep(names(nl), v = sapply(nl, length)), v = unlist(nl))

nl_to_df2 <- reshape2::melt(nl) %>% select(2,1) %>% setNames(c("k","v"))

nl_to_df3 <- data.frame(k = substr(names(unlist(nl)), 1,1), v = unlist(nl))

nl_to_df4 <- stack(nl) %>% select(2,1) %>% setNames(c("k", "v"))
multiple key and different value
k v
a xxx
a yyy
b xyx
c xxy
c yyx
c xxz

 

7.5 反復データ -> 平均値

# data
df <- data.frame(gene = paste("g", 1:5, sep = "_"), 
           matrix(rpois(15, 5), ncol = 3), 
           matrix(rpois(15, 20), ncol = 3)) %>%
  setNames(., c("gene", "s.1.1","s.1.2", "s.1.3", "s.2.1", "s.2.2", "s.2.3"))

# 列ラベルのindexを基にして, apply 
idx <- split(names(df)[-1], rep(c(1,2), each = 3))
mdf1 <- lapply(idx, function(x) apply(df[x], 1, mean)) %>% 
  {data.frame(gene = df$gene, do.call(cbind, .))} %>%
  setNames(., c("gene", "s.1", "s.2"))

# 列持ちデータにしてから
mdf2 <- tidyr::gather(df, "smp", "value", -1) %>% 
  {data.frame(gene = .$gene, 
              smp = .$smp, 
              do.call(rbind, strsplit(.$smp, "\\.")), 
              value = .$value)} %>%
  group_by(gene, X1, X2) %>% 
  summarise(meanv = mean(value)) %>% 
  ungroup() %>%
  mutate(smp = paste(X1, X2, sep = ".")) %>%
  select(gene, smp, meanv) %>%
  tidyr::spread(., "smp", "meanv")
## `summarise()` has grouped output by 'gene', 'X1'. You can override using the
## `.groups` argument.
3反復づつのデータ
gene s.1.1 s.1.2 s.1.3 s.2.1 s.2.2 s.2.3
g_1 6 3 6 20 22 24
g_2 5 4 4 23 20 26
g_3 5 7 6 18 18 19
g_4 8 7 4 17 16 28
g_5 2 7 9 19 19 17
列ラベルのindexを用いて
gene s.1 s.2
g_1 5.000000 22.00000
g_2 4.333333 23.00000
g_3 6.000000 18.33333
g_4 6.333333 20.33333
g_5 6.000000 18.33333
tidyr::gather –> tidyr::spread
gene s.1 s.2
g_1 5.000000 22.00000
g_2 4.333333 23.00000
g_3 6.000000 18.33333
g_4 6.333333 20.33333
g_5 6.000000 18.33333

8 行持ちデータ、列持ちデータ変換 tidyr::gather, tidyr::spread


library(magrittr)
# gather
gt_iris <- iris %>% mutate(rn = row_number()) %>% 
  tidyr::gather(., key = "k", value = "v", -5:-6) %T>% {print(head(.))}

# spread 
sp_iris <- tidyr::spread(gt_iris, k, v) %T>% {print(head(.))}

9 arrayをデータフレームに変換するreshape2::melt, as.data.frame


df_hec <- reshape2::melt(HairEyeColor, c('Hair', 'Eye', 'Sex'))
df_hec <- as.data.frame(HairEyeColor) 
HairEyeColor 1~6行
Hair Eye Sex Freq
Black Brown Male 32
Brown Brown Male 53
Red Brown Male 10
Blond Brown Male 3
Black Blue Male 11
Brown Blue Male 50

10 データフレームをマージ merge, *_join


set.seed(123)
df1 <- data.frame(cbind(key = LETTERS[1:5], v = round(runif(5, 1, 10), 1)), 
                  stringsAsFactors = F)

df2 <- data.frame(cbind(key = LETTERS[c(1,3,5)], v = round(runif(3, 1, 10), 1)), 
                  stringsAsFactors = F)

df3 <- data.frame(cbind(key = LETTERS[c(2,6)], v = round(runif(2, 1, 10), 1)), 
                  stringsAsFactors = F)

df4 <- data.frame(cbind(lab = LETTERS[c(1,1:4)], v = round(runif(5, 1, 10), 1)),
                  stringsAsFactors = F)

dfl <- list(df1, df2, df3)
df1
key v
A 3.6
B 8.1
C 4.7
D 8.9
E 9.5
df2
key v
A 1.4
C 5.8
E 9
df3
key v
B 6
F 5.1
df4
lab v
A 9.6
A 5.1
B 7.1
C 6.2
D 1.9

10.1 merge(x, y, by, all.x, suffixes)

# merge  左結合,右結合, 完全結合 "all.x", "all.y", "all"
mdf_1.3 <- merge(df1, df3, by = "key", all = T, suffixes = c("_1", "_3"))
mdf_1.2_left <- merge(df1, df2, by = "key", all.x = T, suffixes = c("_1", "_2"))
mdf_1.2_right <- merge(df1, df2, by = "key", all.y = T, suffixes = c("_1", "_2"))

# keyが異なる場合
mdf_1.4 <- merge(df1, df4, by.x = "key", by.y = "lab", all = T, suffixes = c("_1", "_4"))

# 複数dfをマージ(データが巨大になると時間がかかる)
mdf_al <- Reduce(function(x,y) merge(x, y, by = "key", all = T), dfl) %>% 
  setNames(., c("k", "V_1", "V_2", "V_3"))
完全結合 df1 & df3
key v_1 v_3
A 3.6 NA
B 8.1 6
C 4.7 NA
D 8.9 NA
E 9.5 NA
F NA 5.1
左結合 df1 & df2
key v_1 v_2
A 3.6 1.4
B 8.1 NA
C 4.7 5.8
D 8.9 NA
E 9.5 9
右結合 df1 & df2
key v_1 v_2
A 3.6 1.4
C 4.7 5.8
E 9.5 9
異なるkeyで結合 df1 & df4
key v_1 v_4
A 3.6 9.6
A 3.6 5.1
B 8.1 7.1
C 4.7 6.2
D 8.9 1.9
E 9.5 NA
完全結合 df1,df2,df3
k V_1 V_2 V_3
A 3.6 1.4 NA
B 8.1 NA 6
C 4.7 5.8 NA
D 8.9 NA NA
E 9.5 9 NA
F NA NA 5.1

10.2 *_join

  • 異なるkeyで結合する場合full_join(x, y, by = c("key_x" = "key_y"))
# full_join, left_join, right_join, inner_join
full_join(df1, df3, by = "key", suffix = c("_1","_3"))
left_join(df1, df2, by = "key", suffix = c("_1","_2"))
right_join(df1, df2, by = "key", suffix = c("_1","_2"))
full_join(df1, df4, by = c("key" = "lab"))
inner_join(df1, df2, by = "key", suffix = c("_1", "_2"))

10.3 複数列をkeyとして結合する場合full_join(x, y, by = c("key_1", "key_2"))

# 複数列をkeyとしてjoin
df_l <- data.frame(k1 = rep(c("A","B","C"), each = 3), stringsAsFactors = F) %>% 
  group_by(k1) %>% mutate(k2 = row_number()) %>% ungroup()

df_r <- df_l[c(-1,-4,-5,-7),] %>% mutate(., v = rpois(5, 20))

mdf_mk <- left_join(df_l, df_r, by = c("k1", "k2"))
mdf_sk <- left_join(df_l, df_r, by = "k1")
df_left
k1 k2
A 1
A 2
A 3
B 1
B 2
B 3
C 1
C 2
C 3
df_right
k1 k2 v
A 2 25
A 3 12
B 3 25
C 2 21
C 3 21
joined by multiple keys
k1 k2 v
A 1 NA
A 2 25
A 3 12
B 1 NA
B 2 NA
B 3 25
C 1 NA
C 2 21
C 3 21

11 データフレームを結合bind_cols, bind_rows


dfl <- split(iris, iris$Species) 
bind_rows(dfl) %>% glimpse()

# bind_cols( or bind_rows)はdata.frameに対して適応される. 
mats <- lapply(1:3, function(i) {set.seed(i); matrix(rnorm(6), ncol=2)})
dats <- lapply(1:3, function(i) {set.seed(i); data.frame(matrix(rnorm(6), ncol=2))})

Reduce(cbind, mats)
bind_cols(dats)

# rownamesが保持されない
dats <- lapply(dats, function(x) {rownames(x) <- letters[1:3]; return(x)})
bind_cols(dats)

12 dplyrの関数とbase R関数の比較

13 環境


sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Mojave 10.14.6
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] stringr_1.4.0  tidyr_1.2.0    tibble_3.1.6   magrittr_2.0.3 dplyr_1.0.8   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.8.3      pillar_1.8.0      bslib_0.3.0       compiler_4.0.3   
##  [5] jquerylib_0.1.4   highr_0.9         plyr_1.8.7        tools_4.0.3      
##  [9] digest_0.6.29     viridisLite_0.4.0 jsonlite_1.7.2    evaluate_0.14    
## [13] lifecycle_1.0.3   pkgconfig_2.0.3   rlang_1.0.6       cli_3.5.0        
## [17] DBI_1.1.1         rstudioapi_0.13   yaml_2.2.1        xfun_0.25        
## [21] rsko_0.1.0        fastmap_1.1.0     kableExtra_1.3.4  xml2_1.3.2       
## [25] httr_1.4.2        knitr_1.33        systemfonts_1.0.2 generics_0.1.3   
## [29] vctrs_0.5.1       sass_0.4.0        webshot_0.5.2     tidyselect_1.1.2 
## [33] svglite_2.0.0     glue_1.6.2        R6_2.5.1          fansi_1.0.3      
## [37] rmarkdown_2.10    reshape2_1.4.4    purrr_0.3.4       scales_1.2.0     
## [41] htmltools_0.5.2   ellipsis_0.3.2    rvest_1.0.1       assertthat_0.2.1 
## [45] colorspace_2.0-3  utf8_1.2.2        stringi_1.7.6     munsell_0.5.0