# Record the start time of the run (presumably used to report the elapsed
# time at the end; the reporting code is not shown in this document).
ptm <- proc.time()

目的

文献検索

# ERIC
"aptitude-treatment interaction" AND PEER(yes) AND (stype.exact("Scholarly Journals") AND pd(20130101-20231231))

# PsycInfo
全てのフィールド: "aptitude-treatment interaction"
出版日: 20130101-
出版物タイプ: Peer Reviewed Journal
言語: English, Japanese

bibファイルの加工

論文ごとのcsvファイルを作る

# Read the downloaded BibTeX exports, each as one long string.
e <- readr::read_file(file = "../bib/ERIC.bib")
p <- readr::read_file(file = "../bib/PsycInfo.bib")

# Split on blank lines so that each paper occupies one row.
e2 <- data.frame(base::strsplit(e, split = "\n\n")[[1]])
p2 <- data.frame(base::strsplit(p, split = "\n\n")[[1]])

# Row indices per database. seq_len() gives an empty vector for a zero-row
# frame, whereas 1:nrow() would give c(1, 0) and make the loops below run
# on rows that do not exist.
e2.row <- seq_len(nrow(e2))
p2.row <- seq_len(nrow(p2))

# Save each paper as its own CSV file, one split field per row.
# The target path is spelled out for every file instead of calling setwd()
# inside the loop, so the working directory is not changed as a side effect.
for (i in e2.row) {
  fields <- data.frame(base::strsplit(e2[i, ], split = ",\n"))
  write.csv(fields, file.path("..", "ERIC", paste0("e_", i, ".csv")))
}

for (i in p2.row) {
  fields <- data.frame(base::strsplit(p2[i, ], split = ",\n"))
  write.csv(fields, file.path("..", "PsycInfo", paste0("p_", i, ".csv")))
}

# Load the per-paper CSV files back in; record i of each database becomes
# the object e.csv.<i> or p.csv.<i>.
for (i in e2.row) {
  assign(
    paste0("e.csv.", i),
    read.csv(paste0("../ERIC/", "e_", i, ".csv"))
  )
}

for (i in p2.row) {
  assign(
    paste0("p.csv.", i),
    read.csv(paste0("../PsycInfo/", "p_", i, ".csv"))
  )
}

行数が論文によって違うことがあるので確認する

# Row count of every ERIC record: one line per record, with the record
# number in `seq` and the number of rows of e.csv.<seq> in `nrow`.
# The table is derived from e2.row instead of a hand-typed list of 121
# objects, so it always covers exactly the records loaded as e.csv.<i>.
e.nrow <- data.frame(
  seq = as.numeric(e2.row),
  nrow = vapply(
    e2.row,
    function(i) as.numeric(nrow(get(paste0("e.csv.", i)))),
    numeric(1)
  )
)

# Row count of every PsycInfo record: one line per record, with the record
# number in `seq` and the number of rows of p.csv.<seq> in `nrow`.
# The table is derived from p2.row instead of a hand-typed list of 46
# objects, so it always covers exactly the records loaded as p.csv.<i>.
p.nrow <- data.frame(
  seq = as.numeric(p2.row),
  nrow = vapply(
    p2.row,
    function(i) as.numeric(nrow(get(paste0("p.csv.", i)))),
    numeric(1)
  )
)

#table(e.nrow[c("nrow")])
#table(p.nrow[c("nrow")])

行数ごとにデータをまとめていく

ERIC

結合するデータの確認

# ERIC records whose CSV has 15 rows; print their record numbers.
e.nrow.15.seq <- e.nrow[which(e.nrow$nrow == 15), ]
e.nrow.15.seq["seq"]
##     seq
## 16   16
## 17   17
## 18   18
## 19   19
## 20   20
## 22   22
## 23   23
## 24   24
## 28   28
## 29   29
## 31   31
## 33   33
## 34   34
## 40   40
## 41   41
## 43   43
## 45   45
## 46   46
## 47   47
## 48   48
## 49   49
## 50   50
## 62   62
## 63   63
## 64   64
## 65   65
## 72   72
## 73   73
## 74   74
## 75   75
## 76   76
## 77   77
## 78   78
## 79   79
## 80   80
## 81   81
## 82   82
## 83   83
## 84   84
## 86   86
## 87   87
## 88   88
## 89   89
## 90   90
## 91   91
## 92   92
## 108 108
## 114 114
## 115 115
## 118 118
# ERIC records whose CSV has 14 rows; print their record numbers.
e.nrow.14.seq <- e.nrow[which(e.nrow$nrow == 14), ]
e.nrow.14.seq["seq"]
##     seq
## 1     1
## 2     2
## 3     3
## 4     4
## 5     5
## 6     6
## 7     7
## 9     9
## 10   10
## 12   12
## 13   13
## 14   14
## 15   15
## 21   21
## 25   25
## 26   26
## 27   27
## 32   32
## 35   35
## 36   36
## 37   37
## 38   38
## 39   39
## 42   42
## 44   44
## 51   51
## 52   52
## 53   53
## 55   55
## 56   56
## 58   58
## 59   59
## 60   60
## 61   61
## 66   66
## 67   67
## 68   68
## 70   70
## 71   71
## 85   85
## 93   93
## 94   94
## 95   95
## 96   96
## 97   97
## 98   98
## 99   99
## 100 100
## 101 101
## 102 102
## 103 103
## 104 104
## 105 105
## 106 106
## 107 107
## 109 109
## 110 110
## 111 111
## 112 112
## 113 113
## 116 116
## 117 117
## 119 119
## 120 120
## 121 121
# ERIC records whose CSV has 13 rows; print their record numbers.
e.nrow.13.seq <- e.nrow[which(e.nrow$nrow == 13), ]
e.nrow.13.seq["seq"]
##    seq
## 8    8
## 11  11
## 30  30
## 54  54
## 57  57
## 69  69
#e.nrow.15.seq
#e.nrow.14.seq
#e.nrow.13.seq

結合して保存する

# Collect the second column of every e.csv.<i> in `seqs` into one matrix,
# one record per column. (The first column holds the row names that
# write.csv stored.)
bind.e.records <- function(seqs) {
  do.call(cbind, lapply(seqs, function(i) get(paste0("e.csv.", i))[, 2]))
}

# The record numbers come from e.nrow.<n>.seq rather than from hand-typed
# lists, so the bound records cannot drift from the row-count grouping.
e.nrow.15_ <- bind.e.records(e.nrow.15.seq$seq)
e.nrow.14_ <- bind.e.records(e.nrow.14.seq$seq)
e.nrow.13_ <- bind.e.records(e.nrow.13.seq$seq)

# Transpose so that each record is one row, then save one workbook per
# row-count group.
e.nrow.15 <- data.frame(t(e.nrow.15_))
e.nrow.14 <- data.frame(t(e.nrow.14_))
e.nrow.13 <- data.frame(t(e.nrow.13_))

library(openxlsx)
write.xlsx(e.nrow.15, "../Data/e_nrow_15.xlsx")
write.xlsx(e.nrow.14, "../Data/e_nrow_14.xlsx")
write.xlsx(e.nrow.13, "../Data/e_nrow_13.xlsx")

PsycInfo

結合するデータの確認

  • 行数は全て13行

結合して保存する

# Collect the second column of PsycInfo records 6 to 46, one record per
# column. Records with fewer rows are padded with NA up to the longest
# record. Passing vectors of unequal length straight to cbind() recycles the
# short ones (this produced the "not a multiple of vector length" warning)
# and fills the missing cells with values repeated from the same record.
p.fields <- lapply(6:46, function(i) get(paste0("p.csv.", i))[, 2])
p.len <- max(lengths(p.fields))
p.nrow.13_ <- do.call(cbind, lapply(p.fields, function(x) {
  length(x) <- p.len
  x
}))

# Transpose so that each record is one row, then save.
p.nrow.13 <- data.frame(t(p.nrow.13_))

library(openxlsx)
write.xlsx(p.nrow.13, "../Data/p_nrow_13.xlsx")

出力を整形する

  • 行の内容が一致しないケースがある
  • 出力したxlsxファイルを手作業で加工する必要がある
# ERIC
15列
c("author", "year", "month", "title", "journal", "volume", "number", "pages", "note", "abstract", "keywords", "issn", "language", "url", "no.use")
14列,13列
整形して15列にして以下の通りにする
c("author", "year", "month", "title", "journal", "volume", "number", "pages", "note", "abstract", "keywords", "isbn", "language", "url", "no.use")

# PsycInfo
13列
VolumeとURLが入れ替わっているデータがあるので注意
c("no.use.1", "abstract", "author", "issn", "journal", "keywords", "number", "pages", "title", "url", "volume", "year", "no.use2")

整形したxlsxファイルを読み込む

library(openxlsx)

# Column names applied to the hand-formatted workbooks.
eric.col <- c("author", "year", "month", "title", "journal", "volume", "number", "pages", "note", "abstract", "keywords", "issn", "language", "url", "no.use")
# NOTE(review): the PsycInfo column list in the notes above puts "url" before
# "volume", while this vector puts "volume" first. Confirm which order
# matches p_nrow_13r.xlsx.
psyc.col <- c("no.use.1", "abstract", "author", "issn", "journal", "keywords", "number", "pages", "title", "volume", "url", "year", "no.use2")

# Read the first sheet of a workbook and label its columns.
read.formatted <- function(path, cols) {
  sheet <- read.xlsx(path, sheet = 1)
  colnames(sheet) <- cols
  sheet
}

e.nrow.15r <- read.formatted("../Data/e_nrow_15r.xlsx", eric.col)
e.nrow.14r <- read.formatted("../Data/e_nrow_14r.xlsx", eric.col)
e.nrow.13r <- read.formatted("../Data/e_nrow_13r.xlsx", eric.col)

p.nrow.13r <- read.formatted("../Data/p_nrow_13r.xlsx", psyc.col)

# Stack the three ERIC groups; PsycInfo has a single group.
library(dplyr)
e.raw <- dplyr::bind_rows(e.nrow.15r, e.nrow.14r, e.nrow.13r)
p.raw <- p.nrow.13r

余計な文字を削除する

ERIC

library(stringr)

# Field prefix (a regular expression) to strip from each ERIC column.
# The issn column is matched with an "isbn=" prefix, exactly as before
# (presumably the export stores the ISSN under that field name; confirm).
eric.prefix <- c(
  author   = "author=\\{",
  year     = "year=\\{",
  month    = "month=\\{",
  title    = "title=\\{",
  journal  = "journal=\\{",
  volume   = "volume=\\{",
  number   = "number=\\{",
  pages    = "pages=\\{",
  abstract = "abstract=\\{",
  keywords = "keywords=\\{",
  issn     = "isbn=\\{",
  url      = "url=\\{"
)

# For every listed column, drop the field prefix and then every closing brace.
for (col in names(eric.prefix)) {
  stripped <- str_replace_all(
    e.raw[[col]],
    pattern = eric.prefix[[col]],
    replacement = ""
  )
  e.raw[[col]] <- str_replace_all(stripped, "\\}", "")
}

# Sentence case for titles: first letter upper case, the rest lower case.
e.raw$title <- str_to_sentence(e.raw$title)

PsycInfo

library(stringr)

# Field prefix (a regular expression) to strip from each PsycInfo column.
psyc.prefix <- c(
  abstract = "Abstract = \\{",
  author   = "Author = \\{",
  issn     = "ISSN = \\{",
  journal  = "Journal = \\{",
  keywords = "Keywords = \\{",
  number   = "Number = \\{",
  pages    = "Pages = \\{",
  title    = "Title = \\{",
  url      = "URL = \\{",
  volume   = "Volume = \\{",
  year     = "Year = \\{"
)

# For every listed column, drop the field prefix and then every closing
# brace. Titles additionally lose all periods.
for (col in names(psyc.prefix)) {
  cleaned <- str_replace_all(
    p.raw[[col]],
    pattern = psyc.prefix[[col]],
    replacement = ""
  )
  if (col == "title") {
    cleaned <- str_replace_all(cleaned, pattern = "\\.", replacement = "")
  }
  p.raw[[col]] <- str_replace_all(cleaned, "\\}", "")
}

# Use semicolons as the keyword separator.
p.raw$keywords <- str_replace_all(p.raw$keywords, pattern = ",", replacement = ";")
# Remove spaces from page ranges.
p.raw$pages <- str_replace_all(p.raw$pages, " ", "")
# Sentence case for titles: first letter upper case, the rest lower case.
p.raw$title <- str_to_sentence(p.raw$title)

連番を振り,列を揃えて,1本のデータにまとめる

library(dplyr)
library(openxlsx)

# Columns kept for both databases, in output order.
keep.cols <- c("db", "seq", "db.seq", "author", "year", "title", "journal",
               "volume", "number", "pages", "keywords", "issn", "abstract")

# ERIC: tag the database, number the records, and build an id such as "eric.1".
e.raw$db <- "eric"
e.raw$seq <- seq_len(nrow(e.raw))
e.raw$db.seq <- paste0(e.raw$db, ".", e.raw$seq)
eric <- e.raw[keep.cols]

# PsycInfo: same treatment, with ids such as "psyc.1".
p.raw$db <- "psyc"
p.raw$seq <- seq_len(nrow(p.raw))
p.raw$db.seq <- paste0(p.raw$db, ".", p.raw$seq)
psyc <- p.raw[keep.cols]

# Stack both databases into one table.
eric.psyc <- dplyr::bind_rows(eric, psyc)

# Save the intermediate results.
write.xlsx(eric, "../Data/ERIC.xlsx")
write.xlsx(psyc, "../Data/Psyc.xlsx")

write.xlsx(eric.psyc, "../Data/ERIC_Psyc.xlsx")

重複を削除したデータにする(PsycInfo優先)

library("dplyr")
library(openxlsx)

# Find records that share a title, sorted by title in descending order.
duplicate <- eric.psyc %>%
  group_by(title) %>%
  filter(n() > 1)
duplicate <- duplicate[order(duplicate$title, decreasing = TRUE), ]
duplicate.sec.title <- duplicate[c("db.seq", "title")]

write.xlsx(duplicate.sec.title, "../Data/Duplicate.xlsx")

# There are 8 duplicates; the ERIC copy of each one is dropped.
dup.eric.ids <- c("eric.12", "eric.53", "eric.56", "eric.71",
                  "eric.52", "eric.120", "eric.60", "eric.116")
eric.psyc.nodp <- eric.psyc %>%
  dplyr::filter(!(db.seq %in% dup.eric.ids))

write.xlsx(eric.psyc.nodp, "../Data/eric_psyc_nodp.xlsx")

データを翻訳する

library(dplyr)
library(tidyverse)
library(deeplr)

##データの読み込み

# Reload the de-duplicated table from the workbook written earlier.
library(openxlsx)
e.p.nodp <- read.xlsx("../Data/eric_psyc_nodp.xlsx")

DeepL API の設定と確認

  • アブストの文字数を数えてみる
  • 必要に応じてDeepL APIの文字数制限を変更する
# Total number of characters in the abstracts. Missing abstracts are skipped
# so that a single NA does not turn the whole total into NA.
sum(nchar(e.p.nodp$abstract), na.rm = TRUE)
## [1] 168992
  • DeepL APIの情報(非表示)

  • 動作確認

# Connectivity check: list the languages the API reports as available.
# `api` is the authentication key; it is set in a part of the document that
# is not shown here.
available_languages(auth_key = api)
## # A tibble: 29 × 2
##    language name     
##    <chr>    <chr>    
##  1 BG       Bulgarian
##  2 CS       Czech    
##  3 DA       Danish   
##  4 DE       German   
##  5 EL       Greek    
##  6 EN       English  
##  7 ES       Spanish  
##  8 ET       Estonian 
##  9 FI       Finnish  
## 10 FR       French   
## # ℹ 19 more rows

言語の指定

# Language codes passed to the translation function below:
# source is "EN" (English), target is "JA".
source_lang <- "EN"
target_lang <- "JA"

関数
- https://note.com/text_tier2718/n/n3451567126a7 に載っている関数を使う

#' Translate one piece of text with the DeepL API by calling curl.
#'
#' @param Sentence Text to translate (a single string).
#' @param source_lang Source language code sent to the API.
#' @param target_lang Target language code sent to the API.
#' @param api_key DeepL authentication key. Defaults to the DEEPL_API_KEY
#'   environment variable. The previous default (`api_key = api_key`)
#'   referred to itself and raised an error whenever the argument was
#'   omitted.
#' @return The translated text as a single string, or `NA_character_` when
#'   `Sentence` is NA, when curl produced no output, or when the response
#'   does not contain a "text" field.
deepL <- function(Sentence, source_lang = "EN", target_lang = "JA",
                  api_key = Sys.getenv("DEEPL_API_KEY")) {
  if (length(api_key) != 1L || is.na(api_key) || !nzchar(api_key)) {
    stop("DeepL API key is missing: pass `api_key` or set DEEPL_API_KEY.",
         call. = FALSE)
  }
  # Nothing to translate: do not spend a request on a missing value.
  if (length(Sentence) == 1L && is.na(Sentence)) {
    return(NA_character_)
  }

  # Double quotes are still replaced by single quotes, as before: the
  # response is parsed by plain string splitting below, which does not
  # decode JSON-escaped quotes.
  text <- gsub('"', "'", Sentence, fixed = TRUE)

  # Every argument is shell-quoted and the form fields go through
  # --data-urlencode, so characters such as &, %, +, $ or backticks in the
  # text reach the API unchanged instead of being interpreted by the shell
  # or by the form encoding. The key is sent in the Authorization header.
  curl_args <- c(
    "-s", "https://api.deepl.com/v2/translate",
    "-H", shQuote(paste0("Authorization: DeepL-Auth-Key ", api_key)),
    "--data-urlencode", shQuote(paste0("text=", text)),
    "--data-urlencode", shQuote(paste0("source_lang=", source_lang)),
    "--data-urlencode", shQuote(paste0("target_lang=", target_lang))
  )
  a <- system2("curl", args = curl_args, stdout = TRUE)

  # Pause after every request (presumably to stay under the API rate limit).
  Sys.sleep(1)

  if (length(a) == 0L) {
    warning("DeepL request returned no output.", call. = FALSE)
    return(NA_character_)
  }

  # Join the output lines first so a response spread over several lines is
  # parsed as a whole, then cut out what lies between `"text":"` and the
  # closing `"}]}`.
  response <- paste(a, collapse = "")
  after_key <- strsplit(response, "\"text\":\"", fixed = TRUE)[[1]][2]
  strsplit(after_key, "\"}]}", fixed = TRUE)[[1]][1]
}

翻訳

# Translate every abstract, one request per record. vapply() returns a plain
# character vector, so abstract_ja becomes an ordinary text column rather
# than the list column that map() produced.
translation <- vapply(
  e.p.nodp$abstract,
  function(x) {
    deepL(
      Sentence = x,
      source_lang = source_lang,
      target_lang = target_lang,
      api_key = api
    )
  },
  character(1),
  USE.NAMES = FALSE
)
eric.psyc.ja <- mutate(e.p.nodp, abstract_ja = translation)

# Save the table with the translated abstracts.
library(openxlsx)
write.xlsx(eric.psyc.ja, "../Data/eric_psyc_ja.xlsx")
##    user  system elapsed 
##   4.217   2.004 362.960