ここでは,bibファイルを分析しやすいように整形するところまでをやってみる。

bibファイルの整形

ERICやPsycInfoでダウンロードしたbibファイルを,分析しやすいように整形する。

文献の検索

Search queryは以下の通り

# ERIC
全てのフィールド
("data driven") AND ("education" OR "instruction" OR "classroom")
査読付き,学術誌,英語,就学前から12年生まで

# PsycInfo
全てのフィールド
("data driven") AND ("education" OR "instruction" OR "classroom")
査読付き,学術誌,英語,
Preschool age (2-5歳),School age (6-12歳),Adolescence (13-17歳)

ERICは270件
PsycInfoは135件
(検索日: 2023/08/31)

bibファイルの加工

論文ごとのcsvファイルを作る

# Read each downloaded BibTeX export into a single string.
e <- readr::read_file(file = "../bib/230831_ERIC.bib")
p <- readr::read_file(file = "../bib/230831_PsycInfo.bib")

# Put one record on each row: the ERIC text is split on "\n\n" and the
# PsycInfo text on "\r\n\r\n\r\n".
e1 <- data.frame(base::strsplit(e, split = "\n\n")[[1]])
p1 <- data.frame(base::strsplit(p, split = "\r\n\r\n\r\n")[[1]])

# Row indices per database. seq_len() gives an empty vector for zero rows,
# whereas 1:nrow(x) would give c(1, 0) and make later loops run on bad indices.
e.row <- seq_len(nrow(e1))
p.row <- seq_len(nrow(p1))

# Save every record as its own CSV file, one BibTeX field per row.
# Output paths are built explicitly rather than by calling setwd() inside the
# loop, so the working directory is left unchanged and the relative paths used
# later in this script keep resolving from the same place.
for (i in e.row) {
  # ERIC fields are split on ",\n".
  fields <- data.frame(base::strsplit(e1[i, ], split = ",\n"))
  write.csv(fields, file.path("..", "ERIC", paste0("e", i, ".csv")))
}

for (i in p.row) {
  # PsycInfo fields are split on ",\r\n".
  fields <- data.frame(base::strsplit(p1[i, ], split = ",\r\n"))
  write.csv(fields, file.path("..", "PsycInfo", paste0("p", i, ".csv")))
}

# Load each per-record CSV back in as its own data frame:
# e.csv.1, e.csv.2, ... for ERIC and p.csv.1, p.csv.2, ... for PsycInfo.
for (i in e.row) {
  assign(
    paste0("e.csv.", i),
    read.csv(paste0("../ERIC/", "e", i, ".csv"))
  )
}

for (i in p.row) {
  assign(
    paste0("p.csv.", i),
    read.csv(paste0("../PsycInfo/", "p", i, ".csv"))
  )
}

行数が論文によって違うので確認する

# Table of (seq, nrow): for each ERIC record index i, the number of rows in
# its data frame e.csv.<i>. The indices come from e.row instead of a
# hand-typed list of 270 entries, so the table follows the actual number of
# records. Both columns are stored as doubles, matching what the previous
# matrix-based construction produced.
eric.nrow <- data.frame(
  seq = as.numeric(e.row),
  nrow = vapply(
    e.row,
    function(i) as.numeric(nrow(get(paste0("e.csv.", i)))),
    numeric(1)
  )
)

# Table of (seq, nrow): for each PsycInfo record index i, the number of rows
# in its data frame p.csv.<i>. The indices come from p.row instead of a
# hand-typed list of 135 entries, so the table follows the actual number of
# records. Both columns are stored as doubles, matching what the previous
# matrix-based construction produced.
psyc.nrow <- data.frame(
  seq = as.numeric(p.row),
  nrow = vapply(
    p.row,
    function(i) as.numeric(nrow(get(paste0("p.csv.", i)))),
    numeric(1)
  )
)

# Frequency of each row count among the ERIC records.
table(eric.nrow["nrow"])
## nrow
##  11  12  13  14  15 
##   2   4  29 100 135
# Frequency of each row count among the PsycInfo records.
table(psyc.nrow["nrow"])
## nrow
##  11  12  13 
##  12  23 100

行数ごとにデータをまとめていく

ERIC

結合するデータの確認
# ERIC records whose data frame has 15 rows; print their sequence numbers.
eric.nrow.15.seq <- eric.nrow[which(eric.nrow$nrow == 15), ]
eric.nrow.15.seq["seq"]
##     seq
## 1     1
## 3     3
## 4     4
## 5     5
## 6     6
## 21   21
## 22   22
## 23   23
## 25   25
## 26   26
## 33   33
## 35   35
## 36   36
## 37   37
## 39   39
## 40   40
## 42   42
## 43   43
## 44   44
## 45   45
## 46   46
## 47   47
## 62   62
## 63   63
## 66   66
## 67   67
## 68   68
## 69   69
## 70   70
## 73   73
## 84   84
## 86   86
## 87   87
## 88   88
## 89   89
## 108 108
## 109 109
## 110 110
## 111 111
## 113 113
## 115 115
## 125 125
## 126 126
## 127 127
## 128 128
## 129 129
## 130 130
## 132 132
## 134 134
## 135 135
## 138 138
## 151 151
## 152 152
## 153 153
## 154 154
## 155 155
## 156 156
## 157 157
## 158 158
## 159 159
## 160 160
## 161 161
## 162 162
## 163 163
## 168 168
## 174 174
## 175 175
## 176 176
## 177 177
## 178 178
## 179 179
## 180 180
## 181 181
## 182 182
## 183 183
## 184 184
## 185 185
## 186 186
## 192 192
## 193 193
## 196 196
## 197 197
## 198 198
## 201 201
## 202 202
## 203 203
## 204 204
## 205 205
## 206 206
## 207 207
## 208 208
## 211 211
## 212 212
## 213 213
## 214 214
## 215 215
## 216 216
## 217 217
## 218 218
## 219 219
## 221 221
## 222 222
## 223 223
## 229 229
## 230 230
## 231 231
## 232 232
## 233 233
## 234 234
## 235 235
## 236 236
## 237 237
## 240 240
## 241 241
## 242 242
## 243 243
## 244 244
## 245 245
## 247 247
## 248 248
## 249 249
## 250 250
## 251 251
## 252 252
## 254 254
## 255 255
## 256 256
## 258 258
## 262 262
## 263 263
## 264 264
## 265 265
## 268 268
## 269 269
## 270 270
# ERIC records whose data frame has 14 rows; print their sequence numbers.
eric.nrow.14.seq <- eric.nrow[which(eric.nrow$nrow == 14), ]
eric.nrow.14.seq["seq"]
##     seq
## 2     2
## 7     7
## 10   10
## 11   11
## 15   15
## 16   16
## 17   17
## 18   18
## 19   19
## 20   20
## 24   24
## 27   27
## 28   28
## 29   29
## 30   30
## 31   31
## 32   32
## 38   38
## 41   41
## 48   48
## 49   49
## 51   51
## 53   53
## 54   54
## 55   55
## 57   57
## 58   58
## 59   59
## 60   60
## 61   61
## 64   64
## 74   74
## 75   75
## 77   77
## 78   78
## 79   79
## 80   80
## 81   81
## 82   82
## 83   83
## 90   90
## 95   95
## 96   96
## 97   97
## 98   98
## 99   99
## 100 100
## 104 104
## 106 106
## 112 112
## 114 114
## 117 117
## 118 118
## 120 120
## 123 123
## 124 124
## 136 136
## 137 137
## 141 141
## 143 143
## 144 144
## 145 145
## 146 146
## 147 147
## 148 148
## 149 149
## 150 150
## 164 164
## 165 165
## 166 166
## 167 167
## 169 169
## 170 170
## 171 171
## 172 172
## 173 173
## 187 187
## 188 188
## 189 189
## 190 190
## 191 191
## 195 195
## 199 199
## 200 200
## 209 209
## 210 210
## 220 220
## 224 224
## 225 225
## 226 226
## 227 227
## 228 228
## 238 238
## 239 239
## 246 246
## 257 257
## 260 260
## 261 261
## 266 266
## 267 267
# ERIC records whose data frame has 13 rows; print their sequence numbers.
eric.nrow.13.seq <- eric.nrow[which(eric.nrow$nrow == 13), ]
eric.nrow.13.seq["seq"]
##     seq
## 8     8
## 9     9
## 12   12
## 13   13
## 34   34
## 50   50
## 52   52
## 56   56
## 65   65
## 71   71
## 72   72
## 76   76
## 85   85
## 91   91
## 92   92
## 93   93
## 94   94
## 101 101
## 102 102
## 103 103
## 105 105
## 107 107
## 119 119
## 121 121
## 122 122
## 139 139
## 194 194
## 253 253
## 259 259
# ERIC records whose data frame has 12 rows; print their sequence numbers.
eric.nrow.12.seq <- eric.nrow[which(eric.nrow$nrow == 12), ]
eric.nrow.12.seq["seq"]
##     seq
## 14   14
## 116 116
## 131 131
## 133 133
# ERIC records whose data frame has 11 rows; print their sequence numbers.
eric.nrow.11.seq <- eric.nrow[which(eric.nrow$nrow == 11), ]
eric.nrow.11.seq["seq"]
##     seq
## 140 140
## 142 142
結合して保存する
# Column-bind the second column of e.csv.<i> for every record index in `seqs`.
# The result is an unnamed matrix with one column per record, in the order
# given by `seqs`.
bind_eric_records <- function(seqs) {
  do.call(
    cbind,
    lapply(seqs, function(i) get(paste0("e.csv.", i))[, 2])
  )
}

# One matrix per row-count group. The record indices are taken from the
# eric.nrow.<n>.seq subsets computed above rather than from hand-copied lists,
# so the groups cannot drift out of sync with the row-count table.
eric.nrow.15_ <- bind_eric_records(eric.nrow.15.seq$seq)

eric.nrow.14_ <- bind_eric_records(eric.nrow.14.seq$seq)

eric.nrow.13_ <- bind_eric_records(eric.nrow.13.seq$seq)

eric.nrow.12_ <- bind_eric_records(eric.nrow.12.seq$seq)

eric.nrow.11_ <- bind_eric_records(eric.nrow.11.seq$seq)

# Save: transpose each group so that every record becomes one row, convert
# the result to a data frame, then write one xlsx file per group.
eric.nrow.15 <- t(eric.nrow.15_)
eric.nrow.15 <- data.frame(eric.nrow.15)
eric.nrow.14 <- t(eric.nrow.14_)
eric.nrow.14 <- data.frame(eric.nrow.14)
eric.nrow.13 <- t(eric.nrow.13_)
eric.nrow.13 <- data.frame(eric.nrow.13)
eric.nrow.12 <- t(eric.nrow.12_)
eric.nrow.12 <- data.frame(eric.nrow.12)
eric.nrow.11 <- t(eric.nrow.11_)
eric.nrow.11 <- data.frame(eric.nrow.11)

library(openxlsx)
write.xlsx(x = eric.nrow.15, file = "../Data/eric_nrow_15.xlsx")
write.xlsx(x = eric.nrow.14, file = "../Data/eric_nrow_14.xlsx")
write.xlsx(x = eric.nrow.13, file = "../Data/eric_nrow_13.xlsx")
write.xlsx(x = eric.nrow.12, file = "../Data/eric_nrow_12.xlsx")
write.xlsx(x = eric.nrow.11, file = "../Data/eric_nrow_11.xlsx")

PsycInfo

結合するデータの確認
# PsycInfo records whose data frame has 13 rows; print their sequence numbers.
psyc.nrow.13.seq <- psyc.nrow[which(psyc.nrow$nrow == 13), ]
psyc.nrow.13.seq["seq"]
##     seq
## 1     1
## 2     2
## 3     3
## 4     4
## 5     5
## 6     6
## 8     8
## 9     9
## 10   10
## 11   11
## 12   12
## 15   15
## 18   18
## 19   19
## 21   21
## 22   22
## 23   23
## 24   24
## 25   25
## 26   26
## 27   27
## 28   28
## 29   29
## 31   31
## 32   32
## 33   33
## 34   34
## 36   36
## 40   40
## 41   41
## 46   46
## 47   47
## 48   48
## 49   49
## 55   55
## 56   56
## 57   57
## 58   58
## 59   59
## 62   62
## 64   64
## 65   65
## 66   66
## 69   69
## 70   70
## 71   71
## 73   73
## 74   74
## 76   76
## 77   77
## 78   78
## 79   79
## 80   80
## 81   81
## 82   82
## 85   85
## 88   88
## 89   89
## 90   90
## 91   91
## 93   93
## 94   94
## 95   95
## 96   96
## 97   97
## 98   98
## 99   99
## 100 100
## 101 101
## 102 102
## 103 103
## 104 104
## 105 105
## 106 106
## 107 107
## 108 108
## 109 109
## 110 110
## 111 111
## 112 112
## 113 113
## 114 114
## 115 115
## 116 116
## 117 117
## 118 118
## 119 119
## 120 120
## 121 121
## 123 123
## 125 125
## 126 126
## 127 127
## 129 129
## 130 130
## 131 131
## 132 132
## 133 133
## 134 134
## 135 135
# PsycInfo records whose data frame has 12 rows; print their sequence numbers.
psyc.nrow.12.seq <- psyc.nrow[which(psyc.nrow$nrow == 12), ]
psyc.nrow.12.seq["seq"]
##     seq
## 7     7
## 14   14
## 16   16
## 17   17
## 20   20
## 30   30
## 37   37
## 38   38
## 42   42
## 43   43
## 50   50
## 51   51
## 52   52
## 60   60
## 63   63
## 67   67
## 68   68
## 83   83
## 86   86
## 87   87
## 92   92
## 124 124
## 128 128
# PsycInfo records whose data frame has 11 rows; print their sequence numbers.
psyc.nrow.11.seq <- psyc.nrow[which(psyc.nrow$nrow == 11), ]
psyc.nrow.11.seq["seq"]
##     seq
## 13   13
## 35   35
## 39   39
## 44   44
## 45   45
## 53   53
## 54   54
## 61   61
## 72   72
## 75   75
## 84   84
## 122 122
結合して保存する
# Column-bind the second column of p.csv.<i> for every record index in `seqs`.
# The result is an unnamed matrix with one column per record, in the order
# given by `seqs`.
bind_psyc_records <- function(seqs) {
  do.call(
    cbind,
    lapply(seqs, function(i) get(paste0("p.csv.", i))[, 2])
  )
}

# One matrix per row-count group. The record indices are taken from the
# psyc.nrow.<n>.seq subsets computed above rather than from hand-copied lists,
# so the groups cannot drift out of sync with the row-count table.
psyc.nrow.13_ <- bind_psyc_records(psyc.nrow.13.seq$seq)

psyc.nrow.12_ <- bind_psyc_records(psyc.nrow.12.seq$seq)

psyc.nrow.11_ <- bind_psyc_records(psyc.nrow.11.seq$seq)

# Save: transpose each group so that every record becomes one row, convert
# the result to a data frame, then write one xlsx file per group.
psyc.nrow.13 <- t(psyc.nrow.13_)
psyc.nrow.13 <- data.frame(psyc.nrow.13)
psyc.nrow.12 <- t(psyc.nrow.12_)
psyc.nrow.12 <- data.frame(psyc.nrow.12)
psyc.nrow.11 <- t(psyc.nrow.11_)
psyc.nrow.11 <- data.frame(psyc.nrow.11)

library(openxlsx)
write.xlsx(x = psyc.nrow.13, file = "../Data/psyc_nrow_13.xlsx")
write.xlsx(x = psyc.nrow.12, file = "../Data/psyc_nrow_12.xlsx")
write.xlsx(x = psyc.nrow.11, file = "../Data/psyc_nrow_11.xlsx")

出力を整形する

行の内容が一致しないケースがある
出力したxlsxファイルを手作業で加工する必要がある

  • 以下のようなjournalは対象外にした方がよいか
    • isbnがない
    • Journalのタイトルがない
  • 当てはまるのは ERIC のデータ
# ERIC
15列
c("author", "year", "month", "title", "journal", "volume", "number", "pages", "note", "abstract", "keywords", "issn", "language", "url", "no.use")
14列,13列
整形して15列にして以下の通りにする
c("author", "year", "month", "title", "journal", "volume", "number", "pages", "note", "abstract", "keywords", "issn", "language", "url", "no.use")
12列,11列
isbnやjournalのタイトルがないので対象外

# PsycInfo
13列
VolumeとURLが入れ替わっているデータがあるので注意
c("no.use.1", "abstract", "author", "issn", "journal", "keywords", "number", "pages", "title", "url", "volume", "year", "no.use2")
12列
整形して13列にして以下の通りにする
c("no.use.1", "abstract", "author", "issn", "journal", "keywords", "number", "pages", "title", "url", "volume", "year", "no.use2")
11列
揃っているがオンラインジャーナルでnumberとpagesがないので,これら2列を加えて,列名を以下の通りにする
c("no.use.1", "abstract", "author", "issn", "journal", "keywords", "number", "pages", "title", "url", "volume", "year", "no.use2")

整形したxlsxファイルを読み込む

library(openxlsx)

# Column layouts applied to every sheet of each database.
eric.col <- c("author", "year", "month", "title", "journal", "volume", "number", "pages", "note", "abstract", "keywords", "issn", "language", "url", "no.use")
psyc.col <- c("no.use.1", "abstract", "author", "issn", "journal", "keywords", "number", "pages", "title", "url", "volume", "year", "no.use2")

# Read the first sheet of each "*r.xlsx" file and rename its columns in one step.
eric.nrow.15r <- setNames(read.xlsx("../Data/eric_nrow_15r.xlsx", sheet = 1), eric.col)
eric.nrow.14r <- setNames(read.xlsx("../Data/eric_nrow_14r.xlsx", sheet = 1), eric.col)
eric.nrow.13r <- setNames(read.xlsx("../Data/eric_nrow_13r.xlsx", sheet = 1), eric.col)

psyc.nrow.13r <- setNames(read.xlsx("../Data/psyc_nrow_13r.xlsx", sheet = 1), psyc.col)
psyc.nrow.12r <- setNames(read.xlsx("../Data/psyc_nrow_12r.xlsx", sheet = 1), psyc.col)
psyc.nrow.11r <- setNames(read.xlsx("../Data/psyc_nrow_11r.xlsx", sheet = 1), psyc.col)

# number and pages come back as numeric in the 11-row sheet, so convert them
# to character (presumably so they can be combined with the other sheets).
psyc.nrow.11r[c("number", "pages")] <- lapply(
  psyc.nrow.11r[c("number", "pages")],
  as.character
)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Stack the per-row-count sheets into one raw table per database.
eric.sheets <- list(eric.nrow.15r, eric.nrow.14r, eric.nrow.13r)
psyc.sheets <- list(psyc.nrow.13r, psyc.nrow.12r, psyc.nrow.11r)
eric.raw <- dplyr::bind_rows(eric.sheets)
psyc.raw <- dplyr::bind_rows(psyc.sheets)

余計な文字を削除する

ERIC

# Regex patterns for the BibTeX scaffolding removed from each ERIC field.
author.rm2 <- "\\r" # carriage returns
author.rm <- "@article\\{\nauthor=\\{" # matches: @article{\nauthor={
rm.2 <- "\\}" # every closing brace
year.rm <- "year=\\{"
month.rm <- "month=\\{"
title.rm <- "title=\\{"
journal.rm <- "journal=\\{"
volume.rm <- "volume=\\{"
number.rm <- "number=\\{"
pages.rm <- "pages=\\{"
abstract.rm <- "abstract=\\{"
keywords.rm <- "keywords=\\{"
issn.rm <- "isbn=\\{" # the pattern text is "isbn="; it is applied to the issn column
url.rm <- "url=\\{"

library(stringr)

# Carriage returns are removed from author before the prefix pattern is
# applied (presumably so that the "\n" inside author.rm can match).
eric.raw$author <- str_replace_all(eric.raw$author, pattern = author.rm2, replacement = "")

# For each column: remove its field prefix, then remove all closing braces.
eric.prefix <- c(
  author = author.rm, year = year.rm, month = month.rm, title = title.rm,
  journal = journal.rm, volume = volume.rm, number = number.rm,
  pages = pages.rm, abstract = abstract.rm, keywords = keywords.rm,
  issn = issn.rm, url = url.rm
)
for (field in names(eric.prefix)) {
  stripped <- str_replace_all(
    eric.raw[[field]],
    pattern = eric.prefix[[field]],
    replacement = ""
  )
  eric.raw[[field]] <- str_replace_all(stripped, rm.2, "")
}

# Titles in sentence case: first letter upper case, the rest lower case.
eric.raw$title <- str_to_sentence(eric.raw$title)

PsycInfo

# Regex patterns for the BibTeX scaffolding removed from each PsycInfo field.
abstract.rm <- "Abstract = \\{"
author.rm <- "Author = \\{"
issn.rm <- "ISSN = \\{"
journal.rm <- "Journal = \\{"
keywords.rm <- "Keywords = \\{"
number.rm <- "Number = \\{"
pages.rm <- "Pages = \\{"
title.rm <- "Title = \\{"
url.rm <- "URL = \\{"
volume.rm <- "Volume = \\{"
year.rm <- "Year = \\{"
rm.2 <- "\\}" # every closing brace

library(stringr)

# For each column: remove its field prefix, then remove all closing braces.
psyc.prefix <- c(
  abstract = abstract.rm, author = author.rm, issn = issn.rm,
  journal = journal.rm, keywords = keywords.rm, number = number.rm,
  pages = pages.rm, title = title.rm, url = url.rm,
  volume = volume.rm, year = year.rm
)
for (field in names(psyc.prefix)) {
  stripped <- str_replace_all(
    psyc.raw[[field]],
    pattern = psyc.prefix[[field]],
    replacement = ""
  )
  psyc.raw[[field]] <- str_replace_all(stripped, rm.2, "")
}

# Use semicolons as the keyword separator.
psyc.raw$keywords <- str_replace_all(psyc.raw$keywords, pattern = ",", replacement = ";")
# Remove spaces from the page numbers.
psyc.raw$pages <- str_replace_all(psyc.raw$pages, " ", "")
# Titles in sentence case: first letter upper case, the rest lower case.
psyc.raw$title <- str_to_sentence(psyc.raw$title)

連番を振り,列を揃えて,1本のデータにまとめる

library(dplyr)

# Columns kept in the merged data set, in output order.
keep.cols <- c("db", "seq", "db.seq", "author", "year", "title", "journal",
               "volume", "number", "pages", "keywords", "issn", "abstract")

# ERIC: tag the source database, number the rows, and build an id ("eric.<seq>").
eric.raw$db <- "eric"
eric.raw <- mutate(eric.raw, seq = row_number())
eric.raw$db.seq <- paste0(eric.raw$db, ".", eric.raw$seq)
eric <- eric.raw[keep.cols]

# PsycInfo: same steps, with ids of the form "psyc.<seq>".
psyc.raw$db <- "psyc"
psyc.raw <- mutate(psyc.raw, seq = row_number())
psyc.raw$db.seq <- paste0(psyc.raw$db, ".", psyc.raw$seq)
psyc <- psyc.raw[keep.cols]

# Combine both databases into one table.
eric.psyc <- dplyr::bind_rows(list(eric, psyc))

# Save a copy at this point.
library(openxlsx)
write.xlsx(x = eric.psyc, file = "../Data/ERIC_Psyc.xlsx")

重複を削除したデータにする(PsycInfo優先)

# Find duplicates by title: keep every row whose title occurs more than once.
library("dplyr")
duplicate <- eric.psyc %>%
  group_by(title) %>%
  filter(n() > 1) %>%
  ungroup() # drop the grouping so later operations are not silently per-title

# Sort by title in descending order so rows with the same title sit together.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
duplicate <- duplicate[order(duplicate$title, decreasing = TRUE), ]
duplicate.sec.title <- duplicate[c("db.seq", "title")]

# Write the id/title pairs out for manual inspection.
library(openxlsx)
write.xlsx(duplicate.sec.title, "../Data/Duplicate.xlsx")

# There are 21 duplicates; the ERIC copy of each one is dropped.
eric.dup.ids <- c(
  "eric.80", "eric.196", "eric.264", "eric.92", "eric.32", "eric.27",
  "eric.67", "eric.195", "eric.123", "eric.156", "eric.151", "eric.183",
  "eric.118", "eric.50", "eric.227", "eric.198", "eric.124", "eric.21",
  "eric.35", "eric.22", "eric.71"
)
eric.psyc.nodp <- dplyr::filter(eric.psyc, !(db.seq %in% eric.dup.ids))

# Save the de-duplicated data set.
library(openxlsx)
write.xlsx(x = eric.psyc.nodp, file = "../Data/eric_psyc_nodp.xlsx")