メソドロジー研究部会 2012年度報告論集使用コード

水本 篤 (2013).「英文解析プログラムから得られる各種指標を使ったテキスト難易度の推定―教材作成への適用可能性―」
『外国語教育メディア学会(LET) 関西支部メソドロジー研究部会2012年度報告論集』で使用したコードです。


データの読み込み

# Load the per-lesson text indices. The CSV is encoded in CP932 (Windows
# Shift_JIS), so the encoding is stated explicitly for portability.
dat <- read.csv(
  "http://www.mizumot.com/files/metho2012.csv",
  header = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
  fileEncoding = "CP932"
)
dat  # print the loaded data to check it
##    Lesson Order TotalWords WordsPerSentence FleschKincaidGrade Narrativity
## 1     L01     1        569             13.5                6.2       79.10
## 2     L02     2        516             16.6                8.1       57.53
## 3     L03     3        525             15.9                9.4       31.21
## 4     L04     4        421             17.5                9.8       42.07
## 5     L05     5        551             14.9                8.8       53.19
## 6     L06     6        490             16.9               10.0       34.83
## 7     L07     7        479             20.0                7.7       41.29
## 8     L08     8        681             14.3                7.1       69.85
## 9     L09     9        668             16.7                7.6       89.07
## 10    L10    10        714             14.0                6.5       44.04
## 11    L11    11        713             17.4                6.0       93.82
## 12    L12    12        731             21.5               11.1       37.83
## 13    L13    13        887             18.5               10.2       28.77
## 14    L14    14        755             17.6                9.5       64.80
## 15    L15    15        848             20.2                9.9       28.43
##    SyntacticSimplicity WordConcreteness ReferentialCohesion DeepCohesion
## 1                70.19            60.64               24.51        52.79
## 2                68.79            95.91               59.87        78.81
## 3                54.38            29.81                6.43        68.08
## 4                56.36            47.21               27.76        19.49
## 5                72.24            58.32               13.57        54.78
## 6                63.31            57.93                5.82        85.54
## 7                33.72            84.38               93.57        51.99
## 8                82.38            35.20                3.22        81.59
## 9                34.46            89.62               32.64        58.71
## 10               75.49            71.90               19.49        46.81
## 11               50.80            57.53               47.61        65.54
## 12               35.57            43.64               20.90        81.86
## 13               59.48            32.28               18.41        42.47
## 14               42.47            65.91               50.40        68.79
## 15               34.46            84.61               15.15        74.22
##    VerbCohesion Connectivity Temporality   TTR AverageWordLevel_SVL
## 1         36.69         1.83       55.17 0.436                1.302
## 2         30.85         0.04       38.21 0.468                1.379
## 3         79.39        15.39       15.87 0.494                1.479
## 4         44.43        21.19        3.51 0.451                1.470
## 5         50.80         3.22       33.36 0.478                1.549
## 6         55.17         0.00       73.89 0.516                1.556
## 7         64.06         9.51       23.58 0.403                1.591
## 8         46.41         0.03       69.15 0.430                1.311
## 9         18.14         4.27       70.88 0.453                1.378
## 10        77.64         4.75        7.93 0.424                1.347
## 11        48.40         0.03       56.75 0.400                1.286
## 12        52.79         0.64       28.10 0.476                1.413
## 13        81.33         3.14       46.41 0.417                1.334
## 14        39.74         2.68       53.98 0.406                1.371
## 15        57.53         1.62       44.43 0.435                1.638
##    AverageWordLevel_JACET
## 1                   1.203
## 2                   1.128
## 3                   1.227
## 4                   1.225
## 5                   1.201
## 6                   1.248
## 7                   1.153
## 8                   1.210
## 9                   1.188
## 10                  1.193
## 11                  1.178
## 12                  1.240
## 13                  1.141
## 14                  1.087
## 15                  1.347

使用した指標間の相関係数

# Pearson correlations among all indices; the first two columns
# (Lesson, Order) are identifiers, not indices, so they are dropped.
cor(dat[, -c(1, 2)])
##                        TotalWords WordsPerSentence FleschKincaidGrade
## TotalWords              1.0000000          0.26840             0.1109
## WordsPerSentence        0.2683972          1.00000             0.6235
## FleschKincaidGrade      0.1109120          0.62347             1.0000
## Narrativity            -0.0004597         -0.41605            -0.7237
## SyntacticSimplicity    -0.2138468         -0.81773            -0.3728
## WordConcreteness       -0.0879400          0.10812            -0.2967
## ReferentialCohesion    -0.2128517          0.34466            -0.2655
## DeepCohesion            0.1711794          0.13326             0.1111
## VerbCohesion            0.2097802          0.12052             0.2372
## Connectivity           -0.5050867          0.02708             0.1937
## Temporality             0.2725683         -0.12178            -0.1731
## TTR                    -0.4633655         -0.09273             0.4578
## AverageWordLevel_SVL   -0.2975016          0.42909             0.4966
## AverageWordLevel_JACET  0.0411757          0.16983             0.2570
##                        Narrativity SyntacticSimplicity WordConcreteness
## TotalWords              -0.0004597            -0.21385         -0.08794
## WordsPerSentence        -0.4160518            -0.81773          0.10812
## FleschKincaidGrade      -0.7237178            -0.37280         -0.29667
## Narrativity              1.0000000             0.09259          0.24091
## SyntacticSimplicity      0.0925942             1.00000         -0.30375
## WordConcreteness         0.2409067            -0.30375          1.00000
## ReferentialCohesion      0.2283145            -0.42307          0.58626
## DeepCohesion             0.0546209            -0.05241          0.08642
## VerbCohesion            -0.7106392             0.10435         -0.47373
## Connectivity            -0.3241995            -0.13621         -0.23602
## Temporality              0.5159523             0.02016          0.09565
## TTR                     -0.3450640             0.16767         -0.16555
## AverageWordLevel_SVL    -0.6209558            -0.37570          0.24128
## AverageWordLevel_JACET  -0.3602193            -0.11939         -0.09130
##                        ReferentialCohesion DeepCohesion VerbCohesion
## TotalWords                        -0.21285      0.17118       0.2098
## WordsPerSentence                   0.34466      0.13326       0.1205
## FleschKincaidGrade                -0.26553      0.11107       0.2372
## Narrativity                        0.22831      0.05462      -0.7106
## SyntacticSimplicity               -0.42307     -0.05241       0.1044
## WordConcreteness                   0.58626      0.08642      -0.4737
## ReferentialCohesion                1.00000     -0.14457      -0.2299
## DeepCohesion                      -0.14457      1.00000      -0.1627
## VerbCohesion                      -0.22985     -0.16267       1.0000
## Connectivity                       0.06145     -0.66844       0.2387
## Temporality                       -0.13489      0.53770      -0.4807
## TTR                               -0.51110      0.33914      -0.0356
## AverageWordLevel_SVL               0.03859      0.05921       0.2154
## AverageWordLevel_JACET            -0.57120      0.17724       0.1553
##                        Connectivity Temporality      TTR
## TotalWords                 -0.50509     0.27257 -0.46337
## WordsPerSentence            0.02708    -0.12178 -0.09273
## FleschKincaidGrade          0.19368    -0.17310  0.45775
## Narrativity                -0.32420     0.51595 -0.34506
## SyntacticSimplicity        -0.13621     0.02016  0.16767
## WordConcreteness           -0.23602     0.09565 -0.16555
## ReferentialCohesion         0.06145    -0.13489 -0.51110
## DeepCohesion               -0.66844     0.53770  0.33914
## VerbCohesion                0.23868    -0.48073 -0.03560
## Connectivity                1.00000    -0.69182  0.08627
## Temporality                -0.69182     1.00000 -0.01161
## TTR                         0.08627    -0.01161  1.00000
## AverageWordLevel_SVL        0.27754    -0.24476  0.37486
## AverageWordLevel_JACET      0.06394    -0.05035  0.40429
##                        AverageWordLevel_SVL AverageWordLevel_JACET
## TotalWords                         -0.29750                0.04118
## WordsPerSentence                    0.42909                0.16983
## FleschKincaidGrade                  0.49664                0.25703
## Narrativity                        -0.62096               -0.36022
## SyntacticSimplicity                -0.37570               -0.11939
## WordConcreteness                    0.24128               -0.09130
## ReferentialCohesion                 0.03859               -0.57120
## DeepCohesion                        0.05921                0.17724
## VerbCohesion                        0.21541                0.15534
## Connectivity                        0.27754                0.06394
## Temporality                        -0.24476               -0.05035
## TTR                                 0.37486                0.40429
## AverageWordLevel_SVL                1.00000                0.51639
## AverageWordLevel_JACET              0.51639                1.00000

レッスン配置順と指標の順位相関係数

# Spearman rank correlation between the lesson order (column 2) and
# every index (all columns except Lesson and Order).
cor(dat[[2]], dat[, -c(1, 2)], method = "spearman")
##      TotalWords WordsPerSentence FleschKincaidGrade Narrativity
## [1,]      0.825           0.6179             0.3107     -0.2036
##      SyntacticSimplicity WordConcreteness ReferentialCohesion DeepCohesion
## [1,]              -0.429          0.01071             0.01071       0.1143
##      VerbCohesion Connectivity Temporality     TTR AverageWordLevel_SVL
## [1,]       0.2893      -0.1805      0.1607 -0.5036             -0.05714
##      AverageWordLevel_JACET
## [1,]               -0.07857

主成分分析

1回目

# Keep only the index columns (drop Lesson and Order).
dat2 <- dat[-(1:2)]
# PCA on the correlation matrix (cor = TRUE), i.e. on standardized variables.
pc.cr <- princomp(dat2, cor = TRUE)
# Show the importance of each component together with the loadings.
summary(pc.cr, loadings = TRUE)
## Importance of components:
##                        Comp.1 Comp.2 Comp.3 Comp.4  Comp.5  Comp.6  Comp.7
## Standard deviation     1.9676 1.6302 1.5995 1.3594 0.96137 0.93267 0.67013
## Proportion of Variance 0.2765 0.1898 0.1828 0.1320 0.06602 0.06213 0.03208
## Cumulative Proportion  0.2765 0.4664 0.6491 0.7811 0.84714 0.90927 0.94135
##                         Comp.8  Comp.9  Comp.10 Comp.11  Comp.12   Comp.13
## Standard deviation     0.59852 0.45499 0.369957 0.30050 0.141140 0.0741129
## Proportion of Variance 0.02559 0.01479 0.009776 0.00645 0.001423 0.0003923
## Cumulative Proportion  0.96693 0.98172 0.991498 0.99795 0.999371 0.9997632
##                          Comp.14
## Standard deviation     0.0575775
## Proportion of Variance 0.0002368
## Cumulative Proportion  1.0000000
## 
## Loadings:
##                        Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## TotalWords                            0.287  0.590  0.164 -0.160  0.322
## WordsPerSentence        0.253  0.447  0.178  0.156 -0.182        -0.133
## FleschKincaidGrade      0.397         0.205        -0.444         0.394
## Narrativity            -0.460               -0.107 -0.117 -0.329 -0.209
## SyntacticSimplicity    -0.145 -0.507 -0.119         0.144  0.322  0.226
## WordConcreteness       -0.154  0.385        -0.324  0.445  0.109  0.467
## ReferentialCohesion    -0.139  0.494 -0.238                0.327 -0.198
## DeepCohesion                          0.516 -0.120         0.373 -0.430
## VerbCohesion            0.296 -0.147 -0.112  0.402  0.293  0.348 -0.316
## Connectivity            0.264        -0.454 -0.112 -0.181 -0.309 -0.156
## Temporality            -0.284         0.422        -0.142              
## TTR                     0.229 -0.265  0.201 -0.444 -0.237  0.117  0.111
## AverageWordLevel_SVL    0.365  0.178        -0.306  0.307              
## AverageWordLevel_JACET  0.267 -0.126  0.252 -0.142  0.469 -0.509 -0.207
##                        Comp.8 Comp.9 Comp.10 Comp.11 Comp.12 Comp.13
## TotalWords              0.112 -0.237  0.163           0.286  -0.442 
## WordsPerSentence        0.121  0.409 -0.283           0.480         
## FleschKincaidGrade                    0.205                   0.386 
## Narrativity                                  -0.485   0.393   0.349 
## SyntacticSimplicity            0.506  0.138           0.477         
## WordConcreteness        0.138 -0.244          0.289   0.157   0.326 
## ReferentialCohesion            0.183                         -0.392 
## DeepCohesion            0.311 -0.120  0.507   0.125                 
## VerbCohesion           -0.127 -0.328 -0.319           0.148   0.343 
## Connectivity                  -0.297  0.372   0.411   0.392         
## Temporality            -0.704        -0.172   0.395                 
## TTR                     0.243 -0.305 -0.471           0.223  -0.340 
## AverageWordLevel_SVL   -0.490         0.261  -0.519   0.160  -0.143 
## AverageWordLevel_JACET  0.155  0.312          0.185  -0.104         
##                        Comp.14
## TotalWords              0.151 
## WordsPerSentence       -0.347 
## FleschKincaidGrade      0.472 
## Narrativity             0.290 
## SyntacticSimplicity     0.106 
## WordConcreteness              
## ReferentialCohesion     0.575 
## DeepCohesion                  
## VerbCohesion            0.213 
## Connectivity                  
## Temporality                   
## TTR                     0.105 
## AverageWordLevel_SVL   -0.114 
## AverageWordLevel_JACET  0.365 
# Plot loadings and component scores together in one biplot.
biplot(pc.cr, xlim = c(-0.6, 0.6), ylim = c(-0.6, 0.6))
# Dotted (lty = 3) reference lines through the origin: vertical and horizontal.
abline(v = 0, h = 0, lty = 3)

plot of chunk unnamed-chunk-4

2回目

# Column positions excluded from the second PCA:
#   1 Lesson, 2 Order, 3 TotalWords,
#   10 DeepCohesion, 11 VerbCohesion, 12 Connectivity, 13 Temporality, 14 TTR,
#   16 AverageWordLevel_JACET
delete <- c(1:3, 10:14, 16)
dat3 <- dat[, -delete]
# PCA on the correlation matrix of the remaining seven indices.
pc.cr <- princomp(dat3, cor = TRUE)
# Show the importance of each component together with the loadings.
summary(pc.cr, loadings = TRUE)
## Importance of components:
##                        Comp.1 Comp.2 Comp.3 Comp.4  Comp.5 Comp.6  Comp.7
## Standard deviation     1.7378 1.4692 0.9472 0.6582 0.50654 0.3834 0.29588
## Proportion of Variance 0.4314 0.3084 0.1282 0.0619 0.03666 0.0210 0.01251
## Cumulative Proportion  0.4314 0.7398 0.8679 0.9298 0.96649 0.9875 1.00000
## 
## Loadings:
##                      Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7
## WordsPerSentence     -0.504  0.158 -0.368               -0.241  0.716
## FleschKincaidGrade   -0.464 -0.312 -0.149         0.466  0.647 -0.167
## Narrativity           0.405  0.352 -0.304  0.479 -0.136  0.529  0.303
## SyntacticSimplicity   0.414 -0.333  0.366 -0.412  0.231  0.231  0.554
## WordConcreteness             0.547  0.497  0.248  0.613 -0.116       
## ReferentialCohesion          0.586        -0.696 -0.156  0.340 -0.155
## AverageWordLevel_SVL -0.430         0.607  0.213 -0.550  0.249  0.191
# Plot loadings and component scores together in one biplot.
biplot(pc.cr, xlim = c(-0.6, 0.6), ylim = c(-0.6, 0.6))
# Dotted (lty = 3) reference lines through the origin: vertical and horizontal.
abline(v = 0, h = 0, lty = 3)

plot of chunk unnamed-chunk-5


クラスター分析

# Append the scores on the first two principal components to dat3.
dat3$PCA1 <- pc.cr$scores[, 1]
dat3$PCA2 <- pc.cr$scores[, 2]
# Hierarchical clustering of the lessons on those two component scores.
z <- dat3[, c("PCA1", "PCA2")]
z.d <- dist(z)^2  # squared Euclidean distance
# "ward.D" is the current name of the method formerly called "ward"
# (renamed in R 3.1.0; the old name still works but emits a message).
# It is applied to squared distances here, as in the original analysis.
result <- hclust(z.d, method = "ward.D")
plot(result)  # draw the dendrogram

plot of chunk unnamed-chunk-6