堀辰雄と泉鏡花の文体比較

まず堀辰雄と泉鏡花について

文の長さに違いがあるかどうかを調べてみたいと思います。

堀の場合

まずファイルをダウンロードして、解凍し、ルビ取りをします。

青空文庫から新字新仮名でほぼ同じサイズのファイルを、適当に選び、そのURLを調べます。

ほととぎす 1988年サイズ 27856 http://www.aozora.gr.jp/cards/001030/files/4800_ruby_14470.zip

かげろうの日記 1977年サイズ 27852 http://www.aozora.gr.jp/cards/001030/files/4801_ruby_14468.zip

楡の家 1969年サイズ 29019 http://www.aozora.gr.jp/cards/001030/files/4848_ruby_14152.zip

幼年時代 1955年サイズ 35137 http://www.aozora.gr.jp/cards/001030/files/4818_ruby_14385.zip

サイズが比較的近い4つに限定して解析します

RStudio上でダウンロード、解凍、ルビ取りを行います。

この際、出力ファイル名に作家の頭文字と作品年代を追記します。


# Setup: load the helper script used to download from Aozora Bunko, unzip the
# archive, and strip ruby annotations.
source("/var/data/AozoraURL.R")

# Aozora(): first argument is the zip URL, second argument is the name to use
# for the file after ruby removal. Each call is followed by its recorded
# output, which shows the resulting path under ./NORUBY.
# The "T_" prefix and the year in each name follow the naming rule stated in
# the text (author initial + year).
Aozora("http://www.aozora.gr.jp/cards/001030/files/4800_ruby_14470.zip", "T_1988Hoto")

## [1] "./NORUBY/T_1988Hoto2.txt"

Aozora("http://www.aozora.gr.jp/cards/001030/files/4801_ruby_14468.zip", "T_1977Kagerou")

## [1] "./NORUBY/T_1977Kagerou2.txt"

Aozora("http://www.aozora.gr.jp/cards/001030/files/4848_ruby_14152.zip", "T_1992Nire")

## [1] "./NORUBY/T_1992Nire2.txt"

Aozora("http://www.aozora.gr.jp/cards/001030/files/4818_ruby_14385.zip", "T_1955Younen")

## [1] "./NORUBY/T_1955Younen2.txt"


# Remember the current working directory and print it for confirmation.
folder <- getwd()
print(folder)

## [1] "/home/c101103029"

まとめて解析する

library(RMeCab)

# The ruby-stripped texts live in the NORUBY folder below the current folder.
tmp <- file.path(folder, "NORUBY")
setwd(tmp)  # work inside that folder so files can be read by bare name
txts <- list.files()  # every file found in the folder
hori <- data.frame()  # accumulates one row of counts per text

for (fname in txts) {
    # Total number of characters over all lines of the text.
    n_chars <- sum(nchar(readLines(fname)))

    # Frequency table from RMeCabFreq(); its Info2 and Freq columns are used.
    freq <- RMeCabFreq(fname)
    n_words <- sum(freq$Freq)  # all tokens, punctuation rows included
    n_kuten <- sum(freq$Freq[freq$Info2 == "句点"])
    n_toten <- sum(freq$Freq[freq$Info2 == "読点"])

    counts <- data.frame(text = fname, chars = n_chars, words = n_words,
        kuten = n_kuten, toten = n_toten)
    hori <- rbind(hori, counts)
}

## file = T_1955Younen2.txt 
## length = 2671 
## file = T_1977Kagerou2.txt 
## length = 2114 
## file = T_1988Hoto2.txt 
## length = 2168 
## file = T_1992Nire2.txt 
## length = 2351


# Inspect the first rows (up to six) of the per-text counts.
head(hori, n = 6L)

##                 text chars words kuten toten
## 1  T_1955Younen2.txt 37078 23950   641  1423
## 2 T_1977Kagerou2.txt 30901 19760   457  1101
## 3    T_1988Hoto2.txt 30260 19438   439  1085
## 4    T_1992Nire2.txt 31006 19969   533   933

文の長さの中央値を求めてみます。

# Median over texts of words divided by (kuten + toten).
with(hori, median(words/(kuten + toten)))

## [1] 12.72

# Same ratio using the character count instead of the word count.
with(hori, median(chars/(kuten + toten)))

## [1] 19.84

一文あたりの単語数は 12.7188、また文字数は 19.8447 とわかります。

結果をプロットしてみます。

# Line plot of words / (kuten + toten), one point per row of hori.
# NOTE(review): the title reads "words per sentence", but the denominator
# also counts commas (toten) — confirm this is the intended measure.
plot(hori$words/(hori$kuten + hori$toten), main = "一文の単語数", xlab = "作品", 
    type = "l")

plot of chunk unnamed-chunk-4

# Line plot of chars / (kuten + toten), one point per row of hori.
# NOTE(review): the title reads "characters per sentence", but the
# denominator also counts commas (toten) — confirm this is intended.
plot(hori$chars/(hori$kuten + hori$toten), main = "一文の文字数", xlab = "作品", 
    type = "l")

plot of chunk unnamed-chunk-4

# Remove the folder holding the Hori Tatsuo texts (and everything in it) so
# the next author starts clean. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
unlink(tmp, recursive = TRUE)

泉鏡花の場合

まずファイルの選定、ダウンロードと処理

海神別荘 1942年サイズ29009 http://www.aozora.gr.jp/cards/000050/files/3244_ruby_24280.zip

白金之絵図 1942年　サイズ31198 http://www.aozora.gr.jp/cards/000050/files/3656_ruby_26029.zip

神鷺之巻 1933年　サイズ33523 http://www.aozora.gr.jp/cards/000050/files/3660_ruby_22278.zip

政談十二社 1996年　サイズ29547 http://www.aozora.gr.jp/cards/000050/files/4559_ruby_26032.zip

この中からサイズが比較的近い4つに限定して解析します

# Load the helper script that defines Aozora().
source("/var/data/AozoraURL.R")

# Fetch the four Izumi Kyoka texts; each call is followed by its recorded
# output, which shows the resulting path under ./NORUBY.
Aozora("http://www.aozora.gr.jp/cards/000050/files/3244_ruby_24280.zip", "K_1942kaijin")

## [1] "./NORUBY/K_1942kaijin2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/3656_ruby_26029.zip", "K_1942sirogane")

## [1] "./NORUBY/K_1942sirogane2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/3660_ruby_22278.zip", "K_1933sinro")

## [1] "./NORUBY/K_1933sinro2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/4559_ruby_26032.zip", "K_1996seidan")

## [1] "./NORUBY/K_1996seidan2.txt"



# Remember the current working directory and print it for confirmation.
folder <- getwd()
print(folder)

## [1] "/home/c101103029"

まとめて解析する

library(RMeCab)
tmp <- paste(folder, "NORUBY", sep = "/")  # texts are in NORUBY below the current folder
setwd(tmp)  # move into that folder so files can be read by bare name
txts <- dir()  # names of all files in the folder
kyo <- data.frame()  # accumulates one row of counts per text

for (i in txts) {
    x <- sum(nchar(readLines(i)))  # total characters over all lines
    y <- RMeCabFreq(i)  # frequency table; Info2 and Freq columns are used
    kuten <- y[y$Info2 == "句点", ]
    toten <- y[y$Info2 == "読点", ]
    z <- sum(y$Freq)  # all tokens, punctuation rows included
    # Sum the frequencies so every text contributes exactly one row: with zero
    # matching rows data.frame() would fail on a length-0 column, and with
    # several matching rows it would emit several rows for the same text.
    kyo <- rbind(kyo, data.frame(text = i, chars = x, words = z, kuten = sum(kuten$Freq), 
        toten = sum(toten$Freq)))
}

## file = K_1933sinro2.txt 
## length = 3322 
## file = K_1942kaijin2.txt 
## length = 2668 
## file = K_1942sirogane2.txt 
## length = 3084 
## file = K_1996seidan2.txt 
## length = 2870

# Inspect the first rows (up to six) of the per-text counts.

head(kyo, n = 6L)

##                  text chars words kuten toten
## 1    K_1933sinro2.txt 26644 18775   704  1999
## 2   K_1942kaijin2.txt 22635 15966   955  1424
## 3 K_1942sirogane2.txt 23371 16545   632  1768
## 4   K_1996seidan2.txt 24626 16647   353  1389

文の長さの中央値を求めてみます。

# Median over texts of words divided by (kuten + toten).
with(kyo, median(words/(kuten + toten)))

## [1] 6.92

# Same ratio using the character count instead of the word count.
with(kyo, median(chars/(kuten + toten)))

## [1] 9.798

一文あたりの単語数は 6.9199、また文字数は 9.7976 とわかります。

結果をプロットしてみます。

# Line plot of words / (kuten + toten), one point per row of kyo.
# NOTE(review): the title reads "words per sentence", but the denominator
# also counts commas (toten) — confirm this is the intended measure.
plot(kyo$words/(kyo$kuten + kyo$toten), main = "一文の単語数", xlab = "作品", 
    type = "l")

plot of chunk unnamed-chunk-9

# Line plot of chars / (kuten + toten), one point per row of kyo.
# NOTE(review): the title reads "characters per sentence", but the
# denominator also counts commas (toten) — confirm this is intended.
plot(kyo$chars/(kyo$kuten + kyo$toten), main = "一文の文字数", xlab = "作品", 
    type = "l")

plot of chunk unnamed-chunk-9

# Remove the folder holding the Izumi Kyoka texts (and everything in it).
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
unlink(tmp, recursive = TRUE)

堀と泉の文長に違いがあるか調べます。

# Side-by-side boxplots of words / (kuten + toten): Izumi (kyo) first, then
# Hori (hori). The group labels must be passed as `names`; the misspelt
# `name` is not a boxplot() argument, so the labels were not applied.
boxplot(kyo$words/(kyo$kuten + kyo$toten), hori$words/(hori$kuten + hori$toten), 
    names = c("泉", "堀"))

plot of chunk unnamed-chunk-11

この結果を見る限り、二人の文長に差があるとはいえません。

二人の比較

視点を変えて、二人の作家の助詞および読点の使い分けに差があるかどうかを調べます。

堀と泉、それぞれのファイルを読み込みます

# Load the helper script that defines Aozora().
source("/var/data/AozoraURL.R")

# Fetch all eight texts again (four "T_" files, then four "K_" files); each
# call is followed by its recorded output, showing the path under ./NORUBY.
Aozora("http://www.aozora.gr.jp/cards/001030/files/4800_ruby_14470.zip", "T_1988Hoto")

## [1] "./NORUBY/T_1988Hoto2.txt"

Aozora("http://www.aozora.gr.jp/cards/001030/files/4801_ruby_14468.zip", "T_1977Kagerou")

## [1] "./NORUBY/T_1977Kagerou2.txt"

Aozora("http://www.aozora.gr.jp/cards/001030/files/4848_ruby_14152.zip", "T_1992Nire")

## [1] "./NORUBY/T_1992Nire2.txt"

Aozora("http://www.aozora.gr.jp/cards/001030/files/4818_ruby_14385.zip", "T_1955Younen")

## [1] "./NORUBY/T_1955Younen2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/3244_ruby_24280.zip", "K_1942kaijin")

## [1] "./NORUBY/K_1942kaijin2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/3656_ruby_26029.zip", "K_1942sirogane")

## [1] "./NORUBY/K_1942sirogane2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/3660_ruby_22278.zip", "K_1933sinro")

## [1] "./NORUBY/K_1933sinro2.txt"

Aozora("http://www.aozora.gr.jp/cards/000050/files/4559_ruby_26032.zip", "K_1996seidan")

## [1] "./NORUBY/K_1996seidan2.txt"




# Remember the current working directory and print it for confirmation.
folder <- getwd()
print(folder)

## [1] "/home/c101103029"

まとめて解析する

library(RMeCab)

# The ruby-stripped texts live in the NORUBY folder below the current folder.
tmp <- file.path(folder, "NORUBY")
setwd(tmp)  # switch into that folder

# Extract character N-grams from every file in the folder (the recorded
# output below reports Ngram = 2 for each file).
x <- docNgram(tmp, type = 0)

## file = /home/c101103029/NORUBY/K_1933sinro2.txt Ngram = 2 
## length = 9449 
## 
## file = /home/c101103029/NORUBY/K_1942kaijin2.txt Ngram = 2 
## length = 8193 
## 
## file = /home/c101103029/NORUBY/K_1942sirogane2.txt Ngram = 2 
## length = 8933 
## 
## file = /home/c101103029/NORUBY/K_1996seidan2.txt Ngram = 2 
## length = 8343 
## 
## file = /home/c101103029/NORUBY/T_1955Younen2.txt Ngram = 2 
## length = 8166 
## 
## file = /home/c101103029/NORUBY/T_1977Kagerou2.txt Ngram = 2 
## length = 6258 
## 
## file = /home/c101103029/NORUBY/T_1988Hoto2.txt Ngram = 2 
## length = 6460 
## 
## file = /home/c101103029/NORUBY/T_1992Nire2.txt Ngram = 2 
## length = 7090

読み込んだ中から、助詞と読点の組み合わせを幾つか抽出する

# Keep only the rows for the selected particle + comma bigrams.
# drop = FALSE preserves the two-dimensional shape even when only one of the
# labels is present, so the later t(x) still yields one row per text.
x <- x[rownames(x) %in% c("[と-、]", "[て-、]", "[は-、]", "[が-、]", 
    "[で-、]", "[に-、]", "[ら-、]", "[も-、]"), , drop = FALSE]

主成分分析を行なってみます。

# Transpose so that each text is a row and each selected bigram a column,
# then run principal component analysis. Note that x is overwritten: from
# here on it holds the princomp result, not the count table.
x <- princomp(t(x))

結果をプロットします。

# Biplot of the princomp result stored in x.
biplot(x)

plot of chunk unnamed-chunk-16

結論

この図を見ると、主に泉鏡花の作品は上部に、堀辰雄の作品は下の方に分布しています。泉は「は、」や「も、」を頻繁に使っていることがうかがえます。一方、堀の場合、「が、」や「と、」や「で、」を好んで多く使用していることが分かります。

# Remove the folder holding the downloaded texts (and everything in it).
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
unlink(tmp, recursive = TRUE)