テキストファイル名を引数にして、単語の出現頻度と相対頻度の行列データを出力するgetFreqMtx関数を作成しなさい。
"osaka-u.txt"を使用して、正しく実行できるか確認すること。
getFreqMtx.R
#' Build a table of raw and relative word frequencies from a text file.
#'
#' Each line is split on whitespace and punctuation, tokens are lower-cased,
#' and empty tokens are dropped before counting.
#'
#' @param filename Path to the text file to read.
#' @return A data frame with columns `term` (character), `raw` (count) and
#'   `relative` (share of all tokens, rounded to 3 decimals), ordered by
#'   decreasing `raw`. Row names are each term's index in the sorted term
#'   list produced by `table()`.
getFreqMtx <- function(filename) {
  # "UTF-8" is the spelling readLines() documents for its encoding argument.
  txt <- readLines(filename, encoding = "UTF-8")
  wordLst <- unlist(strsplit(txt, "[[:space:]]|[[:punct:]]"))
  wordLst <- tolower(wordLst)
  wordLst <- wordLst[wordLst != ""]

  # Pull terms and counts out of the table as plain vectors so the result
  # does not depend on how data.frame() expands a table object.
  counts <- table(wordLst)
  raw <- as.vector(counts)
  freqMtx <- data.frame(
    term = as.character(names(counts)),
    raw = raw,
    relative = round(raw / sum(raw), 3),
    stringsAsFactors = FALSE
  )

  # Radix ordering is stable, so terms with equal counts keep the sorted
  # term order they have in `counts`.
  freqOrder <- order(freqMtx$raw, decreasing = TRUE, method = "radix")
  freqMtx[freqOrder, ]
}
# Same statement as the conversion step inside getFreqMtx: store the `term`
# column as character.
freqMtx$term <- as.character(freqMtx$term)
# Run the script file getFreqMtx.R.
source("getFreqMtx.R")
## term raw relative
## 261 the 41 0.060
## 23 and 33 0.048
## 183 of 30 0.044
## 281 university 28 0.041
## 194 osaka 23 0.033
## 270 to 23 0.033
## 9 a 15 0.022
## 26 as 13 0.019
## 127 in 12 0.017
## 93 for 10 0.015
変数relativeで分岐
# Count the words, most frequent first; when `relative` is switched on the
# counts are replaced by shares of the total, rounded to three decimals.
freq <- sort(table(wordLst), decreasing = TRUE)
relative <- FALSE
if (relative) {
  freq <- round(freq / sum(freq), digits = 3)
}
getFreqMtx2.R
#' Build a one-column word-frequency table for a text file.
#'
#' Each line is split on whitespace and punctuation, tokens are lower-cased,
#' and empty tokens are dropped before counting.
#'
#' @param filename Path to the text file to read.
#' @param relative If `TRUE`, report each term's share of all tokens (rounded
#'   to 3 decimals) instead of its raw count. Defaults to `FALSE`.
#' @return A data frame with a character column `term` and a frequency column
#'   named after the file (base name without extension), ordered by
#'   decreasing count, with row names 1..n.
getFreqMtx2 <- function(filename, relative = FALSE) {
  # "UTF-8" is the spelling readLines() documents for its encoding argument.
  txt <- readLines(filename, encoding = "UTF-8")
  wordLst <- unlist(strsplit(txt, "[[:space:]]|[[:punct:]]"))
  wordLst <- tolower(wordLst)
  wordLst <- wordLst[wordLst != ""]

  # Work on plain vectors so the result does not depend on how data.frame()
  # expands a table object.
  counts <- table(wordLst)
  terms <- as.character(names(counts))
  freq <- as.vector(counts)

  # Order by the raw counts before any rounding; radix ordering is stable, so
  # ties keep the sorted term order of `counts`.
  freqOrder <- order(freq, decreasing = TRUE, method = "radix")
  terms <- terms[freqOrder]
  freq <- freq[freqOrder]

  if (relative) {
    freq <- round(freq / sum(freq), 3)
  }

  freqMtx <- data.frame(term = terms, freq = freq, stringsAsFactors = FALSE)

  # Rename only after ordering: once the column carries the file label it can
  # no longer be addressed as `freq`.
  label <- tools::file_path_sans_ext(basename(filename))
  names(freqMtx) <- c("term", label)

  # Reset to automatic row names 1..n (also valid for an empty table).
  rownames(freqMtx) <- NULL
  freqMtx
}
# Same statement as inside getFreqMtx2: replace the row names with a running
# number over the rows.
rownames(freqMtx) <- seq(1:nrow(freqMtx))
# Derive the column label from the file path: split on "/" and "." and take
# the second-to-last piece.
label = ""
fpathL = unlist(strsplit(filename, "/|\\."))
label = fpathL[length(fpathL) - 1]
# NOTE(review): getFreqMtx2 applies these names to `freqMtx`; this line uses
# `freqData` - confirm which object is meant.
names(freqData) <- c("term", label)
# Derive a label from a file path: split the path on "/" and "." and keep the
# second-to-last piece (here the file name without its extension).
f <- "../testData/test2.txt"
fpathL <- strsplit(f, "/|\\.")[[1]]
label <- fpathL[length(fpathL) - 1]
label
## [1] "test2"
# Run the script file getFreqMtx2.R.
source("getFreqMtx2.R")
# Default call: the second argument `relative` is left at FALSE.
getFreqMtx2("Data/test1.txt")
## term test1
## 1 c 13
## 2 e 7
## 3 b 4
## 4 a 3
# Same file with relative = TRUE.
getFreqMtx2("Data/test1.txt", relative = TRUE)
## term test1
## 1 c 0.481
## 2 e 0.259
## 3 b 0.148
## 4 a 0.111
# Keep the table for osaka-u.txt in `res`; it is used by the plots below.
res <- getFreqMtx2("osaka-u.txt")
# Number of rows and columns of `res`.
dim(res)
## [1] 305 2
# Plot the second column of `res` against its row names, first on linear axes
# and then on log-log axes with fixed axis limits.
title <- "Word Frequency Distribution"
xlabel <- "Rank"
ylabel <- "Frequency"
plot(
  rownames(res), res[, 2],
  pch = 8, col = "darkgreen",
  main = title, xlab = xlabel, ylab = ylabel
)
plot(
  rownames(res), res[, 2],
  xlim = c(1, nrow(res)), ylim = c(1, 100), log = "xy",
  pch = 8, col = "darkgreen",
  main = title, xlab = xlabel, ylab = ylabel
)
# First term in `res`.
res$term[1]
## [1] "the"
# Number of characters in that term.
nchar(res$term[1])
## [1] 3
# Apply nchar to every term; the result is a list with one element per term.
lapply(res$term, nchar)
# Only the first three elements of that list.
lapply(res$term, nchar)[1:3]
# Length in characters of every term. nchar() is vectorised, so it is applied
# to the whole column at once and returns an integer vector even when there
# are no terms (unlist(lapply(...)) would return NULL in that case).
charlen <- nchar(res$term)
charlen[1:5]
## [1] 3 3 2 10 5
# Number of terms for each word length.
charlenF <- table(charlen)
charlenF
## charlen
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 4 18 25 49 35 31 42 36 22 16 17 4 6
# Plot the word-length table as points joined by lines; the axis limits come
# from the number of entries in the table and its largest count.
title <- "Word Length Frequency Distribution (Types)"
xlabel <- "Word Length"
ylabel <- "Frequency"
xmax <- length(charlenF)
ymax <- max(charlenF)
plot(
  charlenF,
  type = "b", pch = 8, col = "orange",
  xlim = c(1, xmax), ylim = c(1, ymax),
  main = title, xlab = xlabel, ylab = ylabel
)
\[ \mathrm{Frequency} = \frac{K}{\mathrm{Rank}^{A}} \] （K, A: 定数）
# Constants K and A of the curve Frequency = K / Rank^A.
K <- ymax
A <- 0.75
K
## [1] 49
# One rank per row of `res`. seq_len() gives an empty sequence for zero rows,
# whereas seq(1:0) would give 1:2.
rank <- seq_len(nrow(res))
# Arithmetic is vectorised, so the curve is evaluated for all ranks at once.
zipf <- K / rank^A
## [1] 49.000 29.136 21.496 17.324 14.654 12.782 11.386 10.301 9.430 8.714
# Draw the Zipf curve and the observed frequencies in one log-log plot.
title <- "Zipf's Law"
xlabel <- "Rank"
ylabel <- "Frequency"
# The curve sets up the axes. The x range spans all ranks in `res` so that the
# curve and the observed points share one coordinate system.
plot(
  zipf,
  log = "xy", type = "l", col = "red",
  xlim = c(1, nrow(res)), ylim = c(1, 100),
  main = title, xlab = xlabel, ylab = ylabel
)
# points() adds to the existing axes; a second plot() after par(new = TRUE)
# would set up its own axes and overprint the first ones.
points(seq_len(nrow(res)), res[, 2], pch = 8, col = "darkgreen")
legend(
  20, 100,
  c("Frequency", "Zipf's\nlaw"),
  lty = c(NA, 1), pch = c(8, NA),
  col = c("darkgreen", "red")
)
x軸の範囲:10 〜 Rankの要素数
定数Aの範囲:0.5 〜1.5
をsliderでインタラクティブ操作ができるようにしてください。
# List the files in the data directory.
dirName <- "../testData"
files <- list.files(dirName)
files
## [1] "test2.txt" "test3.txt"
# Prefix every file name with the directory. file.path() returns an empty
# vector when `files` is empty, whereas paste() would produce the bogus path
# "../testData/".
filesDir <- file.path(dirName, files)
filesDir
## [1] "../testData/test2.txt" "../testData/test3.txt"
# One frequency table per file.
freqLst <- lapply(filesDir, getFreqMtx2)
freqLst[1]
## [[1]]
## term test2
## 1 f 11
## 2 g 7
## 3 b 4
## 4 a 2
## 5 c 2
## 6 e 1
# Join all per-file tables on `term`, starting from the first one and keeping
# terms that occur in only some of the files (all = TRUE).
mtx <- Reduce(
  function(acc, tbl) merge(acc, tbl, all = TRUE, by = "term"),
  freqLst[-1],
  freqLst[[1]]
)
# A term missing from a file comes out of the join as NA; record it as 0.
mtx[is.na(mtx)] <- 0
# Sort the rows by term.
mtx <- mtx[order(as.vector(mtx$term)), ]
mtx
## term test2 test3
## 1 a 2 2
## 2 b 4 0
## 3 c 2 3
## 4 d 0 1
## 5 e 1 1
## 6 f 11 9
## 7 g 7 7
## 8 h 0 4
# Use the first column (the terms) as row names.
row.names(mtx) <- mtx[, 1]
mtx
## term test2 test3
## a a 2 2
## b b 4 0
## c c 2 3
## d d 0 1
## e e 1 1
## f f 11 9
## g g 7 7
## h h 0 4
# Drop the first column; the terms are now carried by the row names only.
mtx <- mtx[-1]
mtx
## test2 test3
## a 2 2
## b 4 0
## c 2 3
## d 0 1
## e 1 1
## f 11 9
## g 7 7
## h 0 4
# Save the matrix as CSV.
write.csv(mtx, "testMtx.csv")
# Row totals: the sum over all columns for each term.
rowSums(mtx)
## a b c d e f g h
## 4 4 5 1 2 20 14 4
# Keep only the rows whose total is at least 10.
mtx[rowSums(mtx) >= 10, ]
## test2 test3
## f 11 9
## g 7 7
ただし、オプション引数を指定しない場合（デフォルト）は、素頻度行列が結果として出力されるようにすること。