一行ずつ読み込んで、文字ベクトルに格納
# Read the file into a character vector, one element per line of text.
txt<-readLines("test1.txt")
## [1] "The culture culture Culture and the and and culture and culture "
## [2] "language culture culture culture the the culture culture language"
## [3] "the the culture culture culture language Language culture the"
## [4] ""
# Look at the first line only.
txt[1]
## [1] "The culture culture Culture and the and and culture and culture "
# Number of lines read; the empty last line is counted as well.
length(txt)
## [1] 4
Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
# Split every line wherever a whitespace or punctuation character occurs.
# The result is a list holding one character vector per input line.
wordLst<-strsplit(txt,"[[:space:]]|[[:punct:]]")
## [[1]]
## [1] "The" "culture" "culture" "Culture" "and" "the" "and"
## [8] "and" "culture" "" "" "" "" "and"
## [15] "culture"
##
## [[2]]
## [1] "language" "culture" "culture" "culture" "the" "the"
## [7] "culture" "culture" "language"
##
## [[3]]
## [1] "the" "the" "culture" "culture" "culture" "language"
## [7] "Language" "culture" "the"
##
## [[4]]
## character(0)
# Flatten the per-line list into one character vector of words.
wordLst<-unlist(wordLst)
## [1] "The" "culture" "culture" "Culture" "and" "the"
## [7] "and" "and" "culture" "" "" ""
## [13] "" "and" "culture" "language" "culture" "culture"
## [19] "culture" "the" "the" "culture" "culture" "language"
## [25] "the" "the" "culture" "culture" "culture" "language"
## [31] "Language" "culture" "the"
# Lowercase everything so that "The"/"the" and "Culture"/"culture" are
# counted as the same word.
wordLst<-tolower(wordLst)
## [1] "the" "culture" "culture" "culture" "and" "the"
## [7] "and" "and" "culture" "" "" ""
## [13] "" "and" "culture" "language" "culture" "culture"
## [19] "culture" "the" "the" "culture" "culture" "language"
## [25] "the" "the" "culture" "culture" "culture" "language"
## [31] "language" "culture" "the"
# Drop the empty strings that appear between consecutive delimiters.
wordLst<-wordLst[nchar(wordLst)>0]
# Same filter written as a comparison; it changes nothing after the line above.
wordLst<- wordLst[wordLst != ""]
# Tokens: total number of words.
tokens <- length(wordLst)
## [1] 29
# Types: number of distinct words.
types <- length(unique(wordLst))
## [1] 4
\[TTR=\frac{types}{tokens} \times 100 \]
# Type/token ratio expressed as a percentage.
types/tokens*100
## [1] 13.7931
# The same value rounded to two decimal places and stored in TTR.
TTR <- round(types/tokens*100,2)
## [1] 13.79
# Count how often each word occurs, most frequent first.
freq<-sort(table(wordLst), decreasing=TRUE)
## wordLst
## culture the and language
## 14 7 4 4
全体を1としたときの出現率
# Divide every count by the total number of words to get each word's share.
relative <- freq / sum(freq)
## wordLst
## culture the and language
## 0.4827586 0.2413793 0.1379310 0.1379310
# Sanity check: the shares add up to 1.
sum(relative)
## [1] 1
# Display the shares rounded to two decimal places (the result is not stored).
round(relative,2)
## wordLst
## culture the and language
## 0.48 0.24 0.14 0.14
# Pair every word with its raw count in a data frame.
# NOTE(review): the output below shows freq behaving as a named 1-d array.
# If sort() keeps the "table" class in the R version in use, data.frame() may
# expand freq into separate name/count columns instead - confirm on the
# target R version.
freqData <- data.frame(word=rownames(freq),freq=freq)
## word freq
## culture culture 14
## the the 7
## and and 4
## language language 4
# Same layout, holding the relative frequencies instead of the raw counts.
relativeData <- data.frame(word=rownames(relative),freq=relative)
## word freq
## culture culture 0.4827586
## the the 0.2413793
## and and 0.1379310
## language language 0.1379310
# Save the raw-frequency table to freq-test1.csv.
write.csv(freqData,"freq-test1.csv")
# Join the raw and relative frequencies on the shared "word" column.
# all=TRUE keeps words that occur in only one of the two tables; TRUE is
# spelled out because the shorthand T is an ordinary variable that can be
# reassigned.
freqMtx <- merge(freqData, relativeData, all=TRUE, by="word")
## word freq.x freq.y
## 1 and 4 0.1379310
## 2 culture 14 0.4827586
## 3 language 4 0.1379310
## 4 the 7 0.2413793
# Replace the names generated by merge() (word, freq.x, freq.y) with
# descriptive ones.
names(freqMtx) <- c("term","raw", "relative")
## term raw relative
## 1 and 4 0.1379310
## 2 culture 14 0.4827586
## 3 language 4 0.1379310
## 4 the 7 0.2413793
# Row positions that put the largest raw frequency first.
freqOrder<-order(freqMtx$raw, decreasing=TRUE)
# Reorder the rows of the table with that index vector.
freqMtx <- freqMtx[freqOrder,]
## term raw relative
## 2 culture 14 0.4827586
## 4 the 7 0.2413793
## 1 and 4 0.1379310
## 3 language 4 0.1379310
# The raw column on its own, now in descending order.
freqMtx$raw
## wordLst
## culture the and language
## 14 7 4 4
# sort() returns the values themselves, ascending by default.
sort(freqMtx$raw)
## wordLst
## and language the culture
## 4 4 7 14
# order() returns the positions that would sort the values, not the values.
order(freqMtx$raw)
## [1] 3 4 2 1
# Index vector for alphabetical order of the terms.
freqOrder2<-order(freqMtx$term)
# Alphabetically ordered copy; freqMtx itself is left as it was.
freqMtx2 <- freqMtx[freqOrder2,]
## term raw relative
## 1 and 4 0.1379310
## 2 culture 14 0.4827586
## 3 language 4 0.1379310
## 4 the 7 0.2413793
# Reorder the rows of a frequency table.
#
# Args:
#   freqData: data frame with a `term` column and a `raw` column.
#   sortBy:   "term" (default) sorts by term in ascending order;
#             "raw" sorts by raw frequency in descending order.
#
# Returns:
#   freqData with its rows reordered.
#
# Raises:
#   An error when sortBy is not a single value equal to "term" or "raw".
mySort <- function(freqData, sortBy="term"){
  key <- as.character(sortBy)
  if (length(key) != 1L || is.na(key)) {
    stop("`sortBy` must be a single value: \"term\" or \"raw\"", call. = FALSE)
  }
  # An unknown key must fail here. If the index were simply left unassigned,
  # R would look up `freqOrder` in the enclosing environments and could
  # silently reuse an unrelated global of that name.
  freqOrder <- switch(key,
    term = order(freqData$term),
    raw = order(freqData$raw, decreasing = TRUE),
    stop("unknown `sortBy` value \"", key, "\"; use \"term\" or \"raw\"",
         call. = FALSE)
  )
  freqData[freqOrder, ]
}
# With the default sortBy = "term" the rows come back in alphabetical order.
mySort(freqMtx)
## term raw relative
## 1 and 4 0.1379310
## 2 culture 14 0.4827586
## 3 language 4 0.1379310
## 4 the 7 0.2413793
# Naming the default explicitly gives the same result.
mySort(freqMtx, sortBy="term")
## term raw relative
## 1 and 4 0.1379310
## 2 culture 14 0.4827586
## 3 language 4 0.1379310
## 4 the 7 0.2413793
# sortBy = "raw" orders the rows from most to least frequent.
mySort(freqMtx, sortBy="raw")
## term raw relative
## 2 culture 14 0.4827586
## 4 the 7 0.2413793
## 1 and 4 0.1379310
## 3 language 4 0.1379310
getRawFreqMtx.Rを作成
# Build a word-frequency table for a text file.
#
# Args:
#   filename: path of the text file to read.
#
# Returns:
#   A data frame with the columns `wordLst` (the lowercased word) and `Freq`
#   (its number of occurrences), rows ordered from most to least frequent.
getRawFreqMtx <- function(filename){
  # Cut every line at whitespace or punctuation, then flatten and lowercase.
  pieces <- strsplit(readLines(filename), "[[:space:]]|[[:punct:]]")
  words <- tolower(unlist(pieces))
  # Consecutive delimiters leave empty strings behind; discard them.
  words <- words[nzchar(words)]
  # dnn keeps the column name "wordLst" in the resulting data frame.
  counts <- table(words, dnn = "wordLst")
  countData <- data.frame(counts)
  countData[order(countData$Freq, decreasing = TRUE), ]
}
# Load the function definition saved in getRawFreqMtx.R.
source("getRawFreqMtx.R")
# Build the frequency table for test1.txt; rows are ordered by descending count.
getRawFreqMtx("test1.txt")
## wordLst Freq
## 2 culture 14
## 4 the 7
## 1 and 4
## 3 language 4
テキストファイル名を引数にして、TTRの計算結果を出力する関数を作成しなさい。
関数名はgetTTRとし、関数ファイル(getTTR.R)をメールで提出すること。
提出前に、テキストファイル"osaka-u.txt"を使用して、正しく実行できるかを必ず確認すること。
# Example call of the function to be written; the line below is the output
# shown for osaka-u.txt.
getTTR("osaka-u.txt")
## [1] 48.24
テキストファイル名を引数にして、単語の頻度数と相対頻度をマージした行列データを出力する関数を作成しなさい。
関数名はgetRelativeFreqMtxとし、関数ファイル(getRelativeFreqMtx.R)をメールで提出すること。
提出前に、テキストファイル"osaka-u.txt"を使用して、正しく実行できるかを必ず確認すること。
# Example call of the function to be written, using the name required by the
# assignment (getRelativeFreqMtx). The lines below are the first rows of the
# output shown for osaka-u.txt: term, raw frequency, relative frequency.
getRelativeFreqMtx("osaka-u.txt")
## term raw relative
## 211 the 33 0.065
## 21 and 31 0.061
## 146 of 31 0.061
## 228 university 16 0.031
## 112 in 15 0.029