# Show the current working directory; the relative path passed to readLines()
# further down is resolved against it.
getwd()
[1] "/cloud/project"
# An unnamed numeric vector, shown as-is and then scaled element by element.
c(1, 2, 3, 4, 5)
c(1, 2, 3, 4, 5) * 2

# The same values stored under a name so that later commands can reuse them.
# Arithmetic on the vector is applied to every element at once.
Y <- c(1, 2, 3, 4, 5)
Y * 2
Y^2
[1] 1 4 9 16 25
# Pick out the fourth element of Y (indexing starts at 1).
Y[4]
[1] 4
# NOTE(review): `str` is not assigned anywhere in the visible transcript; the
# printed results imply a character vector of three strings with 1, 2 and 3
# characters. Confirm where it is defined.
length(str)
[1] 3
nchar(str)
[1] 1 2 3
# NOTE(review): `numLst` is not assigned in the visible transcript either;
# confirm its definition.
sqrt(numLst)
[1] 4 5 16
Read the file one line at a time and store the lines in a character vector.
# The warning recorded below only reports that the last line of the file has
# no trailing newline; readLines() still returns that line.
txt<-readLines("sample_texts/sample_en.txt")
Warning in readLines("sample_texts/sample_en.txt") :
incomplete final line found on 'sample_texts/sample_en.txt'
# Print every line that was read, then only the third one.
txt
txt[3]
[1] "In general, it is spread through droplet and contact transmission. "
Punctuation characters matched by `[[:punct:]]`:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
# Break each line into words wherever a single whitespace or punctuation
# character occurs. Two delimiters in a row (for example ", ") leave an empty
# string between them; those empty strings are still in the result here.
wordLst <- strsplit(x = txt, split = "[[:space:]]|[[:punct:]]")
print(wordLst)
[[1]]
[1] "COVID" "19" "is" "an" "infectious" "disease"
[7] "caused" "by" "a" "coronavirus" "called" "SARS"
[13] "CoV" "2" ""
[[2]]
[1] "It" "mainly" "causes" "symptoms" "such" "as" "fever" "and"
[9] "or" "cough" ""
[[3]]
[1] "In" "general" "" "it" "is"
[6] "spread" "through" "droplet" "and" "contact"
[11] "transmission" ""
[[4]]
[1] "It" "has" "been" "pointed" "out" "that" "it" "may"
[9] "spread" "before" "symptoms" "appear" ""
[[5]]
[1] "It" "is" "therefore" "important" "to" "habitually"
[7] "follow" "the" "general" "strategies" "for" "preventing"
[13] "infectious" "diseases" "" "such" "as" "social"
[19] "distancing" "and" "wearing" "a" "mask" "when"
[25] "in" "public"
# Collapse the per-line list into one character vector and fold it to lower
# case, so that capitalised and lower-case spellings count as the same word.
wordLst <- tolower(unlist(wordLst))
print(wordLst)
[1] "covid" "19" "is" "an" "infectious"
[6] "disease" "caused" "by" "a" "coronavirus"
[11] "called" "sars" "cov" "2" ""
[16] "it" "mainly" "causes" "symptoms" "such"
[21] "as" "fever" "and" "or" "cough"
[26] "" "in" "general" "" "it"
[31] "is" "spread" "through" "droplet" "and"
[36] "contact" "transmission" "" "it" "has"
[41] "been" "pointed" "out" "that" "it"
[46] "may" "spread" "before" "symptoms" "appear"
[51] "" "it" "is" "therefore" "important"
[56] "to" "habitually" "follow" "the" "general"
[61] "strategies" "for" "preventing" "infectious" "diseases"
[66] "" "such" "as" "social" "distancing"
[71] "and" "wearing" "a" "mask" "when"
[76] "in" "public"
# Drop the empty strings left behind by the split. nzchar() is TRUE for every
# non-empty string, so this keeps the same elements as comparing against ""
# or testing nchar(wordLst) > 0.
wordLst <- wordLst[nzchar(wordLst)]
print(wordLst)
[1] "covid" "19" "is" "an" "infectious"
[6] "disease" "caused" "by" "a" "coronavirus"
[11] "called" "sars" "cov" "2" "it"
[16] "mainly" "causes" "symptoms" "such" "as"
[21] "fever" "and" "or" "cough" "in"
[26] "general" "it" "is" "spread" "through"
[31] "droplet" "and" "contact" "transmission" "it"
[36] "has" "been" "pointed" "out" "that"
[41] "it" "may" "spread" "before" "symptoms"
[46] "appear" "it" "is" "therefore" "important"
[51] "to" "habitually" "follow" "the" "general"
[56] "strategies" "for" "preventing" "infectious" "diseases"
[61] "such" "as" "social" "distancing" "and"
[66] "wearing" "a" "mask" "when" "in"
[71] "public"
# Tokens are all running words; types are the distinct word forms among them
# (counted as the elements that are not repeats of an earlier one).
tokens <- length(wordLst)
types <- sum(!duplicated(wordLst))
print(paste("Token =", tokens))
[1] "Token = 71"
# Report the number of distinct word forms.
print(paste("Types =", types))
[1] "Types = 55"
\[TTR=\frac{types}{tokens} \times 100 \]
# Type-token ratio expressed as a percentage.
types/tokens*100
[1] 77.46479
# The same percentage, rounded to two decimal places.
round(types/tokens*100,2)
[1] 77.46
\[RTTR=\frac{types}{\sqrt{tokens}} \]
round(types/sqrt(tokens),2)
[1] 6.53
# Count how often each word occurs and show the counts.
freq <- table(wordLst)
print(freq)

# Order the counts from most to least frequent and show them again.
# NOTE(review): the table recorded after these commands in this transcript
# lists Japanese tokens, so it appears to come from a run on a different
# input text -- confirm.
freq_data <- sort(freq, decreasing = TRUE)
print(freq_data)
wordLst
に を が で と の は ください て
12 8 6 6 6 6 6 5 5
とき ない 1 9 COVID ウイルス コロナ ます 話
5 5 4 4 4 4 4 4 4
か し や 中 人 体 2 CoV SARS
3 3 3 3 3 3 2 2 2
つけ など なり マスク まわり よう 出 出る 咳
2 2 2 2 2 2 2 2 2
外 感染 熱 誰 あけ あと あり ある うつさ
2 2 2 2 1 1 1 1 1
うつし から こと しかし しまう しれ そして た だ
1 1 1 1 1 1 1 1 1
つける なら なる ひと ませ も ん 入っ 入る
1 1 1 1 1 1 1 1 1
接触 気 空 話す 近 近く 間 飛沫
1 1 1 1 1 1 1 1