# Show the current working directory; relative file paths used later are
# resolved against it.
getwd()
[1] "/cloud/project"
# c() combines its arguments into a vector.
c(1, 2, 3, 4, 5)
[1] 1 2 3 4 5
# Arithmetic is applied element by element: every value is doubled.
c(1, 2, 3, 4, 5)*2
[1] 2 4 6 8 10
# Store the vector under a name so it can be reused.
Y <- c(1, 2, 3, 4, 5)
Y*2
[1] 2 4 6 8 10
# Square every element.
Y^2
[1] 1 4 9 16 25
# Extract the 4th element (indexing starts at 1).
Y[4]
[1] 4
# A character vector of three strings.
# NOTE(review): `str` is also the name of a base R function; a different
# variable name would avoid confusion.
str <- c ("a", "ab", "abc")
# length() counts the elements of the vector ...
length(str)
[1] 3
# ... whereas nchar() counts the characters inside each element.
nchar(str)
[1] 1 2 3
# Functions such as sqrt() are likewise applied to every element.
numLst <- c (16,25,256)
sqrt(numLst)
[1] 4 5 16
一行ずつ読み込んで、文字ベクトルに格納
# Load the English sample text; each line of the file becomes one element of
# txt. The surrounding parentheses print the value right after assigning it.
(txt <- readLines("sample_texts/sample_en.txt"))
[1] "COVID-19 is an infectious disease caused by a coronavirus called SARS-CoV-2. "
[2] "It mainly causes symptoms such as fever and/or cough. In general, it is spread through droplet and contact transmission. "
[3] "It has been pointed out that it may spread before symptoms appear. "
[4] "It is therefore important to habitually follow the general strategies for preventing infectious diseases, such as social distancing and wearing a mask when in public."
txt[3]
[1] "It has been pointed out that it may spread before symptoms appear. "
length(txt)
[1] 4
Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
# Cut every line at each whitespace or punctuation character. The result is a
# list holding one character vector of pieces per input line; two delimiters
# in a row (e.g. ". ") leave an empty string between them.
wordLst <- strsplit(txt, split = "[[:space:]]|[[:punct:]]")
print(wordLst)
[[1]]
[1] "COVID" "19" "is"
[4] "an" "infectious" "disease"
[7] "caused" "by" "a"
[10] "coronavirus" "called" "SARS"
[13] "CoV" "2" ""
[[2]]
[1] "It" "mainly" "causes"
[4] "symptoms" "such" "as"
[7] "fever" "and" "or"
[10] "cough" "" "In"
[13] "general" "" "it"
[16] "is" "spread" "through"
[19] "droplet" "and" "contact"
[22] "transmission" ""
[[3]]
[1] "It" "has" "been" "pointed"
[5] "out" "that" "it" "may"
[9] "spread" "before" "symptoms" "appear"
[13] ""
[[4]]
[1] "It" "is" "therefore"
[4] "important" "to" "habitually"
[7] "follow" "the" "general"
[10] "strategies" "for" "preventing"
[13] "infectious" "diseases" ""
[16] "such" "as" "social"
[19] "distancing" "and" "wearing"
[22] "a" "mask" "when"
[25] "in" "public"
# Flatten the per-line list into a single character vector and convert it to
# lower case, so that e.g. "It" and "it" are counted as the same word.
wordLst <- tolower(unlist(wordLst))
wordLst
[1] "covid" "19" "is"
[4] "an" "infectious" "disease"
[7] "caused" "by" "a"
[10] "coronavirus" "called" "sars"
[13] "cov" "2" ""
[16] "it" "mainly" "causes"
[19] "symptoms" "such" "as"
[22] "fever" "and" "or"
[25] "cough" "" "in"
[28] "general" "" "it"
[31] "is" "spread" "through"
[34] "droplet" "and" "contact"
[37] "transmission" "" "it"
[40] "has" "been" "pointed"
[43] "out" "that" "it"
[46] "may" "spread" "before"
[49] "symptoms" "appear" ""
[52] "it" "is" "therefore"
[55] "important" "to" "habitually"
[58] "follow" "the" "general"
[61] "strategies" "for" "preventing"
[64] "infectious" "diseases" ""
[67] "such" "as" "social"
[70] "distancing" "and" "wearing"
[73] "a" "mask" "when"
[76] "in" "public"
# Discard the empty strings left behind by the split. nzchar() is TRUE for
# every non-empty string, so this keeps exactly the elements that differ
# from "" (the same filter as nchar(wordLst) > 0).
wordLst <- wordLst[nzchar(wordLst)]
wordLst
[1] "covid" "19" "is"
[4] "an" "infectious" "disease"
[7] "caused" "by" "a"
[10] "coronavirus" "called" "sars"
[13] "cov" "2" "it"
[16] "mainly" "causes" "symptoms"
[19] "such" "as" "fever"
[22] "and" "or" "cough"
[25] "in" "general" "it"
[28] "is" "spread" "through"
[31] "droplet" "and" "contact"
[34] "transmission" "it" "has"
[37] "been" "pointed" "out"
[40] "that" "it" "may"
[43] "spread" "before" "symptoms"
[46] "appear" "it" "is"
[49] "therefore" "important" "to"
[52] "habitually" "follow" "the"
[55] "general" "strategies" "for"
[58] "preventing" "infectious" "diseases"
[61] "such" "as" "social"
[64] "distancing" "and" "wearing"
[67] "a" "mask" "when"
[70] "in" "public"
# Tokens: the total number of words in wordLst (repetitions included).
tokens <- length(wordLst)
# Types: the number of distinct words in wordLst.
types <- length(unique(wordLst))
# Print both counts with a label.
print(paste("Token =", tokens))
[1] "Token = 71"
print(paste("Types =", types))
[1] "Types = 55"
\[TTR=\frac{types}{tokens} \times 100 \]
# Type-token ratio expressed as a percentage.
types/tokens*100
[1] 77.46479
# The same value rounded to two decimal places.
round(types/tokens*100,2)
[1] 77.46
\[RTTR=\frac{types}{\sqrt{tokens}} \]
round(types/sqrt(tokens),2)
[1] 6.53
# Count how often each word occurs; the outer parentheses print the table
# immediately after the assignment.
(freq <- table(wordLst))
wordLst
19 2 a
1 1 2
an and appear
1 3 1
as been before
2 1 1
by called caused
1 1 1
causes contact coronavirus
1 1 1
cough cov covid
1 1 1
disease diseases distancing
1 1 1
droplet fever follow
1 1 1
for general habitually
1 2 1
has important in
1 1 2
infectious is it
2 3 5
mainly mask may
1 1 1
or out pointed
1 1 1
preventing public sars
1 1 1
social spread strategies
1 2 1
such symptoms that
2 2 1
the therefore through
1 1 1
to transmission wearing
1 1 1
when
1
# Reorder the frequency table from the most to the least frequent word and
# print it.
(freq_data<-sort(freq, decreasing=TRUE))
wordLst
it and is
5 3 3
a as general
2 2 2
in infectious spread
2 2 2
such symptoms 19
2 2 1
2 an appear
1 1 1
been before by
1 1 1
called caused causes
1 1 1
contact coronavirus cough
1 1 1
cov covid disease
1 1 1
diseases distancing droplet
1 1 1
fever follow for
1 1 1
habitually has important
1 1 1
mainly mask may
1 1 1
or out pointed
1 1 1
preventing public sars
1 1 1
social strategies that
1 1 1
the therefore through
1 1 1
to transmission wearing
1 1 1
when
1
# Save the sorted word frequencies as freq_en.csv (relative path, so the file
# is created in the current working directory).
write.csv(freq_data, "freq_en.csv")
一行ずつ読み込んで、文字ベクトルに格納
# Word-frequency pipeline for the Japanese sample text.
#
# Read the file one line per element. The encoding is declared explicitly so
# that the Japanese characters are not garbled on platforms whose native
# encoding is not UTF-8 (assumes the sample file is saved as UTF-8 - TODO
# confirm).
txt <- readLines("sample_texts/sample_ja_1.txt", encoding = "UTF-8")
# Split each line at whitespace or punctuation. NOTE(review): this relies on
# the sample being pre-segmented with spaces between words - confirm against
# the file.
wordLst <- strsplit(txt, "[[:space:]]|[[:punct:]]")
# Flatten the per-line list into one character vector.
wordLst <- unlist(wordLst)
# Drop the empty strings produced by adjacent delimiters.
wordLst <- wordLst[wordLst != ""]
# Count each word, then sort from most to least frequent; the outer
# parentheses print the sorted table.
freq <- table(wordLst)
(freq_data <- sort(freq, decreasing = TRUE))
wordLst
に を と が
12 8 7 6
の は ください て
6 6 5 5
で とき ない ます
5 5 5 5
1 9 COVID ウイルス
4 4 4 4
コロナ 話 か し
4 4 3 3
や 中 人 体
3 3 3 3
2 CoV SARS つけ
2 2 2 2
など なり マスク まわり
2 2 2 2
よう 出 出る 咳
2 2 2 2
外 感染 熱 誰
2 2 2 2
近く distancing Social あと
2 1 1 1
あり ある いい うつさ
1 1 1 1
うつし から こと しかし
1 1 1 1
しまう しれ そして た
1 1 1 1
だ つける なら なる
1 1 1 1
ませ も ん 入っ
1 1 1 1
入る 接触 気 空け
1 1 1 1
話す 間 飛沫
1 1 1
# Type-token ratio (in percent, two decimal places) for the Japanese sample:
# tokens is the total word count, types the number of distinct words.
tokens <- length(wordLst)
types <- length(unique(wordLst))
round(types / tokens * 100, digits = 2)
[1] 39.23