Lecture3: 関数の作成

Warmup Practice

作業ディレクトリの設定

setwd("/cloud/project")

作業ディレクトリの確認

getwd()
[1] "/cloud/project"

準備:単語リスト

txt<-readLines("sample_texts/sample_en.txt")
wordLst<-strsplit(txt,"[[:space:]]|[[:punct:]]")
wordLst<-unlist(wordLst)
wordLst<-tolower(wordLst)
wordLst<- wordLst[wordLst != ""]

前回の補足

strsplit(txt," |[.,!?:...]")
[[1]]
 [1] "COVID-19"    "is"          "an"          "infectious"  "disease"     "caused"     
 [7] "by"          "a"           "coronavirus" "called"      "SARS-CoV-2"  ""           

[[2]]
 [1] "It"       "mainly"   "causes"   "symptoms" "such"     "as"       "fever"    "and/or"  
 [9] "cough"    ""        

[[3]]
 [1] "In"           "general"      ""             "it"           "is"           "spread"      
 [7] "through"      "droplet"      "and"          "contact"      "transmission" ""            

[[4]]
 [1] "It"       "has"      "been"     "pointed"  "out"      "that"     "it"       "may"     
 [9] "spread"   "before"   "symptoms" "appear"   ""        

[[5]]
 [1] "It"         "is"         "therefore"  "important"  "to"         "habitually" "follow"    
 [8] "the"        "general"    "strategies" "for"        "preventing" "infectious" "diseases"  
[15] ""           "such"       "as"         "social"     "distancing" "and"        "wearing"   
[22] "a"          "mask"       "when"       "in"         "public"    

結果部分出力

head(wordLst)
[1] "covid"      "19"         "is"         "an"         "infectious" "disease"   

単語のToken数

tokens <- length(wordLst)

単語のTypes数

  • unique()関数は,リストの重複しない要素を返す
types <- length(unique(wordLst))

結果出力

print(paste("The number of tokens: ", tokens))
[1] "The number of tokens:  71"
print(paste("The number of types: ", types))
[1] "The number of types:  55"

関数の作成

TTR

返り(戻り)値なし関数:printTTR関数を作成

引数: 単語リスト

結果出力: TTRの結果

printTTR<- function(wLst) {
    num_tokens <- length(wLst)
    num_types <- length(unique(wLst))
    res_TTR = num_types/num_tokens * 100
    paste("TTR =", res_TTR)
}

printTTR関数の実行

printTTR(wordLst)
[1] "TTR = 77.4647887323944"

返り(戻り)値あり関数:calcTTR1関数を作成

引数: 単語リスト

戻り値: TTR計算値

calcTTR1<- function(wLst) {
    num_tokens <- length(wLst)
    num_types <- length(unique(wLst))
    res_TTR = num_types/num_tokens * 100
    return(res_TTR)
}

calcTTR1関数の実行

calcTTR1(wordLst)
[1] 77.46479

戻り値の利用

res <- calcTTR1(wordLst)
round(res,1)
[1] 77.5

返り(戻り)値あり関数:calcTTR2関数を作成

引数: 単語リスト

戻り値: TTR計算値

calcTTR2<- function(arg_tokens,arg_types) {
    res_TTR <- arg_types/arg_tokens * 100
    return(res_TTR)
}

calcTTR2関数の実行

calcTTR2(tokens,types)
[1] 77.46479

関数の読み込み

calcTTR3(wordLst)
[1] 77.46479

練習: RTTR(Root Type-Token Ratio) Giraudの値を計算する関数calcRTTRの作成

引数: 英文テキストファイル

戻り値: Giraud計算値

calcRTTR関数の実行

calcRTTR("sample_texts/sample_en.txt")
[1] 6.527299

Word Frequencies

(freq <- table(wordLst))
wordLst
          19            2            a           an          and       appear           as 
           1            1            2            1            3            1            2 
        been       before           by       called       caused       causes      contact 
           1            1            1            1            1            1            1 
 coronavirus        cough          cov        covid      disease     diseases   distancing 
           1            1            1            1            1            1            1 
     droplet        fever       follow          for      general   habitually          has 
           1            1            1            1            2            1            1 
   important           in   infectious           is           it       mainly         mask 
           1            2            2            3            5            1            1 
         may           or          out      pointed   preventing       public         sars 
           1            1            1            1            1            1            1 
      social       spread   strategies         such     symptoms         that          the 
           1            2            1            2            2            1            1 
   therefore      through           to transmission      wearing         when 
           1            1            1            1            1            1 

Sort

(freq_data<-sort(freq, decreasing=TRUE))
wordLst
          it          and           is            a           as      general           in 
           5            3            3            2            2            2            2 
  infectious       spread         such     symptoms           19            2           an 
           2            2            2            2            1            1            1 
      appear         been       before           by       called       caused       causes 
           1            1            1            1            1            1            1 
     contact  coronavirus        cough          cov        covid      disease     diseases 
           1            1            1            1            1            1            1 
  distancing      droplet        fever       follow          for   habitually          has 
           1            1            1            1            1            1            1 
   important       mainly         mask          may           or          out      pointed 
           1            1            1            1            1            1            1 
  preventing       public         sars       social   strategies         that          the 
           1            1            1            1            1            1            1 
   therefore      through           to transmission      wearing         when 
           1            1            1            1            1            1 

頻度テーブルをデータ型に変換

freqData <- data.frame(freq_data)
freqData

相対頻度テーブル

(relative<-sort(freq/tokens, decreasing=TRUE))
wordLst
          it          and           is            a           as      general           in 
  0.07042254   0.04225352   0.04225352   0.02816901   0.02816901   0.02816901   0.02816901 
  infectious       spread         such     symptoms           19            2           an 
  0.02816901   0.02816901   0.02816901   0.02816901   0.01408451   0.01408451   0.01408451 
      appear         been       before           by       called       caused       causes 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
     contact  coronavirus        cough          cov        covid      disease     diseases 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
  distancing      droplet        fever       follow          for   habitually          has 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
   important       mainly         mask          may           or          out      pointed 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
  preventing       public         sars       social   strategies         that          the 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
   therefore      through           to transmission      wearing         when 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 

相対頻度テーブルをデータ型に変換

2つのデータ型変数を連結(merge)

freqMtx <- merge(freqData, relativeData, all=T, by="wordLst")
freqMtx[,1]
 [1] 19           2            a            an           and          appear       as          
 [8] been         before       by           called       caused       causes       contact     
[15] coronavirus  cough        cov          covid        disease      diseases     distancing  
[22] droplet      fever        follow       for          general      habitually   has         
[29] important    in           infectious   is           it           mainly       mask        
[36] may          or           out          pointed      preventing   public       sars        
[43] social       spread       strategies   such         symptoms     that         the         
[50] therefore    through      to           transmission wearing      when        
55 Levels: it and is a as general in infectious spread such symptoms 19 2 an appear been ... when

出現単語の情報を行ラベルにコピー

rownames(freqMtx)<-as.character(freqMtx[,1])
Warning: non-unique values when setting 'row.names': ‘1’, ‘2’, ‘3’
Error in `.rowNamesDF<-`(x, value = value) : 
  duplicate 'row.names' are not allowed

出現単語の情報(1列目)を削除

freqMtx<-freqMtx[-1]
colnames(freqMtx) <- c("raw", "relative")

粗頻度-相対頻度の行列

LS0tCnRpdGxlOiAiREhfQjogTGVjdHVyZTAzIChGYWxsIDIwMjEpIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIExlY3R1cmUzOiDplqLmlbDjga7kvZzmiJAKCiMgV2FybXVwIFByYWN0aWNlCgoKIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu6Kit5a6aCmBgYHtyfQpzZXR3ZCgiL2Nsb3VkL3Byb2plY3QiKQpgYGAKCiMjIOS9nOalreODh+OCo+ODrOOCr+ODiOODquOBrueiuuiqjSAKYGBge3J9CmdldHdkKCkKYGBgCgojIyDmupblgpnvvJrljZjoqp7jg6rjgrnjg4gKCmBgYHtyfQp0eHQ8LXJlYWRMaW5lcygic2FtcGxlX3RleHRzL3NhbXBsZV9lbi50eHQiKQp3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIltbOnNwYWNlOl1dfFtbOnB1bmN0Ol1dIikKd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCndvcmRMc3Q8LXRvbG93ZXIod29yZExzdCkKd29yZExzdDwtIHdvcmRMc3Rbd29yZExzdCAhPSAiIl0KYGBgCgojIyDliY3lm57jga7oo5zotrMKYGBge3J9CnN0cnNwbGl0KHR4dCwiIHxbLiwhPzouLi5dIikKYGBgCgojIyMg57WQ5p6c6YOo5YiG5Ye65YqbCmBgYHtyfQpoZWFkKHdvcmRMc3QpCmBgYAoKIyMjIOWNmOiqnuOBrlRva2Vu5pWwCmBgYHtyfQp0b2tlbnMgPC0gbGVuZ3RoKHdvcmRMc3QpCmBgYAoKIyMjIOWNmOiqnuOBrlR5cGVz5pWwCiogdW5pcXVlKCnplqLmlbDjga/vvIzjg6rjgrnjg4jjga7ph43opIfjgZfjgarjgYTopoHntKDjgpLov5TjgZkKYGBge3J9CnR5cGVzIDwtIGxlbmd0aCh1bmlxdWUod29yZExzdCkpCmBgYAoKIyMjIOe1kOaenOWHuuWKmwpgYGB7cn0KcHJpbnQocGFzdGUoIlRoZSBudW1iZXIgb2YgdG9rZW5zOiAiLCB0b2tlbnMpKQpwcmludChwYXN0ZSgiVGhlIG51bWJlciBvZiB0eXBlczogIiwgdHlwZXMpKQpgYGAKCiMjIOmWouaVsOOBruS9nOaIkAojIyBUVFIKYGBge3J9CgpgYGAKCiMjIOi/lOOCiu+8iOaIu+OCiu+8ieWApOOBquOBl+mWouaVsO+8mnByaW50VFRS6Zai5pWw44KS5L2c5oiQCiMjIyDlvJXmlbA6IOWNmOiqnuODquOCueODiAojIyMg57WQ5p6c5Ye65YqbOiBUVFLjga7ntZDmnpwKYGBge3J9CnByaW50VFRSPC0gZnVuY3Rpb24od0xzdCkgewogICAgbnVtX3Rva2VucyA8LSBsZW5ndGgod0xzdCkKICAgIG51bV90eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdMc3QpKQogICAgcmVzX1RUUiA8LSBudW1fdHlwZXMvbnVtX3Rva2VucyAqIDEwMAogICAgcGFzdGUoIlRUUiA9IiwgcmVzX1RUUikKfQpgYGAKCiMjIHByaW50VFRS6Zai5pWw44Gu5a6f6KGMCmBgYHtyfQpwcmludFRUUih3b3JkTHN0KQpgYGAKCiMjIOi/lOOCiu+8iOaIu+OCiu+8ieWApOOBguOCiumWouaVsO+8mmNhbGNUVFIx6Zai5pWw44KS5L2c5oiQCiMjIyDlvJXmlbA6IOWNmOiqnuODquOCueODiAojIyMg5oi744KK5YCkOiBUVFLoqIjnrpflgKQKYGBge3J9CmNhbGNUVFIxPC0gZnVuY3Rpb24od0xzdCkgewogICAgbnVtX3Rva2VucyA8LSBsZW5ndGgod0xzdCkKICAgIG51bV90eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdMc3QpKQogICAgcmVzX1RUUiA8LSBudW1fdHlwZXMvbnVtX3Rva2VucyAqIDEwMAogICAgcmV0dXJuKHJlc19UVFIpCn0KYGBgCgojIyBjYWxjVFRSMemWouaVsOOBruWun+ihjApgYGB7cn0KY2FsY1RUUjEod29yZExzdCkKYGBgCiMjICDmiLvjgorlgKTjga7liKnnlKgKYGBge3J9CnJlcyA8LSBjYWxjVFRSMSh3b3JkTHN0KQpyb3VuZChyZXMsMSkKYGBgCgojIyDov5TjgorvvIjmiLvjgorvvInlgKTjgYLjgorplqLmlbDvvJpjYWxjVFRSMumWouaVsOOCkuS9nOaIkAojIyMg5byV5pWwOiDljZjoqp7jg6rjgrnjg4gKIyMjIOaIu+OCiuWApDogVFRS6KiI566X5YCkCmBgYHtyfQpjYWxjVFRSMjwtIGZ1bmN0aW9uKGFyZ190b2tlbnMsYXJnX3R5cGVzKSB7CiAgICByZXNfVFRSIDwtIGFyZ190eXBlcy9hcmdfdG9rZW5zICogMTAwCiAgICByZXR1cm4ocmVzX1RUUikKfQpgYGAKCiMjIGNhbGNUVFIy6Zai5pWw44Gu5a6f6KGMCmBgYHtyfQpjYWxjVFRSMih0b2tlbnMsdHlwZXMpCmBgYAoKCiMjIOmWouaVsOOBruiqreOBv+i+vOOBvwpgYGB7cn0Kc291cmNlKCJjYWxjVFRSMy5SIikKY2FsY1RUUjMod29yZExzdCkKYGBgCgojIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7ICI+57e057+SPC9zcGFuPjogUlRUUihSb290IFR5cGUtVG9rZW4gUmF0aW8pIEdpcmF1ZOOBruWApOOCkuioiOeul+OBmeOCizxzcGFuIHN0eWxlPSJjb2xvcjogcmVkOyAiPumWouaVsGNhbGNSVFRSPC9zcGFuPuOBruS9nOaIkAojIyMg5byV5pWwOiAg6Iux5paH44OG44Kt44K544OI44OV44Kh44Kk44OrCiMjIyDmiLvjgorlgKQ6IEdpcmF1ZOioiOeul+WApApgYGB7ciwgZWNobz1GQUxTRX0KY2FsY1JUVFI8LSBmdW5jdGlvbihmbmFtZSkgewogICAgdHh0PC1yZWFkTGluZXMoZm5hbWUpCiAgICB3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIltbOnNwYWNlOl1dfFtbOnB1bmN0Ol1dIikKICAgIHdvcmRMc3Q8LXVubGlzdCh3b3JkTHN0KQogICAgd29yZExzdDwtdG9sb3dlcih3b3JkTHN0KQogICAgd29yZExzdDwtIHdvcmRMc3Rbd29yZExzdCAhPSAiIl0KICAgIG51bV90b2tlbnMgPC0gbGVuZ3RoKHdvcmRMc3QpCiAgICBudW1fdHlwZXMgPC0gbGVuZ3RoKHVuaXF1ZSh3b3JkTHN0KSkKICAgIHJlc19HIDwtIG51bV90eXBlcy9zcXJ0KG51bV90b2tlbnMpCiAgICByZXR1cm4ocmVzX0cpCn0KYGBgCgojIyBjYWxjUlRUUumWouaVsOOBruWun+ihjApgYGB7cn0KY2FsY1JUVFIoInNhbXBsZV90ZXh0cy9zYW1wbGVfZW4udHh0IikKYGBgCgojIyBXb3JkIEZyZXF1ZW5jaWVzCmBgYHtyfQooZnJlcSA8LSB0YWJsZSh3b3JkTHN0KSkKYGBgCgojIyBTb3J0CmBgYHtyfQooZnJlcV9kYXRhPC1zb3J0KGZyZXEsIGRlY3JlYXNpbmc9VFJVRSkpCmBgYAoKIyMg6aC75bqm44OG44O844OW44Or44KS44OH44O844K/5Z6L44Gr5aSJ5o+bCmBgYHtyfQpmcmVxRGF0YSA8LSBkYXRhLmZyYW1lKGZyZXFfZGF0YSkKZnJlcURhdGEKYGBgCgojIyDnm7jlr77poLvluqbjg4bjg7zjg5bjg6sKYGBge3J9CihyZWxhdGl2ZTwtc29ydChmcmVxL3Rva2VucywgZGVjcmVhc2luZz1UUlVFKSkKI3N1bShmcmVxL3Rva2VucykKYGBgCiMjIOebuOWvvumgu+W6puODhuODvOODluODq+OCkuODh+ODvOOCv+Wei+OBq+WkieaPmwpgYGB7cn0KKHJlbGF0aXZlRGF0YSA8LSBkYXRhLmZyYW1lKHJlbGF0aXZlKSkKYGBgCiMjIO+8kuOBpOOBruODh+ODvOOCv+Wei+WkieaVsOOCkumAo+e1kChtZXJnZSkKYGBge3J9CmZyZXFNdHggPC0gbWVyZ2UoZnJlcURhdGEsIHJlbGF0aXZlRGF0YSwgYWxsPVQsIGJ5PSJ3b3JkTHN0IikKYGBgCgpgYGB7cn0KZnJlcU10eApgYGAKCmBgYHtyfQpmcmVxTXR4WywxXQpgYGAKCiMjIyDlh7rnj77ljZjoqp7jga7mg4XloLHjgpLooYzjg6njg5njg6vjgavjgrPjg5Tjg7wKYGBge3J9CnJvd25hbWVzKGZyZXFNdHgpPC1hcy5jaGFyYWN0ZXIoZnJlcU10eFssMV0pCmBgYAoKIyMg5Ye654++5Y2Y6Kqe44Gu5oOF5aCxKDHliJfnm64p44KS5YmK6ZmkCmBgYHtyfQpmcmVxTXR4PC1mcmVxTXR4Wy0xXQpjb2xuYW1lcyhmcmVxTXR4KSA8LSBjKCJyYXciLCAicmVsYXRpdmUiKQpgYGAKCiMjIOeyl+mgu+W6pi3nm7jlr77poLvluqbjga7ooYzliJcKYGBge3J9CmZyZXFNdHgKYGBgCgoKCgo=