Lecture3: 関数の作成

Warmup Practice

作業ディレクトリの設定

setwd("/cloud/project")

作業ディレクトリの確認

getwd()
[1] "/cloud/project"

準備:単語リスト

txt<-readLines("sample_texts/sample_en.txt")
wordLst<-strsplit(txt,"[[:space:]]|[[:punct:]]")

前回の補足: 正規表現

Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
tmp <- strsplit(txt,"\\s|\\W")
tmp[1]
[[1]]
 [1] "COVID"       "19"          "is"          "an"          "infectious" 
 [6] "disease"     "caused"      "by"          "a"           "coronavirus"
[11] "called"      "SARS"        "CoV"         "2"           ""           
tmp <- strsplit(txt,"[ \t]|[.,!?:...]")
tmp[1]
[[1]]
 [1] "COVID-19"    "is"          "an"          "infectious"  "disease"    
 [6] "caused"      "by"          "a"           "coronavirus" "called"     
[11] "SARS-CoV-2"  ""           
tmp <- strsplit(txt," |[^a-zA-Z0-9]")
tmp[1]
[[1]]
 [1] "COVID"       "19"          "is"          "an"          "infectious" 
 [6] "disease"     "caused"      "by"          "a"           "coronavirus"
[11] "called"      "SARS"        "CoV"         "2"           ""           

単語リスト(つづき)

wordLst<-unlist(wordLst)
wordLst<-tolower(wordLst)
wordLst<- wordLst[wordLst != ""]

結果部分出力

head(wordLst)
[1] "covid"      "19"         "is"         "an"         "infectious"
[6] "disease"   

Word Frequencies

(freq <- table(wordLst))
wordLst
          19            2            a           an          and       appear 
           1            1            2            1            3            1 
          as         been       before           by       called       caused 
           2            1            1            1            1            1 
      causes      contact  coronavirus        cough          cov        covid 
           1            1            1            1            1            1 
     disease     diseases   distancing      droplet        fever       follow 
           1            1            1            1            1            1 
         for      general   habitually          has    important           in 
           1            2            1            1            1            2 
  infectious           is           it       mainly         mask          may 
           2            3            5            1            1            1 
          or          out      pointed   preventing       public         sars 
           1            1            1            1            1            1 
      social       spread   strategies         such     symptoms         that 
           1            2            1            2            2            1 
         the    therefore      through           to transmission      wearing 
           1            1            1            1            1            1 
        when 
           1 

Sort

(freq_data<-sort(freq, decreasing=TRUE))
wordLst
          it          and           is            a           as      general 
           5            3            3            2            2            2 
          in   infectious       spread         such     symptoms           19 
           2            2            2            2            2            1 
           2           an       appear         been       before           by 
           1            1            1            1            1            1 
      called       caused       causes      contact  coronavirus        cough 
           1            1            1            1            1            1 
         cov        covid      disease     diseases   distancing      droplet 
           1            1            1            1            1            1 
       fever       follow          for   habitually          has    important 
           1            1            1            1            1            1 
      mainly         mask          may           or          out      pointed 
           1            1            1            1            1            1 
  preventing       public         sars       social   strategies         that 
           1            1            1            1            1            1 
         the    therefore      through           to transmission      wearing 
           1            1            1            1            1            1 
        when 
           1 

ファイルに出力

write.csv(freq_data, "freq_en.csv")

単語頻度数分布(色付き)

las: label style

colors = c("orange", "lightblue", "green") 
barplot(freq_data, las=3,col=colors)

関数の作成

返り(戻り)値なし関数:printTTR関数を作成

引数: 単語リスト

結果出力: TTRの結果

printTTR<- function(wLst) {
    num_tokens <- length(wLst)
    num_types <- length(unique(wLst))
    res_TTR <- num_types/num_tokens * 100
    paste("TTR =", round(res_TTR,3))
}

printTTR関数の実行

printTTR(wordLst)
[1] "TTR = 77.465"

返り(戻り)値あり関数:calcTTR1関数を作成

引数: 単語リスト

戻り値: TTR値

calcTTR1<- function(wLst) {
    num_tokens <- length(wLst)
    num_types <- length(unique(wLst))
    res_TTR <- num_types/num_tokens * 100
    return(round(res_TTR,3))
}

calcTTR1関数の実行

calcTTR1(wordLst)
[1] 77.465

戻り値の利用

res <- calcTTR1(wordLst)

単語のToken, Types

tokens <- length(wordLst)
types <- length(unique(wordLst))

返り(戻り)値あり関数:calcTTR2関数を作成

引数: Tokens, Types

戻り値: TTR計算値

calcTTR2<- function(arg_tokens,arg_types) {
    res_TTR <- arg_types/arg_tokens * 100
    return(round(res_TTR,3))
}

calcTTR2関数の実行

calcTTR2(tokens,types)
[1] 77.465

実習MeCabのインストール

練習: テキストファイルを読み込んで、Guiraudの値を返す関数calcRTTRの作成 

引数: 英文テキストファイル

戻り値: Guiraud計算値

calcRTTR関数の実行例

calcRTTR("sample_texts/sample_en.txt")
[1] 6.527299

【次週の予告】

頻度テーブルをデータ型に変換

freqData <- data.frame(freq_data)
head(freqData)

相対頻度テーブル

(relative<-sort(freq/tokens, decreasing=TRUE))
wordLst
          it          and           is            a           as      general 
  0.07042254   0.04225352   0.04225352   0.02816901   0.02816901   0.02816901 
          in   infectious       spread         such     symptoms           19 
  0.02816901   0.02816901   0.02816901   0.02816901   0.02816901   0.01408451 
           2           an       appear         been       before           by 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
      called       caused       causes      contact  coronavirus        cough 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
         cov        covid      disease     diseases   distancing      droplet 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
       fever       follow          for   habitually          has    important 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
      mainly         mask          may           or          out      pointed 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
  preventing       public         sars       social   strategies         that 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
         the    therefore      through           to transmission      wearing 
  0.01408451   0.01408451   0.01408451   0.01408451   0.01408451   0.01408451 
        when 
  0.01408451 
#sum(freq/tokens)

相対頻度テーブルをデータ型に変換

(relativeData <- data.frame(relative))

2つのデータ型変数を連結(merge)

freqMtx <- merge(freqData, relativeData, all=T, by="wordLst")
head(freqMtx)
freqMtx[,1]
 [1] 19           2            a            an           and         
 [6] appear       as           been         before       by          
[11] called       caused       causes       contact      coronavirus 
[16] cough        cov          covid        disease      diseases    
[21] distancing   droplet      fever        follow       for         
[26] general      habitually   has          important    in          
[31] infectious   is           it           mainly       mask        
[36] may          or           out          pointed      preventing  
[41] public       sars         social       spread       strategies  
[46] such         symptoms     that         the          therefore   
[51] through      to           transmission wearing      when        
55 Levels: it and is a as general in infectious spread such symptoms 19 ... when

出現単語の情報を行ラベルにコピー

rownames(freqMtx)<-as.character(freqMtx[,1])

出現単語の情報(1列目)を削除

freqMtx<-freqMtx[-1]
colnames(freqMtx) <- c("raw", "relative")

粗頻度-相対頻度の行列

freqMtx
LS0tCnRpdGxlOiAiTGVjMDMgKEZhbGwgMjAyMikiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiMgTGVjdHVyZTM6IOmWouaVsOOBruS9nOaIkAoKIyMgV2FybXVwIFByYWN0aWNlCiMjIOS9nOalreODh+OCo+ODrOOCr+ODiOODquOBruioreWumgpgYGB7cn0Kc2V0d2QoIi9jbG91ZC9wcm9qZWN0IikKYGBgCgojIyDkvZzmpa3jg4fjgqPjg6zjgq/jg4jjg6rjga7norroqo0gCmBgYHtyfQpnZXR3ZCgpCmBgYAoKIyMg5rqW5YKZ77ya5Y2Y6Kqe44Oq44K544OICmBgYHtyfQp0eHQ8LXJlYWRMaW5lcygic2FtcGxlX3RleHRzL3NhbXBsZV9lbi50eHQiKQp3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIltbOnNwYWNlOl1dfFtbOnB1bmN0Ol1dIikKYGBgCgojIyMg5YmN5Zue44Gu6KOc6LazOiDmraPopo/ooajnj74KYGBgClB1bmN0dWF0aW9uIGNoYXJhY3RlcnM6CiEgIiAjICQgJSAmICcgKCApICogKyAsIC0gLiAvIDogOyA8ID0gPiA/IEAgWyBcIF0gXiBfIGAgeyB8IH0gfi4KYGBgCi0gPGEgaHJlZj0iaHR0cHM6Ly9qYS53aWtpcGVkaWEub3JnL3dpa2kvJUU2JUFEJUEzJUU4JUE2JThGJUU4JUExJUE4JUU3JThGJUJFIiB0YXJnZXQ9Il9ibGFuayI+5q2j6KaP6KGo54++PC9hPgoKYGBge3J9CnRtcCA8LSBzdHJzcGxpdCh0eHQsIlxcc3xcXFciKQp0bXBbMV0KYGBgCmBgYHtyfQp0bXAgPC0gc3Ryc3BsaXQodHh0LCJbIFx0XXxbLiwhPzouLi5dIikKdG1wWzFdCmBgYAoKYGBge3J9CnRtcCA8LSBzdHJzcGxpdCh0eHQsIiB8W15hLXpBLVowLTldIikKdG1wWzFdCmBgYAoKIyMjIOWNmOiqnuODquOCueODiO+8iOOBpOOBpeOBje+8iQpgYGB7cn0Kd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCndvcmRMc3Q8LXRvbG93ZXIod29yZExzdCkKd29yZExzdDwtIHdvcmRMc3Rbd29yZExzdCAhPSAiIl0KYGBgCgojIyMg57WQ5p6c6YOo5YiG5Ye65YqbCmBgYHtyfQpoZWFkKHdvcmRMc3QpCmBgYAoKIyMgV29yZCBGcmVxdWVuY2llcwpgYGB7cn0KKGZyZXEgPC0gdGFibGUod29yZExzdCkpCmBgYAoKIyMgU29ydApgYGB7cn0KKGZyZXFfZGF0YTwtc29ydChmcmVxLCBkZWNyZWFzaW5nPVRSVUUpKQpgYGAKCgojIyDjg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3J9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIyDljZjoqp7poLvluqbmlbDliIbluIMo6Imy5LuY44GNKQojIyMjIDxhIGhyZWY9Imh0dHBzOi8vaHRzdWRhLm5ldC9zdGF0cy9wbG90Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5sYXM6IGxhYmVsIHN0eWxlPC9hPgpgYGB7cn0KY29sb3JzID0gYygib3JhbmdlIiwgImxpZ2h0Ymx1ZSIsICJncmVlbiIpIApiYXJwbG90KGZyZXFfZGF0YSwgbGFzPTMsY29sPWNvbG9ycykKYGBgCgoKIyDplqLmlbDjga7kvZzmiJAKIyMg6L+U44KK77yI5oi744KK77yJ5YCk44Gq44GX6Zai5pWw77yacHJpbnRUVFLplqLmlbDjgpLkvZzmiJAKIyMjIOW8leaVsDog5Y2Y6Kqe44Oq44K544OICiMjIyDntZDmnpzlh7rlips6IFRUUuOBrue1kOaenApgYGB7cn0KcHJpbnRUVFI8LSBmdW5jdGlvbih3THN0KSB7CiAgICBudW1fdG9rZW5zIDwtIGxlbmd0aCh3THN0KQogICAgbnVtX3R5cGVzIDwtIGxlbmd0aCh1bmlxdWUod0xzdCkpCiAgICByZXNfVFRSIDwtIG51bV90eXBlcy9udW1fdG9rZW5zICogMTAwCiAgICBwYXN0ZSgiVFRSID0iLCByb3VuZChyZXNfVFRSLDMpKQp9CmBgYAoKIyMgcHJpbnRUVFLplqLmlbDjga7lrp/ooYwKYGBge3J9CnByaW50VFRSKHdvcmRMc3QpCmBgYAoKIyMg6L+U44KK77yI5oi744KK77yJ5YCk44GC44KK6Zai5pWw77yaY2FsY1RUUjHplqLmlbDjgpLkvZzmiJAKIyMjIOW8leaVsDog5Y2Y6Kqe44Oq44K544OICiMjIyDmiLvjgorlgKQ6IFRUUuWApApgYGB7cn0KY2FsY1RUUjE8LSBmdW5jdGlvbih3THN0KSB7CiAgICBudW1fdG9rZW5zIDwtIGxlbmd0aCh3THN0KQogICAgbnVtX3R5cGVzIDwtIGxlbmd0aCh1bmlxdWUod0xzdCkpCiAgICByZXNfVFRSIDwtIG51bV90eXBlcy9udW1fdG9rZW5zICogMTAwCiAgICByZXR1cm4ocm91bmQocmVzX1RUUiwzKSkKfQpgYGAKCiMjIGNhbGNUVFIx6Zai5pWw44Gu5a6f6KGMCmBgYHtyfQpjYWxjVFRSMSh3b3JkTHN0KQpgYGAKCiMjICDmiLvjgorlgKTjga7liKnnlKgKYGBge3J9CnJlcyA8LSBjYWxjVFRSMSh3b3JkTHN0KQpgYGAKIyMjIOWNmOiqnuOBrlRva2VuLCBUeXBlcyAKYGBge3J9CnRva2VucyA8LSBsZW5ndGgod29yZExzdCkKdHlwZXMgPC0gbGVuZ3RoKHVuaXF1ZSh3b3JkTHN0KSkKYGBgCgojIyDov5TjgorvvIjmiLvjgorvvInlgKTjgYLjgorplqLmlbDvvJpjYWxjVFRSMumWouaVsOOCkuS9nOaIkAojIyMg5byV5pWwOiBUb2tlbnMsIFR5cGVzCiMjIyDmiLvjgorlgKQ6IFRUUuioiOeul+WApApgYGB7cn0KY2FsY1RUUjI8LSBmdW5jdGlvbihhcmdfdG9rZW5zLGFyZ190eXBlcykgewogICAgcmVzX1RUUiA8LSBhcmdfdHlwZXMvYXJnX3Rva2VucyAqIDEwMAogICAgcmV0dXJuKHJvdW5kKHJlc19UVFIsMykpCn0KYGBgCgojIyBjYWxjVFRSMumWouaVsOOBruWun+ihjApgYGB7cn0KY2FsY1RUUjIodG9rZW5zLHR5cGVzKQpgYGAKIyMgPHNwYW4gc3R5bGU9ImNvbG9yOiBibHVlOyI+5a6f57+SPC9zcGFuPk1lQ2Fi44Gu44Kk44Oz44K544OI44O844OrCgojIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7Ij7nt7Tnv5I8L3NwYW4+OiDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjgpLoqq3jgb/ovrzjgpPjgafjgIFHdWlyYXVk44Gu5YCk44KS6L+U44GZ6Zai5pWwY2FsY1JUVFLjga7kvZzmiJDjgIAKIyMjIOW8leaVsDogIOiLseaWh+ODhuOCreOCueODiOODleOCoeOCpOODqwojIyMg5oi744KK5YCkOiBHdWlyYXVk6KiI566X5YCkCmBgYHtyLCBlY2hvPUZBTFNFfQpjYWxjUlRUUjwtIGZ1bmN0aW9uKGZuYW1lKSB7CiAgICB0eHQ8LXJlYWRMaW5lcyhmbmFtZSkKICAgIHdvcmRMc3Q8LXN0cnNwbGl0KHR4dCwiW1s6c3BhY2U6XV18W1s6cHVuY3Q6XV0iKQogICAgd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCiAgICB3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCiAgICB3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQogICAgbnVtX3Rva2VucyA8LSBsZW5ndGgod29yZExzdCkKICAgIG51bV90eXBlcyA8LSBsZW5ndGgodW5pcXVlKHdvcmRMc3QpKQogICAgcmVzX0cgPC0gbnVtX3R5cGVzL3NxcnQobnVtX3Rva2VucykKICAgIHJldHVybihyZXNfRykKfQpgYGAKCiMjIyBjYWxjUlRUUumWouaVsOOBruWun+ihjOS+iwpgYGB7cn0KY2FsY1JUVFIoInNhbXBsZV90ZXh0cy9zYW1wbGVfZW4udHh0IikKYGBgCiMg44CQ5qyh6YCx44Gu5LqI5ZGK44CRCiMjIDxhIGhyZWY9Imh0dHBzOi8vc3RhdC5ldGh6LmNoL1ItbWFudWFsL1ItZGV2ZWwvbGlicmFyeS9iYXNlL2h0bWwvbWVyZ2UuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPm1lcmdlPC9hPgojIyDpoLvluqbjg4bjg7zjg5bjg6vjgpLjg4fjg7zjgr/lnovjgavlpInmj5sKYGBge3J9CmZyZXFEYXRhIDwtIGRhdGEuZnJhbWUoZnJlcV9kYXRhKQpoZWFkKGZyZXFEYXRhKQpgYGAKCiMjIOebuOWvvumgu+W6puODhuODvOODluODqwpgYGB7cn0KKHJlbGF0aXZlPC1zb3J0KGZyZXEvdG9rZW5zLCBkZWNyZWFzaW5nPVRSVUUpKQojc3VtKGZyZXEvdG9rZW5zKQpgYGAKIyMg55u45a++6aC75bqm44OG44O844OW44Or44KS44OH44O844K/5Z6L44Gr5aSJ5o+bCmBgYHtyfQoocmVsYXRpdmVEYXRhIDwtIGRhdGEuZnJhbWUocmVsYXRpdmUpKQpgYGAKIyMg77yS44Gk44Gu44OH44O844K/5Z6L5aSJ5pWw44KS6YCj57WQKG1lcmdlKQpgYGB7cn0KZnJlcU10eCA8LSBtZXJnZShmcmVxRGF0YSwgcmVsYXRpdmVEYXRhLCBhbGw9VCwgYnk9IndvcmRMc3QiKQpgYGAKCmBgYHtyfQpoZWFkKGZyZXFNdHgpCmBgYAoKYGBge3J9CmZyZXFNdHhbLDFdCmBgYAoKIyMjIOWHuuePvuWNmOiqnuOBruaDheWgseOCkuihjOODqeODmeODq+OBq+OCs+ODlOODvApgYGB7cn0Kcm93bmFtZXMoZnJlcU10eCk8LWFzLmNoYXJhY3RlcihmcmVxTXR4WywxXSkKYGBgCgojIyDlh7rnj77ljZjoqp7jga7mg4XloLEoMeWIl+ebrinjgpLliYrpmaQKYGBge3J9CmZyZXFNdHg8LWZyZXFNdHhbLTFdCmNvbG5hbWVzKGZyZXFNdHgpIDwtIGMoInJhdyIsICJyZWxhdGl2ZSIpCmBgYAoKIyMg57KX6aC75bqmLeebuOWvvumgu+W6puOBruihjOWIlwpgYGB7cn0KZnJlcU10eApgYGAKCg==