Warmup Practice
操作:2倍する
# Arithmetic is vectorized: every element is multiplied by 2.
2 * c(1, 2, 3, 4, 5)
要素の抽出
# Y is not assigned anywhere above this point, so create the vector before
# indexing it (values taken from the list example used earlier).
Y <- c(1, 2, 3, 4, 5)
# Extract the fourth element (R indexing starts at 1).
Y[4]
length関数: リストの長さ(要素数)
# Build a three-element character vector, then report its number of elements.
str <- c("a", "ab", "abc")
length(x = str)
nchar関数: 文字の長さ
# Number of characters in each element of str ("chars" is the default type).
nchar(str, type = "chars")
sqrt関数: 平方根(square root)を計算する
\[{\sqrt{16}}, {\sqrt{25}}, {\sqrt{256}}
\]
# sqrt() is vectorized: it returns the square root of every element.
numLst <- c(16, 25, 256)
sqrt(x = numLst)
テキストの頻度表作成
テキストファイルの読み込み
# Read the article line by line; each line becomes one element of txt.
txt <- readLines(con = "News_JT_20240527.txt")
練習: 変数txtの要素数(ファイルから読み込んだ行数)を出力
練習: 変数txtの4要素目の情報を抽出
スペース&記号による分割
Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.
正規表現その1: POSIX クラス
# Split every line at whitespace or punctuation using POSIX character classes.
# The result is a list with one character vector per input line.
wordLst <- strsplit(x = txt, split = "[[:space:]]|[[:punct:]]")
正規表現その2
# Same split expressed with an explicit negated character class:
# a space, or any character that is not an ASCII letter or digit.
tmp <- strsplit(x = txt, split = " |[^a-zA-Z0-9]")
正規表現その3:メタ文字
- \w: アルファベット、アラビア数字またはアンダーバー
- \W: アルファベット、数字やアンダーバー以外の文字
- \s: 空白文字
- エスケープシーケンス
# Same split written with regex metacharacters; backslashes are doubled
# because they sit inside an R string literal.
tmp <- strsplit(x = txt, split = "\\s|\\W")
各行のデータを一括化
# Flatten the per-line list into one character vector of tokens.
wordLst <- unlist(wordLst, recursive = TRUE, use.names = TRUE)
小文字に変換
# Normalise case so that "The" and "the" are counted as the same word.
wordLst <- tolower(x = wordLst)
空白 "" の削除
# Drop the empty strings produced when two delimiters are adjacent.
# Alternative form: wordLst <- wordLst[nchar(wordLst) > 0]
wordLst <- wordLst[!(wordLst == "")]
単語のToken数
# Token count: total number of word occurrences.
tokens <- length(x = wordLst)
単語のTypes数
- unique()関数は,リストの重複しない要素を返す
# Type count: number of distinct words (first occurrences only).
types <- length(wordLst[!duplicated(wordLst)])
結果出力
# Report both counts as labelled strings.
print(paste("Tokens =", tokens, sep = " "))
print(paste("Types =", types, sep = " "))
TTR: Type-Token Ratioの計算
\[TTR=\frac{types}{tokens} \times 100
\]
# Type-Token Ratio expressed as a percentage.
(types / tokens) * 100
小数点2桁で結果を出力
# Same ratio, rounded to two decimal places.
round((types / tokens) * 100, digits = 2)
練習: Guiraud値(RTTR: Root Type-Token Ratio)を求める
\[RTTR=\frac{types}{\sqrt{tokens}}
\]
小数点2桁で結果を出力
Word Frequencies
# Count occurrences of each word; table() orders its entries by name.
freq <- table(wordLst)
# Preview the first six entries.
head(freq, n = 6L)
Sort
# Reorder the frequency table from most to least frequent word.
freq_data <- sort(x = freq, decreasing = TRUE)
# Preview the six most frequent words.
head(freq_data, n = 6L)
ファイルに出力
# Save the sorted frequency table as a CSV file in the working directory.
write.csv(x = freq_data, file = "freq_en.csv")
単語頻度数分布(単色)
# Bar chart of word frequencies; las = 3 draws the axis labels vertically.
barplot(height = freq_data, las = 3, col = "orange")
単語頻度数分布(複数色)
# Three fill colours; barplot() recycles them across the bars.
colors <- c("orange", "lightblue", "green")
barplot(height = freq_data, las = 3, col = colors)
オンライン記事から情報を取得
httr, rvestパッケージをインストール
# Install httr and rvest only when they are not already available, so that
# re-running the script does not download and reinstall them every time.
for (pkg in c("httr", "rvest")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
ライブラリの読み込み
library(httr)
library(rvest)
# URL of the Mainichi Shinbun's article
# URL of the Mainichi Shinbun's article
url <- "https://mainichi.jp/english/articles/20240925/p2a/00m/0na/014000c"

# Send a GET request with a browser-like user agent.
response <- GET(url, user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15"))
# Stop with an informative error on an HTTP error status (4xx/5xx) instead of
# silently parsing an error page as if it were the article.
stop_for_status(response)
# Parse the response body into an HTML document.
page <- read_html(response)
Text Cleaning
# Extract the text of the article paragraphs from the parsed page.
# article_content was used below without ever being assigned; this step and
# its "p.txt" selector come from the notebook source embedded at the end of
# this file.
article_content <- html_text(html_nodes(page, "p.txt"), trim = TRUE)

# Remove carriage returns and newlines inside each paragraph.
cleaned_content <- gsub("\\r|\\n", "", article_content)
# Strip leading and trailing whitespace from each paragraph.
cleaned_content <- trimws(cleaned_content)
# Join all paragraphs into a single string.
cleaned_content <- paste(cleaned_content, collapse = "")
# Preview the first 100 characters.
substring(cleaned_content, 1, 100)
練習 :
上の記事の出現単語頻度を、頻度の高い順に出力してみましょう
# Show the 20 most frequent words. head() returns at most 20 entries, whereas
# freq_data[1:20] pads the result with NA when fewer than 20 word types exist.
head(freq_data, 20)
wordLst
the of and university a to
29 17 14 13 12 12
students tuition tokyo that as at
9 9 8 6 5 5
by fees national universities association is
5 5 5 5 4 4
on set
4 4
LS0tCnRpdGxlOiAiTGVjMDI6IOWfuuacrOaTjeS9nCIKb3V0cHV0OiBodG1sX25vdGVib29rCmVkaXRvcl9vcHRpb25zOiAKICBjaHVua19vdXRwdXRfdHlwZTogaW5saW5lCi0tLQojIFdhcm11cCBQcmFjdGljZQojIyDkvZzmpa3jg4fjgqPjg6zjgq/jg4jjg6rjga7norroqo0gCmBgYHtyfQpnZXR3ZCgpCmBgYAoKIyMg44Oq44K544OI44Gu5L2c5oiQCmBgYHtyfQpjKDEsIDIsIDMsIDQsIDUpCmBgYAojIyDmk43kvZzvvJoy5YCN44GZ44KLCmBgYHtyfQpjKDEsIDIsIDMsIDQsIDUpKjIKYGBgCiMjIOWkieaVsOOBq+S7o+WFpQojIyMgPGEgaHJlZj0iaHR0cHM6Ly9zdGF0LmV0aHouY2gvUi1tYW51YWwvUi1kZXZlbC9saWJyYXJ5L2Jhc2UvaHRtbC9hc3NpZ25PcHMuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPuS7o+WFpea8lOeul+WtkDwvYT4KYGBge3J9ClkgPC0gYygxLCAyLCAzLCA0LCA1KQpgYGAKIyMg5Z+65pys5pON5L2c77yaMuWAjeOBmeOCiwpgYGB7cn0KWSoyCmBgYAojIyDln7rmnKzmk43kvZzvvJoy5LmX44GZ44KLCmBgYHtyfQpZXjIKYGBgCiMjIOimgee0oOOBruaKveWHugpgYGB7cn0KWVs0XQpgYGAKIyMjIGxlbmd0aOmWouaVsDog44Oq44K544OI44Gu6ZW344GV77yI6KaB57Sg5pWw77yJCmBgYHtyfQpzdHIgPC0gYyAoImEiLCAiYWIiLCAiYWJjIikKbGVuZ3RoKHN0cikKYGBgCiMjIyBuY2hhcumWouaVsDog5paH5a2X44Gu6ZW344GVCmBgYHtyfQpuY2hhcihzdHIpCmBgYAojIyMgc3FydOmWouaVsO+8miDlubPmlrnmoLkoc3F1cmUgcm9vdCnjgpLoqIjnrpfjgZnjgosKJCR7XHNxcnR7MTZ9fSwge1xzcXJ0ezI1fX0sIHtcc3FydHsyNTZ9fSAkJApgYGB7cn0KbnVtTHN0IDwtIGMgKDE2LDI1LDI1NikKc3FydChudW1Mc3QpCmBgYAojIOODhuOCreOCueODiOOBrumgu+W6puihqOS9nOaIkAojIyDjgrXjg7Pjg5fjg6vjg4bjgq3jgrnjg4gKLSA8YSBocmVmPSJodHRwczovL3d3dy5qYXBhbnRpbWVzLmNvLmpwL25ld3MvMjAyNC8wNS8yNy9qYXBhbi9zb2NpZXR5L3Rva3lvLXVuaXZlcnNpdHktdHVpdGlvbi8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzaXR5IG9mIFRva3lvIGNvbnNpZGVycyB0dWl0aW9uIGhpa2U8L2E+CgojIyDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjga7oqq3jgb/ovrzjgb8KLSDkuIDooYzjgZrjgaToqq3jgb/ovrzjgpPjgafjgIHjg6rjgrnjg4jjgavmoLzntI0KYGBge3J9CnR4dDwtcmVhZExpbmVzKCJOZXdzX0pUXzIwMjQwNTI3LnR4dCIpCmBgYAojIyMg57WQ5p6c5Ye65YqbCmBgYHtyLCBldmFsPUZBTFNFfQpoZWFkKHR4dCkKYGBgCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPue3tOe/kjwvc3Bhbj7lpInmlbB0eHTjga7opoHntKDmlbDvvIjjg5XjgqHjgqTjg6vjgYvjgonoqq3jgb/ovrzjgpPjgaDooYzmlbDvvInjgpLlh7rlipsKYGBge3IsIGVjaG89RkFMU0V9Cmxlbmd0aCh0eHQpCmBgYAoKIyMjIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+57e057+SPC9zcGFuPuWk
ieaVsHR4dOOBrjTopoHntKDnm67jga7mg4XloLHjgpLmir3lh7oKYGBge3IsIGVjaG89RkFMU0V9CnR4dFs0XQpgYGAKCiMjIOOCueODmuODvOOCuSboqJjlj7fjgavjgojjgovliIblibIKYGBgClB1bmN0dWF0aW9uIGNoYXJhY3RlcnM6CiEgIiAjICQgJSAmICcgKCApICogKyAsIC0gLiAvIDogOyA8ID0gPiA/IEAgWyBcIF0gXiBfIGAgeyB8IH0gfi4KYGBgCi0gPGEgaHJlZj0iaHR0cHM6Ly9qYS53aWtpcGVkaWEub3JnL3dpa2kvJUU2JUFEJUEzJUU4JUE2JThGJUU4JUExJUE4JUU3JThGJUJFIiB0YXJnZXQ9Il9ibGFuayI+5q2j6KaP6KGo54++PC9hPgoKIyMjIOato+imj+ihqOePvuOBneOBru+8kTogUE9TSVgg44Kv44Op44K5CmBgYHtyfQp3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIltbOnNwYWNlOl1dfFtbOnB1bmN0Ol1dIikKYGBgCgojIyMg5q2j6KaP6KGo54++44Gd44Gu77ySCmBgYHtyLCBldmFsPUZBTFNFfQp0bXAgPC0gc3Ryc3BsaXQodHh0LCIgfFteYS16QS1aMC05XSIpCmBgYAoKIyMjIOato+imj+ihqOePvuOBneOBru+8k++8muODoeOCv+aWh+WtlwotIFx3OuOCouODq+ODleOCoeODmeODg+ODiOOAgeOCouODqeODk+OCouaVsOWtl+OBvuOBn+OBr+OCouODs+ODgOODvOODkOODvAotIFxXOiDjgqLjg6vjg5XjgqHjg5njg4Pjg4jjgIHmlbDlrZfjgoTjgqLjg7Pjg4Djg7zjg5Djg7zku6XlpJbjga7mloflrZcKLSBcczog56m655m95paH5a2XCi0gPGEgaHJlZj0iaHR0cHM6Ly9zby16b3UuanAvcm9ib3QvdGVjaC9udW1lcmljYWwtYW5hbHlzaXMvci9ncmFtbWFyL3N0cmluZ3MuaHRtIiB0YXJnZXQ9Il9ibGFuayI+44Ko44K544Kx44O844OXIOOCt+ODvOOCseODs+OCuTwvYT4KYGBge3IsIGV2YWw9RkFMU0V9CnRtcCA8LSBzdHJzcGxpdCh0eHQsIlxcc3xcXFciKQpgYGAKCiMjIOWQhOihjOOBruODh+ODvOOCv+OCkuS4gOaLrOWMlgpgYGB7cn0Kd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCmBgYAoKIyMg5bCP5paH5a2X44Gr5aSJ5o+bCmBgYHtyfQp3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCmBgYAoKIyMjIOe1kOaenOS4gOmDqOWHuuWKmwpgYGB7ciwgZXZhbD1GQUxTRX0Kd29yZExzdFsxOjE1XQpgYGAKCiMjIOepuueZvSIi44Gu5YmK6ZmkCmBgYHtyfQojd29yZExzdDwtd29yZExzdFtuY2hhcih3b3JkTHN0KT4wXQp3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQpgYGAKCiMjIyDntZDmnpzkuIDpg6jlh7rlipsKYGBge3IsIGV2YWw9RkFMU0V9CndvcmRMc3RbMToxNV0KYGBgCgojIyDljZjoqp7jga5Ub2tlbuaVsApgYGB7cn0KdG9rZW5zIDwtIGxlbmd0aCh3b3JkTHN0KQpgYGAKCiMjIOWNmOiqnuOBrlR5cGVz5pWwCiogdW5pcXVlKCnplqLmlbDjga/vvIzjg6rjgrnjg4jjga7ph43opIfjgZfjgarjgYTopoHntKDjgpLov5TjgZkKYGBge3J9CnR5cGVzIDwtIGxlbmd0aCh1bmlxdWUod29yZExzdCkpCmBgYAoKIyMjIOe1kOaenOWHuuWKmwpgYGB7cn0KcHJpbnQocGFzdGUoIlRva2VucyA9Iiwg
dG9rZW5zKSkKcHJpbnQocGFzdGUoIlR5cGVzID0iLCB0eXBlcykpCmBgYAoKIyMgVFRSOiBUeXBlLVRva2VuIFJhdGlv44Gu6KiI566XCiQkVFRSPVxmcmFje3R5cGVzfXt0b2tlbnN9IFx0aW1lcyAxMDAgJCQKCmBgYHtyfQp0eXBlcy90b2tlbnMqMTAwCmBgYAoKIyMjIOWwj+aVsOeCuTLmoYHjgafntZDmnpzjgpLlh7rlipsKYGBge3J9CnJvdW5kKHR5cGVzL3Rva2VucyoxMDAsMikKYGBgCgojIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7ICI+57e057+SPC9zcGFuPjogR3VpcmF1ZOWApChSVFRSOiBSb290IFR5cGUtVG9rZW4gUmF0aW8p44KS5rGC44KB44KLCiQkUlRUUj1cZnJhY3t0eXBlc317XHNxcnR7dG9rZW5zfX0gJCQKCiMjIyDlsI/mlbDngrky5qGB44Gn57WQ5p6c44KS5Ye65YqbCmBgYHtyLCBlY2hvPUZBTFNFfQpyb3VuZCh0eXBlcy9zcXJ0KHRva2VucyksMikKYGBgCgojIyBXb3JkIEZyZXF1ZW5jaWVzCmBgYHtyfQpmcmVxIDwtIHRhYmxlKHdvcmRMc3QpCmhlYWQoZnJlcSkKYGBgCgojIyBTb3J0CmBgYHtyfQpmcmVxX2RhdGE8LXNvcnQoZnJlcSwgZGVjcmVhc2luZz1UUlVFKQpoZWFkKGZyZXFfZGF0YSkKYGBgCgojIyDjg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3IsIGV2YWw9RkFMU0V9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjljZjoibIpCiMjIyA8YSBocmVmPSJodHRwczovL2h0c3VkYS5uZXQvc3RhdHMvcGxvdC5odG1sIiB0YXJnZXQ9Il9ibGFuayI+bGFzOiBsYWJlbCBzdHlsZTwvYT4KYGBge3J9CmJhcnBsb3QoZnJlcV9kYXRhLCBsYXM9Myxjb2w9Im9yYW5nZSIpCmBgYAoKIyMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjopIfmlbDoibIpCmBgYHtyfQpjb2xvcnMgPSBjKCJvcmFuZ2UiLCAibGlnaHRibHVlIiwgImdyZWVuIikgCmJhcnBsb3QoZnJlcV9kYXRhLCBsYXM9Myxjb2w9Y29sb3JzKQpgYGAKCgojIyDjgqrjg7Pjg6njgqTjg7PoqJjkuovjgYvjgonmg4XloLHjgpLlj5blvpcKIyMjIGh0dHIsIHJ2ZXN044OR44OD44Kx44O844K444KS44Kk44Oz44K544OI44O844OrCi0gPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL2h0dHIvaW5kZXguaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPmh0dHI6IFRvb2xzIGZvciBXb3JraW5nIHdpdGggVVJMcyBhbmQgSFRUUDwvYT4KLSA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvcnZlc3QvaW5kZXguaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPnJ2ZXN0OiBFYXNpbHkgSGFydmVzdCAoU2NyYXBlKSBXZWIgUGFnZXM8L2E+CmBgYHtyLCBldmFsID1GQUxTRX0KaW5zdGFsbC5wYWNrYWdlcygiaHR0ciIpCmluc3RhbGwucGFja2FnZXMoInJ2ZXN0IikKYGBgCiMjIyDjg6njgqTjg5bjg6njg6rjga7oqq3jgb/ovrzjgb8KYGBge3J9CmxpYnJhcnkoaHR0cikKbGlicmFyeShydmVzdCkKYGBgCgojIyMgPGEgaHJlZj0i
aHR0cHM6Ly9tYWluaWNoaS5qcC9lbmdsaXNoL2FydGljbGVzLzIwMjQwOTI1L3AyYS8wMG0vMG5hLzAxNDAwMGMiIHRhcmdldD0iX2JsYW5rIj5NYWluaWNoaSBKYXBhbidzIEFydGljbGUgdGl0bGVkICJFbGl0ZSBVbml2ZXJzaXR5IG9mIFRva3lvIHRvIGhpa2UgdHVpdGlvbiBmZWVzIGJ5IDIwJTsgZmlyc3QgcmlzZSBpbiAyMHlyczwvYT4KYGBge3J9CiMgVVJMIG9mIHRoZSBNYWluaWNoaSBTaGluYnVuJ3MgYXJ0aWNsZQp1cmwgPC0gImh0dHBzOi8vbWFpbmljaGkuanAvZW5nbGlzaC9hcnRpY2xlcy8yMDI0MDkyNS9wMmEvMDBtLzBuYS8wMTQwMDBjIgoKIyBTZW5kIGEgR0VUIHJlcXVlc3Qgd2l0aCBhIHVzZXIgYWdlbnQKcmVzcG9uc2UgPC0gR0VUKHVybCwgdXNlcl9hZ2VudCgiTW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTRfNykgQXBwbGVXZWJLaXQvNjA1LjEuMTUgKEtIVE1MLCBsaWtlIEdlY2tvKSBWZXJzaW9uLzE4LjAgU2FmYXJpLzYwNS4xLjE1IikpCnBhZ2UgPC0gcmVhZF9odG1sKHJlc3BvbnNlKQpgYGAKCiMjIyBFeHRyYWN0IHRoZSBjb250ZW50CmBgYHtyfQphcnRpY2xlX2NvbnRlbnQgPC0gaHRtbF90ZXh0KGh0bWxfbm9kZXMocGFnZSwgInAudHh0IiksIHRyaW0gPSBUUlVFKQpgYGAKCiMjIyBUZXh0IENsZWFuaW5nCi0gPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2Jhc2UvdmVyc2lvbnMvMy42LjIvdG9waWNzL3RyaW13cyIgdGFyZ2V0PSJfYmxhbmsiPnRyaW13czogUmVtb3ZlIGxlYWRpbmcgYW5kL29yIHRyYWlsaW5nIHdoaXRlc3BhY2UgZnJvbSBjaGFyYWN0ZXIgc3RyaW5nczwvYT4KLSA8YSBocmVmPSJodHRwczovL3d3dy5yZG9jdW1lbnRhdGlvbi5vcmcvcGFja2FnZXMvYmFzZS92ZXJzaW9ucy8zLjYuMi90b3BpY3MvZ3JlcCIgdGFyZ2V0PSJfYmxhbmsiPmdzdWI6IFBhdHRlcm4gTWF0Y2hpbmcgYW5kIFJlcGxhY2VtZW50PC9hPgotIDxhIGhyZWY9Imh0dHBzOi8vd3d3LnJkb2N1bWVudGF0aW9uLm9yZy9wYWNrYWdlcy9iYXNlL3ZlcnNpb25zLzMuNi4yL3RvcGljcy9wYXN0ZSIgdGFyZ2V0PSJfYmxhbmsiPnBhc3RlOiBDb25jYXRlbmF0ZSBTdHJpbmdzPC9hPgoKYGBge3J9CmNsZWFuZWRfY29udGVudCA8LSBnc3ViKCJcXHJ8XFxuIiwgIiIsIGFydGljbGVfY29udGVudCkKY2xlYW5lZF9jb250ZW50IDwtIHRyaW13cyhjbGVhbmVkX2NvbnRlbnQpCmNsZWFuZWRfY29udGVudCA8LSBwYXN0ZShjbGVhbmVkX2NvbnRlbnQsIGNvbGxhcHNlID0gIiIpCnN1YnN0cmluZyhjbGVhbmVkX2NvbnRlbnQsIDEsIDEwMCkKYGBgCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPue3tOe/kjwvc3Bhbj4gOiDkuIrjga7oqJjkuovjga7lh7rnj77ljZjoqp7poLvluqbjgpLjgIHpoLvluqbjga7pq5jjgYTpoIbjgavlh7rlipvjgZfjgabjgb/jgb7jgZfjgofjgYYKLSDkuIvjga7lh7rlipvntZDmnpzjga/jgIHpq5jpoLvluqbjga7kuIDpg6jj
gpLmipznsovjgZfjgZ/jgoLjga4KYGBge3IsIGVjaG89RkFMU0V9CndvcmRMc3Q8LXN0cnNwbGl0KHR4dCwiW1s6c3BhY2U6XV18W1s6cHVuY3Q6XV0iKQp3b3JkTHN0PC11bmxpc3Qod29yZExzdCkKd29yZExzdDwtdG9sb3dlcih3b3JkTHN0KQp3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQpmcmVxIDwtIHRhYmxlKHdvcmRMc3QpCmZyZXFfZGF0YTwtc29ydChmcmVxLCBkZWNyZWFzaW5nPVRSVUUpCmBgYAoKYGBge3J9CmZyZXFfZGF0YVsxOjIwXQpgYGAKCgo=