Warmup Practice

作業ディレクトリの確認

# Show the current working directory; relative file paths are resolved against it.
getwd()

リストの作成

# c() combines its arguments into a single vector.
c(1, 2, 3, 4, 5)

操作:2倍する

# Arithmetic is vectorized: every element is multiplied by 2.
c(1, 2, 3, 4, 5) * 2

変数に代入

代入演算子

# Store the vector in Y using the assignment operator `<-`.
Y <- c(1, 2, 3, 4, 5)

基本操作:2倍する

# Double every element of Y (Y itself is not modified).
Y * 2

基本操作:2乗する

# Square every element of Y (element-wise power).
Y^2

要素の抽出

# Extract the 4th element; R indexing starts at 1.
Y[4]

length関数: リストの長さ(要素数)

# A character vector with three strings of different lengths.
str <- c("a", "ab", "abc")
# length() counts the elements of the vector, not the characters.
length(str)

nchar関数: 文字の長さ

# nchar() is vectorized: it returns the character count of each element of str.
nchar(str)

sqrt関数: 平方根(square root)を計算する

\[{\sqrt{16}}, {\sqrt{25}}, {\sqrt{256}} \]

# Numbers whose square roots are computed below.
numLst <- c(16, 25, 256)
# sqrt() is applied to each element of the vector.
sqrt(numLst)

テキストの頻度表作成

テキストファイルの読み込み

  • 一行ずつ読み込んで、リストに格納
# Read the text file line by line; each line becomes one element of txt.
txt <- readLines("News_JT_20240527.txt")

結果出力

# Preview the first elements of txt (head() shows up to six by default).
head(txt)

練習: 変数txtの要素数(ファイルから読み込んだ行数)を出力

練習: 変数txtの4要素目の情報を抽出

スペース&記号による分割

Punctuation characters:
! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~.

正規表現その1: POSIX クラス

# Split every line at each whitespace or punctuation character (POSIX classes).
# The result is a list holding one character vector per input line.
wordLst <- strsplit(txt, split = "[[:space:]]|[[:punct:]]")

正規表現その2

# Same split written with an explicit character range:
# break at a space or at any character that is not a letter or digit.
tmp <- strsplit(txt, split = " |[^a-zA-Z0-9]")

正規表現その3:メタ文字

  • \w: アルファベット、アラビア数字またはアンダーバー
  • \W: アルファベット、数字やアンダーバー以外の文字
  • \s: 空白文字
  • エスケープ シーケンス
# Same split written with metacharacters; the backslashes are doubled
# because they sit inside an R string literal.
tmp <- strsplit(txt, split = "\\s|\\W")

各行のデータを一括化

# Flatten the per-line list into one character vector of tokens.
wordLst <- unlist(wordLst)

小文字に変換

# Lower-case every token so that "The" and "the" are counted as the same word.
wordLst <- tolower(wordLst)

結果一部出力

# Show up to the first 15 tokens. head() stops at the end of the vector,
# whereas wordLst[1:15] would pad the output with NA for a short text.
head(wordLst, 15)

空文字列 "" の削除

# Drop the empty strings that strsplit() leaves between adjacent delimiters.
# Equivalent alternative: wordLst <- wordLst[nchar(wordLst) > 0]
wordLst <- wordLst[wordLst != ""]

結果一部出力

# Show up to the first 15 tokens after cleaning. head() stops at the end of
# the vector, whereas wordLst[1:15] would pad the output with NA for a short text.
head(wordLst, 15)

単語のToken数

# Token count: total number of words in wordLst, duplicates included.
tokens <- length(wordLst)

単語のTypes数

  • unique()関数は,リストの重複しない要素を返す
# Type count: number of distinct words in wordLst.
types <- length(unique(wordLst))

結果出力

# Report both counts; paste() joins the label and the number with a space.
print(paste("Tokens =", tokens))
print(paste("Types =", types))

TTR: Type-Token Ratioの計算

\[TTR=\frac{types}{tokens} \times 100 \]

# Type-token ratio expressed as a percentage.
types / tokens * 100

小数点2桁で結果を出力

# The same ratio rounded to two decimal places.
round(types / tokens * 100, digits = 2)

練習: Guiraud値(RTTR: Root Type-Token Ratio)を求める

\[RTTR=\frac{types}{\sqrt{tokens}} \]

小数点2桁で結果を出力

Word Frequencies

# table() counts how many times each distinct word occurs in wordLst.
freq <- table(wordLst)
# Preview the first few counts.
head(freq)

Sort

# Order the counts from most to least frequent.
freq_data <- sort(freq, decreasing = TRUE)
# Preview the most frequent words.
head(freq_data)

ファイルに出力

# Save the sorted frequency table as a CSV file (relative path, so it lands in the working directory).
# NOTE(review): with the default row.names = TRUE the file also gets a leading index column - confirm that is wanted.
write.csv(freq_data, "freq_en.csv")

単語頻度数分布(単色)

las: label style

# Bar chart of word frequencies in a single color; las = 3 draws the axis labels vertically.
barplot(freq_data, las = 3, col = "orange")

単語頻度数分布(複数色)

# Three fill colors; barplot() recycles them across the bars.
# Assigned with `<-`, the assignment operator used everywhere else in this script.
colors <- c("orange", "lightblue", "green")
barplot(freq_data, las = 3, col = colors)

オンライン記事から情報を取得

httr, rvestパッケージをインストール

# Install httr and rvest only when they are missing, so that re-running the
# script does not download and reinstall them every time.
for (pkg in c("httr", "rvest")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

ライブラリの読み込み

library(httr)
library(rvest)

Mainichi Japan’s Article titled “Elite University of Tokyo to hike tuition fees by 20%; first rise in 20yrs”

# URL of the Mainichi Shimbun article
url <- "https://mainichi.jp/english/articles/20240925/p2a/00m/0na/014000c"

# Send a GET request with a browser-like user agent
response <- GET(
  url,
  user_agent("Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15")
)
# Stop with an error on an HTTP failure (4xx/5xx) instead of silently
# parsing an error page as if it were the article.
stop_for_status(response)
page <- read_html(response)

Extract the content

# Select the <p class="txt"> elements, then take their text with
# leading and trailing whitespace trimmed.
paragraph_nodes <- html_nodes(page, "p.txt")
article_content <- html_text(paragraph_nodes, trim = TRUE)

Text Cleaning

# Replace each run of line breaks with a single space. Deleting them outright
# would fuse the last word of one line with the first word of the next.
cleaned_content <- gsub("(\\r|\\n)+", " ", article_content)
# Strip leading/trailing whitespace from every paragraph.
cleaned_content <- trimws(cleaned_content)
# Join the paragraphs into one string, separated by a space so that words at
# paragraph boundaries stay separate for later tokenization.
cleaned_content <- paste(cleaned_content, collapse = " ")
# Preview the first 100 characters.
substring(cleaned_content, 1, 100)

練習 : 上の記事の出現単語頻度を、頻度の高い順に出力してみましょう

  • 下の出力結果は、高頻度の一部を抜粋したもの
# Show up to the 20 most frequent words. head() stops at the end of the table,
# whereas freq_data[1:20] would add NA entries when there are fewer than 20 types.
head(freq_data, 20)
wordLst
         the           of          and   university            a           to 
          29           17           14           13           12           12 
    students      tuition        tokyo         that           as           at 
           9            9            8            6            5            5 
          by         fees     national universities  association           is 
           5            5            5            5            4            4 
          on          set 
           4            4 
LS0tCnRpdGxlOiAiTGVjMDI6IOWfuuacrOaTjeS9nCIKb3V0cHV0OiBodG1sX25vdGVib29rCmVkaXRvcl9vcHRpb25zOiAKICBjaHVua19vdXRwdXRfdHlwZTogaW5saW5lCi0tLQojIFdhcm11cCBQcmFjdGljZQojIyDkvZzmpa3jg4fjgqPjg6zjgq/jg4jjg6rjga7norroqo0gCmBgYHtyfQpnZXR3ZCgpCmBgYAoKIyMg44Oq44K544OI44Gu5L2c5oiQCmBgYHtyfQpjKDEsIDIsIDMsIDQsIDUpCmBgYAojIyDmk43kvZzvvJoy5YCN44GZ44KLCmBgYHtyfQpjKDEsIDIsIDMsIDQsIDUpKjIKYGBgCiMjIOWkieaVsOOBq+S7o+WFpQojIyMgPGEgaHJlZj0iaHR0cHM6Ly9zdGF0LmV0aHouY2gvUi1tYW51YWwvUi1kZXZlbC9saWJyYXJ5L2Jhc2UvaHRtbC9hc3NpZ25PcHMuaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPuS7o+WFpea8lOeul+WtkDwvYT4KYGBge3J9ClkgPC0gYygxLCAyLCAzLCA0LCA1KQpgYGAKIyMg5Z+65pys5pON5L2c77yaMuWAjeOBmeOCiwpgYGB7cn0KWSoyCmBgYAojIyDln7rmnKzmk43kvZzvvJoy5LmX44GZ44KLCmBgYHtyfQpZXjIKYGBgCiMjIOimgee0oOOBruaKveWHugpgYGB7cn0KWVs0XQpgYGAKIyMjIGxlbmd0aOmWouaVsDog44Oq44K544OI44Gu6ZW344GV77yI6KaB57Sg5pWw77yJCmBgYHtyfQpzdHIgPC0gYyAoImEiLCAiYWIiLCAiYWJjIikKbGVuZ3RoKHN0cikKYGBgCiMjIyBuY2hhcumWouaVsDog5paH5a2X44Gu6ZW344GVCmBgYHtyfQpuY2hhcihzdHIpCmBgYAojIyMgc3FydOmWouaVsO+8miDlubPmlrnmoLkoc3F1cmUgcm9vdCnjgpLoqIjnrpfjgZnjgosKJCR7XHNxcnR7MTZ9fSwge1xzcXJ0ezI1fX0sIHtcc3FydHsyNTZ9fSAkJApgYGB7cn0KbnVtTHN0IDwtIGMgKDE2LDI1LDI1NikKc3FydChudW1Mc3QpCmBgYAojIOODhuOCreOCueODiOOBrumgu+W6puihqOS9nOaIkAojIyDjgrXjg7Pjg5fjg6vjg4bjgq3jgrnjg4gKLSA8YSBocmVmPSJodHRwczovL3d3dy5qYXBhbnRpbWVzLmNvLmpwL25ld3MvMjAyNC8wNS8yNy9qYXBhbi9zb2NpZXR5L3Rva3lvLXVuaXZlcnNpdHktdHVpdGlvbi8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzaXR5IG9mIFRva3lvIGNvbnNpZGVycyB0dWl0aW9uIGhpa2U8L2E+CgojIyDjg4bjgq3jgrnjg4jjg5XjgqHjgqTjg6vjga7oqq3jgb/ovrzjgb8KLSDkuIDooYzjgZrjgaToqq3jgb/ovrzjgpPjgafjgIHjg6rjgrnjg4jjgavmoLzntI0KYGBge3J9CnR4dDwtcmVhZExpbmVzKCJOZXdzX0pUXzIwMjQwNTI3LnR4dCIpCmBgYAojIyMg57WQ5p6c5Ye65YqbCmBgYHtyLCBldmFsPUZBTFNFfQpoZWFkKHR4dCkKYGBgCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPue3tOe/kjwvc3Bhbj7lpInmlbB0eHTjga7opoHntKDmlbDvvIjjg5XjgqHjgqTjg6vjgYvjgonoqq3jgb/ovrzjgpPjgaDooYzmlbDvvInjgpLlh7rlipsKYGBge3IsIGVjaG89RkFMU0V9Cmxlbmd0aCh0eHQpCmBgYAoKIyMjIDxzcGFuIHN0eWxlPSJjb2xvcjpibHVlOyI+57e057+SPC9zcGFuPuWk
ieaVsHR4dOOBrjTopoHntKDnm67jga7mg4XloLHjgpLmir3lh7oKYGBge3IsIGVjaG89RkFMU0V9CnR4dFs0XQpgYGAKCiMjIOOCueODmuODvOOCuSboqJjlj7fjgavjgojjgovliIblibIKYGBgClB1bmN0dWF0aW9uIGNoYXJhY3RlcnM6CiEgIiAjICQgJSAmICcgKCApICogKyAsIC0gLiAvIDogOyA8ID0gPiA/IEAgWyBcIF0gXiBfIGAgeyB8IH0gfi4KYGBgCi0gPGEgaHJlZj0iaHR0cHM6Ly9qYS53aWtpcGVkaWEub3JnL3dpa2kvJUU2JUFEJUEzJUU4JUE2JThGJUU4JUExJUE4JUU3JThGJUJFIiB0YXJnZXQ9Il9ibGFuayI+5q2j6KaP6KGo54++PC9hPgoKIyMjIOato+imj+ihqOePvuOBneOBru+8kTogUE9TSVgg44Kv44Op44K5CmBgYHtyfQp3b3JkTHN0PC1zdHJzcGxpdCh0eHQsIltbOnNwYWNlOl1dfFtbOnB1bmN0Ol1dIikKYGBgCgojIyMg5q2j6KaP6KGo54++44Gd44Gu77ySCmBgYHtyLCBldmFsPUZBTFNFfQp0bXAgPC0gc3Ryc3BsaXQodHh0LCIgfFteYS16QS1aMC05XSIpCmBgYAoKIyMjIOato+imj+ihqOePvuOBneOBru+8k++8muODoeOCv+aWh+WtlwotIFx3OuOCouODq+ODleOCoeODmeODg+ODiOOAgeOCouODqeODk+OCouaVsOWtl+OBvuOBn+OBr+OCouODs+ODgOODvOODkOODvAotIFxXOiDjgqLjg6vjg5XjgqHjg5njg4Pjg4jjgIHmlbDlrZfjgoTjgqLjg7Pjg4Djg7zjg5Djg7zku6XlpJbjga7mloflrZcKLSBcczog56m655m95paH5a2XCi0gPGEgaHJlZj0iaHR0cHM6Ly9zby16b3UuanAvcm9ib3QvdGVjaC9udW1lcmljYWwtYW5hbHlzaXMvci9ncmFtbWFyL3N0cmluZ3MuaHRtIiB0YXJnZXQ9Il9ibGFuayI+44Ko44K544Kx44O844OXIOOCt+ODvOOCseODs+OCuTwvYT4KYGBge3IsIGV2YWw9RkFMU0V9CnRtcCA8LSBzdHJzcGxpdCh0eHQsIlxcc3xcXFciKQpgYGAKCiMjIOWQhOihjOOBruODh+ODvOOCv+OCkuS4gOaLrOWMlgpgYGB7cn0Kd29yZExzdDwtdW5saXN0KHdvcmRMc3QpCmBgYAoKIyMg5bCP5paH5a2X44Gr5aSJ5o+bCmBgYHtyfQp3b3JkTHN0PC10b2xvd2VyKHdvcmRMc3QpCmBgYAoKIyMjIOe1kOaenOS4gOmDqOWHuuWKmwpgYGB7ciwgZXZhbD1GQUxTRX0Kd29yZExzdFsxOjE1XQpgYGAKCiMjIOepuueZvSIi44Gu5YmK6ZmkCmBgYHtyfQojd29yZExzdDwtd29yZExzdFtuY2hhcih3b3JkTHN0KT4wXQp3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQpgYGAKCiMjIyDntZDmnpzkuIDpg6jlh7rlipsKYGBge3IsIGV2YWw9RkFMU0V9CndvcmRMc3RbMToxNV0KYGBgCgojIyDljZjoqp7jga5Ub2tlbuaVsApgYGB7cn0KdG9rZW5zIDwtIGxlbmd0aCh3b3JkTHN0KQpgYGAKCiMjIOWNmOiqnuOBrlR5cGVz5pWwCiogdW5pcXVlKCnplqLmlbDjga/vvIzjg6rjgrnjg4jjga7ph43opIfjgZfjgarjgYTopoHntKDjgpLov5TjgZkKYGBge3J9CnR5cGVzIDwtIGxlbmd0aCh1bmlxdWUod29yZExzdCkpCmBgYAoKIyMjIOe1kOaenOWHuuWKmwpgYGB7cn0KcHJpbnQocGFzdGUoIlRva2VucyA9Iiwg
dG9rZW5zKSkKcHJpbnQocGFzdGUoIlR5cGVzID0iLCB0eXBlcykpCmBgYAoKIyMgVFRSOiBUeXBlLVRva2VuIFJhdGlv44Gu6KiI566XCiQkVFRSPVxmcmFje3R5cGVzfXt0b2tlbnN9IFx0aW1lcyAxMDAgJCQKCmBgYHtyfQp0eXBlcy90b2tlbnMqMTAwCmBgYAoKIyMjIOWwj+aVsOeCuTLmoYHjgafntZDmnpzjgpLlh7rlipsKYGBge3J9CnJvdW5kKHR5cGVzL3Rva2VucyoxMDAsMikKYGBgCgojIyA8c3BhbiBzdHlsZT0iY29sb3I6IGJsdWU7ICI+57e057+SPC9zcGFuPjogR3VpcmF1ZOWApChSVFRSOiBSb290IFR5cGUtVG9rZW4gUmF0aW8p44KS5rGC44KB44KLCiQkUlRUUj1cZnJhY3t0eXBlc317XHNxcnR7dG9rZW5zfX0gJCQKCiMjIyDlsI/mlbDngrky5qGB44Gn57WQ5p6c44KS5Ye65YqbCmBgYHtyLCBlY2hvPUZBTFNFfQpyb3VuZCh0eXBlcy9zcXJ0KHRva2VucyksMikKYGBgCgojIyBXb3JkIEZyZXF1ZW5jaWVzCmBgYHtyfQpmcmVxIDwtIHRhYmxlKHdvcmRMc3QpCmhlYWQoZnJlcSkKYGBgCgojIyBTb3J0CmBgYHtyfQpmcmVxX2RhdGE8LXNvcnQoZnJlcSwgZGVjcmVhc2luZz1UUlVFKQpoZWFkKGZyZXFfZGF0YSkKYGBgCgojIyDjg5XjgqHjgqTjg6vjgavlh7rlipsKYGBge3IsIGV2YWw9RkFMU0V9CndyaXRlLmNzdihmcmVxX2RhdGEsICJmcmVxX2VuLmNzdiIpCmBgYAoKCiMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjljZjoibIpCiMjIyA8YSBocmVmPSJodHRwczovL2h0c3VkYS5uZXQvc3RhdHMvcGxvdC5odG1sIiB0YXJnZXQ9Il9ibGFuayI+bGFzOiBsYWJlbCBzdHlsZTwvYT4KYGBge3J9CmJhcnBsb3QoZnJlcV9kYXRhLCBsYXM9Myxjb2w9Im9yYW5nZSIpCmBgYAoKIyMjIOWNmOiqnumgu+W6puaVsOWIhuW4gyjopIfmlbDoibIpCmBgYHtyfQpjb2xvcnMgPSBjKCJvcmFuZ2UiLCAibGlnaHRibHVlIiwgImdyZWVuIikgCmJhcnBsb3QoZnJlcV9kYXRhLCBsYXM9Myxjb2w9Y29sb3JzKQpgYGAKCgojIyDjgqrjg7Pjg6njgqTjg7PoqJjkuovjgYvjgonmg4XloLHjgpLlj5blvpcKIyMjIGh0dHIsIHJ2ZXN044OR44OD44Kx44O844K444KS44Kk44Oz44K544OI44O844OrCi0gPGEgaHJlZj0iaHR0cHM6Ly9jcmFuLnItcHJvamVjdC5vcmcvd2ViL3BhY2thZ2VzL2h0dHIvaW5kZXguaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPmh0dHI6IFRvb2xzIGZvciBXb3JraW5nIHdpdGggVVJMcyBhbmQgSFRUUDwvYT4KLSA8YSBocmVmPSJodHRwczovL2NyYW4uci1wcm9qZWN0Lm9yZy93ZWIvcGFja2FnZXMvcnZlc3QvaW5kZXguaHRtbCIgdGFyZ2V0PSJfYmxhbmsiPnJ2ZXN0OiBFYXNpbHkgSGFydmVzdCAoU2NyYXBlKSBXZWIgUGFnZXM8L2E+CmBgYHtyLCBldmFsID1GQUxTRX0KaW5zdGFsbC5wYWNrYWdlcygiaHR0ciIpCmluc3RhbGwucGFja2FnZXMoInJ2ZXN0IikKYGBgCiMjIyDjg6njgqTjg5bjg6njg6rjga7oqq3jgb/ovrzjgb8KYGBge3J9CmxpYnJhcnkoaHR0cikKbGlicmFyeShydmVzdCkKYGBgCgojIyMgPGEgaHJlZj0i
aHR0cHM6Ly9tYWluaWNoaS5qcC9lbmdsaXNoL2FydGljbGVzLzIwMjQwOTI1L3AyYS8wMG0vMG5hLzAxNDAwMGMiIHRhcmdldD0iX2JsYW5rIj5NYWluaWNoaSBKYXBhbidzIEFydGljbGUgdGl0bGVkICJFbGl0ZSBVbml2ZXJzaXR5IG9mIFRva3lvIHRvIGhpa2UgdHVpdGlvbiBmZWVzIGJ5IDIwJTsgZmlyc3QgcmlzZSBpbiAyMHlyczwvYT4KYGBge3J9CiMgVVJMIG9mIHRoZSBNYWluaWNoaSBTaGluYnVuJ3MgYXJ0aWNsZQp1cmwgPC0gImh0dHBzOi8vbWFpbmljaGkuanAvZW5nbGlzaC9hcnRpY2xlcy8yMDI0MDkyNS9wMmEvMDBtLzBuYS8wMTQwMDBjIgoKIyBTZW5kIGEgR0VUIHJlcXVlc3Qgd2l0aCBhIHVzZXIgYWdlbnQKcmVzcG9uc2UgPC0gR0VUKHVybCwgdXNlcl9hZ2VudCgiTW96aWxsYS81LjAgKE1hY2ludG9zaDsgSW50ZWwgTWFjIE9TIFggMTRfNykgQXBwbGVXZWJLaXQvNjA1LjEuMTUgKEtIVE1MLCBsaWtlIEdlY2tvKSBWZXJzaW9uLzE4LjAgU2FmYXJpLzYwNS4xLjE1IikpCnBhZ2UgPC0gcmVhZF9odG1sKHJlc3BvbnNlKQpgYGAKCiMjIyBFeHRyYWN0IHRoZSBjb250ZW50CmBgYHtyfQphcnRpY2xlX2NvbnRlbnQgPC0gaHRtbF90ZXh0KGh0bWxfbm9kZXMocGFnZSwgInAudHh0IiksIHRyaW0gPSBUUlVFKQpgYGAKCiMjIyBUZXh0IENsZWFuaW5nCi0gPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2Jhc2UvdmVyc2lvbnMvMy42LjIvdG9waWNzL3RyaW13cyIgdGFyZ2V0PSJfYmxhbmsiPnRyaW13czogUmVtb3ZlIGxlYWRpbmcgYW5kL29yIHRyYWlsaW5nIHdoaXRlc3BhY2UgZnJvbSBjaGFyYWN0ZXIgc3RyaW5nczwvYT4KLSA8YSBocmVmPSJodHRwczovL3d3dy5yZG9jdW1lbnRhdGlvbi5vcmcvcGFja2FnZXMvYmFzZS92ZXJzaW9ucy8zLjYuMi90b3BpY3MvZ3JlcCIgdGFyZ2V0PSJfYmxhbmsiPmdzdWI6IFBhdHRlcm4gTWF0Y2hpbmcgYW5kIFJlcGxhY2VtZW50PC9hPgotIDxhIGhyZWY9Imh0dHBzOi8vd3d3LnJkb2N1bWVudGF0aW9uLm9yZy9wYWNrYWdlcy9iYXNlL3ZlcnNpb25zLzMuNi4yL3RvcGljcy9wYXN0ZSIgdGFyZ2V0PSJfYmxhbmsiPnBhc3RlOiBDb25jYXRlbmF0ZSBTdHJpbmdzPC9hPgoKYGBge3J9CmNsZWFuZWRfY29udGVudCA8LSBnc3ViKCJcXHJ8XFxuIiwgIiIsIGFydGljbGVfY29udGVudCkKY2xlYW5lZF9jb250ZW50IDwtIHRyaW13cyhjbGVhbmVkX2NvbnRlbnQpCmNsZWFuZWRfY29udGVudCA8LSBwYXN0ZShjbGVhbmVkX2NvbnRlbnQsIGNvbGxhcHNlID0gIiIpCnN1YnN0cmluZyhjbGVhbmVkX2NvbnRlbnQsIDEsIDEwMCkKYGBgCiMjIyA8c3BhbiBzdHlsZT0iY29sb3I6Ymx1ZTsiPue3tOe/kjwvc3Bhbj4gOiDkuIrjga7oqJjkuovjga7lh7rnj77ljZjoqp7poLvluqbjgpLjgIHpoLvluqbjga7pq5jjgYTpoIbjgavlh7rlipvjgZfjgabjgb/jgb7jgZfjgofjgYYKLSDkuIvjga7lh7rlipvntZDmnpzjga/jgIHpq5jpoLvluqbjga7kuIDpg6jj
gpLmipznsovjgZfjgZ/jgoLjga4KYGBge3IsIGVjaG89RkFMU0V9CndvcmRMc3Q8LXN0cnNwbGl0KHR4dCwiW1s6c3BhY2U6XV18W1s6cHVuY3Q6XV0iKQp3b3JkTHN0PC11bmxpc3Qod29yZExzdCkKd29yZExzdDwtdG9sb3dlcih3b3JkTHN0KQp3b3JkTHN0PC0gd29yZExzdFt3b3JkTHN0ICE9ICIiXQpmcmVxIDwtIHRhYmxlKHdvcmRMc3QpCmZyZXFfZGF0YTwtc29ydChmcmVxLCBkZWNyZWFzaW5nPVRSVUUpCmBgYAoKYGBge3J9CmZyZXFfZGF0YVsxOjIwXQpgYGAKCgo=