作業ディレクトリの設定

# Switch the working directory to /cloud/project so that the relative paths
# used further down (JSON file, sourced R scripts) resolve from there.
setwd("/cloud/project")

絵文字・装飾文字の処理

参考資料

サンプル文

# Sample sentences in English (en), Japanese (ja) and Persian (fa), each with
# three emoji embedded in the text.
text_en <- "Wishing ✨ you cozy autumn days 🎃 filled with colorful leaves 🍁 "
text_ja <- "色鮮やかな葉 🍁 とともに、ほっこりとした秋の日々を ✨ お過ごしください🎃 !"
text_fa <- "آرزوی ✨ روزهای پاییزی دنج 🎃 برای شما، پر از برگ‌های رنگارنگ 🍁!"

絵文字・装飾文字の検索

# Character class covering six Unicode code-point ranges that hold emoji and
# decorative symbols.
uni_emoji_pattern <- "[\U{1F600}-\U{1F64F}\U{1F300}-\U{1F5FF}\U{1F680}-\U{1F6FF}\U{1F1E0}-\U{1F1FF}\U{2600}-\U{26FF}\U{2700}-\U{27BF}]"

# Locate every match of the pattern in text_en.
emoji_positions <- gregexpr(pattern = uni_emoji_pattern, text = text_en)

# Pull the matched characters out of text_en using those match positions.
regmatches(x = text_en, m = emoji_positions)
[[1]]
[1] "✨" "🎃" "🍁"

絵文字・装飾文字の削除

# Replace every pattern match in text_en with the empty string.
gsub(pattern = uni_emoji_pattern, replacement = "", x = text_en)
[1] "Wishing  you cozy autumn days  filled with colorful leaves  "

「絵文字・装飾文字削除」関数作成

#' Strip emoji and decorative symbols from text
#'
#' Deletes every character that falls inside one of the six Unicode ranges
#' listed in the character class below. Neighbouring whitespace is left as is,
#' so an emoji that sat between two spaces leaves a double space behind.
#'
#' @param txt Character vector to clean.
#' @return `txt` with all matching characters removed.
remove_emojis <- function(txt) {
  gsub(
    pattern = "[\U{1F600}-\U{1F64F}\U{1F300}-\U{1F5FF}\U{1F680}-\U{1F6FF}\U{1F1E0}-\U{1F1FF}\U{2600}-\U{26FF}\U{2700}-\U{27BF}]",
    replacement = "",
    x = txt
  )
}

Run the function with text_en

# Clean the English sample.
remove_emojis(txt = text_en)
[1] "Wishing  you cozy autumn days  filled with colorful leaves  "

Run the function with text_ja

# Clean the Japanese sample.
remove_emojis(txt = text_ja)
[1] "色鮮やかな葉  とともに、ほっこりとした秋の日々を  お過ごしください !"

Run the function with text_fa

# Clean the Persian sample.
remove_emojis(txt = text_fa)
[1] "آرزوی  روزهای پاییزی دنج  برای شما، پر از برگ‌های رنگارنگ !"

使用言語の特定: Compact Language Detector v3 (CLD3)

install.packages("cld3")
install.packages("jsonlite")
library(cld3)
library(jsonlite)

Extract ISO 639-1 Language code

# Ask CLD3 which language text_en is written in; the result is a language code
# that is used below as a lookup key into the language-name table.
lang_code <- detect_language(text = text_en)

Extract Language name

# Load the code-to-name table from the local JSON file.
languages_json <- fromJSON(txt = "cldr-localenames-modern/languages.json")

# Look up the English display name that belongs to the detected code.
language_name <-
  languages_json$main$en$localeDisplayNames$languages[[lang_code]]
language_name
[1] "English"

テキスト処理: UDPipe

UDPipe Natural Language Processing

ライブラリの読み込み

library(udpipe)

Annotation with UDPipe

# Annotate the emoji-free English sentence with UDPipe. The second argument is
# the lower-cased display name looked up earlier (e.g. "English" -> "english").
# The original code passed `lang_name`, which is never defined in this
# notebook; the variable that holds the display name is `language_name`.
# NOTE(review): presumably udpipe() accepts this string as a model language
# name -- confirm for languages other than English.
parsed_sentence <- udpipe(remove_emojis(text_en), tolower(language_name))
head(parsed_sentence)

View関数

# Open the annotation result in the interactive data viewer.
View(parsed_sentence)

列名を抽出

# List the column names of the annotation result.
colnames(x = parsed_sentence)
 [1] "doc_id"        "paragraph_id"  "sentence_id"   "sentence"      "start"        
 [6] "end"           "term_id"       "token_id"      "token"         "lemma"        
[11] "upos"          "xpos"          "feats"         "head_token_id" "dep_rel"      
[16] "deps"          "misc"         

Word Frequencies

# Count how often each lemma occurs in the annotated sentence.
freqByUDPipe_lemma <- table(parsed_sentence[["lemma"]])
freqByUDPipe_lemma

  autumn colorful     cozy      day     fill    leave     wish     with      you 
       1        1        1        1        1        1        1        1        1 

Data Formatting

参考資料

Check the class & type of freqByUDPipe_lemma

# Class attribute of the frequency object.
class(x = freqByUDPipe_lemma)
[1] "table"
# Underlying storage type of the frequency object.
typeof(x = freqByUDPipe_lemma)
[1] "integer"

Convert the table to a data frame format

# Turn the frequency table into a data frame. The result has two columns,
# "Var1" (the lemma) and "Freq" (its count).
freqData <- data.frame(freqByUDPipe_lemma)

Check the class & type of freqData

# Class attribute of the converted object.
class(x = freqData)
[1] "data.frame"
# Underlying storage type of the converted object.
typeof(x = freqData)
[1] "list"

Put the first column’s values into the rownames

# Show the column names of the frequency data frame.
colnames(x = freqData)
[1] "Var1" "Freq"
# Use the values of the Var1 column as the row names, then preview the result.
rownames(freqData) <- freqData[["Var1"]]
head(x = freqData)

Delete a specific column

# Drop the first column; drop = FALSE keeps the result a data frame even when
# only one column remains.
freqData <- freqData[, -1, drop = FALSE]
head(x = freqData)

係り受け解析の視覚化

関数ファイルの読み込み

source("func_plot_annotation.R")

視覚化

# Draw the annotation result with the plotting helper.
# NOTE(review): plot_annotation() is presumably defined in the sourced file
# func_plot_annotation.R; the meaning of `size` is not visible here -- confirm.
plot_annotation(parsed_sentence, size = 4)

課題1(締め切り11月5日)

入力文の係り受け解析結果を描画出力する関数を作成してください

  • 条件1: 入力文に絵文字・装飾文字が含まれている場合は、除去する
  • 条件2: 関数の引数は文字列
  • NOTE: 関数の名前は自由に付けてください。課題ができたら、メールで連絡してください。(連絡後、posit上でコードと、実行結果を確認します)

関数の実行例1

# Load the script that is expected to define dep_relation_viz()
# (the function to be written for the assignment).
source("dep_relation_viz.R")

# Example 1: run it on the English sample sentence.
dep_relation_viz(text_en)

関数の実行例2

# Example 2: a French sentence with emoji, passed to the same function.
text_fr <- "Je vous souhaite ✨ des journées d'automne douillettes 🎃 remplies de feuilles colorées 🍁!"
dep_relation_viz(text_fr)

LS0tCnRpdGxlOiAiTGVjMDQ6IOmgu+W6puihqOS9nOaIkO+8iERhdGEgRnJhbWXlnovvvIkiCm91dHB1dDogaHRtbF9ub3RlYm9vawplZGl0b3Jfb3B0aW9uczogCiAgY2h1bmtfb3V0cHV0X3R5cGU6IGlubGluZQotLS0KIyMg5L2c5qWt44OH44Kj44Os44Kv44OI44Oq44Gu6Kit5a6aCmBgYHtyfQpzZXR3ZCgiL2Nsb3VkL3Byb2plY3QiKQpgYGAKCiMg57W15paH5a2X44O76KOF6aO+5paH5a2X44Gu5Yem55CGCiMjIyDlj4LogIPos4fmlpkKLSA8YSBocmVmPSJodHRwczovL3d3dy51bmljb2RlLm9yZy9yZXBvcnRzL3RyNTEvdHI1MS0yNy5odG1sIiB0YXJnZXQ9Il9ibGFuayI+VW5pY29kZSBFbW9qaTwvYT4KCgojIyMg44K144Oz44OX44Or5paHCmBgYHtyfQp0ZXh0X2VuIDwtICJXaXNoaW5nIOKcqCB5b3UgY296eSBhdXR1bW4gZGF5cyDwn46DIGZpbGxlZCB3aXRoIGNvbG9yZnVsIGxlYXZlcyDwn42BICIKdGV4dF9qYSA8LSAi6Imy6a6u44KE44GL44Gq6JGJIPCfjYEg44Go44Go44KC44Gr44CB44G744Gj44GT44KK44Go44GX44Gf56eL44Gu5pel44CF44KSIOKcqCDjgYrpgY7jgZTjgZfjgY/jgaDjgZXjgYTwn46DIO+8gSIKdGV4dF9mYSA8LSAi2KLYsdiy2YjbjCDinKgg2LHZiNiy2YfYp9uMINm+2KfbjNuM2LLbjCDYr9mG2Kwg8J+OgyDYqNix2KfbjCDYtNmF2KfYjCDZvtixINin2LIg2KjYsdqv4oCM2YfYp9uMINix2Ybar9in2LHZhtqvIPCfjYEhIgpgYGAKCiMjIOe1teaWh+Wtl+ODu+ijhemjvuaWh+Wtl+OBruaknOe0ogotIDxhIGhyZWY9Imh0dHBzOi8vd3d3LnJkb2N1bWVudGF0aW9uLm9yZy9wYWNrYWdlcy9iYXNlL3ZlcnNpb25zLzMuNi4yL3RvcGljcy9ncmVwIiB0YXJnZXQ9Il9ibGFuayI+Z3N1YjogUGF0dGVybiBNYXRjaGluZyBhbmQgUmVwbGFjZW1lbnQ8L2E+Ci0gPGEgaHJlZj0iaHR0cHM6Ly93d3cucmRvY3VtZW50YXRpb24ub3JnL3BhY2thZ2VzL2Jhc2UvdmVyc2lvbnMvMy42LjIvdG9waWNzL3JlZ21hdGNoZXMiIHRhcmdldD0iX2JsYW5rIj5yZWdtYXRjaGVzOiBFeHRyYWN0IG9yIFJlcGxhY2UgTWF0Y2hlZCBTdWJzdHJpbmdzPC9hPgoKYGBge3J9CnVuaV9lbW9qaV9wYXR0ZXJuIDwtICJbXFV7MUY2MDB9LVxVezFGNjRGfVxVezFGMzAwfS1cVXsxRjVGRn1cVXsxRjY4MH0tXFV7MUY2RkZ9XFV7MUYxRTB9LVxVezFGMUZGfVxVezI2MDB9LVxVezI2RkZ9XFV7MjcwMH0tXFV7MjdCRn1dIgoKI3JldHVybiB0aGUgZW1vamkgcG9zaXRpb25zIGluIGEgZ2l2ZW4gdGV4dAplbW9qaV9wb3NpdGlvbnMgPC0gZ3JlZ2V4cHIodW5pX2Vtb2ppX3BhdHRlcm4sIHRleHRfZW4pCgojZXh0cmFjdCB0aGUgbWF0Y2hlZCBlbW9qaXMKcmVnbWF0Y2hlcyh0ZXh0X2VuLCBlbW9qaV9wb3NpdGlvbnMpCmBgYAoKIyMg57W15paH5a2X44O76KOF6aO+5paH5a2X44Gu5YmK6ZmkCmBgYHtyfQpnc3ViKHVuaV9lbW9qaV9wYXR0ZXJuLCAiIiwgdGV4dF9lbikKYGBgCgojIyDjgIzntbXmloflrZfjg7voo4Xpo77mloflrZfliYrp
maTjgI3plqLmlbDkvZzmiJAKYGBge3J9CnJlbW92ZV9lbW9qaXMgPC0gZnVuY3Rpb24odHh0KSB7CiAgICBlbW9qaV9wYXR0ZXJuIDwtICJbXFV7MUY2MDB9LVxVezFGNjRGfVxVezFGMzAwfS1cVXsxRjVGRn1cVXsxRjY4MH0tXFV7MUY2RkZ9XFV7MUYxRTB9LVxVezFGMUZGfVxVezI2MDB9LVxVezI2RkZ9XFV7MjcwMH0tXFV7MjdCRn1dIgogICAgZ3N1YihlbW9qaV9wYXR0ZXJuLCAiIiwgdHh0KQp9CmBgYAoKIyMjIFJ1biB0aGUgZnVuY3Rpb24gd2l0aCB0ZXh0X2VuCmBgYHtyfQpyZW1vdmVfZW1vamlzKHRleHRfZW4pCmBgYAojIyMgUnVuIHRoZSBmdW5jdGlvbiB3aXRoIHRleHRfamEKYGBge3J9CnJlbW92ZV9lbW9qaXModGV4dF9qYSkKYGBgCiMjIyBSdW4gdGhlIGZ1bmN0aW9uIHdpdGggdGV4dF9mYQpgYGB7cn0KcmVtb3ZlX2Vtb2ppcyh0ZXh0X2ZhKQpgYGAKCiMg5L2/55So6KiA6Kqe44Gu54m55a6aOiA8YSBocmVmPSJodHRwczovL2dpdGh1Yi5jb20vZ29vZ2xlL2NsZDMiIHRhcmdldD0iX2JsYW5rIj5Db21wYWN0IExhbmd1YWdlIERldGVjdG9yIHYzIChDTEQzKTwvYT4KLSA8YSBocmVmPSJodHRwczovL2RvY3Mucm9wZW5zY2kub3JnL2NsZDMvIiB0YXJnZXQ9Il9ibGFuayI+UiBXcmFwcGVyIGZvciBHb29nbGXigJlzIENvbXBhY3QgTGFuZ3VhZ2UgRGV0ZWN0b3IgMzwvYT4KYGBge3IsIGV2YWw9RkFMU0V9Cmluc3RhbGwucGFja2FnZXMoImNsZDMiKQppbnN0YWxsLnBhY2thZ2VzKCJqc29ubGl0ZSIpCmBgYAoKYGBge3J9CmxpYnJhcnkoY2xkMykKbGlicmFyeShqc29ubGl0ZSkKYGBgCgojIyBFeHRyYWN0IElTTyA2MzktMSBMYW5ndWFnZSBjb2RlCmBgYHtyfQpsYW5nX2NvZGUgPC0gZGV0ZWN0X2xhbmd1YWdlKHRleHRfZW4pCmBgYAoKIyMgRXh0cmFjdCBMYW5ndWFnZSBuYW1lIAotIDxhIGhyZWY9Imh0dHBzOi8vZ2l0aHViLmNvbS91bmljb2RlLWNsZHIvY2xkci1sb2NhbGVuYW1lcy1tb2Rlcm4vYmxvYi9tYXN0ZXIvbWFpbi9lbi9sYW5ndWFnZXMuanNvbiIgdGFyZ2V0PSJfYmxhbmsiPiJsYW5ndWFnZXMuanNvbiI8L2E+IGZpbGUKYGBge3J9Cmxhbmd1YWdlc19qc29uIDwtIGZyb21KU09OKCJjbGRyLWxvY2FsZW5hbWVzLW1vZGVybi9sYW5ndWFnZXMuanNvbiIpCgojTWFwIGNvZGVzIHRvIG5hbWVzIHVzaW5nIHRoZSBKU09OIGRhdGEKbGFuZ3VhZ2VfbmFtZSA8LSBsYW5ndWFnZXNfanNvbiRtYWluJGVuJGxvY2FsZURpc3BsYXlOYW1lcyRsYW5ndWFnZXNbW2xhbmdfY29kZV1dCmxhbmd1YWdlX25hbWUKYGBgCiMjIOODhuOCreOCueODiOWHpueQhjogVURQaXBlCiMjIyDlj4LogIPos4fmlpkKLSA8YSBocmVmPSJodHRwczovL2xpbmRhdC5tZmYuY3VuaS5jei9zZXJ2aWNlcy91ZHBpcGUvIiB0YXJnZXQ9Il9ibGFuayI+TElOREFUIFJFU1Qgc2VydmljZTwvYT4KLSA8YSBocmVmPSJodHRwczovL3VmYWwubWZmLmN1bmkuY3ovdWRwaXBlIiB0YXJnZXQ9Il9ibGFuayI+VURQaXBlIFZlcnNpb25zPC9hPgotIDxh
IGhyZWY9Imh0dHBzOi8vdW5pdmVyc2FsZGVwZW5kZW5jaWVzLm9yZy8iIHRhcmdldD0iX2JsYW5rIj5Vbml2ZXJzYWwgRGVwZW5kZW5jaWVzPC9hPgoKIyA8YSBocmVmPSJodHRwczovL2Jub3NhYy5naXRodWIuaW8vdWRwaXBlL2VuL2luZGV4Lmh0bWwiIHRhcmdldD0iX2JsYW5rIj5VRFBpcGUgTmF0dXJhbCBMYW5ndWFnZSBQcm9jZXNzaW5nPC9hPgoKIyMjIOODqeOCpOODluODqeODquOBruiqreOBv+i+vOOBvwpgYGB7cn0KbGlicmFyeSh1ZHBpcGUpCmBgYAoKIyMjIEFubm90YXRpb24gd2l0aCBVRFBpcGUKYGBge3J9CnBhcnNlZF9zZW50ZW5jZSA8LSB1ZHBpcGUocmVtb3ZlX2Vtb2ppcyh0ZXh0X2VuKSwgdG9sb3dlcihsYW5nX25hbWUpKQpoZWFkKHBhcnNlZF9zZW50ZW5jZSkKYGBgCiMjIyBWaWV36Zai5pWwCmBgYHtyLCBldmFsID0gRkFMU0V9ClZpZXcocGFyc2VkX3NlbnRlbmNlKQpgYGAKCiMjIyDliJflkI3jgpLmir3lh7oKYGBge3J9CmNvbG5hbWVzKHBhcnNlZF9zZW50ZW5jZSkKYGBgCgojIyMgaGVhZApgYGB7cn0KaGVhZChwYXJzZWRfc2VudGVuY2VbYygidG9rZW5faWQiLCAidG9rZW4iLCAiaGVhZF90b2tlbl9pZCIpXSkKYGBgCgojIyBXb3JkIEZyZWNpZW5jaWVzCmBgYHtyfQpmcmVxQnlVRFBpcGVfbGVtbWE8LXRhYmxlKHBhcnNlZF9zZW50ZW5jZSRsZW1tYSkKZnJlcUJ5VURQaXBlX2xlbW1hCmBgYAojIyBEYXRhIEZvcm1hdHRpbmcKIyMjIOWPguiAg+izh+aWmQotIDxhIGhyZWY9Imh0dHBzOi8vaHRzdWRhLm5ldC9zdGF0cy9kYXRhLWJhc2ljcy5odG1sI2RhdGEtYmFzaWNzLWRhdGEtc3RydWN0dXJlIiB0YXJnZXQ9Il9ibGFuayI+UuOBruODh+ODvOOCv+ani+mAoDwvYT4KCiMjIyBDaGVjayB0aGUgY2xhc3MgJiB0eXBlIG9mIGZyZXFCeVVEUGlwZV9sZW1tYQpgYGB7cn0KY2xhc3MoZnJlcUJ5VURQaXBlX2xlbW1hKQp0eXBlb2YoZnJlcUJ5VURQaXBlX2xlbW1hKQpgYGAKIyMjIENvbnZlcnQgdGhlIHRhYmxlIHRvIGEgZGF0YSBmcmFtZSBmb3JtYXQKYGBge3J9CmZyZXFEYXRhIDwtIGRhdGEuZnJhbWUoZnJlcUJ5VURQaXBlX2xlbW1hKQpgYGAKIyMjIENoZWNrIHRoZSBjbGFzcyAmIHR5cGUgb2YgZnJlcURhdGEKYGBge3J9CmNsYXNzKGZyZXFEYXRhKQp0eXBlb2YoZnJlcURhdGEpCmBgYAoKIyMgUHV0IHRoZSBmaXJzdCBjb2x1bW4ncyB2YWx1ZXMgaW50byB0aGUgcm93bmFtZXMKYGBge3J9CmNvbG5hbWVzKGZyZXFEYXRhKQpyb3duYW1lcyhmcmVxRGF0YSkgPC0gZnJlcURhdGEkVmFyMQpoZWFkKGZyZXFEYXRhKQpgYGAKCiMjIERlbGV0ZSBhIHNwZWNpZmljIGNvbHVtbgpgYGB7cn0KZnJlcURhdGEgPC0gZnJlcURhdGFbLTFdCmhlYWQoZnJlcURhdGEpCmBgYAoKIyMg5L+C5Y+X44GR6Kej5p6Q44Gu6KaW6Kaa5YyWCi0gcmVmLiA8YSBocmVmPSJodHRwczovL3d3dy5yLWJsb2dnZXJzLmNvbS8yMDE5LzA3L2RlcGVuZGVuY3ktcGFyc2luZy13aXRoLXVkcGlwZS8iIHRhcmdldD0iX2JsYW5rIj5kZXBl
bmRlbmN5IHBhcnNpbmcgd2l0aCB1ZHBpcGU8L2E+CgotIDxhIGhyZWY9Imh0dHBzOi8vZ2l0aHViLmNvbS9ibm9zYWMvdGV4dHBsb3QiIHRhcmdldD0iX2JsYW5rIj50ZXh0cGxvdDwvYT4KCiMjIyDplqLmlbDjg5XjgqHjgqTjg6vjga7oqq3jgb/ovrzjgb8KYGBge3J9CnNvdXJjZSgiZnVuY19wbG90X2Fubm90YXRpb24uUiIpCmBgYAoKIyMg6KaW6Kaa5YyWCmBgYHtyfQpwbG90X2Fubm90YXRpb24ocGFyc2VkX3NlbnRlbmNlLCBzaXplID0gNCkKYGBgCgojIOiqsumhjO+8ke+8iOe3oOOCgeWIh+OCijEx5pyINeaXpe+8iQojIyDlhaXlipvmlofjga7kv4Ljgorlj5fjgZHop6PmnpDntZDmnpzjgpLmj4/nlLvlh7rlipvjgZnjgovplqLmlbDjgpLkvZzmiJDjgZfjgabjgY/jgaDjgZXjgYQKLSDmnaHku7YxOiDlhaXlipvmlofjgavntbXjg7voo4Xpo77mloflrZfjgYzlkKvjgb7jgozjgabjgYTjgovloLTlkIjjga/jgIHpmaTljrvjgZnjgosKLSDmnaHku7YyOiDplqLmlbDjga7lvJXmlbDjga/mloflrZfliJcKLSBOT1RFOiDplqLmlbDjga7lkI3liY3jga/oh6rnlLHjgavku5jjgZHjgabjgY/jgaDjgZXjgYTjgILoqrLpoYzjgYzjgafjgY3jgZ/jgonjgIHjg6Hjg7zjg6vjgafpgKPntaHjgZfjgabjgY/jgaDjgZXjgYTjgILvvIjpgKPntaHlvozjgIFwb3NpdOS4iuOBp+OCs+ODvOODieOBqOOAgeWun+ihjOe1kOaenOOCkueiuuiqjeOBl+OBvuOBme+8iQoKIyMg6Zai5pWw44Gu5a6f6KGM5L6LMQpgYGB7cn0Kc291cmNlKCJkZXBfcmVsYXRpb25fdml6LlIiKQoKZGVwX3JlbGF0aW9uX3Zpeih0ZXh0X2VuKQpgYGAKIyMg6Zai5pWw44Gu5a6f6KGM5L6LMgpgYGB7cn0KdGV4dF9mciA8LSAiSmUgdm91cyBzb3VoYWl0ZSDinKggZGVzIGpvdXJuw6llcyBkJ2F1dG9tbmUgZG91aWxsZXR0ZXMg8J+OgyByZW1wbGllcyBkZSBmZXVpbGxlcyBjb2xvcsOpZXMg8J+NgSEiCmRlcF9yZWxhdGlvbl92aXoodGV4dF9mcikKYGBgCgoK