作業三
資料前處理
# Fetch kcfor2016.RData from GitHub into the current working directory.
# mode = "wb" requests a binary transfer: .RData is a binary format, and a
# text-mode download on Windows may rewrite line endings and corrupt the file.
download.file(
  "https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/kcfor2016.RData",
  destfile = "kcfor2016.RData",
  mode = "wb"
)
# Print the working directory, i.e. where the relative destfile was written.
getwd()
## [1] "D:/OS DATA/Documents"
# Restore the objects saved in the downloaded file, then print a compact
# summary of the structure of `kcfor2016`.
load(file = "kcfor2016.RData")
str(object = kcfor2016)
## 'data.frame': 336 obs. of 3 variables:
## $ id : Factor w/ 337 levels "???剖振??閮游??????虫犖蝡銝\u0080韏瘀??寥銝蝢拍?瘜?嚗?鈭箇?脯\u0080?霈???摰嗡犖?末?亙?嚗瘥?璅詨祕???犖?\u"| __truncated__,..: 337 336 335 334 333 332 331 330 329 328 ...
## $ created_time: Factor w/ 337 levels "","2015-08-07T11:36:30+0000",..: 337 336 335 334 333 332 331 330 329 328 ...
## $ message : Factor w/ 358 levels "","*緊急動員* (轉) 國昌黃 明天早上馬英九要去立法院報告馬習會,時代力量將赴立法院抗議,請大家加入我們,09:00立法院門口集合!",..: 72 59 104 216 154 110 191 205 47 153 ...
# Replace the `message` column with its character representation.
kcfor2016[["message"]] <- as.character(kcfor2016[["message"]])
使用Jieba 斷詞
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
# Create a jiebaR worker with default settings.
mixseg <- worker()
# Segment every message with that worker; the result is a list holding one
# vector of tokens per message.
message.seg <- lapply(kcfor2016$message, segment, jiebar = mixseg)
# NOTE(review): edit_dict() is called after the segmentation above, so any
# dictionary changes made here presumably do not affect `message.seg` --
# confirm the intended order.
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
# Print the tokens of the first message as a quick sanity check.
message.seg[[1L]]
## [1] "宏國" "大鎮" "母親節" "特別" "活動" "片名"
## [7] "不老騎士" "歐兜" "邁環台" "日記" "時間" "5"
## [13] "月" "7" "日" "六" "下午" "2"
## [19] "點" "30" "分" "4" "點" "25"
## [25] "分" "地點" "新" "北" "市" "汐止"
## [31] "區" "大同路" "二段" "285" "號" "活動中心"
## [37] "流程" "14" "30" "14" "35" "主辦單位"
## [43] "代表" "致詞" "14" "35" "16" "05"
## [49] "紀錄片" "放映" "16" "05" "16" "25"
## [55] "觀眾" "分享" "提問" "弘道" "基金會" "宣傳"
## [61] "不老" "夢想" "聯合" "招募" "16" "25"
## [67] "之後" "為" "宏國" "住戶" "社區" "卡拉"
## [73] "OK" "歡唱" "活動" "主辦單位" "宏國" "大鎮"
## [79] "社區" "管理" "委員會" "立法委員" "黃國昌" "汐止"
## [85] "服務處" "公益" "夥伴" "弘道" "老人" "福利"
## [91] "基金會"
繪製文字雲
# Flatten the per-message token lists into a single character vector.
words <- unlist(message.seg)
# Count how often each distinct token occurs.
word.tb <- table(words)
# Keep tokens that are at least two characters long and occur at least 20 times.
is_long_enough <- nchar(names(word.tb)) >= 2
is_frequent <- word.tb >= 20
word.tb <- word.tb[is_long_enough & is_frequent]
# Keep only the entries whose name contains at least one character in the
# range \u4e00-\u9fa5 (Chinese characters).
library(stringr)
# str_detect() returns a plain logical vector with one element per name, so it
# can be used directly as a mask instead of testing a str_match() matrix for NA.
word.tb <- word.tb[str_detect(names(word.tb), '[\u4e00-\u9fa5]+')]
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.2.5
# Render the filtered frequency table as a word cloud.
wordcloud2(data = as.table(word.tb))
str_match 與中文比對
UTF-8 是 Unicode 的一種實作(編碼)方式,其位元組結構有特殊要求;而漢字在 Unicode 中的碼位(code point)範圍是 0x4E00 到 0x9FA5(基本 CJK 統一表意文字),所以正規表示式可以用 [\u4e00-\u9fa5] 來比對中文字。
# Demonstration: the character class matches the whole Chinese string.
str_match(string = "文字雲", pattern = "[\u4e00-\u9fa5]+")