作業三

資料前處理

# Fetch the course data set into the working directory.
# mode = "wb" forces a binary transfer: .RData is a binary format, and a
# text-mode download on Windows can corrupt the file.
download.file(
  "https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/kcfor2016.RData",
  destfile = "kcfor2016.RData",
  mode = "wb"
)

# Show the working directory, i.e. where the relative destfile was written.
getwd()
## [1] "D:/OS DATA/Documents"
# Restore the objects saved in the file, then inspect the kcfor2016 data frame.
load("kcfor2016.RData")
str(kcfor2016)
## 'data.frame':    336 obs. of  3 variables:
##  $ id          : Factor w/ 337 levels "???剖振??閮游??????虫犖蝡銝\u0080韏瘀??寥銝蝢拍?瘜?嚗?鈭箇?脯\u0080?霈???摰嗡犖?末?亙?嚗瘥?璅詨祕???犖?\u"| __truncated__,..: 337 336 335 334 333 332 331 330 329 328 ...
##  $ created_time: Factor w/ 337 levels "","2015-08-07T11:36:30+0000",..: 337 336 335 334 333 332 331 330 329 328 ...
##  $ message     : Factor w/ 358 levels "","*緊急動員* (轉) 國昌黃 明天早上馬英九要去立法院報告馬習會,時代力量將赴立法院抗議,請大家加入我們,09:00立法院門口集合!",..: 72 59 104 216 154 110 191 205 47 153 ...
# The message column is loaded as a factor (see the str() output above);
# convert it to plain character strings before any text processing.
kcfor2016[["message"]] <- as.character(kcfor2016[["message"]])

使用Jieba 斷詞

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5

# Edit the user dictionary BEFORE creating the worker: a worker reads its
# dictionaries when it is constructed, so edits made after worker() (or after
# segmentation has already run) would not affect message.seg.
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows

# Create the segmentation worker with default settings.
mixseg <- worker()

# Segment every message; the result is a list with one character vector of
# tokens per message.
message.seg <- lapply(
  kcfor2016$message,
  function(m) segment(code = m, jiebar = mixseg)
)

# Print the tokens of the first message as a sanity check.
message.seg[[1]]
##  [1] "宏國"     "大鎮"     "母親節"   "特別"     "活動"     "片名"    
##  [7] "不老騎士" "歐兜"     "邁環台"   "日記"     "時間"     "5"       
## [13] "月"       "7"        "日"       "六"       "下午"     "2"       
## [19] "點"       "30"       "分"       "4"        "點"       "25"      
## [25] "分"       "地點"     "新"       "北"       "市"       "汐止"    
## [31] "區"       "大同路"   "二段"     "285"      "號"       "活動中心"
## [37] "流程"     "14"       "30"       "14"       "35"       "主辦單位"
## [43] "代表"     "致詞"     "14"       "35"       "16"       "05"      
## [49] "紀錄片"   "放映"     "16"       "05"       "16"       "25"      
## [55] "觀眾"     "分享"     "提問"     "弘道"     "基金會"   "宣傳"    
## [61] "不老"     "夢想"     "聯合"     "招募"     "16"       "25"      
## [67] "之後"     "為"       "宏國"     "住戶"     "社區"     "卡拉"    
## [73] "OK"       "歡唱"     "活動"     "主辦單位" "宏國"     "大鎮"    
## [79] "社區"     "管理"     "委員會"   "立法委員" "黃國昌"   "汐止"    
## [85] "服務處"   "公益"     "夥伴"     "弘道"     "老人"     "福利"    
## [91] "基金會"

繪製文字雲

# Flatten the per-message token lists into one vector and count each token.
words <- unlist(message.seg)
word.tb <- table(words)
# Keep tokens that are at least two characters long and occur at least 20 times.
word.tb <- word.tb[nchar(names(word.tb)) >= 2 & word.tb >= 20]

# Keep only tokens containing at least one character in [\u4e00-\u9fa5].
# str_detect() returns a plain logical vector, which is the right shape for
# indexing (str_match() returns a match matrix instead).
library(stringr)
word.tb <- word.tb[str_detect(names(word.tb), "[\u4e00-\u9fa5]+")]
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.2.5
# Draw the word cloud from the filtered frequency table.
wordcloud2(as.table(word.tb))

str_match 與中文比對

UTF-8 是 Unicode 的一种实现(编码)方式,对字节结构有特殊要求;而正则表达式中的 \u4e00-\u9fa5 指的是 Unicode 码位(code point)范围,与 UTF-8 的字节无关。所以我们说常用汉字的码位范围是 0x4E00 到 0x9FA5

# Demo: match the pattern [\u4e00-\u9fa5]+ against a short Chinese string.
str_match(string = "文字雲", pattern = "[\u4e00-\u9fa5]+")