Text Processing

s = 'I love this book'
strsplit(s, ' ')
## [[1]]
## [1] "I"    "love" "this" "book"
s2 = '我喜歡這本書'
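With split = '', strsplit() breaks a string into single characters, which is the usual first step for Chinese text; a minimal check:

strsplit(s2, '')
## [[1]]
## [1] "我" "喜" "歡" "這" "本" "書"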

Bi-Gram

library(NLP)
s <- strsplit(x="那我們酸民婉君也可以報名嗎", split ='')
bigram <-ngrams(unlist(s), 2)
vapply(bigram, paste, "", collapse = "")
##  [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
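ngrams() from the NLP package returns a list of character vectors, one per n-gram; vapply() then pastes each back into a single string:

bigram[[1]]
## [1] "那" "我"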

Tri-Gram

s <- strsplit(x="那我們酸民婉君也可以報名嗎", split ='')
trigram <-ngrams(unlist(s), 3)
vapply(trigram, paste, "", collapse = "")
##  [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
##  [8] "也可以" "可以報" "以報名" "報名嗎"

Extracting meaningful terms from frequency statistics

article <- "身兼中華職棒聯盟會長的國民黨立委吳志揚今天透露,台灣積極爭取的2017世界棒球經典賽分區預賽主辦權,因為大巨蛋遲遲無法孵出來,確定被競爭對手韓國拿走。吳志揚批評,當初中央政府拿台北市的精華地跟北市府交換,就是希望在大巨蛋現址成立體育園區,就如果北市府要改變使用目的,教育部都漠不關心,部長跟體育署長乾脆下台。"
w <- strsplit(x=article, split ='')
bigram <-ngrams(unlist(w), 2)
bigram.str <- vapply(bigram, paste, "", collapse = "")
tb <- table(bigram.str)
tb[tb>=2]
## bigram.str
## ,就 大巨 北市 巨蛋 市府 吳志 志揚 體育 
##    2    2    3    2    2    2    2    2

N-grams after sentence segmentation

The table above includes ",就", a bigram spanning a comma; splitting the article into clauses first keeps n-grams from crossing punctuation.

strsplit(article, "、|,|。")
## [[1]]
##  [1] "身兼中華職棒聯盟會長的國民黨立委吳志揚今天透露"
##  [2] "台灣積極爭取的2017世界棒球經典賽分區預賽主辦權"
##  [3] "因為大巨蛋遲遲無法孵出來"                      
##  [4] "確定被競爭對手韓國拿走"                        
##  [5] "吳志揚批評"                                    
##  [6] "當初中央政府拿台北市的精華地跟北市府交換"      
##  [7] "就是希望在大巨蛋現址成立體育園區"              
##  [8] "就如果北市府要改變使用目的"                    
##  [9] "教育部都漠不關心"                              
## [10] "部長跟體育署長乾脆下台"
a.split <- strsplit(article, "、|,|。")
w.split <- strsplit(x=unlist(a.split), split ='')


# NOTE: despite its name, this helper computes trigrams (n = 3);
# the frequent terms it surfaces below are 3-character names and places
bigram <- function(w){
  bigram <- ngrams(unlist(w), 3)
  bigram.str <- vapply(bigram, paste, "", collapse = "")
  bigram.str
}

bigram.all <- sapply(w.split, bigram)
tb <- table(unlist(bigram.all))
tb[tb>=2]
## 
## 大巨蛋 北市府 吳志揚 
##      2      2      2
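For true bigrams the same pipeline applies with n = 2; a minimal sketch (counts and threshold will differ from the trigram run above):

bigram2 <- function(w) vapply(ngrams(unlist(w), 2), paste, "", collapse = "")
tb2 <- table(unlist(sapply(w.split, bigram2)))
tb2[tb2 >= 2]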

Removing keywords

s = "當初中央政府拿台北市的精華地跟北市府交換" 
s.split = strsplit(s, '台北市')
paste(unlist(s.split), collapse = "", sep="")
## [1] "當初中央政府拿的精華地跟北市府交換"

Building a keyword-removal function

keywords = c('台北市', '中央')
for (key in keywords){
  print(key)
}
## [1] "台北市"
## [1] "中央"
removekey <- function(s, keys){
  for (key in keys){
    # split on the keyword and re-paste the pieces, deleting every occurrence
    s.split = strsplit(s, key)
    s = paste(unlist(s.split), collapse = "")
  }
  s
}

removekey("當初中央政府拿台北市的精華地跟北市府交換", c("台北市", "中央"))
## [1] "當初政府拿的精華地跟北市府交換"

Building an n-gram tokenization function

ngram.func <- function(w, n){
  n.gram <- ngrams(unlist(w), n)                          # n-grams over single characters
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")  # paste each n-gram back into a string
  n.gram.str
}
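A quick sanity check of the helper:

ngram.func(strsplit("那我們酸民", split = '')[[1]], 2)
## [1] "那我" "我們" "們酸" "酸民"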

Implementing longest-term-first segmentation

article = '陳冠希日前開砲怒罵林志玲「婊子」引發不少風波,今(29日)陳冠希現身透露曾3度找林志玲方面談話,有意私下解決,沒想到對方冷處理,才逼他出面開砲,稍早林志玲其中一位經紀人閻柔怡表示,陳冠希一個月前的確透過朋友傳話,但只有兩次,內容則是:「《我的新衣》是不是林志玲不讓秦舒培參加?」

據《蘋果日報》報導,閻柔怡稍早證實一個月前曾接到兩方中間朋友來電,陳冠希要他帶話問:「《我的新衣》是不是林志玲不讓秦舒培參加?」但閻柔怡表示林志玲僅是受邀參加的嘉賓,認為其中出了誤會,第二天同位友人又再打來,問一樣的問題,閻柔怡仍則給出一樣的答覆。

閻柔怡說,當時不感覺對方有怒氣,所以以為事情應該就這樣告一段落,對於陳冠希今透露早告知對方:「我是瘋子,我會罵你啊!」閻柔怡則說沒聽到這句話,而該節目製作人曹青稍早改口表示,劉雯、何穗、秦舒培、孫菲菲等人確實一開始在同一類別「高冷超模女神」的嘉賓候選人中,但節目組最後選擇何穗,因此嚴格說起來擠掉秦舒培的非林志玲,而是何穗。'
longTermFirst <- function(article, keywords){
  # longest terms first: scan 4-grams, then 3-grams, then 2-grams
  for(i in seq(4, 2, -1)){
    # strip terms already found so shorter n-grams do not re-count them
    article = removekey(article, keywords)
    a.split <- strsplit(article, "、|,|。")
    w.split <- strsplit(x = unlist(a.split), split = '')
    n.gram.all <- sapply(w.split, function(e) ngram.func(e, i))

    tb <- table(unlist(n.gram.all))
    # n-grams appearing at least 5 times become keywords
    candidate <- names(tb[tb >= 5])
    keywords = c(keywords, candidate)
  }
  keywords
}

keywords = c()
longTermFirst(article, keywords)
## [1] "林志玲" "陳冠希" "閻柔怡"

Getting started with jiebaR

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
segmenter = worker()
segmenter <= article   # jiebaR's `<=` operator segments the text, same as segment(article, segmenter)
##   [1] "陳冠希"   "日前"     "開"       "砲"       "怒罵"     "林志玲"  
##   [7] "婊子"     "引發"     "不少"     "風波"     "今"       "29"      
##  [13] "日"       "陳冠希"   "現身"     "透露"     "曾"       "3"       
##  [19] "度"       "找"       "林志玲"   "方面"     "談話"     "有意"    
##  [25] "私下"     "解決"     "沒想到"   "對方"     "冷處理"   "才"      
##  [31] "逼"       "他"       "出面"     "開"       "砲"       "稍早"    
##  [37] "林志玲"   "其中"     "一位"     "經紀人"   "閻柔怡"   "表示"    
##  [43] "陳冠希"   "一個月"   "前"       "的確"     "透過"     "朋友"    
##  [49] "傳話"     "但"       "只有"     "兩次"     "內容"     "則是"    
##  [55] "我"       "的"       "新衣"     "是不是"   "林志玲"   "不讓"    
##  [61] "秦舒培"   "參加"     "據"       "蘋果日報" "報導"     "閻柔怡"  
##  [67] "稍早"     "證實"     "一個月"   "前"       "曾"       "接到"    
##  [73] "兩方"     "中間"     "朋友"     "來電"     "陳冠希"   "要"      
##  [79] "他"       "帶話問"   "我"       "的"       "新衣"     "是不是"  
##  [85] "林志玲"   "不讓"     "秦舒培"   "參加"     "但"       "閻"      
##  [91] "柔"       "怡"       "表示"     "林志玲"   "僅是"     "受邀"    
##  [97] "參加"     "的"       "嘉賓"     "認為"     "其中"     "出"      
## [103] "了"       "誤會"     "第二天"   "同位"     "友人"     "又"      
## [109] "再"       "打來"     "問"       "一樣"     "的"       "問題"    
## [115] "閻柔怡"   "仍則"     "給出"     "一樣"     "的"       "答覆"    
## [121] "閻柔怡"   "說"       "當時"     "不"       "感覺"     "對方"    
## [127] "有"       "怒氣"     "所以"     "以為"     "事情"     "應該"    
## [133] "就"       "這樣"     "告一段落" "對於"     "陳冠希"   "今"      
## [139] "透露"     "早"       "告知"     "對方"     "我"       "是"      
## [145] "瘋子"     "我會"     "罵"       "你啊"     "閻"       "柔"      
## [151] "怡"       "則"       "說"       "沒"       "聽到"     "這句"    
## [157] "話"       "而"       "該"       "節目"     "製作"     "人"      
## [163] "曹青"     "稍早"     "改口"     "表示"     "劉雯"     "何"      
## [169] "穗"       "秦"       "舒"       "培"       "孫菲菲"   "等"      
## [175] "人"       "確實"     "一"       "開始"     "在"       "同一"    
## [181] "類別"     "高"       "冷"       "超模"     "女神"     "的"      
## [187] "嘉賓"     "候選人"   "中"       "但"       "節目組"   "最後"    
## [193] "選擇"     "何穗"     "因此"     "嚴格"     "說"       "起來"    
## [199] "擠掉"     "秦舒培"   "的"       "非"       "林志玲"   "而是"    
## [205] "何穗"

Jieba word segmentation

s="那我們酸民婉君也可以報名嗎"
mixseg = worker()
segment(code= s , jiebar = mixseg)
## [1] "那"   "我們" "酸民" "婉君" "也"   "可以" "報名" "嗎"

Editing the user-defined dictionary

# edit_dict()        # opens the user-defined dictionary file for editing
# jiebaR::USERPATH   # path of the default user dictionary
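Words can also be added at runtime instead of editing the dictionary file; a sketch using jiebaR's new_user_word() (the added term is purely illustrative):

seg <- worker()
new_user_word(seg, "酸民婉君")   # register a custom term for this session only
segment("那我們酸民婉君也可以報名嗎", seg)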

Extracting part-of-speech tags

tagseg = worker('tag')
segment(s, tagseg)
##      r      r      n      x      d      c      v      y 
##   "那" "我們" "酸民" "婉君"   "也" "可以" "報名"   "嗎"

Expanding the dictionary with 自由時報 (Liberty Times) keywords

library(rvest)
## Warning: package 'rvest' was built under R version 3.2.5
## Loading required package: xml2
read_html('http://news.ltn.com.tw/news/politics/breakingnews/1779370') %>%
  html_nodes('.con_keyword a') %>%
  html_text() %>%
  iconv(from = 'UTF-8', to = 'UTF-8')   # effectively a no-op for valid UTF-8, kept as an encoding safeguard
## [1] "國民黨" "太平島" "李登輝" "蔡英文"

Computing keywords

jiebaR's 'keywords' worker ranks candidate terms by TF-IDF; the numbers printed below are the scores.

s="那我們酸民婉君也可以報名嗎"
key = worker('keywords', topn = 3)
key <= s
## 11.7392 11.7392 11.7392 
##  "報名"  "婉君"  "我們"

Computing TF-IDF

a <- c("a")
abb <- c("a", "b", "b")
abc <- c("a", "b", "c")
D <- list(a, abb, abc)
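
Here tf(t, d) is the relative frequency of t in d, and idf(t, D) = log(|D| / |{d ∈ D : t ∈ d}|); tfidf() below computes their product: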

tfidf <- function(t, d, D){
 tf <- table(d)[names(table(d)) == t] / sum(table(d))           # term frequency of t in d
 idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))   # log(N / document frequency)
 tf * idf
}

tfidf('a',a,D)
## a 
## 0
# manual check: 'a' occurs in every document, so idf1 = log(3/3) = 0 and the score is 0
tf1 = table(a)[names(table(a)) == 'a'] / sum(table(a))
idf1 = log(length(D) / sum(sapply(D, function(e) 'a' %in% e)))


tfidf('b',abb,D)
##         b 
## 0.2703101
tf2 = table(abb)[names(table(abb)) == 'b'] / sum(table(abb))
idf2 = log(length(D) / sum(sapply(D, function(e) 'b' %in% e)))
tf2 * idf2
##         b 
## 0.2703101
tfidf('b',abc,D)
##        b 
## 0.135155
tfidf('c',abc,D)
##         c 
## 0.3662041
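As a manual check for 'c': tf = 1/3 within abc and idf = log(3/1) ≈ 1.0986, so the score is 1/3 × 1.0986 ≈ 0.3662; likewise 'b' in abc scores 1/3 × log(3/2) ≈ 0.1352. The remainder of the section applies jiebaR segmentation to a real news corpus (applenews) and draws a word cloud.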
library(jiebaR)
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')
str(applenews)
## 'data.frame':    1500 obs. of  5 variables:
##  $ content : chr  "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮刮樂彩券,上月佯"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出後,這名博士生立"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房市大追擊,本集節"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,他今天將前往在菲"| __truncated__ ...
##  $ title   : chr  "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演  美防長南海登艦" ...
##  $ dt      : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
##  $ category: chr  "社會" "國際" "地產" "國際" ...
##  $ view_cnt: chr  "1754" "0" "0" "0" ...
mixseg = worker()
content <- applenews$content[grepl('肯亞', applenews$content)]   # articles mentioning 肯亞 (Kenya)
seg.str <- segment(code = content, mixseg)
seg.tb <- table(seg.str)
seg.tb <- seg.tb[nchar(names(seg.tb)) >= 2]     # keep tokens of at least two characters
tb <- sort(seg.tb, decreasing = TRUE)[1:100]    # top 100 most frequent tokens

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.5
## Loading required package: RColorBrewer
wordcloud(names(tb), tb, min.freq = 1, random.order = F, ordered.colors = F, colors = rainbow(3))

s = "大巨蛋案對市府同仁下封口令?柯P否認"
mixseg = worker()
segment(code= s , jiebar = mixseg)
## [1] "大巨蛋" "案"     "對"     "市府"   "同仁"   "下"     "封口令" "柯P"   
## [9] "否認"
library(tm)
## Warning: package 'tm' was built under R version 3.2.5
e3 = 'Hello, I am David. I have taken over 100 courses ~~~' 
e3.vec = strsplit(e3, ' ')
e3.corpus = Corpus(VectorSource(e3.vec))
e3.dtm = DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs ~~~ 100 courses david. have hello, over taken
##    1   1   1       1      1    1      1    1     1
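The default tokenizer keeps punctuation attached ("david.", "hello,") and retains the number 100, so both are stripped with tm_map():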
doc = tm_map(e3.corpus, removeNumbers)
doc = tm_map(doc, removePunctuation)
dtm = DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs courses david have hello over taken
##    1       1     1    1     1    1     1
# a custom transformation that strips tilde characters
removetilde <- content_transformer(function(x) gsub("~", "", x))
doc = tm_map(e3.corpus, removetilde)
dtm = DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs 100 courses david. have hello, over taken
##    1   1       1      1    1      1    1     1

Building a document-term matrix

e1 = 'this is a book'
e2 = 'this is my car'
e1.vec = strsplit(e1, ' ')[[1]]
e2.vec = strsplit(e2, ' ')[[1]]
e.vec = list(e1.vec, e2.vec)
e.corpus = Corpus(VectorSource(e.vec))
e.dtm = DocumentTermMatrix(e.corpus)
inspect(e.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity           : 33%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## 
##     Terms
## Docs book car this
##    1    1   0    1
##    2    0   1    1
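Note that "is", "a", and "my" are absent: DocumentTermMatrix() drops terms shorter than three characters by default (wordLengths = c(3, Inf)); pass control = list(wordLengths = c(1, Inf)) to keep them.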