Text Processing
s = 'I love this book'
strsplit(s, ' ')
## [[1]]
## [1] "I" "love" "this" "book"
s2 = '我喜歡這本書'
strsplit(s2, '')
## [[1]]
## [1] "我" "喜" "歡" "這" "本" "書"
Bi-Gram
library(NLP)
s <- strsplit(x="那我們酸民婉君也可以報名嗎", split ='')
bigram <-ngrams(unlist(s), 2)
vapply(bigram, paste, "", collapse = "")
## [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
Tri-Gram
s <- strsplit(x="那我們酸民婉君也可以報名嗎", split ='')
trigram <-ngrams(unlist(s), 3)
vapply(trigram, paste, "", collapse = "")
## [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
## [8] "也可以" "可以報" "以報名" "報名嗎"
Extracting Meaningful Terms from Frequency Counts
article <- "身兼中華職棒聯盟會長的國民黨立委吳志揚今天透露,台灣積極爭取的2017世界棒球經典賽分區預賽主辦權,因為大巨蛋遲遲無法孵出來,確定被競爭對手韓國拿走。吳志揚批評,當初中央政府拿台北市的精華地跟北市府交換,就是希望在大巨蛋現址成立體育園區,就如果北市府要改變使用目的,教育部都漠不關心,部長跟體育署長乾脆下台。"
w <- strsplit(x=article, split ='')
bigram <-ngrams(unlist(w), 2)
bigram.str <- vapply(bigram, paste, "", collapse = "")
tb <- table(bigram.str)
tb[tb>=2]
## bigram.str
## ,就 大巨 北市 巨蛋 市府 吳志 志揚 體育
## 2 2 3 2 2 2 2 2
Tri-Grams After Sentence Splitting
Splitting the article on punctuation first keeps n-grams (such as ",就" above) from spanning sentence boundaries.
strsplit(article, "、|,|。")
## [[1]]
## [1] "身兼中華職棒聯盟會長的國民黨立委吳志揚今天透露"
## [2] "台灣積極爭取的2017世界棒球經典賽分區預賽主辦權"
## [3] "因為大巨蛋遲遲無法孵出來"
## [4] "確定被競爭對手韓國拿走"
## [5] "吳志揚批評"
## [6] "當初中央政府拿台北市的精華地跟北市府交換"
## [7] "就是希望在大巨蛋現址成立體育園區"
## [8] "就如果北市府要改變使用目的"
## [9] "教育部都漠不關心"
## [10] "部長跟體育署長乾脆下台"
a.split <- strsplit(article, "、|,|。")
w.split <- strsplit(x=unlist(a.split), split ='')
trigram <- function(w){
  # paste each character tri-gram back into a single string
  tri <- ngrams(unlist(w), 3)
  vapply(tri, paste, "", collapse = "")
}
trigram.all <- sapply(w.split, trigram)
tb <- table(unlist(trigram.all))
tb[tb>=2]
##
## 大巨蛋 北市府 吳志揚
## 2 2 2
Removing Keywords
s = "當初中央政府拿台北市的精華地跟北市府交換"
s.split = strsplit(s, '台北市')
paste(unlist(s.split), collapse = "")
## [1] "當初中央政府拿的精華地跟北市府交換"
Building a Keyword-Removal Function
keywords = c('台北市', '中央')
for (key in keywords){
print(key)
}
## [1] "台北市"
## [1] "中央"
removekey <- function(s, keys){
  # drop every occurrence of each keyword from the string
  for (key in keys){
    s.split = strsplit(s, key)
    s = paste(unlist(s.split), collapse = "")
  }
  s
}
removekey("當初中央政府拿台北市的精華地跟北市府交換", c("台北市", "中央"))
## [1] "當初政府拿的精華地跟北市府交換"
Building an n-gram Tokenizer Function
ngram.func <- function(w, n){
  # build all character n-grams and paste each back into a string
  n.gram <- ngrams(unlist(w), n)
  vapply(n.gram, paste, "", collapse = "")
}
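A quick check of the helper on the earlier example string:
ngram.func(strsplit('我喜歡這本書', split='')[[1]], 2)
## [1] "我喜" "喜歡" "歡這" "這本" "本書"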
Implementing Longest-Term-First Extraction
article = '陳冠希日前開砲怒罵林志玲「婊子」引發不少風波,今(29日)陳冠希現身透露曾3度找林志玲方面談話,有意私下解決,沒想到對方冷處理,才逼他出面開砲,稍早林志玲其中一位經紀人閻柔怡表示,陳冠希一個月前的確透過朋友傳話,但只有兩次,內容則是:「《我的新衣》是不是林志玲不讓秦舒培參加?」
據《蘋果日報》報導,閻柔怡稍早證實一個月前曾接到兩方中間朋友來電,陳冠希要他帶話問:「《我的新衣》是不是林志玲不讓秦舒培參加?」但閻柔怡表示林志玲僅是受邀參加的嘉賓,認為其中出了誤會,第二天同位友人又再打來,問一樣的問題,閻柔怡仍則給出一樣的答覆。
閻柔怡說,當時不感覺對方有怒氣,所以以為事情應該就這樣告一段落,對於陳冠希今透露早告知對方:「我是瘋子,我會罵你啊!」閻柔怡則說沒聽到這句話,而該節目製作人曹青稍早改口表示,劉雯、何穗、秦舒培、孫菲菲等人確實一開始在同一類別「高冷超模女神」的嘉賓候選人中,但節目組最後選擇何穗,因此嚴格說起來擠掉秦舒培的非林志玲,而是何穗。'
longTermFirst <- function(article, keywords){
  # scan from 4-grams down to 2-grams so longer terms are captured
  # before their substrings can qualify as candidates
  for(i in seq(4,2,-1)){
    article = removekey(article, keywords)   # strip terms found so far
    a.split <- strsplit(article, "、|,|。")
    w.split <- strsplit(x=unlist(a.split), split ='')
    n.gram.all <- sapply(w.split, function(e) ngram.func(e,i))
    tb <- table(unlist(n.gram.all))
    candidate <- names(tb[tb>=5])            # keep n-grams occurring at least 5 times
    keywords = c(keywords, candidate)
  }
  keywords
}
keywords = c()
longTermFirst(article, keywords)
## [1] "林志玲" "陳冠希" "閻柔怡"
Getting Started with jiebaR
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
segmenter = worker()
segmenter <= article  # the <= operator is jiebaR shorthand for segment(article, segmenter)
## [1] "陳冠希" "日前" "開" "砲" "怒罵" "林志玲"
## [7] "婊子" "引發" "不少" "風波" "今" "29"
## [13] "日" "陳冠希" "現身" "透露" "曾" "3"
## [19] "度" "找" "林志玲" "方面" "談話" "有意"
## [25] "私下" "解決" "沒想到" "對方" "冷處理" "才"
## [31] "逼" "他" "出面" "開" "砲" "稍早"
## [37] "林志玲" "其中" "一位" "經紀人" "閻柔怡" "表示"
## [43] "陳冠希" "一個月" "前" "的確" "透過" "朋友"
## [49] "傳話" "但" "只有" "兩次" "內容" "則是"
## [55] "我" "的" "新衣" "是不是" "林志玲" "不讓"
## [61] "秦舒培" "參加" "據" "蘋果日報" "報導" "閻柔怡"
## [67] "稍早" "證實" "一個月" "前" "曾" "接到"
## [73] "兩方" "中間" "朋友" "來電" "陳冠希" "要"
## [79] "他" "帶話問" "我" "的" "新衣" "是不是"
## [85] "林志玲" "不讓" "秦舒培" "參加" "但" "閻"
## [91] "柔" "怡" "表示" "林志玲" "僅是" "受邀"
## [97] "參加" "的" "嘉賓" "認為" "其中" "出"
## [103] "了" "誤會" "第二天" "同位" "友人" "又"
## [109] "再" "打來" "問" "一樣" "的" "問題"
## [115] "閻柔怡" "仍則" "給出" "一樣" "的" "答覆"
## [121] "閻柔怡" "說" "當時" "不" "感覺" "對方"
## [127] "有" "怒氣" "所以" "以為" "事情" "應該"
## [133] "就" "這樣" "告一段落" "對於" "陳冠希" "今"
## [139] "透露" "早" "告知" "對方" "我" "是"
## [145] "瘋子" "我會" "罵" "你啊" "閻" "柔"
## [151] "怡" "則" "說" "沒" "聽到" "這句"
## [157] "話" "而" "該" "節目" "製作" "人"
## [163] "曹青" "稍早" "改口" "表示" "劉雯" "何"
## [169] "穗" "秦" "舒" "培" "孫菲菲" "等"
## [175] "人" "確實" "一" "開始" "在" "同一"
## [181] "類別" "高" "冷" "超模" "女神" "的"
## [187] "嘉賓" "候選人" "中" "但" "節目組" "最後"
## [193] "選擇" "何穗" "因此" "嚴格" "說" "起來"
## [199] "擠掉" "秦舒培" "的" "非" "林志玲" "而是"
## [205] "何穗"
Word Segmentation with jiebaR
s="那我們酸民婉君也可以報名嗎"
mixseg = worker()
segment(code= s , jiebar = mixseg)
## [1] "那" "我們" "酸民" "婉君" "也" "可以" "報名" "嗎"
Editing the User-Defined Dictionary
#edit_dict()  # interactively open the user-defined dictionary for editing
#USERPATH     # file path of the user-defined dictionary
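After saving new terms (one per line) via edit_dict(), re-create the worker; a sketch, since workers load the user dictionary at USERPATH when they are constructed:
#mixseg = worker()  # a new worker picks up the edited user dictionary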
Extracting Part-of-Speech Tags
The 'tag' worker names each token with its part-of-speech tag:
tagseg = worker('tag')
segment(s, tagseg)
## r r n x d c v y
## "那" "我們" "酸民" "婉君" "也" "可以" "報名" "嗎"
Expanding the Dictionary with Liberty Times (自由時報) Keywords
library(rvest)
## Warning: package 'rvest' was built under R version 3.2.5
## Loading required package: xml2
read_html('http://news.ltn.com.tw/news/politics/breakingnews/1779370') %>%
  html_nodes('.con_keyword a') %>%
  html_text()
## [1] "國民黨" "太平島" "李登輝" "蔡英文"
Computing Keywords
The 'keywords' worker ranks terms by TF-IDF and returns the top-n with their scores.
s="那我們酸民婉君也可以報名嗎"
key = worker('keywords', topn = 3)
key <= s
## 11.7392 11.7392 11.7392
## "報名" "婉君" "我們"
Computing TF-IDF
tfidf(t, d, D) = tf(t, d) × idf(t, D), where tf(t, d) is the relative frequency of term t in document d and idf(t, D) = log(|D| / number of documents in D containing t).
a <- c("a")
abb <- c("a", "b", "b")
abc <- c("a", "b", "c")
D <- list(a, abb, abc)
tfidf <- function(t, d, D){
  # term frequency: relative count of term t in document d
  tf <- table(d)[names(table(d)) == t] / sum(table(d))
  # inverse document frequency: log(corpus size / documents containing t)
  idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))
  tf * idf
}
tfidf('a',a,D)
## a
## 0
tf1 = table(a)[names(table(a)) == 'a'] / sum(table(a))
idf1 = log(length(D) / sum(sapply(D, function(e) 'a' %in% e)))
tf1 * idf1  # 'a' occurs in every document, so idf = log(3/3) = 0
## a
## 0
tfidf('b',abb,D)
## b
## 0.2703101
tf2 = table(abb)[names(table(abb)) == 'b'] / sum(table(abb))
idf2 = log(length(D) / sum(sapply(D, function(e) 'b' %in% e)))
tf2 * idf2
## b
## 0.2703101
tfidf('b',abc,D)
## b
## 0.135155
tfidf('c',abc,D)
## c
## 0.3662041
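As a sanity check: 'b' appears in two of the three documents, so in abc its tf is 1/3 and idf = log(3/2) ≈ 0.4055, giving ≈ 0.1352; 'c' appears in only one document, so idf = log(3/1) ≈ 1.0986 and tf-idf ≈ 0.3662, matching the outputs above.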
library(jiebaR)
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/data/applenews.RData', 'applenews.RData')
load('applenews.RData')
str(applenews)
## 'data.frame': 1500 obs. of 5 variables:
## $ content : chr "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮刮樂彩券,上月佯"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出後,這名博士生立"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房市大追擊,本集節"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,他今天將前往在菲"| __truncated__ ...
## $ title : chr "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演 美防長南海登艦" ...
## $ dt : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
## $ category: chr "社會" "國際" "地產" "國際" ...
## $ view_cnt: chr "1754" "0" "0" "0" ...
mixseg = worker()
content <- applenews$content[grepl('肯亞', applenews$content)]  # articles mentioning 肯亞
seg.str <- segment(code=content, mixseg)
seg.tb <- table(seg.str)
seg.tb <- seg.tb[nchar(names(seg.tb)) >= 2]   # keep terms of two or more characters
tb <- sort(seg.tb, decreasing = TRUE)[1:100]  # top 100 terms by frequency
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.2.5
## Loading required package: RColorBrewer
wordcloud(names(tb), tb, min.freq = 1, random.order = F, ordered.colors = F, colors = rainbow(3))

s = "大巨蛋案對市府同仁下封口令?柯P否認"
mixseg = worker()
segment(code= s , jiebar = mixseg)
## [1] "大巨蛋" "案" "對" "市府" "同仁" "下" "封口令" "柯P"
## [9] "否認"
library(tm)
## Warning: package 'tm' was built under R version 3.2.5
e3 = 'Hello, I am David. I have taken over 100 courses ~~~'
e3.vec = strsplit(e3, ' ')
e3.corpus = Corpus(VectorSource(e3.vec))
e3.dtm = DocumentTermMatrix(e3.corpus)
inspect(e3.dtm)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs ~~~ 100 courses david. have hello, over taken
## 1 1 1 1 1 1 1 1 1
doc = tm_map(e3.corpus, removeNumbers)
doc = tm_map(doc, removePunctuation)
dtm = DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs courses david have hello over taken
## 1 1 1 1 1 1 1
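Note that removePunctuation also stripped the tildes along with the commas and periods. When only one specific pattern should go, a custom transformer is needed, as the next example shows.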
removetilde <- content_transformer(function(x) gsub("~", "", x))
doc = tm_map(e3.corpus, removetilde)
dtm = DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs 100 courses david. have hello, over taken
## 1 1 1 1 1 1 1 1
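content_transformer can also forward extra tm_map arguments to the wrapped function, which makes the transformer reusable for any pattern; a minimal sketch (removePattern is a name introduced here):
removePattern <- content_transformer(function(x, pattern) gsub(pattern, "", x))
doc = tm_map(e3.corpus, removePattern, "~+")  # the pattern is passed through tm_map
dtm = DocumentTermMatrix(doc)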
Building a Term-Frequency Matrix
e1 = 'this is a book'
e2 = 'this is my car'
e1.vec = strsplit(e1, ' ')[[1]]
e2.vec = strsplit(e2, ' ')[[1]]
e.vec = list(e1.vec, e2.vec)
e.corpus = Corpus(VectorSource(e.vec))
e.dtm = DocumentTermMatrix(e.corpus)
inspect(e.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
##
## Terms
## Docs book car this
## 1 1 0 1
## 2 0 1 1
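tm can also weight the matrix by TF-IDF directly, mirroring the hand-rolled tfidf() above; a sketch using the control argument:
e.tfidf.dtm = DocumentTermMatrix(e.corpus, control = list(weighting = weightTfIdf))
inspect(e.tfidf.dtm)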