English word segmentation
sentence <- 'This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.'
str.split <- strsplit(x = sentence, split = ' ')
str.split[[1]]
## [1] "This" "is" "an" "R" "Markdown"
## [6] "document." "Markdown" "is" "a" "simple"
## [11] "formatting" "syntax" "for" "authoring" "HTML,"
## [16] "PDF," "and" "MS" "Word" "documents."
unlist(str.split)
## [1] "This" "is" "an" "R" "Markdown"
## [6] "document." "Markdown" "is" "a" "simple"
## [11] "formatting" "syntax" "for" "authoring" "HTML,"
## [16] "PDF," "and" "MS" "Word" "documents."
sentences <- c('This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.', 'When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.')
strsplit(sentences, ' ')
## [[1]]
## [1] "This" "is" "an" "R" "Markdown"
## [6] "document." "Markdown" "is" "a" "simple"
## [11] "formatting" "syntax" "for" "authoring" "HTML,"
## [16] "PDF," "and" "MS" "Word" "documents."
##
## [[2]]
## [1] "When" "you" "click" "the" "**Knit**"
## [6] "button" "a" "document" "will" "be"
## [11] "generated" "that" "includes" "both" "content"
## [16] "as" "well" "as" "the" "output"
## [21] "of" "any" "embedded" "R" "code"
## [26] "chunks" "within" "the" "document."
Chinese sentence splitting
sentence <- '自然語言處理的學派大致上可以分為:統計自然語言處理、基於規則的自然語言處理,而在今天的開張日中,我們聚焦在比較大的方向,來說明我們如何用自然語言處理,處理一篇文章~'
strsplit(sentence, '、|,|~|:')
## [[1]]
## [1] "自然語言處理的學派大致上可以分為" "統計自然語言處理"
## [3] "基於規則的自然語言處理" "而在今天的開張日中"
## [5] "我們聚焦在比較大的方向" "來說明我們如何用自然語言處理"
## [7] "處理一篇文章"
Cipher analysis
a <- 'abbccddeeedd'
# substitution table: shift each letter forward by one (a Caesar-style cipher)
pwd <- list('a'='b', 'b'='c', 'c'='d', 'd'='e', 'e'='f', 'f'='g', 'g'='h',
            'h'='i', 'i'='j', 'j'='k', 'k'='l', 'l'='m', 'm'='n', 'n'='o',
            'o'='p', 'p'='q', 'q'='r', 'r'='s', 's'='t', 't'='u', 'u'='v',
            'z'='a', ' '=' ')
newsentence <- c()
for (ele in unlist(strsplit(a, ''))){
  # look up the substitution for each character
  newsentence <- c(newsentence, pwd[ele])
}
unlist(newsentence)
## a b b c c d d e e e d d
## "b" "c" "c" "d" "d" "e" "e" "f" "f" "f" "e" "e"
JiebaR
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.3
## Loading required package: jiebaRD
s <- '那我們酸民婉君也可以報名嗎?'
#?worker
mixseg <- worker(type='mix')
segment(code = s, jiebar = mixseg)
## [1] "那" "我們" "酸民婉君" "也" "可以" "報名"
## [7] "嗎"
#?edit_dict()
edit_dict(name= 'system')
## Warning in edit_dict(name = "system"): Open system dictionary will take a
## long time
## Warning in edit_dict(name = "system"): You should save the dictionary
## without BOM on Windows
edit_dict(name= 'user')
## Warning in edit_dict(name = "user"): You should save the dictionary without
## BOM on Windows
edit_dict(name= 'stop_word')
## Warning in edit_dict(name = "stop_word"): You should save the dictionary
## without BOM on Windows
USERPATH
## [1] "C:/Program Files/R/R-3.4.1/library/jiebaRD/dict/user.dict.utf8"
Extracting part-of-speech tags
tagseg <- worker('tag')
segment(code = s, jiebar = tagseg)
## r r x d c v
## "那" "我們" "酸民婉君" "也" "可以" "報名"
## y
## "嗎"
Extending the dictionary
library(rvest)
?file
# open the user dictionary in append mode ('wa' is not a valid open mode in R)
f <- file('C:/Program Files/R/R-3.4.1/library/jiebaRD/dict/user.dict.utf8', 'a')
alinks <- read_html('http://news.ltn.com.tw/list/breakingnews') %>%
  html_nodes('.tit') %>%
  html_attr('href')
for (link in alinks){
  keywords <- read_html(link) %>% html_nodes('.keyword a') %>% html_text()
  for(k in keywords){
    cat(paste0(k, '\n'), file = f)
  }
}
close(f)
edit_dict()
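A worker loads its dictionaries when it is created, so the edits above only take effect for workers built afterwards; a minimal sketch:
# rebuild the segmenter so it picks up the updated user dictionary
mixseg <- worker(type = 'mix')
segment(code = s, jiebar = mixseg)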
Bi-gram
library(NLP)
s <- strsplit('那我們酸民婉君也可以報名嗎', '')
bigram <- ngrams(unlist(s), 2)
s <- c('我', '們')
paste(s, sep = '', collapse = '')
## [1] "我們"
# method 1
unlist(lapply(bigram,
              function(e) paste(e, sep = '', collapse = '')))
## [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
# method 2
vapply(bigram, paste, '', collapse='')
## [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
Tri-gram
s <- strsplit('那我們酸民婉君也可以報名嗎', '')
trigram <- ngrams(unlist(s), 3)
# method 2
vapply(trigram, paste, '', collapse='')
## [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
## [8] "也可以" "可以報" "以報名" "報名嗎"
An n-gram function
ngram_func <- function(s, n = 2){
  s <- strsplit(s, '')
  ngram <- ngrams(unlist(s), n)
  vapply(ngram, paste, '', collapse = '')
}
s <- '那酸民婉君也可以報名嗎'
ngram_func(s, 4)
## [1] "那酸民婉" "酸民婉君" "民婉君也" "婉君也可" "君也可以" "也可以報"
## [7] "可以報名" "以報名嗎"
Finding meaningful terms
article <- "近來新台幣走強,民眾逢低買進加持美元的意願增加,相較於台幣保單而言,美元保單訂價使用較高的利率,因此美元保單有費率便宜的優勢,保額與保單價值均會隨著保單年度逐年遞增,成為多數人退休理財規劃的首選,而在去年掀起一波美元保單熱賣潮"
bigram <- ngrams(unlist(strsplit(article, '')), 2)
bigram.str <- vapply(bigram, paste, '', collapse = '')
tb <- table(bigram.str)
#sort(tb, decreasing = TRUE)
tb[tb >=2]
quadgram <- ngrams(unlist(strsplit(article, '')), 4)
quadgram.str <- vapply(quadgram, paste, '', collapse = '')
tb <- table(quadgram.str)
#sort(tb, decreasing = TRUE)
tb[tb >=2]
a.split <- strsplit(article, ',|。')
w.split <- strsplit(unlist(a.split), '')
#w.split
ngram_func <- function(w, n = 2){
  bigram.str <- vapply(ngrams(unlist(w), n), paste, '', collapse = '')
  bigram.str
}
bigram.all <- lapply(w.split, ngram_func)
sort(table(unlist(bigram.all)), decreasing = TRUE)
s <- '民眾逢低買進美元意願增加'
s.split <- strsplit(s, '美元')
s.split
paste(unlist(s.split), collapse = '', sep = '' )
gsub('美元', '', s)
removekey <- function(s, keys){
  for(key in keys){
    s <- gsub(key, '', s)
  }
  s
}
removekey('民眾逢低買進美元意願增加',c('民眾','美元'))
ngram.func <- function(w, n){
  n.gram <- ngrams(unlist(w), n)
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")
  n.gram.str
}
Implementing the longest-term-first method
removekey <- function(s, keys){
  for(key in keys){
    s <- gsub(key, '', s)
  }
  s
}
ngram.func <- function(w, n){
  n.gram <- ngrams(unlist(w), n)
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")
  n.gram.str
}
longTermFirst <- function(article, keywords, threshold){
  # scan from 4-grams down to 2-grams: longer candidate terms are
  # extracted (and removed from the text) before shorter ones
  for (i in seq(4, 2, -1)){
    article <- removekey(article, keywords)
    a.split <- strsplit(article, ',|。|、|「|」|;')
    w.split <- strsplit(x = unlist(a.split), split = '')
    n.gram.all <- sapply(w.split, function(e) ngram.func(e, i))
    tb <- table(unlist(n.gram.all))
    # keep n-grams occurring at least `threshold` times as keywords
    candidates <- names(tb[tb >= threshold])
    keywords <- c(keywords, candidates)
  }
  keywords
}
article <- '立法院臨時會處理今年度總預算案,經過上周密集協商,昨上午開始宣讀有共識部分後,繼續就無共識的提案逐案表決,由於國民黨團已大幅撤案,時代力量黨團也將撤回法務部矯正署的上百項刪減凍結案,總預算案在下午三讀,包括軍公教加薪3%等重要項目都順利過關,預計2月起就能開始加薪。
不過,由於兩岸關係急凍,依朝野協商結論,中央政府各機關大陸地區旅費遭通案刪減25%,共計2千萬元。另外,也刪減委辦費3%、水電費1%、政策宣導費3%、設備及投資9.2%、對國內團體補助2%、加上其餘刪減數46.6億元,算出預算共刪減249.1億元,刪減比率達歲出預算1兆9918億元的1.25%。
時代力量原提191案需要表決,針對法務部矯正署就有上百案刪減、凍結案,民進黨團總召柯建銘昨酸,「兩萬塊文具費也要專案報告,離譜至極」;時代力量黨團總召徐永明昨則說,矯正署已同意貪污法官胡景彬案有問題,凍結矯正署、八德外獄監典獄長特別費,因此撤至3案,今表決進度也大幅提升。時代力量也要求全數刪減公教退休人員年終及三節慰問金,不過,民進黨團認為應維持2.5萬元門檻,最後未通過。
此外,台大校長當選人管中閔遴選過程引發爭議,時代力量立法院黨團日前提預算主決議,要求「台大校長遴選爭議於釐清前不得逕行核定校長名單」,但昨進行文字修正,將「台大」改為「多所公私立大學」。然而時代力量立委林昶佐今天早上在立法院院會發言時指出,由於民進黨團的反對,認為是新案,因此不得表決,但他還是要表達時代力量的立場。'
longTermFirst(article, c(), 3)
## [1] "民進黨團" "時代力量" "\n \n" "立法院" "矯正署" "3%"
## [7] "台大" "由於" "刪減" "表決" "凍結" "校長"
## [13] "預算" "億元" "黨團"
article <- '實施近7年的日本福島等5縣食品禁令,衛福部昨透露可望打破地區限制,改採高風險食品管制,待行政院裁定時間表。對此,行政院發言人徐國勇今受訪時,衛福部沒有送任何公文來,他並強調,行政院會以國人健康安全為首要考量、並參酌國際作法和科學依據等三原則不變。
2011年3月11日日本大地震後,我國即禁止福島、茨城、群馬、櫪木、千葉5縣市食品輸台。衛福部昨透露可望打破地區限制,改採高風險食品管制,但新制無時間表,待行政院裁定。行政院食安辦公室主任許輔今天也說,目前還沒有接到相關訊息。
不過許輔說,現在日本食品很多狀況和7年前不同,大多數國家已認同,日本流通品是控制在安全範圍。其中美國限制日本福島附近共14個縣市特定品項進口,但台灣及中國仍停留在2011年核災爆發時,針對福島周遭5個縣市所有商品管制。
至於傳台灣爭取加入由日本主導的「跨太平洋夥伴全面進展協定(CPTPP)」之下,才想加速解禁日核災食品。對此,許輔說,政府不會用食安換經貿,但是兩邊也都要顧好,「我們盡量把食安做到滴水不漏,至於國際經貿當然也要接軌。」在前年底公聽會之後,政府有強化食安檢驗和管理,也持續蒐集國外相關資訊。 '
longTermFirst(article, c(), 4)
## [1] "行政院" "日本" "食安" "食品" "福島"
Extracting keywords
library(jiebaR)
key <- worker('keywords', topn = 3)
s <- '那酸民婉君也可以報名嗎'
key <= s
## 11.7392 11.7392
## "報名" "那酸民婉君"
# a toy corpus of three documents, used to work through tf-idf by hand
a <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')
D <- list(a, abb, abc)
#tfidf('a', a, D)
tf <- 1/1
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abb, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abc, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abb, D)
tf <- 2/3
idf <- log(3/2)
tf * idf
## [1] 0.2703101
#tfidf('b', abc, D)
tf <- 1/3
idf <- log(3/2)
tf * idf
## [1] 0.135155
#tfidf('c', abc, D)
tf <- 1/3
idf <- log(3/1)
tf * idf
## [1] 0.3662041
tfidf <- function(t, d, D){
  # term frequency: share of tokens in document d equal to term t
  # (divide by length(d), not the global length(abc))
  tf <- sum(d == t) / length(d)
  # inverse document frequency: log of total docs over docs containing t
  idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))
  tf * idf
}
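As a sanity check, the function should reproduce the hand-computed values above:
tfidf('b', abb, D)  # 0.2703101
tfidf('c', abc, D)  # 0.3662041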
Drawing a word cloud
article <- '立法院臨時會處理今年度總預算案,經過上周密集協商,昨上午開始宣讀有共識部分後,繼續就無共識的提案逐案表決,由於國民黨團已大幅撤案,時代力量黨團也將撤回法務部矯正署的上百項刪減凍結案,總預算案在下午三讀,包括軍公教加薪3%等重要項目都順利過關,預計2月起就能開始加薪。
不過,由於兩岸關係急凍,依朝野協商結論,中央政府各機關大陸地區旅費遭通案刪減25%,共計2千萬元。另外,也刪減委辦費3%、水電費1%、政策宣導費3%、設備及投資9.2%、對國內團體補助2%、加上其餘刪減數46.6億元,算出預算共刪減249.1億元,刪減比率達歲出預算1兆9918億元的1.25%。
時代力量原提191案需要表決,針對法務部矯正署就有上百案刪減、凍結案,民進黨團總召柯建銘昨酸,「兩萬塊文具費也要專案報告,離譜至極」;時代力量黨團總召徐永明昨則說,矯正署已同意貪污法官胡景彬案有問題,凍結矯正署、八德外獄監典獄長特別費,因此撤至3案,今表決進度也大幅提升。時代力量也要求全數刪減公教退休人員年終及三節慰問金,不過,民進黨團認為應維持2.5萬元門檻,最後未通過。
此外,台大校長當選人管中閔遴選過程引發爭議,時代力量立法院黨團日前提預算主決議,要求「台大校長遴選爭議於釐清前不得逕行核定校長名單」,但昨進行文字修正,將「台大」改為「多所公私立大學」。然而時代力量立委林昶佐今天早上在立法院院會發言時指出,由於民進黨團的反對,認為是新案,因此不得表決,但他還是要表達時代力量的立場。'
library(jiebaR)
edit_dict(name = 'user')
## Warning in edit_dict(name = "user"): You should save the dictionary without
## BOM on Windows
mixseg <- worker()
words <- segment(code = article, jiebar = mixseg)
tb <- table(words)
sorted.words <- sort(tb[tb >= 2 & nchar(names(tb)) >= 2], decreasing = TRUE)
#install.packages('wordcloud2')
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.3
wordcloud2(sorted.words, shape = 'pentagon')
?wordcloud2
## starting httpd help server ... done
article <- '綠委段宜康於2014年11月縣市長選舉期間,為了替魏明谷助選彰化縣長,指控魏的對手、擔任曲棍球協會理事長的前藍委林滄敏詐領補助款500萬元,魏也在報紙登廣告指林用假單據詐領公款,段甚至揚言若指控不是事實,要公開吞曲棍球,但檢方事後調查認定林滄敏並無不法行為,林怒告段、魏,高等法院今判段宜康、魏明谷須連帶賠林滄敏共100萬元並登報道歉。段宜康今早受訪說,會上訴到底。林滄敏則說,段宜康不要再歹戲拖棚。
林滄敏接受《蘋果》訪問說,要上訴是段宜康的權利,但是法院都是看證據說話,段宜康再怎麼上訴,結果都還是一樣,段宜康既然在全國人民面前承諾,要吞三顆曲棍球,就不要再歹戲拖棚,趕快吞一吞,「不然你還有人格嗎?還有臉當立院司法法制委員會召委嗎?」'
paintWordCloud <- function(article){
  mixseg <- worker()
  words <- segment(code = article, jiebar = mixseg)
  tb <- table(words)
  sorted.words <- sort(tb[tb >= 2 & nchar(names(tb)) >= 2], decreasing = TRUE)
  wordcloud2(sorted.words, shape = 'star')
}
paintWordCloud(article)
#install.packages('wordcloud')
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.3
## Loading required package: RColorBrewer
wordcloud(names(sorted.words), sorted.words, random.color = TRUE, colors = rainbow(length(sorted.words)))

#?rainbow
Building a term-frequency matrix
s <- '大巨蛋案對市府同仁下封口令? 柯P否認'
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
segment(code = s, jiebar = mixseg)
## [1] "大巨蛋" "案對" "市府" "同仁" "下" "封口令" "柯P" "否認"
# install.packages('tm')
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
e <- 'Hello, I am David. I have taken over 100 courses ~~~'
e.vec <- strsplit(e, ' ')
# note: e.vec is a list, which VectorSource() deparses into the literal string
# 'c("Hello,", "I", ...)' -- hence the stray terms "c" and "chello" below
corpus <- Corpus(VectorSource(e.vec))
?DocumentTermMatrix
dtm <- DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 courses david have hello over taken
## 1 1 1 1 1 1 1 1
tdm <- TermDocumentMatrix(corpus)
tdm
## <<TermDocumentMatrix (terms: 7, documents: 1)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 7, documents: 1)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 1
## 100 1
## courses 1
## david 1
## have 1
## hello 1
## over 1
## taken 1
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(1, 20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 am c courses david have hello i over taken
## 1 1 1 1 1 1 1 1 2 1 1
doc <- tm_map(corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
#install.packages('SnowballC')
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity : 0%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs chello cours david have over taken
## 1 1 1 1 1 1 1
# replace every digit with 'X'; content_transformer() wraps a plain function
# so tm_map() can apply it to each document
removeN <- content_transformer(function(x) gsub('\\d', 'X', x))
doc <- tm_map(corpus, removeN)
dtm <- DocumentTermMatrix(doc, control = list(wordLengths = c(1, 20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs am c courses david have hello i over taken xxx
## 1 1 1 1 1 1 1 2 1 1 1
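The same tm pipeline works for Chinese once the text has been segmented: glue the jiebaR tokens back together with spaces so tm's tokenizer can split them again. A minimal sketch reusing mixseg and the headline s from above (wordLengths = c(1, Inf) keeps single-character terms):
seg.str <- paste(segment(code = s, jiebar = mixseg), collapse = ' ')
corpus <- Corpus(VectorSource(seg.str))
dtm <- DocumentTermMatrix(corpus, control = list(wordLengths = c(1, Inf)))
inspect(dtm)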
titles <- c("Trump's war on Russia probe reaches new peak", "Trump administration declines to issue new Russia sanctions")
seg <- strsplit(titles, ' ')
corpus <- Corpus(VectorSource(seg))
doc <- tm_map(corpus, stemDocument)
dtm <- DocumentTermMatrix(doc,
                          control = list(weighting = function(x)
                            weightTfIdf(x, normalize = FALSE)))
#?DocumentTermMatrix
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 11)>>
## Non-/sparse entries: 8/14
## Sparsity : 64%
## Maximal term length: 14
## Weighting : term frequency - inverse document frequency (tf-idf)
## Sample :
## Terms
## Docs administration declines issue new peak probe reaches russia sanctions
## 1 0 0 0 0 1 1 1 0 0
## 2 1 1 1 0 0 0 0 0 1
## Terms
## Docs war
## 1 1
## 2 0
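Note that "russia" and "new" appear in both titles, so their inverse document frequency is log2(2/2) = 0 and their tf-idf weight vanishes: tf-idf deliberately zeroes out terms common to every document and keeps the discriminative ones.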