English Tokenization

sentence <- 'This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.'

str.split <- strsplit(x = sentence, split = ' ')
str.split[[1]]
##  [1] "This"       "is"         "an"         "R"          "Markdown"  
##  [6] "document."  "Markdown"   "is"         "a"          "simple"    
## [11] "formatting" "syntax"     "for"        "authoring"  "HTML,"     
## [16] "PDF,"       "and"        "MS"         "Word"       "documents."
unlist(str.split)
##  [1] "This"       "is"         "an"         "R"          "Markdown"  
##  [6] "document."  "Markdown"   "is"         "a"          "simple"    
## [11] "formatting" "syntax"     "for"        "authoring"  "HTML,"     
## [16] "PDF,"       "and"        "MS"         "Word"       "documents."
sentences <- c('This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents.', 'When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document.')

strsplit(sentences, ' ')
## [[1]]
##  [1] "This"       "is"         "an"         "R"          "Markdown"  
##  [6] "document."  "Markdown"   "is"         "a"          "simple"    
## [11] "formatting" "syntax"     "for"        "authoring"  "HTML,"     
## [16] "PDF,"       "and"        "MS"         "Word"       "documents."
## 
## [[2]]
##  [1] "When"      "you"       "click"     "the"       "**Knit**" 
##  [6] "button"    "a"         "document"  "will"      "be"       
## [11] "generated" "that"      "includes"  "both"      "content"  
## [16] "as"        "well"      "as"        "the"       "output"   
## [21] "of"        "any"       "embedded"  "R"         "code"     
## [26] "chunks"    "within"    "the"       "document."
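
Splitting on spaces keeps punctuation attached to tokens such as "document." and "HTML,". A minimal sketch of one way to strip it first with base R's gsub:

# remove punctuation before splitting so tokens are clean words
strsplit(gsub('[[:punct:]]', '', sentence), ' ')[[1]]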

Chinese Sentence Segmentation

sentence <- '自然語言處理的學派大致上可以分為:統計自然語言處理、基於規則的自然語言處理,而在今天的開張日中,我們聚焦在比較大的方向,來說明我們如何用自然語言處理,處理一篇文章~'

strsplit(sentence, '、|,|~|:')
## [[1]]
## [1] "自然語言處理的學派大致上可以分為" "統計自然語言處理"                
## [3] "基於規則的自然語言處理"           "而在今天的開張日中"              
## [5] "我們聚焦在比較大的方向"           "來說明我們如何用自然語言處理"    
## [7] "處理一篇文章"
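
The alternation above can also be written as a regex character class; an equivalent sketch:

# a character class of single-character delimiters behaves the same as the alternation
strsplit(sentence, '[、,~:]')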

Cipher Analysis

a <- 'abbccddeeedd'

pwd <- list('a'='b','b'='c','c'='d', 
            'd'='e', 'e'='f', 'f'='g', 'g'='h', 'h'='i', 'i'='j', 'j'='k', 'k'='l', 'l'='m', 'm'='n',
            'n'='o','o'='p','p'='q','q'='r', 'r'='s', 's'='t','t'='u', 'z' = 'a', ' '= ' ', 'u'= 'v')

newsentence <- c()
for (ele in unlist(strsplit(a, ''))){
  # pwd[ele] (single bracket) returns a named one-element list;
  # pwd$ele would not work here, since $ looks up the literal name 'ele'
  newsentence <- c(newsentence, pwd[ele])
}
unlist(newsentence)
##   a   b   b   c   c   d   d   e   e   e   d   d 
## "b" "c" "c" "d" "d" "e" "e" "f" "f" "f" "e" "e"

JiebaR

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.3
## Loading required package: jiebaRD
s <- '那我們酸民婉君也可以報名嗎?'

#?worker
mixseg <- worker(type='mix')
segment(code = s, jiebar = mixseg)
## [1] "那"       "我們"     "酸民婉君" "也"       "可以"     "報名"    
## [7] "嗎"
#?edit_dict()
edit_dict(name= 'system')
## Warning in edit_dict(name = "system"): Open system dictionary will take a
## long time
## Warning in edit_dict(name = "system"): You should save the dictionary
## without BOM on Windows
edit_dict(name= 'user')
## Warning in edit_dict(name = "user"): You should save the dictionary without
## BOM on Windows
edit_dict(name= 'stop_word')
## Warning in edit_dict(name = "stop_word"): You should save the dictionary
## without BOM on Windows
USERPATH
## [1] "C:/Program Files/R/R-3.4.1/library/jiebaRD/dict/user.dict.utf8"

Extracting Part-of-Speech Tags

tagseg <- worker('tag')
segment(code = s, jiebar = tagseg)
##          r          r          x          d          c          v 
##       "那"     "我們" "酸民婉君"       "也"     "可以"     "報名" 
##          y 
##       "嗎"

Extending the Dictionary

library(rvest)
?file

f <- file('C:/Program Files/R/R-3.4.1/library/jiebaRD/dict/user.dict.utf8', 'a')  # append mode

alinks <- read_html('http://news.ltn.com.tw/list/breakingnews') %>%
  html_nodes('.tit') %>%
  html_attr('href')

for (link in alinks){
  keywords <- read_html(link) %>% html_nodes('.keyword a') %>% html_text()
  for (k in keywords){
    cat(paste0(k, '\n'), file = f)   # append each keyword as a new dictionary entry
  }
}
close(f)



edit_dict()
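
A worker reads its dictionaries only at creation time, so after editing you need a fresh worker before the new entries take effect; a minimal sketch:

# re-create the worker so the updated user dictionary is reloaded
mixseg <- worker(user = USERPATH)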

Bi-gram

library(NLP)
s <- strsplit('那我們酸民婉君也可以報名嗎', '')
bigram <- ngrams(unlist(s), 2)


s <- c('我', '們')
paste(s, sep = '', collapse = '')
## [1] "我們"
# method 1
unlist(lapply(bigram, function(e) paste(e, sep = '', collapse = '')))
##  [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"
# method 2
vapply(bigram, paste, '', collapse='')
##  [1] "那我" "我們" "們酸" "酸民" "民婉" "婉君" "君也" "也可" "可以" "以報"
## [11] "報名" "名嗎"

Tri-gram

s <- strsplit('那我們酸民婉君也可以報名嗎', '')
trigram <- ngrams(unlist(s), 3)

# method 2
vapply(trigram, paste, '', collapse='')
##  [1] "那我們" "我們酸" "們酸民" "酸民婉" "民婉君" "婉君也" "君也可"
##  [8] "也可以" "可以報" "以報名" "報名嗎"

ngram-function

ngram_func <- function(s, n = 2){
  s <- strsplit(s, '')                      # split into single characters
  ngram <- ngrams(unlist(s), n)             # build n-grams
  vapply(ngram, paste, '', collapse = '')   # join each n-gram into one string
}
s <- '那酸民婉君也可以報名嗎'
ngram_func(s, 4)
## [1] "那酸民婉" "酸民婉君" "民婉君也" "婉君也可" "君也可以" "也可以報"
## [7] "可以報名" "以報名嗎"

Finding Meaningful Terms

article <- "近來新台幣走強,民眾逢低買進加持美元的意願增加,相較於台幣保單而言,美元保單訂價使用較高的利率,因此美元保單有費率便宜的優勢,保額與保單價值均會隨著保單年度逐年遞增,成為多數人退休理財規劃的首選,而在去年掀起一波美元保單熱賣潮"

bigram <- ngrams(unlist(strsplit(article, '')), 2)
bigram.str <- vapply(bigram, paste, '', collapse = '')
tb <- table(bigram.str)
#sort(tb, decreasing = TRUE)
tb[tb >=2]


quadgram <- ngrams(unlist(strsplit(article, '')), 4)
quadgram.str <- vapply(quadgram, paste, '', collapse = '')
tb <- table(quadgram.str)
#sort(tb, decreasing = TRUE)
tb[tb >=2]

a.split <- strsplit(article, ',|。')
w.split <- strsplit(unlist(a.split), '')
#w.split

ngram_func <- function(w, n = 2){
  # 'collapse' must be spelled correctly, or paste silently treats it as another string to join
  vapply(ngrams(unlist(w), n), paste, '', collapse = '')
}

bigram.all <- lapply(w.split, ngram_func)
sort(table(unlist(bigram.all)), decreasing = TRUE)

s <- '民眾逢低買進美元意願增加'
s.split <- strsplit(s, '美元')
s.split
paste(unlist(s.split), collapse = '', sep = '' )

gsub('美元', '', s)

removekey <- function(s, keys){
  # delete every occurrence of each key from the text
  for (key in keys){
    s <- gsub(key, '', s)
  }
  s
}

removekey('民眾逢低買進美元意願增加', c('民眾','美元'))   # "逢低買進意願增加"

ngram.func <- function(w, n){
  # build character n-grams and join each back into a single string
  n.gram <- ngrams(unlist(w), n)
  n.gram.str <- vapply(n.gram, paste, "", collapse = "")
  n.gram.str
}


Implementing the Longest-Term-First Method

longTermFirst <- function(article, keywords, threshold){
  # work from longer to shorter n-grams (4, 3, 2)
  for (i in seq(4, 2, -1)){
    # remove already-found keywords so shorter grams don't re-count them
    article <- removekey(article, keywords)
    a.split <- strsplit(article, ',|。|、|「|」|;')
    w.split <- strsplit(x = unlist(a.split), split = '')
    n.gram.all <- sapply(w.split, function(e) ngram.func(e, i))
    tb <- table(unlist(n.gram.all))
    # keep n-grams that occur at least `threshold` times
    candidates <- names(tb[tb >= threshold])
    keywords <- c(keywords, candidates)
  }
  keywords
}




article <- '立法院臨時會處理今年度總預算案,經過上周密集協商,昨上午開始宣讀有共識部分後,繼續就無共識的提案逐案表決,由於國民黨團已大幅撤案,時代力量黨團也將撤回法務部矯正署的上百項刪減凍結案,總預算案在下午三讀,包括軍公教加薪3%等重要項目都順利過關,預計2月起就能開始加薪。
 
不過,由於兩岸關係急凍,依朝野協商結論,中央政府各機關大陸地區旅費遭通案刪減25%,共計2千萬元。另外,也刪減委辦費3%、水電費1%、政策宣導費3%、設備及投資9.2%、對國內團體補助2%、加上其餘刪減數46.6億元,算出預算共刪減249.1億元,刪減比率達歲出預算1兆9918億元的1.25%。
 
時代力量原提191案需要表決,針對法務部矯正署就有上百案刪減、凍結案,民進黨團總召柯建銘昨酸,「兩萬塊文具費也要專案報告,離譜至極」;時代力量黨團總召徐永明昨則說,矯正署已同意貪污法官胡景彬案有問題,凍結矯正署、八德外獄監典獄長特別費,因此撤至3案,今表決進度也大幅提升。時代力量也要求全數刪減公教退休人員年終及三節慰問金,不過,民進黨團認為應維持2.5萬元門檻,最後未通過。
 
此外,台大校長當選人管中閔遴選過程引發爭議,時代力量立法院黨團日前提預算主決議,要求「台大校長遴選爭議於釐清前不得逕行核定校長名單」,但昨進行文字修正,將「台大」改為「多所公私立大學」。然而時代力量立委林昶佐今天早上在立法院院會發言時指出,由於民進黨團的反對,認為是新案,因此不得表決,但他還是要表達時代力量的立場。'

longTermFirst(article, c(), 3)
##  [1] "民進黨團" "時代力量" "\n \n"    "立法院"   "矯正署"   "3%"      
##  [7] "台大"     "由於"     "刪減"     "表決"     "凍結"     "校長"    
## [13] "預算"     "億元"     "黨團"
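
The "\n \n" entry is a paragraph break picked up as an n-gram; extending the delimiter pattern with whitespace avoids it. A sketch of a hypothetical adjusted split line inside longTermFirst (not run here):

a.split <- strsplit(article, ',|。|、|「|」|;|\n| ')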
article <- '實施近7年的日本福島等5縣食品禁令,衛福部昨透露可望打破地區限制,改採高風險食品管制,待行政院裁定時間表。對此,行政院發言人徐國勇今受訪時,衛福部沒有送任何公文來,他並強調,行政院會以國人健康安全為首要考量、並參酌國際作法和科學依據等三原則不變。

2011年3月11日日本大地震後,我國即禁止福島、茨城、群馬、櫪木、千葉5縣市食品輸台。衛福部昨透露可望打破地區限制,改採高風險食品管制,但新制無時間表,待行政院裁定。行政院食安辦公室主任許輔今天也說,目前還沒有接到相關訊息。

不過許輔說,現在日本食品很多狀況和7年前不同,大多數國家已認同,日本流通品是控制在安全範圍。其中美國限制日本福島附近共14個縣市特定品項進口,但台灣及中國仍停留在2011年核災爆發時,針對福島周遭5個縣市所有商品管制。

至於傳台灣爭取加入由日本主導的「跨太平洋夥伴全面進展協定(CPTPP)」之下,才想加速解禁日核災食品。對此,許輔說,政府不會用食安換經貿,但是兩邊也都要顧好,「我們盡量把食安做到滴水不漏,至於國際經貿當然也要接軌。」在前年底公聽會之後,政府有強化食安檢驗和管理,也持續蒐集國外相關資訊。 '

longTermFirst(article, c(), 4)
## [1] "行政院" "日本"   "食安"   "食品"   "福島"

Extracting Keywords

library(jiebaR)
key <- worker('keywords', topn = 3)
s   <- '那酸民婉君也可以報名嗎'
key <= s
##      11.7392      11.7392 
##       "報名" "那酸民婉君"
a   <- c('a')
abb <- c('a', 'b', 'b')
abc <- c('a', 'b', 'c')

D   <- list(a, abb, abc)

#tfidf('a', a, D)
tf  <- 1/1 
idf <- log(3/3) 
tf * idf
## [1] 0
#tfidf('a', abb, D)
tf  <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abc, D)
tf  <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abb, D) 
tf  <- 2/3
idf <- log(3/2)
tf * idf
## [1] 0.2703101
#tfidf('b', abc, D)
tf  <- 1/3
idf <- log(3/2)
tf * idf
## [1] 0.135155
#tfidf('c', abc, D)
tf  <- 1/3
idf <- log(3/1)
tf * idf
## [1] 0.3662041
tfidf <- function(t, d, D){
  tf  <- sum(d == t) / length(d)   # term frequency within document d
  idf <- log(length(D) / sum(sapply(D, function(e) t %in% e)))   # inverse document frequency
  tf * idf
}
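
As a quick check, the function reproduces the manual computations above:

tfidf('b', abb, D)   # 0.2703101
tfidf('c', abc, D)   # 0.3662041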

Drawing a Word Cloud

article <- '立法院臨時會處理今年度總預算案,經過上周密集協商,昨上午開始宣讀有共識部分後,繼續就無共識的提案逐案表決,由於國民黨團已大幅撤案,時代力量黨團也將撤回法務部矯正署的上百項刪減凍結案,總預算案在下午三讀,包括軍公教加薪3%等重要項目都順利過關,預計2月起就能開始加薪。
 
不過,由於兩岸關係急凍,依朝野協商結論,中央政府各機關大陸地區旅費遭通案刪減25%,共計2千萬元。另外,也刪減委辦費3%、水電費1%、政策宣導費3%、設備及投資9.2%、對國內團體補助2%、加上其餘刪減數46.6億元,算出預算共刪減249.1億元,刪減比率達歲出預算1兆9918億元的1.25%。
 
時代力量原提191案需要表決,針對法務部矯正署就有上百案刪減、凍結案,民進黨團總召柯建銘昨酸,「兩萬塊文具費也要專案報告,離譜至極」;時代力量黨團總召徐永明昨則說,矯正署已同意貪污法官胡景彬案有問題,凍結矯正署、八德外獄監典獄長特別費,因此撤至3案,今表決進度也大幅提升。時代力量也要求全數刪減公教退休人員年終及三節慰問金,不過,民進黨團認為應維持2.5萬元門檻,最後未通過。
 
此外,台大校長當選人管中閔遴選過程引發爭議,時代力量立法院黨團日前提預算主決議,要求「台大校長遴選爭議於釐清前不得逕行核定校長名單」,但昨進行文字修正,將「台大」改為「多所公私立大學」。然而時代力量立委林昶佐今天早上在立法院院會發言時指出,由於民進黨團的反對,認為是新案,因此不得表決,但他還是要表達時代力量的立場。'

library(jiebaR)
edit_dict(name = 'user')
## Warning in edit_dict(name = "user"): You should save the dictionary without
## BOM on Windows
mixseg <- worker()
words <- segment(code = article, jiebar = mixseg)
tb <- table(words)

sorted.words <- sort(tb[tb >= 2 & nchar(names(tb)) >= 2], decreasing = TRUE)

#install.packages('wordcloud2')
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.3
wordcloud2(sorted.words, shape='pentagon')
?wordcloud2
## starting httpd help server ... done
article <- '綠委段宜康於2014年11月縣市長選舉期間,為了替魏明谷助選彰化縣長,指控魏的對手、擔任曲棍球協會理事長的前藍委林滄敏詐領補助款500萬元,魏也在報紙登廣告指林用假單據詐領公款,段甚至揚言若指控不是事實,要公開吞曲棍球,但檢方事後調查認定林滄敏並無不法行為,林怒告段、魏,高等法院今判段宜康、魏明谷須連帶賠林滄敏共100萬元並登報道歉。段宜康今早受訪說,會上訴到底。林滄敏則說,段宜康不要再歹戲拖棚。
 
林滄敏接受《蘋果》訪問說,要上訴是段宜康的權利,但是法院都是看證據說話,段宜康再怎麼上訴,結果都還是一樣,段宜康既然在全國人民面前承諾,要吞三顆曲棍球,就不要再歹戲拖棚,趕快吞一吞,「不然你還有人格嗎?還有臉當立院司法法制委員會召委嗎?」'

paintWordCloud <- function(article){
  mixseg <- worker()
  words <- segment(code = article, jiebar = mixseg)
  tb <- table(words)

  sorted.words <- sort(tb[tb >= 2 & nchar(names(tb)) >= 2], decreasing = TRUE)

  wordcloud2(sorted.words, shape='star')
}

paintWordCloud(article)
#install.packages('wordcloud')
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.3
## Loading required package: RColorBrewer
wordcloud(names(sorted.words), sorted.words, random.color = TRUE, colors = rainbow(length(sorted.words)))

#?rainbow

Building a Term-Frequency Matrix

s <- '大巨蛋案對市府同仁下封口令? 柯P否認'
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
segment(code = s, jiebar = mixseg)
## [1] "大巨蛋" "案對"   "市府"   "同仁"   "下"     "封口令" "柯P"    "否認"
# install.packages('tm')
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
e      <- 'Hello, I am David. I have taken over 100 courses ~~~'
e.vec  <- strsplit(e, ' ')
corpus <- Corpus(VectorSource(e.vec))
?DocumentTermMatrix
dtm    <- DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 courses david have hello over taken
##    1   1       1     1    1     1    1     1
tdm    <- TermDocumentMatrix(corpus)
tdm
## <<TermDocumentMatrix (terms: 7, documents: 1)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
inspect(tdm)
## <<TermDocumentMatrix (terms: 7, documents: 1)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##          Docs
## Terms     1
##   100     1
##   courses 1
##   david   1
##   have    1
##   hello   1
##   over    1
##   taken   1
dtm    <- DocumentTermMatrix(corpus,control = list(wordLengths = c(1,20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 am c courses david have hello i over taken
##    1   1  1 1       1     1    1     1 2    1     1
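
tm tokenizes on whitespace, so the Chinese segmentation from the start of this section must be re-joined with spaces before building a matrix. A minimal sketch reusing s and mixseg from above (wordLengths keeps single-character terms, which the default minimum of 3 would drop):

seg.cn <- paste(segment(code = s, jiebar = mixseg), collapse = ' ')
dtm.cn <- DocumentTermMatrix(Corpus(VectorSource(seg.cn)),
                             control = list(wordLengths = c(1, 20)))
inspect(dtm.cn)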
doc <- tm_map(corpus, removeNumbers)
doc <- tm_map(doc   , removePunctuation)
#install.packages('SnowballC')
doc <- tm_map(doc   , stemDocument)
dtm <- DocumentTermMatrix(doc)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 6)>>
## Non-/sparse entries: 6/0
## Sparsity           : 0%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs chello cours david have over taken
##    1      1     1     1    1    1     1
# transformer that replaces every digit with 'X'
removeN <- content_transformer(function(x) gsub('\\d', 'X', x))

doc <- tm_map(corpus, removeN)
dtm <- DocumentTermMatrix(doc,control = list(wordLengths = c(1,20)))
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs am c courses david have hello i over taken xxx
##    1  1 1       1     1    1     1 2    1     1   1
titles <- c("Trump's war on Russia probe reaches new peak", "Trump administration declines to issue new Russia sanctions")

seg <- strsplit(titles, ' ')
corpus <- Corpus(VectorSource(seg))
doc <- tm_map(corpus, stemDocument)
dtm <- DocumentTermMatrix(doc,
         control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE)))
#?DocumentTermMatrix
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 11)>>
## Non-/sparse entries: 8/14
## Sparsity           : 64%
## Maximal term length: 14
## Weighting          : term frequency - inverse document frequency (tf-idf)
## Sample             :
##     Terms
## Docs administration declines issue new peak probe reaches russia sanctions
##    1              0        0     0   0    1     1       1      0         0
##    2              1        1     1   0    0     0       0      0         1
##     Terms
## Docs war
##    1   1
##    2   0
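
To read the tf-idf weights directly, the sparse matrix can be converted to an ordinary one:

as.matrix(dtm)   # rows = documents, columns = terms, cells = tf-idf weights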