文章分群

library(readr)
applenews <- read_csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/applenews2.csv')

library(jiebaR)
mixseg <- worker()
str(applenews)

apple.seg <- lapply(applenews$content, function(e) segment(code = e, jiebar = mixseg))

#apple.seg[1]

library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm    <- DocumentTermMatrix(doc, control = list(WordLength =c(2,20)))
class(dtm)

library(proxy)

dtm.new  <- removeSparseTerms(dtm, 0.995)
dtm.new
dtm.dist <- proxy::dist(as.matrix(dtm.new), method = 'cosine')

fit      <- hclust(dtm.dist)
plot(fit, hang=-0.1)
clusters <- cutree(fit, 30)
applenews$title[clusters == 2]

Community Detection

#install.packages('igraph')
library(igraph)
m        <- as.matrix(dtm.dist)
m[1:3,1:3]

m2 <- ifelse(m < 0.4, 1, 0)
m2[1:3,1:3]

G <- graph_from_adjacency_matrix(m2)

wc <- cluster_walktrap(G)
modularity(wc)
table(membership(wc))

group <- membership(wc)
applenews$title[group ==4]
#plot(wc, G)

讀取對話資料

library(readr)
results <- read_csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/results2.csv')


messages <- sapply(strsplit(results$`message    Timestamp`, '\t'), function(e) e[1])

library(jiebaR)
messages
#segment('', jiebar = mixseg)
class(messages)
chat.seg <- lapply(messages, function(e) tryCatch({segment(code= e, jiebar = mixseg)},error= function(e){}) )
corpus <- Corpus(VectorSource(chat.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm    <- DocumentTermMatrix(doc)

library(proxy)
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine' )
fit      <- hclust(dtm.dist)
plot(fit, hang=-0.1)


m <- as.matrix(dtm.dist)
m[1:3,1:3]

m2 <- ifelse(m < 0.4, 1, 0)
m2[1:3, 1:3]

library(igraph)
G <- graph_from_adjacency_matrix(m2)

wc <- cluster_walktrap(G)
modularity(wc)
tb <- table(membership(wc))
sort(tb, decreasing = TRUE)
group <- membership(wc)

messages[group ==237]

根據相似問題自動應答

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.2
## Loading required package: jiebaRD
qa <- data.frame()
r1 <- data.frame(question = '怎麼查詢信用卡點數?', answer = '問櫃台', stringsAsFactors= FALSE)
r2 <- data.frame(question = '怎麼查詢貸款額度?', answer = '打1900', stringsAsFactors = FALSE)

qa <- rbind(qa, r1)
qa <- rbind(qa, r2)



mixseg <- worker()
q <- '我想知道我現在的信用卡點數?'
qa.seg <- lapply(c(q, qa$question), function(e) segment(e, jiebar=mixseg))

library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
corpus <- Corpus(VectorSource(qa.seg))
dtm    <- DocumentTermMatrix(corpus)

dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
m <- as.matrix(dtm.dist)
qa[order(m[1,])[1],]
##               question answer
## 1 怎麼查詢信用卡點數? 問櫃台

Community Detection (Get Power Term)


library(readr)
applenews <- read_csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/applenews2.csv')

library(jiebaR)
mixseg <- worker()

apple.seg <- lapply(applenews$content, function(e) segment(code = e, jiebar = mixseg))


library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm    <- DocumentTermMatrix(doc, control = list(WordLength =c(2,20)))


library(proxy)
dtm.new  <- removeSparseTerms(dtm, 0.995)
dtm.dist <- proxy::dist(as.matrix(dtm.new), method = 'cosine')


library(igraph)
m   <- as.matrix(dtm.dist)
m2  <- ifelse(m < 0.4, 1, 0)


G <- graph_from_adjacency_matrix(m2)

wc <- cluster_walktrap(G)

group <- membership(wc)
applenews$title[group ==1]
tb <- table(unlist(apple.seg[group ==1]))
words <- sort(tb[nchar(names(tb))>=2 & grepl('[\u4e00-\u9fa5]+',names(tb))], decreasing = TRUE)

library(wordcloud2)
wordcloud2(words)


getPowerTerm <- function(idx){
  
  tb <- table(unlist(apple.seg[group ==idx]))
  words <- sort(tb[nchar(names(tb))>=2 & grepl('[\u4e00-\u9fa5]+',names(tb))], decreasing = TRUE)
  print(names(words[1:20]))
  print('======================================')
  print(applenews$title[group ==idx])
}

getPowerTerm(7)

新聞分類

## Read Data
# https://github.com/ywchiu/fubonr/blob/master/data/applenews.RData
getwd()
## [1] "D:/OS DATA/Desktop"
load('applenews.RData')

str(applenews)
## 'data.frame':    1500 obs. of  5 variables:
##  $ content : chr  "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,"| __truncated__ ...
##  $ title   : chr  "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演  美防長南海登艦" ...
##  $ dt      : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
##  $ category: chr  "社會" "國際" "地產" "國際" ...
##  $ view_cnt: chr  "1754" "0" "0" "0" ...
## Data Pre-Processing
applenews$category <- as.factor(applenews$category)

table(applenews$category)
## 
##                       3C                     正妹                     生活 
##                       37                        8                      298 
##                     地產                     社會                     政治 
##                       32                      194                      143 
##                     娛樂                     時尚                     財經 
##                      113                       38                      121 
##                     動物                     國際 國際","LA","SF","NY","US 
##                       26                      273                        9 
##           國際","SF","US                     搜奇                     論壇 
##                        3                       55                       55 
##                     體育 
##                       95
apple.subset <- applenews[applenews$category %in% c('財經', '娛樂'),]
apple.subset$category <- factor(apple.subset$category)

library(jiebaR)
mixseg <- worker()

apple.seg <- lapply(apple.subset$content, 
       function(e) segment(e, jiebar = mixseg))

#apple.seg[1]

library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm    <- DocumentTermMatrix(doc)
dim(dtm)
## [1]  234 9784
ft     <- findFreqTerms(dtm, 5, )
dtm.new<- DocumentTermMatrix(doc, 
            control = list(
              dictionary  = ft, 
              wordLengths = c(2,20)))
dim(dtm.new)
## [1]  234 1441
m <- as.matrix(dtm.new)
m[1:3,1:10]
##     Terms
## Docs dishtv sony max love led apple car motor trend app
##    1      0    0   0    0   0     0   0     0     0   0
##    2     10    0   0    0   0     0   0     0     0   0
##    3      0    0   0    0   0     0   0     0     0   0
?DocumentTermMatrix
## starting httpd help server ... done
convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
  return(x)
}

?apply
m.count <- apply(dtm.new, MARGIN= 2, convert_counts)

m.count[,1]
##     1     2     3     4     5     6     7     8     9    10    11    12 
##  "No" "Yes"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    13    14    15    16    17    18    19    20    21    22    23    24 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    25    26    27    28    29    30    31    32    33    34    35    36 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    37    38    39    40    41    42    43    44    45    46    47    48 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    49    50    51    52    53    54    55    56    57    58    59    60 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    61    62    63    64    65    66    67    68    69    70    71    72 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    73    74    75    76    77    78    79    80    81    82    83    84 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    85    86    87    88    89    90    91    92    93    94    95    96 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##    97    98    99   100   101   102   103   104   105   106   107   108 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   109   110   111   112   113   114   115   116   117   118   119   120 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   121   122   123   124   125   126   127   128   129   130   131   132 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   133   134   135   136   137   138   139   140   141   142   143   144 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   145   146   147   148   149   150   151   152   153   154   155   156 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   157   158   159   160   161   162   163   164   165   166   167   168 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   169   170   171   172   173   174   175   176   177   178   179   180 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   181   182   183   184   185   186   187   188   189   190   191   192 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   193   194   195   196   197   198   199   200   201   202   203   204 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   205   206   207   208   209   210   211   212   213   214   215   216 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   217   218   219   220   221   222   223   224   225   226   227   228 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##   229   230   231   232   233   234 
##  "No"  "No"  "No"  "No"  "No"  "No"
m.count[1:3, 1:10]
##     Terms
## Docs dishtv sony max  love led  apple car  motor trend app 
##    1 "No"   "No" "No" "No" "No" "No"  "No" "No"  "No"  "No"
##    2 "Yes"  "No" "No" "No" "No" "No"  "No" "No"  "No"  "No"
##    3 "No"   "No" "No" "No" "No" "No"  "No" "No"  "No"  "No"
set.seed(123)
sample.int(42, 6)
## [1] 13 33 17 35 36  2
## Split the data into trainning set and testing set
set.seed(123)
sample.int(2, 100, replace=TRUE, prob = c(0.7, 0.3))
##   [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1
##  [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1
##  [71] 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1
dim(m.count)
## [1]  234 1441
set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
##   1   2 
## 171  63
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- apple.subset[idx==1,"category"]
dim(trainset)
## [1]  171 1441
length(traintag)
## [1] 171
testset  <- m[idx == 2, ]
testtag  <- apple.subset[idx==2,"category"]
dim(testset)
## [1]   63 1441
length(testtag)
## [1] 63

Naive Bayes分類

#install.packages('e1071')
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
model <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)
sum(testtag == predicted) / length(testtag)
## [1] 0.6825397
# Do not filter words
dtm.new<- DocumentTermMatrix(doc, 
            control = list(
              wordLengths = c(2,20)))

convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
  return(x)
}

m.count <- apply(dtm.new, MARGIN= 2, convert_counts)

## Split the data into trainning set and testing set
set.seed(123)
sample.int(2, 100, replace=TRUE, prob = c(0.7, 0.3))
##   [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1
##  [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1
##  [71] 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1
dim(m.count)
## [1]   234 11206
set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.7,0.3))

m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- apple.subset[idx==1,"category"]

testset  <- m[idx == 2, ]
testtag  <- apple.subset[idx==2,"category"]

model <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)
sum(testtag == predicted) / length(testtag)
## [1] 0.968254
table(testtag, predicted)
##        predicted
## testtag 娛樂 財經
##    娛樂   28    1
##    財經    1   33
testtag!=predicted
##  [1] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
apple.subset[idx==2, ][testtag!=predicted,c('title', 'content', 'category')]
##                                           title
## 21   【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA
## 1471           大倒角握感如何? 網友試機HTC 10
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               content
## 21                                                                                                                                                                                                                                                                                                                                                                                                                            肯亞警方強押45名台灣人遣送中國,輿論譁然。「星光幫」冠軍賴銘偉有感,改陳奕迅《你的背包》為《你的台胞》,要唱給話術不輸詐騙集團的中國人民共和國與中華民國政府。(娛樂中心/綜合報導)<U+00A0> 肯亞被判無罪釋放的台籍嫌犯(綠衣者)竟被押送到中國。新華社
## 1471 宏達電新機HTC 10已經能在專賣店試機了,網路熱議HTC 10的體驗,Mobile 01網友甚至列出5大優缺點,粉絲都說詳細又實用。<U+00A0>最棒的優點是HTC 10手感超好,樓主說不必擔心切邊很大的倒角會割手,握在手上的穩定感,有旗艦機的水準,SIM卡槽、電源鍵做工也細緻不馬虎。<U+00A0>另外包括外放音質、螢幕等品質都很好,還有新的Sense介面。<U+00A0>不過,樓主說事情都是有好有壞,缺點還是要說一下,相機bug多,在半小時的試用中,出現多次「手遮擋到雷射對焦」的提醒訊息,拿衛生紙擦過對焦處也一樣,感到困擾。<U+00A0>室內的白平衡也不準,喇叭開到最大聲、高音時有明顯雜音,「是工程機的問題?」儘管如此,樓主還是喊話HTC你這次賺得到我的錢,因為「手感真的愛不釋手」。(馬婉珍/綜合報導)<U+00A0>
##      category
## 21       娛樂
## 1471     財經

新聞分類總整理

## Loading Data
load('applenews.RData')

## Data Pre-Processing
apple.subset <- applenews[
  applenews$category %in% c('財經', '娛樂', '社會'),]
apple.subset$category <- factor(apple.subset$category)

## Word Segmentation
library(jiebaR)
mixseg <- worker()

apple.seg <- lapply(apple.subset$content, 
       function(e) segment(e, jiebar = mixseg))

## Turning Data Into DocumentTermMatrix
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm.new<- DocumentTermMatrix(doc, 
            control = list(
              wordLengths = c(2,20)))

## Split Data into Training and Testing Dataset
convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
  return(x)
}

m.count <- apply(dtm.new, MARGIN= 2, convert_counts)

set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.7,0.3))
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- apple.subset[idx==1,"category"]


testset  <- m[idx == 2, ]
testtag  <- apple.subset[idx==2,"category"]

## Building Model
library(e1071)
model     <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)

## Validate Result
sum(testtag == predicted) / length(testtag)

table(testtag, predicted)


apple.subset[idx==2, ][testtag!=predicted,c('title', 'content', 'category')]

自動判斷正負評

library(rvest)

dfall <- data.frame()

for(page in 1:27){
  reviews <- read_html(paste0('https://movies.yahoo.com.tw/movieinfo_review.html/id=7064?sort=update_ts&order=desc&page=', page)) %>% html_nodes('.form_good') 
  
  
  for(ele in reviews){
    comments <- ele %>% html_nodes('span') %>% .[3] %>% html_text()
    score <- ele %>% html_node('input[name="score"]') %>% html_attr('value')
    dfall <- rbind(dfall, data.frame(comments, score, stringsAsFactors = FALSE))
  }

}
write.csv(x = dfall, file = 'movie_reviews.csv')

dfall <- read.csv('movie_reviews.csv', stringsAsFactors = FALSE)
head(dfall)

movies <- dfall[dfall$score %in% c(1,5), ]

movies$score <- factor(movies$score, levels = c(1,5), labels = c('bad', 'good'))


## Word Segmentation
library(jiebaR)
mixseg <- worker()

movie.seg <- lapply(movies$comments, 
       function(e) segment(e, jiebar = mixseg))

## Turning Data Into DocumentTermMatrix
library(tm)
corpus <- Corpus(VectorSource(movie.seg))
doc    <- tm_map(corpus, removeNumbers)
dtm.new<- DocumentTermMatrix(doc, 
            control = list(
              wordLengths = c(2,20)))

## Split Data into Training and Testing Dataset
convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
  return(x)
}

m.count <- apply(dtm.new, MARGIN= 2, convert_counts)
m.count[1:3,1:10]

set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.5,0.5))
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- movies[idx==1,"score"]


testset  <- m[idx == 2, ]
testtag  <- movies[idx==2,"score"]

## Building Model
library(e1071)
model     <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)

## Validate Result
sum(testtag == predicted) / length(testtag)

table(testtag, predicted)
#testtag