文章分群
library(readr)
applenews <- read_csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/applenews2.csv')
library(jiebaR)
mixseg <- worker()
str(applenews)
apple.seg <- lapply(applenews$content, function(e) segment(code = e, jiebar = mixseg))
#apple.seg[1]
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc <- tm_map(corpus, removeNumbers)
dtm <- DocumentTermMatrix(doc, control = list(WordLength =c(2,20)))
class(dtm)
library(proxy)
dtm.new <- removeSparseTerms(dtm, 0.995)
dtm.new
dtm.dist <- proxy::dist(as.matrix(dtm.new), method = 'cosine')
fit <- hclust(dtm.dist)
plot(fit, hang=-0.1)
clusters <- cutree(fit, 30)
applenews$title[clusters == 2]
讀取對話資料
library(readr)
results <- read_csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/results2.csv')
messages <- sapply(strsplit(results$`message Timestamp`, '\t'), function(e) e[1])
library(jiebaR)
messages
#segment('', jiebar = mixseg)
class(messages)
chat.seg <- lapply(messages, function(e) tryCatch({segment(code= e, jiebar = mixseg)},error= function(e){}) )
corpus <- Corpus(VectorSource(chat.seg))
doc <- tm_map(corpus, removeNumbers)
dtm <- DocumentTermMatrix(doc)
library(proxy)
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine' )
fit <- hclust(dtm.dist)
plot(fit, hang=-0.1)
m <- as.matrix(dtm.dist)
m[1:3,1:3]
m2 <- ifelse(m < 0.4, 1, 0)
m2[1:3, 1:3]
library(igraph)
G <- graph_from_adjacency_matrix(m2)
wc <- cluster_walktrap(G)
modularity(wc)
tb <- table(membership(wc))
sort(tb, decreasing = TRUE)
group <- membership(wc)
messages[group ==237]
根據相似問題自動應答
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.2
## Loading required package: jiebaRD
qa <- data.frame()
r1 <- data.frame(question = '怎麼查詢信用卡點數?', answer = '問櫃台', stringsAsFactors= FALSE)
r2 <- data.frame(question = '怎麼查詢貸款額度?', answer = '打1900', stringsAsFactors = FALSE)
qa <- rbind(qa, r1)
qa <- rbind(qa, r2)
mixseg <- worker()
q <- '我想知道我現在的信用卡點數?'
qa.seg <- lapply(c(q, qa$question), function(e) segment(e, jiebar=mixseg))
library(tm)
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
corpus <- Corpus(VectorSource(qa.seg))
dtm <- DocumentTermMatrix(corpus)
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
m <- as.matrix(dtm.dist)
qa[order(m[1,])[1],]
## question answer
## 1 怎麼查詢信用卡點數? 問櫃台
新聞分類
## Read Data
# https://github.com/ywchiu/fubonr/blob/master/data/applenews.RData
getwd()
## [1] "D:/OS DATA/Desktop"
load('applenews.RData')
str(applenews)
## 'data.frame': 1500 obs. of 5 variables:
## $ content : chr "(更新:新增影片)想要透過刮刮樂彩券一夕致富,但他卻用錯方法!台中市一名黃姓男子覬覦頭獎高達2600萬的「開門見喜」刮"| __truncated__ "澳洲一名就讀雪梨大學的華裔博士生,日前公開一段燒毀中國護照的影片,還大肆批評留澳學生是一群「留學豬」。消息傳出"| __truncated__ "【行銷專題企劃】房價高高在上,沒錢買房沒關係,但你認為自己是聰明的租屋族嗎? 由蘋果地產與FBS TV合作的全新節目-房"| __truncated__ "本內容由中央廣播電臺提供<U+00A0><U+00A0> <U+00A0> <U+00A0> <U+00A0>美國國防部長卡特(Ash Carter)今天(15日)表示,"| __truncated__ ...
## $ title : chr "【更新】搶2.2萬彩券刮中1.4萬 沒發財還得入獄" "拿到澳洲護照後 他放火燒中國護照" "【特企】房市大追擊- 租屋這些事情要小心" "【央廣RTI】美菲軍演 美防長南海登艦" ...
## $ dt : POSIXct, format: "2016-04-15 14:32:00" "2016-04-15 14:32:00" ...
## $ category: chr "社會" "國際" "地產" "國際" ...
## $ view_cnt: chr "1754" "0" "0" "0" ...
## Data Pre-Processing
applenews$category <- as.factor(applenews$category)
table(applenews$category)
##
## 3C 正妹 生活
## 37 8 298
## 地產 社會 政治
## 32 194 143
## 娛樂 時尚 財經
## 113 38 121
## 動物 國際 國際","LA","SF","NY","US
## 26 273 9
## 國際","SF","US 搜奇 論壇
## 3 55 55
## 體育
## 95
apple.subset <- applenews[applenews$category %in% c('財經', '娛樂'),]
apple.subset$category <- factor(apple.subset$category)
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content,
function(e) segment(e, jiebar = mixseg))
#apple.seg[1]
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc <- tm_map(corpus, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dim(dtm)
## [1] 234 9784
ft <- findFreqTerms(dtm, 5, )
dtm.new<- DocumentTermMatrix(doc,
control = list(
dictionary = ft,
wordLengths = c(2,20)))
dim(dtm.new)
## [1] 234 1441
m <- as.matrix(dtm.new)
m[1:3,1:10]
## Terms
## Docs dishtv sony max love led apple car motor trend app
## 1 0 0 0 0 0 0 0 0 0 0
## 2 10 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0
?DocumentTermMatrix
## starting httpd help server ... done
convert_counts <- function(x){
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
return(x)
}
?apply
m.count <- apply(dtm.new, MARGIN= 2, convert_counts)
m.count[,1]
## 1 2 3 4 5 6 7 8 9 10 11 12
## "No" "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 13 14 15 16 17 18 19 20 21 22 23 24
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 25 26 27 28 29 30 31 32 33 34 35 36
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 37 38 39 40 41 42 43 44 45 46 47 48
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 49 50 51 52 53 54 55 56 57 58 59 60
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 61 62 63 64 65 66 67 68 69 70 71 72
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 73 74 75 76 77 78 79 80 81 82 83 84
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 85 86 87 88 89 90 91 92 93 94 95 96
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 97 98 99 100 101 102 103 104 105 106 107 108
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 109 110 111 112 113 114 115 116 117 118 119 120
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 121 122 123 124 125 126 127 128 129 130 131 132
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 133 134 135 136 137 138 139 140 141 142 143 144
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 145 146 147 148 149 150 151 152 153 154 155 156
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 157 158 159 160 161 162 163 164 165 166 167 168
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 169 170 171 172 173 174 175 176 177 178 179 180
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 181 182 183 184 185 186 187 188 189 190 191 192
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 193 194 195 196 197 198 199 200 201 202 203 204
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 205 206 207 208 209 210 211 212 213 214 215 216
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 217 218 219 220 221 222 223 224 225 226 227 228
## "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 229 230 231 232 233 234
## "No" "No" "No" "No" "No" "No"
m.count[1:3, 1:10]
## Terms
## Docs dishtv sony max love led apple car motor trend app
## 1 "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 2 "Yes" "No" "No" "No" "No" "No" "No" "No" "No" "No"
## 3 "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
set.seed(123)
sample.int(42, 6)
## [1] 13 33 17 35 36 2
## Split the data into trainning set and testing set
set.seed(123)
sample.int(2, 100, replace=TRUE, prob = c(0.7, 0.3))
## [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1
## [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1
## [71] 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1
dim(m.count)
## [1] 234 1441
set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
## 1 2
## 171 63
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- apple.subset[idx==1,"category"]
dim(trainset)
## [1] 171 1441
length(traintag)
## [1] 171
testset <- m[idx == 2, ]
testtag <- apple.subset[idx==2,"category"]
dim(testset)
## [1] 63 1441
length(testtag)
## [1] 63
Naive Bayes分類
#install.packages('e1071')
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
model <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)
sum(testtag == predicted) / length(testtag)
## [1] 0.6825397
# Do not filter words
dtm.new<- DocumentTermMatrix(doc,
control = list(
wordLengths = c(2,20)))
convert_counts <- function(x){
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
return(x)
}
m.count <- apply(dtm.new, MARGIN= 2, convert_counts)
## Split the data into trainning set and testing set
set.seed(123)
sample.int(2, 100, replace=TRUE, prob = c(0.7, 0.3))
## [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1
## [36] 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1
## [71] 2 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1
dim(m.count)
## [1] 234 11206
set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.7,0.3))
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- apple.subset[idx==1,"category"]
testset <- m[idx == 2, ]
testtag <- apple.subset[idx==2,"category"]
model <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)
sum(testtag == predicted) / length(testtag)
## [1] 0.968254
table(testtag, predicted)
## predicted
## testtag 娛樂 財經
## 娛樂 28 1
## 財經 1 33
testtag!=predicted
## [1] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
apple.subset[idx==2, ][testtag!=predicted,c('title', 'content', 'category')]
## title
## 21 【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA
## 1471 大倒角握感如何? 網友試機HTC 10
## content
## 21 肯亞警方強押45名台灣人遣送中國,輿論譁然。「星光幫」冠軍賴銘偉有感,改陳奕迅《你的背包》為《你的台胞》,要唱給話術不輸詐騙集團的中國人民共和國與中華民國政府。(娛樂中心/綜合報導)<U+00A0> 肯亞被判無罪釋放的台籍嫌犯(綠衣者)竟被押送到中國。新華社
## 1471 宏達電新機HTC 10已經能在專賣店試機了,網路熱議HTC 10的體驗,Mobile 01網友甚至列出5大優缺點,粉絲都說詳細又實用。<U+00A0>最棒的優點是HTC 10手感超好,樓主說不必擔心切邊很大的倒角會割手,握在手上的穩定感,有旗艦機的水準,SIM卡槽、電源鍵做工也細緻不馬虎。<U+00A0>另外包括外放音質、螢幕等品質都很好,還有新的Sense介面。<U+00A0>不過,樓主說事情都是有好有壞,缺點還是要說一下,相機bug多,在半小時的試用中,出現多次「手遮擋到雷射對焦」的提醒訊息,拿衛生紙擦過對焦處也一樣,感到困擾。<U+00A0>室內的白平衡也不準,喇叭開到最大聲、高音時有明顯雜音,「是工程機的問題?」儘管如此,樓主還是喊話HTC你這次賺得到我的錢,因為「手感真的愛不釋手」。(馬婉珍/綜合報導)<U+00A0>
## category
## 21 娛樂
## 1471 財經
新聞分類總整理
## Loading Data
load('applenews.RData')
## Data Pre-Processing
apple.subset <- applenews[
applenews$category %in% c('財經', '娛樂', '社會'),]
apple.subset$category <- factor(apple.subset$category)
## Word Segmentation
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content,
function(e) segment(e, jiebar = mixseg))
## Turning Data Into DocumentTermMatrix
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
doc <- tm_map(corpus, removeNumbers)
dtm.new<- DocumentTermMatrix(doc,
control = list(
wordLengths = c(2,20)))
## Split Data into Training and Testing Dataset
convert_counts <- function(x){
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
return(x)
}
m.count <- apply(dtm.new, MARGIN= 2, convert_counts)
set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.7,0.3))
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- apple.subset[idx==1,"category"]
testset <- m[idx == 2, ]
testtag <- apple.subset[idx==2,"category"]
## Building Model
library(e1071)
model <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)
## Validate Result
sum(testtag == predicted) / length(testtag)
table(testtag, predicted)
apple.subset[idx==2, ][testtag!=predicted,c('title', 'content', 'category')]
自動判斷正負評
library(rvest)
dfall <- data.frame()
for(page in 1:27){
reviews <- read_html(paste0('https://movies.yahoo.com.tw/movieinfo_review.html/id=7064?sort=update_ts&order=desc&page=', page)) %>% html_nodes('.form_good')
for(ele in reviews){
comments <- ele %>% html_nodes('span') %>% .[3] %>% html_text()
score <- ele %>% html_node('input[name="score"]') %>% html_attr('value')
dfall <- rbind(dfall, data.frame(comments, score, stringsAsFactors = FALSE))
}
}
write.csv(x = dfall, file = 'movie_reviews.csv')
dfall <- read.csv('movie_reviews.csv', stringsAsFactors = FALSE)
head(dfall)
movies <- dfall[dfall$score %in% c(1,5), ]
movies$score <- factor(movies$score, levels = c(1,5), labels = c('bad', 'good'))
## Word Segmentation
library(jiebaR)
mixseg <- worker()
movie.seg <- lapply(movies$comments,
function(e) segment(e, jiebar = mixseg))
## Turning Data Into DocumentTermMatrix
library(tm)
corpus <- Corpus(VectorSource(movie.seg))
doc <- tm_map(corpus, removeNumbers)
dtm.new<- DocumentTermMatrix(doc,
control = list(
wordLengths = c(2,20)))
## Split Data into Training and Testing Dataset
convert_counts <- function(x){
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
return(x)
}
m.count <- apply(dtm.new, MARGIN= 2, convert_counts)
m.count[1:3,1:10]
set.seed(123)
idx <- sample.int(2, nrow(m.count), replace=TRUE, prob=c(0.5,0.5))
m <- as.data.frame(m.count)
trainset <- m[idx == 1, ]
traintag <- movies[idx==1,"score"]
testset <- m[idx == 2, ]
testtag <- movies[idx==2,"score"]
## Building Model
library(e1071)
model <- naiveBayes(trainset, traintag)
predicted <- predict(model, testset)
## Validate Result
sum(testtag == predicted) / length(testtag)
table(testtag, predicted)
#testtag