中文詞頻矩陣
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.3
## Loading required package: jiebaRD
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
mixseg <- worker()
s <- '大巨蛋案對市府同仁下封口令? 柯P否認'
s1 <- '柯P市府近來飽受大巨蛋案爭議'
s.vec <- segment(code = s, jiebar = mixseg)
s1.vec <- segment(code = s1, jiebar = mixseg)
s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
s.dtm <- DocumentTermMatrix(s.corpus)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 8)>>
## Non-/sparse entries: 10/6
## Sparsity : 38%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 大巨蛋 市府 同仁 爭議 近來 封口令 案對 飽受
## 1 1 1 1 0 0 1 1 0
## 2 1 1 0 1 1 0 0 1
抓取問答資料
library(rvest)
## Warning: package 'rvest' was built under R version 3.4.2
## Loading required package: xml2
dfall <- data.frame()
for (p in seq(1,3)){
page <- read_html(paste0('http://www.tw-insurance.info/faq.cfm?faq=3&p=',p))
questions <- page %>%
html_nodes('table[summary="Article list"] h3.title') %>%
html_text() %>% sub('♥','', .)
answers <- page %>%
html_nodes('table[summary="Article list"] tr[id] td') %>% html_text()
df <- data.frame(questions, answers, stringsAsFactors = FALSE)
dfall <- rbind(dfall, df)
}
#dfall
library(jiebaR)
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
qa.seg <- lapply(dfall$answers, function(e) segment(e, mixseg))
corpus <- Corpus(VectorSource(qa.seg))
dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 25, terms: 491)>>
## Non-/sparse entries: 628/11647
## Sparsity : 95%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 投資 係指 保戶 保單 保險 保險公司 契約 就是 壽險 機能
## 10 0 0 0 3 3 0 1 1 0 0
## 11 0 0 0 0 1 0 0 0 3 0
## 13 2 0 1 6 0 0 0 1 2 0
## 14 0 0 1 5 0 0 0 0 0 0
## 19 0 0 0 0 2 0 0 0 2 0
## 21 10 0 4 5 2 3 0 1 2 0
## 24 0 0 0 1 0 1 0 0 0 0
## 25 0 0 2 1 0 2 0 0 0 0
## 6 0 0 0 0 4 0 3 0 0 0
## 7 0 11 0 0 0 0 0 0 0 12
findFreqTerms(dtm,10)
## [1] "保單" "保險" "係指" "機能" "壽險" "投資"
## [7] "保戶" "保險公司"
findAssocs(dtm , '保單' , 0.5)
## $保單
## 這樣 保戶 公司 投資 委託 代操 全委 實質 操<a7> 雙重
## 0.68 0.63 0.61 0.54 0.54 0.52 0.52 0.52 0.52 0.52
## 全權 利率
## 0.52 0.52
findAssocs(dtm , '保險' , 0.5)
## $保險
## 契約 價值 標的 載明 不定 之訂 方式 可供 可知 市場
## 0.75 0.68 0.68 0.67 0.56 0.56 0.56 0.56 0.56 0.56
## 估計 客<c6> 須有 須至
## 0.56 0.56 0.56 0.56
findAssocs(dtm , '契約' , 0.5)
## $契約
## 載明 價值 標的 保險 定值 不定 之訂 方式 可供 可知
## 0.95 0.89 0.89 0.75 0.65 0.65 0.65 0.65 0.65 0.65
## 市場 估計 客<c6> 須有 須至
## 0.65 0.65 0.65 0.65 0.65
抓取Facebook 資料來做分析
library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'
fbposts <- fromJSON(paste0('https://graph.facebook.com/v2.12/745262895539259/posts?limit=100&since=2016-01-01&access_token=', access_token))
n <- fbposts$paging$`next`
n
fbposts2 <- fromJSON(n)
! is.null(fbposts2$paging$`next`)
library(jiebaR)
mixseg <- worker()
seg <- lapply(fbposts$data$message, function(m) segment(code = m, jiebar = mixseg))
library(wordcloud2)
tb <- table(unlist(seg))
tb <- tb[tb >= 2 & nchar(names(tb)) >= 2]
wordcloud2(tb)
! is.null(fbposts$paging$`next`)
library(jiebaR)
analyzeFBPosts <- function(pageid){
dfall <- data.frame()
df <- fromJSON(paste0('https://graph.facebook.com/v2.12/', pageid,'/posts?limit=100&since=2016-01-01&access_token=', access_token))
dfall <- rbind(dfall, df$data)
while(! is.null(df$paging$`next`)){
df <- fromJSON(df$paging$`next`)
dfall <- rbind(dfall, df$data)
}
mixseg <- worker()
seg <- lapply(dfall$message, function(m) segment(code = m, jiebar = mixseg))
tb <- table(unlist(seg))
tb <- tb[tb >= 5 & nchar(names(tb)) >= 2 & grepl('[\u4e00-\u9fa5]+', names(tb))]
wordcloud2(tb, shape= 'star')
}
grepl('[\u4e00-\u9fa5]+', '中文')
grepl('[\u4e00-\u9fa5]+', 'google')
analyzeFBPosts('745262895539259')
library(tm)
library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'
fblikes <- fromJSON(paste0('https://graph.facebook.com/v2.12/157654257691971_662594383864620?fields=reactions.summary(True),comments.summary(True)&access_token=', access_token))
fblikes
分析FB 粉絲頁(淺談保險觀念)
library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'
pageid <- 'Forinsurance.tw'
dfall <- data.frame()
df <- fromJSON(paste0('https://graph.facebook.com/v2.12/', pageid,'/posts?limit=100&since=2016-01-01&access_token=', access_token))
dfall <- rbind(dfall, df$data)
while(! is.null(df$paging$`next`)){
df <- fromJSON(df$paging$`next`)
dfall <- rbind(dfall, df$data)
}
library(jiebaR)
mixseg <- worker()
qa.seg <- lapply(dfall$message, function(e) segment(e, mixseg))
library(tm)
corpus <- Corpus(VectorSource(qa.seg))
dtm <- DocumentTermMatrix(corpus)
inspect(dtm)
#dtm$dimnames$Terms
findAssocs(dtm,'保險', 0.4)
計算距離
x <- c(0,0,1,1,1,1)
y <- c(1,0,1,1,0,1)
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
## x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
## x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
## x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
## x
## y 2
文章相似度分析
library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'
pageid <- 'Forinsurance.tw'
dfall <- data.frame()
df <- fromJSON(paste0('https://graph.facebook.com/v2.12/', pageid,'/posts?limit=100&since=2016-01-01&access_token=', access_token))
dfall <- rbind(dfall, df$data)
while(! is.null(df$paging$`next`)){
df <- fromJSON(df$paging$`next`)
dfall <- rbind(dfall, df$data)
}
dfall
library(jiebaR)
mixseg <- worker()
dfall <- dfall[! grepl('懶人', dfall$message), ]
dfall <- dfall[! grepl('回顧', dfall$message), ]
dfall <- dfall[! grepl('十大', dfall$message), ]
seg <- lapply(dfall$message, function(e) segment(e, jiebar = mixseg) )
class(seg)
corpus <- Corpus(VectorSource(seg))
dtm <- DocumentTermMatrix(corpus)
dtm
# 柯文哲 大巨蛋 趙藤雄 遠雄 檢察官 ..... ...
#a 1 1 1 1 1 1 ....
#b 1 1 1 0 0 0
#install.packages('proxy')
library(proxy)
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)
dtm.mat
dfall$message[as.integer(names(sort(dtm.mat[5, which(dtm.mat[5,] < 0.8)])))]
article.query <- function(idx){
dfall$message[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}
article.query(7)[1:10]
新聞推薦引擎
library(jiebaR)
mixseg <- worker()
seg <- lapply(applenews$content, function(e) segment(e, jiebar = mixseg))
library(tm)
corpus <- Corpus(VectorSource(seg))
dtm <- DocumentTermMatrix(corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)
article.query <- function(idx){
applenews$title[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}
article.query(6)[1:10]
文章分群
dim(dtm.dist)
hc <- hclust(dtm.dist)
plot(hc, hang=-0.1)
fit <- cutree(hc, k = 20)
applenews$title[fit == 2]
新聞分類
apple <- applenews[applenews$category %in% c('財經', '娛樂'),]
apple$category <- factor(apple$category)
library(jiebaR)
mixseg<- worker()
apple.seg <- lapply(apple$content, function(e) segment(code = e, jiebar = mixseg))
corpus <- Corpus(VectorSource(apple.seg))
library(tm)
doc <- tm_map(corpus, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dim(dtm)
dtm.new<- DocumentTermMatrix(doc,
control = list(
wordLengths = c(2,20)))
dim(dtm.new)
m <- as.matrix(dtm.new)
m[1:3,1:3]
convert_counts <- function(x){
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
return(x)
}
m.count <- apply(dtm.new, MARGIN= 2, convert_counts)
m.count[1:3,1:3]
m <- as.data.frame(m.count)
nrow(m)
set.seed(123)
idx <- sample.int(2,nrow(m), replace=TRUE, prob = c(0.7,0.3))
trainset <- m[idx == 1,]
testset <- m[idx == 2,]
traintag <- apple[idx == 1, 'category']
testtag <- apple[idx == 2, 'category']
dim(trainset)
dim(testset)
length(traintag)
length(testtag)
#install.packages('e1071')
library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)
table(pred, testtag )
(28 + 33) / (28+33+1+1)
save(model, file= 'classfication.RData')
load('classfication.RData')
pred2 <- predict(model, testset)