中文詞頻矩陣

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.4.3
## Loading required package: jiebaRD
library(tm)
## Warning: package 'tm' was built under R version 3.4.3
## Loading required package: NLP
mixseg <- worker()
s <- '大巨蛋案對市府同仁下封口令? 柯P否認'
s1 <- '柯P市府近來飽受大巨蛋案爭議'

s.vec    <- segment(code = s, jiebar = mixseg)
s1.vec   <- segment(code = s1, jiebar = mixseg)
s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
s.dtm    <- DocumentTermMatrix(s.corpus)
inspect(s.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 8)>>
## Non-/sparse entries: 10/6
## Sparsity           : 38%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 大巨蛋 市府 同仁 爭議 近來 封口令 案對 飽受
##    1      1    1    1    0    0      1    1    0
##    2      1    1    0    1    1      0    0    1

抓取問答資料

library(rvest)
## Warning: package 'rvest' was built under R version 3.4.2
## Loading required package: xml2
dfall <- data.frame()
for (p in seq(1,3)){
page <- read_html(paste0('http://www.tw-insurance.info/faq.cfm?faq=3&p=',p)) 

questions <- page %>%
  html_nodes('table[summary="Article list"] h3.title') %>% 
  html_text() %>% sub('♥','', .)

answers   <- page %>%
  html_nodes('table[summary="Article list"] tr[id] td') %>% html_text()

df <- data.frame(questions, answers, stringsAsFactors = FALSE)
dfall <- rbind(dfall, df)
}
#dfall


library(jiebaR)
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
qa.seg <- lapply(dfall$answers, function(e) segment(e, mixseg))
corpus <- Corpus(VectorSource(qa.seg))
dtm    <- DocumentTermMatrix(corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 25, terms: 491)>>
## Non-/sparse entries: 628/11647
## Sparsity           : 95%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 投資 係指 保戶 保單 保險 保險公司 契約 就是 壽險 機能
##   10    0    0    0    3    3        0    1    1    0    0
##   11    0    0    0    0    1        0    0    0    3    0
##   13    2    0    1    6    0        0    0    1    2    0
##   14    0    0    1    5    0        0    0    0    0    0
##   19    0    0    0    0    2        0    0    0    2    0
##   21   10    0    4    5    2        3    0    1    2    0
##   24    0    0    0    1    0        1    0    0    0    0
##   25    0    0    2    1    0        2    0    0    0    0
##   6     0    0    0    0    4        0    3    0    0    0
##   7     0   11    0    0    0        0    0    0    0   12
findFreqTerms(dtm,10)
## [1] "保單"     "保險"     "係指"     "機能"     "壽險"     "投資"    
## [7] "保戶"     "保險公司"
findAssocs(dtm , '保單' , 0.5)
## $保單
##   這樣   保戶   公司   投資   委託   代操   全委   實質 操<a7>   雙重 
##   0.68   0.63   0.61   0.54   0.54   0.52   0.52   0.52   0.52   0.52 
##   全權   利率 
##   0.52   0.52
findAssocs(dtm , '保險' , 0.5)
## $保險
##   契約   價值   標的   載明   不定   之訂   方式   可供   可知   市場 
##   0.75   0.68   0.68   0.67   0.56   0.56   0.56   0.56   0.56   0.56 
##   估計 客<c6>   須有   須至 
##   0.56   0.56   0.56   0.56
findAssocs(dtm , '契約' , 0.5)
## $契約
##   載明   價值   標的   保險   定值   不定   之訂   方式   可供   可知 
##   0.95   0.89   0.89   0.75   0.65   0.65   0.65   0.65   0.65   0.65 
##   市場   估計 客<c6>   須有   須至 
##   0.65   0.65   0.65   0.65   0.65

抓取Facebook 資料來做分析

library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'

fbposts <- fromJSON(paste0('https://graph.facebook.com/v2.12/745262895539259/posts?limit=100&since=2016-01-01&access_token=', access_token))

n <- fbposts$paging$`next`
n
fbposts2 <- fromJSON(n)
! is.null(fbposts2$paging$`next`)

library(jiebaR)
mixseg <- worker()
seg <- lapply(fbposts$data$message, function(m) segment(code = m, jiebar = mixseg))

library(wordcloud2)
tb <- table(unlist(seg))
tb <- tb[tb >= 2 & nchar(names(tb)) >= 2]
wordcloud2(tb)

! is.null(fbposts$paging$`next`)

library(jiebaR)

analyzeFBPosts <- function(pageid){
  dfall <- data.frame()
  df    <- fromJSON(paste0('https://graph.facebook.com/v2.12/', pageid,'/posts?limit=100&since=2016-01-01&access_token=', access_token))
  dfall <- rbind(dfall, df$data)
  while(! is.null(df$paging$`next`)){
    df <- fromJSON(df$paging$`next`)
    dfall <- rbind(dfall, df$data)
  }
  
  mixseg <- worker()
  seg <- lapply(dfall$message, function(m) segment(code = m, jiebar = mixseg))
  
  tb <- table(unlist(seg))
  tb <- tb[tb >= 5 & nchar(names(tb)) >= 2 & grepl('[\u4e00-\u9fa5]+', names(tb))]
  wordcloud2(tb, shape= 'star')  
}


grepl('[\u4e00-\u9fa5]+', '中文')
grepl('[\u4e00-\u9fa5]+', 'google')

analyzeFBPosts('745262895539259')





library(tm)

library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'

fblikes <- fromJSON(paste0('https://graph.facebook.com/v2.12/157654257691971_662594383864620?fields=reactions.summary(True),comments.summary(True)&access_token=', access_token))
fblikes

分析FB 粉絲頁(淺談保險觀念)

library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'
pageid <- 'Forinsurance.tw'

dfall <- data.frame()
  df    <- fromJSON(paste0('https://graph.facebook.com/v2.12/', pageid,'/posts?limit=100&since=2016-01-01&access_token=', access_token))
  dfall <- rbind(dfall, df$data)
  while(! is.null(df$paging$`next`)){
    df <- fromJSON(df$paging$`next`)
    dfall <- rbind(dfall, df$data)
  }



library(jiebaR)
mixseg <- worker()
qa.seg <- lapply(dfall$message, function(e) segment(e, mixseg))

library(tm)
corpus <- Corpus(VectorSource(qa.seg))
dtm    <- DocumentTermMatrix(corpus)
inspect(dtm)

#dtm$dimnames$Terms

findAssocs(dtm,'保險', 0.4)

計算距離

x <- c(0,0,1,1,1,1)
y <- c(1,0,1,1,0,1)

sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
##          x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
##          x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
##   x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
##   x
## y 2

文章相似度分析

library(jsonlite)
access_token <- '1784463381857274|9900d3b0e28a8ae9b5d263240bb13626'
pageid <- 'Forinsurance.tw'

dfall <- data.frame()
df    <- fromJSON(paste0('https://graph.facebook.com/v2.12/', pageid,'/posts?limit=100&since=2016-01-01&access_token=', access_token))
dfall <- rbind(dfall, df$data)
while(! is.null(df$paging$`next`)){
    df <- fromJSON(df$paging$`next`)
    dfall <- rbind(dfall, df$data)
}
dfall


library(jiebaR)
mixseg <- worker()

dfall <- dfall[! grepl('懶人', dfall$message), ]
dfall <- dfall[! grepl('回顧', dfall$message), ]
dfall <- dfall[! grepl('十大', dfall$message), ]
seg <- lapply(dfall$message, function(e) segment(e, jiebar = mixseg) )
class(seg)


corpus <- Corpus(VectorSource(seg))
dtm    <- DocumentTermMatrix(corpus)
dtm


#   柯文哲 大巨蛋 趙藤雄 遠雄  檢察官 ..... ...
#a    1     1      1     1       1     1 ....
#b    1     1      1     0       0     0

#install.packages('proxy')
library(proxy) 
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
dtm.mat  <- as.matrix(dtm.dist)
dtm.mat

dfall$message[as.integer(names(sort(dtm.mat[5, which(dtm.mat[5,] < 0.8)])))]

article.query <- function(idx){
  dfall$message[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}

article.query(7)[1:10]

新聞推薦引擎


library(jiebaR)
mixseg <- worker()
seg <- lapply(applenews$content, function(e) segment(e, jiebar = mixseg))

library(tm)
corpus <- Corpus(VectorSource(seg))
dtm    <- DocumentTermMatrix(corpus)
dtm.remove <- removeSparseTerms(dtm, 0.99)

dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dtm.mat  <- as.matrix(dtm.dist)


article.query <- function(idx){
  applenews$title[as.integer(names(sort(dtm.mat[idx, which(dtm.mat[idx,] < 0.8)])))]
}

article.query(6)[1:10]

文章分群

dim(dtm.dist)
hc <- hclust(dtm.dist)
plot(hc, hang=-0.1)

fit <- cutree(hc, k = 20)
applenews$title[fit == 2]

Community Detection

#install.packages('igraph')
library(igraph)
library(igraph)

m <- ifelse(dtm.mat < 0.4, 1, 0)

G <- graph_from_adjacency_matrix(m)

wc <- cluster_walktrap(G)
modularity(wc)
table(membership(wc))

group <- membership(wc)
applenews$title[group ==7]

新聞分類

apple <- applenews[applenews$category %in% c('財經', '娛樂'),]
apple$category <- factor(apple$category)

library(jiebaR)
mixseg<- worker()
apple.seg <- lapply(apple$content, function(e) segment(code = e, jiebar = mixseg))
corpus <- Corpus(VectorSource(apple.seg))

library(tm)
doc    <- tm_map(corpus, removeNumbers)
dtm    <- DocumentTermMatrix(doc)
dim(dtm)


dtm.new<- DocumentTermMatrix(doc, 
            control = list(
              wordLengths = c(2,20)))
dim(dtm.new)
m <- as.matrix(dtm.new)
m[1:3,1:3]

convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels=c(0,1), labels = c('No', 'Yes'))
  return(x)
}

m.count <- apply(dtm.new, MARGIN= 2, convert_counts)

m.count[1:3,1:3]
m <- as.data.frame(m.count)
nrow(m)

set.seed(123)
idx <- sample.int(2,nrow(m), replace=TRUE, prob = c(0.7,0.3))
trainset <- m[idx == 1,]
testset  <- m[idx == 2,]


traintag <- apple[idx == 1, 'category']
testtag <- apple[idx == 2, 'category']

dim(trainset)
dim(testset)
length(traintag)
length(testtag)

#install.packages('e1071')
library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred  <- predict(model, testset)
table(pred, testtag )

(28 + 33) / (28+33+1+1)

save(model, file= 'classfication.RData')
load('classfication.RData')
pred2  <- predict(model, testset)