Demo20180401

使用爬蟲取得關鍵字詞

#install.packages('rvest')
library(rvest)

## Loading required package: xml2

keywords <- read_html('http://news.ltn.com.tw/news/politics/breakingnews/2383080') %>% html_nodes('meta[name=keywords]') %>% html_attr('content') %>% strsplit(., ',') %>% unlist() %>% trimws()


keywords <- read_html('https://udn.com/news/story/6656/3063666?from=udn-ch1_breaknews-1-0-news') %>% html_nodes('meta[name=news_keywords]') %>% html_attr('content') %>% strsplit(., ',') %>% unlist() %>% trimws()

library(jiebaR)

## Loading required package: jiebaRD

#edit_dict()
USERPATH

## [1] "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"

userdict <- "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"
f <- file(userdict, 'a', encoding = 'utf-8')
for(k in keywords){
  cat(k , '\n', file= f)
}
close(f)

TFIDF

a   <- c('a')
abb <- c('a','b','b')
abc <- c('a','b','c')
D   <- list(a,abb,abc)

#tfidf('a', a, D)
tf  <- 1/1
idf <- log(3/3)
tf * idf

## [1] 0

#tfidf('a', abb, D)
tf  <- 1/3
idf <- log(3/3)
tf * idf

## [1] 0

#tfidf('a', abc, D)
tf  <- 1/3
idf <- log(3/3)
tf * idf

## [1] 0

#tfidf('b', abb, D)
tf  <- 2/3
idf <- log(3/2)
tf * idf

## [1] 0.2703101

#tfidf('b', abc, D)
tf  <- 1/3
idf <- log(3/2)
tf * idf

## [1] 0.135155

#tfidf('c', abc, D)
tf  <- 1/3
idf <- log(3/1)
tf * idf

## [1] 0.3662041

tfidf <- function(t, d, D){
  tf  <- sum(d == t)  / length(d)
  idf <- log(length(D) / sum(sapply(D, function(doc) t %in% doc)) )
  return(tf * idf)
}


tfidf('a', a, D)

## [1] 0

tfidf('a', abb, D)

## [1] 0

tfidf('a', abc, D)

## [1] 0

tfidf('b', abb, D)

## [1] 0.2703101

tfidf('b', abc, D)

## [1] 0.135155

tfidf('c', abc, D)

## [1] 0.3662041

詞頻計算

news <- '行政院長賴清德近日表示，蔡政府擬重啟的深澳火力發電廠所燒的煤是「乾淨的煤」；對因推動非核家園調整能源政策而導致電價上漲，賴也聲稱兩者無關。國民黨發言人洪孟楷今趁愚人節揶揄，賴所稱「乾淨的煤」是本世界最偉大發現，應拿諾貝爾。
洪孟楷上午在個人臉書發表4月1日專屬愚人節版「自冉報」，透過「記者余仁傑」在頭版報導，以諧音的方式，揶揄賴清德（賴青德）、政院發言人徐國勇（徐國男）、總統蔡英文（蔡央文），「乾淨的煤」說法應獲「言若貝爾獎」（暗指「諾貝爾」）。

洪孟楷還以主標「台灣：乾淨的煤找到了！」，副標「本世界最偉大發現，卻無緣今年言若貝爾，賴青德表示：就當作功德」，在個人臉書推播該則「本日頭版新聞」。

洪孟楷在所發行的「自冉報」撰文中表示，台灣本月20號提出跨時代發現，由行政院長賴青德公開、行政院發言人徐國男認證，足以拿下今年言若貝爾獎的「乾淨的煤」，因為提名截止，所以今年確定無法得獎。行政院長賴青德因此公開表示：沒能在今年得此殊榮，非常遺憾，這不只是功德台灣、更是功德世界。

文中也提到，據我國中研院表示：乾淨的煤是「無汙染、對人體健康無害、且存在於人類幻想」的新發明，這項發明顛覆了人們過往的化學、醫學、還有神學的研究，明明是炭，燃燒卻不會產生煤灰，其偉大成就如同「不臭的臭豆腐」、「玫瑰花香的榴蓮」齊名；明明是pm2.5卻不會傷害人體。其影響力絕對有87分那麼高，可稱之為21世紀的工業革命。

文中指出，這樣子顛覆性的發明，有望一舉奪下言若貝爾化學、醫學雙獎項，更有可能藉由販賣乾淨的煤改善經濟結構，連言若貝爾經濟學獎一併囊獲入袋。只遺憾言若貝爾獎的提名在今年的二月截止，若不是民進黨政府今年二月忙著修改勞基法，台灣將有機會破天荒的同年一舉拿下三項不同領域大獎，這是言若貝爾獎成立以來的第一次。

洪孟楷還揶揄，經記者余仁傑實地訪問，發現言若貝爾委員會對乾淨的煤不能獲獎也抱持著遺憾。因「勞工是委員會心裡最軟的一塊」、「台中市是空汙防治最成功的」。所以直言明年一定提名乾淨的煤。

洪孟楷說，若讀者覺得今天新聞有問題？那「蔡央文說過:有問題去跟你們老闆說！一定要自立自強歐！」'


library(jiebaR)
edit_dict()

## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows

mixseg <- worker()
seg.vec <- segment(code = news, jiebar = mixseg)
tb <- table(seg.vec)

#install.packages('wordcloud2')
library(wordcloud2)
wordcloud2(tb[nchar(names(tb)) >= 2 & tb >= 2 & grepl(names(tb), pattern = '^[\u4e00-\u9fa5]+$')])

詞頻矩陣

#install.packages('tm')
library(tm)

## Loading required package: NLP

e3 <- 'Hello, I am David. I have taken over 100 courses ~~~'

e3.list   <- strsplit(e3, ' ')
e3.corpus <- Corpus(VectorSource(e3.list))
dtm <- DocumentTermMatrix(e3.corpus)
dtm

## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)

inspect(dtm)

## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 courses david have hello over taken
##    1   1       1     1    1     1    1     1

dtm <- DocumentTermMatrix(e3.corpus, control= list(wordLengths = c(1,20)) )
inspect(dtm)

## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity           : 0%
## Maximal term length: 7
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 100 am c courses david have hello i over taken
##    1   1  1 1       1     1    1     1 2    1     1

getTransformations()

## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

doc <- tm_map(e3.corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)


#install.packages('SnowballC')
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc, control= list(wordLengths = c(1,20)) )
inspect(dtm)

## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity           : 0%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs am chello cours david have i over taken
##    1  1      1     1     1    1 2    1     1

e1 <- 'this is a book'
e2 <- 'this is my car'
e.list <- strsplit(c(e1,e2), ' ')
e.corpus <- Corpus(VectorSource(e.list))
dtm <- DocumentTermMatrix(e.corpus)
inspect(dtm)

## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity           : 33%
## Maximal term length: 4
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs book car this
##    1    1   0    1
##    2    0   1    1

?DocumentTermMatrix

## starting httpd help server ...

##  done

sw <- stopwords('en')
sw

##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"

tfidf.dtm <- DocumentTermMatrix(e.corpus, control= list(weighting =function(x)
        weightTfIdf(x, normalize = FALSE), stopwords=TRUE))
inspect(tfidf.dtm)

## <<DocumentTermMatrix (documents: 2, terms: 2)>>
## Non-/sparse entries: 2/2
## Sparsity           : 50%
## Maximal term length: 4
## Weighting          : term frequency - inverse document frequency (tf-idf)
## Sample             :
##     Terms
## Docs book car
##    1    1   0
##    2    0   1

中文詞頻矩陣

library(jiebaR)
edit_dict()

## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows

mixseg=worker()
s  <- "大巨蛋案對市府同仁下封口令？柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"

s.vec  <- segment(code=s, jiebar = mixseg)
s1.vec <- segment(code=s1, jiebar = mixseg)

sw <- c('近來')

s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
dtm      <- DocumentTermMatrix(s.corpus, control = list(stopwords=sw) )
inspect(dtm)

## <<DocumentTermMatrix (documents: 2, terms: 7)>>
## Non-/sparse entries: 9/5
## Sparsity           : 36%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 大巨蛋 市府 同仁 爭議 封口令 案對 飽受
##    1      1    1    1    0      1    1    0
##    2      1    1    0    1      0    0    1

詞頻矩陣應用

https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/appledaily.RData

load("C:/Users/user/Downloads/appledaily.RData")
#head(appledaily)

library(jiebaR)
#edit_dict()
mixseg <- worker()

apple.seg <- lapply(appledaily$content, function(article) {
  s <- segment(code=article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
  })


#apple.seg[[1]]
doc <- Corpus(VectorSource(apple.seg))
doc <- tm_map(doc, removeNumbers)

dtm <- DocumentTermMatrix(doc)

dtm

## <<DocumentTermMatrix (documents: 1500, terms: 35247)>>
## Non-/sparse entries: 157074/52713426
## Sparsity           : 100%
## Maximal term length: 12
## Weighting          : term frequency (tf)

findFreqTerms(dtm, lowfreq = 200, highfreq = 300)

##  [1] "已經"   "男子"   "要求"   "調查"   "相關"   "人員"   "其中"  
##  [8] "照片"   "今<a6>" "未來"   "政府"   "第<a4>" "這些"   "媒體"  
## [15] "影響"   "民眾"   "國家"   "希望"   "最後"   "工<a7>" "如果"  
## [22] "是<a7>" "經濟"   "不是"   "不過"   "這個"   "這樣"   "不<b7>"
## [29] "市府"   "還是"   "萬元"   "大巨蛋"

findAssocs(dtm,'大巨蛋', 0.5)

## $大巨蛋
##     遠雄     解約     市府     展延     工期     口稱     已朝 已無太多 
##     0.86     0.78     0.74     0.72     0.70     0.69     0.69     0.69 
##   心叵測   王貞治     他怕 若不<c4>     哲見   核子彈 問及此事   理還亂 
##     0.69     0.69     0.69     0.69     0.69     0.69     0.69     0.69 
##     連通   期議約     菸廠   進度條     聘書     運棄     孵出   蔡宗易 
##     0.69     0.69     0.69     0.69     0.69     0.69     0.69     0.69 
##     談盤   應<a5>     遽增     歸責     難解     觸礁     議約     無解 
##     0.69     0.69     0.69     0.69     0.69     0.69     0.69     0.68 
##     停工     方向     五大     溫室     拋出     南線     貿然   和<a5> 
##     0.64     0.62     0.62     0.61     0.61     0.60     0.60     0.60 
##     量體     案朝     逾期     逃生     違約     實地     懶人     容納 
##     0.59     0.56     0.56     0.55     0.54     0.53     0.52     0.51

dtm.remove <- removeSparseTerms(dtm,0.995)
dim(dtm.remove)

## [1] 1500 3466

head(dtm.remove$dimnames$Terms)

## [1] "上個月" "已將"   "已經"   "不已"   "之際"   "分局"

文章相似度計算


a  <- c(1,1)
b  <- c(1,1)

a1 <- c(1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
b1 <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)

dist(rbind(a,b), method = 'euclidean')
dist(rbind(a1,b1), method = 'euclidean')


# install.packages('proxy')
library(proxy)
proxy::dist(rbind(a,b), method = 'cosine')
proxy::dist(rbind(a1,b1), method = 'cosine')


a <- c(1,2,2,1,1,1,0)
b <- c(1,2,2,1,1,2,1)
1 - proxy::dist(rbind(a,b), method = 'cosine')


library(jiebaR)
mixseg=worker()
s  <- "大巨蛋案對市府同仁下封口令？柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"
s2 <- "扁爆綠將與柯P合作 蔡英文：對扁保外就醫不會是好事"

s.vec  <- segment(code=s, jiebar = mixseg)
s1.vec <- segment(code=s1, jiebar = mixseg)
s2.vec <- segment(code=s2, jiebar = mixseg)

sw <- c('近來')

s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec, s2.vec)))
dtm      <- DocumentTermMatrix(s.corpus, control = list(stopwords=sw) )
inspect(dtm)


dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
as.matrix(dtm.dist)



load("C:/Users/user/Downloads/appledaily.RData")

library(jiebaR)
#edit_dict()
mixseg <- worker()

apple.seg <- lapply(appledaily$content, function(article) {
  s <- segment(code=article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
  })

doc <- Corpus(VectorSource(apple.seg))
doc <- tm_map(doc, removeNumbers)
dtm <- DocumentTermMatrix(doc)

dim(dtm)

dtm.remove <- removeSparseTerms(dtm,0.995)

dim(dtm.remove)

dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)



getSimiliarNews <- function(idx){
print(paste('查詢新聞:', appledaily$title[idx]))
rank <- order(dtm.mat[idx,])[2:10]
  for (rec in rank){
    if (dtm.mat[idx, rec] <= 0.6){
      print(paste('相關新聞:', appledaily$title[rec]))  
    }
  }
}

getSimiliarNews(39)

產生自動問答機器人

library(rvest)
page <- read_html('https://www.aia.com.tw/zh-tw/help-and-support/important-info/faq.html') 

questions <- page %>% html_nodes('.collapsible-table-title') %>% html_text() %>% trimws()

answers <- page %>% html_nodes('.collapsible-table-content') %>% html_text() %>% trimws()


#answers

qa <- data.frame(questions, answers, stringsAsFactors = FALSE)

q <- '如何選擇正確的保險商品'
library(jiebaR)
mixseg <- worker()
qa.seg <- lapply(c(q, qa$questions), function(e) segment(code = e, mixseg))
qa.corpus <- Corpus(VectorSource(qa.seg))
qa.dtm <- DocumentTermMatrix(qa.corpus)
qa.dist <- proxy::dist(as.matrix(qa.dtm), method = 'cosine')
qa.mat <- as.matrix(qa.dist)


answers   <- c('DUMMY',qa$answers)
questions <- c(q, qa$questions)

questions[order(qa.mat[1,])[2]]

## [1] "如何選擇合適的保險商品？"

answers[order(qa.mat[1,])[2]]

## [1] "答：\n\n\n\n人生邁入不同階段，所需的保障內容也隨之改變，本公司貼心提供保障缺口分析服務，協助您做好完整的保障規劃，降低未來個人或家庭經濟生活之不確定性。"

文章分群

load("C:/Users/user/Downloads/appledaily.RData")

library(jiebaR)

mixseg <- worker()

apple.seg <- lapply(appledaily$content, function(article) {
  s <- segment(code=article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
  })

doc <- Corpus(VectorSource(apple.seg))
doc <- tm_map(doc, removeNumbers)
dtm <- DocumentTermMatrix(doc)

dim(dtm)

dtm.remove <- removeSparseTerms(dtm,0.995)

dim(dtm.remove)

dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)

hc <- hclust(dtm.dist, method = 'ward.D2')
plot(hc, hang=-0.1)
rect.hclust(hc, k = 22)

fit <- cutree(hc, k = 22)
appledaily$title[fit == 8]


#install.packages('igraph')
library(igraph)

filter_score <- function(x) {
  x <- ifelse(x < 0.5, 1, 0)
  return(x)
}
score.csm <- apply(dtm.mat, MARGIN = 2, filter_score)
score.csm[1:3,1:3]
G <- graph_from_adjacency_matrix(score.csm, mode = c( "undirected"))

wc <- cluster_walktrap(G)
sort(table(wc$membership), decreasing = TRUE)

appledaily$title[wc$membership == 8]

文章分類

load("C:/Users/user/Downloads/appledaily.RData")
apple.subset <- appledaily[appledaily$category %in% c('財經', '娛樂', '社會' ), ]
apple.subset$category <- as.factor(apple.subset$category)


library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(article) {
  s <- segment(code=article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
  })


library(tm)
apple.corpus <- Corpus(VectorSource(apple.seg))
dtm          <- DocumentTermMatrix(apple.corpus)

dim(dtm)

## [1]   428 15915

dtm.remove <- removeSparseTerms(dtm, 0.995)

dim(dtm.remove)

## [1]  428 3620

convert_counts<-function(x){
  x <-ifelse(x >0, 1, 0)
  x <-factor(x, levels=c(0, 1), labels=c("No", "Yes"))
  return(x)
}

dtm.count<-apply(dtm.remove, MARGIN =2, convert_counts)
dtm.count[1:3,1:3]

##     Terms
## Docs 口罩  已將  已經 
##    1 "Yes" "Yes" "Yes"
##    2 "No"  "No"  "No" 
##    3 "No"  "No"  "No"

m <- as.data.frame(dtm.count)

set.seed(42)
idx <- sample.int(2, nrow(m), replace=TRUE, prob = c(0.7,0.3))

trainset <- m[idx== 1, ]
traintitle <- apple.subset$title[idx == 1]
traintag  <- apple.subset$category[idx==1]

testset <- m[idx== 2, ]
testtitle <- apple.subset$title[idx == 2]
testtag  <- apple.subset$category[idx==2]

dim(trainset)

## [1]  298 3620

length(traintag)

## [1] 298

dim(testset)

## [1]  130 3620

length(testtag)

## [1] 130

library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
tb<- table(testtag,pred)

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

confusionMatrix(tb)

## Confusion Matrix and Statistics
## 
##        pred
## testtag 社會 娛樂 財經
##    社會   50    2    0
##    娛樂    1   41    0
##    財經    0    1   35
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9692          
##                  95% CI : (0.9231, 0.9916)
##     No Information Rate : 0.3923          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9533          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 社會 Class: 娛樂 Class: 財經
## Sensitivity               0.9804      0.9318      1.0000
## Specificity               0.9747      0.9884      0.9895
## Pos Pred Value            0.9615      0.9762      0.9722
## Neg Pred Value            0.9872      0.9659      1.0000
## Prevalence                0.3923      0.3385      0.2692
## Detection Rate            0.3846      0.3154      0.2692
## Detection Prevalence      0.4000      0.3231      0.2769
## Balanced Accuracy         0.9775      0.9601      0.9947

which(pred != testtag)

## [1]   4 113 127 128

i <- 128
pred[i]

## [1] 娛樂
## Levels: 社會 娛樂 財經

testtag[i]

## [1] 財經
## Levels: 社會 娛樂 財經

testtitle[i]

## [1] "大倒角握感如何？　網友試機HTC 10"

(37 + 38) / (37 +38 +1 + 2)

## [1] 0.9615385

電影正負評判斷

library(readxl)
yahoo_movie <- read_excel("C:/Users/user/Downloads/yahoo_movie.xlsx")
#View(yahoo_movie)
movies <- yahoo_movie[yahoo_movie$status %in% c('good', 'bad'),]
movies <- movies[nchar(movies$content) >=2, ]

movies$status <- as.factor(movies$status)

library(jiebaR)
mixseg <- worker()
movie.seg <- lapply(movies$content, function(article) {
      s <- segment(code=article, jiebar = mixseg)
      return(s[grepl('[\u4e00-\u9fa5]+', s)])
  })

movie.seg <- lapply(movies$content, function(article) {
      s <- segment(code=article, jiebar = mixseg)
      return(s)
  })

movie.corpus <- Corpus(VectorSource(movie.seg))
dtm <- DocumentTermMatrix(movie.corpus)
dim(dtm)

## [1]  963 4916

convert_counts<-function(x){
  x <-ifelse(x >0, 1, 0)
  x <-factor(x, levels=c(0, 1), labels=c("No", "Yes"))
  return(x)
}

dtm.count<-apply(dtm, MARGIN =2, convert_counts)

m <- as.data.frame(dtm.count)


set.seed(42)
idx <- sample.int(2, nrow(m), replace=TRUE, prob = c(0.7,0.3))

trainset <- m[idx== 1, ]
traintag  <- movies$status[idx==1]

testset <- m[idx== 2, ]
testtag  <- movies$status[idx==2]

dim(trainset)

## [1]  680 4916

length(traintag)

## [1] 680

dim(testset)

## [1]  283 4916

length(testtag)

## [1] 283

library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
tb<- table(testtag,pred)
confusionMatrix(tb)

## Confusion Matrix and Statistics
## 
##        pred
## testtag bad good
##    bad  113   15
##    good  77   78
##                                          
##                Accuracy : 0.6749         
##                  95% CI : (0.617, 0.7292)
##     No Information Rate : 0.6714         
##     P-Value [Acc > NIR] : 0.4777         
##                                          
##                   Kappa : 0.3704         
##  Mcnemar's Test P-Value : 2.022e-10      
##                                          
##             Sensitivity : 0.5947         
##             Specificity : 0.8387         
##          Pos Pred Value : 0.8828         
##          Neg Pred Value : 0.5032         
##              Prevalence : 0.6714         
##          Detection Rate : 0.3993         
##    Detection Prevalence : 0.4523         
##       Balanced Accuracy : 0.7167         
##                                          
##        'Positive' Class : bad            
##

Yahoo Crawler

library(httr)

## 
## Attaching package: 'httr'

## The following object is masked from 'package:caret':
## 
##     progress

## The following object is masked from 'package:NLP':
## 
##     content

library(rvest)

reviews_vec <- c()

for(i in seq(1,3)){
page <- read_html(paste0('https://movies.yahoo.com.tw/movieinfo_review.html/id=7352?sort=update_ts&order=desc&page=',i)) 

reviews <- page %>% html_nodes('#form_good1') 
  for(rec in reviews){
    reviews_vec <- c(reviews_vec,rec %>% html_nodes('span') %>% .[3] %>% html_text)
  }

}
reviews_vec

##  [1] "大部分場景在中國 決戰地卻在日本 應該是為了要迎合中國市場的片子 不要動腦的話覺得這部還好 大概就3.6分吧 不能再高了"                                                                                                                                                                                                                              
##  [2] "我覺得以爽片來說，這次續集絕對符合達標，但就是少了第一集的末日絕望感，但續集可以看到很多隻機甲戰士也算彌補這個缺點了。"                                                                                                                                                                                                                        
##  [3] "覺得第二集確實沒有第一集當初給的感動，不過卻也是有另外一種看頭，希望第三集可以結合一二集的特長。"                                                                                                                                                                                                                                              
##  [4] "有趣的題材，可以拍成這樣平淡無聊，我也是服了！"                                                                                                                                                                                                                                                                                                
##  [5] "其實，機甲打怪獸的橋段都相當喜歡啊！非常精彩！看到一個不錯客觀的影評，分享給大家參考：https://youtu.be/nUytG9s26OY"                                                                                                                                                                                                                            
##  [6] "期待哪麼久的片子  非常的失望,與第一集相比第二集是為了年輕人硬拍的片子,影片該有的陳續 劇情 根本連接不起來 就是特效片,跟第一集看完的感動跟看到主角努力重頭來過,根本無法相比  還不如不要拍"                                                                                                                                                       
##  [7] "不知道看了什!感覺是不同導演，不同特效做的..."                                                                                                                                                                                                                                                                                                  
##  [8] "超好看，說不好看的眼睛是瞎了嗎！"                                                                                                                                                                                                                                                                                                              
##  [9] "劇情比第一集薄弱些，除此之外，創意、畫面都比第一集強，怪物跟機器人都大升級，當爽片看的話，比第一集爽了些；抱著爽片的心情去看的話，這部片值得4.5以上。"                                                                                                                                                                                         
## [10] "如果我唸小學，應該會覺得這部片好看。。。"                                                                                                                                                                                                                                                                                                      
## [11] "知道負評一堆，但我哥堅持要看。抱著有些些網友說，當爽片看的心，看了。結果…糟點太多，我都不知從何吐嘈起。看完整個怒阿，整部戲一路眼神死加出戲。真心覺得網友負評好實在，好貼切，把我心聲都說出來了^_^。不辜負那3.6分阿，應該還會降低吧～"                                                                                                         
## [12] "打鬥場面很少，除了吉普賽，其他隻出來幾分鐘而已"                                                                                                                                                                                                                                                                                                
## [13] "第一集的故事、演員整體表現、畫面<U+22EF>簡直是<f0><U+009F><U+0092><U+00AF>\r\n可能是第一集太好了\r\n第二集的故事性顯得有些草率\r\n尤其是看到景甜的演技屢屢讓我出戲\r\n實在很扣分"                                                                                                                                                              
## [14] "覺得失望，跟第一集差太多。第二集幾乎沒有劇情也沒有重點，慶幸沒有買4DX的票。"                                                                                                                                                                                                                                                                   
## [15] "是也沒有那麼難看，當爽片看娛樂性很高，\r\n無論如何都比金剛戰士好看太多"                                                                                                                                                                                                                                                                        
## [16] "看完環2會想再看一次環1,比較喜歡環1的人物和劇情,環2的重點放對地方會好看許多~~~"                                                                                                                                                                                                                                                                 
## [17] "第一集真讚\r\n為了字數打第二排字\r\n為了字數打第三排字"                                                                                                                                                                                                                                                                                        
## [18] "2真的超爛,硬要找個黑鬼男當主角沒顏質,還很弱也能當戰士,瞎\r\n多年過後機器人也沒啥升級,然後搞個敵人突襲就可以把他們打趴了\r\n沒有一個人可以迎戰瞎爆,劇情比不上1不打緊,連個特效和顏質都贏不了慘"                                                                                                                                                  
## [19] "工讀生毀了這部片~~~真的~~~"                                                                                                                                                                                                                                                                                                                    
## [20] "沒什麼劇情\r\n\r\n沒有第一集的感動\r\n\r\n連打鬥的場面都還好"                                                                                                                                                                                                                                                                                  
## [21] "黑豹我給7.5分，這部我給2分。唯一亮點男主角很帥？真的沒第一部好看。中心駕駛就只有2個男主？然後那幾個訓練生？完全不合常理。因為受到突然攻擊全部掛掉？代表之前的訓練爛到爆。大家集合給飛彈打？完全不和常理。還有落下時早該全內出血爆了，裡面又沒任何防護。"                                                                                       
## [22] "簡直為景甜量身打造,從頭到尾刷存在感"                                                                                                                                                                                                                                                                                                           
## [23] "不錯看呀 ， 沒看過第一集也能看得懂。機甲戰士多了好幾隻"                                                                                                                                                                                                                                                                                        
## [24] "超級無敵大爛片！就像是鹹蛋超人打酷斯拉"                                                                                                                                                                                                                                                                                                        
## [25] "一級爛片，一個按鈕全部怪獸死光，看到睡著"                                                                                                                                                                                                                                                                                                      
## [26] "看到景田就要puke了<f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082>"                                                                                                                                                                                
## [27] "這部片名應該要改成環太平洋之變形金剛5+ID4星際重生，\r\n頂著環太平洋系列卻沒依循該系列的世界觀設定，\r\n就好像外盒寫著鼎泰豐裡面卻是路邊攤一樣，\r\n真的是非常的不好看，\r\n如果有看過第一集的更加千萬別去看，\r\n連加點吉拿棒-超粒方他們直播討論都罵到不行，\r\n專業影評都說爛了，\r\n你們總該相信了吧，\r\n\r\n~~真的不要去看，不然你會後悔~~"
## [28] "劇情跟第一集稍嫌薄弱\r\n中間的一個人講中文 一個人講英文 讓人有點出戲\r\n(真的有人會這樣溝通嗎.......)\r\n還有每個人的情感連結 都只有輕輕帶過 這部分也有點可惜\r\n但因為我對機器人的喜愛~~所以還是給了四顆星\r\n如果是抱持著想看到跟第一集一樣精彩的內容 可能會失望\r\n但如果只是想看機器人打怪獸 還是可以去看一下"                             
## [29] "一看到景甜就知道完蛋了，忍著滿腔的怒火看完整場！\r\n真的很失望，把我對第一集的好印象都抹殺了！"                                                                                                                                                                                                                                                
## [30] "老實說，昏昏睡睡總時間起碼超過了電影播放一半時間，超級大爛片！"

Demo20180401

David Chiu

2018年4月1日

使用爬蟲取得關鍵字詞

TFIDF

詞頻計算

詞頻矩陣

中文詞頻矩陣

詞頻矩陣應用

文章相似度計算

產生自動問答機器人

文章分群

文章分類

電影正負評判斷

Yahoo Crawler