Fetching Keywords with a Web Crawler
#install.packages('rvest')
library(rvest)
## Loading required package: xml2
# news.ltn.com.tw exposes article keywords in <meta name=keywords>
keywords <- read_html('http://news.ltn.com.tw/news/politics/breakingnews/2383080') %>% html_nodes('meta[name=keywords]') %>% html_attr('content') %>% strsplit(., ',') %>% unlist() %>% trimws()
# udn.com uses <meta name=news_keywords> instead; this overwrites keywords with the UDN article's terms
keywords <- read_html('https://udn.com/news/story/6656/3063666?from=udn-ch1_breaknews-1-0-news') %>% html_nodes('meta[name=news_keywords]') %>% html_attr('content') %>% strsplit(., ',') %>% unlist() %>% trimws()
library(jiebaR)
## Loading required package: jiebaRD
#edit_dict()
# USERPATH is the path to jiebaR's user dictionary file
USERPATH
## [1] "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"
userdict <- "C:/Program Files/R/R-3.4.4/library/jiebaRD/dict/user.dict.utf8"
# append each scraped keyword to the user dictionary, one word per line
f <- file(userdict, 'a', encoding = 'utf-8')
for(k in keywords){
  cat(k, '\n', file = f, sep = '')
}
close(f)
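As an alternative to editing the dictionary file, jiebaR can register extra words on an already-created worker; a minimal sketch (the added words only last for the lifetime of that worker object):
seg <- worker()
new_user_word(seg, keywords)
# the scraped keywords should now stay intact during segmentation
segment(code = keywords[1], jiebar = seg)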
TF-IDF
# three toy documents (as token vectors) and the corpus D
a <- c('a')
abb <- c('a','b','b')
abc <- c('a','b','c')
D <- list(a, abb, abc)
#tfidf('a', a, D)
tf <- 1/1
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abb, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('a', abc, D)
tf <- 1/3
idf <- log(3/3)
tf * idf
## [1] 0
#tfidf('b', abb, D)
tf <- 2/3
idf <- log(3/2)
tf * idf
## [1] 0.2703101
#tfidf('b', abc, D)
tf <- 1/3
idf <- log(3/2)
tf * idf
## [1] 0.135155
#tfidf('c', abc, D)
tf <- 1/3
idf <- log(3/1)
tf * idf
## [1] 0.3662041
tfidf <- function(t, d, D){
  tf <- sum(d == t) / length(d)                                     # term frequency of t in document d
  idf <- log(length(D) / sum(sapply(D, function(doc) t %in% doc)))  # inverse document frequency over corpus D
  return(tf * idf)
}
tfidf('a', a, D)
## [1] 0
tfidf('a', abb, D)
## [1] 0
tfidf('a', abc, D)
## [1] 0
tfidf('b', abb, D)
## [1] 0.2703101
tfidf('b', abc, D)
## [1] 0.135155
tfidf('c', abc, D)
## [1] 0.3662041
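The same function can be applied over the whole toy corpus to get a document-by-term tf-idf matrix; a small sketch reusing tfidf() and D from above:
terms <- unique(unlist(D))
tfidf.mat <- sapply(terms, function(t) sapply(D, function(d) tfidf(t, d, D)))
rownames(tfidf.mat) <- c('a', 'abb', 'abc')   # the three documents in D
tfidf.mat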
Computing Term Frequencies
news <- '行政院長賴清德近日表示,蔡政府擬重啟的深澳火力發電廠所燒的煤是「乾淨的煤」;對因推動非核家園調整能源政策而導致電價上漲,賴也聲稱兩者無關。國民黨發言人洪孟楷今趁愚人節揶揄,賴所稱「乾淨的煤」是本世界最偉大發現,應拿諾貝爾。
洪孟楷上午在個人臉書發表4月1日專屬愚人節版「自冉報」,透過「記者余仁傑」在頭版報導,以諧音的方式,揶揄賴清德(賴青德)、政院發言人徐國勇(徐國男)、總統蔡英文(蔡央文),「乾淨的煤」說法應獲「言若貝爾獎」(暗指「諾貝爾」)。
洪孟楷還以主標「台灣:乾淨的煤找到了!」,副標「本世界最偉大發現,卻無緣今年言若貝爾,賴青德表示:就當作功德」,在個人臉書推播該則「本日頭版新聞」。
洪孟楷在所發行的「自冉報」撰文中表示,台灣本月20號提出跨時代發現,由行政院長賴青德公開、行政院發言人徐國男認證,足以拿下今年言若貝爾獎的「乾淨的煤」,因為提名截止,所以今年確定無法得獎。行政院長賴青德因此公開表示:沒能在今年得此殊榮,非常遺憾,這不只是功德台灣、更是功德世界。
文中也提到,據我國中研院表示:乾淨的煤是「無汙染、對人體健康無害、且存在於人類幻想」的新發明,這項發明顛覆了人們過往的化學、醫學、還有神學的研究,明明是炭,燃燒卻不會產生煤灰,其偉大成就如同「不臭的臭豆腐」、「玫瑰花香的榴蓮」齊名;明明是pm2.5卻不會傷害人體。其影響力絕對有87分那麼高,可稱之為21世紀的工業革命。
文中指出,這樣子顛覆性的發明,有望一舉奪下言若貝爾化學、醫學雙獎項,更有可能藉由販賣乾淨的煤改善經濟結構,連言若貝爾經濟學獎一併囊獲入袋。只遺憾言若貝爾獎的提名在今年的二月截止,若不是民進黨政府今年二月忙著修改勞基法,台灣將有機會破天荒的同年一舉拿下三項不同領域大獎,這是言若貝爾獎成立以來的第一次。
洪孟楷還揶揄,經記者余仁傑實地訪問,發現言若貝爾委員會對乾淨的煤不能獲獎也抱持著遺憾。因「勞工是委員會心裡最軟的一塊」、「台中市是空汙防治最成功的」。所以直言明年一定提名乾淨的煤。
洪孟楷說,若讀者覺得今天新聞有問題?那「蔡央文說過:有問題去跟你們老闆說!一定要自立自強歐!」'
library(jiebaR)
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
seg.vec <- segment(code = news, jiebar = mixseg)
tb <- table(seg.vec)
#install.packages('wordcloud2')
library(wordcloud2)
# keep terms that are at least 2 characters long, appear at least twice, and are pure Chinese
wordcloud2(tb[nchar(names(tb)) >= 2 & tb >= 2 & grepl(names(tb), pattern = '^[\u4e00-\u9fa5]+$')])
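A quick check alongside the word cloud: list the most frequent terms that pass the same filter; a minimal sketch reusing tb from above:
tb.zh <- tb[nchar(names(tb)) >= 2 & tb >= 2 & grepl('^[\u4e00-\u9fa5]+$', names(tb))]
head(sort(tb.zh, decreasing = TRUE), 10)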
Term Frequency Matrix
#install.packages('tm')
library(tm)
## Loading required package: NLP
e3 <- 'Hello, I am David. I have taken over 100 courses ~~~'
e3.list <- strsplit(e3, ' ')
e3.corpus <- Corpus(VectorSource(e3.list))
dtm <- DocumentTermMatrix(e3.corpus)
dtm
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 7)>>
## Non-/sparse entries: 7/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 courses david have hello over taken
## 1 1 1 1 1 1 1 1
dtm <- DocumentTermMatrix(e3.corpus, control= list(wordLengths = c(1,20)) )
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 10)>>
## Non-/sparse entries: 10/0
## Sparsity : 0%
## Maximal term length: 7
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 100 am c courses david have hello i over taken
## 1 1 1 1 1 1 1 1 2 1 1
getTransformations()
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
doc <- tm_map(e3.corpus, removeNumbers)
doc <- tm_map(doc, removePunctuation)
#install.packages('SnowballC')
doc <- tm_map(doc, stemDocument)
dtm <- DocumentTermMatrix(doc, control= list(wordLengths = c(1,20)) )
inspect(dtm)
## <<DocumentTermMatrix (documents: 1, terms: 8)>>
## Non-/sparse entries: 8/0
## Sparsity : 0%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs am chello cours david have i over taken
## 1 1 1 1 1 1 2 1 1
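removeWords appears in getTransformations() above but is not demonstrated; a minimal sketch dropping English stop words before rebuilding the matrix:
doc.sw <- tm_map(doc, removeWords, stopwords('en'))
inspect(DocumentTermMatrix(doc.sw, control = list(wordLengths = c(1, 20))))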
e1 <- 'this is a book'
e2 <- 'this is my car'
e.list <- strsplit(c(e1,e2), ' ')
e.corpus <- Corpus(VectorSource(e.list))
dtm <- DocumentTermMatrix(e.corpus)
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 3)>>
## Non-/sparse entries: 4/2
## Sparsity : 33%
## Maximal term length: 4
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs book car this
## 1 1 0 1
## 2 0 1 1
?DocumentTermMatrix
## starting httpd help server ...
## done
sw <- stopwords('en')
sw
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
tfidf.dtm <- DocumentTermMatrix(e.corpus,
                                control = list(weighting = function(x) weightTfIdf(x, normalize = FALSE),
                                               stopwords = TRUE))
inspect(tfidf.dtm)
## <<DocumentTermMatrix (documents: 2, terms: 2)>>
## Non-/sparse entries: 2/2
## Sparsity : 50%
## Maximal term length: 4
## Weighting : term frequency - inverse document frequency (tf-idf)
## Sample :
## Terms
## Docs book car
## 1 1 0
## 2 0 1
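A quick sanity check on the weighting: with normalize = FALSE, weightTfIdf assigns tf * log2(nDocs / docfreq) to each cell, so 'book' in document 1 (tf = 1, docfreq = 1, nDocs = 2) gets weight 1, matching the matrix above:
1 * log2(2 / 1)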
Chinese Term Frequency Matrix
library(jiebaR)
edit_dict()
## Warning in edit_dict(): You should save the dictionary without BOM on
## Windows
mixseg <- worker()
s <- "大巨蛋案對市府同仁下封口令?柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"
s.vec <- segment(code=s, jiebar = mixseg)
s1.vec <- segment(code=s1, jiebar = mixseg)
sw <- c('近來')
s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec)))
dtm <- DocumentTermMatrix(s.corpus, control = list(stopwords=sw) )
inspect(dtm)
## <<DocumentTermMatrix (documents: 2, terms: 7)>>
## Non-/sparse entries: 9/5
## Sparsity : 36%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 大巨蛋 市府 同仁 爭議 封口令 案對 飽受
## 1 1 1 1 0 1 1 0
## 2 1 1 0 1 0 0 1
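An alternative construction that makes the tokenization explicit is to collapse each token vector into a space-separated string before building the corpus; the resulting matrix should match the one above:
s.corpus2 <- Corpus(VectorSource(c(paste(s.vec, collapse = ' '),
                                   paste(s1.vec, collapse = ' '))))
inspect(DocumentTermMatrix(s.corpus2, control = list(stopwords = sw)))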
Applications of the Term Frequency Matrix
load("C:/Users/user/Downloads/appledaily.RData")
#head(appledaily)
library(jiebaR)
#edit_dict()
mixseg <- worker()
apple.seg <- lapply(appledaily$content, function(article) {
  s <- segment(code = article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])   # keep only tokens containing Chinese characters
})
#apple.seg[[1]]
doc <- Corpus(VectorSource(apple.seg))
doc <- tm_map(doc, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dtm
## <<DocumentTermMatrix (documents: 1500, terms: 35247)>>
## Non-/sparse entries: 157074/52713426
## Sparsity : 100%
## Maximal term length: 12
## Weighting : term frequency (tf)
findFreqTerms(dtm, lowfreq = 200, highfreq = 300)
## [1] "已經" "男子" "要求" "調查" "相關" "人員" "其中"
## [8] "照片" "今<a6>" "未來" "政府" "第<a4>" "這些" "媒體"
## [15] "影響" "民眾" "國家" "希望" "最後" "工<a7>" "如果"
## [22] "是<a7>" "經濟" "不是" "不過" "這個" "這樣" "不<b7>"
## [29] "市府" "還是" "萬元" "大巨蛋"
findAssocs(dtm,'大巨蛋', 0.5)
## $大巨蛋
## 遠雄 解約 市府 展延 工期 口稱 已朝 已無太多
## 0.86 0.78 0.74 0.72 0.70 0.69 0.69 0.69
## 心叵測 王貞治 他怕 若不<c4> 哲見 核子彈 問及此事 理還亂
## 0.69 0.69 0.69 0.69 0.69 0.69 0.69 0.69
## 連通 期議約 菸廠 進度條 聘書 運棄 孵出 蔡宗易
## 0.69 0.69 0.69 0.69 0.69 0.69 0.69 0.69
## 談盤 應<a5> 遽增 歸責 難解 觸礁 議約 無解
## 0.69 0.69 0.69 0.69 0.69 0.69 0.69 0.68
## 停工 方向 五大 溫室 拋出 南線 貿然 和<a5>
## 0.64 0.62 0.62 0.61 0.61 0.60 0.60 0.60
## 量體 案朝 逾期 逃生 違約 實地 懶人 容納
## 0.59 0.56 0.56 0.55 0.54 0.53 0.52 0.51
dtm.remove <- removeSparseTerms(dtm,0.995)
dim(dtm.remove)
## [1] 1500 3466
head(dtm.remove$dimnames$Terms)
## [1] "上個月" "已將" "已經" "不已" "之際" "分局"
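removeSparseTerms(dtm, 0.995) keeps a term only if it appears in more than (1 - 0.995) * 1500 = 7.5 documents, i.e. in at least 8 of the 1500 articles; a quick check on the pruned matrix:
min(colSums(as.matrix(dtm.remove) > 0))   # should be at least 8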
Computing Document Similarity
# two identical short vectors: both euclidean and cosine distance are 0
a <- c(1,1)
b <- c(1,1)
# vectors that share the same first two terms but differ greatly in document length
a1 <- c(1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0)
b1 <- c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
dist(rbind(a, b), method = 'euclidean')
dist(rbind(a1, b1), method = 'euclidean')
# install.packages('proxy')
library(proxy)
proxy::dist(rbind(a, b), method = 'cosine')
proxy::dist(rbind(a1, b1), method = 'cosine')
a <- c(1,2,2,1,1,1,0)
b <- c(1,2,2,1,1,2,1)
1 - proxy::dist(rbind(a, b), method = 'cosine')   # cosine similarity
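proxy's 'cosine' method returns 1 - cosine similarity, where cosine similarity is the dot product divided by the product of the vector norms; a manual check against the value printed above:
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))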
library(jiebaR)
mixseg <- worker()
s <- "大巨蛋案對市府同仁下封口令?柯P否認"
s1 <- "柯P市府近來飽受大巨蛋爭議"
s2 <- "扁爆綠將與柯P合作 蔡英文:對扁保外就醫不會是好事"
s.vec <- segment(code=s, jiebar = mixseg)
s1.vec <- segment(code=s1, jiebar = mixseg)
s2.vec <- segment(code=s2, jiebar = mixseg)
sw <- c('近來')
s.corpus <- Corpus(VectorSource(list(s.vec, s1.vec, s2.vec)))
dtm <- DocumentTermMatrix(s.corpus, control = list(stopwords=sw) )
inspect(dtm)
dtm.dist <- proxy::dist(as.matrix(dtm), method = 'cosine')
as.matrix(dtm.dist)
load("C:/Users/user/Downloads/appledaily.RData")
library(jiebaR)
#edit_dict()
mixseg <- worker()
apple.seg <- lapply(appledaily$content, function(article) {
  s <- segment(code = article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
})
doc <- Corpus(VectorSource(apple.seg))
doc <- tm_map(doc, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dim(dtm)
dtm.remove <- removeSparseTerms(dtm,0.995)
dim(dtm.remove)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)
getSimiliarNews <- function(idx){
  print(paste('查詢新聞:', appledaily$title[idx]))
  # the closest document is the query itself, so start from the 2nd nearest
  rank <- order(dtm.mat[idx, ])[2:10]
  for (rec in rank){
    # only report articles within a cosine distance of 0.6
    if (dtm.mat[idx, rec] <= 0.6){
      print(paste('相關新聞:', appledaily$title[rec]))
    }
  }
}
getSimiliarNews(39)
Building an Automatic Q&A Bot
library(rvest)
page <- read_html('https://www.aia.com.tw/zh-tw/help-and-support/important-info/faq.html')
questions <- page %>% html_nodes('.collapsible-table-title') %>% html_text() %>% trimws()
answers <- page %>% html_nodes('.collapsible-table-content') %>% html_text() %>% trimws()
#answers
qa <- data.frame(questions, answers, stringsAsFactors = FALSE)
q <- '如何選擇正確的保險商品'
library(jiebaR)
mixseg <- worker()
qa.seg <- lapply(c(q, qa$questions), function(e) segment(code = e, mixseg))
qa.corpus <- Corpus(VectorSource(qa.seg))
qa.dtm <- DocumentTermMatrix(qa.corpus)
qa.dist <- proxy::dist(as.matrix(qa.dtm), method = 'cosine')
qa.mat <- as.matrix(qa.dist)
# prepend placeholders so indices into questions/answers line up with the rows
# of qa.mat (row 1 is the query itself)
answers <- c('DUMMY', qa$answers)
questions <- c(q, qa$questions)
questions[order(qa.mat[1,])[2]]
## [1] "如何選擇合適的保險商品?"
answers[order(qa.mat[1,])[2]]
## [1] "答:\n\n\n\n人生邁入不同階段,所需的保障內容也隨之改變,本公司貼心提供保障缺口分析服務,協助您做好完整的保障規劃,降低未來個人或家庭經濟生活之不確定性。"
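The steps above can be wrapped into a reusable helper; a minimal sketch assuming qa and mixseg from above (the name answer_question is ours):
answer_question <- function(q){
  qa.seg <- lapply(c(q, qa$questions), function(e) segment(code = e, mixseg))
  qa.mat <- as.matrix(proxy::dist(as.matrix(DocumentTermMatrix(Corpus(VectorSource(qa.seg)))),
                                  method = 'cosine'))
  best <- order(qa.mat[1, ])[2]   # row 1 is the query itself
  list(question = qa$questions[best - 1], answer = qa$answers[best - 1])
}
answer_question('如何選擇正確的保險商品')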
Document Clustering
load("C:/Users/user/Downloads/appledaily.RData")
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(appledaily$content, function(article) {
  s <- segment(code = article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
})
doc <- Corpus(VectorSource(apple.seg))
doc <- tm_map(doc, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dim(dtm)
dtm.remove <- removeSparseTerms(dtm,0.995)
dim(dtm.remove)
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
dtm.mat <- as.matrix(dtm.dist)
hc <- hclust(dtm.dist, method = 'ward.D2')
plot(hc, hang=-0.1)
rect.hclust(hc, k = 22)
fit <- cutree(hc, k = 22)
appledaily$title[fit == 8]
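A quick look at how the 1500 articles spread across the 22 hierarchical clusters:
sort(table(fit), decreasing = TRUE)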
#install.packages('igraph')
library(igraph)
# build an adjacency matrix: connect two articles when their cosine distance
# is below 0.5 (i.e. they are similar enough)
filter_score <- function(x) {
  x <- ifelse(x < 0.5, 1, 0)
  return(x)
}
score.csm <- apply(dtm.mat, MARGIN = 2, filter_score)
score.csm[1:3, 1:3]
G <- graph_from_adjacency_matrix(score.csm, mode = c( "undirected"))
wc <- cluster_walktrap(G)
sort(table(wc$membership), decreasing = TRUE)
appledaily$title[wc$membership == 8]
Document Classification
load("C:/Users/user/Downloads/appledaily.RData")
apple.subset <- appledaily[appledaily$category %in% c('財經', '娛樂', '社會' ), ]
apple.subset$category <- as.factor(apple.subset$category)
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(article) {
  s <- segment(code = article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])
})
library(tm)
apple.corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(apple.corpus)
dim(dtm)
## [1] 428 15915
dtm.remove <- removeSparseTerms(dtm, 0.995)
dim(dtm.remove)
## [1] 428 3620
# convert term counts to a present/absent factor so naiveBayes treats each
# term as a categorical feature
convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(dtm.remove, MARGIN = 2, convert_counts)
dtm.count[1:3,1:3]
## Terms
## Docs 口罩 已將 已經
## 1 "Yes" "Yes" "Yes"
## 2 "No" "No" "No"
## 3 "No" "No" "No"
m <- as.data.frame(dtm.count)
set.seed(42)
idx <- sample.int(2, nrow(m), replace=TRUE, prob = c(0.7,0.3))
trainset <- m[idx== 1, ]
traintitle <- apple.subset$title[idx == 1]
traintag <- apple.subset$category[idx==1]
testset <- m[idx== 2, ]
testtitle <- apple.subset$title[idx == 2]
testtag <- apple.subset$category[idx==2]
dim(trainset)
## [1] 298 3620
length(traintag)
## [1] 298
dim(testset)
## [1] 130 3620
length(testtag)
## [1] 130
library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
tb<- table(testtag,pred)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## pred
## testtag 社會 娛樂 財經
## 社會 50 2 0
## 娛樂 1 41 0
## 財經 0 1 35
##
## Overall Statistics
##
## Accuracy : 0.9692
## 95% CI : (0.9231, 0.9916)
## No Information Rate : 0.3923
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9533
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 社會 Class: 娛樂 Class: 財經
## Sensitivity 0.9804 0.9318 1.0000
## Specificity 0.9747 0.9884 0.9895
## Pos Pred Value 0.9615 0.9762 0.9722
## Neg Pred Value 0.9872 0.9659 1.0000
## Prevalence 0.3923 0.3385 0.2692
## Detection Rate 0.3846 0.3154 0.2692
## Detection Prevalence 0.4000 0.3231 0.2769
## Balanced Accuracy 0.9775 0.9601 0.9947
which(pred != testtag)
## [1] 4 113 127 128
i <- 128
pred[i]
## [1] 娛樂
## Levels: 社會 娛樂 財經
testtag[i]
## [1] 財經
## Levels: 社會 娛樂 財經
testtitle[i]
## [1] "大倒角握感如何? 網友試機HTC 10"
# accuracy computed by hand from the confusion matrix above
(50 + 41 + 35) / (50 + 41 + 35 + 2 + 1 + 1)
## [1] 0.9692308
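Naive Bayes can assign zero probability to a class when a term never co-occurs with that class in training; Laplace smoothing avoids this. A minimal sketch re-fitting the model with laplace = 1 (whether it helps here depends on the data):
model.lap <- naiveBayes(trainset, traintag, laplace = 1)
pred.lap <- predict(model.lap, testset)
confusionMatrix(table(testtag, pred.lap))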
Classifying Positive and Negative Movie Reviews
library(readxl)
yahoo_movie <- read_excel("C:/Users/user/Downloads/yahoo_movie.xlsx")
#View(yahoo_movie)
movies <- yahoo_movie[yahoo_movie$status %in% c('good', 'bad'),]
movies <- movies[nchar(movies$content) >=2, ]
movies$status <- as.factor(movies$status)
library(jiebaR)
mixseg <- worker()
movie.seg <- lapply(movies$content, function(article) {
  s <- segment(code = article, jiebar = mixseg)
  return(s[grepl('[\u4e00-\u9fa5]+', s)])   # Chinese-only tokens
})
# overwrite the result above and keep every token instead (the Chinese-only filter is dropped)
movie.seg <- lapply(movies$content, function(article) {
  s <- segment(code = article, jiebar = mixseg)
  return(s)
})
movie.corpus <- Corpus(VectorSource(movie.seg))
dtm <- DocumentTermMatrix(movie.corpus)
dim(dtm)
## [1] 963 4916
# same presence/absence conversion as in the classification section
convert_counts <- function(x){
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(dtm, MARGIN = 2, convert_counts)
m <- as.data.frame(dtm.count)
set.seed(42)
idx <- sample.int(2, nrow(m), replace=TRUE, prob = c(0.7,0.3))
trainset <- m[idx== 1, ]
traintag <- movies$status[idx==1]
testset <- m[idx== 2, ]
testtag <- movies$status[idx==2]
dim(trainset)
## [1] 680 4916
length(traintag)
## [1] 680
dim(testset)
## [1] 283 4916
length(testtag)
## [1] 283
library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
tb<- table(testtag,pred)
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## pred
## testtag bad good
## bad 113 15
## good 77 78
##
## Accuracy : 0.6749
## 95% CI : (0.617, 0.7292)
## No Information Rate : 0.6714
## P-Value [Acc > NIR] : 0.4777
##
## Kappa : 0.3704
## Mcnemar's Test P-Value : 2.022e-10
##
## Sensitivity : 0.5947
## Specificity : 0.8387
## Pos Pred Value : 0.8828
## Neg Pred Value : 0.5032
## Prevalence : 0.6714
## Detection Rate : 0.3993
## Detection Prevalence : 0.4523
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : bad
##
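To score a new, unseen review, its tokens have to be mapped onto the training vocabulary before calling predict(); a minimal sketch assuming mixseg, m, and model from above (classify_review and the sample sentence are ours):
classify_review <- function(text){
  tokens <- segment(code = text, jiebar = mixseg)
  # build a one-row data frame with the same No/Yes factor columns as the training set
  newrow <- as.data.frame(setNames(
    lapply(colnames(m), function(term)
      factor(ifelse(term %in% tokens, 'Yes', 'No'), levels = c('No', 'Yes'))),
    colnames(m)), check.names = FALSE)
  predict(model, newrow)
}
classify_review('劇情緊湊,特效精彩,非常好看')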
Yahoo Crawler
library(httr)
##
## Attaching package: 'httr'
## The following object is masked from 'package:caret':
##
## progress
## The following object is masked from 'package:NLP':
##
## content
library(rvest)
reviews_vec <- c()
for(i in seq(1, 3)){
  # each result page holds a batch of reviews; the review text is the 3rd <span>
  # inside every #form_good1 node
  page <- read_html(paste0('https://movies.yahoo.com.tw/movieinfo_review.html/id=7352?sort=update_ts&order=desc&page=', i))
  reviews <- page %>% html_nodes('#form_good1')
  for(rec in reviews){
    reviews_vec <- c(reviews_vec, rec %>% html_nodes('span') %>% .[3] %>% html_text)
  }
}
reviews_vec
## [1] "大部分場景在中國 決戰地卻在日本 應該是為了要迎合中國市場的片子 不要動腦的話覺得這部還好 大概就3.6分吧 不能再高了"
## [2] "我覺得以爽片來說,這次續集絕對符合達標,但就是少了第一集的末日絕望感,但續集可以看到很多隻機甲戰士也算彌補這個缺點了。"
## [3] "覺得第二集確實沒有第一集當初給的感動,不過卻也是有另外一種看頭,希望第三集可以結合一二集的特長。"
## [4] "有趣的題材,可以拍成這樣平淡無聊,我也是服了!"
## [5] "其實,機甲打怪獸的橋段都相當喜歡啊!非常精彩!看到一個不錯客觀的影評,分享給大家參考:https://youtu.be/nUytG9s26OY"
## [6] "期待哪麼久的片子 非常的失望,與第一集相比第二集是為了年輕人硬拍的片子,影片該有的陳續 劇情 根本連接不起來 就是特效片,跟第一集看完的感動跟看到主角努力重頭來過,根本無法相比 還不如不要拍"
## [7] "不知道看了什!感覺是不同導演,不同特效做的..."
## [8] "超好看,說不好看的眼睛是瞎了嗎!"
## [9] "劇情比第一集薄弱些,除此之外,創意、畫面都比第一集強,怪物跟機器人都大升級,當爽片看的話,比第一集爽了些;抱著爽片的心情去看的話,這部片值得4.5以上。"
## [10] "如果我唸小學,應該會覺得這部片好看。。。"
## [11] "知道負評一堆,但我哥堅持要看。抱著有些些網友說,當爽片看的心,看了。結果…糟點太多,我都不知從何吐嘈起。看完整個怒阿,整部戲一路眼神死加出戲。真心覺得網友負評好實在,好貼切,把我心聲都說出來了^_^。不辜負那3.6分阿,應該還會降低吧~"
## [12] "打鬥場面很少,除了吉普賽,其他隻出來幾分鐘而已"
## [13] "第一集的故事、演員整體表現、畫面<U+22EF>簡直是<f0><U+009F><U+0092><U+00AF>\r\n可能是第一集太好了\r\n第二集的故事性顯得有些草率\r\n尤其是看到景甜的演技屢屢讓我出戲\r\n實在很扣分"
## [14] "覺得失望,跟第一集差太多。第二集幾乎沒有劇情也沒有重點,慶幸沒有買4DX的票。"
## [15] "是也沒有那麼難看,當爽片看娛樂性很高,\r\n無論如何都比金剛戰士好看太多"
## [16] "看完環2會想再看一次環1,比較喜歡環1的人物和劇情,環2的重點放對地方會好看許多~~~"
## [17] "第一集真讚\r\n為了字數打第二排字\r\n為了字數打第三排字"
## [18] "2真的超爛,硬要找個黑鬼男當主角沒顏質,還很弱也能當戰士,瞎\r\n多年過後機器人也沒啥升級,然後搞個敵人突襲就可以把他們打趴了\r\n沒有一個人可以迎戰瞎爆,劇情比不上1不打緊,連個特效和顏質都贏不了慘"
## [19] "工讀生毀了這部片~~~真的~~~"
## [20] "沒什麼劇情\r\n\r\n沒有第一集的感動\r\n\r\n連打鬥的場面都還好"
## [21] "黑豹我給7.5分,這部我給2分。唯一亮點男主角很帥?真的沒第一部好看。中心駕駛就只有2個男主?然後那幾個訓練生?完全不合常理。因為受到突然攻擊全部掛掉?代表之前的訓練爛到爆。大家集合給飛彈打?完全不和常理。還有落下時早該全內出血爆了,裡面又沒任何防護。"
## [22] "簡直為景甜量身打造,從頭到尾刷存在感"
## [23] "不錯看呀 , 沒看過第一集也能看得懂。機甲戰士多了好幾隻"
## [24] "超級無敵大爛片!就像是鹹蛋超人打酷斯拉"
## [25] "一級爛片,一個按鈕全部怪獸死光,看到睡著"
## [26] "看到景田就要puke了<f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082><f0><U+009F><U+0098><U+0082>"
## [27] "這部片名應該要改成環太平洋之變形金剛5+ID4星際重生,\r\n頂著環太平洋系列卻沒依循該系列的世界觀設定,\r\n就好像外盒寫著鼎泰豐裡面卻是路邊攤一樣,\r\n真的是非常的不好看,\r\n如果有看過第一集的更加千萬別去看,\r\n連加點吉拿棒-超粒方他們直播討論都罵到不行,\r\n專業影評都說爛了,\r\n你們總該相信了吧,\r\n\r\n~~真的不要去看,不然你會後悔~~"
## [28] "劇情跟第一集稍嫌薄弱\r\n中間的一個人講中文 一個人講英文 讓人有點出戲\r\n(真的有人會這樣溝通嗎.......)\r\n還有每個人的情感連結 都只有輕輕帶過 這部分也有點可惜\r\n但因為我對機器人的喜愛~~所以還是給了四顆星\r\n如果是抱持著想看到跟第一集一樣精彩的內容 可能會失望\r\n但如果只是想看機器人打怪獸 還是可以去看一下"
## [29] "一看到景甜就知道完蛋了,忍著滿腔的怒火看完整場!\r\n真的很失望,把我對第一集的好印象都抹殺了!"
## [30] "老實說,昏昏睡睡總時間起碼超過了電影播放一半時間,超級大爛片!"
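The same pages can be scraped without growing reviews_vec inside a loop; a minimal sketch with lapply, assuming the page structure used above:
reviews_list <- lapply(seq(1, 3), function(i){
  url <- paste0('https://movies.yahoo.com.tw/movieinfo_review.html/id=7352?sort=update_ts&order=desc&page=', i)
  nodes <- read_html(url) %>% html_nodes('#form_good1')
  sapply(nodes, function(rec) rec %>% html_nodes('span') %>% .[3] %>% html_text())
})
reviews_vec2 <- unlist(reviews_list)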