Abstract
情緒分析 Treasure IslandSys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.4.4
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 3.4.4
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(scales)
## Warning: package 'scales' was built under R version 3.4.4
require(caTools)
## Loading required package: caTools
require(knitr)
## Loading required package: knitr
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
## Warning: package 'rtweet' was built under R version 3.4.4
library(dplyr)
library(xml2)
## Warning: package 'xml2' was built under R version 3.4.4
library(httr)
## Warning: package 'httr' was built under R version 3.4.4
library(jsonlite)
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:rtweet':
##
## flatten
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(data.tree)
## Warning: package 'data.tree' was built under R version 3.4.4
library(tidytext)
library(stringr)
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") #轉換編碼
txt = gsub("(@|#)\\w+", "", txt) #去除@或#後有數字,字母,底線 (標記人名或hashtag)
txt = gsub("(http|https)://.*", "", txt) #去除網址
txt = gsub("[\t]{2,}", "", txt) #去除兩個以上的tab
txt = gsub("\\n"," ",txt) #去除換行
txt = gsub("&.*;","",txt) #去除html特殊字元編碼
#最後再整理空格
txt = gsub("\\s+"," ",txt) #去除一個以上的空格
txt = gsub("^\\s+|\\s+$","",txt) #去除前後一個以上的空格
#只留下我們想看的字元
txt = gsub("[^a-zA-Z0-9?!.;\" ']","",txt) #除了字母,數字 ?!.' ,空白的都去掉
txt=gsub("(Mr|Dr|Miss|Ms|Mstr|Rs|Dr)\\.","",txt)
txt }
# 下載 "Treasure Island" 書籍,並且將text欄位為空的行給清除,以及將重複的語句清除 338
red_org <- gutenberg_download(120) %>% filter(text!="") %>% distinct(gutenberg_id, text) %>%
mutate(linenumber = row_number(),chapter = cumsum(str_detect(text, regex("^[0-9]+$",ignore_case = TRUE))),
chapter_ind = str_detect(text, regex("^[0-9]+$",ignore_case = TRUE)) |
lag(str_detect(text, regex("^[0-9]+$",ignore_case = TRUE))))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# 因為 "Treasure Island" 這本書,若直接合併,章節較難尋找,所以這裡的 code 比較迂迴一點
title_data=red_org %>% filter(chapter_ind) %>% select(text) %>% as.matrix() %>% matrix(byrow = T,ncol = 2) %>% data.frame()
colnames(title_data)=c("chapter","chapter_name")
red_org2<- red_org %>% filter(!chapter_ind & chapter!=0)
split_red_org2=split(red_org2,red_org2$chapter);
n=length(split_red_org2)
red=lapply(1:n, function(i){
tmp_red_org=split_red_org2[[i]];
doc = paste0(tmp_red_org$text,collapse = " ") %>% clean()
#利用 '. ','? '和'! '分成不同句子
docVector = unlist(strsplit(doc,"\\. |\\? |\\! "), use.names=FALSE)
red_tmp = data.frame(gutenberg_id = "120" , text = docVector,chapter=i,stringsAsFactors = FALSE) %>% filter(text!="" & text!=" ")
return(red_tmp)
}) %>% do.call(what = "rbind")%>%
mutate(linenumber = row_number());
#斷詞
red2=red%>%
unnest_tokens(word, text)
#計算 "AFinn" 情緒值
afinn <- red2 %>%
left_join(get_sentiments("afinn")) %>%
group_by(index = linenumber) %>%
summarise(sentiment = sum(score,na.rm = T)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
#計算"bing" 和 "NRC"情緒值
bing_and_nrc <- bind_rows(red2 %>%
left_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
red2 %>%
left_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive",
"negative"))) %>%
mutate(method = "NRC"))%>%
count(method, index = linenumber, sentiment)%>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)%>% select(index,sentiment,method)
## Joining, by = "word"
## Joining, by = "word"
red3=bind_rows(afinn,
bing_and_nrc) %>% spread(method,sentiment) %>% select(index,AFINN,`Bing et al.`,NRC)
server端 : + 需先在terminal開啟corenlp server + 在corenlp的路徑下開啟terminal輸入 java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# 生產core-nlp的api url,可以設定斷詞依據、以及要標註的任務
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ #斷詞依據不是空格
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
generate_API_url("127.0.0.1")
host = "127.0.0.1"
source("sentiment_function.R")
取得coreNLP回傳的物件
load("coreNLP_HW_0330_all_120.RData")
# gc() #釋放不使用的記憶體
#
# t0 = Sys.time()
# #all
# obj = red[,c(3,4,2)] %>% filter(text != "") %>% coreNLP(host)
#
#
# #先過濾掉沒有內容的的tweet
# #丟入coreNLP的物件 必須符合: 是一個data.frame 且有一個text欄位
#
# Sys.time() - t0 #執行時間
# #Time difference of 17.89611 mins
#
# save.image("coreNLP_HW_0330_all_120.RData")
char2num<-function(x){
x %>% as.character %>% as.numeric()
}
sentiment = coreNLP_sentiment_parser(obj)
sentiment=sentiment %>% mutate(linenumber=char2num(linenumber),chapter=char2num(chapter),sentimentValue=(char2num(sentimentValue)-2))
red8=red3 %>% left_join(sentiment,c("index"="linenumber"))
#將情緒值轉換成 -1,0,1
red8_sign=red8 %>% select(AFINN,`Bing et al.`,NRC,sentimentValue) %>% mutate_all(sign)
#計算重疊率
sapply(red8_sign, function(x) sapply(red8_sign, function(y) mean(x==y))) %>% print
## AFINN Bing et al. NRC sentimentValue
## AFINN 1.0000000 0.6488076 0.5419797 0.4723946
## Bing et al. 0.6488076 1.0000000 0.5619079 0.4743548
## NRC 0.5419797 0.5619079 1.0000000 0.4227377
## sentimentValue 0.4723946 0.4743548 0.4227377 1.0000000
tmp=(table(red8_sign$sentimentValue,red8_sign$`Bing et al.`)/nrow(red8_sign))
rownames(tmp)=paste("corenlp",c("負","中","正"))
colnames(tmp)=paste("bing",c("負","中","正"))
tmp
##
## bing 負 bing 中 bing 正
## corenlp 負 0.18948056 0.18131330 0.13459654
## corenlp 中 0.06403136 0.21071545 0.07219863
## corenlp 正 0.01306762 0.06043777 0.07415877
red_tmp=red8 %>% filter(sign(sentimentValue)==-1,sign(`Bing et al.`)==1)
red_tmp %>% head(10) %>% kable
| index | AFINN | Bing et al. | NRC | chapter | text | sentiment | sentimentValue |
|---|---|---|---|---|---|---|---|
| 1 | 4 | 3 | 5 | 1 | SQUIRE TRELAWNEY Livesey and the rest of these gentlemen having asked me to write down the whole particulars about Treasure Island from the beginning to the end keeping nothing back but the bearings of the island and that only because there is still treasure not yet lifted I take up my pen in the year of grace 17 and go back to the time when my father kept the Admiral Benbow inn and the brown old seaman with the sabre cut first took up his lodging under our roof | Negative | -1 |
| 14 | 0 | 1 | 1 | 1 | Oh I see what you’re atthere“; and he threw down three or four gold pieces on the threshold | Negative | -1 |
| 24 | 0 | 1 | 3 | 1 | When a seaman did put up at the Admiral Benbow as now and then some did making by the coast road for Bristol he would look in at him through the curtained door before he entered the parlour; and he was always sure to be as silent as a mouse when any such was present | Negative | -1 |
| 26 | 1 | 1 | 0 | 1 | He had taken me aside one day and promised me a silver fourpenny on the first of every month if I would only keep my “weathereye open for a seafaring man with one leg” and let him know the moment he appeared | Negative | -1 |
| 52 | 5 | 3 | 1 | 1 | I followed him in and I remember observing the contrast the neat bright doctor with his powder as white as snow and his bright black eyes and pleasant manners made with the coltish country folk and above all with that filthy heavy bleared scarecrow of a pirate of ours sitting far gone in rum with his arms on the table | Negative | -1 |
| 55 | 1 | 1 | 3 | 1 | But by this time we had all long ceased to pay any particular notice to the song; it was new that night to nobody but Livesey and on him I observed it did not produce an agreeable effect for he looked up for a moment quite angrily before he went on with his talk to old Taylor the gardener on a new cure for the rheumatics | Negative | -1 |
| 61 | 7 | 3 | 2 | 1 | He spoke to him as before over his shoulder and in the same tone of voice rather high so that all the room might hear but perfectly calm and steady “If you do not put that knife this instant in your pocket I promise upon my honour you shall hang at the next assizes.” Then followed a battle of looks between them but the captain soon knuckled under put up his weapon and resumed his seat grumbling like a beaten dog | Negative | -1 |
| 67 | 0 | 1 | -1 | 2 | He sank daily and my mother and I had all the inn upon our hands and were kept busy enough without paying much regard to our unpleasant guest | Negative | -1 |
| 71 | 0 | 1 | 0 | 2 | Well mother was upstairs with father and I was laying the breakfasttable against the captain’s return when the parlour door opened and a man stepped in on whom I had never set my eyes before | Negative | -1 |
| 83 | 3 | 3 | 0 | 2 | We’ll put it for argument like that your captain has a cut on one cheekand we’ll put it if you like that that cheek’s the right one | Negative | -1 |
red9=red8 %>% group_by(index_line80=index%/%80) %>% summarise_each(funs(mean),-index,-chapter,-sentiment,-text)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.)
##
## # After:
## list(name = ~f(.))
## This warning is displayed once per session.
red10=red9%>% gather(method,sentiment,-index_line80)
#cor(red9 %>% select(-index_line80)) %>% print
#若要轉換成是以 chapter 是用這個 code
# red10=red8 %>% group_by(index_line80=chapter) %>% summarise_each(funs(mean),-index,-chapter,-sentiment,-text) %>% gather(method,sentiment,-index_line80)
red10%>%
ggplot(aes(index_line80, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
par(mfrow=c(1,1))
red_len=red8 %>% mutate(sentense_len=lengths(gregexpr("\\W+", text)) + 1)
m1=aov(log(sentense_len)~factor(sentimentValue),data=red_len)
summary(m1)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(sentimentValue) 4 388.7 97.17 216.2 <2e-16 ***
## Residuals 3056 1373.2 0.45
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(red_len$sentimentValue %>% as.factor(),red_len$sentense_len,xlab="情緒值",ylab="字數")
red_tmp1=red8 %>% filter(sign(sentimentValue)==-1)
red_bing=red2 %>% filter(linenumber %in% red_tmp1$index)
tmp1=red_bing %>% group_by(word) %>%
summarise(count=n()) %>%ungroup %>% mutate(pro=count/sum(count))%>% arrange(desc(count))
red_tmp2=red8 %>% filter(sign(sentimentValue)==1)
red_bing2=red2 %>% filter(linenumber %in% red_tmp2$index)
tmp2=red_bing2 %>% group_by(word) %>%
summarise(count=n())%>%ungroup%>% mutate(pro=count/sum(count))%>% arrange(desc(count))
tmp_all=tmp1 %>% full_join(tmp2,by=c("word")) %>% mutate_at(vars(-word), function(x) ifelse(is.na(x),0,x)) %>%
anti_join(stop_words)
## Joining, by = "word"
tmp_all2=tmp_all %>% mutate(abs_pro_diff=abs(pro.y-pro.x),sign_pro_diff=sign(pro.y-pro.x)) %>% mutate(freq=abs_pro_diff)%>% arrange(desc(abs_pro_diff)) %>% select(word,abs_pro_diff,sign_pro_diff,freq) %>% top_n(60,abs_pro_diff);
wordcloud2(tmp_all2,color = c("black","red")[(tmp_all2$sign_pro_diff+3)/2])
tokens = coreNLP_tokens_parser(obj)
tokens %>% head
## chapter linenumber word lemma pos ner
## 1 1 1 SQUIRE SQUIRE NNP O
## 2 1 1 TRELAWNEY TRELAWNEY NNP PERSON
## 3 1 1 Livesey Livesey NNP PERSON
## 4 1 1 and and CC O
## 5 1 1 the the DT O
## 6 1 1 rest rest NN O
coreNLP_tokens_parser欄位:
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 15, count) %>%
ungroup() %>%
mutate(lower_word = reorder(lower_word, count)) %>%
ggplot(aes(lower_word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(lower_word = reorder(lower_word, count)) %>%
ggplot(aes(lower_word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
anti_join(stop_words,by = c("lower_lemma"="word")) %>%
filter(str_detect(lower_lemma, regex("^[a-z].*$",ignore_case = TRUE)))%>%
group_by(lower_lemma) %>%
summarise(count = n()) %>%
top_n(n = 20, count) %>%
arrange(desc(count)) %>% kable
| lower_lemma | count |
|---|---|
| hand | 169 |
| captain | 159 |
| silver | 149 |
| time | 124 |
| doctor | 123 |
| ship | 121 |
| lay | 103 |
| begin | 91 |
| hear | 85 |
| run | 85 |
| jim | 84 |
| head | 77 |
| leave | 75 |
| squire | 75 |
| sea | 73 |
| fall | 72 |
| word | 72 |
| round | 69 |
| island | 68 |
| eye | 67 |