System settings

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): OS
## reports that the request to set the locale to "zh_TW.UTF-8" cannot be honored
## [1] ""

Install the required packages

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "caTools", "knitr", "rtweet", "xml2", "httr", "jsonlite", "magrittr", "data.tree")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 3.4.4
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 3.4.4
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.4.4
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.4
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.4.4
library(scales)
## Warning: package 'scales' was built under R version 3.4.4
require(caTools)
## Loading required package: caTools
require(knitr)
## Loading required package: knitr
library(rtweet)
## Warning: package 'rtweet' was built under R version 3.4.4
library(xml2)
## Warning: package 'xml2' was built under R version 3.4.4
library(httr)
## Warning: package 'httr' was built under R version 3.4.4
library(jsonlite)
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:rtweet':
## 
##     flatten
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract
library(data.tree)
## Warning: package 'data.tree' was built under R version 3.4.4

clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # convert encoding, dropping non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (everything from "http" to the end)
  txt = gsub("[\t]{2,}", "", txt) # remove runs of two or more tabs
  txt = gsub("\\n"," ",txt) # replace newlines with spaces
  txt = gsub("&.*;","",txt) # remove HTML character entities
  
  # tidy up whitespace
  txt = gsub("\\s+"," ",txt) # collapse runs of whitespace to a single space
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
  
  # keep only the characters we care about
  txt = gsub("[^a-zA-Z0-9?!.;\" ']","",txt) # drop everything except letters, digits, ?!.;"' and spaces
  txt = gsub("(Mr|Dr|Miss|Ms|Mstr|Rs)\\.","",txt) # drop titles so their periods don't split sentences

  txt
}
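
A quick, illustrative sanity check of clean() on a made-up string (not from the book):

# mentions, an HTML entity, a newline and a URL, all of which clean() strips;
# note the URL rule removes everything from "http" to the end of the string
clean("@jim Check out the map &amp; treasure!!\nMore at http://example.com")
# expected result: "Check out the map treasure!! More at"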

Downloading the book

# Download "Treasure Island", drop rows whose text field is empty, and remove duplicated lines.
# Chapter breaks are lines containing only a number; chapter_ind flags both that
# line and the chapter-title line immediately after it.
red_org <- gutenberg_download(120) %>% filter(text!="") %>% distinct(gutenberg_id, text) %>% 
  mutate(linenumber = row_number(),chapter = cumsum(str_detect(text, regex("^[0-9]+$",ignore_case = TRUE))),
         chapter_ind = str_detect(text, regex("^[0-9]+$",ignore_case = TRUE)) | 
           lag(str_detect(text, regex("^[0-9]+$",ignore_case = TRUE))))
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
# "Treasure Island" chapter headings are hard to recover after a naive merge, so the
# code here takes a slightly roundabout route: pair each chapter number with the
# chapter title on the following line
title_data=red_org %>% filter(chapter_ind) %>% select(text) %>% as.matrix() %>%  matrix(byrow = T,ncol = 2) %>% data.frame() 
colnames(title_data)=c("chapter","chapter_name")

red_org2 <- red_org %>% filter(!chapter_ind & chapter!=0)
split_red_org2 = split(red_org2, red_org2$chapter)

n=length(split_red_org2)
red=lapply(1:n, function(i){
  tmp_red_org=split_red_org2[[i]]
  doc = paste0(tmp_red_org$text,collapse = " ") %>% clean() 
  # split into sentences on '. ', '? ' and '! '
  docVector = unlist(strsplit(doc,"\\. |\\? |\\! "), use.names=FALSE)
  red_tmp = data.frame(gutenberg_id = "120" , text = docVector,chapter=i,stringsAsFactors = FALSE) %>% filter(text!="" & text!=" ")
  return(red_tmp)
}) %>% do.call(what = "rbind") %>%
  mutate(linenumber = row_number())
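
Not shown in the original output, but a quick peek would confirm the one-sentence-per-row layout (columns gutenberg_id, text, chapter, linenumber):

red %>% select(chapter, linenumber, text) %>% head(3) # inspect the first few reshaped rows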

Sentiment analysis

AFINN, Bing, and NRC sentiment scoring

  • Score each sentence with the "AFINN", "Bing" and "NRC" sentiment lexicons separately
  • AFINN scores are summed directly
  • For "Bing" and "NRC", the score is the count of 'positive' words minus the count of 'negative' words
# tokenize into one word per row
red2=red%>%
  unnest_tokens(word, text)

# compute the AFINN sentiment score per sentence
afinn <- red2 %>% 
  left_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber) %>% 
  summarise(sentiment = sum(score,na.rm = T)) %>% 
  mutate(method = "AFINN")
## Joining, by = "word"
# compute the Bing and NRC sentiment scores per sentence
bing_and_nrc <- bind_rows(red2 %>% 
                            left_join(get_sentiments("bing")) %>%
                            mutate(method = "Bing et al."),
                          red2 %>% 
                            left_join(get_sentiments("nrc") %>% 
                                         filter(sentiment %in% c("positive", 
                                                                 "negative"))) %>%
                            mutate(method = "NRC"))%>%
  count(method, index = linenumber, sentiment)%>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)%>% select(index,sentiment,method)
## Joining, by = "word"
## Joining, by = "word"
red3=bind_rows(afinn,
          bing_and_nrc)  %>% spread(method,sentiment) %>% select(index,AFINN,`Bing et al.`,NRC)

Computing sentiment with CoreNLP

Setting up the API call

Server side:

  • Start the CoreNLP server in a terminal first
  • From the CoreNLP directory, run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# Build the CoreNLP API URL; the tokenization rule and the annotation tasks are configurable
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){ # tokenize on more than whitespace by default
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
                 host, port, tokenize.whitespace, annotators)
  invisible(URLencode(url)) # return the encoded URL (invisibly, so bare calls print nothing)
}
generate_API_url("127.0.0.1")

host = "127.0.0.1"
source("sentiment_function.R")

Analyzing sentences with the coreNLP service

Sending the sentences to the service

Retrieving the object returned by coreNLP

load("coreNLP_HW_0330_all_120.RData")
# gc() #釋放不使用的記憶體
# 
# t0 = Sys.time()
# #all
# obj = red[,c(3,4,2)]  %>% filter(text != "") %>% coreNLP(host)
# 
# 
# #先過濾掉沒有內容的的tweet
# #丟入coreNLP的物件 必須符合: 是一個data.frame 且有一個text欄位
# 
# Sys.time() - t0 #執行時間
# #Time difference of 17.89611 mins
# 
# save.image("coreNLP_HW_0330_all_120.RData")
  • Score each sentence's sentiment with coreNLP
  • Merge with the "AFINN", "Bing" and "NRC" scores
  • Map the scores to -1, 0, 1 (negative, neutral, positive) and compute the agreement rate
char2num<-function(x){
  x %>% as.character %>% as.numeric()
}
sentiment = coreNLP_sentiment_parser(obj)
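# note: coreNLP's sentimentValue runs 0 (very negative) to 4 (very positive);
# the next line recenters it to -2..2 so its sign is comparable with the lexicons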
sentiment=sentiment %>% mutate(linenumber=char2num(linenumber),chapter=char2num(chapter),sentimentValue=(char2num(sentimentValue)-2))

red8=red3 %>% left_join(sentiment,c("index"="linenumber"))

# map the sentiment scores to -1, 0, 1
red8_sign=red8 %>% select(AFINN,`Bing et al.`,NRC,sentimentValue) %>% mutate_all(sign)
# pairwise agreement rate: share of sentences where two methods give the same sign
sapply(red8_sign, function(x) sapply(red8_sign, function(y) mean(x==y))) %>% print
##                    AFINN Bing et al.       NRC sentimentValue
## AFINN          1.0000000   0.6488076 0.5419797      0.4723946
## Bing et al.    0.6488076   1.0000000 0.5619079      0.4743548
## NRC            0.5419797   0.5619079 1.0000000      0.4227377
## sentimentValue 0.4723946   0.4743548 0.4227377      1.0000000
  • The table above shows that, at the sentence level, "Bing" and "coreNLP" agree most closely
  • Below, the "Bing" vs "coreNLP" cross-table of sentence-level signs, as proportions
tmp=(table(red8_sign$sentimentValue,red8_sign$`Bing et al.`)/nrow(red8_sign)) 
rownames(tmp)=paste("corenlp",c("neg","neu","pos"))
colnames(tmp)=paste("bing",c("neg","neu","pos"))
tmp
##              
##                 bing neg   bing neu   bing pos
##   corenlp neg 0.18948056 0.18131330 0.13459654
##   corenlp neu 0.06403136 0.21071545 0.07219863
##   corenlp pos 0.01306762 0.06043777 0.07415877
  • The result shows that most of the disagreement concentrates where "coreNLP" says negative but "Bing" says positive
  • So we print out some of these inconsistent sentences
red_tmp=red8 %>% filter(sign(sentimentValue)==-1,sign(`Bing et al.`)==1)
red_tmp %>% head(10) %>% kable
| index | AFINN | Bing et al. | NRC | chapter | text | sentiment | sentimentValue |
|------:|------:|------------:|----:|--------:|:-----|:----------|---------------:|
| 1 | 4 | 3 | 5 | 1 | SQUIRE TRELAWNEY Livesey and the rest of these gentlemen having asked me to write down the whole particulars about Treasure Island from the beginning to the end keeping nothing back but the bearings of the island and that only because there is still treasure not yet lifted I take up my pen in the year of grace 17 and go back to the time when my father kept the Admiral Benbow inn and the brown old seaman with the sabre cut first took up his lodging under our roof | Negative | -1 |
| 14 | 0 | 1 | 1 | 1 | Oh I see what you’re atthere“; and he threw down three or four gold pieces on the threshold | Negative | -1 |
| 24 | 0 | 1 | 3 | 1 | When a seaman did put up at the Admiral Benbow as now and then some did making by the coast road for Bristol he would look in at him through the curtained door before he entered the parlour; and he was always sure to be as silent as a mouse when any such was present | Negative | -1 |
| 26 | 1 | 1 | 0 | 1 | He had taken me aside one day and promised me a silver fourpenny on the first of every month if I would only keep my “weathereye open for a seafaring man with one leg” and let him know the moment he appeared | Negative | -1 |
| 52 | 5 | 3 | 1 | 1 | I followed him in and I remember observing the contrast the neat bright doctor with his powder as white as snow and his bright black eyes and pleasant manners made with the coltish country folk and above all with that filthy heavy bleared scarecrow of a pirate of ours sitting far gone in rum with his arms on the table | Negative | -1 |
| 55 | 1 | 1 | 3 | 1 | But by this time we had all long ceased to pay any particular notice to the song; it was new that night to nobody but Livesey and on him I observed it did not produce an agreeable effect for he looked up for a moment quite angrily before he went on with his talk to old Taylor the gardener on a new cure for the rheumatics | Negative | -1 |
| 61 | 7 | 3 | 2 | 1 | He spoke to him as before over his shoulder and in the same tone of voice rather high so that all the room might hear but perfectly calm and steady “If you do not put that knife this instant in your pocket I promise upon my honour you shall hang at the next assizes.” Then followed a battle of looks between them but the captain soon knuckled under put up his weapon and resumed his seat grumbling like a beaten dog | Negative | -1 |
| 67 | 0 | 1 | -1 | 2 | He sank daily and my mother and I had all the inn upon our hands and were kept busy enough without paying much regard to our unpleasant guest | Negative | -1 |
| 71 | 0 | 1 | 0 | 2 | Well mother was upstairs with father and I was laying the breakfasttable against the captain’s return when the parlour door opened and a man stepped in on whom I had never set my eyes before | Negative | -1 |
| 83 | 3 | 3 | 0 | 2 | We’ll put it for argument like that your captain has a cut on one cheekand we’ll put it if you like that that cheek’s the right one | Negative | -1 |
  • Compute the mean sentiment for every 80 sentences and display it as a barplot
red9=red8 %>% group_by(index_line80=index%/%80) %>% summarise_each(funs(mean),-index,-chapter,-sentiment,-text) 
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
## 
## # Before:
## funs(name = f(.)
## 
## # After: 
## list(name = ~f(.))
## This warning is displayed once per session.
red10=red9%>% gather(method,sentiment,-index_line80)

#cor(red9 %>% select(-index_line80)) %>% print

# to aggregate by chapter instead, use this code:
# red10=red8 %>% group_by(index_line80=chapter) %>% summarise_each(funs(mean),-index,-chapter,-sentiment,-text) %>% gather(method,sentiment,-index_line80)

red10%>%
  ggplot(aes(index_line80, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

  • The chart shows that "coreNLP" rates most sentences as negative
  • Since this novel's sentences tend to be long, we suspect "coreNLP" performs worse on longer sentences
  • So we examine whether sentence length is related to the sentiment label
  • The ANOVA shows sentence length differs significantly across sentiment levels
par(mfrow=c(1,1))
red_len=red8 %>% mutate(sentence_len=lengths(gregexpr("\\W+", text)) + 1) # rough word count per sentence
m1=aov(log(sentence_len)~factor(sentimentValue),data=red_len)
summary(m1)
##                          Df Sum Sq Mean Sq F value Pr(>F)    
## factor(sentimentValue)    4  388.7   97.17   216.2 <2e-16 ***
## Residuals              3056 1373.2    0.45                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(red_len$sentimentValue %>% as.factor(),red_len$sentence_len,xlab="sentiment value",ylab="word count")
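
Not part of the original analysis, but since m1 is an aov fit, pairwise comparisons between the five sentiment levels come for free from base R:

TukeyHSD(m1) # optional Tukey HSD post-hoc comparison between sentiment levels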

  • Compare word frequencies (as proportions) between the sentences "coreNLP" labels positive and those it labels negative
  • Word size reflects the proportion difference: the bigger the difference, the bigger the word
  • Red marks words more frequent in positive sentences than in negative ones; black marks the reverse
red_tmp1=red8 %>% filter(sign(sentimentValue)==-1)
red_bing=red2 %>% filter(linenumber %in% red_tmp1$index)
tmp1=red_bing %>%  group_by(word) %>%
  summarise(count=n()) %>%ungroup %>% mutate(pro=count/sum(count))%>% arrange(desc(count))

red_tmp2=red8 %>% filter(sign(sentimentValue)==1)
red_bing2=red2 %>% filter(linenumber %in% red_tmp2$index) 
tmp2=red_bing2 %>%  group_by(word) %>%
  summarise(count=n())%>%ungroup%>% mutate(pro=count/sum(count))%>% arrange(desc(count))



tmp_all=tmp1 %>% full_join(tmp2,by=c("word")) %>% mutate_at(vars(-word), function(x) ifelse(is.na(x),0,x)) %>% 
  anti_join(stop_words)
## Joining, by = "word"
tmp_all2=tmp_all %>% mutate(abs_pro_diff=abs(pro.y-pro.x),sign_pro_diff=sign(pro.y-pro.x)) %>% mutate(freq=abs_pro_diff)%>% arrange(desc(abs_pro_diff)) %>% select(word,abs_pro_diff,sign_pro_diff,freq) %>% top_n(60,abs_pro_diff);
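# (sign_pro_diff + 3)/2 maps -1 -> 1 and 1 -> 2, picking black for words more
# frequent in negative sentences and red for words more frequent in positive ones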
wordcloud2(tmp_all2,color = c("black","red")[(tmp_all2$sign_pro_diff+3)/2])

Extracting tokenization, lemmatization, POS-tagging and named-entity results from the returned object

tokens
tokens =  coreNLP_tokens_parser(obj)
tokens %>% head
##   chapter linenumber      word     lemma pos    ner
## 1       1          1    SQUIRE    SQUIRE NNP      O
## 2       1          1 TRELAWNEY TRELAWNEY NNP PERSON
## 3       1          1   Livesey   Livesey NNP PERSON
## 4       1          1       and       and  CC      O
## 5       1          1       the       the  DT      O
## 6       1          1      rest      rest  NN      O

coreNLP_tokens_parser columns:

  • chapter / linenumber : locate the token's source chapter and sentence
  • word : the original token
  • lemma : the lemma (base form) of the token
  • pos : part-of-speech tag
  • ner : named-entity tag

Inspecting specific entity types via NER

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 15, count) %>%
  ungroup() %>%
  mutate(lower_word = reorder(lower_word, count)) %>%
  ggplot(aes(lower_word, count)) +
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(lower_word = reorder(lower_word, count)) %>%
  ggplot(aes(lower_word, count)) +
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

tokens %>%
  anti_join(stop_words,by = c("lower_lemma"="word")) %>% 
  filter(str_detect(lower_lemma, regex("^[a-z].*$",ignore_case = TRUE)))%>%
  group_by(lower_lemma) %>%
  summarise(count = n()) %>% 
  top_n(n = 20, count) %>%
  arrange(desc(count)) %>% kable
| lower_lemma | count |
|:------------|------:|
| hand        |   169 |
| captain     |   159 |
| silver      |   149 |
| time        |   124 |
| doctor      |   123 |
| ship        |   121 |
| lay         |   103 |
| begin       |    91 |
| hear        |    85 |
| run         |    85 |
| jim         |    84 |
| head        |    77 |
| leave       |    75 |
| squire      |    75 |
| sea         |    73 |
| fall        |    72 |
| word        |    72 |
| round       |    69 |
| island      |    68 |
| eye         |    67 |