系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的packages

## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: tidytext
## Loading required package: jiebaR
## Loading required package: jiebaRD
## Loading required package: gutenbergr
## Loading required package: caTools
## Loading required package: knitr
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:rtweet':
## 
##     flatten
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
## 
##     extract

書籍下載

#load("coreNLP_HW_0407_all_120.RData")

red_word_count <- gutenberg_download(c(120, 98, 11, 74), 
                              meta_fields = "author")%>%
  unnest_tokens(word, text) %>%
  count(author, word, sort = TRUE)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
book_word_count=red_word_count %>% group_by(author) %>% summarise(total=n()) 

freq_by_rank=red_word_count  %>% left_join(book_word_count,by="author") %>% 
  group_by(author) %>% 
  mutate(TF=n/total,rank=row_number()) ;

m1=lm(log10(TF)~log10(rank),data = freq_by_rank)
summary(m1)
## 
## Call:
## lm(formula = log10(TF) ~ log10(rank), data = freq_by_rank)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.80666 -0.05566  0.01383  0.06783  0.14989 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.505005   0.004003   126.1   <2e-16 ***
## log10(rank) -1.157971   0.001160  -998.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08702 on 26194 degrees of freedom
## Multiple R-squared:  0.9744, Adjusted R-squared:  0.9744 
## F-statistic: 9.972e+05 on 1 and 26194 DF,  p-value: < 2.2e-16
freq_by_rank %>% 
  ggplot(aes(rank, TF, color = factor(author))) + 
  geom_abline(intercept = 0.5, slope = -1.16, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = T) + 
  scale_x_log10() +
  scale_y_log10()+ 
  scale_colour_discrete(name = "人物",breaks=levels(factor(freq_by_rank$author)),labels =c("Alice","Two Cities","Treasure","Tom"))

得出結果基本符合 Zipf’s law

TF-IDF 結果

red_word_count <- red_word_count  %>%
  bind_tf_idf(word, author, n)

red_word_count %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(author) %>% 
  top_n(20) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = author)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~author, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

加入 bigram 進行分析

load("coreNLP_HW_0407_all_120.RData")
## Warning: namespace 'tmcn' is not available and has been replaced
## by .GlobalEnv when processing object '.tmcnEnv'
## Attempting to load the environment 'package:tmcn'
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'tmcn'
## Specified environment not found: using '.GlobalEnv' instead
red_bigram <- red  %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

red_bigram%>%
  count(bigram, sort = TRUE) %>% head(10) %>% kable
bigram n
of the 498
in the 276
and the 232
it was 207
on the 181
and i 174
i was 173
to the 159
i had 156
the captain 147
bigrams_separated <- red_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% setdiff(stop_words$word,negation.words)) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts %>% na.omit%>% head(10) %>% kable
word1 word2 n
ben gunn 31
captain smollett 30
black dog 18
block house 18
john silver 16
admiral benbow 15
cried silver 14
no time 12
jim hawkins 11
north inlet 11

使用 bigram 進行情緒分析

data.frame(negation_words=negation.words) %>% kable
negation_words
ain’t
aren’t
can’t
couldn’t
didn’t
doesn’t
don’t
hasn’t
isn’t
mightn’t
mustn’t
neither
never
no
nobody
nor
not
shan’t
shouldn’t
wasn’t
weren’t
won’t
wouldn’t
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation.words) %>%
  inner_join(get_sentiments("bing"), by = c(word2 = "word")) %>%
  count(word1, word2, sentiment , sort = TRUE) %>% 
  mutate(score=ifelse(sentiment=="positive",1,-1))



top_negation_vec=negated_words %>% group_by(word1) %>% 
  summarise(word1_count=sum(abs(n*score))) %>% 
  top_n(4,wt = word1_count) %>% select(word1) %>% as.matrix() %>% as.vector()

negated_words %>% 
  filter(word1%in%top_negation_vec)%>% 
  arrange(desc(-1*n*score)) %>% 
  mutate(score_sign=(-1*n*score)>0)%>%
  mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>% 
  group_by(word1) %>%   
  top_n(15,wt = abs(n*score)) %>% 
  ungroup() %>%
  ggplot(aes(word2,-1*n*score , fill = score_sign)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~word1, ncol = 2, scales = "free") +
  coord_flip()

negated_words_with_linenumber <- bigrams_separated %>%
  filter(word1 %in% negation.words) %>%
  inner_join(get_sentiments("bing"), by = c(word2 = "word")) %>%
  count(linenumber,word1, word2, sentiment , sort = TRUE) %>% 
  group_by(linenumber) %>% 
  summarise(total_bigram_score=sum(-1*ifelse(sentiment=="positive",1,-1)*n)) %>% arrange(linenumber)



red3_bing=red3 %>% select(index,`Bing et al.`) %>% left_join(negated_words_with_linenumber,by=c("index"="linenumber"))
red3_bing=red3_bing %>% mutate(`Bing et al.new`=`Bing et al.`+ 2*ifelse(is.na(total_bigram_score),0,total_bigram_score))
red3_bing2=red3_bing %>% select(index,`Bing et al.`,`Bing et al.new`)
tmp_table=red3_bing2 %>% select(`Bing et al.`,`Bing et al.new`) %>% mutate_all(sign) %>% table
rownames(tmp_table)=paste("bing",c("負","中","正"),sep="<br>")
colnames(tmp_table)=paste("bigram bing",c("負","中","正"),sep="<br>")
tmp_table %>% kable
bigram bing
bigram bing
bigram bing
bing
786 9 21
bing
4 1369 12
bing
23 4 833
red8_bing=red3_bing2 %>% left_join(sentiment,c("index"="linenumber"))
change_sentiment_text=red8_bing %>% filter(sign(`Bing et al.`)!=sign(`Bing et al.new`))
change_sentiment_text %>% head(10) %>% kable
index Bing et al. Bing et al.new chapter text sentiment sentimentValue
156 -3 1 2 “It’s the name of a buccaneer of my acquaintance; and I call you by it for the sake of shortness and what I have to say to you is this; one glass of rum won’t kill you but if you take one you’ll take another and another and I stake my wig if you don’t break off short you’ll diedo you understand that?die and go to your own place like the man in the Bible Negative -1
177 -1 1 3 Your doctor hisself said one glass wouldn’t hurt me Neutral 0
189 -2 2 3 I never wasted good money of mine nor lost it neither; and I’ll trick ’em again Verynegative -2
190 -1 1 3 I’m not afraid on ’em Neutral 0
205 1 -1 3 But you won’t peach unless they get the black spot on me or unless you see that Black Dog again or a seafaring man with one leg Jimhim above all." “But what is the black spot captain?” I asked Negative -1
267 -1 1 4 But there was no unusual soundnothing but the low wash of the ripple and the croaking of the inmates of the wood Negative -1
431 -1 1 6 His eyebrows were very black and moved readily and this gave him a look of some temper not bad you would say but quick and high Negative -1
527 1 -1 7 “Redruth” said I interrupting the letter " Livesey will not like that Negative -1
553 0 2 7 So now Livesey come post; do not lose an hour if you respect me Negative -1
584 0 2 7 In one sailors were singing at their work in another there were men aloft high over my head hanging to threads that seemed no thicker than a spider’s Negative -1
red8_bing_summarize_chapter_org=red8_bing %>% group_by(chapter) %>% 
  summarise(mean_bing=mean(`Bing et al.`),mean_bing_new=mean(`Bing et al.new`))


red8_bing_summarize_chapter=red8_bing_summarize_chapter_org%>%
  gather(method,sentiment,-chapter)

red8_bing_summarize_chapter=red8_bing_summarize_chapter %>% group_by(method) %>% mutate(high_ind=(sentiment==max(sentiment)),low_ind=(sentiment==min(sentiment)))


red8_bing_summarize_chapter%>%
  ggplot(aes(chapter, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = ifelse(high_ind,chapter,"")), vjust = 1,fontface="bold",size=4,color="blue")+
  geom_text(aes(label = ifelse(low_ind,chapter,"")), vjust = -0.5,fontface="bold",size=4,color="darkred")+
  facet_wrap(~method, ncol = 1, scales = "free_y")

tmp=red8_bing_summarize_chapter_org %>% filter(sign(mean_bing)!=sign(mean_bing_new));
tmp %>% 
  left_join(title_data,by=c("chapter"="chapter")) %>% kable
chapter mean_bing mean_bing_new chapter_name
22 -0.0483871 0.016129 How My Sea Adventure Began
29 0.0291971 0.000000 The Black Spot Again