Abstract
TF-IDF 與 Bigram 情緒分析Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: tidytext
## Loading required package: jiebaR
## Loading required package: jiebaRD
## Loading required package: gutenbergr
## Loading required package: caTools
## Loading required package: knitr
##
## Attaching package: 'jsonlite'
## The following object is masked from 'package:rtweet':
##
## flatten
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
#load("coreNLP_HW_0407_all_120.RData")
red_word_count <- gutenberg_download(c(120, 98, 11, 74),
meta_fields = "author")%>%
unnest_tokens(word, text) %>%
count(author, word, sort = TRUE)
## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest
## Using mirror http://aleph.gutenberg.org
book_word_count=red_word_count %>% group_by(author) %>% summarise(total=n())
freq_by_rank=red_word_count %>% left_join(book_word_count,by="author") %>%
group_by(author) %>%
mutate(TF=n/total,rank=row_number()) ;
m1=lm(log10(TF)~log10(rank),data = freq_by_rank)
summary(m1)
##
## Call:
## lm(formula = log10(TF) ~ log10(rank), data = freq_by_rank)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.80666 -0.05566 0.01383 0.06783 0.14989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.505005 0.004003 126.1 <2e-16 ***
## log10(rank) -1.157971 0.001160 -998.6 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08702 on 26194 degrees of freedom
## Multiple R-squared: 0.9744, Adjusted R-squared: 0.9744
## F-statistic: 9.972e+05 on 1 and 26194 DF, p-value: < 2.2e-16
freq_by_rank %>%
ggplot(aes(rank, TF, color = factor(author))) +
geom_abline(intercept = 0.5, slope = -1.16, color = "gray50", linetype = 2) +
geom_line(size = 1.1, alpha = 0.8, show.legend = T) +
scale_x_log10() +
scale_y_log10()+
scale_colour_discrete(name = "人物",breaks=levels(factor(freq_by_rank$author)),labels =c("Alice","Two Cities","Treasure","Tom"))
得出結果基本符合 Zipf’s law
red_word_count <- red_word_count %>%
bind_tf_idf(word, author, n)
red_word_count %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(author) %>%
top_n(20) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill = author)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~author, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
load("coreNLP_HW_0407_all_120.RData")
## Warning: namespace 'tmcn' is not available and has been replaced
## by .GlobalEnv when processing object '.tmcnEnv'
## Attempting to load the environment 'package:tmcn'
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'tmcn'
## Specified environment not found: using '.GlobalEnv' instead
red_bigram <- red %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
red_bigram%>%
count(bigram, sort = TRUE) %>% head(10) %>% kable
| bigram | n |
|---|---|
| of the | 498 |
| in the | 276 |
| and the | 232 |
| it was | 207 |
| on the | 181 |
| and i | 174 |
| i was | 173 |
| to the | 159 |
| i had | 156 |
| the captain | 147 |
bigrams_separated <- red_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% setdiff(stop_words$word,negation.words)) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts %>% na.omit%>% head(10) %>% kable
| word1 | word2 | n |
|---|---|---|
| ben | gunn | 31 |
| captain | smollett | 30 |
| black | dog | 18 |
| block | house | 18 |
| john | silver | 16 |
| admiral | benbow | 15 |
| cried | silver | 14 |
| no | time | 12 |
| jim | hawkins | 11 |
| north | inlet | 11 |
data.frame(negation_words=negation.words) %>% kable
| negation_words |
|---|
| ain’t |
| aren’t |
| can’t |
| couldn’t |
| didn’t |
| doesn’t |
| don’t |
| hasn’t |
| isn’t |
| mightn’t |
| mustn’t |
| neither |
| never |
| no |
| nobody |
| nor |
| not |
| shan’t |
| shouldn’t |
| wasn’t |
| weren’t |
| won’t |
| wouldn’t |
negated_words <- bigrams_separated %>%
filter(word1 %in% negation.words) %>%
inner_join(get_sentiments("bing"), by = c(word2 = "word")) %>%
count(word1, word2, sentiment , sort = TRUE) %>%
mutate(score=ifelse(sentiment=="positive",1,-1))
top_negation_vec=negated_words %>% group_by(word1) %>%
summarise(word1_count=sum(abs(n*score))) %>%
top_n(4,wt = word1_count) %>% select(word1) %>% as.matrix() %>% as.vector()
negated_words %>%
filter(word1%in%top_negation_vec)%>%
arrange(desc(-1*n*score)) %>%
mutate(score_sign=(-1*n*score)>0)%>%
mutate(word2 = factor(word2, levels = rev(unique(word2)))) %>%
group_by(word1) %>%
top_n(15,wt = abs(n*score)) %>%
ungroup() %>%
ggplot(aes(word2,-1*n*score , fill = score_sign)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~word1, ncol = 2, scales = "free") +
coord_flip()
negated_words_with_linenumber <- bigrams_separated %>%
filter(word1 %in% negation.words) %>%
inner_join(get_sentiments("bing"), by = c(word2 = "word")) %>%
count(linenumber,word1, word2, sentiment , sort = TRUE) %>%
group_by(linenumber) %>%
summarise(total_bigram_score=sum(-1*ifelse(sentiment=="positive",1,-1)*n)) %>% arrange(linenumber)
red3_bing=red3 %>% select(index,`Bing et al.`) %>% left_join(negated_words_with_linenumber,by=c("index"="linenumber"))
red3_bing=red3_bing %>% mutate(`Bing et al.new`=`Bing et al.`+ 2*ifelse(is.na(total_bigram_score),0,total_bigram_score))
red3_bing2=red3_bing %>% select(index,`Bing et al.`,`Bing et al.new`)
tmp_table=red3_bing2 %>% select(`Bing et al.`,`Bing et al.new`) %>% mutate_all(sign) %>% table
rownames(tmp_table)=paste("bing",c("負","中","正"),sep="<br>")
colnames(tmp_table)=paste("bigram bing",c("負","中","正"),sep="<br>")
tmp_table %>% kable
| bigram bing 負 |
bigram bing 中 |
bigram bing 正 |
|
|---|---|---|---|
| bing 負 |
786 | 9 | 21 |
| bing 中 |
4 | 1369 | 12 |
| bing 正 |
23 | 4 | 833 |
red8_bing=red3_bing2 %>% left_join(sentiment,c("index"="linenumber"))
change_sentiment_text=red8_bing %>% filter(sign(`Bing et al.`)!=sign(`Bing et al.new`))
change_sentiment_text %>% head(10) %>% kable
| index | Bing et al. | Bing et al.new | chapter | text | sentiment | sentimentValue |
|---|---|---|---|---|---|---|
| 156 | -3 | 1 | 2 | “It’s the name of a buccaneer of my acquaintance; and I call you by it for the sake of shortness and what I have to say to you is this; one glass of rum won’t kill you but if you take one you’ll take another and another and I stake my wig if you don’t break off short you’ll diedo you understand that?die and go to your own place like the man in the Bible | Negative | -1 |
| 177 | -1 | 1 | 3 | Your doctor hisself said one glass wouldn’t hurt me | Neutral | 0 |
| 189 | -2 | 2 | 3 | I never wasted good money of mine nor lost it neither; and I’ll trick ’em again | Verynegative | -2 |
| 190 | -1 | 1 | 3 | I’m not afraid on ’em | Neutral | 0 |
| 205 | 1 | -1 | 3 | But you won’t peach unless they get the black spot on me or unless you see that Black Dog again or a seafaring man with one leg Jimhim above all." “But what is the black spot captain?” I asked | Negative | -1 |
| 267 | -1 | 1 | 4 | But there was no unusual soundnothing but the low wash of the ripple and the croaking of the inmates of the wood | Negative | -1 |
| 431 | -1 | 1 | 6 | His eyebrows were very black and moved readily and this gave him a look of some temper not bad you would say but quick and high | Negative | -1 |
| 527 | 1 | -1 | 7 | “Redruth” said I interrupting the letter " Livesey will not like that | Negative | -1 |
| 553 | 0 | 2 | 7 | So now Livesey come post; do not lose an hour if you respect me | Negative | -1 |
| 584 | 0 | 2 | 7 | In one sailors were singing at their work in another there were men aloft high over my head hanging to threads that seemed no thicker than a spider’s | Negative | -1 |
red8_bing_summarize_chapter_org=red8_bing %>% group_by(chapter) %>%
summarise(mean_bing=mean(`Bing et al.`),mean_bing_new=mean(`Bing et al.new`))
red8_bing_summarize_chapter=red8_bing_summarize_chapter_org%>%
gather(method,sentiment,-chapter)
red8_bing_summarize_chapter=red8_bing_summarize_chapter %>% group_by(method) %>% mutate(high_ind=(sentiment==max(sentiment)),low_ind=(sentiment==min(sentiment)))
red8_bing_summarize_chapter%>%
ggplot(aes(chapter, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = ifelse(high_ind,chapter,"")), vjust = 1,fontface="bold",size=4,color="blue")+
geom_text(aes(label = ifelse(low_ind,chapter,"")), vjust = -0.5,fontface="bold",size=4,color="darkred")+
facet_wrap(~method, ncol = 1, scales = "free_y")
tmp=red8_bing_summarize_chapter_org %>% filter(sign(mean_bing)!=sign(mean_bing_new));
tmp %>%
left_join(title_data,by=c("chapter"="chapter")) %>% kable
| chapter | mean_bing | mean_bing_new | chapter_name |
|---|---|---|---|
| 22 | -0.0483871 | 0.016129 | How My Sea Adventure Began |
| 29 | 0.0291971 | 0.000000 | The Black Spot Again |