系統參數設定

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業
## 系統回報無法實現設定語區為 "zh_TW.UTF-8" 的要求

## [1] ""

安裝需要的packages

packages = c("dplyr", "tidytext", "stringr", "wordcloud2","gutenbergr", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales',"data.tree", "NLP", "igraph")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(ggplot2)
library(reshape2)

## 
## Attaching package: 'reshape2'

## The following objects are masked from 'package:data.table':
## 
##     dcast, melt

library(wordcloud)

## Loading required package: RColorBrewer

library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:reshape2':
## 
##     smiths

library(readr)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:readr':
## 
##     col_factor

library(gutenbergr)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

library(data.tree)

下載書籍

scarlet <- gutenberg_download(244)

## Determining mirror for Project Gutenberg from http://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

scarlet$text[1:32] = ""
scarlet <- scarlet %>% filter(text!="")
scarlet <- scarlet %>% 
  mutate(book = "A Study in Scarlet",
         part = cumsum(str_detect(.$text, regex("^PART (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\."))),
         chapter = cumsum(str_detect(.$text, regex("^CHAPTER (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\\..*"))),
         linenumber = row_number())
  
hound <- gutenberg_download(2852) %>% filter(text!="")
hound <- hound %>% 
  mutate(book = "The Hound of the Baskervilles",
         chapter = cumsum(str_detect(.$text, regex("^Chapter [0-9][0-9]?\\..*"))),
         linenumber = row_number())

sign <- gutenberg_download(2097) %>% filter(text!="")
sign <- sign %>% 
  mutate(book = "The Sign of the Four",
         chapter = cumsum(str_detect(.$text, regex("^Chapter (XC|XL|L?X{0,3})(IX|IV|V?I{0,3})"))),
         linenumber = row_number())

valley <- gutenberg_download(3289) %>% filter(text!="")
valley <- valley %>% 
  mutate(book = "The Valley of Fear",
         part = cumsum(str_detect(.$text, regex("^Part [0-9].*"))),
         chapter = cumsum(str_detect(.$text, regex("^Chapter [0-9].*"))),
         linenumber = row_number())

斷詞

tokens_scar <- scarlet %>% unnest_tokens(word, text)
tokens_hound <- hound %>% unnest_tokens(word, text)
tokens_sign <- sign %>% unnest_tokens(word, text)
tokens_valley <- valley %>% unnest_tokens(word, text)
tokens_all <- bind_rows(tokens_scar, tokens_hound, tokens_sign, tokens_valley)

用Bing詞庫分析情緒

scar_bing <- tokens_scar %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

hound_bing <- tokens_hound %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

sign_bing <- tokens_sign %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

valley_bing <- tokens_valley %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

book_bing <- bind_rows(scar_bing, hound_bing, sign_bing, valley_bing) %>% 
  mutate(method = "Bing")
Bing_compare <- ggplot(book_bing, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "Bing", x = NULL, y =NULL)
Bing_compare

此四篇皆偏向負向分數，尤以血字的研究與巴斯克維爾的獵犬十分明顯。

由於前項結果的發現，因此統計出各篇負向詞出現的前十名，試圖解釋負向分數高的原因

scar_tokens_count <- tokens_scar %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>% 
  head(., 10)

## Joining, by = "word"

hound_tokens_count <- tokens_hound %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>% 
  head(., 10)

## Joining, by = "word"

sign_tokens_count <- tokens_sign %>%
  inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>%
  count(book, word, sort = TRUE) %>% 
  head(., 10)

## Joining, by = "word"

valley_tokens_count <- tokens_valley %>% 
  inner_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>% 
  count(book, word, sort = TRUE) %>% 
  head(., 10)

## Joining, by = "word"

all_tokens_count <- bind_rows(scar_tokens_count, hound_tokens_count, sign_tokens_count, valley_tokens_count)
all_tokens_count %>% 
  ggplot(aes(reorder(word, n), n)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, scales = "free_y", ncol = 2) +
  labs(x = "字詞", y = NULL) +
  theme(text=element_text(size=12)) +
  theme(text = element_text(family = "Heiti TC Light")) +
  coord_flip()

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x,
## x$y, : font family not found in Windows font database

結果中可看出death(死), dead(死), murder(謀殺), fear(恐懼), dark(黑暗), danger(危險)等字出現多次，符合偵探小說的特性，因此導致通篇小說的分數偏向負向。

用AFINN詞庫分析情緒

scar_afinn <- tokens_scar %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "A Study in Scarlet")

## Joining, by = "word"

hound_afinn <- tokens_hound %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "The Hound of the Baskervilles")

## Joining, by = "word"

sign_afinn <- tokens_sign %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "The Sign of the Four")

## Joining, by = "word"

valley_afinn <- tokens_valley %>% 
  inner_join(get_sentiments("afinn")) %>% 
  group_by(index = linenumber %/% 80) %>% 
  summarise(sentiment = sum(score)) %>% 
  mutate(method = "AFINN", book = "The Valley of Fear")

## Joining, by = "word"

book_afinn <- bind_rows(scar_afinn, hound_afinn, sign_afinn, valley_afinn)
ggplot(book_afinn, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs(title = "AFINN", x = NULL, y = NULL)

結果血字的研究負向十分明顯，巴斯克維爾的獵犬、四個簽名、恐怖谷傾向負向。

用NRC詞庫分析情緒

scar_nrc <- tokens_scar %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

hound_nrc <- tokens_hound %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

sign_nrc <- tokens_sign %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

valley_nrc <- tokens_valley %>%
  inner_join(get_sentiments("nrc") %>% filter(sentiment %in% c("positive", "negative"))) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)

## Joining, by = "word"

book_nrc <- bind_rows(scar_nrc, hound_nrc, sign_nrc, valley_nrc) %>% 
  mutate(method = "NRC")
ggplot(book_nrc, aes(index, sentiment, fill= book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x") +
  labs("NRC", x = NULL, y = NULL)

血字的研究、四個簽名、恐怖谷分數皆轉正，巴斯克維爾的獵犬末段負向明顯。

四本書與三種詞庫分析出來的總結果

book_all <- bind_rows(book_bing, book_afinn, book_nrc)
book_all %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_grid(method~book, scales = "free_x") +
  labs(x = NULL, y = NULL)

可明顯看出 NRC詞庫的正向明顯較其他兩個詞庫高。

分析各詞庫差異

get_sentiments("afinn") %>% 
  summarise(n = sum(score))

get_sentiments("bing") %>% 
  count(sentiment)

get_sentiments("nrc") %>% 
     filter(sentiment %in% c("positive", "negative")) %>% 
  count(sentiment)

tokens_scar %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

## Joining, by = "word"

wordcounts_chapter <- tokens_all %>%
  group_by(book, chapter) %>%
  summarize(words = n())

tokens_all %>% 
  semi_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>% 
  group_by(book, chapter) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts_chapter, by = c("book", "chapter")) %>%
  mutate(ratio = negativewords/words) %>%
  filter(chapter != 0) %>%
  top_n(1) %>%
  ungroup()

## Joining, by = "word"

## Selecting by ratio

wordcounts_all <- tokens_all %>%
  group_by(book) %>%
  summarize(words = n())

tokens_all %>% 
  semi_join(get_sentiments("bing") %>% filter(sentiment == "negative")) %>% 
  group_by(book) %>%
  summarize(negativewords = n()) %>%
  left_join(wordcounts_all, by = c("book")) %>%
  mutate(negative_ratio = negativewords/words) %>%
  ungroup()

## Joining, by = "word"

tokens_all %>% 
  semi_join(get_sentiments("bing") %>% filter(sentiment == "positive")) %>% 
  group_by(book) %>%
  summarize(positivewords = n()) %>%
  left_join(wordcounts_all, by = c("book")) %>%
  mutate(positive_ratio = positivewords/words) %>%
  ungroup()

## Joining, by = "word"

Bing_compare

# book_all %>% filter(.$book == "A Study in Scarlet") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("A Study in Scarlet")
# 
# book_all %>% filter(.$book == "The Hound of the Baskervilles") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("The Hound of the Baskervilles")
# 
# book_all %>% filter(.$book == "The Sign of the Four") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("The Sign of the Four")
# 
# book_all %>% filter(.$book == "The Valley of Fear") %>% 
#   ggplot(aes(index, sentiment, fill= method)) +
#   geom_col(show.legend = FALSE) +
#   facet_wrap(~method, ncol = 1, scales = "free") +
#   labs(x = NULL, y = NULL) +
#   ggtitle("The Valley of Fear")

SMS_HW3

1.1 自行算出Term Frequency

book_words <- tokens_all %>%
  count(book, word, sort = TRUE)

total_words <- book_words %>% 
  group_by(book) %>% 
  summarize(total = sum(n))

book_words <- left_join(book_words, total_words)

## Joining, by = "book"

book_words

出現最多的word分別是 the,and,of

1.1.1畫圖

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 574 rows containing non-finite values (stat_bin).

## Warning: Removed 4 rows containing missing values (geom_bar).

2.1 Zipf’s Law

freq_by_rank <- book_words %>% 
  group_by(book) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank

2.1.1 畫圖

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

+前半段rank曲線較為平滑，term frequency幾乎一樣

2.2 只抓取字出現次數介於10%

filter(rank < 500, rank > 10)

lm(log10(term frequency) ~ log10(rank), data = rank_subset)

Call:

lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)

Coefficients:

(Intercept) log10(rank)

-0.6592 -1.1090

```

2.2.1 畫圖

freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = book)) + 
  geom_abline(intercept = -0.66, slope = -1.11, color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()

結果顯示前半部的rank偏差還是很高，後面的分布就有趨近於-1斜率

3.1使用bind_tf-idf function

book_words <- book_words %>%
  bind_tf_idf(word, book, n)
book_words

3.1.1找出tf-idf較高的字

book_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))

故事中所具代表性的主角、主要場景(moor:沼澤)的tf-idf較高

3.1.2 將四本書的tf-idf視覺化

book_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(book) %>% 
  top_n(15) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

血字的研究：ferrier人名、drepper人名、strangerson人名
巴斯克維爾的獵犬：moor場景、henry人名、baskerville場景
四個簽名：sholto人名、morstan人名、jones人名
恐怖谷：douglas人名、mcmurdo人名、mcginty人名

4.1 Tokenizing by n-gram

ngram_scar <- scarlet %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_hound <- hound %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_sign <- sign %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_valley <- valley %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngram_all <- bind_rows(ngram_scar, ngram_hound, ngram_sign, ngram_valley)

4.1.1 計算bigram

ngram_all %>%
  count(bigram, sort = TRUE)

4.1.2 將bigram拆開成word1,word2並且過濾stop_words

library(tidyr)


bigrams_separated <- ngram_all %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) 

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)
bigram_counts<-bigram_counts[-1,] #刪除第一列是na


bigram_counts

4.1.3 將整理好的字重新合併

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigrams_united

4.2 使用trigram

trigram_scar <- scarlet %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_hound <- hound %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_sign <- sign %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_valley <- valley %>% unnest_tokens(trigram, text, token = "ngrams", n = 3)
trigram_all <- bind_rows(trigram_scar, trigram_hound, trigram_sign, trigram_valley)

trigram<-bind_rows(trigram_scar, trigram_hound, trigram_sign, trigram_valley)%>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

trigram<-trigram[-1,]
trigram

5.1 分析bigram

bigrams_filtered %>%
  filter(word1 == "death") %>%
  count(book, word2, sort = TRUE)

6.1 計算四本書tf-idf

bigram_tf_idf <- bigrams_united %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf

6.1.1 畫圖

bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>% 
  group_by(book) %>% 
  top_n(10) %>% 
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = book)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~book, ncol = 2, scales = "free") +
  coord_flip()

## Selecting by tf_idf

7.1 過濾出word1為not的bigram

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)

7.2 用AFINN計算情緒分數

AFINN <- get_sentiments("afinn")

AFINN

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, score, sort = TRUE)

not_words

7.2.1 畫圖

library(ggplot2)

not_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("Words preceded by \"not\"") +
  ylab("Sentiment score * number of occurrences") +
  coord_flip()

7.3 加入更多否定字

negation_words <- c("not", "no", "never", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, score, sort = TRUE)

negated_words %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  head(10) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  group_by(word1) %>% 
  ungroup() %>%
  ggplot(aes(word2, n * score, fill = n * score > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~word1, ncol = 2, scales = "free") +
  coord_flip()

8.1 ngram視覺化

library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

# original counts
bigram_counts

8.2 只抓取大於5筆的資料，並轉換成graph可以吃的格式

# filter for only relatively common combinations
bigram_graph <- bigram_counts %>%
  filter(n > 5) %>%
  graph_from_data_frame()

bigram_graph

## IGRAPH e56e303 DN-- 146 91 -- 
## + attr: name (v/c), n (e/n)
## + edges from e56e303 (vertex names):
##  [1] sir        ->henry       sherlock   ->holmes     
##  [3] dr         ->mortimer    sir        ->charles    
##  [5] dr         ->watson      jefferson  ->hope       
##  [7] miss       ->morstan     baker      ->street     
##  [9] baskerville->hall        john       ->ferrier    
## [11] sir        ->charles's   white      ->mason      
## [13] wedding    ->ring        henry      ->baskerville
## [15] manor      ->house       athelney   ->jones      
## + ... omitted several edges

8.2.1 畫圖

library(ggraph)
set.seed(2017)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

set.seed(2016)

a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
                 arrow = a, end_cap = circle(.07, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()

SMS_HW3:不同情緒字典分析福爾摩斯中篇小說

第五組

系統參數設定

安裝需要的packages

下載書籍

斷詞

用Bing詞庫分析情緒

由於前項結果的發現，因此統計出各篇負向詞出現的前十名，試圖解釋負向分數高的原因

用AFINN詞庫分析情緒

用NRC詞庫分析情緒

四本書與三種詞庫分析出來的總結果

分析各詞庫差異

SMS_HW3

1.1 自行算出Term Frequency

1.1.1畫圖

2.1 Zipf’s Law

2.1.1 畫圖

2.2 只抓取字出現次數介於10%

Call:

lm(formula = log10(term frequency) ~ log10(rank), data = rank_subset)

Coefficients:

(Intercept) log10(rank)

-0.6592 -1.1090

2.2.1 畫圖

3.1使用bind_tf-idf function

3.1.1找出tf-idf較高的字

3.1.2 將四本書的tf-idf視覺化

4.1 Tokenizing by n-gram

4.1.1 計算bigram

4.1.2 將bigram拆開成word1,word2並且過濾stop_words

4.1.3 將整理好的字重新合併

4.2 使用trigram

5.1 分析bigram

6.1 計算四本書tf-idf

6.1.1 畫圖

7.1 過濾出word1為not的bigram

7.2 用AFINN計算情緒分數

7.2.1 畫圖

7.3 加入更多否定字

8.1 ngram視覺化

8.2 只抓取大於5筆的資料，並轉換成graph可以吃的格式

8.2.1 畫圖

lm(formula = log10(`term frequency`) ~ log10(rank), data = rank_subset)