1 Data介紹

2 文本探索

2.1 讀取資料

# Load the packages used throughout the analysis via pacman.
pacman::p_load(
  tidytext, dplyr, ggplot2, magrittr, plotly,
  readr, stringr, wordcloud2, wordcloud, reshape2
)

# Read the raw subtitle file and the stop-word file from the ./simps folder.
simps <- "./simps/all.txt" %>% read_file()
stpw <- "./simps/stpw.txt" %>% read_file()

2.2 文字前處理

2.2.1 查看斷詞結果

2.2.2 標註集數

# Tag every token with a running episode number.
# Tokens of a single character are dropped first. Each token that matches the
# episode-marker pattern then increments a cumulative counter, so every token
# from one marker up to the next shares the same `episode` value.
# NOTE(review): presumably the markers are the tokens "ep1" ... "ep16" --
# confirm against the tokenised subtitles.
#
# The alternation is parenthesised and anchored on both sides. The earlier
# pattern "^ep[1-9]|1[6]$" parsed as "^ep[1-9]" OR "1[6]$", so any token that
# merely ended in "16" also advanced the episode counter.
simps_ep <- simps %>%
  filter(nchar(word) > 1) %>%
  mutate(episode = cumsum(str_detect(word, regex("^ep([1-9]|1[0-6])$"))))

3 情緒分析

3.1 情緒字典介紹

  • 英文(get_sentiments)
    • NRC (10 Category)
      • 34% (+): joy, positive, surprise, trust
      • 66% (-): anger, anticipation, disgust, fear, negative, sadness
    • Afinn (Numeric: -5~+5, mean: -0.6)
    • Bing (2 Category: 30% Positive, 70% Negative)
    • Loughran (6 Category)
      • (+): positive
      • (-): constraining, litigious, negative, superfluous, uncertainty

3.2 準備字典

# Fetch the four sentiment lexicons by name.
afinn <- get_sentiments("afinn")
nrc <- get_sentiments("nrc")
bing <- get_sentiments("bing")
loughran <- get_sentiments("loughran")

# Preview the lexicons side by side: c() flattens the four tables into one
# list of columns, and head() is applied to each column in turn.
c(afinn, nrc, bing, loughran) %>% sapply(head)
##      word         value word        sentiment  word         sentiment 
## [1,] "abandon"    "-2"  "abacus"    "trust"    "2-faces"    "negative"
## [2,] "abandoned"  "-2"  "abandon"   "fear"     "abnormal"   "negative"
## [3,] "abandons"   "-2"  "abandon"   "negative" "abolish"    "negative"
## [4,] "abducted"   "-2"  "abandon"   "sadness"  "abominable" "negative"
## [5,] "abduction"  "-2"  "abandoned" "anger"    "abominably" "negative"
## [6,] "abductions" "-2"  "abandoned" "fear"     "abominate"  "negative"
##      word           sentiment 
## [1,] "abandon"      "negative"
## [2,] "abandoned"    "negative"
## [3,] "abandoning"   "negative"
## [4,] "abandonment"  "negative"
## [5,] "abandonments" "negative"
## [6,] "abandons"     "negative"

3.3 以Bing字典判斷辛普森一家台詞的情緒傾向

# Total number of tokens per Bing polarity across the whole corpus.
# Only the `word` column is kept before joining, so the join key is `word`.
inner_join(select(simps_ep, word), bing) %>%
  group_by(sentiment) %>%
  summarise(cnt = n())
## # A tibble: 2 x 2
##   sentiment   cnt
##   <chr>     <int>
## 1 negative   1176
## 2 positive    919
# Frequency of each word within each episode, most frequent rows first.
simps_cnt <- simps_ep %>%
  group_by(episode, word) %>%
  summarise(count = n())
simps_cnt <- arrange(simps_cnt, desc(count))

3.4 分析四種字典每集情緒

# AFINN: one box per episode, drawn over the AFINN scores present in it.
# After the group_by/summarise step there is one row per (episode, score).
# NOTE(review): `cnt` is computed but never mapped to an aesthetic, so each
# distinct score contributes once regardless of how often it occurred --
# confirm this is intended.
p_afinn <- simps_cnt %>%
  inner_join(afinn) %>%
  group_by(episode, value) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_boxplot(
    aes(x = episode, y = value, colour = as.factor(episode))
  ) +
  labs(title = "Afinn-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq_len(16)) +
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(family = "蘋方-繁 中黑體", size = 12)
  )

# Render the static plot as an interactive plotly widget.
ggplotly(p_afinn)
  • 對Afinn而言,第2、12、15集的中位數(箱型圖中線)高於總體;第4、8集則低於總體
# NRC: per-episode totals of matched words, one line per NRC category.
p_nrc <- simps_cnt %>%
  inner_join(nrc) %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_line(aes(x = episode, y = cnt, colour = sentiment)) +
  labs(title = "NRC-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq_len(16)) +
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(family = "蘋方-繁 中黑體", size = 12)
  )

# Render the static plot as an interactive plotly widget.
ggplotly(p_nrc)
  • 對NRC而言,第9集特別正面
# Bing: per-episode totals of matched words, one line per polarity.
p_bing <- simps_cnt %>%
  inner_join(bing) %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_line(aes(x = episode, y = cnt, colour = sentiment)) +
  labs(title = "Bing-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq_len(16)) +
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(family = "蘋方-繁 中黑體", size = 12)
  )

# Render the static plot as an interactive plotly widget.
ggplotly(p_bing)
  • 對Bing而言,差距最大的為第10集(負面),差距最小為第12集(中性)
# Loughran: per-episode totals of matched words, one line per category.
p_loughran <- simps_cnt %>%
  inner_join(loughran) %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_line(aes(x = episode, y = cnt, colour = sentiment)) +
  labs(title = "Loughran-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq_len(16)) +
  theme(
    plot.title = element_text(hjust = 0.5),
    text = element_text(family = "蘋方-繁 中黑體", size = 12)
  )

# Render the static plot as an interactive plotly widget.
ggplotly(p_loughran)
  • 對Loughran而言,第14集最負面
  • 整體來說,Afinn較為中性,NRC正面>負面,其餘字典則是負面>正面

4 文字雲視覺化

4.1 整體字幕庫文字雲

# Corpus-wide word frequencies, keeping only words seen more than 30 times,
# ordered from most to least frequent.
tokens_count <- simps_ep %>%
  group_by(word) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  filter(count > 30)

# Show the ten most frequent words.
tokens_count %>% head(10)
## # A tibble: 10 x 2
##    word   count
##    <chr>  <int>
##  1 time      84
##  2 homer     76
##  3 hey       63
##  4 marge     53
##  5 love      52
##  6 boy       51
##  7 bart      44
##  8 people    43
##  9 kid       40
## 10 day       39
# JavaScript colour callback for the word cloud: words whose weight exceeds 65
# are drawn in bright red, all other words in a muted tone.
js_color_fun <- "function (word, weight) {
return (weight > 65) ? '#f02222' : '#c09292';
}"

# Draw the word cloud of frequent words on a black background.
wordcloud2(
  tokens_count,
  color = htmlwidgets::JS(js_color_fun),
  backgroundColor = "black"
)

4.2 分析第13集中的正負面詞彙

# Comparison cloud of the Bing-matched words in episode 13.
# Rows are restricted to episode 13 before counting; the counts are then
# joined with the Bing lexicon and reshaped into a word x sentiment matrix,
# which is the input comparison.cloud() receives.
simps_ep %>%
  filter(episode == 13) %>%
  group_by(episode, word) %>%
  summarise(count = n()) %>%
  inner_join(bing) %>%
  group_by(word, sentiment) %>%
  summarise(count = sum(count)) %>%
  # Words that occur only once are too numerous, so they are filtered out.
  filter(count > 1) %>%
  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(
    colors = c("gray80", "gray20"),
    max.words = 100,
    family = "蘋方-繁 中黑體"
  )
## Joining, by = "word"