資料介紹

Data Source: 辛普森一家字幕庫
第31季第1集~第16集(共16集)

文本探索

讀取資料

library(readr)
library(tidyr)
library(dplyr)
library(tidytext)
library(stringr)
library(ggplot2)
library(wordcloud)
library(wordcloud2)
library(textdata)
library(reshape2)
text = read_file("./simps/all.txt")

文字前處理

查看斷詞結果

text_tidy = tibble(text) %>%
  unnest_tokens(word, text) %>%
  mutate(episode = cumsum(str_detect(word, regex("^EP([1-9]|1[0-6])$", ignore_case = T)))) # 標註集數
text_tidy %>% head(10)

## # A tibble: 10 x 2
##    word  episode
##    <chr>   <int>
##  1 ep1         1
##  2 fold        1
##  3 down        1
##  4 that        1
##  5 and         1
##  6 then        1
##  7 up          1
##  8 that        1
##  9 and         1
## 10 this        1

手動加入停用字

data(stop_words) # 使用內建字典

text_tidy = text_tidy %>%
  anti_join(stop_words) %>%
  filter(!(word %in% c("baby", "kid", "kids", "boy", "son", "dad", "daddy", "mom", "wife", "sir", "doo", "guy", "guys", "aw", "uh", "ooh", "huh", "eh", "ah", "yeah", "hey", "gonna")))

統計字頻

text_tidy %>%
  count(word, sort = T) %>% 
  head(10)

## # A tibble: 10 x 2
##    word       n
##    <chr>  <int>
##  1 time      84
##  2 homer     76
##  3 god       69
##  4 marge     53
##  5 love      52
##  6 bart      44
##  7 people    43
##  8 day       39
##  9 lisa      36
## 10 school    35

text_tidy %>%
  count(word, sort = T) %>%
  filter(n > 50) %>%
  mutate(word = reorder(word, n)) %>% # 使 x 軸依 y 軸數值大小排序
  ggplot(aes(word, n)) +
  geom_col(fill = "#00bfc4") +
  coord_flip()

話題：Time > God > Love
角色：Homer(荷馬·辛普森) > Marge(美枝·辛普森)

文字雲分析

整季(第31季)總體文字雲

text_tidy %>% 
  group_by(word) %>% 
  summarise(count = n()) %>% 
  filter(count > 15) %>% 
  arrange(desc(count)) %>% 
  wordcloud2( size = 0.7, shape = 'circle')

區分正、負面情緒的文字雲(採用Bing字典)

text_tidy  %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

以整季來看，正面詞彙字頻較大
Time的話題消失了
形容詞較多

特別集數(第10集)的文字雲

計算每集的正負性差距，找出差距最大的一集再畫出文字雲

text_tidy_neg_pos <- text_tidy %>%
  inner_join(get_sentiments("bing")) %>% 
  group_by(episode,sentiment) %>%
  summarise(count=sum(episode))
text_tidy_neg <- subset( text_tidy_neg_pos, sentiment == "negative")
text_tidy_pos <- subset( text_tidy_neg_pos, sentiment == "positive")

text_tidy_max <- which.max(abs( text_tidy_neg$count - text_tidy_pos$count)) # 第十集差距最大

subset( text_tidy ,episode ==  text_tidy_max)  %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"),
                   max.words = 100)

第十集的負面詞字頻較整季來得大
第十集特別的負面話題為：Shark, burns, steal
第十集特別的正面話題為：hug, fantastice, smile

情緒分析

四個情緒字典介紹

NRC (10 Category)
- 34% (+): joy, positive, surprise, trust
- 66% (-): anger, anticipation, disgust, fear, negative, sadness
Afinn (Numeric: -5~+5, mean: -0.6)
Bing (2 Category: 30% Positive, 70% Negative)
Loughran (6 Category)
- (+): positive
- (-): constraining(被迫/限制), litigious(好爭論的), negative, superfluous(多餘的), uncertainty

NRC字典

準備資料

nrc <- get_sentiments("nrc")

simlist <- text_tidy %>%
  filter(nchar(.$word)>1) %>%
  mutate(rown = row_number())


simnrc<-simlist %>%
  inner_join(nrc, by = "word") 

tt_neg<-filter(simnrc,sentiment == 'negative'| sentiment== 'anger'| sentiment== 'disgust'| sentiment == 'fear'| sentiment == 'sadness')

tt_neg_dis<-distinct(tt_neg,word,rown,episode)

neg_re<-tt_neg_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()

NRC最常出現的負面情緒字

neg_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

God > Wait > Bad

NRC最常出現的正面情緒字

tt_pos<-filter(simnrc,sentiment == 'anticipation'| sentiment== 'joy'| sentiment== 'surprise'| sentiment == 'trust'| sentiment == 'positive')

tt_pos_dis<-distinct(tt_pos,word,rown,episode)

pos_re<-tt_pos_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()


pos_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

Time > God > Love> School > Wait

找出“Love”最常出現的前三大集數

pos_love<-tt_pos_dis%>%
  filter(word == 'love')%>%
  group_by(episode)%>%
  count(word, sort = TRUE)%>%
  ungroup()
pos_love %>% top_n(3)

## # A tibble: 4 x 3
##   episode word      n
##     <int> <chr> <int>
## 1       8 love      7
## 2       6 love      6
## 3       4 love      5
## 4       9 love      5

結果為第八集(November 24, 2019)
這集在美國播送時，剛好也是感恩節前夕

Afinn字典

準備資料

afinn = get_sentiments("afinn")

episodeafinn <- text_tidy %>% 
  inner_join(afinn) %>%
  group_by(episode) %>% 
  summarise(sentiment = sum(value)) # 以每集為單位將情緒值加總

## Joining, by = "word"

每集情緒加總變化趨勢

episodeafinn %>%
  ggplot(aes(episode, sentiment, fill = "#F7766D")) +
  geom_col(show.legend = FALSE) +
  scale_x_continuous(breaks=seq(1,16,1))

第1集最負面；第11集最正面
有一點週期性變化

Afinn最常出現的負面情緒字

tt_neg <- text_tidy %>% 
  inner_join(afinn) %>% 
  filter(value < 0)

tt_neg_dis<-distinct(tt_neg,word,rown,episode)

neg_re<-tt_neg_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()

neg_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

Stop > Leave > Wrong = Stupid > Bad

Afinn最常出現的正面情緒字

tt_pos <- text_tidy %>% 
  inner_join(afinn) %>%
  filter(value > 0)

tt_pos_dis <- distinct(tt_pos,word,rown,episode)

pos_re <- tt_pos_dis%>%
  group_by(word)%>%
  count(word, sort = TRUE)%>%
  ungroup()
pos_re %>% 
  top_n(10,wt = n) %>%
  ggplot(aes(word, n, fill = word)) +
  geom_col(show.legend = FALSE) +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  geom_text(aes(label=n))+
  theme(text=element_text(size=14))+
  coord_flip()

Love > God > Fine = Cool > Rich = Nice

Bing字典

準備資料

bing = get_sentiments("bing")
simp_bing <- text_tidy %>%
  filter(nchar(.$word)>1) %>%
  mutate(nrow = row_number())%>%
  inner_join(bing, by = "word")

Bing最常出現的正/負面情緒字

simp_bing_all <- simp_bing　%>%
  group_by(sentiment)%>%
  count(word, sort = TRUE)%>%
  ungroup()


simp_bing_all　%>%
  group_by(sentiment) %>%
  top_n(10,wt = n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

#正負面字詞出現總次數
sum(simp_bing_all[simp_bing_all$sentiment == "positive",]$n)

## [1] 923

sum(simp_bing_all[simp_bing_all$sentiment == "negative",]$n)

## [1] 1187

若只看總數，會發現辛普森雖然是喜劇，仍然是負面詞彙比較多
考慮到可能是受Bing字典比例(30正面/70負面)影響，且正/負面詞總數僅有約200之差，我們認為差距不大

Loughran字典

Loughran中各情緒類型包含多少字

loughran = get_sentiments("loughran")
loughran %>%
  count(sentiment, sort = T)

## # A tibble: 6 x 2
##   sentiment        n
##   <chr>        <int>
## 1 negative      2355
## 2 litigious      904
## 3 positive       354
## 4 uncertainty    297
## 5 constraining   184
## 6 superfluous     56

可看出 negative 詞彙明顯比其他情緒的詞彙數量多

整季中各情緒類型包含多少字

loughran %>%
  inner_join(text_tidy) %>%
  count(sentiment, sort = T)

## # A tibble: 5 x 2
##   sentiment        n
##   <chr>        <int>
## 1 negative       407
## 2 positive       197
## 3 uncertainty     36
## 4 litigious       34
## 5 constraining    17

可看出在 Simpson EP1-16 中負面詞彙比正面詞彙多

整季中各情緒類型包含哪些字

Map(function(i){
  get_sentiments("loughran") %>%
  filter(sentiment == i) %>%
  inner_join(text_tidy) %>%
  count(word, sort = T)
}, c("negative", "positive", "uncertainty", "litigious", "constraining", "superfluous"))

## $negative
## # A tibble: 165 x 2
##    word          n
##    <chr>     <int>
##  1 bad          26
##  2 wrong        25
##  3 worry        18
##  4 poor         12
##  5 break        11
##  6 lost         11
##  7 miss         11
##  8 detention    10
##  9 late          9
## 10 shut          9
## # … with 155 more rows
## 
## $positive
## # A tibble: 63 x 2
##    word          n
##    <chr>     <int>
##  1 happy        27
##  2 perfect      16
##  3 beautiful    12
##  4 dream        11
##  5 win          10
##  6 easy          9
##  7 enjoy         9
##  8 favorite      6
##  9 boom          5
## 10 excellent     4
## # … with 53 more rows
## 
## $uncertainty
## # A tibble: 21 x 2
##    word          n
##    <chr>     <int>
##  1 possibly      6
##  2 depends       5
##  3 believing     4
##  4 believed      2
##  5 doubts        2
##  6 risk          2
##  7 arbitrary     1
##  8 assume        1
##  9 believes      1
## 10 dependent     1
## # … with 11 more rows
## 
## $litigious
## # A tibble: 23 x 2
##    word         n
##    <chr>    <int>
##  1 court        5
##  2 appeal       3
##  3 contract     2
##  4 crime        2
##  5 jury         2
##  6 justice      2
##  7 law          2
##  8 appeals      1
##  9 attorney     1
## 10 consent      1
## # … with 13 more rows
## 
## $constraining
## # A tibble: 12 x 2
##    word           n
##    <chr>      <int>
##  1 depends        5
##  2 limits         2
##  3 commit         1
##  4 committed      1
##  5 dependent      1
##  6 forbid         1
##  7 insist         1
##  8 limit          1
##  9 permission     1
## 10 required       1
## 11 requires       1
## 12 restraint      1
## 
## $superfluous
## # A tibble: 0 x 2
## # … with 2 variables: word <chr>, n <int>

可看出 Simpson 大多採用較生活化的單字表達情緒，例如：
- 負面詞彙最常被使用的是 bad
- 正面詞彙最常被使用的是 happy
至於 superfluous 的情緒在 Simpson EP1-16 則沒有出現對應的詞彙

Loughran最常出現的正/負面情緒字

loughran_wc = get_sentiments("loughran") %>%
  filter(sentiment == "negative" | sentiment == "positive") %>%
  inner_join(text_tidy) %>%
  count(word, sentiment, sort = T)

loughran_wc %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  coord_flip()

四字典各集比較分析

正/負面情緒詞總數差異

準備資料

simps_cnt = text_tidy %>% 
  group_by(episode, word) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))

每集情緒詞總數比較

simps_cnt %>% 
  inner_join(afinn) %>% 
  group_by(episode, value) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_boxplot(aes(x=episode, y=value, colour=as.factor(episode)),show.legend = FALSE) +
  ggtitle("Afinn-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,16,1)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

simps_cnt %>% 
  inner_join(nrc) %>% 
  group_by(episode, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=episode, y=cnt, colour=sentiment)) +
  ggtitle("NRC-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,16,1)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

simps_cnt %>% 
  inner_join(bing) %>% 
  group_by(episode, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=episode, y=cnt, colour=sentiment)) +
  ggtitle("Bing-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,16,1)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

simps_cnt %>% 
  inner_join(loughran) %>% 
  group_by(episode, sentiment) %>% 
  summarise(cnt = sum(count)) %>% ggplot() +
  geom_line(aes(x=episode, y=cnt, colour=sentiment)) +
  ggtitle("Loughran-每集情緒詞數量差異") + 
  scale_x_continuous(breaks=seq(1,16,1)) +
  theme(text=element_text(family="蘋方-繁 中黑體", size=12),
      plot.title=element_text(hjust = 0.5))

基本上皆為正面>負面
第13集中，正面與負面差距最大，可能是五集中最歡樂的一集
第14集中，正面與負面差距最小，是較為中性的一集

正面-負面情緒差異

準備資料

#整理字典"bing"正負面情緒各級出現次數與差值
bing_neg_pos <-  text_tidy  %>%
  dplyr::select(word,episode) %>%
  inner_join(get_sentiments("bing"))  %>% 
  group_by(episode,sentiment) %>%
  summarise(count=sum(episode))

bing_trend<-bing_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "bing")


#整理字典"nrc"正負面情緒各級出現次數與差值
nrc_count_pos<-tt_pos_dis%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "positive")
nrc_count_neg<-tt_neg_dis%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "negative")
nrc_neg_pos<-bind_rows(nrc_count_pos,nrc_count_neg)%>%arrange(episode)%>%mutate(method='nrc') 

nrc_trend<-nrc_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative)


#整理字典"afinn"正負面情緒各級出現次數與差值
afinn_pos <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value>0)%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "positive")

afinn_neg <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value<0)%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "negative")

afinn_neg_pos<-bind_rows(afinn_pos,afinn_neg)%>%arrange(episode)%>%mutate(method='nrc') 

afinn_trend<-afinn_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "AFINN")


#整理字典"loughran"正負面情緒各級出現次數與差值
loughran_data <-  text_tidy  %>%
  dplyr::select(word,episode) %>%
  inner_join(get_sentiments("loughran"))  %>% 
  group_by(episode,sentiment) %>%
  summarise(count=sum(episode))

loughran_neg <- subset( loughran_data, sentiment == "negative")
loughran_pos <- subset( loughran_data, sentiment == "positive")

loughran_neg_pos<-bind_rows(loughran_neg,loughran_pos)%>%arrange(episode)%>%mutate(method='nrc') 

loughran_trend<-loughran_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "loughran")

合併四個字典並畫出長條圖

bind_rows(bing_trend,nrc_trend,afinn_trend,loughran_trend) %>%
  ggplot(aes(x= episode,y=Deviation,fill=method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "fixed")+
  geom_text(aes(label=Deviation))

合併四個字典並劃出折線圖

bind_rows(bing_trend,nrc_trend,afinn_trend,loughran_trend) %>%
  ggplot(aes(x= episode,y=Deviation,col=method)) +
  geom_line() +facet_wrap(~method, ncol = 1, scales = "fixed")+
  geom_text(aes(label=Deviation))

分工說明

統整字幕資料：曾建嘉
統整程式碼：沈育嬋
NoteBook製作：唐思琪
文字雲分析：盧伯維
情緒分析
- NRC字典分析：曾建嘉
- Afinn字典分析：蔡柏毅
- Bing字典分析：賴俞雯
- Loughran字典分析：沈育嬋
四字典各集比較分析
- 每集情緒詞總數比較：唐思琪
- 比較正面-負面情緒差異：陳信瑋

文字分析_Simspons

文字分析_Simspons

資料介紹

文本探索

讀取資料

文字前處理

查看斷詞結果

手動加入停用字

統計字頻

文字雲分析

整季(第31季)總體文字雲

區分正、負面情緒的文字雲(採用Bing字典)

特別集數(第10集)的文字雲

情緒分析

四個情緒字典介紹

NRC字典

準備資料

NRC最常出現的負面情緒字

NRC最常出現的正面情緒字

找出“Love”最常出現的前三大集數

Afinn字典

準備資料

每集情緒加總變化趨勢

Afinn最常出現的負面情緒字

Afinn最常出現的正面情緒字

Bing字典

準備資料

Bing最常出現的正/負面情緒字

Loughran字典

Loughran中各情緒類型包含多少字

整季中各情緒類型包含多少字

整季中各情緒類型包含哪些字

Loughran最常出現的正/負面情緒字

四字典各集比較分析

正/負面情緒詞總數差異

準備資料

每集情緒詞總數比較

正面-負面情緒差異

準備資料

合併四個字典並畫出長條圖

合併四個字典並劃出折線圖

分工說明