文字分析_Simspons
資料介紹
- Data Source: 辛普森一家 字幕庫
- 第31季第1集~第16集(共16集)
文本探索
讀取資料
文字前處理
查看斷詞結果
text_tidy = tibble(text) %>%
unnest_tokens(word, text) %>%
mutate(episode = cumsum(str_detect(word, regex("^EP([1-9]|1[0-6])$", ignore_case = T)))) # 標註集數
text_tidy %>% head(10)## # A tibble: 10 x 2
## word episode
## <chr> <int>
## 1 ep1 1
## 2 fold 1
## 3 down 1
## 4 that 1
## 5 and 1
## 6 then 1
## 7 up 1
## 8 that 1
## 9 and 1
## 10 this 1
統計字頻
## # A tibble: 10 x 2
## word n
## <chr> <int>
## 1 time 84
## 2 homer 76
## 3 god 69
## 4 marge 53
## 5 love 52
## 6 bart 44
## 7 people 43
## 8 day 39
## 9 lisa 36
## 10 school 35
text_tidy %>%
count(word, sort = T) %>%
filter(n > 50) %>%
mutate(word = reorder(word, n)) %>% # 使 x 軸依 y 軸數值大小排序
ggplot(aes(word, n)) +
geom_col(fill = "#00bfc4") +
coord_flip()- 話題:Time > God > Love
- 角色:Homer(荷馬·辛普森) > Marge(美枝·辛普森)
文字雲分析
整季(第31季)總體文字雲
區分正、負面情緒的文字雲(採用Bing字典)
text_tidy %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)- 以整季來看,正面詞彙字頻較大
- Time的話題消失了
- 形容詞較多
特別集數(第10集)的文字雲
- 計算每集的正負性差距,找出差距最大的一集再畫出文字雲
text_tidy_neg_pos <- text_tidy %>%
inner_join(get_sentiments("bing")) %>%
group_by(episode,sentiment) %>%
summarise(count=sum(episode))
text_tidy_neg <- subset( text_tidy_neg_pos, sentiment == "negative")
text_tidy_pos <- subset( text_tidy_neg_pos, sentiment == "positive")
text_tidy_max <- which.max(abs( text_tidy_neg$count - text_tidy_pos$count)) # 第十集差距最大
subset( text_tidy ,episode == text_tidy_max) %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"),
max.words = 100)- 第十集的負面詞字頻較整季來得大
- 第十集特別的負面話題為:Shark, burns, steal
- 第十集特別的正面話題為:hug, fantastice, smile
情緒分析
四個情緒字典介紹
- NRC (10 Category)
- 34% (+): joy, positive, surprise, trust
- 66% (-): anger, anticipation, disgust, fear, negative, sadness
- Afinn (Numeric: -5~+5, mean: -0.6)
- Bing (2 Category: 30% Positive, 70% Negative)
- Loughran (6 Category)
- (+): positive
- (-): constraining(被迫/限制), litigious(好爭論的), negative, superfluous(多餘的), uncertainty
NRC字典
準備資料
nrc <- get_sentiments("nrc")
simlist <- text_tidy %>%
filter(nchar(.$word)>1) %>%
mutate(rown = row_number())
simnrc<-simlist %>%
inner_join(nrc, by = "word")
tt_neg<-filter(simnrc,sentiment == 'negative'| sentiment== 'anger'| sentiment== 'disgust'| sentiment == 'fear'| sentiment == 'sadness')
tt_neg_dis<-distinct(tt_neg,word,rown,episode)
neg_re<-tt_neg_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()NRC最常出現的負面情緒字
neg_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()- God > Wait > Bad
NRC最常出現的正面情緒字
tt_pos<-filter(simnrc,sentiment == 'anticipation'| sentiment== 'joy'| sentiment== 'surprise'| sentiment == 'trust'| sentiment == 'positive')
tt_pos_dis<-distinct(tt_pos,word,rown,episode)
pos_re<-tt_pos_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
pos_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()- Time > God > Love> School > Wait
找出“Love”最常出現的前三大集數
pos_love<-tt_pos_dis%>%
filter(word == 'love')%>%
group_by(episode)%>%
count(word, sort = TRUE)%>%
ungroup()
pos_love %>% top_n(3)## # A tibble: 4 x 3
## episode word n
## <int> <chr> <int>
## 1 8 love 7
## 2 6 love 6
## 3 4 love 5
## 4 9 love 5
- 結果為第八集(November 24, 2019)
- 這集在美國播送時,剛好也是感恩節前夕
Afinn字典
準備資料
afinn = get_sentiments("afinn")
episodeafinn <- text_tidy %>%
inner_join(afinn) %>%
group_by(episode) %>%
summarise(sentiment = sum(value)) # 以每集為單位將情緒值加總## Joining, by = "word"
每集情緒加總變化趨勢
episodeafinn %>%
ggplot(aes(episode, sentiment, fill = "#F7766D")) +
geom_col(show.legend = FALSE) +
scale_x_continuous(breaks=seq(1,16,1)) - 第1集最負面;第11集最正面
- 有一點週期性變化
Afinn最常出現的負面情緒字
tt_neg <- text_tidy %>%
inner_join(afinn) %>%
filter(value < 0)
tt_neg_dis<-distinct(tt_neg,word,rown,episode)
neg_re<-tt_neg_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
neg_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()- Stop > Leave > Wrong = Stupid > Bad
Afinn最常出現的正面情緒字
tt_pos <- text_tidy %>%
inner_join(afinn) %>%
filter(value > 0)
tt_pos_dis <- distinct(tt_pos,word,rown,episode)
pos_re <- tt_pos_dis%>%
group_by(word)%>%
count(word, sort = TRUE)%>%
ungroup()
pos_re %>%
top_n(10,wt = n) %>%
ggplot(aes(word, n, fill = word)) +
geom_col(show.legend = FALSE) +
labs(y = "Contribution to sentiment",
x = NULL) +
geom_text(aes(label=n))+
theme(text=element_text(size=14))+
coord_flip()- Love > God > Fine = Cool > Rich = Nice
Bing字典
準備資料
Bing最常出現的正/負面情緒字
simp_bing_all <- simp_bing %>%
group_by(sentiment)%>%
count(word, sort = TRUE)%>%
ungroup()
simp_bing_all %>%
group_by(sentiment) %>%
top_n(10,wt = n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()## [1] 923
## [1] 1187
- 若只看總數,會發現辛普森雖然是喜劇,仍然是負面詞彙比較多
- 考慮到可能是受Bing字典比例(30正面/70負面)影響,且正/負面詞總數僅有約200之差,我們認為差距不大
Loughran字典
Loughran中各情緒類型包含多少字
## # A tibble: 6 x 2
## sentiment n
## <chr> <int>
## 1 negative 2355
## 2 litigious 904
## 3 positive 354
## 4 uncertainty 297
## 5 constraining 184
## 6 superfluous 56
- 可看出 negative 詞彙明顯比其他情緒的詞彙數量多
整季中各情緒類型包含多少字
## # A tibble: 5 x 2
## sentiment n
## <chr> <int>
## 1 negative 407
## 2 positive 197
## 3 uncertainty 36
## 4 litigious 34
## 5 constraining 17
- 可看出在 Simpson EP1-16 中負面詞彙比正面詞彙多
整季中各情緒類型包含哪些字
Map(function(i){
get_sentiments("loughran") %>%
filter(sentiment == i) %>%
inner_join(text_tidy) %>%
count(word, sort = T)
}, c("negative", "positive", "uncertainty", "litigious", "constraining", "superfluous")) ## $negative
## # A tibble: 165 x 2
## word n
## <chr> <int>
## 1 bad 26
## 2 wrong 25
## 3 worry 18
## 4 poor 12
## 5 break 11
## 6 lost 11
## 7 miss 11
## 8 detention 10
## 9 late 9
## 10 shut 9
## # … with 155 more rows
##
## $positive
## # A tibble: 63 x 2
## word n
## <chr> <int>
## 1 happy 27
## 2 perfect 16
## 3 beautiful 12
## 4 dream 11
## 5 win 10
## 6 easy 9
## 7 enjoy 9
## 8 favorite 6
## 9 boom 5
## 10 excellent 4
## # … with 53 more rows
##
## $uncertainty
## # A tibble: 21 x 2
## word n
## <chr> <int>
## 1 possibly 6
## 2 depends 5
## 3 believing 4
## 4 believed 2
## 5 doubts 2
## 6 risk 2
## 7 arbitrary 1
## 8 assume 1
## 9 believes 1
## 10 dependent 1
## # … with 11 more rows
##
## $litigious
## # A tibble: 23 x 2
## word n
## <chr> <int>
## 1 court 5
## 2 appeal 3
## 3 contract 2
## 4 crime 2
## 5 jury 2
## 6 justice 2
## 7 law 2
## 8 appeals 1
## 9 attorney 1
## 10 consent 1
## # … with 13 more rows
##
## $constraining
## # A tibble: 12 x 2
## word n
## <chr> <int>
## 1 depends 5
## 2 limits 2
## 3 commit 1
## 4 committed 1
## 5 dependent 1
## 6 forbid 1
## 7 insist 1
## 8 limit 1
## 9 permission 1
## 10 required 1
## 11 requires 1
## 12 restraint 1
##
## $superfluous
## # A tibble: 0 x 2
## # … with 2 variables: word <chr>, n <int>
- 可看出 Simpson 大多採用較生活化的單字表達情緒,例如:
- 負面詞彙最常被使用的是 bad
- 正面詞彙最常被使用的是 happy
- 至於 superfluous 的情緒在 Simpson EP1-16 則沒有出現對應的詞彙
Loughran最常出現的正/負面情緒字
loughran_wc = get_sentiments("loughran") %>%
filter(sentiment == "negative" | sentiment == "positive") %>%
inner_join(text_tidy) %>%
count(word, sentiment, sort = T)
loughran_wc %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()四字典各集比較分析
正/負面情緒詞總數差異
準備資料
每集情緒詞總數比較
simps_cnt %>%
inner_join(afinn) %>%
group_by(episode, value) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_boxplot(aes(x=episode, y=value, colour=as.factor(episode)),show.legend = FALSE) +
ggtitle("Afinn-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,16,1)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5))simps_cnt %>%
inner_join(nrc) %>%
group_by(episode, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=episode, y=cnt, colour=sentiment)) +
ggtitle("NRC-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,16,1)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) simps_cnt %>%
inner_join(bing) %>%
group_by(episode, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=episode, y=cnt, colour=sentiment)) +
ggtitle("Bing-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,16,1)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) simps_cnt %>%
inner_join(loughran) %>%
group_by(episode, sentiment) %>%
summarise(cnt = sum(count)) %>% ggplot() +
geom_line(aes(x=episode, y=cnt, colour=sentiment)) +
ggtitle("Loughran-每集情緒詞數量差異") +
scale_x_continuous(breaks=seq(1,16,1)) +
theme(text=element_text(family="蘋方-繁 中黑體", size=12),
plot.title=element_text(hjust = 0.5)) - 基本上皆為正面>負面
- 第13集中,正面與負面差距最大,可能是五集中最歡樂的一集
- 第14集中,正面與負面差距最小,是較為中性的一集
正面-負面情緒差異
準備資料
#整理字典"bing"正負面情緒各級出現次數與差值
bing_neg_pos <- text_tidy %>%
dplyr::select(word,episode) %>%
inner_join(get_sentiments("bing")) %>%
group_by(episode,sentiment) %>%
summarise(count=sum(episode))
bing_trend<-bing_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "bing")
#整理字典"nrc"正負面情緒各級出現次數與差值
nrc_count_pos<-tt_pos_dis%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "positive")
nrc_count_neg<-tt_neg_dis%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "negative")
nrc_neg_pos<-bind_rows(nrc_count_pos,nrc_count_neg)%>%arrange(episode)%>%mutate(method='nrc')
nrc_trend<-nrc_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative)
#整理字典"afinn"正負面情緒各級出現次數與差值
afinn_pos <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value>0)%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "positive")
afinn_neg <- text_tidy %>% inner_join(get_sentiments("afinn")) %>% filter(value<0)%>%group_by(episode)%>%summarise(count=n())%>%mutate(sentiment = "negative")
afinn_neg_pos<-bind_rows(afinn_pos,afinn_neg)%>%arrange(episode)%>%mutate(method='nrc')
afinn_trend<-afinn_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "AFINN")
#整理字典"loughran"正負面情緒各級出現次數與差值
loughran_data <- text_tidy %>%
dplyr::select(word,episode) %>%
inner_join(get_sentiments("loughran")) %>%
group_by(episode,sentiment) %>%
summarise(count=sum(episode))
loughran_neg <- subset( loughran_data, sentiment == "negative")
loughran_pos <- subset( loughran_data, sentiment == "positive")
loughran_neg_pos<-bind_rows(loughran_neg,loughran_pos)%>%arrange(episode)%>%mutate(method='nrc')
loughran_trend<-loughran_neg_pos%>%group_by(episode)%>%spread(sentiment, count, fill = 0) %>%mutate(Deviation = positive - negative,method = "loughran")分工說明
- 統整字幕資料:曾建嘉
- 統整程式碼:沈育嬋
- NoteBook製作:唐思琪
- 文字雲分析:盧伯維
- 情緒分析
- NRC字典分析:曾建嘉
- Afinn字典分析:蔡柏毅
- Bing字典分析:賴俞雯
- Loughran字典分析:沈育嬋
- 四字典各集比較分析
- 每集情緒詞總數比較:唐思琪
- 比較正面-負面情緒差異:陳信瑋