準備字典
# Load the four sentiment lexicons returned by get_sentiments().
afinn <- get_sentiments("afinn")
nrc <- get_sentiments("nrc")
bing <- get_sentiments("bing")
loughran <- get_sentiments("loughran")

# Preview the first six entries of every lexicon column, side by side.
c(afinn, nrc, bing, loughran) %>%
  sapply(head)
## word value word sentiment word sentiment
## [1,] "abandon" "-2" "abacus" "trust" "2-faces" "negative"
## [2,] "abandoned" "-2" "abandon" "fear" "abnormal" "negative"
## [3,] "abandons" "-2" "abandon" "negative" "abolish" "negative"
## [4,] "abducted" "-2" "abandon" "sadness" "abominable" "negative"
## [5,] "abduction" "-2" "abandoned" "anger" "abominably" "negative"
## [6,] "abductions" "-2" "abandoned" "fear" "abominate" "negative"
## word sentiment
## [1,] "abandon" "negative"
## [2,] "abandoned" "negative"
## [3,] "abandoning" "negative"
## [4,] "abandonment" "negative"
## [5,] "abandonments" "negative"
## [6,] "abandons" "negative"
以Bing字典判斷辛普森一家台詞的情緒傾向
# Tally the rows of simps_ep whose word appears in the Bing lexicon,
# broken down by the lexicon's sentiment label.
simps_ep %>%
  select(word) %>%
  # Name the join key explicitly instead of relying on the implicit natural
  # join, which guesses the key and prints a "Joining, by" message.
  inner_join(bing, by = "word") %>%
  group_by(sentiment) %>%
  summarise(cnt = n())
## # A tibble: 2 x 2
## sentiment cnt
## <chr> <int>
## 1 negative 1176
## 2 positive 919
# Word frequency table: one row per (episode, word) pair holding the number
# of matching rows in simps_ep, ordered from most to least frequent.
# summarise() only peels off the last grouping level, so the result is left
# grouped by episode.
simps_cnt <- simps_ep %>%
  group_by(episode, word) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
分析四種字典每集情緒
# AFINN: box plot of the sentiment scores found in each episode.
# simps_cnt rows are matched to AFINN scores by word, then collapsed to one
# row per (episode, value) with the summed occurrence count in `cnt`.
# NOTE(review): `cnt` is not mapped to any aesthetic, so every distinct score
# weighs the same in the boxes no matter how often it occurs -- confirm that
# this is the intended statistic.
p_afinn <- simps_cnt %>%
  # Explicit join key; avoids the implicit natural join and its message.
  inner_join(afinn, by = "word") %>%
  group_by(episode, value) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_boxplot(aes(x = episode, y = value, colour = as.factor(episode))) +
  ggtitle("Afinn-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(
    text = element_text(family = "蘋方-繁 中黑體", size = 12),
    plot.title = element_text(hjust = 0.5)
  )

# Convert the ggplot object to an interactive plotly figure.
ggplotly(p_afinn)
- 對Afinn而言,第2, 12, 15集中位數高於總體;第4, 8集低於總體
# NRC: line chart with one line per NRC sentiment label.
# For each (episode, sentiment) pair, `cnt` is the summed occurrence count
# of the episode's words that carry that label in the lexicon.
p_nrc <- simps_cnt %>%
  # Explicit join key; avoids the implicit natural join and its message.
  inner_join(nrc, by = "word") %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_line(aes(x = episode, y = cnt, colour = sentiment)) +
  ggtitle("NRC-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(
    text = element_text(family = "蘋方-繁 中黑體", size = 12),
    plot.title = element_text(hjust = 0.5)
  )

# Convert the ggplot object to an interactive plotly figure.
ggplotly(p_nrc)
# Bing: line chart with one line per Bing sentiment label.
# For each (episode, sentiment) pair, `cnt` is the summed occurrence count
# of the episode's words that carry that label in the lexicon.
p_bing <- simps_cnt %>%
  # Explicit join key; avoids the implicit natural join and its message.
  inner_join(bing, by = "word") %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_line(aes(x = episode, y = cnt, colour = sentiment)) +
  ggtitle("Bing-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(
    text = element_text(family = "蘋方-繁 中黑體", size = 12),
    plot.title = element_text(hjust = 0.5)
  )

# Convert the ggplot object to an interactive plotly figure.
ggplotly(p_bing)
- 對Bing而言,差距最大的為第10集(負面),差距最小為第12集(中性)
# Loughran: line chart with one line per Loughran sentiment label.
# For each (episode, sentiment) pair, `cnt` is the summed occurrence count
# of the episode's words that carry that label in the lexicon.
p_loughran <- simps_cnt %>%
  # Explicit join key; avoids the implicit natural join and its message.
  inner_join(loughran, by = "word") %>%
  group_by(episode, sentiment) %>%
  summarise(cnt = sum(count)) %>%
  ggplot() +
  geom_line(aes(x = episode, y = cnt, colour = sentiment)) +
  ggtitle("Loughran-每集情緒詞數量差異") +
  scale_x_continuous(breaks = seq(1, 16, 1)) +
  theme(
    text = element_text(family = "蘋方-繁 中黑體", size = 12),
    plot.title = element_text(hjust = 0.5)
  )

# Convert the ggplot object to an interactive plotly figure.
ggplotly(p_loughran)
- 對Loughran而言,第14集最負面
- 整體來說,Afinn較為中性,NRC正面>負面,其餘字典則是負面>正面