library(tidytext)
library(janeaustenr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(gridExtra)
library(tidyr)
bing <- get_sentiments("bing")
debate <- read.csv("C:\\Users\\es901\\Documents\\dsR\\data\\debate.csv", head = T)
#只選取第一場辯論, 第三場辯論
debate <- filter(debate, Date == "9/26/16" | Date == "10/19/2016")
干擾發生的時候,可以從下面三種情況來分析文本資料
debate$Num <- as.integer(row.names(debate))
interrupt <- debate %>%
filter(Speaker == "CANDIDATES" | Text == "(CROSSTALK)" | grepl("\\.\\.\\.$", Text))
#定義被打斷的時候
interrupt$Location[grepl("\\.\\.\\.$", interrupt$Text)] <- "After" #表示"之後"被打斷
interrupt$Location[interrupt$Text == "(CROSSTALK)"] <- "Crosstalk" #表示兩個候選人皆在講話
deb.int <- full_join(debate, interrupt, by = c("Line", "Speaker", "Text", "Date", "Num"))
a <- levels(deb.int$Speaker)
deb.int <- as.data.table(deb.int)
deb.int <- deb.int[ , Culprit := ifelse(Location == "After", a[Speaker[shift(Num, n = 1, type = "lead")]], NA)]
deb.int <- deb.int[ , Culprit := ifelse(Location == "Crosstalk" & Speaker[shift(Num, n = 1, type = "lag")] == "Trump", "Clinton", Culprit)]
deb.int <- deb.int[ , Culprit := ifelse(Location == "Crosstalk" & Speaker[shift(Num, n = 1, type = "lag")] == "Clinton", "Trump", Culprit)]
#只選取第一場辯論
deb1 <- filter(deb.int, Date == "9/26/16")
deb1$Speaker <- factor(deb1$Speaker) # 去掉沒用到的level
table(deb1$Culprit, deb1$Speaker, useNA = "ifany", dnn = c("Interrupter", "Interrupted"))
## Interrupted
## Interrupter Audience CANDIDATES Clinton Holt Trump
## Audience 0 0 0 0 1
## CANDIDATES 0 0 1 3 3
## Clinton 0 5 0 7 4
## Holt 0 0 2 0 8
## Trump 0 1 27 27 0
## <NA> 17 3 67 61 115
#只選取第三場辯論
deb3 <- filter(deb.int, Date == "10/19/2016")
deb3$Speaker <- factor(deb3$Speaker) # 去掉沒用到的level
table(deb3$Culprit, deb3$Speaker, useNA = "ifany", dnn = c("Interrupter", "Interrupted"))
## Interrupted
## Interrupter Audience CANDIDATES Clinton Trump Wallace
## Audience 0 0 1 0 0
## Clinton 0 2 0 0 6
## Trump 0 5 8 0 12
## Wallace 0 0 2 8 0
## <NA> 7 3 73 123 113
int1 <- filter(deb1, !is.na(Culprit)) %>%
group_by(Speaker, Culprit) %>%
summarize(Interruptions = n())
int3 <- filter(deb3, !is.na(Culprit)) %>%
group_by(Speaker, Culprit) %>%
summarize(Interruptions = n())
deb1.plot <- ggplot(int1, aes(Speaker, Interruptions, fill = Culprit)) +
geom_bar(stat="identity") +
coord_flip(ylim = c(0, 38)) +
labs( x = "Person Interrupted",
y = "Number of Interruptions by Interrupter",
title = "The First Debate") +
scale_fill_manual(values = c("#6E8B3D", "#CD5555", "#4169E1", "#303030", "#E91D0E"))
deb3.plot <- ggplot(int3, aes(Speaker, Interruptions, fill = Culprit)) +
geom_bar(stat="identity") +
coord_flip(ylim = c(0, 40.5)) +
labs( x = "Person Interrupted",
y = "Number of Interruptions by Interrupter",
title = "The Third Debate") +
scale_fill_manual(values = c("#6E8B3D", "#4169E1", "#E91D0E", "#303030"))
grid.arrange(deb1.plot, deb3.plot, nrow = 2)
第一場辯論的主持人為Holt,第三場則是Wallace。
第一場辯論中可以看到川普不停打斷希拉蕊與主持的談話,且兩位候選人同時講話的時候(Speaker = CANDIDATES),幾乎都是希拉蕊試圖停止這種混亂的局面(希拉蕊打斷CANDIDATES的比例較川普高)。有趣的是,川普被主持人打斷最多次,而主持人的職責在於維持辯論會的秩序,可大略歸納出川普在第一場辯論中,是使得秩序混亂的罪魁禍首。
到了第三場辯論會,整體趨勢與第一次辯論會相同,但對於被打擾的講者,打斷的次數都大大減少了許多,而在混亂局面時,甚至是川普終結的比例較高,可能是川普的音量與氣勢壓過希拉蕊,使得在混亂局面的最後由川普掌握。根據這兩場辯論會,會覺得川普在“尊重言論”這方面需要多多加強!
用文字雲大略呈現兩個候選人各自偏愛使用的詞彙,只包含正反面的詞彙,像“I”、“you”、“are”、“Trump”、“Clinton”等則沒有呈現。
library(wordcloud)
library(RColorBrewer)
n <- 1
word <- c()
sentiment <- c()
speaker <- c()
date <- c()
line <- c()
for(i in (1:length(debate$Text)))
{
wordp <- gregexpr("[[:alpha:]]+", as.character(debate$Text[i]))
wordr <- unlist(regmatches(debate$Text[i], wordp))
df1 <- data.frame(word = wordr, stringsAsFactors = FALSE)
df2 <- inner_join(df1, bing, by = "word")
df3 <- mutate(df2, speaker = rep(as.character(debate$Speaker[i]), times = nrow(df2)),
date = rep(as.character(debate$Date[i]), times = nrow(df2)),
line = rep(as.character(debate$Line[i]), times = nrow(df2)))
if(length(df2$word) == 0)
{
word <- word
sentiment <- sentiment
speaker <- speaker
date <- date
line <- line
}
else
{
word[n:(n + length(df3$word) - 1)] <- df3$word
sentiment[n:(n + length(df3$word) - 1)] <- df3$sentiment
speaker[n:(n + length(df3$word) - 1)] <- df3$speaker
date[n:(n + length(df3$word) - 1)] <- df3$date
line[n:(n + length(df3$word) - 1)] <- df3$line
}
n <- n + length(df2$word)
}
result <- data.frame(word = word, sentiment = sentiment, speaker = speaker,
date = date, line = line)
#第一場辯論的 word cloud
Trump1 <- filter(result, speaker == "Trump" & date == "9/26/16")
Trump1.tb <- sort(table(Trump1$word), decreasing = T)
Trump1.tb <- Trump1.tb[Trump1.tb >= 4] #只選出現頻率4以上的
Clinton1 <- filter(result, speaker == "Clinton" & date == "9/26/16")
Clinton1.tb <- sort(table(Clinton1$word), decreasing = T)
Clinton1.tb <- Clinton1.tb[Clinton1.tb >= 4] #只選出現頻率4以上的
#紅色代表Trump(共和黨),藍色代表Clinton(民主黨)
par(mfrow = c(1, 2))
wordcloud(names(Trump1.tb), Trump1.tb,
colors = c("indianred1", "indianred2", "indianred3", "indianred"))
wordcloud(names(Clinton1.tb), Clinton1.tb,
colors= c("steelblue1", "steelblue2", "steelblue3", "steelblue"))
#第三場辯論的 word cloud
Trump3 <- filter(result, speaker == "Trump" & date == "10/19/2016")
Trump3.tb <- sort(table(Trump3$word), decreasing = T)
Trump3.tb <- Trump3.tb[Trump3.tb >= 4] #只選出現頻率4以上的
Clinton3 <- filter(result, speaker == "Clinton" & date == "10/19/2016")
Clinton3.tb <- sort(table(Clinton3$word), decreasing = T)
Clinton3.tb <- Clinton3.tb[Clinton3.tb >= 4] #只選出現頻率4以上的
par(mfrow = c(1, 2))
wordcloud(names(Trump3.tb), Trump3.tb,
colors = c("indianred1", "indianred2", "indianred3", "indianred"))
wordcloud(names(Clinton3.tb), Clinton3.tb,
colors= c("steelblue1", "steelblue2", "steelblue3", "steelblue"))
第一場辯論的文字雲中可以看出川普較常用like、bad、great、better、right等詞,而希拉蕊較常使用good、work、well、important、debt等詞;到了第三場辯論,川普依舊常用great、right、bad、like等詞,而希拉蕊的常用詞除了work以外,多了clear、undocumented、like、important等詞。
驚奇的是,川普辯論的文字雲中,respect這個詞的使用頻率從第一場幾乎沒有出現在文字雲中到第三場的使用頻率上升,對一向給人較激進印象的川普而言,這個結果有些令人意外,而與第一場辯論相比,高頻使用的字也較少。
而希拉蕊辯論的文字雲中,第三場undocumented這個詞的使用的上升,也許是她在極力反駁川普的攻擊,認為對方並無事實證據只是憑空推測,也不難看出兩方辯論的激烈程度,高頻使用的字量則無明顯變化。
分析在兩場辯論中,各候選人在用詞上的情感分佈。
df.sent <- result %>%
count(speaker, date, line, sentiment) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative)
#在同一段文字裡,正面的用詞個數減掉負面的用詞個數
deb1.sent <- df.sent %>%
filter(date == "9/26/16") %>%
filter(speaker %in% c("Clinton", "Trump"))
deb1.sent$line <- as.integer(deb1.sent$line)
deb1.sent <- arrange(deb1.sent, line)
deb3.sent <- df.sent %>%
filter(date == "10/19/2016") %>%
filter(speaker %in% c("Clinton", "Trump"))
deb3.sent$line <- as.integer(deb3.sent$line)
deb3.sent <- arrange(deb3.sent, line)
deb1.sent.plot <- ggplot(deb1.sent, aes(x = line, y = sentiment, fill = speaker)) +
geom_bar(stat = "identity") +
scale_x_continuous(breaks = seq(0, nrow(deb1), 25)) +
labs(x = "Line in Transcript",
y = "Sentiment",
title = "The First Debate") +
scale_fill_manual(values = c("#4169E1", "#E91D0E"))
deb3.sent.plot <- ggplot(deb3.sent, aes(x = line, y = sentiment, fill = speaker)) +
geom_bar(stat = "identity") +
scale_x_continuous(breaks = seq(0, nrow(deb3), 25)) +
labs(x = "Line in Transcript",
y = "Sentiment",
title = "The Third Debate") +
scale_fill_manual(values = c("#4169E1", "#E91D0E"))
grid.arrange(deb1.sent.plot, deb3.sent.plot, nrow = 2)
圖表為正面情緒詞彙與負面情緒詞彙相減後的結果。
第一場辯論,川普在整體上使用的正面情緒詞彙較負面多,其中前半部分較為明顯。希拉蕊在前段與後半部分使用比較多的正面情緒詞彙,負面情緒詞彙在中間段落較多。
第三場辯論,有趣的是川普在前段的兩種詞彙使用幅度如雲霄飛車般,呈現急劇變化的結果,這一刻使用較多的正面情緒詞彙,下一刻使用較多的負面情緒詞彙。希拉蕊在整體上正面情緒詞彙的使用略多一些,後段則有大量使用正面與負面情緒詞彙的時刻。
另外,與第一場辯論相比,川普在第三場辯論的負面情緒詞彙的使用變多了。希拉蕊第一場辯論時多量使用負面情緒詞彙的時刻集中在中段,第三場辯論則較為分散。