(1) Topic: The Railway Police Officer Stabbing Case
(2) Research Motivation

2019/7/3 - The incident: On the evening of July 3, 2019, 25-year-old police officer Lee Cheng-han (李承翰) boarded a Taiwan Railways Tze-Chiang express to handle a fare-evasion dispute. A passenger surnamed Cheng suddenly drew a knife and stabbed him in the abdomen, leaving him bleeding profusely with organs exposed. Despite his wounds, Lee kept struggling to restrain Cheng out of concern for the other passengers; he was rushed to hospital but died of his injuries.

2020/3/13 - The trial: At the hearing, a psychiatrist testified that the defendant suffers from schizophrenia, that his behavior at the time was driven by delusions and auditory hallucinations, and that during the attack he had lost the capacity to discern the wrongfulness of his act along with part of his capacity for self-control. The medical team's testimony would be a key factor in whether Cheng was found guilty.

2020/4/30 - First-instance verdict: "Killer of railway police officer Lee Cheng-han acquitted over schizophrenia, released on NT$500,000 bail!" Citing the defendant's schizophrenia, the judge acquitted him (subject to appeal) and ordered five years of compulsory psychiatric treatment; during the appeal period his detention could be lifted on NT$500,000 bail, failing which he would remain in custody.

2020/5/1 - The examining psychiatrist under attack: After the acquittal, forensic psychiatrist Shen Cheng-che (沈正哲) was pilloried online and posted a nearly 2,000-character statement lamenting that his professional judgment had not been respected.

2020/6/4 - Lee Cheng-han's father died, grief-stricken and indignant.

We therefore hope to use the data to examine, at each stage of this case, the main content and direction of media coverage and public opinion, and how the turning points shifted the topics and sentiment of the discussion.
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output
## [1] ""
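The chunks below call functions from a number of packages that are never attached in this section; a minimal setup block along the following lines is assumed (the package list is inferred from the calls used later, not shown in the original):

# Packages assumed by the chunks below (inferred from the function calls)
library(readr)        # read_csv, read_file
library(dplyr)        # pipes, group_by, summarise, filter
library(tidyr)        # spread, gather
library(stringr)      # str_detect, str_extract
library(ggplot2)      # all plots
library(scales)       # percent_format, date_format
library(tidytext)     # unnest_tokens, cast_dtm, tidy, reorder_within
library(jiebaR)       # worker, new_user_word, segment
library(topicmodels)  # LDA, posterior
library(widyr)        # pairwise_count, pairwise_cor
library(igraph)       # graph_from_data_frame, cluster_fast_greedy
library(ggraph)       # ggraph, geom_edge_link, geom_node_point
library(reshape2)     # melt, acast
library(wordcloud)    # comparison.cloud
library(RColorBrewer) # brewer.pal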
## Data source: PTT Gossiping board (八卦版), search keyword "李承翰_鐵路警"
Posts <- read_csv('./李承翰_鐵路警_articleMetaData.csv')
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## commentNum = col_double(),
## push = col_double(),
## boo = col_double(),
## sentence = col_character()
## )
head(Posts)
## # A tibble: 6 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 2 [新聞]快訊/~ 2019-07-03 16:32:53 https~ DDDDRR Gossi~ 1496 1048 103
## 3 [問卦]刺死鐵~ 2019-07-03 16:40:34 https~ Elsie1999 Gossi~ 290 120 17
## 4 Re:[新聞]~ 2019-07-03 16:42:58 https~ ziggs1222 Gossi~ 29 4 20
## 5 [新聞]嘉義鐵~ 2019-07-03 17:16:55 https~ lilskies Gossi~ 29 2 11
## 6 [問卦]鐵路警~ 2019-07-03 17:37:29 https~ v963610 Gossi~ 72 18 13
## # ... with 1 more variable: sentence <chr>
Reviews <- read_csv('./李承翰_鐵路警_articleReviews.csv')
## Parsed with column specification:
## cols(
## artTitle = col_character(),
## artDate = col_date(format = ""),
## artTime = col_time(format = ""),
## artUrl = col_character(),
## artPoster = col_character(),
## artCat = col_character(),
## cmtPoster = col_character(),
## cmtStatus = col_character(),
## cmtDate = col_datetime(format = ""),
## cmtContent = col_character()
## )
## Warning: 16123 parsing failures.
## row col expected actual file
## 11047 cmtDate date like 2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11048 cmtDate date like 2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11049 cmtDate date like 2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11050 cmtDate date like 2020/3/13 22:55 './李承翰_鐵路警_articleReviews.csv'
## 11051 cmtDate date like 2020/3/13 22:56 './李承翰_鐵路警_articleReviews.csv'
## ..... ....... .......... ............... ....................................
## See problems(...) for more details.
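The 16,123 parsing failures come from cmtDate values such as 2020/3/13 22:55, which do not match the datetime format read_csv guessed from the first rows. One workaround, a sketch assuming the lubridate package and that only the slash- and dash-separated formats occur in the file, is to read the column as character and parse it explicitly:

library(lubridate)
# Read cmtDate as plain text, then try the formats observed in the file
Reviews <- read_csv('./李承翰_鐵路警_articleReviews.csv',
                    col_types = cols(cmtDate = col_character())) %>%
  mutate(cmtDate = parse_date_time(cmtDate, orders = c("Y/m/d H:M", "Y-m-d H:M:S")))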
head(Reviews)
## # A tibble: 6 x 10
## artTitle artDate artTime artUrl artPoster artCat cmtPoster cmtStatus
## <chr> <date> <time> <chr> <chr> <chr> <chr> <chr>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ gankgf 推
## 2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ saya2185 →
## 3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Leo4891 推
## 6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ iam0718 推
## # ... with 2 more variables: cmtDate <dttm>, cmtContent <chr>
Posts %>%
group_by(artDate) %>%
summarise(count = n())%>%
ggplot(aes(artDate,count))+
geom_line(color="red")+
geom_point()
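Since the study centers on three turning points (the attack, the first-instance verdict, and the father's death), it can help to mark them on the daily post-count line; a sketch using the dates from the timeline above:

# Mark the three turning points on the daily post-count plot
key_dates <- as.Date(c('2019-07-03', '2020-04-30', '2020-06-04'))
Posts %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot(aes(artDate, count)) +
  geom_line(color = "red") +
  geom_point() +
  geom_vline(xintercept = key_dates, linetype = "dashed", color = "gray40")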
Posts Tokenization
# Drop "Re:" (reply) articles
Posts <- Posts %>%
filter(!substr(artTitle,start=1,stop=3) %in% c("Re:"))
# Tokenize article by article
# All PTT articles related to the stabbing case were collected; duplicates appearing
# on the same day could be removed with distinct() (left commented out here):
#Posts <- Posts %>%
# distinct(artTitle, artDate, sentence)
# Load the custom dictionary and the stop-word list
mask_lexicon <- scan(file = "./mask_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
# Build the tokenizer with the custom dictionary; stop words are removed inside the tokenizer
jieba_tokenizer = worker(write = "NOFILE") #worker()
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
    character(0)  # return an empty vector (not NULL) for very short strings
  })
}
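A quick sanity check on a sample sentence (illustrative; the exact segmentation depends on the dictionary files) shows what the tokenizer returns:

# Illustrative check: segment one sentence; returns a list with one character vector
chi_tokenizer("鐵路警察遭刺死案引發社會高度關注")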
tokens <- Posts %>%
unnest_tokens(word, sentence, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]")))
tokens %>%head(20)
## # A tibble: 20 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 7 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 8 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 9 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 10 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 11 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 12 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 13 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 14 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 15 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 16 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 17 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 18 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 19 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 20 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## # ... with 1 more variable: word <chr>
# Compare word-usage proportions before vs. after the first-instance verdict (2020/4/30)
frequency <- tokens %>%
mutate(event = ifelse(artDate < as.Date('2020-04-30'), "First", "Last")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(event, word) %>%
group_by(event) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(event, proportion) %>%
gather(event, proportion, `Last`)
frequency
## # A tibble: 4,599 x 4
## word First event proportion
## <chr> <dbl> <chr> <dbl>
## 1 电视 NA Last 0.0000919
## 2 丁允 NA Last 0.000368
## 3 丁世傑 0.000122 Last NA
## 4 丁怡銘 NA Last 0.000276
## 5 丁怡銘今 NA Last 0.0000919
## 6 丁偉杰 NA Last 0.000184
## 7 丁盛豐 0.000122 Last NA
## 8 丁勝豐 0.000122 Last NA
## 9 了不起 NA Last 0.0000919
## 10 了會 0.000122 Last NA
## # ... with 4,589 more rows
# Plot the comparison
ggplot(frequency, aes(x = proportion, y = `First`, color = abs(`First` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "Before the verdict", x = "After the verdict")
## Warning: Removed 3601 rows containing missing values (geom_point).
## Warning: Removed 3602 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
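The font warnings appear because Heiti TC Light is a macOS font that Windows does not ship; one workaround on Windows (assuming Microsoft JhengHei is installed, as it is on Traditional Chinese systems) is to register an available CJK font and pass that family instead:

# Register an installed CJK font under an alias usable by ggplot (Windows only),
# then pass family = "JhengHei" to geom_text() instead of "Heiti TC Light"
windowsFonts(JhengHei = windowsFont("Microsoft JhengHei"))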
Words with notably higher frequency before vs. after the first-instance verdict:
Before the verdict: 電擊槍 (taser), 告別式 (funeral), 旅客 (passengers)
After the verdict: 法官 (judge), 判決 (verdict), 父親 (father), 李增文 (Lee's father)
Before the verdict, media coverage ran in two main directions:
1. The course of the incident
2. Death in the line of duty and the working conditions of railway police
After the verdict, coverage again ran in two main directions:
1. Interpretation of the ruling and its social impact
2. The death of Lee's father
Posts
## # A tibble: 128 x 10
## artTitle artDate artTime artUrl artPoster artCat commentNum push boo
## <chr> <date> <time> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ 133 57 8
## 2 [新聞]快訊/~ 2019-07-03 16:32:53 https~ DDDDRR Gossi~ 1496 1048 103
## 3 [問卦]刺死鐵~ 2019-07-03 16:40:34 https~ Elsie1999 Gossi~ 290 120 17
## 4 [新聞]嘉義鐵~ 2019-07-03 17:16:55 https~ lilskies Gossi~ 29 2 11
## 5 [問卦]鐵路警~ 2019-07-03 17:37:29 https~ v963610 Gossi~ 72 18 13
## 6 [問卦]殺鐵路~ 2019-07-03 17:41:58 https~ Lcyy Gossi~ 37 10 6
## 7 [新聞]【鐵路~ 2019-07-03 18:08:00 https~ vic2211 Gossi~ 797 473 52
## 8 [新聞]【鐵路~ 2019-07-03 18:28:28 https~ ash99119~ Gossi~ 183 70 47
## 9 [新聞]哀!身~ 2019-07-03 18:44:46 https~ DOOHDLIHC Gossi~ 60 26 2
## 10 [新聞]台鐵持~ 2019-07-03 19:08:27 https~ friedrich Gossi~ 280 103 68
## # ... with 118 more rows, and 1 more variable: sentence <chr>
allPoster <- c(Posts$artPoster, Reviews$cmtPoster)
#length(unique(allPoster))
# Tag every user that appears in the data:
# users who have posted at least one article are tagged "poster";
# users who only commented are tagged "replyer"
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%Posts$artPoster, "poster", "replyer"))
#userList
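A quick tally (illustrative) shows how the user base splits between the two roles:

# Illustrative check: number of users who posted articles vs. comment-only users
table(userList$type)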
# Join articles and comments on artUrl (kept for reference; Reviews already carries the article metadata)
#posts_Reviews <- merge(x = Posts, y = Reviews, by = "artUrl")
posts_Reviews<-Reviews
#posts_Reviews
#Reviews_sentences <- strsplit(Reviews$cmtContent,"[。!;?!?;,:]")
#Reviews_sentences <- data.frame(
# artTitle = rep(Reviews$artTitle, sapply(Reviews_sentences, length)),
# artDate = rep(Reviews$artDate, sapply(Reviews_sentences, length)),
# artTime = rep(Reviews$artTime, sapply(Reviews_sentences, length)),
# artUrl = rep(Reviews$artUrl, sapply(Reviews_sentences, length)),
# cmtContent = unlist(Reviews_sentences)
# ) %>%
# filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))
#Reviews_sentences$cmtContent <- as.character (Reviews_sentences$cmtContent)
# Load the custom dictionary and stop-word list again, this time for the comment corpus
mask_lexicon <- scan(file = "./mask_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words <- scan(file = "./stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
# Build the tokenizer with the custom dictionary; stop words are removed inside the tokenizer
jieba_tokenizer = worker(write = "NOFILE") #worker()
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
# Same logic as chi_tokenizer above, applied to comment text
Reviews_tokenizer <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens) > 1]
      return(tokens)
    }
    character(0)  # return an empty vector (not NULL) for very short strings
  })
}
tokens <- Reviews %>%
unnest_tokens(word, cmtContent, token=Reviews_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) #%>%
#count(artUrl, word) %>%
#rename(count=n)
tokens %>%head(20)
## # A tibble: 20 x 10
## artTitle artDate artTime artUrl artPoster artCat cmtPoster cmtStatus
## <chr> <date> <time> <chr> <chr> <chr> <chr> <chr>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ gankgf 推
## 2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ saya2185 →
## 3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 7 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 8 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 9 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 10 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Leo4891 推
## 11 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Leo4891 推
## 12 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Leo4891 推
## 13 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Leo4891 推
## 14 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ iam0718 推
## 15 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ WolfTeac~ →
## 16 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ WolfTeac~ →
## 17 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ WolfTeac~ →
## 18 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ darkbrig~ →
## 19 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ darkbrig~ →
## 20 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ darkbrig~ →
## # ... with 2 more variables: cmtDate <dttm>, word <chr>
# Compare word-usage proportions before vs. after the first-instance verdict (2020/4/30)
frequency <- tokens %>%
mutate(event = ifelse(artDate < as.Date('2020-04-30'), "First", "Last")) %>%
filter(nchar(.$word)>1) %>%
mutate(word = str_extract(word, "[^0-9a-z']+")) %>%
mutate(word = str_extract(word, "^[^一二三四五六七八九十]+")) %>%
count(event, word) %>%
group_by(event) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(event, proportion) %>%
gather(event, proportion, `Last`)
frequency
## # A tibble: 14,920 x 4
## word First event proportion
## <chr> <dbl> <chr> <dbl>
## 1 晓得 NA Last 0.0000199
## 2 没事 0.0000334 Last NA
## 3 没罪 0.0000334 Last NA
## 4 祢們 0.0000334 Last NA
## 5 丁世傑 0.0000334 Last NA
## 6 丁蟹 NA Last 0.0000199
## 7 了不起 0.000134 Last 0.0000596
## 8 了吧 0.000267 Last 0.000139
## 9 了沒 0.0000668 Last 0.000159
## 10 了呢 NA Last 0.0000199
## # ... with 14,910 more rows
# Plot the comparison
ggplot(frequency, aes(x = proportion, y = `First`, color = abs(`First` - proportion))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5, family="Heiti TC Light") +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none") +
labs(y = "Before the verdict", x = "After the verdict")
## Warning: Removed 11314 rows containing missing values (geom_point).
## Warning: Removed 11315 rows containing missing values (geom_text).
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
Reviews
## # A tibble: 27,169 x 10
## artTitle artDate artTime artUrl artPoster artCat cmtPoster cmtStatus
## <chr> <date> <time> <chr> <chr> <chr> <chr> <chr>
## 1 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ gankgf 推
## 2 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ saya2185 →
## 3 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 4 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ xu3 →
## 5 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Leo4891 推
## 6 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ iam0718 推
## 7 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ WolfTeac~ →
## 8 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ darkbrig~ →
## 9 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ Mradult 推
## 10 [新聞]男子持~ 2019-07-03 07:05:46 https~ kuro Gossi~ darkbrig~ →
## # ... with 27,159 more rows, and 2 more variables: cmtDate <dttm>,
## # cmtContent <chr>
tokens <- Reviews %>%
unnest_tokens(word, cmtContent, token=Reviews_tokenizer) %>%
#filter(!str_detect(word, regex("[0-9a-zA-Z]"))) #%>%
count(artDate,artUrl, word) %>%
rename(count=n)
word_count <- tokens %>%
select(artUrl,word,count) %>%
group_by(artUrl,word) %>%
summarise(count = sum(count)) %>%
filter(count>3) %>% # drop words that appear too few times
arrange(desc(count))
word_count
## # A tibble: 3,286 x 3
## # Groups: artUrl [127]
## artUrl word count
## <chr> <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html rip 308
## 2 https://www.ptt.cc/bbs/Gossiping/M.1591280234.A.753.html 法官 237
## 3 https://www.ptt.cc/bbs/Gossiping/M.1591280741.A.9DA.html 法官 180
## 4 https://www.ptt.cc/bbs/Gossiping/M.1563244035.A.811.html qq 165
## 5 https://www.ptt.cc/bbs/Gossiping/M.1591283225.A.36E.html 法官 142
## 6 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html r.i.p 137
## 7 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html r.i.p. 116
## 8 https://www.ptt.cc/bbs/Gossiping/M.1591334261.A.915.html 裝病 107
## 9 https://www.ptt.cc/bbs/Gossiping/M.1591341952.A.398.html 速食店 107
## 10 https://www.ptt.cc/bbs/Gossiping/M.1562246171.A.40E.html 警察 98
## # ... with 3,276 more rows
# Keep only words that appear in the corpus more than 3 times
reserved_word <- tokens %>%
  group_by(word) %>%
  count() %>%
  filter(n > 3) %>%
  pull(word)
mask_removed <- tokens %>%
filter(word %in% reserved_word)
tokens_dtm <- mask_removed %>% cast_dtm(artUrl, word, count)
tokens_dtm
## <<DocumentTermMatrix (documents: 154, terms: 2739)>>
## Non-/sparse entries: 29979/391827
## Sparsity : 93%
## Maximal term length: 6
## Weighting : term frequency (tf)
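The LDA below uses k = 4, an analyst's choice; a rough way to sanity-check it (a sketch using topicmodels' perplexity(), where lower is better, though interpretability of the topics usually matters more than the raw score) is to compare a few values of k:

# Rough check on the number of topics: fit several models and compare perplexity
sapply(c(2, 4, 6, 8), function(k) {
  m <- LDA(tokens_dtm, k = k, control = list(seed = 1234))
  perplexity(m, tokens_dtm)
})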
report_lda <- LDA(tokens_dtm, k = 4, control = list(seed = 1234))
# Drop very common or noisy terms before ranking the top words per topic
tidy(report_lda, matrix = "beta") %>%
filter(! term %in% c("警察","垃圾","台灣","法官","殺人","以後","韓粉","蔡英文","25","問題","霸氣","政府","幹話","消費","幹你娘","")) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
# Topic labels read off the top terms: "mentally ill killer acquitted",
# "support police use of firearms", "judicial reform", "rest in peace"
topic_name_a = c('精神病殺人無罪','支持用槍','司法改革','願死者安息')
tmResult_a <- posterior(report_lda)
doc_pro_a <- tmResult_a$topics
dim(doc_pro_a)
## [1] 154 4
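Each row of doc_pro_a is one article's distribution over the four topics; an illustrative way to see how many threads each topic dominates:

# Illustrative check: count articles by their highest-probability topic (1-4)
table(apply(doc_pro_a, 1, which.max))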
#reviews
document_topics_a <- doc_pro_a[Reviews$artUrl,]
document_topics_df_a =data.frame(document_topics_a)
colnames(document_topics_df_a) = topic_name_a
rownames(document_topics_df_a) = NULL
news_topic_a = cbind(Reviews,document_topics_df_a)
mycolors_t <- colorRampPalette(brewer.pal(6, "Set3"))(12)
news_topic_a %>%
filter(artDate=='2019-07-03'|artDate=='2020-04-30'|artDate=='2020-06-04')%>%
group_by(artDate = format(artDate,'%Y%m%d')) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "artDate")%>%
group_by(artDate)%>%
mutate(total_value =sum(value))%>%
ggplot( aes(x=artDate, y=value/total_value, fill=variable)) +
geom_bar(stat = "identity" , width = 0.8) +
ylab("proportion") +
scale_fill_manual(values=mycolors_t)+
theme(axis.text.x = element_text(angle = 90, hjust = 0.5))
# Columns 11-14 hold the four topic probabilities appended by cbind() above;
# keep comments strongly assigned (probability > 0.8) to a single topic
alytopic_a<-news_topic_a[news_topic_a[11]>0.8 | news_topic_a[12]>0.8 | news_topic_a[13]>0.8 | news_topic_a[14]>0.8,]%>%
filter(cmtStatus!="噓")
alytopic_re<-alytopic_a%>%
mutate(topicna = ifelse(as.double(unlist(alytopic_a[11]))>0.8, "1",
ifelse(as.double(unlist(alytopic_a[12]))>0.8, "2",
ifelse(as.double(unlist(alytopic_a[13]))>0.8, "3",
ifelse(as.double(unlist(alytopic_a[14]))>0.8, "4", "0")))))%>%
select(artUrl,cmtPoster,topicna)
alytopic_graf<-alytopic_re%>%
group_by(cmtPoster, artUrl) %>%
filter(n()>5) %>% # keep users with more than 5 comments on the same article
ungroup()%>%
select(topicna,cmtPoster)%>%
unique()
g_alytopicAll<-graph_from_data_frame(d=alytopic_graf, directed=F)
ceb2 <- cluster_fast_greedy(g_alytopicAll)
V(g_alytopicAll)$shape <- ifelse(V(g_alytopicAll)$name %in% c("1","2","3","4") , "square", "circle")
V(g_alytopicAll)$label <- ifelse(V(g_alytopicAll)$name %in% c("1","2","3","4") , V(g_alytopicAll)$name, "")
V(g_alytopicAll)$size <- ifelse( V(g_alytopicAll)$name %in% c("1","2","3","4") , 12,8 )
plot(ceb2,g_alytopicAll)
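The fast-greedy result can also be inspected numerically; an illustrative look at the detected community structure:

# Illustrative check: number of communities, their sizes, and the modularity score
length(ceb2)
sizes(ceb2)
modularity(ceb2)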
# Positive-word dictionary (txt file, words separated by commas)
P <- read_file("./positive.txt")
# Negative-word dictionary (txt file)
N <- read_file("./negative.txt")
# Split each string on commas;
# strsplit returns a list, so take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment; the word column holds the dictionary vector
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
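A quick look (illustrative) at the combined dictionary before joining:

# Illustrative check: dictionary size per polarity
table(LIWC$sentiment)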
Join the corpus with the LIWC sentiment dictionary to tag each matched word as positive or negative.
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## # A tibble: 512 x 4
## # Groups: artUrl [77]
## artUrl word count sentiment
## <chr> <chr> <int> <fct>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.ht~ 垃圾 78 negative
## 2 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.ht~ 難過 60 negative
## 3 https://www.ptt.cc/bbs/Gossiping/M.1591327929.A.29C.ht~ 心痛 60 negative
## 4 https://www.ptt.cc/bbs/Gossiping/M.1591280741.A.9DA.ht~ 難過 57 negative
## 5 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.ht~ 死刑 55 negative
## 6 https://www.ptt.cc/bbs/Gossiping/M.1562206442.A.05D.ht~ 難過 47 negative
## 7 https://www.ptt.cc/bbs/Gossiping/M.1591280741.A.9DA.ht~ 垃圾 46 negative
## 8 https://www.ptt.cc/bbs/Gossiping/M.1591283225.A.36E.ht~ 垃圾 43 negative
## 9 https://www.ptt.cc/bbs/Gossiping/M.1591283225.A.36E.ht~ 難過 41 negative
## 10 https://www.ptt.cc/bbs/Gossiping/M.1562506779.A.39D.ht~ 大膽 37 positive
## # ... with 502 more rows
tokens %>% inner_join(LIWC) %>%
  group_by(artDate, sentiment) %>%
  summarise(count = sum(count)) %>%  # aggregate to one value per day and polarity
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
tokens %>%
filter(artDate == as.Date('2019/07/03')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
count = n()
) %>% data.frame() %>%
top_n(30,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
The number of positive words is clearly far lower than the number of negative words.
sent_cloud <- word_count %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(count=sum(count)) %>%
acast(word ~ sentiment, value.var = "count", fill = 0) %>%
comparison.cloud(colors = c("gray80", "gray20"),
max.words = 100)
## Joining, by = "word"
Word Correlation
# Count how often two words co-occur in the same article
word_pairs <- word_count %>%
  ungroup() %>%
  pairwise_count(word, artUrl, sort = TRUE)
word_pairs
## # A tibble: 41,880 x 4
## # Groups: artUrl [88]
## artUrl item1 item2 n
## <chr> <chr> <chr> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 殺人 老人 1
## 2 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 老人 殺人 1
## 3 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 殺警 判死 1
## 4 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 判死 殺警 1
## 5 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 補票 司法 1
## 6 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 司法 補票 1
## 7 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 媽的 年輕人 1
## 8 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 年輕人 媽的 1
## 9 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 希望 兇手 1
## 10 https://www.ptt.cc/bbs/Gossiping/M.1562200736.A.F41.html 兇手 希望 1
## # ... with 41,870 more rows
ptt_word_cors <- word_count %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
ptt_word_cors_filter <- ptt_word_cors %>%
# Pick a few focal words and find each one's most correlated terms
filter(item1 %in% c("警察", "法官", "醫生", "家屬", "精神病")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation))%>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, ncol = 2, scales = "free") +
coord_flip()+
# Set a CJK font so Chinese characters render correctly
theme(text = element_text(family = "Heiti TC Light"))
## Selecting by correlation
ptt_word_cors_filter
Terms correlated with 法官 (judge) and 精神病 (mental illness) revolve around the judiciary, the death penalty, and its abolition.
Terms correlated with 家屬 (family) and 警察 (police) are dominated by negative words expressing dissatisfaction with the government.
# Show word pairs with correlation above 0.4
set.seed(2020)
ptt_word_cors %>%
filter(correlation > 0.4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # CJK font so Chinese labels render correctly
theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database