I. Tasks and Required Skills
1. Word segmentation (segment analysis):
   Split the text of each Facebook post into "words" (the smallest meaningful language units that can be used on their own). A short jiebaR sketch follows this list.
   1. Familiarity with segmentation models, including (but not limited to): maximum probability (MPSegment), hidden Markov model (HMMSegment), mixed model (MixSegment), and index model (QuerySegment)
   2. Familiarity with text-mining packages, including (but not limited to): tm, tidy, jiebaR
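A minimal sketch of how these models map onto jiebaR's worker() (the example sentence is illustrative only and not part of the deliverable):
library(jiebaR)
# The four segmentation models listed above, selected via the `type` argument.
seg_mp    <- worker(type = "mp")     # maximum probability
seg_hmm   <- worker(type = "hmm")    # hidden Markov model
seg_mix   <- worker(type = "mix")    # mixed model (jiebaR's default)
seg_query <- worker(type = "query")  # index / query segmentation
segment("我們一起去投票", seg_mix)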
2. Keyword frequency counts: using the keyword list provided by this study, count the number of times each keyword appears in each post (a small sketch follows the example below).
   → Example:
   Given the keyword 「我們」, the returned file should add a column recording how many times 「我們」 appears in each post.
   For instance, 「我們」 appears 0, 0, and 2 times in the first, second, and third posts respectively.
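A rough sketch of this counting step with stringr; the `posts` data frame and its `content` column are placeholders, not the study's actual file:
library(dplyr)
library(stringr)
# Hypothetical three-post data frame mirroring the example above.
posts <- data.frame(content = c("今天天氣很好", "大家加油,謝謝", "我們一起努力,我們做得到"),
                    stringsAsFactors = FALSE)
# Append a column counting how many times the keyword 「我們」 appears in each post.
posts <- posts %>% mutate(count_we = str_count(content, fixed("我們")))
posts$count_we   # 0 0 2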
3. Keyword presence: using the keyword list provided by this study, mark whether each keyword appears in each post. → Example: given the keyword 「我們」, the returned file should add a column marking whether 「我們」 appears in each post (a sketch follows the example below).
   ✓ For instance, the first and second posts do not contain 「我們」 (marked 0), while the third post does (marked 1).
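Continuing the same hypothetical `posts` data frame, the presence flag is the same idea with str_detect() instead of str_count():
# Mark 1 if 「我們」 appears at least once in the post, otherwise 0.
posts <- posts %>% mutate(has_we = as.integer(str_detect(content, fixed("我們"))))
posts$has_we   # 0 0 1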
4. Sentiment analysis: using the sentiment keyword dictionary provided by this study, weight the positive and negative sentiment words that appear in each post and compute a sentiment score per post (see the sketch after this list).
   ✓ Must be able to remove stopwords and segment the text
   ✓ Must be able to apply the terms of the specified dictionary
   ✓ Must be able to weight the dictionary terms by positive/negative polarity
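A minimal sketch of the weighting idea with a toy dictionary; the NTUSD-based implementation appears later in this write-up, and the objects below (`toy_dict`, `post_tokens`) are placeholders:
library(dplyr)
# Toy sentiment dictionary: +1 for positive words, -1 for negative words.
toy_dict <- data.frame(word = c("加油", "努力", "失望"),
                       weight = c(1, 1, -1),
                       stringsAsFactors = FALSE)
# Assume one row per (post url, word) after segmentation and stopword removal.
post_tokens <- data.frame(url  = c("p1", "p1", "p2"),
                          word = c("加油", "失望", "努力"),
                          stringsAsFactors = FALSE)
post_tokens %>%
  inner_join(toy_dict, by = "word") %>%
  group_by(url) %>%
  summarise(sentiment_score = sum(weight))   # p1: 0, p2: 1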
5. Per-candidate totals of keyword occurrences by category: for each candidate, sum the occurrences of all keywords in each category (see the sketch after this item).
   → Example: the category "1-01 mentions own name" contains the three keywords 「柯文哲」, 「文哲」, and 「阿北」. The returned file should add, after the category block of each of the 14 candidates (柯文哲, 姚文智, etc.), one column for "total keyword occurrences" and one for "number of distinct keywords that occur". For instance, if across all posts on 柯文哲's fan page 「柯文哲」 appears 0 times, 「柯P」 4 times, and 「阿北」 1 time, then "total keyword occurrences" is 5 (= 0 + 4 + 1) and "number of distinct keywords" is 2 (two different keywords appear).
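Sketching the two requested summary columns on a toy per-page keyword-count table (the object and column names here are hypothetical):
library(dplyr)
# Hypothetical totals of the 1-01 keywords across all posts on 柯文哲's fan page.
ko_counts <- data.frame(keyword = c("柯文哲", "柯P", "阿北"),
                        count   = c(0, 4, 1),
                        stringsAsFactors = FALSE)
ko_counts %>%
  summarise(total_mentions = sum(count),       # 5 = 0 + 4 + 1
            keyword_types  = sum(count > 0))   # 2 distinct keywords appear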
library(xlsx)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(data.table)
library(reshape2)
library(scales)
library(tm)
library(tmcn)
library(lubridate)
# July data
july_data <- read.xlsx("data/07.xlsx", sheetIndex = 1, encoding = "UTF-8")
# August data
# aug_data <- read.xlsx("data/08.xlsx", sheetIndex = 1, encoding = "UTF-8")
Keep only the politicians' own posts and select these columns: url, content, topicBy.
colnames(july_data)[1:5] <- c("note", "dpm", "day", "mid", "id")
july <- july_data[is.na(july_data$note),] %>%
select(url, content, topicBy)
First problem encountered:
Emoji show up as garbled characters and are not part of the analysis, so they have to be filtered out.
Reference: the Unicode code points of all emoji.
Even after removing emoji, some content still contains encoding errors that were already present in the source file, for example:
july_sentences <- july %>%
# remove emoji
mutate(sentence = gsub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", content)) %>%
# remove non UTF-8
mutate(sentence = gsub("[^[:alnum:][:blank:]?&/\\-]", "", sentence))
# july_sentences <- strsplit(july_sentences$sentence, "[。!;?!?;]")
#
# july_sentences <- data.frame(
# url = rep(july$url, sapply(july_sentences, length)),
# topicBy = rep(july$topicBy, sapply(july_sentences, length)),
# sentence = unlist(july_sentences)
# )
Filter out tabs and newlines once more, and convert sentence to character format.
july_sentences <- july_sentences %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
july_sentences$sentence <- as.character(july_sentences$sentence)
Load the first category of keywords: terms the candidates use to refer to themselves.
their_names <- scan(file = "dict/their_names.txt",
what = character(),
sep = "\n",
encoding = "utf-8",
fileEncoding = "utf-8")
head(their_names)
## [1] "我" "我們" "自己" "小弟" "小妹" "偶"
This keyword dictionary was preprocessed with Python beforehand; all of the candidates' nicknames serve as keywords, 323 terms in total.
candidate_name <- scan(file = "dict/candidate_name_new.txt",
what = character(),
sep = "\n",
encoding = "utf-8",
fileEncoding = "utf-8")
head(candidate_name)
## [1] "丁爺爺" "丁能兒" "叮手中" "輸不起" "subuki" "老狐狸"
As a first pass, segment with jieba and add the dictionaries above as user-defined words.
jieba_tokenizer <- worker()
new_user_word(jieba_tokenizer, c(their_names, candidate_name))
## [1] TRUE
The Chinese tokenizer function:
# Tokenize each element of a character vector with the jieba worker defined above.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # tokens <- tokens[nchar(tokens) > 1]  # optional: drop single-character tokens
    return(tokens)
  })
}
Tokenize sentence, filter English letters and digits out of the tokens, and compute term frequencies per post.
tokens <- july_sentences %>%
unnest_tokens(word, sentence, token = chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(url, word) %>%
rename(count = n)
Convert the loaded dictionaries into data frames and combine them into a single dictionary.
their_names_df <- data.frame(word=their_names)
candidate_name_df <- data.frame(word=candidate_name)
all_keys_df <- rbind(their_names_df, candidate_name_df)
Keep only the keywords our study needs, and spread the values into one column per keyword.
dcast_tokens <- tokens %>%
inner_join(all_keys_df) %>%
dcast(url ~ word, value.var = "count", fill = 0)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Aggregation function missing: defaulting to length
non_cols <- all_keys_df %>%
anti_join(tokens) %>%
dcast(. ~ word) %>%
select(-1)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Using word as value column: use value.var to override.
## Aggregation function missing: defaulting to length
# set all of them to 0
non_cols[1:nrow(non_cols),1:ncol(non_cols)] <- 0
# merge back into the original tokens
dcast_tokens <- cbind(dcast_tokens, non_cols)
output_count <- july %>%
left_join(dcast_tokens, by=c("url"))
output_count[is.na(output_count)] <- 0
75 posts contain none of the keywords; fill their NAs with 0.
output_exist <- tokens %>%
mutate(exists = ifelse(word %in% all_keys_df$word, 1, 0)) %>%
filter(exists == 1) %>%
dcast(url ~ word, value.var = "exists", fill = 0)
temp <- july %>%
left_join(output_exist) %>%
replace(is.na(.), 0)
## Joining, by = "url"
A total of 74 posts contain none of these keywords, so the left join yields NA; fill those NAs with 0.
# write.csv(output_count, file="output/output_count_07.csv", fileEncoding = "UTF-8")
write.csv(output_exist, file="output/output_exist_07.csv", fileEncoding = "UTF-8")
url_person <- july %>% select(url, topicBy)
tokens <- tokens %>% left_join(url_person) %>% filter(!is.na(word))
## Joining, by = "url"
tokens_count <- tokens %>%
# filter(nchar(.$word) > 1) %>%
group_by(word, topicBy) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>%
arrange(topicBy, desc(sum))
tokens_count %>%
filter(word=="我") %>%
ggplot(aes(word, sum, fill=topicBy)) +
geom_col() +
facet_grid(topicBy~.) +
xlab(NULL) +
coord_flip()
tokens_count %>%
filter(word=="我們") %>%
ggplot(aes(word, sum, fill=topicBy)) +
geom_col() +
facet_grid(topicBy~.) +
xlab(NULL) +
coord_flip()
import pandas as pd
import numpy as np

keywords = pd.read_excel("/Users/yuzhe/Downloads/資工背景工讀生_關鍵字整理列表(初步建構中)__20191111.xlsx",
                         sheet_name="1-02、1-04、1-06 主要政治人物")
keywords = keywords[['子類目', '關鍵字', '(此部分為綽號須注意) 負面關鍵字', '(此部分為綽號須注意) 中性 or 正面關鍵字']]
keywords.columns = ['name', 'first_name', 'negative_nick', 'positive_nick']
# Concatenate the three keyword columns into one " or "-separated string per candidate
keywords['keyword'] = (keywords['first_name'].astype(str) + ' or '
                       + keywords['negative_nick'].astype(str) + ' or '
                       + keywords['positive_nick'].astype(str))
# Split the combined string so each nickname becomes its own (name, keyword) row
name_nick = pd.DataFrame({'name': keywords['name'],
                          'keyword': keywords['keyword'].str.split(' or ')}).explode('keyword')
# Drop the "nan" strings produced by empty cells, then write out the mapping
name_nick.replace('nan', np.NaN, inplace=True)
name_nick.dropna(how='any', inplace=True)
name_nick.to_csv('cadidate_nickname.csv', index=False)
Load the candidate nickname mapping produced by the Python step above.
candidate_nickname <- read_csv("dict/cadidate_nickname.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## keyword = col_character()
## )
colnames(candidate_nickname) <- c("name", "word")
After mapping nicknames back to each candidate's name:
true_name <- tokens %>%
inner_join(candidate_nickname) %>%
mutate(word = name, name=NULL) %>%
group_by(url, word) %>%
summarise(count=sum(count)) %>%
dcast(url ~ word, value.var = "count", fill = 0)
## Joining, by = "word"
# temp <- tokens %>%
# mutate(word = ifelse(word %in% c("守中", "丁爺爺", "丁能兒", "叮手中", "輸不起", "subuki"), "丁守中", word)) %>%
# mutate(word = ifelse(word %in% c("通靈王", "國民阿舅", "上海女婿", "五億探長", "地獄刑事", "文大包租公"), "侯友宜", word))
#
Using the sentiment keyword dictionary provided by this study, weight the positive and negative sentiment words that appear in each post and compute each post's sentiment score.
✓ Must be able to remove stopwords and segment the text
✓ Must be able to apply the terms of the specified dictionary
✓ Must be able to weight the dictionary terms by positive/negative polarity
Because many of the stopwords in the tmcn package are in Simplified Chinese, we instead use the Traditional Chinese stopword list provided by tomlinNTUB, remove some punctuation, and keep only the Chinese terms.
# NTU sentiment dictionary (NTUSD)
data("NTUSD")
# stopwords
stopwords <- scan(file = "dict/stopwords_tc.txt",
what = character(),
sep = "\n",
encoding = "utf-8",
fileEncoding = "utf-8")
stopwords_df <- data.frame(word=stopwords)
head(stopwords_df, 10)
## word
## 1 啊
## 2 阿
## 3 哎
## 4 哎呀
## 5 哎喲
## 6 唉
## 7 我
## 8 我們
## 9 按
## 10 按照
Tidy up the format of the NTUSD sentiment dictionary.
positive <- data.frame(word = NTUSD$positive_cht, sentiment="positive")
negative <- data.frame(word = NTUSD$negative_cht, sentiment="negative")
NTUSD_ch <- rbind(positive, negative)
head(NTUSD_ch)
## word sentiment
## 1 一帆風順 positive
## 2 一帆風順的 positive
## 3 一流 positive
## 4 一致 positive
## 5 一致的 positive
## 6 了不起 positive
LIWC dictionary: we are still looking for a larger LIWC dictionary; this one was provided by the course TA and is small.
We originally planned a comparison, but NTUSD has over 10,000 terms while this one has only about 1,000, so the comparison is postponed for now.
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, ",")[[1]]
negative <- strsplit(n, ",")[[1]]
positive <- data.frame(word = positive, sentiment = "positive", stringsAsFactors = FALSE)
negative <- data.frame(word = negative, sentiment = "negative", stringsAsFactors = FALSE)
LIWC_ch <- rbind(positive, negative)
word_count <- tokens %>%
# remove stopwords
anti_join(stopwords_df) %>%
select(word, count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# try a word cloud for fun
wordcloud2(word_count)
word_count %>%
inner_join(NTUSD_ch) %>%
group_by(sentiment) %>%
top_n(15, wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Give NTUSD positive words a weight of 1 and negative words a weight of -1, then compute the total for each post.
sentiment_weight <- tokens %>%
inner_join(NTUSD_ch) %>%
mutate(sentiment=ifelse(sentiment %in% "positive", 1, -1)) %>%
group_by(url) %>%
summarise(swt = sum(sentiment))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
head(sentiment_weight)
## # A tibble: 6 x 2
## url swt
## <fct> <dbl>
## 1 http://www.facebook.com/109391162488374/posts/1778415255585948 3
## 2 http://www.facebook.com/109391162488374/posts/1779467668814040 -1
## 3 http://www.facebook.com/109391162488374/posts/1781234038637403 3
## 4 http://www.facebook.com/109391162488374/posts/1781941188566688 5
## 5 http://www.facebook.com/109391162488374/posts/1785964631497677 -3
## 6 http://www.facebook.com/109391162488374/posts/1787733981320742 3
Compute each candidate's average per-post sentiment score.
avg_candidate_sentiment <- july %>%
left_join(sentiment_weight) %>%
group_by(topicBy) %>%
summarise(mean_wt = mean(swt, na.rm = T))
## Joining, by = "url"
head(avg_candidate_sentiment)
## # A tibble: 6 x 2
## topicBy mean_wt
## <fct> <dbl>
## 1 丁守中粉絲團 -1.02
## 2 侯友宜 3.46
## 3 姚文智翻台北 1.48
## 4 林佳龍 5.34
## 5 林義豐MarkLin 1.58
## 6 柯文哲 3.48
Visualization
avg_candidate_sentiment %>%
arrange(desc(mean_wt)) %>%
ggplot(aes(x=topicBy, y=mean_wt, fill=topicBy)) +
geom_col(show.legend = F) +
labs(x=NULL) +
theme(axis.text.x = element_text(size=10, angle=30))
senti_changes_date <- july_data %>%
select(url, date, topicBy) %>%
mutate(date = format(date, "%Y-%m-%d")) %>%
inner_join(tokens) %>%
inner_join(NTUSD_ch) %>%
group_by(topicBy, date, sentiment) %>%
summarise(count=sum(count))
## Joining, by = c("url", "topicBy")
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
all_dates <- expand.grid(seq(as.Date(min(july_data$date)), as.Date(max(july_data$date)), by="day"), c("positive", "negative"))
colnames(all_dates) <- c("date", "sentiment")
senti_changes_date <- all_dates %>%
merge(senti_changes_date, by.x=c('date', "sentiment"), by.y=c('date', "sentiment"),
all.x=T, all.y=T) %>%
mutate(count = replace_na(count, 0))
senti_changes_date %>%
ggplot()+
geom_line(aes(x=date, y=count, colour=topicBy)) +
facet_wrap(~ sentiment, nrow=2, scales="free_y") +
scale_x_date(labels = date_format("%m/%d"))
https://www.facebook.com/339483189800311/posts/532367277178567 — in this 高思博 post, the name is counted 37 times.