I. Tasks and Required Skills
1. Word segmentation (segment analysis):
   Split the text of each Facebook post into "words" (the smallest meaningful language units that can be used on their own). A short jiebaR sketch follows this list.
   1. Familiarity with segmentation models, including (but not limited to): maximum probability (MPSegment), hidden Markov model (HMMSegment), mixed model (MixSegment), and index model (QuerySegment)
   2. Familiarity with text-mining packages, including (but not limited to): tm, tidy, jiebaR
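A minimal sketch of how these models map onto jiebaR's worker() (the example sentence is illustrative only and not part of the deliverable):
library(jiebaR)
# The four segmentation models listed above, selected via the `type` argument.
seg_mp    <- worker(type = "mp")     # maximum probability
seg_hmm   <- worker(type = "hmm")    # hidden Markov model
seg_mix   <- worker(type = "mix")    # mixed model (jiebaR's default)
seg_query <- worker(type = "query")  # index / query segmentation
segment("我們一起去投票", seg_mix)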
2. Keyword frequency counts: using the keyword list provided by this study, count the number of times each keyword appears in each post (a small sketch follows the example below).
   → Example:
   Given the keyword 「我們」, the returned file should add a column recording how many times 「我們」 appears in each post.
   For instance, 「我們」 appears 0, 0, and 2 times in the first, second, and third posts respectively.
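A rough sketch of this counting step with stringr; the `posts` data frame and its `content` column are placeholders, not the study's actual file:
library(dplyr)
library(stringr)
# Hypothetical three-post data frame mirroring the example above.
posts <- data.frame(content = c("今天天氣很好", "大家加油,謝謝", "我們一起努力,我們做得到"),
                    stringsAsFactors = FALSE)
# Append a column counting how many times the keyword 「我們」 appears in each post.
posts <- posts %>% mutate(count_we = str_count(content, fixed("我們")))
posts$count_we   # 0 0 2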
3. Keyword presence: using the keyword list provided by this study, mark whether each keyword appears in each post. → Example: given the keyword 「我們」, the returned file should add a column marking whether 「我們」 appears in each post (a sketch follows the example below).
   ✓ For instance, the first and second posts do not contain 「我們」 (marked 0), while the third post does (marked 1).
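Continuing the same hypothetical `posts` data frame, the presence flag is the same idea with str_detect() instead of str_count():
# Mark 1 if 「我們」 appears at least once in the post, otherwise 0.
posts <- posts %>% mutate(has_we = as.integer(str_detect(content, fixed("我們"))))
posts$has_we   # 0 0 1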
4. Sentiment analysis: using the sentiment keyword dictionary provided by this study, weight the positive and negative sentiment words that appear in each post and compute a sentiment score per post (see the sketch after this list).
   ✓ Must be able to remove stopwords and segment the text
   ✓ Must be able to apply the terms of the specified dictionary
   ✓ Must be able to weight the dictionary terms by positive/negative polarity
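A minimal sketch of the weighting idea with a toy dictionary; the NTUSD-based implementation appears later in this write-up, and the objects below (`toy_dict`, `post_tokens`) are placeholders:
library(dplyr)
# Toy sentiment dictionary: +1 for positive words, -1 for negative words.
toy_dict <- data.frame(word = c("加油", "努力", "失望"),
                       weight = c(1, 1, -1),
                       stringsAsFactors = FALSE)
# Assume one row per (post url, word) after segmentation and stopword removal.
post_tokens <- data.frame(url  = c("p1", "p1", "p2"),
                          word = c("加油", "失望", "努力"),
                          stringsAsFactors = FALSE)
post_tokens %>%
  inner_join(toy_dict, by = "word") %>%
  group_by(url) %>%
  summarise(sentiment_score = sum(weight))   # p1: 0, p2: 1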
5. Per-candidate totals of keyword occurrences by category: for each candidate, sum the occurrences of all keywords in each category (see the sketch after this item).
   → Example: the category "1-01 mentions own name" contains the three keywords 「柯文哲」, 「文哲」, and 「阿北」. The returned file should add, after the category block of each of the 14 candidates (柯文哲, 姚文智, etc.), one column for "total keyword occurrences" and one for "number of distinct keywords that occur". For instance, if across all posts on 柯文哲's fan page 「柯文哲」 appears 0 times, 「柯P」 4 times, and 「阿北」 1 time, then "total keyword occurrences" is 5 (= 0 + 4 + 1) and "number of distinct keywords" is 2 (two different keywords appear).
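Sketching the two requested summary columns on a toy per-page keyword-count table (the object and column names here are hypothetical):
library(dplyr)
# Hypothetical totals of the 1-01 keywords across all posts on 柯文哲's fan page.
ko_counts <- data.frame(keyword = c("柯文哲", "柯P", "阿北"),
                        count   = c(0, 4, 1),
                        stringsAsFactors = FALSE)
ko_counts %>%
  summarise(total_mentions = sum(count),       # 5 = 0 + 4 + 1
            keyword_types  = sum(count > 0))   # 2 distinct keywords appear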
library(xlsx)
library(readr)
library(dplyr)
library(jiebaR)
library(tidyr)
library(tidytext)
library(igraph)
library(topicmodels)
library(stringr)
library(ggplot2)
library(wordcloud2)
library(data.table)
library(reshape2)
library(scales)
library(tm)
library(tmcn)
library(lubridate)
# July data
july_data <- read.xlsx("data/07.xlsx", sheetIndex = 1, encoding = "UTF-8")
# August data
# aug_data <- read.xlsx("data/08.xlsx", sheetIndex = 1, encoding = "UTF-8")
Keep only the politicians' own posts and select these columns: url, content, topicBy.
colnames(july_data)[1:5] <- c("note", "dpm", "day", "mid", "id")
july <- july_data[is.na(july_data$note),] %>%
select(url, content, topicBy)
First problem encountered:
Emoji show up as garbled characters and are not part of the analysis, so they have to be filtered out.
Reference: the Unicode code points of all emoji.
Even after removing emoji, some content still contains encoding errors that were already present in the source file, for example:
july_sentences <- july %>%
# remove emoji
mutate(sentence = gsub("(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", content)) %>%
# remove non UTF-8
mutate(sentence = gsub("[^[:alnum:][:blank:]?&/\\-]", "", sentence))
# july_sentences <- strsplit(july_sentences$sentence, "[。!;?!?;]")
#
# july_sentences <- data.frame(
# url = rep(july$url, sapply(july_sentences, length)),
# topicBy = rep(july$topicBy, sapply(july_sentences, length)),
# sentence = unlist(july_sentences)
# )
Filter out tabs and newlines once more, and convert sentence to character format.
july_sentences <- july_sentences %>%
filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
july_sentences$sentence <- as.character(july_sentences$sentence)
Load the first category of keywords: terms the candidates use to refer to themselves.
their_names <- scan(file = "dict/their_names.txt",
what = character(),
sep = "\n",
encoding = "utf-8",
fileEncoding = "utf-8")
head(their_names)
## [1] "我" "我們" "自己" "小弟" "小妹" "偶"
This keyword dictionary was preprocessed with Python beforehand; all of the candidates' nicknames serve as keywords, 323 terms in total.
candidate_name <- scan(file = "dict/candidate_name_new.txt",
what = character(),
sep = "\n",
encoding = "utf-8",
fileEncoding = "utf-8")
head(candidate_name)
## [1] "丁爺爺" "丁能兒" "叮手中" "輸不起" "subuki" "老狐狸"
As a first pass, segment with jieba and add the dictionaries above as user-defined words.
jieba_tokenizer <- worker()
new_user_word(jieba_tokenizer, c(their_names, candidate_name))
## [1] TRUE
The Chinese tokenizer function:
# Tokenize each element of a character vector with the jieba worker defined above.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # tokens <- tokens[nchar(tokens) > 1]  # optional: drop single-character tokens
    return(tokens)
  })
}
Tokenize sentence, filter English letters and digits out of the tokens, and compute term frequencies per post.
tokens <- july_sentences %>%
unnest_tokens(word, sentence, token = chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(url, word) %>%
rename(count = n)
Convert the loaded dictionaries into data frames and combine them into a single dictionary.
their_names_df <- data.frame(word=their_names)
candidate_name_df <- data.frame(word=candidate_name)
all_keys_df <- rbind(their_names_df, candidate_name_df)
Keep only the keywords our study needs, and spread the values into one column per keyword.
dcast_tokens <- tokens %>%
inner_join(all_keys_df) %>%
dcast(url ~ word, value.var = "count", fill = 0)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
## Aggregation function missing: defaulting to length
non_cols <- all_keys_df %>%
anti_join(tokens) %>%
dcast(. ~ word) %>%
select(-1)
## Joining, by = "word"
## Warning: Column `word` joining factor and character vector, coercing into
## character vector
## Using word as value column: use value.var to override.
## Aggregation function missing: defaulting to length
# set all of them to 0
non_cols[1:nrow(non_cols),1:ncol(non_cols)] <- 0
# merge back into the original tokens
dcast_tokens <- cbind(dcast_tokens, non_cols)
output_count <- july %>%
left_join(dcast_tokens, by=c("url"))
output_count[is.na(output_count)] <- 0
75 posts contain none of the keywords; fill their NAs with 0.
output_exist <- tokens %>%
mutate(exists = ifelse(word %in% all_keys_df$word, 1, 0)) %>%
filter(exists == 1) %>%
dcast(url ~ word, value.var = "exists", fill = 0)
temp <- july %>%
left_join(output_exist) %>%
replace(is.na(.), 0)
## Joining, by = "url"
A total of 74 posts contain none of these keywords, so the left join yields NA; fill those NAs with 0.
# write.csv(output_count, file="output/output_count_07.csv", fileEncoding = "UTF-8")
write.csv(output_exist, file="output/output_exist_07.csv", fileEncoding = "UTF-8")
url_person <- july %>% select(url, topicBy)
tokens <- tokens %>% left_join(url_person) %>% filter(!is.na(word))
## Joining, by = "url"
tokens_count <- tokens %>%
# filter(nchar(.$word) > 1) %>%
group_by(word, topicBy) %>%
summarise(sum = n()) %>%
filter(sum > 10) %>%
arrange(topicBy, desc(sum))
tokens_count %>%
filter(word=="我") %>%
ggplot(aes(word, sum, fill=topicBy)) +
geom_col() +
facet_grid(topicBy~.) +
xlab(NULL) +
coord_flip()
tokens_count %>%
filter(word=="我們") %>%
ggplot(aes(word, sum, fill=topicBy)) +
geom_col() +
facet_grid(topicBy~.) +
xlab(NULL) +
coord_flip()
import pandas as pd
import numpy as np

keywords = pd.read_excel("/Users/yuzhe/Downloads/資工背景工讀生_關鍵字整理列表(初步建構中)__20191111.xlsx",
                         sheet_name="1-02、1-04、1-06 主要政治人物")
keywords = keywords[['子類目', '關鍵字', '(此部分為綽號須注意) 負面關鍵字', '(此部分為綽號須注意) 中性 or 正面關鍵字']]
keywords.columns = ['name', 'first_name', 'negative_nick', 'positive_nick']
# Concatenate the three keyword columns into one " or "-separated string per candidate
keywords['keyword'] = (keywords['first_name'].astype(str) + ' or '
                       + keywords['negative_nick'].astype(str) + ' or '
                       + keywords['positive_nick'].astype(str))
# Split the combined string so each nickname becomes its own (name, keyword) row
name_nick = pd.DataFrame({'name': keywords['name'],
                          'keyword': keywords['keyword'].str.split(' or ')}).explode('keyword')
# Drop the "nan" strings produced by empty cells, then write out the mapping
name_nick.replace('nan', np.NaN, inplace=True)
name_nick.dropna(how='any', inplace=True)
name_nick.to_csv('cadidate_nickname.csv', index=False)
Load the candidate nickname mapping produced by the Python step above.
candidate_nickname <- read_csv("dict/cadidate_nickname.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## keyword = col_character()
## )
colnames(candidate_nickname) <- c("name", "word")
After mapping nicknames back to each candidate's name:
true_name <- tokens %>%
inner_join(candidate_nickname) %>%
mutate(word = name, name=NULL) %>%
group_by(url, word) %>%
summarise(count=sum(count)) %>%
dcast(url ~ word, value.var = "count", fill = 0)
## Joining, by = "word"
# temp <- tokens %>%
# mutate(word = ifelse(word %in% c("守中", "丁爺爺", "丁能兒", "叮手中", "輸不起", "subuki"), "丁守中", word)) %>%
# mutate(word = ifelse(word %in% c("通靈王", "國民阿舅", "上海女婿", "五億探長", "地獄刑事", "文大包租公"), "侯友宜", word))
#
Using the sentiment keyword dictionary provided by this study, weight the positive and negative sentiment words that appear in each post and compute each post's sentiment score.
✓ Must be able to remove stopwords and segment the text
✓ Must be able to apply the terms of the specified dictionary
✓ Must be able to weight the dictionary terms by positive/negative polarity
Because many of the stopwords in the tmcn package are in Simplified Chinese, we instead use the Traditional Chinese stopword list provided by tomlinNTUB, remove some punctuation, and keep only the Chinese terms.
# NTU sentiment dictionary (NTUSD)
data("NTUSD")
# stopwords
stopwords <- scan(file = "dict/stopwords_tc.txt",
what = character(),
sep = "\n",
encoding = "utf-8",
fileEncoding = "utf-8")
stopwords_df <- data.frame(word=stopwords)
head(stopwords_df, 10)
## word
## 1 啊
## 2 阿
## 3 哎
## 4 哎呀
## 5 哎喲
## 6 唉
## 7 我
## 8 我們
## 9 按
## 10 按照
Tidy up the format of the NTUSD sentiment dictionary.
positive <- data.frame(word = NTUSD$positive_cht, sentiment="positive")
negative <- data.frame(word = NTUSD$negative_cht, sentiment="negative")
NTUSD_ch <- rbind(positive, negative)
head(NTUSD_ch)
## word sentiment
## 1 一帆風順 positive
## 2 一帆風順的 positive
## 3 一流 positive
## 4 一致 positive
## 5 一致的 positive
## 6 了不起 positive
LIWC dictionary: we are still looking for a larger LIWC dictionary; this one was provided by the course TA and is small.
We originally planned a comparison, but NTUSD has over 10,000 terms while this one has only about 1,000, so the comparison is postponed for now.
p <- read_file("dict/liwc/positive.txt")
n <- read_file("dict/liwc/negative.txt")
positive <- strsplit(p, ",")[[1]]
negative <- strsplit(n, ",")[[1]]
positive <- data.frame(word = positive, sentiment = "positive", stringsAsFactors = FALSE)
negative <- data.frame(word = negative, sentiment = "negative", stringsAsFactors = FALSE)
LIWC_ch <- rbind(positive, negative)
word_count <- tokens %>%
# remove stopwords
anti_join(stopwords_df) %>%
select(word, count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
# try a word cloud for fun
wordcloud2(word_count)
word_count %>%
inner_join(NTUSD_ch) %>%
group_by(sentiment) %>%
top_n(15, wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
Give NTUSD positive words a weight of 1 and negative words a weight of -1, then compute the total for each post.
sentiment_weight <- tokens %>%
inner_join(NTUSD_ch) %>%
mutate(sentiment=ifelse(sentiment %in% "positive", 1, -1)) %>%
group_by(url) %>%
summarise(swt = sum(sentiment))
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
head(sentiment_weight)
## # A tibble: 6 x 2
## url swt
## <fct> <dbl>
## 1 http://www.facebook.com/109391162488374/posts/1778415255585948 3
## 2 http://www.facebook.com/109391162488374/posts/1779467668814040 -1
## 3 http://www.facebook.com/109391162488374/posts/1781234038637403 3
## 4 http://www.facebook.com/109391162488374/posts/1781941188566688 5
## 5 http://www.facebook.com/109391162488374/posts/1785964631497677 -3
## 6 http://www.facebook.com/109391162488374/posts/1787733981320742 3
Compute each candidate's average per-post sentiment score.
avg_candidate_sentiment <- july %>%
left_join(sentiment_weight) %>%
group_by(topicBy) %>%
summarise(mean_wt = mean(swt, na.rm = T))
## Joining, by = "url"
head(avg_candidate_sentiment)
## # A tibble: 6 x 2
## topicBy mean_wt
## <fct> <dbl>
## 1 丁守中粉絲團 -1.02
## 2 侯友宜 3.46
## 3 姚文智翻台北 1.48
## 4 林佳龍 5.34
## 5 林義豐MarkLin 1.58
## 6 柯文哲 3.48
Visualization
avg_candidate_sentiment %>%
arrange(desc(mean_wt)) %>%
ggplot(aes(x=topicBy, y=mean_wt, fill=topicBy)) +
geom_col(show.legend = F) +
labs(x=NULL) +
theme(axis.text.x = element_text(size=10, angle=30))
senti_changes_date <- july_data %>%
select(url, date, topicBy) %>%
mutate(date = format(date, "%Y-%m-%d")) %>%
inner_join(tokens) %>%
inner_join(NTUSD_ch) %>%
group_by(topicBy, date, sentiment) %>%
summarise(count=sum(count))
## Joining, by = c("url", "topicBy")
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
all_dates <- expand.grid(seq(as.Date(min(july_data$date)), as.Date(max(july_data$date)), by="day"), c("positive", "negative"))
colnames(all_dates) <- c("date", "sentiment")
senti_changes_date <- all_dates %>%
merge(senti_changes_date, by.x=c('date', "sentiment"), by.y=c('date', "sentiment"),
all.x=T, all.y=T) %>%
mutate(count = replace_na(count, 0))
senti_changes_date %>%
ggplot()+
geom_line(aes(x=date, y=count, colour=topicBy)) +
facet_wrap(~ sentiment, nrow=2, scales="free_y") +
scale_x_date(labels = date_format("%m/%d"))
https://www.facebook.com/339483189800311/posts/532367277178567 — in this 高思博 post, the name is counted 37 times.