library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(lubridate)
library(janeaustenr)
library(tidyr)
library(jiebaR)
library(widyr)
library(NLP)
library(ggraph)
library(igraph)

### Load the previously collected data
load(file = "C:/Users/ASUS/Desktop/group2_project/ah.RData")
df has 90 columns, but here we only use a few of them (created_at, status_id, text, and source).
created_at is already a date/time column, so we can apply min and max directly to find the earliest and latest tweet dates.
Note: rtweet can only retrieve tweets from roughly the last 10 days.
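The collection step itself is not shown in this report. As a rough sketch only, a dataset like df could have been pulled with rtweet's search_tweets(); the query string and parameters below are assumptions for illustration, not the actual call used:

# Hypothetical collection sketch; requires a configured Twitter API token.
# search_tweets() returns ~90 columns, including created_at, status_id, text and source.
df <- search_tweets(
  q = "asian hate",        # assumed search query
  n = 18000,               # upper bound on number of tweets
  include_rts = FALSE,     # assumption: drop retweets
  retryonratelimit = TRUE
)
save(df, file = "ah.RData")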
nrow(df)## [1] 9742
min(df$created_at)## [1] "2021-04-24 04:22:43 UTC"
max(df$created_at)## [1] "2021-04-28 16:26:35 UTC"
data <- df %>%
dplyr::select(created_at, status_id,text) %>%
distinct()
data$date = as.Date(data$created_at)
article_count_by_date <- data %>%
group_by(date) %>%
summarise(count = n())
head(article_count_by_date, 20)## # A tibble: 5 x 2
## date count
## <date> <int>
## 1 2021-04-24 2078
## 2 2021-04-25 2198
## 3 2021-04-26 2533
## 4 2021-04-27 1878
## 5 2021-04-28 1055
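article_count_by_date is only printed above; as an optional visual check (not part of the original analysis), the daily volume could be plotted with the already-loaded ggplot2 and scales packages:

# Daily tweet volume; purely illustrative, assuming article_count_by_date as computed above.
article_count_by_date %>%
  ggplot(aes(x = date, y = count)) +
  geom_col() +
  scale_y_continuous(labels = scales::comma) +
  labs(x = "Date", y = "Number of tweets")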
tidy_data <- data %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) ## Joining, by = "word"
tidy_data## # A tibble: 117,519 x 4
## created_at status_id date word
## <dttm> <chr> <date> <chr>
## 1 2021-04-28 16:26:35 1387443134104182786 2021-04-28 talk
## 2 2021-04-28 16:26:35 1387443134104182786 2021-04-28 hate
## 3 2021-04-28 16:26:35 1387443134104182786 2021-04-28 crimes
## 4 2021-04-28 16:26:35 1387443134104182786 2021-04-28 asians
## 5 2021-04-28 16:26:35 1387443134104182786 2021-04-28 lot
## 6 2021-04-28 16:26:35 1387443134104182786 2021-04-28 tbd
## 7 2021-04-28 16:26:35 1387443134104182786 2021-04-28 asian
## 8 2021-04-28 16:26:35 1387443134104182786 2021-04-28 people
## 9 2021-04-28 16:26:35 1387443134104182786 2021-04-28 excluding
## 10 2021-04-28 16:26:35 1387443134104182786 2021-04-28 hate
## # ... with 117,509 more rows
tidy_data %>%
count(word, sort = TRUE) %>%
filter(n > 600) %>%
top_n(10, n) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL)
tidy_data %>%
count(word, sort = TRUE) %>%
filter(!word %in% c('asian','hate')) %>%
filter(n > 50) %>%
wordcloud2()
# Excluding tokens tagged "O" (Other), how many distinct words received a named-entity tag?
length(unique(tokens$word[tokens$ner != "O"])) ## [1] 3598
# Convert to lowercase
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
dependencies = coreNLP_dependency_parser(obj)
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
Sentiment scores range from 0 (most negative) to 4 (most positive):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
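The parsed output below carries both a numeric sentimentValue and a text label. As a minimal sketch (not a step in the original analysis), the five levels could be collapsed into three coarse buckets following the legend above:

# Hypothetical recoding of the 0-4 CoreNLP scale into three coarse buckets.
sentiment_bucket <- function(score) {
  dplyr::case_when(
    score <= 1 ~ "negative",   # 0 = very negative, 1 = negative
    score == 2 ~ "neutral",
    score >= 3 ~ "positive"    # 3 = positive, 4 = very positive
  )
}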
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
# Look at how tweets are distributed across the sentiment categories
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
sentiment$sentiment %>% table()## .
## Negative Neutral Positive Verynegative Verypositive
## 6487 1882 806 498 6
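Most tweets are classified as negative. As a quick follow-up (not in the original output), the counts above can be turned into shares:

# Share of each sentiment label, using the same table as above.
sentiment$sentiment %>% table() %>% prop.table() %>% round(3)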
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
### Word usage in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count > 5 & count < 400)
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count > 10 & count < 400)
ah_words <- tokens %>%
count(status_id, lower_word, sort = TRUE)
total_words <- ah_words %>%
group_by(status_id) %>%
summarize(total = sum(n))
ah_words <- left_join(ah_words, total_words)## Joining, by = "status_id"
ah_words_tf_idf <- ah_words %>%
bind_tf_idf(lower_word, status_id, n)
head(ah_words_tf_idf,20)## status_id lower_word n total tf idf tf_idf
## 1 1387389984202711042 asian 14 56 0.2500000 0.3035458 0.07588644
## 2 1387389984202711042 hate 14 56 0.2500000 0.1811857 0.04529643
## 3 1387389984202711042 i 14 56 0.2500000 1.6866263 0.42165657
## 4 1387389984202711042 women 14 56 0.2500000 4.3985904 1.09964759
## 5 1386690850051133449 not 9 44 0.2045455 2.1837809 0.44668246
## 6 1386923628374200323 black 9 45 0.2000000 1.9228291 0.38456581
## 7 1386108746976354308 and 8 45 0.1777778 1.0435396 0.18551815
## 8 1387151033554685953 the 8 50 0.1600000 0.6887144 0.11019431
## 9 1387275893463080961 i 8 53 0.1509434 1.6866263 0.25458510
## 10 1385999224249090054 you 7 52 0.1346154 1.9529605 0.26289852
## 11 1386140608092418059 you 7 46 0.1521739 1.9529605 0.29718964
## 12 1386289314548486149 the 7 48 0.1458333 0.6887144 0.10043752
## 13 1386418725885136897 the 7 43 0.1627907 0.6887144 0.11211630
## 14 1386449748211552258 we 7 52 0.1346154 2.4196194 0.32571799
## 15 1386461474676264961 the 7 51 0.1372549 0.6887144 0.09452943
## 16 1386523862343593986 a 7 44 0.1590909 1.1857833 0.18864735
## 17 1386703648391041028 the 7 44 0.1590909 0.6887144 0.10956820
## 18 1386716167188975616 the 7 52 0.1346154 0.6887144 0.09271156
## 19 1386724743265591297 on 7 50 0.1400000 1.7906236 0.25068731
## 20 1386740367442300928 the 7 55 0.1272727 0.6887144 0.08765456
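For reference, bind_tf_idf computes tf as a term's share of tokens within a tweet and idf as the natural log of the inverse document frequency across tweets. A minimal sanity check recomputing the three columns by hand from the same ah_words data (not a step in the original analysis):

# tf = n / total; idf = ln(number of tweets / number of tweets containing the term); tf_idf = tf * idf
ah_words_manual <- ah_words %>%
  group_by(lower_word) %>%
  mutate(docs_with_term = n_distinct(status_id)) %>%
  ungroup() %>%
  mutate(tf = n / total,
         idf = log(n_distinct(status_id) / docs_with_term),
         tf_idf = tf * idf)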
+ Stop asian hate, SHUT UP YOUR MOUTH, DON’T MAKE FAKE NEWS ABOUT COVID-19
ah_words_tf_idf %>%
group_by(status_id) %>%
slice_max(tf_idf, n=5)## # A tibble: 48,438 x 7
## # Groups: status_id [9,679]
## status_id lower_word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 1385811415051636737 counteract 1 48 0.0208 9.18 0.191
## 2 1385811415051636737 gained 1 48 0.0208 9.18 0.191
## 3 1385811415051636737 popularity.it 1 48 0.0208 9.18 0.191
## 4 1385811415051636737 floyds 1 48 0.0208 8.48 0.177
## 5 1385811415051636737 matter 2 48 0.0417 4.17 0.174
## 6 1385811416863629314 muchness! 1 15 0.0667 9.18 0.612
## 7 1385811416863629314 punish 1 15 0.0667 7.57 0.505
## 8 1385811416863629314 whiteness 1 15 0.0667 7.39 0.492
## 9 1385811416863629314 yup 1 15 0.0667 7.10 0.473
## 10 1385811416863629314 who's 1 15 0.0667 6.69 0.446
## # ... with 48,428 more rows
ah_words_tf_idf %>%
group_by(status_id) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(lower_word, sort=TRUE)## # A tibble: 19,199 x 2
## lower_word n
## <chr> <int>
## 1 asian 1446
## 2 hate 1411
## 3 stop 573
## 4 antiasian 400
## 5 news 365
## 6 crime 351
## 7 bill 324
## 8 fake 321
## 9 the 309
## 10 up 309
## # ... with 19,189 more rows
# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
jieba_tokenizer = worker()
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
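As a quick check of this custom tokenizer interface (the sentence below is made up for illustration), jieba_bigram returns, for each input string, a character vector of space-joined bigrams, which is the shape unnest_tokens expects from a custom token function:

# Illustrative only; with an English sentence the segments should simply be the words,
# so the result is expected to look like c("stop asian", "asian hate", "hate crimes", "crimes now").
jieba_bigram("stop asian hate crimes now")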
df_gram <- subset(df,select = c(status_id,text,source))
df_bigram <- df_gram %>%
unnest_tokens(bigram, text, token = jieba_bigram)
df_bigram %>%
filter(!str_detect(bigram, regex("[0-9]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")## # A tibble: 88,351 x 2
## bigram n
## <chr> <int>
## 1 asian hate 4580
## 2 hate crimes 1893
## 3 hate crime 1394
## 4 antiasian hate 1283
## 5 stop asian 1161
## 6 in the 632
## 7 the asian 591
## 8 asian americans 534
## 9 of the 520
## 10 don t 509
## # ... with 88,341 more rows
head(df_bigram,20)## # A tibble: 20 x 3
## status_id source bigram
## <chr> <chr> <chr>
## 1 1387443134104182786 Twitter for iPhone we talk
## 2 1387443134104182786 Twitter for iPhone talk about
## 3 1387443134104182786 Twitter for iPhone about hate
## 4 1387443134104182786 Twitter for iPhone hate crimes
## 5 1387443134104182786 Twitter for iPhone crimes against
## 6 1387443134104182786 Twitter for iPhone against asians
## 7 1387443134104182786 Twitter for iPhone asians a
## 8 1387443134104182786 Twitter for iPhone a lot
## 9 1387443134104182786 Twitter for iPhone lot and
## 10 1387443134104182786 Twitter for iPhone and tbd
## 11 1387443134104182786 Twitter for iPhone tbd if
## 12 1387443134104182786 Twitter for iPhone if any
## 13 1387443134104182786 Twitter for iPhone any of
## 14 1387443134104182786 Twitter for iPhone of these
## 15 1387443134104182786 Twitter for iPhone these end
## 16 1387443134104182786 Twitter for iPhone end up
## 17 1387443134104182786 Twitter for iPhone up being
## 18 1387443134104182786 Twitter for iPhone being that
## 19 1387443134104182786 Twitter for iPhone that but
## 20 1387443134104182786 Twitter for iPhone but asian
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
df_trigram <- df_gram %>%
unnest_tokens(ngrams, text, token = jieba_trigram)
df_trigram %>%
filter(!str_detect(ngrams, regex("[0-9]"))) %>%
count(ngrams, sort = TRUE)## # A tibble: 149,906 x 2
## ngrams n
## <chr> <int>
## 1 stop asian hate 1116
## 2 asian hate crimes 486
## 3 hate crime bill 474
## 4 antiasian hate crimes 473
## 5 asian hate crime 461
## 6 anti asian hate 434
## 7 make fake news 319
## 8 t make fake 319
## 9 fake news about 316
## 10 the asian hate 297
## # ... with 149,896 more rows
df_trigram %>%
filter(!str_detect(ngrams, regex("[0-9]"))) %>%
separate(ngrams, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
count(word1, word2, word3, sort = TRUE) %>%
unite_("ngrams", c("word1", "word2", "word3"), sep=" ")## # A tibble: 149,906 x 2
## ngrams n
## <chr> <int>
## 1 stop asian hate 1116
## 2 asian hate crimes 486
## 3 hate crime bill 474
## 4 antiasian hate crimes 473
## 5 asian hate crime 461
## 6 anti asian hate 434
## 7 make fake news 319
## 8 t make fake 319
## 9 fake news about 316
## 10 the asian hate 297
## # ... with 149,896 more rows
head(df_trigram,20)## # A tibble: 20 x 3
## status_id source ngrams
## <chr> <chr> <chr>
## 1 1387443134104182786 Twitter for iPhone we talk about
## 2 1387443134104182786 Twitter for iPhone talk about hate
## 3 1387443134104182786 Twitter for iPhone about hate crimes
## 4 1387443134104182786 Twitter for iPhone hate crimes against
## 5 1387443134104182786 Twitter for iPhone crimes against asians
## 6 1387443134104182786 Twitter for iPhone against asians a
## 7 1387443134104182786 Twitter for iPhone asians a lot
## 8 1387443134104182786 Twitter for iPhone a lot and
## 9 1387443134104182786 Twitter for iPhone lot and tbd
## 10 1387443134104182786 Twitter for iPhone and tbd if
## 11 1387443134104182786 Twitter for iPhone tbd if any
## 12 1387443134104182786 Twitter for iPhone if any of
## 13 1387443134104182786 Twitter for iPhone any of these
## 14 1387443134104182786 Twitter for iPhone of these end
## 15 1387443134104182786 Twitter for iPhone these end up
## 16 1387443134104182786 Twitter for iPhone end up being
## 17 1387443134104182786 Twitter for iPhone up being that
## 18 1387443134104182786 Twitter for iPhone being that but
## 19 1387443134104182786 Twitter for iPhone that but asian
## 20 1387443134104182786 Twitter for iPhone but asian people
# load asian_lexicon
asian_lexicon <- scan(file = "./dict/asian_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8',quiet = T)
# A custom, self-built lexicon of Asian-hate-related phrases
asian_lexicon## [1] "asian hate" "hate crime" "hate crimes" "antiasian hate"
## [5] "black people" "hate crime bill" "fake news"
jieba_tokenizer = worker()
# Register the Asian-related lexicon so re-tokenization keeps these phrases intact
new_user_word(jieba_tokenizer, c(asian_lexicon))## [1] TRUE
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!tokens %in% stop_words]
# drop tokens that are only one character long
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# The earlier tokenization did not use the newly added lexicon, so we re-tokenize and count how often each word appears in each tweet
df_words <- ah_sentences %>%
unnest_tokens(word, text, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(status_id, word, sort = TRUE)
head(df_words,20)## status_id word n
## 1 1387389984202711042 asian 14
## 2 1387389984202711042 hate 14
## 3 1387389984202711042 women 14
## 4 1386690850051133449 not 11
## 5 1386923628374200323 men 10
## 6 1386923628374200323 black 9
## 7 1386108746976354308 and 8
## 8 1387091619023712259 the 8
## 9 1387098777895378944 the 8
## 10 1387151033554685953 the 8
## 11 1387287630291873793 men 8
## 12 1385999224249090054 you 7
## 13 1386140608092418059 you 7
## 14 1386289314548486149 the 7
## 15 1386418725885136897 the 7
## 16 1386449748211552258 we 7
## 17 1386461474676264961 the 7
## 18 1386703648391041028 the 7
## 19 1386716167188975616 the 7
## 20 1386724743265591297 on 7
# Filter out the two query keywords "asian" and "hate"
df_pairs <- df_words %>%
pairwise_count(word, status_id, sort = TRUE)%>%
filter(!item1 %in% c("asian", "hate") & !item2 %in% c("asian", "hate"))
head(df_pairs,20)
word_cors <- df_words %>%
group_by(word) %>%
filter(n() >= 30) %>%
pairwise_cor(word, status_id, sort = TRUE)
head(word_cors,20)## # A tibble: 20 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 hellbent forcasting 1
## 2 weaponizing forcasting 1
## 3 forcasting hellbent 1
## 4 weaponizing hellbent 1
## 5 forcasting weaponizing 1
## 6 hellbent weaponizing 1
## 7 hogans maryland 0.978
## 8 hogans doubled 0.978
## 9 maryland hogans 0.978
## 10 doubled hogans 0.978
## 11 sinners humanity 0.977
## 12 humanity sinners 0.977
## 13 sparks headkicking 0.972
## 14 headkicking sparks 0.972
## 15 sold profit 0.970
## 16 profit sold 0.970
## 17 retweet forcasting 0.964
## 18 retweet hellbent 0.964
## 19 forcasting retweet 0.964
## 20 hellbent retweet 0.964
word_cors %>%
filter(item1 %in% c("asian", "hate")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))## Selecting by correlation
set.seed(2020)
word_cors %>%
filter(correlation > 0.9) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light")