Installing packages
packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
(1). Twitter API setup: fetching tweets with rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
# Consumer Keys: identify who you are
# Authentication Tokens: the authorization granted to you

(2). Setting keywords and fetching tweets
# search keywords
key = c("#EverGiven")
context = "Suez"
q = paste(c(key,context),collapse=" AND ")
# query string: "#EverGiven AND Suez"
# Requiring "Suez" to co-occur avoids matching #EverGiven tweets that are not about the Suez Canal
# fetch 5000 tweets, excluding retweets
tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)

(3). Cleaning the tweet text
## helper function for text cleaning
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") # normalize the text encoding
txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (@ or # followed by letters, digits, underscores)
txt = gsub("(http|https)://.*", "", txt) # remove URLs (. = any character, * = zero or more times)
txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
txt = gsub("\\n"," ",txt) # replace line breaks with spaces
txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+ = one or more)
txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
txt = gsub("&.*;","",txt) # remove HTML character entities
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ? ! . ' (drops emoji, etc.)
txt }
tweets$text = clean(tweets$text) # apply the cleaning to the text column
df = data.frame()
df = rbind(df,tweets) # collect the tweets into a data frame
df = df[!duplicated(df[,"status_id"]),] # drop duplicate tweets

## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 441208… 13787921… 2021-04-04 19:30:31 BorjeMelin Simo… Twitt…
## 2 441208… 13763228… 2021-03-28 23:58:39 BorjeMelin Traf… Twitt…
## 3 441208… 13787731… 2021-04-04 18:15:10 BorjeMelin To b… Twitt…
## 4 441208… 13783128… 2021-04-03 11:45:55 BorjeMelin Upda… Twitt…
## 5 322805… 13787823… 2021-04-04 18:51:55 Deveshjais… SEUZ… Twitt…
## 6 322805… 13787823… 2021-04-04 18:51:41 Deveshjais… SEUZ… Twitt…
## # … with 84 more variables: display_text_width <dbl>, reply_to_status_id <chr>,
## # reply_to_user_id <chr>, reply_to_screen_name <chr>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## # quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## # urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## # media_url <list>, media_t.co <list>, media_expanded_url <list>,
## # media_type <list>, ext_media_url <list>, ext_media_t.co <list>,
## # ext_media_expanded_url <list>, ext_media_type <chr>,
## # mentions_user_id <list>, mentions_screen_name <list>, lang <chr>,
## # quoted_status_id <chr>, quoted_text <chr>, quoted_created_at <dttm>,
## # quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns, but we will only use a few of them here:
created_at is already a date-type column, so we can use min and max directly to find the earliest and latest tweet times.
Note: rtweet can only retrieve tweets from roughly the last 10 days.
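A minimal sketch of the calls that likely produced the three outputs below (assuming df is the de-duplicated data frame built above):

nrow(df)             # number of tweets after removing duplicates
min(df$created_at)   # earliest tweet time
max(df$created_at)   # latest tweet time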
## [1] 4958
## [1] "2021-03-27 18:35:51 UTC"
## [1] "2021-04-04 19:30:31 UTC"
(1). Setting up the API call
Server side:
+ Start a CoreNLP server from a terminal first.
+ From the CoreNLP installation directory, run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# Build the CoreNLP API URL: turn the local host and port into a URL the CoreNLP service understands
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ #斷詞依據不是空格
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
# specify where the CoreNLP service is running
host = "127.0.0.1"
generate_API_url(host)

# call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}

# run every document through the CoreNLP service
coreNLP <- function(data,host){
# Send each document to CoreNLP in turn; each document's result comes back as JSON
# and is stored as an R object
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}

(2). Data-wrangling functions
Extract the tokenization results from the returned objects and output them in tidy data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the sentences; only the first one is used
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}

Extract the word-dependency relations from the returned CoreNLP objects and output them in tidy data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the sentences; only the first one is used
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}

Extract the sentence-level sentiment from the returned CoreNLP objects and output it in tidy data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the sentences; only the first one is used
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}

# display the dependency parse result as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}

Getting the objects returned by CoreNLP
Don't run this part right away: it takes about half an hour (and may crash if you only have 4 GB of RAM).
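A minimal sketch of how the helper functions above are probably chained together; the variable names obj, tokens, dependency and sentiment are assumptions based on the output shown later:

# send every cleaned tweet through the local CoreNLP server (this is the slow step)
obj <- coreNLP(df[, c("status_id", "text")], host)

# flatten the returned objects into tidy tables
tokens     <- coreNLP_tokens_parser(obj)      # word / lemma / POS / NER per token
dependency <- coreNLP_dependency_parser(obj)  # dependency relations per sentence
sentiment  <- coreNLP_sentiment_parser(obj)   # sentence-level sentiment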
(1). Tokenization, lemmatization, POS tagging, and NER
## status_id word lemma pos ner
## 1 1378792114037870600 Simon Simon NNP PERSON
## 2 1378792114037870600 Parkes Parkes NNP PERSON
## 3 1378792114037870600 Update Update NNP O
## 4 1378792114037870600 With with IN O
## 5 1378792114037870600 great great JJ O
## 6 1378792114037870600 regret regret NN O
## 7 1378792114037870600 I I PRP O
## 8 1378792114037870600 must must MD O
## 9 1378792114037870600 announce announce VB O
## 10 1378792114037870600 that that IN O
## 11 1378792114037870600 children child NNS O
## 12 1378792114037870600 were be VBD O
## 13 1378792114037870600 being be VBG O
## 14 1378792114037870600 transported transport VBN O
## 15 1378792114037870600 in in IN O
## 16 1378792114037870600 cargo cargo NN O
## 17 1378792114037870600 containers. containers. NN O
## 18 1378792114037870600 The the DT O
## 19 1378792114037870600 operation operation NN O
## 20 1378792114037870600 in in IN O
(2). Named entity recognition (NER)
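A minimal sketch of how the entity types below can be listed (tokens is the tidy table produced by coreNLP_tokens_parser; the single count shown after the list comes from a filter that is not reproduced here):

unique(as.character(tokens$ner))   # all NER categories observed in the data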
## [1] "PERSON" "O" "CITY"
## [4] "LOCATION" "NUMBER" "ORDINAL"
## [7] "ORGANIZATION" "TITLE" "MISC"
## [10] "DATE" "DURATION" "MONEY"
## [13] "PERCENT" "COUNTRY" "TIME"
## [16] "NATIONALITY" "SET" "STATE_OR_PROVINCE"
## [19] "CAUSE_OF_DEATH" "CRIMINAL_CHARGE" "IDEOLOGY"
## [22] "RELIGION"
## [1] 1880
(3). Converting to lowercase
Letter case also affects CoreNLP's NER decisions, so the tweet text was passed to the annotators without case normalization. After running the annotators, however, we want accurate word counts, so we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma. Lowercasing folds different capitalizations of the same term (e.g. Evergiven and evergiven) into one form before counting frequencies.
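A minimal sketch of how those two columns can be added, assuming tokens is the output of coreNLP_tokens_parser:

tokens <- tokens %>%
  mutate(lower_word  = tolower(word),    # lowercased surface form
         lower_lemma = tolower(lemma))   # lowercased lemma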
Using the NER tags from CoreNLP, we can pull out the countries (COUNTRY) mentioned in the Twitter discussion of the Ever Given blocking the Suez Canal, to get a first look at the main countries involved in the issue.
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()

Similarly, the ORGANIZATION entities from CoreNLP's NER show which companies and organizations come up in the Twitter discussion of the Ever Given blocking the Suez Canal, giving a first look at the main organizations involved.
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()

Finally, the PERSON entities show which people come up in the Twitter discussion of the Ever Given blocking the Suez Canal, giving a first look at the main people involved.
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()

## status_id dep governor governorGloss dependent
## 1 1378792114037870600 ROOT 0 ROOT 26
## 2 1378792114037870600 compound 3 Update 1
## 3 1378792114037870600 compound 3 Update 2
## 4 1378792114037870600 obl:tmod 26 success 3
## 5 1378792114037870600 case 6 regret 4
## 6 1378792114037870600 amod 6 regret 5
## 7 1378792114037870600 obl 9 announce 6
## 8 1378792114037870600 nsubj 9 announce 7
## 9 1378792114037870600 aux 9 announce 8
## 10 1378792114037870600 acl:relcl 3 Update 9
## 11 1378792114037870600 mark 14 transported 10
## 12 1378792114037870600 nsubj:pass 14 transported 11
## 13 1378792114037870600 aux 14 transported 12
## 14 1378792114037870600 aux:pass 14 transported 13
## 15 1378792114037870600 ccomp 9 announce 14
## 16 1378792114037870600 case 17 containers. 15
## 17 1378792114037870600 compound 17 containers. 16
## 18 1378792114037870600 obl 14 transported 17
## 19 1378792114037870600 det 19 operation 18
## 20 1378792114037870600 nsubj 26 success 19
## dependentGloss
## 1 success
## 2 Simon
## 3 Parkes
## 4 Update
## 5 With
## 6 great
## 7 regret
## 8 I
## 9 must
## 10 announce
## 11 that
## 12 children
## 13 were
## 14 being
## 15 transported
## 16 in
## 17 cargo
## 18 containers.
## 19 The
## 20 operation
The CoreNLP sentiment score ranges from 0 (most negative) to 4 (most positive); see the sketch after this list for how the table below can be inspected:
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
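A minimal sketch of how the sentiment table below can be inspected, assuming sentiment is the output of coreNLP_sentiment_parser with one row per tweet:

head(sentiment, 20)                        # per-tweet sentiment label and value
unique(as.character(sentiment$sentiment))  # which labels occur
table(sentiment$sentiment)                 # how many tweets fall into each class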
## status_id
## 1 1378792114037870600
## 2 1376322876274532359
## 3 1378773153304956930
## 4 1378312805724602368
## 5 1378782399681589249
## 6 1378782341825372160
## 7 1378780519807193093
## 8 1376524772608196609
## 9 1376616766047186951
## 10 1376256378986201088
## 11 1378772889982410766
## 12 1378772308693688321
## 13 1378405661877403649
## 14 1375905467482931202
## 15 1376599080198103045
## 16 1376487077441855491
## 17 1378564346498928642
## 18 1376294537027584000
## 19 1376352746593456129
## 20 1376563871012507649
## text
## 1 Simon Parkes Update With great regret I must announce that children were being transported in cargo containers. The operation in the Suez Canal was a success and warrants me to do a video update...
## 2 Traffic jam at the Suez Canal. About 350 vessels.
## 3 To be clear The USS Dwight D. Eisenhower Aircraft Carrier.
## 4 Update
## 5 SEUZ CANAL BLOCKAGE HOW SHIP FROM SEUZ CANAL IN DETAIL
## 6 SEUZ CANAL BLOCKAGE HOW SHIP FROM SEUZ CANAL IN DETAIL
## 7 Who knows why the Asia Ruby III is still waiting to enter the? She was tugged out after thewas stranded and normally should have been one of the first vessels to pass
## 8 Ever Given is moving !
## 9 Good to see that Suez Canal Authorities give priority to vessels with livestock
## 10 Yes ! The Dutch have arrived !! Salvation is near !!
## 11 Can I name mine the Suez Canal because she's too shallow for the ? I hope it's not
## 12 Do you miss the fun ofblocking the Suez? The last time it happened was even weirder I really loved reading about the Great Bitter Lake Association via this piece onis wonderful and full of photos
## 13 The owner of the shipfiles a lawsuit against its operator for delinquency in theCanal
## 14 TheAuthority reveals the case of.. and explains the story of the bulldozers photos
## 15 International newspapers after the float one of the largest rescue operations in history
## 16 How the shipgot stuck in thethrough satellite A new video
## 17 The owner of the shipfiles a lawsuit against its operator for delinquency in theCanal
## 18 How the shipgot stuck in thethrough satellite A very new video
## 19 How the shipgot stuck in thethroughA very new video
## 20 The joy of the crew of theship as it exits the
## sentiment sentimentValue
## 1 Positive 3
## 2 Neutral 2
## 3 Neutral 2
## 4 Neutral 2
## 5 Neutral 2
## 6 Neutral 2
## 7 Negative 1
## 8 Positive 3
## 9 Neutral 2
## 10 Positive 3
## 11 Negative 1
## 12 Positive 3
## 13 Neutral 2
## 14 Neutral 2
## 15 Neutral 2
## 16 Neutral 2
## 17 Neutral 2
## 18 Neutral 2
## 19 Positive 3
## 20 Neutral 2
## [1] "Positive" "Neutral" "Negative" "Verypositive" "Verynegative"
## .
## Negative Neutral Positive Verynegative Verypositive
## 1180 2875 869 2 12
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()

sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()

# examine the vocabulary used in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% # group by the lowercased lemma
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()

## Joining, by = "word"
# examine the vocabulary used in negative tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
library(sentimentr)
mytext <- c(
'do you like it? But I hate really bad dogs',
'I am the best friend.',
'Do you really like it? I\'m not a fan'
)
mytext <- get_sentences(mytext) # the character vector becomes a get_sentences object: a list of character vectors, one element per input string, split into sentences

Sentiment scores from sentimentr range from -1 to 1: scores below 0 are negative, above 0 are positive, and 0 is neutral.
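A minimal sketch of the two sentimentr calls that likely produced the output below: sentiment_by() aggregates the score per original element, while sentiment() scores every individual sentence.

sentiment_by(mytext)  # average sentiment per element (element_id, word_count, sd, ave_sentiment)
sentiment(mytext)     # sentiment of each sentence (element_id, sentence_id, word_count, sentiment)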
## element_id word_count sd ave_sentiment
## 1: 1 10 1.497465 -0.8088680
## 2: 2 5 NA 0.5813777
## 3: 3 9 0.284605 0.2196345
## element_id sentence_id word_count sentiment
## 1: 1 1 4 0.2500000
## 2: 1 2 6 -1.8677359
## 3: 2 1 5 0.5813777
## 4: 3 1 5 0.4024922
## 5: 3 2 4 0.0000000
set.seed(10)
mytext <- get_sentences(tweets$text) # convert the text column into a list of sentence vectors
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing terms
sentiment_counts <- attributes(sentiment_words)$counts # count how often each term appears
sentiment_counts[polarity > 0,] # positive terms

## words polarity n
## 1: please 1.0 47
## 2: successfully 1.0 30
## 3: brilliant 1.0 3
## 4: quickly 1.0 3
## 5: master 1.0 3
## ---
## 335: considerable 0.1 1
## 336: sermon 0.1 1
## 337: prepare 0.1 1
## 338: reading 0.1 1
## 339: expand 0.1 1
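The negative terms listed after the "Selecting by n" message were likely produced by a similar filter on the counts table; a minimal sketch, where the top_n() selection is an assumption suggested by that message:

sentiment_counts[polarity < 0,] %>%   # negative terms
  top_n(11) %>%                       # keep the most frequent ones (dplyr selects by the n column)
  arrange(desc(n))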
## Selecting by n
## words polarity n
## 1 stuck -0.25 114
## 2 blocking -0.50 65
## 3 stranded -1.00 34
## 4 blocked -0.40 25
## 5 crisis -0.75 24
## 6 bitter -0.50 21
## 7 blockage -0.60 18
## 8 jam -1.00 17
## 9 aground -0.50 15
## 10 grounded -0.25 10
## 11 problem -0.75 10
set.seed(12)
df%>%
filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly pick 30 tweets
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in /tmp/RtmpENlj5O/polarity.html
## Opening /tmp/RtmpENlj5O/polarity.html ...
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)

coreNLP
sentimentr
Practice
Working in your study groups, pick an issue you are interested in and analyze related data. Any data source is fine (tweets or text from the text-analysis platform). Practice analyzing it with coreNLP or sentimentr, publish the result to RPubs, and upload the link to the "Week 7 HW" assignment on the course e-learning site. One upload per group is enough.