基本介紹

1. coreNLP

安裝package

# Packages needed by this walkthrough. The list now also covers scales,
# magrittr and stringr, which are attached with library() below but were
# missing from the install list, and no longer repeats dplyr.
packages <- c(
  "dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree",
  "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR",
  "scales", "magrittr", "stringr"
)
# Names of the packages already installed in the current library paths.
existing <- rownames(installed.packages())
# Install only what is missing; skip the call entirely when nothing is missing.
missing_packages <- setdiff(packages, existing)
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
# Load the R objects stored in coreNLP_all.RData into the current environment.
# Objects in the file replace any same-named objects already in the workspace.
# NOTE(review): presumably cached results for this walkthrough - confirm.
load("coreNLP_all.RData")

1.1 資料收集:tweets

(1). Twitter API設定 透過rtweet抓取tweets

# Twitter API credentials are read from environment variables (for example
# defined in ~/.Renviron) instead of being written into the script, so the
# secrets are never shared together with the source. Any keys that were
# previously hard-coded here should be treated as leaked and regenerated.
app <- "Emotions COVID-19 Vaccine"
consumer_key <- Sys.getenv("TWITTER_CONSUMER_KEY")
consumer_secret <- Sys.getenv("TWITTER_CONSUMER_SECRET")
access_token <- Sys.getenv("TWITTER_ACCESS_TOKEN")
access_secret <- Sys.getenv("TWITTER_ACCESS_SECRET")

# Fail early with a clear message when a credential has not been configured.
twitter_credentials <- c(
  TWITTER_CONSUMER_KEY = consumer_key,
  TWITTER_CONSUMER_SECRET = consumer_secret,
  TWITTER_ACCESS_TOKEN = access_token,
  TWITTER_ACCESS_SECRET = access_secret
)
if (any(!nzchar(twitter_credentials))) {
  stop(
    "Missing Twitter credentials; set the environment variable(s): ",
    paste(names(twitter_credentials)[!nzchar(twitter_credentials)],
          collapse = ", "),
    call. = FALSE
  )
}

# Build the token for this session only (set_renv = FALSE: do not store it).
twitter_token <- create_token(app, consumer_key, consumer_secret,
                              access_token, access_secret, set_renv = FALSE)
# Consumer keys: identify who you are
# Authentication tokens: the authorization granted to you

(2). 設定關鍵字抓tweets

# Search terms: the hashtag of interest plus a context word.
key <- "#COVID-19"
context <- "Vaccine"
# Resulting query string: "#COVID-19 AND Vaccine".
# Searching for #COVID-19 alone would also return tweets unrelated to
# vaccines, so "Vaccine" is required to appear as well.
q <- paste(key, context, sep = " AND ")

# Request 5000 English tweets, leaving out retweets.
tweets <- search_tweets(
  q,
  n = 5000,
  lang = "en",
  include_rts = FALSE,
  token = twitter_token
)

(3). tweets內容清理

#' Clean raw tweet text
#'
#' Strips non-ASCII bytes, HTML character entities, URLs, @mentions and
#' #hashtags, removes every character other than letters, digits, spaces and
#' the punctuation ? ! . ', and finally normalises whitespace.
#'
#' @param txt A character vector of tweet texts.
#' @return A character vector of the same length as `txt` with cleaned text.
clean <- function(txt) {
  # Re-encode to ASCII, dropping bytes that cannot be represented
  # (this is what removes emoji and other non-ASCII symbols).
  txt <- iconv(txt, "latin1", "ASCII", sub = "")
  # Remove HTML entities such as &amp; or &#39;. The match stops at the first
  # ';' so text between two separate entities is kept, and it runs before the
  # hashtag rule so numeric entities (&#39;) are not mangled by it.
  txt <- gsub("&#?[a-zA-Z0-9]+;", "", txt)
  # Remove URLs only up to the next whitespace, keeping the text after them.
  txt <- gsub("https?://\\S*", "", txt)
  # Remove @mentions and #hashtags (letters, digits, underscore).
  txt <- gsub("[@#]\\w+", "", txt)
  # Turn newlines, tabs and runs of blanks into one space before the character
  # filter below, which would otherwise delete them and glue words together.
  txt <- gsub("\\s+", " ", txt)
  # Keep only letters, digits, spaces and ? ! . '
  txt <- gsub("[^a-zA-Z0-9?!. ']", "", txt)
  # Collapse spaces left behind by the removals and trim both ends.
  txt <- gsub("\\s+", " ", txt)
  gsub("^\\s+|\\s+$", "", txt)
}


# Run the cleaning routine over the text of every tweet.
tweets$text <- clean(tweets$text)

# Bind the tweets onto an empty data frame to obtain the working table `df`.
df <- rbind(data.frame(), tweets)

# Keep only the first occurrence of each status_id (drops duplicated tweets).
is_repeat <- duplicated(df[, "status_id"])
df <- df[!is_repeat, ]
head(df)
## # A tibble: 6 x 90
##   user_id  status_id   created_at          screen_name  text             source 
##   <chr>    <chr>       <dttm>              <chr>        <chr>            <chr>  
## 1 4412084~ 1378792114~ 2021-04-04 19:30:31 BorjeMelin   Simon Parkes Up~ Twitte~
## 2 4412084~ 1376322876~ 2021-03-28 23:58:39 BorjeMelin   Traffic jam at ~ Twitte~
## 3 4412084~ 1378773153~ 2021-04-04 18:15:10 BorjeMelin   To be clear The~ Twitte~
## 4 4412084~ 1378312805~ 2021-04-03 11:45:55 BorjeMelin   Update           Twitte~
## 5 3228051~ 1378782399~ 2021-04-04 18:51:55 Deveshjaiswl SEUZ CANAL BLOC~ Twitte~
## 6 3228051~ 1378782341~ 2021-04-04 18:51:41 Deveshjaiswl SEUZ CANAL BLOC~ Twitte~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

df共有90個欄位,但我們在這裡僅會使用幾個欄位:

  • user_id: 用戶id
  • status_id : 推文id
  • created_at : 發文時間
  • text : 推文內容
  • source : 發文來源

了解資料的資料筆數以及時間分布

created_at已經是一個日期時間(POSIXct,即上方輸出中的 <dttm>)類型的欄位,因此可以直接用min、max來看最早或最近的發文時間
註:rtweet的search_tweets使用Twitter標準搜尋API,只能抓到距今約6–9天內的資料

# Number of rows (tweets) in df
nrow(df)
## [1] 4958
# Earliest created_at value in df
min(df$created_at)
## [1] "2021-03-27 18:35:51 UTC"
# Latest created_at value in df
max(df$created_at)
## [1] "2021-04-04 19:30:31 UTC"