
require(rtweet) # load the required libraries
require(plotly)
require(wordcloud)
require(tidytext)
jmschku_handle <- readRDS("jmschku_handle.rds")
jmschku_folls_data <- readRDS("jmschku_folls_data.rds")
jmschku_timeline <- readRDS("jmschku_timeline.rds")
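# These three objects were saved earlier with saveRDS(). A sketch of how the
# same data could be fetched live with rtweet (requires Twitter API
# authentication; the handle and sizes below mirror the saved data and are
# assumptions):
# jmschku_handle <- lookup_users("JMSCHKU")
# jmschku_folls <- get_followers("JMSCHKU", n = 5000)
# jmschku_folls_data <- lookup_users(jmschku_folls$user_id)
# jmschku_timeline <- get_timeline("JMSCHKU", n = 3200)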
jmschku_handle$name # Name of the user
## [1] "HKU Journalism"
jmschku_handle$followers_count # Followers count
## [1] 6597
jmschku_handle$description # User's description
## [1] "Founded in 1999, the Journalism and Media Studies Centre of The University of Hong Kong offers professional journalism education at Asia’s premier university."
jmschku_handle$location # Location
## [1] "Hong Kong"
jmschku_handle$statuses_count # Total number of posts
## [1] 7149
jmschku_handle$account_created_at # Date the account was created
## [1] "2009-11-12 10:02:33 UTC"
duration <- difftime(Sys.time(),jmschku_handle$account_created_at,units='days')
duration
## Time difference of 4170.985 days
jmschku_handle$statuses_count/as.numeric(duration) # Average number of posts per day
## [1] 1.713983
head(jmschku_folls_data$screen_name)
## [1] "HKU_BASc" "DigitalMarathi1" "Baxter74740635" "imtherealdeal63"
## [5] "shirley841222" "chihin_ko"
head(jmschku_folls_data$location)
## [1] "Hong Kong" "" "" "" "" "Shandong"
class(jmschku_folls_data$location)
## [1] "character"
sort(table(jmschku_folls_data$location),decreasing = TRUE)[1:20] # List top 20 user locations
## ""                           2169
## Hong Kong                     719
## Beijing                        42
## People's Republic of China     42
## 香港                           35
## Shanghai                       32
## United States                  30
## London, England                29
## Singapore                      26
## Washington, DC                 26
## London                         23
## Central & Western District     22
## Guangdong                      19
## India                          17
## New York, NY                   17
## China                          15
## New Delhi, India               15
## United Kingdom                 15
## California, USA                12
## Los Angeles, CA                11
jfd_loc_top20 <- sort(table(jmschku_folls_data$location),decreasing = TRUE)[1:20]
jfd_loc_top20 <- jfd_loc_top20[names(jfd_loc_top20) != ""] # Remove the empty-string location
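# A bar-chart view of the same counts; a minimal sketch using the plotly
# package already loaded above (note plotly sorts a categorical axis
# alphabetically unless told otherwise):
plot_ly(x = names(jfd_loc_top20), y = as.numeric(jfd_loc_top20), type = 'bar')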
class(jmschku_folls_data$followers_count) # Check the data class of followers_count
## [1] "integer"
summary(jmschku_folls_data$followers_count) # Show its summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##       0       4      48    3596     411 3177652
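# The mean (3596) is far above the median (48): a heavily right-skewed
# distribution driven by a handful of very large accounts. A sketch of a
# log-scale view (the +1 guards against log10(0)):
hist(log10(jmschku_folls_data$followers_count + 1), main = "Followers (log10 scale)", xlab = "log10(followers + 1)")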
jmschku_folls_data[order(jmschku_folls_data$followers_count,decreasing = T)[1:10],c("screen_name","followers_count")] # Top 10 followers by their own follower counts
## # A tibble: 10 x 2
## screen_name followers_count
## <chr> <int>
## 1 verified 3177652
## 2 AlaattinCAGIL 1468118
## 3 soledadobrien 1346396
## 4 zlj517 893778
## 5 _yavuzatalay 760808
## 6 joshuawongcf 743024
## 7 DaveVescio 519527
## 8 BraveLad 488232
## 9 roseluqiu 311032
## 10 zenjournalist 272657
jmschku_followers_grp <- cut(jmschku_folls_data$followers_count,breaks = c(0,10,100,500,1000,2500,max(jmschku_folls_data$followers_count)))
jmschku_followers_grp_table <- table(jmschku_followers_grp)
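# Note: cut() uses left-open intervals by default, so accounts with exactly 0
# followers fall outside (0,10] and are excluded from the table; passing
# include.lowest = TRUE to cut() would keep them.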
p <- plot_ly(labels = names(jmschku_followers_grp_table), values = jmschku_followers_grp_table, type = 'pie', sort = FALSE, textposition = 'inside', textinfo = 'label+percent')
p <- layout(p, title = "JMSC's Followers", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE), yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
sort(table(jmschku_folls_data$lang),decreasing=T)
##
## en zh und ja es fr in ko pt de ne hi ar it tl ru
## 2435 435 351 159 80 43 32 22 22 21 20 18 15 15 13 12
## tr ur fi sv nl th et cs fa da no pl ca el ht lv
## 12 10 9 9 8 8 7 5 4 3 3 3 2 2 2 2
## pa vi am eu km kn ps sr
## 2 2 1 1 1 1 1 1
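# English dominates; a quick check of the share of followers whose profile
# language is English (a sketch, assuming the lang column has no other
# missing-value encodings):
mean(jmschku_folls_data$lang == "en", na.rm = TRUE)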
Preprocessing <- function(x){
  x <- gsub('https?://[0-9a-zA-Z./]+','',x) # remove all URLs (http and https)
  x <- gsub('#[0-9a-zA-Z_]+','',x)          # remove all hashtags
  x <- gsub('@[0-9a-zA-Z_]+','',x)          # remove all @mentions
  x <- gsub('[[:punct:]]',' ',x)            # replace punctuation with spaces
  x <- tolower(x)                           # lower case
  x <- trimws(x)                            # remove leading and trailing whitespace
  stopw <- get_stopwords()                  # tidytext's stop word lexicon
  stopw <- tolower(stopw$word)
  x <- unnest_tokens(data.frame(txt=x),word,txt) # tokenization
  x <- tolower(x$word)
  x <- x[!(x %in% stopw)]                   # drop stop words
  return(x)
}
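# A quick sanity check of Preprocessing() on a made-up tweet (the input text
# is hypothetical):
Preprocessing("Check our new MOOC at https://example.com #datajournalism @JMSCHKU!")
# Should return only the remaining content words: "check" "new" "mooc"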
jfd_des <- paste(jmschku_folls_data$description,collapse=" ") # Collapse all follower descriptions into one string
jfd_des_tm <- Preprocessing(jfd_des)
wordcloud(jfd_des_tm, scale=c(4,.5), min.freq=30, max.words=Inf, random.order=F, colors=brewer.pal(8, "Accent"))
## Loading required namespace: tm
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
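# The frequencies behind the word cloud can also be inspected directly; a
# minimal sketch:
head(sort(table(jfd_des_tm), decreasing = TRUE), 10)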

jmschku_folls_data$avgppd <- jmschku_folls_data$statuses_count/as.numeric(difftime(Sys.time(),jmschku_folls_data$account_created_at,units='days'))
p <- plot_ly(data = jmschku_folls_data, x = ~followers_count, y = ~avgppd, text = ~screen_name, type = 'scatter', mode = 'markers')
# Output in linear scale
layout(p, title = "Followers count vs. Average activities per day @JMSCHKU (in linear scale)", xaxis = list(title = "Followers Count"), yaxis = list(title = "Average posts per day"))
# Output in log scale (layout() does not modify p, so the same plot object can be reused)
layout(p, title = "Followers count vs. Average activities per day @JMSCHKU (both xy in log scale)", xaxis = list(title = "Followers Count", type = "log"), yaxis = list(title = "Average posts per day", type = "log"))
# % of retweets
sum(jmschku_timeline$is_retweet)/nrow(jmschku_timeline)
## [1] 0.6125806
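# The same approach extends to other post types; a sketch assuming the usual
# rtweet columns reply_to_status_id and is_quote are present:
mean(!is.na(jmschku_timeline$reply_to_status_id)) # share of replies
mean(jmschku_timeline$is_quote)                   # share of quote tweets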
# Top retweet
jmschku_timeline[which.max(jmschku_timeline$retweet_count),c("created_at","text","retweet_count","is_retweet")]
## # A tibble: 1 x 4
## created_at text retweet_count is_retweet
## <dttm> <chr> <int> <lgl>
## 1 2020-11-07 18:43:22 The front page of The New York T~ 22607 TRUE
jmschku_timeline_ori <- jmschku_timeline[!jmschku_timeline$is_retweet,]
# Top original post
jmschku_timeline_ori[which.max(jmschku_timeline_ori$retweet_count),c("created_at","text","retweet_count","is_retweet")]
## # A tibble: 1 x 4
## created_at text retweet_count is_retweet
## <dttm> <chr> <int> <lgl>
## 1 2020-05-03 11:48:44 "We’re excited to announce the l~ 247 FALSE
# Top 5 replies
sort(table(jmschku_timeline$reply_to_screen_name),decreasing=T)[1:5]
##
## JMSCHKU keithrichburg MasatoKJ HKUniversity annekrugernews
## 145 27 12 4 3
# Top 5 mentions
sort(table(unlist(jmschku_timeline$mentions_screen_name)),decreasing=T)[1:5]
##
## JMSCHKU keithrichburg fcchk HKUniversity annielab_jmsc
## 437 297 189 123 101
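# Note: mentions_screen_name is a list-column (one character vector of
# mentioned handles per tweet), hence the unlist() before tabulating.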
# Time trend
jmschku_tt <- table(format(jmschku_timeline$created_at,"%Y-%m"))
jmschku_tt_ori <- table(format(jmschku_timeline_ori$created_at,"%Y-%m"))
p <- plot_ly(x = names(jmschku_tt), y = jmschku_tt, name = "Tweets + Retweets", type = 'scatter', mode = 'lines')
p <- add_trace(p, y = jmschku_tt_ori, name = "Tweets only")
layout(p, title = "@JMSCHKU's Post History (latest 3200 posts)", xaxis = list(title = "Year-Month"), yaxis = list (title = "Number of Posts"))
# The following indicators provide the strongest signals for separating bots from humans:
# 1. absence of a user description (the public Twitter profile looks like the default one rather than a customized one);
# 2. absence of geographical metadata (humans often use smartphones and the Twitter iPhone/Android app, which records the physical location of the mobile device as a digital footprint);
# 3. a low follower count, say fewer than 10;
# 4. a recent account creation date, say less than a year old; and,
# 5. a non-verified account.
#
# x is the data.frame returned from rtweet::lookup_users
remove_bot <- function(x,followers_cutoff,date_cutoff){
  # Keep an account only if it shows at least one "human" signal
  return(x[
    x$followers_count > followers_cutoff |                 # enough followers
    nchar(gsub('^[ ]*|[ ]*$','',x$location)) != 0 |        # non-empty location
    nchar(gsub('^[ ]*|[ ]*$','',x$description)) != 0 |     # non-empty description
    as.Date(x$account_created_at) < as.Date(date_cutoff) | # old enough account
    x$verified,])                                          # verified account
}
jmschku_folls_data_nobot <- remove_bot(jmschku_folls_data,10,"2019-04-03")
# Number of followers before bot removal
nrow(jmschku_folls_data)
## [1] 5000
# Number of followers after bot removal
nrow(jmschku_folls_data_nobot)
## [1] 4761
# % of detected bots (@JMSCHKU)
round(100*(nrow(jmschku_folls_data)-nrow(jmschku_folls_data_nobot))/nrow(jmschku_folls_data),2)
## [1] 4.78
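# A quick sensitivity check (the cutoff values below are hypothetical): how
# the follower cutoff changes the number of accounts kept
sapply(c(5, 10, 50), function(k) nrow(remove_bot(jmschku_folls_data, k, "2019-04-03")))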
# Show names of detected "bots"
jmschku_folls_data[!(jmschku_folls_data$screen_name %in% jmschku_folls_data_nobot$screen_name),]$screen_name
## [1] "DigitalMarathi1" "uHaOVXZy9hXztlp" "J9Bv2" "KittyWo14750316"
## [5] "JasonCh32901805" "suhanigurung14" "margaretyako" "FrauHKHKHKHK"
## [9] "conutooh" "tLbH8TPAk33SHsN" "SC41008188" "thphlfxl"
## [13] "TrnHuGiaKhnh3" "AlexLeePhD" "SaSa64120452" "JvR9ftBUp5FISBO"
## [17] "TracerGG" "Q41323703" "yrzdmng" "DianaLAlvarezM1"
## [21] "WGeynS4gtxofq7y" "shGjXkH4fsaI42X" "olivert90770261" "Gin78952955"
## [25] "hamidirayhane" "Khnh97100230" "agnesbubugmail1" "allenwa75315010"
## [29] "ianclar75196745" "AliceSerene" "laahi_sahal" "hui11231"
## [33] "Robert201909" "abbyhlr" "aJ57v8RKlHdGbff" "Pert16183118"
## [37] "YMHYudeovFDpTqL" "Ningwei214" "wp6sOyxusisDWSy" "BeeDee41591352"
## [41] "lufi18042218" "zengdafei" "AT18611" "Wellin54982085"
## [45] "ZzxCooper" "sunnyltc1" "natalie19252113" "shinerise7"
## [49] "kikilam55658006" "lila42799429" "CheungLlewellyn" "rachel3913"
## [53] "wish82268072" "hub_lit" "BillyLa93066135" "Jessica63579549"
## [57] "beok99225775" "PLmama2" "EssentialK" "ParisJussieu"
## [61] "Aslanthelion3" "DBecks1975" "vhLat52fut9RlcY" "OliviaW74069096"
## [65] "PieaGac" "GSTrump2" "matthewkllaw" "Faye71359424"
## [69] "Desmond95990546" "lina07307655" "PeterHendrix17" "shenghaoke"
## [73] "andrew_carbo" "stephan35270357" "onlyyouboy186" "yingyun_he"
## [77] "p8JjlU83oGgdFdu" "Anya21894105" "ChuenChuen11" "AliceWa53732257"
## [81] "ENovSBMhwr625y5" "kcohbo" "KwaUW3JD6xHUQK7" "JikKam"
## [85] "dDQxRGZakRunlan" "10SGo2yxFGgGB5F" "MarcoWu31508834" "Shirley06029973"
## [89] "gyhklllve" "wangzi84433595" "lumosstar_" "4MjZ1J5nsFw3RLq"
## [93] "Kyosekika" "Barbara31437208" "Novice99502312" "ChanYiug"
## [97] "CandyCh51213290" "Jack94219819" "harrypotter9678" "bluefloyd4"
## [101] "liutongecho" "achilles1019" "v3JB0yWnziruYMf" "Cheinhui1"
## [105] "yangyang618713" "aC8x5k87GLUBJcV" "Jerome_Ou" "HExcitg"
## [109] "iansui1" "joechen34032604" "wangRicky3" "wosirn"
## [113] "k18023081959" "sihuichen2" "Brandon26491947" "W0OG6OO2Z76jFCo"
## [117] "Momo55078240" "jameslee_1229" "FEFDYI91ywqubB1" "wwwww_abcde"
## [121] "jack57958615" "XVBLbYJk6Knk6gC" "chestnattiioslo" "n38879746"
## [125] "ghftlam" "yongzi8599" "JoshuaL89291212" "candy61518804"
## [129] "Asahina27182321" "Zhangyf67903612" "AgwmRNHKOWD8hx3" "alovelycat234"
## [133] "handsome0624" "Egg06817374" "SBC86148004" "RZFo3LkDd2zkjE2"
## [137] "Hjs_Kirito" "egg62087207" "lxy880524" "vandavinci2"
## [141] "yyevayu" "faker74727225" "Muhamma25230295" "File014"
## [145] "Hongkon49849006" "kdUWy3xVuHHkYNV" "fefayama" "ChehAmber"
## [149] "emmad64412036" "william53384478" "13_2w" "chehanhui"
## [153] "U8tbUEMUkUo9byh" "bolinlai1" "amy37236916" "chingyuennn"
## [157] "Beichen_Xiong" "iYTJVbCPoNE8yk0" "jessica62613216" "chanyungpen"
## [161] "gulu65349260" "CarmanChan18" "yufei0514" "CatherineCY2"
## [165] "DollyDu2" "ruth81826833" "chen_chuqiao" "Orangant1"
## [169] "lohasli" "tonykai7" "TyroneOOOOO" "Acliec1"
## [173] "YMLeung1" "EY78868337" "Rossellawang" "LiaoZhenyi"
## [177] "nan36247006" "ann_10699" "SARIPHUONG1" "ZhangJamine"
## [181] "Cristal56781" "fkuhkpopo" "Dai84922137" "kyle92157825"
## [185] "hoho95763942" "clarenc99465820" "adrianlwy1" "8964Freedom"
## [189] "LIUYU0101" "Ocean12865" "Chenxiayan1" "zhanlupany13"
## [193] "liaoshirushen1" "KenChan90747657" "linshuiyifeng" "kkhong14"
## [197] "Darry25529523" "runtolearnlove" "Celeste32079130" "zhangjiakuan1"
## [201] "immdaone" "mizukwii" "ChanGamwai" "jeb_alex"
## [205] "samibenji1" "Rebecca91728695" "ALPACIN80871083" "Lydia16551709"
## [209] "Parry1229295143" "wakemybody" "P3kIOHIB4FNAjUO" "Shiva56378847"
## [213] "UGyC8RlO8pYQjIl" "Loveee56835683" "aNMdOFuWdUHCVz2" "JU3KjLWP9HeuVQU"
## [217] "Hassanm56727211" "Shp4rBvXVuyAYOK" "DmpDrQhPsRefnRe" "Wutter8"
## [221] "vanessatmk" "yyg35176704" "live_likein2004" "jj03967293"
## [225] "WCleo1225" "tanshaoqiang" "nimph_peng" "YukiLi79584282"
## [229] "hu_hongxia" "monica741789632" "EllaSong18" "SagaAndersson13"
## [233] "GuanSuzee" "Ko06095832" "jaycc84801703" "yuyaojie12138"
## [237] "noel_liang" "AndySui10" "GaN99258952"