
# Load the required libraries
require(rtweet)    # Twitter API client
require(plotly)    # interactive charts
require(wordcloud) # word clouds (also attaches RColorBrewer)
require(tm)        # text mining / corpus preprocessing
jmschku_handle <- readRDS("jmschku_handle.rds")         # profile of the @JMSCHKU account
jmschku_folls_data <- readRDS("jmschku_folls_data.rds") # profiles of its followers
jmschku_timeline <- readRDS("jmschku_timeline.rds")     # its recent tweets
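# The cached .rds objects above were presumably produced by rtweet calls
# along these lines (a sketch, not necessarily the exact original code;
# running it requires a valid Twitter API token, and column names such as
# user_id vary across rtweet versions):
# jmschku_handle <- lookup_users("JMSCHKU")
# jmschku_folls <- get_followers("JMSCHKU", n = 5000)       # follower IDs
# jmschku_folls_data <- lookup_users(jmschku_folls$user_id) # hydrate profiles
# jmschku_timeline <- get_timeline("JMSCHKU", n = 3200)     # latest 3200 posts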
jmschku_handle$name # Name of the user
## [1] "JMSC"
jmschku_handle$followers_count # Followers count
## [1] 5748
jmschku_handle$description # User's description
## [1] "Founded in 1999, the Journalism and Media Studies Centre of The University of Hong Kong offers professional journalism education at Asia’s premier university."
jmschku_handle$location # Location
## [1] "Hong Kong"
jmschku_handle$statuses_count # Total number of posts
## [1] 6724
jmschku_handle$account_created_at # Date the account was created
## [1] "2009-11-12 10:02:33 UTC"
duration <- difftime(Sys.time(),jmschku_handle$account_created_at,units='days')
duration
## Time difference of 3794.79 days
jmschku_handle$statuses_count/as.numeric(duration) # Average number of posts per day
## [1] 1.771903
head(jmschku_folls_data$screen_name)
## [1] "HenryCheung_7" "andre__gabriel" "Chen8975" "wwm1029"
## [5] "RyanYue18" "VivekjiSharma"
head(jmschku_folls_data$location)
## [1] "" "Linz, Austria" "Salt Lake City" ""
## [5] "Hong Kong" "India"
class(jmschku_folls_data$location)
## [1] "character"
sort(table(jmschku_folls_data$location),decreasing = TRUE)[1:20] # List top 20 user locations
##
## ""                           2074
## Hong Kong                     820
## Beijing                        50
## People's Republic of China     40
## London, England                36
## Shanghai                       34
## 香港                           27
## New York, NY                   25
## United States                  24
## Washington, DC                 24
## London                         23
## Singapore                      22
## Central & Western District     21
## Guangdong                      21
## China                          19
## New Delhi, India               15
## HK                             13
## Los Angeles, CA                13
## California, USA                12
## Hong Kong                      12
jfd_loc_top20 <- sort(table(jmschku_folls_data$location),decreasing = TRUE)[1:20]
jfd_loc_top20 <- jfd_loc_top20[names(jfd_loc_top20) != ""] # Drop the empty-string location
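# The cleaned top-20 table lends itself to a quick bar chart. A sketch using
# the same plotly idiom as the charts below (p_loc is a new variable):
p_loc <- plot_ly(x = names(jfd_loc_top20), y = as.numeric(jfd_loc_top20), type = 'bar')
layout(p_loc, title = "Top follower locations (self-reported)", xaxis = list(title = "Location"), yaxis = list(title = "Number of followers"))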
class(jmschku_folls_data$followers_count) # Check the data class of followers_count
## [1] "integer"
summary(jmschku_folls_data$followers_count) # Show its summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 5 55 4004 432 2893336
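# The mean (4004) far exceeds the median (55), so follower counts are heavily
# right-skewed. A log-scale histogram makes the shape visible (a sketch; the
# +1 avoids log10(0) for accounts with zero followers):
p_hist <- plot_ly(x = log10(jmschku_folls_data$followers_count + 1), type = 'histogram')
layout(p_hist, title = "Distribution of Followers' Follower Counts", xaxis = list(title = "log10(followers_count + 1)"), yaxis = list(title = "Number of accounts"))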
jmschku_folls_data[order(jmschku_folls_data$followers_count,decreasing = T)[1:10],c("screen_name","followers_count")] # The 10 followers with the largest follower counts of their own
## # A tibble: 10 x 2
## screen_name followers_count
## <chr> <int>
## 1 verified 2893336
## 2 Lara 1755024
## 3 AlaattinCAGIL 1554834
## 4 soledadobrien 1179506
## 5 _yavuzatalay 855688
## 6 TayeDiggs 643144
## 7 DaveVescio 547762
## 8 zlj517 544359
## 9 BraveLad 524660
## 10 joshuawongcf 519713
# Bin followers by their own follower counts; include.lowest = TRUE keeps
# accounts with exactly 0 followers in the first bin rather than dropping them as NA
jmschku_followers_grp <- cut(jmschku_folls_data$followers_count,breaks = c(0,10,100,500,1000,2500,max(jmschku_folls_data$followers_count)),include.lowest = TRUE)
jmschku_followers_grp_table <- table(jmschku_followers_grp)
p <- plot_ly(labels = names(jmschku_followers_grp_table), values = jmschku_followers_grp_table, type = 'pie', sort = FALSE, textposition = 'inside', textinfo = 'label+percent')
p <- layout(p, title = "JMSC's Followers", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE), yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
sort(table(jmschku_folls_data$lang),decreasing=T) # Followers' language settings, most common first
##
## en zh und ja es fr de in ko ar pt ne tl it hi ru
## 2668 397 328 126 75 53 30 30 21 20 17 16 16 14 12 12
## ur tr da et nl th pl sv ca fi bn ht no ro cs vi
## 11 10 8 7 6 6 5 5 4 4 3 3 3 3 2 2
## am cy el eu fa hu km kn lv ps sl sr
## 1 1 1 1 1 1 1 1 1 1 1 1
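# Language codes follow BCP 47 ("und" means undetermined). A bar chart of the
# top 10 (a sketch; jfd_lang_top10 and p_lang are new variables):
jfd_lang_top10 <- sort(table(jmschku_folls_data$lang), decreasing = TRUE)[1:10]
p_lang <- plot_ly(x = names(jfd_lang_top10), y = as.numeric(jfd_lang_top10), type = 'bar')
layout(p_lang, title = "Followers' Languages (top 10)", xaxis = list(title = "Language code"), yaxis = list(title = "Number of followers"))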
Preprocessing <- function(doc){
  # Create a corpus from the input text
  doc.corpus <- VCorpus(VectorSource(doc))
  doc.corpus <- tm_map(doc.corpus, content_transformer(tolower)) # convert to lower case
  doc.corpus <- tm_map(doc.corpus, content_transformer(function(x) gsub('https?://[0-9a-zA-Z./]+','',x))) # remove URLs (http and https)
  doc.corpus <- tm_map(doc.corpus, content_transformer(function(x) gsub('#[0-9a-zA-Z_]+','',x))) # remove hashtags
  doc.corpus <- tm_map(doc.corpus, content_transformer(function(x) gsub('@[0-9a-zA-Z_]+','',x))) # remove @-mentions
  doc.corpus <- tm_map(doc.corpus, removePunctuation) # remove punctuation
  doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english")) # remove English stopwords
  return(doc.corpus)
}
jfd_des <- paste(jmschku_folls_data$description,collapse=" ") # Collapse all follower descriptions into one document
jfd_des_tm <- Preprocessing(jfd_des)
wordcloud(jfd_des_tm, scale=c(4,.5), min.freq=30, max.words=Inf, random.order=F, colors=brewer.pal(8, "Accent"))
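# Before trusting the cloud, it can help to inspect the raw term frequencies.
# A sketch using tm's TermDocumentMatrix on the same corpus (jfd_tdm and
# jfd_term_freq are new variables):
jfd_tdm <- TermDocumentMatrix(jfd_des_tm)
jfd_term_freq <- sort(rowSums(as.matrix(jfd_tdm)), decreasing = TRUE)
head(jfd_term_freq, 10) # the ten most frequent terms in followers' descriptions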

# Average posts per day for each follower
jmschku_folls_data$avgppd <- jmschku_folls_data$statuses_count/as.numeric(difftime(Sys.time(),jmschku_folls_data$account_created_at,units='days'))
p <- plot_ly(data = jmschku_folls_data, x = ~followers_count, y = ~avgppd, text = ~screen_name, type = 'scatter', mode = 'markers')
# Output in linear scale
layout(p, title = "Followers count vs. Average activities per day @JMSCHKU (in linear scale)", xaxis = list(title = "Followers Count"), yaxis = list(title = "Average posts per day"))
# Output in log scale; layout() does not modify p, so the same plot object is reused
layout(p, title = "Followers count vs. Average activities per day @JMSCHKU (both xy in log scale)", xaxis = list(title = "Followers Count", type = "log"), yaxis = list(title = "Average posts per day", type = "log"))
# Share of retweets in the timeline
sum(jmschku_timeline$is_retweet)/nrow(jmschku_timeline)
## [1] 0.6466165
# Most-retweeted post in the timeline (for a retweet, retweet_count refers to the original tweet)
jmschku_timeline[which.max(jmschku_timeline$retweet_count),c("created_at","text","retweet_count","is_retweet")]
## # A tibble: 1 x 4
## created_at text retweet_count is_retweet
## <dttm> <chr> <int> <lgl>
## 1 2017-03-07 03:20:47 Camera shutter speed synced to h~ 11368 TRUE
jmschku_timeline_ori <- jmschku_timeline[!jmschku_timeline$is_retweet,]
# Top original post
jmschku_timeline_ori[which.max(jmschku_timeline_ori$retweet_count),c("created_at","text","retweet_count","is_retweet")]
## # A tibble: 1 x 4
## created_at text retweet_count is_retweet
## <dttm> <chr> <int> <lgl>
## 1 2020-03-28 14:50:07 Our Bachelor of Journalism alumn~ 133 FALSE
# Top 5 accounts replied to
sort(table(jmschku_timeline$reply_to_screen_name),decreasing=T)[1:5]
##
## JMSCHKU keithrichburg MasatoKJ HKUniversity annekrugernews
## 91 15 11 4 3
# Top 5 accounts mentioned
sort(table(unlist(jmschku_timeline$mentions_screen_name)),decreasing=T)[1:5]
##
## JMSCHKU keithrichburg fcchk HKUniversity HongKongFP
## 477 257 135 121 96
# Time trend of posting activity, by month
jmschku_tt <- table(format(jmschku_timeline$created_at,"%Y-%m"))
jmschku_tt_ori <- table(format(jmschku_timeline_ori$created_at,"%Y-%m"))
p <- plot_ly(x = names(jmschku_tt), y = jmschku_tt, name = "Tweets + Retweets", type = 'scatter', mode = 'lines')
p <- add_trace(p, x = names(jmschku_tt_ori), y = jmschku_tt_ori, name = "Tweets only") # pass x explicitly: months with no original posts are absent from jmschku_tt_ori
layout(p, title = "@JMSCHKU's Post History (latest 3200 posts)", xaxis = list(title = "Year-Month"), yaxis = list (title = "Number of Posts"))
# The following indicators provide the strongest signals for separating bots from humans:
# 1. absence of a user description (the public Twitter profile looks like the default one rather than a customized one);
# 2. absence of geographical metadata (humans often tweet from smartphones, whose Twitter apps record the device's physical location as a digital footprint);
# 3. a low number of followers, e.g. fewer than 10;
# 4. a recently created account, e.g. less than a year old; and
# 5. a non-verified account.
# The function below drops an account only if it fails all five tests.
#
# x is the data.frame returned by rtweet::lookup_users
remove_bot <- function(x, followers_cutoff, date_cutoff){
  # Keep an account if ANY of the human signals is present
  return(x[
    x$followers_count > followers_cutoff |                 # enough followers
    nchar(gsub('^[ ]*|[ ]*$','',x$location)) != 0 |        # non-empty location
    nchar(gsub('^[ ]*|[ ]*$','',x$description)) != 0 |     # non-empty description
    as.Date(x$account_created_at) < as.Date(date_cutoff) | # old enough account
    x$verified, ])                                         # verified account
}
jmschku_folls_data_nobot <- remove_bot(jmschku_folls_data,10,"2019-04-03")
# Number of followers before bot removal
nrow(jmschku_folls_data)
## [1] 5000
# Number of followers after bot removal
nrow(jmschku_folls_data_nobot)
## [1] 4839
# % of detected bots (@JMSCHKU)
round(100*(nrow(jmschku_folls_data)-nrow(jmschku_folls_data_nobot))/nrow(jmschku_folls_data),2)
## [1] 3.22
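# A quick plausibility check on the flagged accounts (a sketch; flagged is a
# new variable): by the heuristics above they should be young, sparsely
# followed, and unverified.
flagged <- !(jmschku_folls_data$screen_name %in% jmschku_folls_data_nobot$screen_name)
summary(jmschku_folls_data$followers_count[flagged])             # follower counts of flagged accounts
summary(as.Date(jmschku_folls_data$account_created_at[flagged])) # creation dates of flagged accounts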
# Screen names of the detected "bots"
jmschku_folls_data[!(jmschku_folls_data$screen_name %in% jmschku_folls_data_nobot$screen_name),]$screen_name
## [1] "LinusChen7" "gyhklllve" "wangzi84433595" "KennethLiu"
## [5] "lumosstar_" "4MjZ1J5nsFw3RLq" "Kyosekika" "ocVoD1Mxje4O70m"
## [9] "Barbara31437208" "JamesLo76136894" "Novice99502312" "ChanYiug"
## [13] "CandyCh51213290" "Jack94219819" "harrypotter9678" "bluefloyd4"
## [17] "missliutong" "achilles1019" "Bon9421" "v3JB0yWnziruYMf"
## [21] "Cheinhui1" "yangyang618713" "aC8x5k87GLUBJcV" "Jerome_Ou"
## [25] "HExcitg" "iansui1" "joechen34032604" "wangRicky3"
## [29] "wosirn" "k18023081959" "onlyJanuaryaaa" "sihuichen2"
## [33] "Brandon26491947" "W0OG6OO2Z76jFCo" "Momo55078240" "FEFDYI91ywqubB1"
## [37] "Megan52986522" "wangharry5" "jack57958615" "XVBLbYJk6Knk6gC"
## [41] "chestnattiioslo" "n38879746" "ghftlam" "yongzi8599"
## [45] "JoshuaL89291212" "candy61518804" "Asahina27182321" "Zhangyf67903612"
## [49] "AgwmRNHKOWD8hx3" "alovelycat234" "handsome0624" "Egg06817374"
## [53] "SBC86148004" "RZFo3LkDd2zkjE2" "Hjs_Kirito" "egg62087207"
## [57] "lxy880524" "yyevayu" "faker74727225" "Muhamma25230295"
## [61] "File014" "Hongkon49849006" "FarhanH63398493" "cerliem"
## [65] "kdUWy3xVuHHkYNV" "Praveen85188042" "fefayama" "ChehAmber"
## [69] "emmad64412036" "william53384478" "13_2w" "chehanhui"
## [73] "U8tbUEMUkUo9byh" "bolinlai1" "amy37236916" "chingyuennn"
## [77] "Beichen_Xiong" "iYTJVbCPoNE8yk0" "jessica62613216" "chanyungpen"
## [81] "gulu65349260" "CarmanChan18" "yufei0514" "CatherineCY2"
## [85] "DollyDu2" "ruth81826833" "chen_chuqiao" "Orangant1"
## [89] "lohasli" "tonykai7" "TyroneOOOOO" "Acliec1"
## [93] "YMLeung1" "EY78868337" "Rossellawang" "LiaoZhenyi"
## [97] "nan36247006" "ann_10699" "Cristal56781" "fkuhkpopo"
## [101] "Dai84922137" "kyle92157825" "hoho95763942" "clarenc99465820"
## [105] "adrianlwy1" "8964Freedom" "LIUYU0101" "MinGW80854183"
## [109] "Chenxiayan1" "zhanlupany13" "liaoshirushen1" "KenChan90747657"
## [113] "linshuiyifeng" "kkhong14" "Darry25529523" "ChanKalim"
## [117] "runtolearnlove" "Celeste32079130" "zhangjiakuan1" "immdaone"
## [121] "mizukwii" "ChanGamwai" "jeb_alex" "samibenji1"
## [125] "Rebecca91728695" "ALPACIN80871083" "Lydia16551709" "Parry1229295143"
## [129] "wakemybody" "VXDHlmqI5ZVOI6m" "P3kIOHIB4FNAjUO" "falinjizhen"
## [133] "Shiva56378847" "UGyC8RlO8pYQjIl" "Loveee56835683" "aNMdOFuWdUHCVz2"
## [137] "JU3KjLWP9HeuVQU" "Hassanm56727211" "anisha_kukreja" "Shp4rBvXVuyAYOK"
## [141] "DmpDrQhPsRefnRe" "Wutter8" "vanessa10034" "yyg35176704"
## [145] "live_likein2004" "jj03967293" "WCleo1225" "tanshaoqiang"
## [149] "nimph_peng" "YukiLi79584282" "hu_hongxia" "monica741789632"
## [153] "EllaSong18" "SagaAndersson13" "GuanSuzee" "Ko06095832"
## [157] "jaycc84801703" "yuyaojie12138" "noel_liang" "AndySui10"
## [161] "GaN99258952"