1. @JMSCHKU
# Profile of the @JMSCHKU Twitter account
jmsc_handle <- lookup_users("JMSCHKU")
jmsc_handle[["name"]] # Display name of the user
## [1] "JMSC"
jmsc_handle[["followers_count"]] # Number of followers
## [1] 3391
jmsc_handle[["description"]] # Profile description
## [1] "The Journalism and Media Studies Centre of The University of Hong Kong is Asia's premier journalism school."
jmsc_handle[["location"]] # Profile location
## [1] "HKU, Hong Kong"
jmsc_handle[["statuses_count"]] # Total number of posts
## [1] 5989
jmsc_handle[["account_created_at"]] # Account creation timestamp
## [1] "2009-11-12 10:02:33 UTC"
# Age of the account, in days, measured from the current system time
duration <- difftime(
  Sys.time(),
  jmsc_handle[["account_created_at"]],
  units = "days"
)
duration
## Time difference of 3069.694 days
# Average number of posts per day over the account's lifetime
jmsc_handle[["statuses_count"]] / as.numeric(duration)
## [1] 1.951009
# Request up to 5000 follower ids of @JMSCHKU
jmsc_folls <- get_followers("JMSCHKU", n = 5000)
head(jmsc_folls, n = 6L) # First 6 followers
## # A tibble: 6 x 1
## user_id
## <chr>
## 1 983139092522196992
## 2 983046779632418816
## 3 983028214040707072
## 4 958262990435794944
## 5 983001119499567104
## 6 982997640907341824
# Look up the profile of every follower id
jmsc_folls_data <- lookup_users(jmsc_folls[["user_id"]])
jmsc_folls_data[["screen_name"]][1] # Screen name of the first follower
## [1] "EEopM91F0JnnwCq"
jmsc_folls_data[["location"]][1] # Location of the first follower
## [1] ""
## Display the top 20 locations of the followers
class(jmsc_folls_data$location) # Check the data class of location
## [1] "character"
# head() instead of [1:20]: with fewer than 20 distinct values, [1:20] would
# pad the result with NA entries.
head(sort(table(jmsc_folls_data$location), decreasing = TRUE), 20)
##
## Hong Kong
## 1121 721
## Beijing Shanghai
## 50 29
## New York, NY Central & Western District
## 26 23
## Hong Kong Singapore
## 19 18
## China London
## 17 17
## New Delhi, India Paris, France
## 15 15
## People's Republic of China Washington, DC
## 14 14
## London, England Beijing, China
## 13 11
## HK United States
## 11 11
## New York <U+9999><U+6E2F>
## 10 9
# Drop blank locations BEFORE ranking. Removing "" after truncating to 20
# (as was done previously) leaves only 19 real places, because the blank
# value is itself one of the most frequent entries.
jfd_loc_top20 <- head(
  sort(
    table(jmsc_folls_data$location[nzchar(jmsc_folls_data$location)]),
    decreasing = TRUE
  ),
  20
)
# Geocode the locations and plot a map
jfd_loc <- geocode(names(jfd_loc_top20)) # Geocode the top locations
wmp <- getMap()
mapCountryData(
  wmp,
  colourPalette = rep("wheat", 7),
  oceanCol = "lightblue",
  addLegend = FALSE,
  mapTitle = "Locations of JMSC's Followers"
)
# Marker size grows linearly from 1 (least frequent) to 6 (most frequent).
# When every count is equal the range is zero, so fall back to a constant
# size instead of dividing by zero (which would give NaN sizes).
p_size <- as.numeric(jfd_loc_top20)
p_size <- if (length(p_size) > 0 && max(p_size) > min(p_size)) {
  1 + 5 * (p_size - min(p_size)) / (max(p_size) - min(p_size))
} else {
  rep(1, length(p_size))
}
points(jfd_loc$lon, jfd_loc$lat, col = "red", cex = p_size, type = "p")

## More about Followers
class(jmsc_folls_data$followers_count) # Check the data class of followers_count
## [1] "integer"
summary(jmsc_folls_data$followers_count) # Show its summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 15 107 4494 610 1942736
# Top 10 followers by their own follower count. head() avoids the NA padding
# that [1:10] produces when there are fewer than 10 rows.
jmsc_folls_data[
  head(order(jmsc_folls_data$followers_count, decreasing = TRUE), 10),
  c("screen_name", "followers_count")
]
## # A tibble: 10 x 2
## screen_name followers_count
## <chr> <int>
## 1 britishmuseum 1942736
## 2 Lara 1783498
## 3 soledadobrien 830095
## 4 DrEricGrabowsky 707526
## 5 TayeDiggs 670277
## 6 BraveLad 542922
## 7 TelNews1 453481
## 8 indymediacn 333754
## 9 roseluqiu 298268
## 10 HerbertRSim 259351
# Bin followers by their own follower count. cut() builds left-open intervals
# such as (0,10], so without include.lowest = TRUE every account with exactly
# 0 followers (the minimum shown in the summary above) becomes NA and is
# silently left out of the table and the pie chart.
jmsc_followers_grp <- cut(
  jmsc_folls_data$followers_count,
  breaks = c(0, 10, 100, 500, 1000, 2500, max(jmsc_folls_data$followers_count)),
  include.lowest = TRUE
)
jmsc_followers_grp_table <- table(jmsc_followers_grp)
# Pie chart of the bins; sort = FALSE keeps the slices in bin order.
p <- plot_ly(
  labels = names(jmsc_followers_grp_table),
  values = as.vector(jmsc_followers_grp_table),
  type = "pie",
  sort = FALSE,
  textposition = "inside",
  textinfo = "label+percent"
)
p <- layout(
  p,
  title = "JMSC's Followers",
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
)
p
# Account language of the followers, most common first
sort(
  table(jmsc_folls_data[["account_lang"]]),
  decreasing = TRUE
)
##
## en zh-cn zh-tw en-gb zh-CN es fr ko de
## 2644 222 110 72 64 52 43 28 22
## zh-TW ja zh-Hans en-GB id it pt nl ru
## 17 16 16 12 12 9 7 6 6
## ar ca da sv zh-HK th tr vi bn
## 5 4 4 4 4 3 3 2 1
## el hi zh-Hant
## 1 1 1
# Same tally after lower-casing the codes, so that variants differing only in
# case (e.g. zh-CN and zh-cn) are counted together
sort(
  table(tolower(jmsc_folls_data[["account_lang"]])),
  decreasing = TRUE
)
##
## en zh-cn zh-tw en-gb es fr ko de ja
## 2644 286 127 84 52 43 28 22 16
## zh-hans id it pt nl ru ar ca da
## 16 12 9 7 6 6 5 4 4
## sv zh-hk th tr vi bn el hi zh-hant
## 4 4 3 3 2 1 1 1 1
# Description
# Build a cleaned text corpus from a character vector.
#
# doc: character vector; it is wrapped with VectorSource() and turned into a
#      corpus with Corpus().
# Returns the corpus after three steps applied in this order: lower-casing of
# the ASCII letters A-Z, punctuation removal, English stop-word removal.
#
# NOTE(review): Corpus, VectorSource, tm_map, content_transformer,
# removePunctuation, removeWords and stopwords are presumably provided by the
# tm package attached elsewhere in this file -- confirm.
Preprocessing <- function(doc) {
  # Create the corpus
  doc.corpus <- Corpus(VectorSource(doc))

  # Lower-case with chartr() (only A-Z are mapped). The closure is wrapped in
  # content_transformer() so that it modifies the content of each document and
  # hands back a document; a bare closure returns a plain character vector,
  # which only works for some corpus classes.
  to_lower_ascii <- content_transformer(function(x) {
    chartr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz", x)
  })
  doc.corpus <- tm_map(doc.corpus, to_lower_ascii)

  # Remove punctuation
  doc.corpus <- tm_map(doc.corpus, removePunctuation)

  # Remove English stop words; removeWords is passed directly with the word
  # list forwarded through tm_map's `...`
  doc.corpus <- tm_map(doc.corpus, removeWords, stopwords("english"))

  doc.corpus
}
# Keep only alphanumeric characters and spaces in the followers' descriptions
jfd_des <- gsub(
  "[^[:alnum:] ]",
  "",
  jmsc_folls_data[["description"]]
)
# Merge all descriptions into one space-separated string
jfd_des <- paste(jfd_des, collapse = " ")
# Clean the text with Preprocessing() and draw a word cloud of the terms that
# occur at least 30 times
jfd_des_tm <- Preprocessing(jfd_des)
wordcloud(
  jfd_des_tm,
  scale = c(4, 0.5),
  min.freq = 30,
  max.words = Inf,
  random.order = FALSE,
  colors = brewer.pal(8, "Accent")
)

# Followers count vs. average posts per day
# avgppd = total posts divided by the account age in days (as of now)
jmsc_folls_data$avgppd <- jmsc_folls_data$statuses_count /
  as.numeric(
    difftime(Sys.time(), jmsc_folls_data$account_created_at, units = "days")
  )
# One scatter plot object serves both renderings below; only the axis type
# passed to layout() differs.
p <- plot_ly(
  data = jmsc_folls_data,
  x = ~followers_count,
  y = ~avgppd,
  text = ~screen_name,
  type = "scatter",
  mode = "markers"
)
# Rendering 1: linear axes
layout(
  p,
  title = "Followers count vs. Average activities per day @JMSCHKU (in linear scale)",
  xaxis = list(title = "Followers Count"),
  yaxis = list(title = "Average posts per day")
)
# Rendering 2: both axes on a log scale
layout(
  p,
  title = "Followers count vs. Average activities per day @JMSCHKU (both xy in log scale)",
  xaxis = list(title = "Followers Count", type = "log"),
  yaxis = list(title = "Average posts per day", type = "log")
)
# @JMSCHKU's timeline (request up to 3200 posts)
jmsc_timeline <- get_timeline("JMSCHKU", n = 3200)
## Share of posts that are retweets
sum(jmsc_timeline$is_retweet) / nrow(jmsc_timeline)
## [1] 0.6612953
## Most retweeted post (tweet + retweet)
jmsc_timeline[which.max(jmsc_timeline$retweet_count), c("text", "retweet_count", "is_retweet")]
## # A tibble: 1 x 3
## text retweet_count is_retweet
## <chr> <int> <lgl>
## 1 RT @kreshjun: Camera shutter speed synced to h~ 12293 T
## Most retweeted post (tweet only)
jmsc_timeline_ori <- jmsc_timeline[!jmsc_timeline$is_retweet, ]
jmsc_timeline_ori[which.max(jmsc_timeline_ori$retweet_count), c("text", "retweet_count", "is_retweet")]
## # A tibble: 1 x 3
## text retweet_count is_retweet
## <chr> <int> <lgl>
## 1 It's Prof Chan's last day today. First present~ 19 F
## Top 3 reply
# head() instead of [1:3]: no NA padding when fewer than 3 names exist.
head(sort(table(jmsc_timeline$reply_to_screen_name), decreasing = TRUE), 3)
##
## JMSCHKU keithrichburg Wasifshakil
## 62 5 5
## Top 3 mention
head(sort(table(unlist(jmsc_timeline$mentions_screen_name)), decreasing = TRUE), 3)
##
## JMSCHKU keithrichburg cmphku
## 446 197 126
## Time trend
# Posts per calendar month (tweets and retweets together)
jmsc_tt <- table(format(jmsc_timeline$created_at, "%Y-%m"))
# Original tweets per month, tabulated over the SAME set of months as jmsc_tt.
# A month containing only retweets would otherwise be absent here, leaving
# this series shorter than the x values the second trace inherits from the
# first and putting its points on the wrong months. Using the months of
# jmsc_tt as factor levels fills such months with 0.
jmsc_tt_ori <- table(
  factor(format(jmsc_timeline_ori$created_at, "%Y-%m"), levels = names(jmsc_tt))
)
p <- plot_ly(
  x = names(jmsc_tt),
  y = as.vector(jmsc_tt),
  name = "Tweets + Retweets",
  type = "scatter",
  mode = "lines"
)
p <- add_trace(p, y = as.vector(jmsc_tt_ori), name = "Tweets only")
layout(
  p,
  title = "@JMSCHKU's Post History (latest 3200 posts)",
  xaxis = list(title = "Year-Month"),
  yaxis = list(title = "Number of Posts")
)
2. @KEITHRICHBURG

# Request up to 5500 follower ids of @keithrichburg
keith_folls <- get_followers("keithrichburg", n = 5500)
# Look up the profile of every follower id
keith_folls_data <- lookup_users(keith_folls[["user_id"]])
# Heuristic bot filter for follower profiles.
#
# The following indicators provide the strongest signals to separate bots from
# humans:
#   1. absence of a user description (whether the public Twitter profile looks
#      like the default one or has been customized);
#   2. absence of geographical metadata (humans often use smartphones and the
#      Twitter iPhone/Android app, which records the physical location of the
#      mobile device as a digital footprint);
#   3. a low number of followers, say no more than 10;
#   4. a recent account creation date, say less than a year ago;
#   5. a non-verified user.
# A row is dropped only when ALL five indicators point to a bot; any single
# sign of a real user is enough to keep it.
#
# x:                data.frame as returned by rtweet::lookup_users. The columns
#                   read are followers_count, location, description,
#                   account_created_at and verified.
# followers_cutoff: rows with followers_count strictly above this are kept.
# date_cutoff:      anything as.Date() accepts; rows whose account was created
#                   strictly before this date are kept.
# Returns the rows of x that are not flagged as bots.
#
# Missing values count as "no evidence of a real user". Without this, one NA
# in a tested column makes the row mask NA, and indexing with NA yields a row
# filled with NA that inflates nrow() of the result.
remove_bot <- function(x, followers_cutoff, date_cutoff) {
  # TRUE only where v is TRUE; NA becomes FALSE
  is_true <- function(v) !is.na(v) & v
  # TRUE only for non-missing, non-empty strings (nzchar(NA) alone is TRUE)
  has_text <- function(v) !is.na(v) & nzchar(v)

  keep <-
    is_true(x$followers_count > followers_cutoff) |
    has_text(x$location) |
    has_text(x$description) |
    is_true(as.Date(x$account_created_at) < as.Date(date_cutoff)) |
    is_true(x$verified)

  x[keep, , drop = FALSE]
}
# Filter @keithrichburg's followers: cut-offs are 10 followers and an account
# creation date of 2017-04-09
keith_folls_data_nobot <- remove_bot(
  keith_folls_data,
  followers_cutoff = 10,
  date_cutoff = "2017-04-09"
)
# Follower count before the filter
nrow(keith_folls_data)
## [1] 5183
# Follower count after the filter
nrow(keith_folls_data_nobot)
## [1] 5014
# Same filter and cut-offs for @JMSCHKU's followers
jmsc_folls_data_nobot <- remove_bot(
  jmsc_folls_data,
  followers_cutoff = 10,
  date_cutoff = "2017-04-09"
)
# Follower count before the filter
nrow(jmsc_folls_data)
## [1] 3391
# Follower count after the filter
nrow(jmsc_folls_data_nobot)
## [1] 3280
# Percentage of followers flagged as bots (@keithrichburg)
round(
  100 * (nrow(keith_folls_data) - nrow(keith_folls_data_nobot)) / nrow(keith_folls_data),
  2
)
## [1] 3.26
# Percentage of followers flagged as bots (@JMSCHKU)
round(
  100 * (nrow(jmsc_folls_data) - nrow(jmsc_folls_data_nobot)) / nrow(jmsc_folls_data),
  2
)
## [1] 3.27
## Screen names of the flagged "bots": followers of @keithrichburg that are
## absent from the filtered set
keith_folls_data$screen_name[
  !(keith_folls_data$screen_name %in% keith_folls_data_nobot$screen_name)
]
## [1] "jason67449857" "bennykung51" "ajcfnzbqvix1959"
## [4] "bradleyleong28" "tommy63823274" "PcbjasonZhang"
## [7] "AFfe58" "rOgfBWnW6SNfo7J" "10693_net"
## [10] "willing_hunter" "EvemVon" "Phoebe_729"
## [13] "wolf_12138" "ppppppp42936934" "ggsgydhhy"
## [16] "Honma45859841" "Jacksonshi4" "Rita__yin"
## [19] "zyj1361" "rach53978713" "copwowo"
## [22] "BarclayMoses" "ogegetfreaahen2" "RYU_2387"
## [25] "tanxj1" "ALVIN53501090" "K0HAF1DkJMuupGM"
## [28] "fibryika87" "Sally94841473" "Puxun119937224"
## [31] "woshinibabahh" "1fEN9hUbdHW4xGj" "chenyunwen98"
## [34] "AianLiu" "iris114hk" "cJE9GuNpkm5CBsD"
## [37] "rhanhphong6409" "quynhchaoia560" "SP_Yeung"
## [40] "wx_mjh" "dannymak48" "pbvmgjhihv"
## [43] "vVQZf9ROepk9zJw" "805106862Snow" "903914249Cherry"
## [46] "Moore14252420" "KoCheukLun" "zahirud03311953"
## [49] "mrferrero1" "light9_Man" "cyw012"
## [52] "lwwhiufung1" "marthaproduct" "mmbpfbzibe"
## [55] "jaypaul41950649" "franslo811" "WongWaiCheong5"
## [58] "anYnRM3UkMvUEGo" "Huhang1995" "JacqueMaonio"
## [61] "zhr2018331" "DavidY72984261" "HeyVlogDV"
## [64] "RebeccaChung18" "Alex007Grant" "KSR_neal"
## [67] "marques2987" "BillyLai10" "LEE02637871"
## [70] "yangchiking" "herman_hok" "bobochan01"
## [73] "dchen020" "breeze842200979" "pengyuan8592121"
## [76] "laoyuanqu" "lyy45541064" "samuelw73271848"
## [79] "zzy50774888" "chungch65495555" "miketso200528"
## [82] "Kevinzhang113" "yezhixingxh" "JudgerLee"
## [85] "zHPwWPEiiedksvl" "ju520jing" "xiaofeng1898"
## [88] "qfish3" "TANG_Louie_BJ" "Jinkela63Nmb63"
## [91] "PargatSinghGil7" "SuihanshiTunfo" "liuchaovsying"
## [94] "wuwei75272952" "Shahid97591971" "fzjXBnhLR2fIeAm"
## [97] "cici07317316" "Jerry60992076" "Melondly"
## [100] "khakaghy" "Vivienn37402853" "Carmanc2018"
## [103] "hahuekin" "Gemingrm" "ChanRaff"
## [106] "shzu_wu" "RoyiJiang" "Hese87821976"
## [109] "RanMinzhuwansiu" "guoan007" "JeremyChiang711"
## [112] "J71S6v" "oML69dbOONXZ6iG" "xiao_bo1999"
## [115] "gLcDgwHcREyvF1l" "GzLtabcovWcr2Zf" "EasyMining_2018"
## [118] "qianz262" "JackMjj007" "CXLGkSW6GiXiv0C"
## [121] "Jifsy2" "GIFTOFLIFE1412" "Lucio39917218"
## [124] "Zowip2" "KBz2lQB2eQf1RYq" "johnnywang1108"
## [127] "eobeans" "Jasmine91726794" "hao30624300"
## [130] "louis5881" "Mogehair" "GWaiGeorge1"
## [133] "kwTP9LqjWQ694sB" "lixiaoming88" "LucyZeng4602"
## [136] "cathy63149106" "ng_laikwan" "johntit39768620"
## [139] "pmZm5aEutttmJAE" "saifai1211" "michaelw2001_76"
## [142] "chen_everywill" "Liyinggang6" "ERICGONG13"
## [145] "soochikeung3" "zwcode" "klh11649452500"
## [148] "Yau00608694" "onchakchui1" "skyuan953"
## [151] "76RrPTXmO1WQDmM" "Wilson1983NG" "jason_zx08"
## [154] "BenjaminYan9" "yvialLvVDnd3ThD" "liuyanjunzoe"
## [157] "Pauline64849978" "thanuka66" "fordhambaldie"
## [160] "mediazoomer" "Fyngirz" "Joris23492276"
## [163] "adam_addieba" "Ice_anywhere" "ChhimSithar"
## [166] "maria_juanias" "JoyRead7" "francesisgreat"
## [169] "Geraldi29921239"