require(rtweet)  # load the required libraries
require(plotly)
require(wordcloud)
require(tidytext)
jmschku_handle <- readRDS("jmschku_handle.rds")
jmschku_folls_data <- readRDS("jmschku_folls_data.rds")
jmschku_timeline <- readRDS("jmschku_timeline.rds")
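# Not run (a sketch): how the cached .rds objects above could be regenerated
# with rtweet, assuming a valid API token and that the account's screen name
# is "jmschku" (as the plot titles below suggest); n mirrors the sizes used.
# jmschku_handle     <- lookup_users("jmschku")
# jmschku_folls      <- get_followers("jmschku", n = 5000)
# jmschku_folls_data <- lookup_users(jmschku_folls$user_id)
# jmschku_timeline   <- get_timeline("jmschku", n = 3200)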
jmschku_handle$name # Name of the user
## [1] "HKU Journalism"
jmschku_handle$followers_count # Followers count
## [1] 6597
jmschku_handle$description # User's description
## [1] "Founded in 1999, the Journalism and Media Studies Centre of The University of Hong Kong offers professional journalism education at Asia’s premier university."
jmschku_handle$location # Location
## [1] "Hong Kong"
jmschku_handle$statuses_count # Total number of posts 
## [1] 7149
jmschku_handle$account_created_at # Account creation date
## [1] "2009-11-12 10:02:33 UTC"
duration <- difftime(Sys.time(),jmschku_handle$account_created_at,units='days') 
duration
## Time difference of 4170.985 days
jmschku_handle$statuses_count/as.numeric(duration) # Average number of posts per day
## [1] 1.713983
head(jmschku_folls_data$screen_name)
## [1] "HKU_BASc"        "DigitalMarathi1" "Baxter74740635"  "imtherealdeal63"
## [5] "shirley841222"   "chihin_ko"
head(jmschku_folls_data$location)
## [1] "Hong Kong" ""          ""          ""          ""          "Shandong"
class(jmschku_folls_data$location)
## [1] "character"
sort(table(jmschku_folls_data$location),decreasing = TRUE)[1:20] # List top 20 user locations
## 
##                                             Hong Kong 
##                       2169                        719 
##                    Beijing People's Republic of China 
##                         42                         42 
##                       香港                   Shanghai 
##                         35                         32 
##              United States            London, England 
##                         30                         29 
##                  Singapore             Washington, DC 
##                         26                         26 
##                     London Central & Western District 
##                         23                         22 
##                  Guangdong                      India 
##                         19                         17 
##               New York, NY                      China 
##                         17                         15 
##           New Delhi, India             United Kingdom 
##                         15                         15 
##            California, USA            Los Angeles, CA 
##                         12                         11
jfd_loc_top20 <- sort(table(jmschku_folls_data$location),decreasing = TRUE)[1:20]
jfd_loc_top20 <- jfd_loc_top20[names(jfd_loc_top20) != ""] # Drop the blank ("") location entry
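# A quick visual sketch (not in the original output): the cleaned top-20
# locations as a plotly bar chart, preserving the table's descending order.
p_loc <- plot_ly(x = names(jfd_loc_top20), y = as.numeric(jfd_loc_top20), type = 'bar')
layout(p_loc, title = "Top follower locations (self-reported)",
       xaxis = list(title = "Location", categoryorder = "array",
                    categoryarray = names(jfd_loc_top20)),
       yaxis = list(title = "Number of followers"))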

class(jmschku_folls_data$followers_count) # Check the data class of followers_count
## [1] "integer"
summary(jmschku_folls_data$followers_count) # Show its summary
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       4      48    3596     411 3177652
jmschku_folls_data[order(jmschku_folls_data$followers_count,decreasing = T)[1:10],c("screen_name","followers_count")] # Top 10 followers by their own follower counts
## # A tibble: 10 x 2
##    screen_name   followers_count
##    <chr>                   <int>
##  1 verified              3177652
##  2 AlaattinCAGIL         1468118
##  3 soledadobrien         1346396
##  4 zlj517                 893778
##  5 _yavuzatalay           760808
##  6 joshuawongcf           743024
##  7 DaveVescio             519527
##  8 BraveLad               488232
##  9 roseluqiu              311032
## 10 zenjournalist          272657
jmschku_followers_grp <- cut(jmschku_folls_data$followers_count,breaks = c(0,10,100,500,1000,2500,max(jmschku_folls_data$followers_count)))
jmschku_followers_grp_table <- table(jmschku_followers_grp)
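# Optional sketch: the same binning with readable labels. include.lowest = TRUE
# also keeps accounts with exactly 0 followers, which the cut() above maps to
# NA because its first interval is (0,10]. The variable name and label strings
# are illustrative, not from the original analysis.
jmschku_followers_grp_lab <- cut(jmschku_folls_data$followers_count,
                                 breaks = c(0, 10, 100, 500, 1000, 2500,
                                            max(jmschku_folls_data$followers_count)),
                                 labels = c("0-10", "11-100", "101-500",
                                            "501-1000", "1001-2500", "2500+"),
                                 include.lowest = TRUE)
table(jmschku_followers_grp_lab)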

p <- plot_ly(labels = names(jmschku_followers_grp_table), values = jmschku_followers_grp_table, type = 'pie', sort = FALSE, textposition = 'inside', textinfo = 'label+percent') 
p <- layout(p, title = "JMSC's Followers", xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE), yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
p
sort(table(jmschku_folls_data$lang),decreasing=T) # Distribution of followers' profile languages
## 
##   en   zh  und   ja   es   fr   in   ko   pt   de   ne   hi   ar   it   tl   ru 
## 2435  435  351  159   80   43   32   22   22   21   20   18   15   15   13   12 
##   tr   ur   fi   sv   nl   th   et   cs   fa   da   no   pl   ca   el   ht   lv 
##   12   10    9    9    8    8    7    5    4    3    3    3    2    2    2    2 
##   pa   vi   am   eu   km   kn   ps   sr 
##    2    2    1    1    1    1    1    1
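# Sketch (not in the original output): the top 10 follower languages as a bar
# chart, using the same plotly conventions as the pie chart above.
lang_top10 <- sort(table(jmschku_folls_data$lang), decreasing = TRUE)[1:10]
p_lang <- plot_ly(x = names(lang_top10), y = as.numeric(lang_top10), type = 'bar')
layout(p_lang, title = "Followers' profile languages (top 10)",
       xaxis = list(title = "Language code", categoryorder = "array",
                    categoryarray = names(lang_top10)),
       yaxis = list(title = "Number of followers"))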
Preprocessing <- function(x){
  x <- gsub('https?://[0-9a-zA-Z./]+','',x) # remove all URLs (http and https)
  x <- gsub('#[0-9a-zA-Z_]+','',x) # remove all hashtags
  x <- gsub('@[0-9a-zA-Z_]+','',x) # remove all @mentions
  x <- gsub('[[:punct:]]',' ',x)   # replace all punctuation with spaces
  x <- tolower(x) # convert to lower case
  x <- trimws(x) # trim both leading and trailing whitespace
  stopw <- get_stopwords()
  stopw <- tolower(stopw$word)
  x <- unnest_tokens(data.frame(txt=x),word,txt) # tokenization
  x <- tolower(x$word)
  x <- x[!(x %in% stopw)]
  return(x)
}
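# Sanity check (sketch, made-up input): URLs, hashtags, mentions, punctuation
# and stopwords should all be stripped, leaving roughly c("quick", "test").
Preprocessing("A quick test! https://t.co/abc123 #rstats @JMSCHKU")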

jfd_des <- paste(jmschku_folls_data$description,collapse=" ") # Collapse all descriptions into one string
jfd_des_tm <- Preprocessing(jfd_des)
wordcloud(jfd_des_tm, scale=c(4,.5), min.freq=30, max.words=Inf, random.order=F, colors=brewer.pal(8, "Accent"))  
## Loading required namespace: tm
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

jmschku_folls_data$avgppd <- jmschku_folls_data$statuses_count/as.numeric(difftime(Sys.time(),jmschku_folls_data$account_created_at,units='days'))

p <- plot_ly(data = jmschku_folls_data, x = ~followers_count, y = ~avgppd, text = ~screen_name, type = 'scatter', mode = 'markers')

# Output in linear scale
layout(p, title = "Followers count vs. Average activities per day @JMSCHKU (in linear scale)", xaxis = list(title = "Followers Count"), yaxis = list (title = "Average posts per day"))
p <- plot_ly(data = jmschku_folls_data, x = ~followers_count, y = ~avgppd, text = ~screen_name, type = 'scatter', mode = 'markers')

# Output in log scale
layout(p, title = "Followers count vs. Average activities per day @JMSCHKU (both xy in log scale)", xaxis = list(title = "Followers Count", type = "log"), yaxis = list (title = "Average posts per day", type = "log"))
# % of retweets in the timeline
sum(jmschku_timeline$is_retweet)/nrow(jmschku_timeline)
## [1] 0.6125806
# Top retweet
jmschku_timeline[which.max(jmschku_timeline$retweet_count),c("created_at","text","retweet_count","is_retweet")]
## # A tibble: 1 x 4
##   created_at          text                              retweet_count is_retweet
##   <dttm>              <chr>                                     <int> <lgl>     
## 1 2020-11-07 18:43:22 The front page of The New York T~         22607 TRUE
jmschku_timeline_ori <- jmschku_timeline[!jmschku_timeline$is_retweet,]
# Top original post
jmschku_timeline_ori[which.max(jmschku_timeline_ori$retweet_count),c("created_at","text","retweet_count","is_retweet")]
## # A tibble: 1 x 4
##   created_at          text                              retweet_count is_retweet
##   <dttm>              <chr>                                     <int> <lgl>     
## 1 2020-05-03 11:48:44 "We’re excited to announce the l~           247 FALSE
# Top 5 most-replied-to accounts
sort(table(jmschku_timeline$reply_to_screen_name),decreasing=T)[1:5]
## 
##        JMSCHKU  keithrichburg       MasatoKJ   HKUniversity annekrugernews 
##            145             27             12              4              3
# Top 5 most-mentioned accounts
sort(table(unlist(jmschku_timeline$mentions_screen_name)),decreasing=T)[1:5]
## 
##       JMSCHKU keithrichburg         fcchk  HKUniversity annielab_jmsc 
##           437           297           189           123           101
# Time trend
jmschku_tt <- table(format(jmschku_timeline$created_at,"%Y-%m"))
jmschku_tt_ori <- table(format(jmschku_timeline_ori$created_at,"%Y-%m"))
p <- plot_ly(x = names(jmschku_tt), y = jmschku_tt, name = "Tweets + Retweets", type = 'scatter', mode = 'lines')
p <- add_trace(p, x = names(jmschku_tt_ori), y = jmschku_tt_ori, name = "Tweets only") # pass x explicitly so the two traces stay aligned
layout(p, title = "@JMSCHKU's Post History (latest 3200 posts)", xaxis = list(title = "Year-Month"), yaxis = list (title = "Number of Posts"))
# The following indicators provide the strongest signals for separating bots
# from humans:
# 1. Absence of a user description (the public Twitter profile looks like the
#    default one rather than a customised one);
# 2. Absence of geographical metadata (humans often tweet from the Twitter
#    iPhone/Android app, which records the device's physical location as a
#    digital footprint);
# 3. A low number of followers, say fewer than 10;
# 4. A recently created account, say less than a year old; and
# 5. A non-verified account.
#
# x is the data.frame returned from rtweet::lookup_users

remove_bot <- function(x, followers_cutoff, date_cutoff) {
  # Keep a row when at least one "human" signal is present
  return(x[
    x$followers_count > followers_cutoff |                    # enough followers
    nchar(gsub('^[ ]*|[ ]*$', '', x$location)) != 0 |         # non-blank location
    nchar(gsub('^[ ]*|[ ]*$', '', x$description)) != 0 |      # non-blank description
    as.Date(x$account_created_at) < as.Date(date_cutoff) |    # account old enough
    x$verified,                                               # verified account
  ])
}

jmschku_folls_data_nobot <- remove_bot(jmschku_folls_data,10,"2019-04-03")

# Number of followers before bot removal
nrow(jmschku_folls_data)
## [1] 5000
# Number of followers after bot removal
nrow(jmschku_folls_data_nobot)
## [1] 4761
# % of detected bots (@JMSCHKU)
round(100*(nrow(jmschku_folls_data)-nrow(jmschku_folls_data_nobot))/nrow(jmschku_folls_data),2)
## [1] 4.78
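# Sensitivity check (sketch, not part of the original analysis): how the
# detected-bot share moves as the followers cutoff is varied.
sapply(c(0, 5, 10, 25, 50), function(fc) {
  kept <- remove_bot(jmschku_folls_data, fc, "2019-04-03")
  round(100 * (nrow(jmschku_folls_data) - nrow(kept)) / nrow(jmschku_folls_data), 2)
})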
# Show screen names of the detected "bots"
jmschku_folls_data[!(jmschku_folls_data$screen_name %in% jmschku_folls_data_nobot$screen_name),]$screen_name
##   [1] "DigitalMarathi1" "uHaOVXZy9hXztlp" "J9Bv2"           "KittyWo14750316"
##   [5] "JasonCh32901805" "suhanigurung14"  "margaretyako"    "FrauHKHKHKHK"   
##   [9] "conutooh"        "tLbH8TPAk33SHsN" "SC41008188"      "thphlfxl"       
##  [13] "TrnHuGiaKhnh3"   "AlexLeePhD"      "SaSa64120452"    "JvR9ftBUp5FISBO"
##  [17] "TracerGG"        "Q41323703"       "yrzdmng"         "DianaLAlvarezM1"
##  [21] "WGeynS4gtxofq7y" "shGjXkH4fsaI42X" "olivert90770261" "Gin78952955"    
##  [25] "hamidirayhane"   "Khnh97100230"    "agnesbubugmail1" "allenwa75315010"
##  [29] "ianclar75196745" "AliceSerene"     "laahi_sahal"     "hui11231"       
##  [33] "Robert201909"    "abbyhlr"         "aJ57v8RKlHdGbff" "Pert16183118"   
##  [37] "YMHYudeovFDpTqL" "Ningwei214"      "wp6sOyxusisDWSy" "BeeDee41591352" 
##  [41] "lufi18042218"    "zengdafei"       "AT18611"         "Wellin54982085" 
##  [45] "ZzxCooper"       "sunnyltc1"       "natalie19252113" "shinerise7"     
##  [49] "kikilam55658006" "lila42799429"    "CheungLlewellyn" "rachel3913"     
##  [53] "wish82268072"    "hub_lit"         "BillyLa93066135" "Jessica63579549"
##  [57] "beok99225775"    "PLmama2"         "EssentialK"      "ParisJussieu"   
##  [61] "Aslanthelion3"   "DBecks1975"      "vhLat52fut9RlcY" "OliviaW74069096"
##  [65] "PieaGac"         "GSTrump2"        "matthewkllaw"    "Faye71359424"   
##  [69] "Desmond95990546" "lina07307655"    "PeterHendrix17"  "shenghaoke"     
##  [73] "andrew_carbo"    "stephan35270357" "onlyyouboy186"   "yingyun_he"     
##  [77] "p8JjlU83oGgdFdu" "Anya21894105"    "ChuenChuen11"    "AliceWa53732257"
##  [81] "ENovSBMhwr625y5" "kcohbo"          "KwaUW3JD6xHUQK7" "JikKam"         
##  [85] "dDQxRGZakRunlan" "10SGo2yxFGgGB5F" "MarcoWu31508834" "Shirley06029973"
##  [89] "gyhklllve"       "wangzi84433595"  "lumosstar_"      "4MjZ1J5nsFw3RLq"
##  [93] "Kyosekika"       "Barbara31437208" "Novice99502312"  "ChanYiug"       
##  [97] "CandyCh51213290" "Jack94219819"    "harrypotter9678" "bluefloyd4"     
## [101] "liutongecho"     "achilles1019"    "v3JB0yWnziruYMf" "Cheinhui1"      
## [105] "yangyang618713"  "aC8x5k87GLUBJcV" "Jerome_Ou"       "HExcitg"        
## [109] "iansui1"         "joechen34032604" "wangRicky3"      "wosirn"         
## [113] "k18023081959"    "sihuichen2"      "Brandon26491947" "W0OG6OO2Z76jFCo"
## [117] "Momo55078240"    "jameslee_1229"   "FEFDYI91ywqubB1" "wwwww_abcde"    
## [121] "jack57958615"    "XVBLbYJk6Knk6gC" "chestnattiioslo" "n38879746"      
## [125] "ghftlam"         "yongzi8599"      "JoshuaL89291212" "candy61518804"  
## [129] "Asahina27182321" "Zhangyf67903612" "AgwmRNHKOWD8hx3" "alovelycat234"  
## [133] "handsome0624"    "Egg06817374"     "SBC86148004"     "RZFo3LkDd2zkjE2"
## [137] "Hjs_Kirito"      "egg62087207"     "lxy880524"       "vandavinci2"    
## [141] "yyevayu"         "faker74727225"   "Muhamma25230295" "File014"        
## [145] "Hongkon49849006" "kdUWy3xVuHHkYNV" "fefayama"        "ChehAmber"      
## [149] "emmad64412036"   "william53384478" "13_2w"           "chehanhui"      
## [153] "U8tbUEMUkUo9byh" "bolinlai1"       "amy37236916"     "chingyuennn"    
## [157] "Beichen_Xiong"   "iYTJVbCPoNE8yk0" "jessica62613216" "chanyungpen"    
## [161] "gulu65349260"    "CarmanChan18"    "yufei0514"       "CatherineCY2"   
## [165] "DollyDu2"        "ruth81826833"    "chen_chuqiao"    "Orangant1"      
## [169] "lohasli"         "tonykai7"        "TyroneOOOOO"     "Acliec1"        
## [173] "YMLeung1"        "EY78868337"      "Rossellawang"    "LiaoZhenyi"     
## [177] "nan36247006"     "ann_10699"       "SARIPHUONG1"     "ZhangJamine"    
## [181] "Cristal56781"    "fkuhkpopo"       "Dai84922137"     "kyle92157825"   
## [185] "hoho95763942"    "clarenc99465820" "adrianlwy1"      "8964Freedom"    
## [189] "LIUYU0101"       "Ocean12865"      "Chenxiayan1"     "zhanlupany13"   
## [193] "liaoshirushen1"  "KenChan90747657" "linshuiyifeng"   "kkhong14"       
## [197] "Darry25529523"   "runtolearnlove"  "Celeste32079130" "zhangjiakuan1"  
## [201] "immdaone"        "mizukwii"        "ChanGamwai"      "jeb_alex"       
## [205] "samibenji1"      "Rebecca91728695" "ALPACIN80871083" "Lydia16551709"  
## [209] "Parry1229295143" "wakemybody"      "P3kIOHIB4FNAjUO" "Shiva56378847"  
## [213] "UGyC8RlO8pYQjIl" "Loveee56835683"  "aNMdOFuWdUHCVz2" "JU3KjLWP9HeuVQU"
## [217] "Hassanm56727211" "Shp4rBvXVuyAYOK" "DmpDrQhPsRefnRe" "Wutter8"        
## [221] "vanessatmk"      "yyg35176704"     "live_likein2004" "jj03967293"     
## [225] "WCleo1225"       "tanshaoqiang"    "nimph_peng"      "YukiLi79584282" 
## [229] "hu_hongxia"      "monica741789632" "EllaSong18"      "SagaAndersson13"
## [233] "GuanSuzee"       "Ko06095832"      "jaycc84801703"   "yuyaojie12138"  
## [237] "noel_liang"      "AndySui10"       "GaN99258952"