require(rtweet)  # load the required libraries
require(plotly)
require(wordcloud)
require(tm)
# Load the three saved datasets used throughout the rest of the script
jmschku_handle <- readRDS(file = "jmschku_handle.rds")
jmschku_folls_data <- readRDS(file = "jmschku_folls_data.rds")
jmschku_timeline <- readRDS(file = "jmschku_timeline.rds")
# Display name of the account
jmschku_handle[["name"]]
## [1] "JMSC"
# How many followers the account has
jmschku_handle[["followers_count"]]
## [1] 5748
# Profile description written by the account owner
jmschku_handle[["description"]]
## [1] "Founded in 1999, the Journalism and Media Studies Centre of The University of Hong Kong offers professional journalism education at Asia’s premier university."
# Self-reported location
jmschku_handle[["location"]]
## [1] "Hong Kong"
# Total number of posts made by the account
jmschku_handle[["statuses_count"]]
## [1] 6724
# Timestamp at which the account was created
jmschku_handle[["account_created_at"]]
## [1] "2009-11-12 10:02:33 UTC"
# Age of the account in days, measured up to the moment this line runs
duration <- difftime(
  Sys.time(),
  jmschku_handle[["account_created_at"]],
  units = "days"
)
duration
## Time difference of 3794.79 days
# Average number of posts per day over the whole life of the account
jmschku_handle[["statuses_count"]] / as.numeric(duration)
## [1] 1.771903
# Peek at the first few follower screen names
head(jmschku_folls_data[["screen_name"]])
## [1] "HenryCheung_7"  "andre__gabriel" "Chen8975"       "wwm1029"       
## [5] "RyanYue18"      "VivekjiSharma"
# ...and at the matching self-reported locations
head(jmschku_folls_data[["location"]])
## [1] ""               "Linz, Austria"  "Salt Lake City" ""              
## [5] "Hong Kong"      "India"
# Storage type of the location column
class(jmschku_folls_data[["location"]])
## [1] "character"
# The 20 most frequent location strings among the followers
sort(table(jmschku_folls_data[["location"]]), decreasing = TRUE)[seq_len(20)]
## 
##                                             Hong Kong 
##                       2074                        820 
##                    Beijing People's Republic of China 
##                         50                         40 
##            London, England                   Shanghai 
##                         36                         34 
##           <U+9999><U+6E2F>               New York, NY 
##                         27                         25 
##              United States             Washington, DC 
##                         24                         24 
##                     London                  Singapore 
##                         23                         22 
## Central & Western District                  Guangdong 
##                         21                         21 
##                      China           New Delhi, India 
##                         19                         15 
##                         HK            Los Angeles, CA 
##                         13                         13 
##            California, USA                 Hong Kong  
##                         12                         12
# Top 20 follower locations, ignoring followers who left the field blank.
# Locations are trimmed first so that variants differing only in surrounding
# whitespace (e.g. "Hong Kong " and "Hong Kong") are counted together, and
# blanks are dropped *before* ranking so the result is not one entry short.
jfd_loc_clean <- trimws(jmschku_folls_data$location)
jfd_loc_clean <- jfd_loc_clean[!is.na(jfd_loc_clean) & jfd_loc_clean != ""]
jfd_loc_counts <- sort(table(jfd_loc_clean), decreasing = TRUE)
# Cap at the number of distinct locations: indexing 1:20 on a shorter table
# would pad the result with NA entries.
jfd_loc_top20 <- jfd_loc_counts[seq_len(min(20, length(jfd_loc_counts)))]

# Storage type of the followers' own follower counts
class(jmschku_folls_data[["followers_count"]])
## [1] "integer"
# Distribution summary of those counts
summary(jmschku_folls_data[["followers_count"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       5      55    4004     432 2893336
# The ten followers who themselves have the most followers
jmschku_folls_data[
  order(jmschku_folls_data[["followers_count"]], decreasing = TRUE)[seq_len(10)],
  c("screen_name", "followers_count")
]
## # A tibble: 10 x 2
##    screen_name   followers_count
##    <chr>                   <int>
##  1 verified              2893336
##  2 Lara                  1755024
##  3 AlaattinCAGIL         1554834
##  4 soledadobrien         1179506
##  5 _yavuzatalay           855688
##  6 TayeDiggs              643144
##  7 DaveVescio             547762
##  8 zlj517                 544359
##  9 BraveLad               524660
## 10 joshuawongcf           519713
# Bin the followers by how many followers they have themselves.
# - include.lowest = TRUE closes the first interval at 0; without it accounts
#   with exactly 0 followers fall outside (0, 10], become NA and silently
#   disappear from the table and the pie chart.
# - unique() avoids a "'breaks' are not unique" error when the maximum happens
#   to equal one of the fixed break points.
# - dig.lab = 10 prints the interval labels as plain integers rather than in
#   scientific notation.
jmschku_followers_grp <- cut(
  jmschku_folls_data$followers_count,
  breaks = unique(c(
    0, 10, 100, 500, 1000, 2500,
    max(jmschku_folls_data$followers_count, na.rm = TRUE)
  )),
  include.lowest = TRUE,
  dig.lab = 10
)
jmschku_followers_grp_table <- table(jmschku_followers_grp)

# Pie chart of the group sizes; sort = FALSE keeps the slices in bin order
p <- plot_ly(
  labels = names(jmschku_followers_grp_table),
  values = jmschku_followers_grp_table,
  type = "pie",
  sort = FALSE,
  textposition = "inside",
  textinfo = "label+percent"
)
# Hide the axis decorations
p <- layout(
  p,
  title = "JMSC's Followers",
  xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
  yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
)
p
# Follower counts per value of the lang column, most common first
sort(table(jmschku_folls_data[["lang"]]), decreasing = TRUE)
## 
##   en   zh  und   ja   es   fr   de   in   ko   ar   pt   ne   tl   it   hi   ru 
## 2668  397  328  126   75   53   30   30   21   20   17   16   16   14   12   12 
##   ur   tr   da   et   nl   th   pl   sv   ca   fi   bn   ht   no   ro   cs   vi 
##   11   10    8    7    6    6    5    5    4    4    3    3    3    3    2    2 
##   am   cy   el   eu   fa   hu   km   kn   lv   ps   sl   sr 
##    1    1    1    1    1    1    1    1    1    1    1    1
#' Clean raw text and wrap it in a tm corpus
#'
#' Lower-cases the text, strips URLs, hashtags and @-mentions, removes English
#' stop words and finally removes punctuation.
#'
#' @param doc Character vector; each element becomes one document.
#' @return A `VCorpus` holding the cleaned documents.
Preprocessing <- function(doc) {
  # Build a tm transformer that deletes every match of `pattern`
  strip_pattern <- function(pattern) {
    force(pattern)
    content_transformer(function(x) gsub(pattern, "", x))
  }

  corpus <- VCorpus(VectorSource(doc))
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Match http as well as https, and consume everything up to the next
  # whitespace so URLs containing "-", "?", "=", "_" or "%" leave no fragment
  # behind to be glued into a junk token by the punctuation step.
  corpus <- tm_map(corpus, strip_pattern("https?://[^[:space:]]+"))
  corpus <- tm_map(corpus, strip_pattern("#[0-9a-zA-Z_]+"))
  corpus <- tm_map(corpus, strip_pattern("@[0-9a-zA-Z_]+"))
  # Stop words are removed while apostrophes are still present: the English
  # stop-word list contains contractions such as "don't", which would no
  # longer match once punctuation removal has turned them into "dont".
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removePunctuation)
  corpus
}

# Merge every follower description into a single long string
jfd_des <- paste(jmschku_folls_data[["description"]], collapse = " ")
# Clean the text and turn it into a corpus
jfd_des_tm <- Preprocessing(jfd_des)
# Word cloud of terms occurring at least 30 times, drawn in decreasing
# frequency order rather than at random
wordcloud(
  jfd_des_tm,
  scale = c(4, .5),
  min.freq = 30,
  max.words = Inf,
  random.order = FALSE,
  colors = brewer.pal(8, "Accent")
)

# Average posts per day for each follower: total posts divided by the account
# age in days, measured up to the moment this line runs
jmschku_folls_data$avgppd <- with(
  jmschku_folls_data,
  statuses_count /
    as.numeric(difftime(Sys.time(), account_created_at, units = "days"))
)

# Scatter of audience size against posting rate; hover text is the screen name
p <- plot_ly(
  data = jmschku_folls_data,
  x = ~followers_count,
  y = ~avgppd,
  text = ~screen_name,
  type = "scatter",
  mode = "markers"
)

# Output in Linear scale
layout(
  p,
  title = "Followers count vs. Average activities per day @JMSCHKU (in linear scale)",
  xaxis = list(title = "Followers Count"),
  yaxis = list(title = "Average posts per day")
)
p <- plot_ly(
  data = jmschku_folls_data,
  x = ~followers_count,
  y = ~avgppd,
  text = ~screen_name,
  type = "scatter",
  mode = "markers"
)

# Output in log scale
layout(
  p,
  title = "Followers count vs. Average activities per day @JMSCHKU (both xy in log scale)",
  xaxis = list(title = "Followers Count", type = "log"),
  yaxis = list(title = "Average posts per day", type = "log")
)
# Share of the timeline that consists of retweets
sum(jmschku_timeline[["is_retweet"]]) / nrow(jmschku_timeline)
## [1] 0.6466165
# Timeline entry with the highest retweet count (retweets included)
jmschku_timeline[
  which.max(jmschku_timeline[["retweet_count"]]),
  c("created_at", "text", "retweet_count", "is_retweet")
]
## # A tibble: 1 x 4
##   created_at          text                              retweet_count is_retweet
##   <dttm>              <chr>                                     <int> <lgl>     
## 1 2017-03-07 03:20:47 Camera shutter speed synced to h~         11368 TRUE
# Keep only the entries that are not retweets
jmschku_timeline_ori <- jmschku_timeline[!jmschku_timeline[["is_retweet"]], ]
# Non-retweet entry with the highest retweet count
jmschku_timeline_ori[
  which.max(jmschku_timeline_ori[["retweet_count"]]),
  c("created_at", "text", "retweet_count", "is_retweet")
]
## # A tibble: 1 x 4
##   created_at          text                              retweet_count is_retweet
##   <dttm>              <chr>                                     <int> <lgl>     
## 1 2020-03-28 14:50:07 Our Bachelor of Journalism alumn~           133 FALSE
# Five screen names the account replied to most often
sort(
  table(jmschku_timeline[["reply_to_screen_name"]]),
  decreasing = TRUE
)[seq_len(5)]
## 
##        JMSCHKU  keithrichburg       MasatoKJ   HKUniversity annekrugernews 
##             91             15             11              4              3
# Five screen names mentioned most often (mentions are flattened first)
sort(
  table(unlist(jmschku_timeline[["mentions_screen_name"]])),
  decreasing = TRUE
)[seq_len(5)]
## 
##       JMSCHKU keithrichburg         fcchk  HKUniversity    HongKongFP 
##           477           257           135           121            96
## Time trend
# Number of timeline entries per calendar month
jmschku_tt <- table(format(jmschku_timeline$created_at, "%Y-%m"))
# Original posts per month, tabulated over the same months as the full
# timeline. Months without any original post are kept with a count of 0, so
# both series have exactly one value per x position; a plain table() would
# omit those months and shift the second line against the x axis.
jmschku_tt_ori <- table(factor(
  format(jmschku_timeline_ori$created_at, "%Y-%m"),
  levels = names(jmschku_tt)
))
p <- plot_ly(
  x = names(jmschku_tt),
  y = as.vector(jmschku_tt),
  name = "Tweets + Retweets",
  type = "scatter",
  mode = "lines"
)
# x is passed explicitly so the second trace never depends on inheriting the
# first trace's x values
p <- add_trace(
  p,
  x = names(jmschku_tt_ori),
  y = as.vector(jmschku_tt_ori),
  name = "Tweets only"
)
layout(
  p,
  title = "@JMSCHKU's Post History (latest 3200 posts)",
  xaxis = list(title = "Year-Month"),
  yaxis = list(title = "Number of Posts")
)
# The following indicators provide the strongest signals to separate bots from humans:
# 1. Absence of a user description (i.e. the public Twitter profile still looks like the default one rather than being customized);
# 2. Absence of geographical metadata (humans often use smartphones and the Twitter iPhone/Android app, which records the physical location of the mobile device as a digital footprint);
# 3. Low number of followers, say 10 or fewer;
# 4. Recently created account, say less than a year old; and,
# 5. Non-verified user.
#
# x is the data.frame returned from rtweet::lookup_users

#' Drop follower accounts that look like bots
#'
#' An account is kept when it shows at least one human-like signal: more than
#' `followers_cutoff` followers, a non-blank location, a non-blank description,
#' a creation date earlier than `date_cutoff`, or a verified flag. Accounts
#' showing none of these signals are dropped. A missing value (NA) in a field
#' counts as "no signal" for that field.
#'
#' @param x Data frame with the columns `followers_count`, `location`,
#'   `description`, `account_created_at` and `verified`.
#' @param followers_cutoff Accounts need strictly more followers than this to
#'   pass on follower count alone.
#' @param date_cutoff Value accepted by `as.Date()`; accounts created before
#'   this date pass on account age alone.
#' @return The rows of `x` that were kept, with all columns.
remove_bot <- function(x, followers_cutoff, date_cutoff) {
  # TRUE where a text field is missing or holds nothing but spaces
  is_blank <- function(txt) {
    stripped <- gsub('^[ ]*|[ ]*$', '', txt)
    is.na(stripped) | nchar(stripped) == 0
  }

  looks_human <- x$followers_count > followers_cutoff |
    !is_blank(x$location) |
    !is_blank(x$description) |
    as.Date(x$account_created_at) < as.Date(date_cutoff) |
    x$verified
  # An NA left here means no signal was positive and at least one was missing.
  # Indexing rows with NA would insert all-NA rows, so treat it as "not human".
  looks_human[is.na(looks_human)] <- FALSE

  # drop = FALSE keeps the result a data frame whatever the column count
  x[looks_human, , drop = FALSE]
}

# Filter out suspected bots, using a follower cutoff of 10 and an account
# creation-date cutoff of 2019-04-03
jmschku_folls_data_nobot <- remove_bot(
  x = jmschku_folls_data,
  followers_cutoff = 10,
  date_cutoff = "2019-04-03"
)

# Number of followers before the filter...
nrow(jmschku_folls_data)
## [1] 5000
# ...and after it
nrow(jmschku_folls_data_nobot)
## [1] 4839
# Percentage of followers flagged as bots (@JMSCHKU)
round(
  100 * (nrow(jmschku_folls_data) - nrow(jmschku_folls_data_nobot)) /
    nrow(jmschku_folls_data),
  digits = 2
)
## [1] 3.22
# Screen names of the followers that were filtered out
jmschku_folls_data[["screen_name"]][
  !(jmschku_folls_data[["screen_name"]] %in% jmschku_folls_data_nobot[["screen_name"]])
]
##   [1] "LinusChen7"      "gyhklllve"       "wangzi84433595"  "KennethLiu"     
##   [5] "lumosstar_"      "4MjZ1J5nsFw3RLq" "Kyosekika"       "ocVoD1Mxje4O70m"
##   [9] "Barbara31437208" "JamesLo76136894" "Novice99502312"  "ChanYiug"       
##  [13] "CandyCh51213290" "Jack94219819"    "harrypotter9678" "bluefloyd4"     
##  [17] "missliutong"     "achilles1019"    "Bon9421"         "v3JB0yWnziruYMf"
##  [21] "Cheinhui1"       "yangyang618713"  "aC8x5k87GLUBJcV" "Jerome_Ou"      
##  [25] "HExcitg"         "iansui1"         "joechen34032604" "wangRicky3"     
##  [29] "wosirn"          "k18023081959"    "onlyJanuaryaaa"  "sihuichen2"     
##  [33] "Brandon26491947" "W0OG6OO2Z76jFCo" "Momo55078240"    "FEFDYI91ywqubB1"
##  [37] "Megan52986522"   "wangharry5"      "jack57958615"    "XVBLbYJk6Knk6gC"
##  [41] "chestnattiioslo" "n38879746"       "ghftlam"         "yongzi8599"     
##  [45] "JoshuaL89291212" "candy61518804"   "Asahina27182321" "Zhangyf67903612"
##  [49] "AgwmRNHKOWD8hx3" "alovelycat234"   "handsome0624"    "Egg06817374"    
##  [53] "SBC86148004"     "RZFo3LkDd2zkjE2" "Hjs_Kirito"      "egg62087207"    
##  [57] "lxy880524"       "yyevayu"         "faker74727225"   "Muhamma25230295"
##  [61] "File014"         "Hongkon49849006" "FarhanH63398493" "cerliem"        
##  [65] "kdUWy3xVuHHkYNV" "Praveen85188042" "fefayama"        "ChehAmber"      
##  [69] "emmad64412036"   "william53384478" "13_2w"           "chehanhui"      
##  [73] "U8tbUEMUkUo9byh" "bolinlai1"       "amy37236916"     "chingyuennn"    
##  [77] "Beichen_Xiong"   "iYTJVbCPoNE8yk0" "jessica62613216" "chanyungpen"    
##  [81] "gulu65349260"    "CarmanChan18"    "yufei0514"       "CatherineCY2"   
##  [85] "DollyDu2"        "ruth81826833"    "chen_chuqiao"    "Orangant1"      
##  [89] "lohasli"         "tonykai7"        "TyroneOOOOO"     "Acliec1"        
##  [93] "YMLeung1"        "EY78868337"      "Rossellawang"    "LiaoZhenyi"     
##  [97] "nan36247006"     "ann_10699"       "Cristal56781"    "fkuhkpopo"      
## [101] "Dai84922137"     "kyle92157825"    "hoho95763942"    "clarenc99465820"
## [105] "adrianlwy1"      "8964Freedom"     "LIUYU0101"       "MinGW80854183"  
## [109] "Chenxiayan1"     "zhanlupany13"    "liaoshirushen1"  "KenChan90747657"
## [113] "linshuiyifeng"   "kkhong14"        "Darry25529523"   "ChanKalim"      
## [117] "runtolearnlove"  "Celeste32079130" "zhangjiakuan1"   "immdaone"       
## [121] "mizukwii"        "ChanGamwai"      "jeb_alex"        "samibenji1"     
## [125] "Rebecca91728695" "ALPACIN80871083" "Lydia16551709"   "Parry1229295143"
## [129] "wakemybody"      "VXDHlmqI5ZVOI6m" "P3kIOHIB4FNAjUO" "falinjizhen"    
## [133] "Shiva56378847"   "UGyC8RlO8pYQjIl" "Loveee56835683"  "aNMdOFuWdUHCVz2"
## [137] "JU3KjLWP9HeuVQU" "Hassanm56727211" "anisha_kukreja"  "Shp4rBvXVuyAYOK"
## [141] "DmpDrQhPsRefnRe" "Wutter8"         "vanessa10034"    "yyg35176704"    
## [145] "live_likein2004" "jj03967293"      "WCleo1225"       "tanshaoqiang"   
## [149] "nimph_peng"      "YukiLi79584282"  "hu_hongxia"      "monica741789632"
## [153] "EllaSong18"      "SagaAndersson13" "GuanSuzee"       "Ko06095832"     
## [157] "jaycc84801703"   "yuyaojie12138"   "noel_liang"      "AndySui10"      
## [161] "GaN99258952"