if (!require("rtweet")) install.packages("rtweet", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("plotly")) install.packages("plotly", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("stringr")) install.packages("stringr", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

require(plotly)  # load the required libraries
require(rtweet)  
require(stringr)

1. Tweet Mining

First, we run a hashtag search for “bitcoin” on Twitter, requesting up to 500 English-language tweets (retweets excluded). The pattern-matching functions are then demonstrated one at a time.

# Query Twitter for English-language #bitcoin tweets (retweets excluded, at most 500)
bitcoin <- search_tweets(q = "#bitcoin", n = 500, lang = "en", include_rts = FALSE)
# Keep only the tweet text for the pattern-matching examples
bitcoin.text <- bitcoin[["text"]]

### Which tweets contain a URL? (indices of the tweets whose text includes "https")
grep(pattern = "https", x = bitcoin.text)
##   [1]   1   2   3   4   5   6   8   9  10  11  12  13  14  15  16  17  18
##  [18]  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  38
##  [35]  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53  55  56
##  [52]  57  58  61  63  64  65  66  67  68  71  72  73  74  75  76  77  78
##  [69]  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
##  [86]  96  97  99 100 101 102 103 104 105 106 107 108 110 111 112 115 117
## [103] 122 123 124 125 126 127 128 129 130 131 132 133 135 136 137 138 139
## [120] 140 141 142 143 144 145 146 147 148 149 152 153 156 157 158 160 161
## [137] 162 163 164 165 166 167 169 170 172 174 176 177 179 180 181 182 183
## [154] 184 185 186 187 188 190 196 197 198 199 202 203 204 205 206 208 210
## [171] 218 219 224 225 227 228 230 234 235 236 238 239 242 243 244 245 248
## [188] 251 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 270
## [205] 271 272 274 275 277 278 279 280 281 283 284 285 286 288 289 290 292
## [222] 293 295 297 299 300 301 302 303 304 305 306 308 309 311 312 313 314
## [239] 315 316 317 318 320 321 322 323 324 325 330 332 333 334 335 337 338
## [256] 339 340 341 342 343 344 347 349 350 351 353 354 355 356 357 358 359
## [273] 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
## [290] 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
## [307] 395 397
### Which tweets contain a URL? (logical vector: TRUE where the text includes "https")
grepl(pattern = "https", x = bitcoin.text)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [23]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [34]  TRUE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [45]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [56]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
##  [67]  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [89]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [100]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [111]  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
## [122]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [144]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [155] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [166]  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE
## [177]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [188]  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
## [199]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [210]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [221] FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE
## [232] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [243]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE
## [254]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [265]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [276] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [287] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [298] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [309]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [320]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [331] FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [342]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE FALSE
## [353]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [364]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [375]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [386]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
## [397]  TRUE
### Display the first few matching tweets (value = TRUE returns the text, not the index)
head(grep(pattern = "https", x = bitcoin.text, value = TRUE))
## [1] "Blockchain and business have a bright future together <U+0001F929>\nIf you want to learn more about the realworld impact of blockchain tech and cryptocurrency, this is your chance. <U+0001F44F><U+0001F3FC>\nJoin us on Friday for a talk on blockchain!\n\n#bitcoin #blockchain  #coworking #cowork #cryptocurrency https://t.co/7UvITkkrDW"
## [2] "Text predictor for #AI #machinelearning \nhttps://t.co/sNMCp7lOgT\n#deeplearn #code #developer #python #bitcoin #btc #cryptocurrency #DigitalMineNetwork\no"                                                                                                                                                  
## [3] "The blog of our new project, here infos, and details about our #cryptocurrency related website\nhttps://t.co/j8PfbQ0nEU\n#bitcoin #blockchain #btc #DigitalMineNetwork o"                                                                                                                                     
## [4] "Here a simple #python program that allows to integrate cryptocurrencies payments in websites \n#bitcoin #btc #cryptocurrency #programming #DigitalMineNetwork\nhttps://t.co/tWlqriqaiE o"                                                                                                                     
## [5] "My main website the home of my network\nhttps://t.co/S22LmLuAZ4\n#bitcoin #btc #cryptocurrency #blockchain #AI #DigitalMineNetwork o"                                                                                                                                                                         
## [6] "G-point system #eGoldMining  is there to incentivize the long term token holder the most.with  payout structures,because of the fact that each user trades a lot and saves on an exchange\njoin: https://t.co/NsTlcCbaWt \n#eGold #EGM #ICO #Crowdsale #Bitcoin #Blockchain #ETH #Ethereum"
### Which tweets do NOT contain a URL? (invert = TRUE returns the non-matching indices)
grep(pattern = "https", x = bitcoin.text, invert = TRUE)
##  [1]   7  35  36  37  54  59  60  62  69  70  98 109 113 114 116 118 119
## [18] 120 121 134 150 151 154 155 159 168 171 173 175 178 189 191 192 193
## [35] 194 195 200 201 207 209 211 212 213 214 215 216 217 220 221 222 223
## [52] 226 229 231 232 233 237 240 241 246 247 249 250 252 268 269 273 276
## [69] 282 287 291 294 296 298 307 310 319 326 327 328 329 331 336 345 346
## [86] 348 352 394 396
### Logical or/and 
# Indices of the tweets mentioning "blockchain", ignoring letter case
grep(pattern = "blockchain", x = bitcoin.text, ignore.case = TRUE)
##   [1]   1   3   5   6   7  11  12  13  18  26  28  29  30  31  32  33  35
##  [18]  36  37  40  41  46  47  48  49  51  52  53  56  59  63  65  66  71
##  [35]  79  80  81  82  84  89  90  91  92  93  97  99 113 114 115 116 117
##  [52] 118 119 120 121 126 127 128 129 130 131 133 136 139 142 147 152 153
##  [69] 157 160 165 166 169 170 187 189 196 202 203 230 233 239 241 246 247
##  [86] 248 261 264 271 272 278 281 282 283 292 295 300 302 303 312 315 316
## [103] 318 321 322 326 331 333 337 341 344 345 346 348 349 352 389 391 392
## [120] 396 397
# Indices of the tweets mentioning "cryptocurrency", ignoring letter case
grep(pattern = "cryptocurrency", x = bitcoin.text, ignore.case = TRUE)
##   [1]   1   2   3   4   5  11  12  16  23  27  28  43  48  49  50  52  56
##  [18]  58  65  66  72  73  74  75  77  79  80  81  82  83  84  85  89  96
##  [35]  97 103 104 105 110 114 124 126 127 133 148 149 153 164 166 176 179
##  [52] 181 183 184 191 192 196 202 203 204 205 206 209 213 214 223 231 232
##  [69] 233 238 246 247 248 254 255 261 263 264 271 272 274 277 279 281 287
##  [86] 290 295 299 303 308 315 318 326 334 341 345 346 350 351 353 354 355
## [103] 356 386 389 390 397
### Mentioning blockchain OR cryptocurrency ("|" is regex alternation)
grepl(pattern = "blockchain|cryptocurrency", x = bitcoin.text, ignore.case = TRUE)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE
##  [12]  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [23]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [34] FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
##  [45] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
##  [56]  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE
##  [67] FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
##  [78] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
##  [89]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE
## [100] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [111] FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [122] FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [133]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [144] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [155] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE
## [166]  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
## [177] FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE
## [188] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
## [199] FALSE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
## [210] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
## [232]  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
## [243] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [254]  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE  TRUE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
## [276] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE
## [287]  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE
## [298] FALSE  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE  TRUE
## [309] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE
## [320] FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [331]  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
## [342] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [353]  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386]  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE  TRUE
## [397]  TRUE
# Mentioning blockchain AND cryptocurrency: combine two logical masks elementwise.
# local() keeps the intermediate masks out of the global workspace.
local({
  has_blockchain <- grepl("blockchain", bitcoin.text, ignore.case = TRUE)
  has_cryptocurrency <- grepl("cryptocurrency", bitcoin.text, ignore.case = TRUE)
  has_blockchain & has_cryptocurrency
})
##   [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE
##  [12]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
##  [56]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78] FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [89]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [133]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [199] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [232] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [243] FALSE FALSE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [298] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [320] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [342] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [353] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397]  TRUE
### Use grep/grepl as an index vector
# Screen names of the accounts whose tweet mentions both terms.
# local() keeps the intermediate masks out of the global workspace.
local({
  has_blockchain <- grepl("blockchain", bitcoin.text, ignore.case = TRUE)
  has_cryptocurrency <- grepl("cryptocurrency", bitcoin.text, ignore.case = TRUE)
  bitcoin$screen_name[has_blockchain & has_cryptocurrency]
})
##  [1] "iCampusMalaga"   "digital_mine_"   "digital_mine_"  
##  [4] "ActivosDig"      "ActivosDig"      "Espnlover365"   
##  [7] "coinspectator"   "coinspectator"   "coinspectator"  
## [10] "WorknbDAO"       "kind_kris"       "ridecaribbean"  
## [13] "CoinCapsAi"      "CoinCapsAi"      "CoinCapsAi"     
## [16] "CoinCapsAi"      "CoinCapsAi"      "TheBlockchain"  
## [19] "LTCTheRightCoin" "sarahjayanti982" "makesideproject"
## [22] "Lettley81"       "Bitcoinikdotcom" "grabalaji"      
## [25] "CryptoBlockCon"  "EricTippetts"    "GetMIO2"        
## [28] "ICObello_com"    "nataliatycennia" "sryyufriana09"  
## [31] "sryyufriana09"   "sryyufriana09"   "ryrartist"      
## [34] "DanNorberts"     "Cryptowarrior88" "Cryptowarrior88"
## [37] "bc_bitcoin"      "BelieversCrypto" "ErnestOnaiwu"   
## [40] "Trade24F"        "Smurf182"        "coinpricenow"   
## [43] "robelahommed"    "morcano1"        "morcano1"       
## [46] "Minddeft"        "Cryptocabulary"
###
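# Regular expression cheat sheet -
# .  - any single character
# +  - quantifier: one or more of the preceding item
# *  - quantifier: zero or more of the preceding item
# ?  - zero or one of the preceding item; placed after a quantifier (e.g. +?) it makes that quantifier non-greedy
# $  - end of the string
# [] - any one of the characters within the brackets
# |  - or (alternation)
#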

### Extract the URLs
# A single unanchored pattern is enough: it already matches a URL at the very
# end of a tweet, so a second "$"-anchored alternative can never add a match.
tweet_url <- unlist(str_extract_all(bitcoin.text, "https://[0-9a-zA-Z./]+"))
head(tweet_url)
## [1] "https://t.co/7UvITkkrDW" "https://t.co/sNMCp7lOgT"
## [3] "https://t.co/j8PfbQ0nEU" "https://t.co/tWlqriqaiE"
## [5] "https://t.co/S22LmLuAZ4" "https://t.co/NsTlcCbaWt"
### Top 5 Most Mentioned @
# Pull every @handle out of the tweets and flatten the per-tweet lists
tweet_mention <- unlist(str_extract_all(bitcoin.text, "@[0-9a-zA-Z_]+"))
# head() rather than [1:5]: with fewer than five distinct handles, [1:5] would
# pad the result with NA entries. The variable name is kept (although it holds
# five entries) because the plotting code reads `top10mention`.
top10mention <- head(sort(table(tweet_mention), decreasing = TRUE), 5)
top10mention
## tweet_mention
## @realDonaldTrump        @coindesk      @APompliano        @bitstamp 
##                4                3                2                2 
##      @bomblytics 
##                2
## Plot: bar chart of the most-mentioned accounts
# Labels say "Top 5" to agree with the five entries held in `top10mention`.
p <- plot_ly(
  x = names(top10mention),
  y = top10mention,
  name = "Top 5 Mention",
  type = "bar"
)
layout(
  p,
  title = "Top 5 Mention (bitcoin)",
  xaxis = list(title = "@Name"),
  yaxis = list(title = "Frequency")
)
### Top 5 Most Popular Hashtag #
# Pull every #hashtag out of the tweets and flatten the per-tweet lists
tweet_hashtag <- unlist(str_extract_all(bitcoin.text, "#[0-9a-zA-Z_]+"))
# NOTE(review): counting is case-sensitive, so "#bitcoin" and "#Bitcoin" are
# tallied separately - confirm whether that is intended.
# head() rather than [1:5]: with fewer than five distinct hashtags, [1:5] would
# pad the result with NA entries. The variable name is kept (although it holds
# five entries) because the plotting code reads `top10hashtag`.
top10hashtag <- head(sort(table(tweet_hashtag), decreasing = TRUE), 5)
top10hashtag
## tweet_hashtag
##        #bitcoin        #Bitcoin         #crypto     #blockchain 
##             255             141              89              81 
## #cryptocurrency 
##              79
## Plot: bar chart of the most popular hashtags
# Labels say "Top 5" to agree with the five entries held in `top10hashtag`.
p <- plot_ly(
  x = names(top10hashtag),
  y = top10hashtag,
  name = "Top 5 Hashtag",
  type = "bar"
)
layout(
  p,
  title = "Top 5 Hashtag (bitcoin)",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "Frequency")
)
### Remove all URLs, hashtags, mentions, the 'RT' marker and punctuation
# URLs, hashtags and mentions go first, while their marker characters
# (":", "/", "#", "@") are still present; punctuation is stripped afterwards.
bitcoin.text <- gsub("https://[0-9a-zA-Z./]+", "", bitcoin.text)
bitcoin.text <- gsub("#[0-9a-zA-Z_]+", "", bitcoin.text)
bitcoin.text <- gsub("@[0-9a-zA-Z_]+", "", bitcoin.text)
# Word boundaries so that only a standalone "RT" is removed; a bare 'RT'
# pattern would also cut the letters out of words such as "SMART".
bitcoin.text <- gsub("\\bRT\\b", "", bitcoin.text)
bitcoin.text <- gsub("[[:punct:]]", "", bitcoin.text)
### Remove leading and trailing whitespace
# trimws() strips spaces, tabs and newlines; a space-only pattern would leave
# the leading/trailing "\n" characters in place.
bitcoin.text <- trimws(bitcoin.text)
head(bitcoin.text)
## [1] "Blockchain and business have a bright future together <U+0001F929>\nIf you want to learn more about the realworld impact of blockchain tech and cryptocurrency this is your chance <U+0001F44F><U+0001F3FC>\nJoin us on Friday for a talk on blockchain\n\n"
## [2] "Text predictor for   \n\n       \no"                                                                                                                                                                                       
## [3] "The blog of our new project here infos and details about our  related website\n\n    o"                                                                                                                                    
## [4] "Here a simple  program that allows to integrate cryptocurrencies payments in websites \n    \n o"                                                                                                                          
## [5] "My main website the home of my network\n\n      o"                                                                                                                                                                         
## [6] "Gpoint system   is there to incentivize the long term token holder the mostwith  payout structuresbecause of the fact that each user trades a lot and saves on an exchange\njoin  \n"
### nchar("simple text sample")  ### Number of characters
### strsplit("simple text sample",' ')  ### Splitting a string
### substr("simple text sample",1,3) ### Extracting a substring from a string
### paste("sample","text","sample", sep=' ')  ### Concatenating strings
### tolower("SimPle TeXt sAmpLe")  ### Converting letters to lower or upper-case
### toupper("SimPle TeXt sAmpLe")  ### Converting letters to lower or upper-case

2. Web scraping

if (!require("httr")) install.packages("httr", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

require(httr) # load the required library

In the second project, we use pattern-matching functions to build a web scraper that collects the press release headlines from the Government Information Service website.

### Read today's Hong Kong GIS press-release index
today_gis <- GET("https://www.info.gov.hk/gia/general/today.htm")
# Surface HTTP failures: an error page would otherwise just yield zero titles
warn_for_status(today_gis)
today_gis <- content(today_gis, as = "text")
# The pattern assumes each headline appears as ...htm">TITLE</a></li>.
# The dot is escaped so it matches a literal "." only, and ".+?" is non-greedy
# so that several items on one line are extracted separately.
all_news_title <- str_extract_all(today_gis, '\\.htm">.+?</a></li>')[[1]]
all_news_title <- gsub('\\.htm">', "", all_news_title)             ### Removing heading
all_news_title <- gsub("</a></li>", "", all_news_title, fixed = TRUE)  ### Removing trailing
all_news_title <- gsub("<span>|</span>", "", all_news_title)      ### Removing span
head(all_news_title)
## character(0)
#
# Extract the headline titles from a GIS press-release index page
#
# Args:
#   url: address of the page to download.
# Returns:
#   A character vector with one element per matched headline; character(0)
#   when nothing matches or when the server answers with an HTTP error status
#   (in which case a warning is raised).
#
title_extraction <- function(url) {
  response <- GET(url)
  # Do not parse an error page as if it were an index page
  if (status_code(response) >= 400) {
    warning("Request for ", url, " failed with HTTP status ",
            status_code(response), call. = FALSE)
    return(character(0))
  }
  page <- content(response, as = "text")
  # The pattern assumes each headline appears as ...htm">TITLE</a></li>.
  # ".+?" is non-greedy so that several items on one line are extracted
  # separately; the dot before "htm" is escaped to match a literal "." only.
  items <- str_extract_all(page, '\\.htm">.+?</a></li>')[[1]]
  # Strip the surrounding link markup and any <span> tags, leaving the title
  gsub('\\.htm">|</a></li>|</?span>', "", items)
}

# Every calendar day from 1 January 2019 to 21 February 2019
d_seq <- seq(from = as.Date("2019-01-01"), to = as.Date("2019-02-21"), by = "day")

# Render each date as "YYYYMM/DD", the fragment inserted into the page URL
date_seq <- format(d_seq, format = "%Y%m/%d")

#
# Headline containing CE/FS/CS/LegCo?
#
# For every day in `date_seq`, download that day's index page and count the
# headlines mentioning each term as a standalone, space-delimited word.
# The per-day rows are built first and bound once at the end, instead of
# growing the data frame with rbind() on every iteration.
headline_dataset <- do.call(
  rbind,
  c(
    # Seed with an empty data frame so an empty `date_seq` still yields one
    list(data.frame()),
    lapply(date_seq, function(day) {
      url <- paste0("https://www.info.gov.hk/gia/general/", day, ".htm")
      day_nt <- title_extraction(url)
      # Term at the start, in the middle, or at the end of a headline
      count_mentions <- function(term) {
        sum(grepl(sprintf("^%s | %s | %s$", term, term, term), day_nt))
      }
      row <- data.frame(
        day = day,
        ce = count_mentions("CE"),
        fs = count_mentions("FS"),
        cs = count_mentions("CS"),
        legco = count_mentions("LegCo")
      )
      Sys.sleep(2)  # pause between requests to avoid hammering the server
      row
    })
  )
)

Finally, the daily number of headlines mentioning CE, CS, FS, and LegCo is plotted as a line chart.

if (!require("plotly")) install.packages("plotly", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

require(plotly)

# Line chart with one trace per term, all sharing the day axis
p <- plot_ly(
  x = headline_dataset[["day"]],
  y = headline_dataset[["ce"]],
  name = "CE",
  type = "scatter",
  mode = "lines"
)
p <- add_trace(p, y = headline_dataset[["cs"]], name = "CS")
p <- add_trace(p, y = headline_dataset[["fs"]], name = "FS")
p <- add_trace(p, y = headline_dataset[["legco"]], name = "LegCo")
# Printing the layout() result renders the figure with its title and axis labels
layout(
  p,
  title = "GIS Headline Mention of CE/CS/FS/LegCo",
  xaxis = list(title = "Day"),
  yaxis = list(title = "Number of hits")
)