if (!require("rtweet")) install.packages("rtweet", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("plotly")) install.packages("plotly", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
if (!require("stringr")) install.packages("stringr", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

library(plotly)  # load the required libraries
library(rtweet)  
library(stringr)

1. Tweet Mining

First, we run a Twitter hashtag search for “bitcoin”, requesting 500 English-language tweets (retweets excluded). The pattern-matching functions are then applied one at a time as examples.

# Query Twitter for the #bitcoin hashtag: 500 tweets requested, English only,
# retweets left out
bitcoin <- search_tweets(
  "#bitcoin",
  n = 500,
  include_rts = FALSE,
  lang = "en"
)
# Keep just the tweet bodies; every pattern-matching example below works on this
bitcoin.text <- bitcoin[["text"]]

### Which tweets contain a URL? (grep returns the positions of the matches)
grep(pattern = "https", x = bitcoin.text)
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  32  34  35  36
##  [35]  37  38  39  40  41  42  43  44  45  46  47  49  51  52  53  54  55
##  [52]  56  57  58  59  60  61  62  63  64  65  67  68  69  70  71  72  73
##  [69]  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89  90
##  [86]  91  92  93  95  96  97  98  99 100 101 102 103 104 105 107 109 110
## [103] 113 114 115 117 119 120 121 122 123 124 125 126 127 128 129 130 131
## [120] 132 133 135 136 137 138 140 141 142 143 145 146 147 149 150 152 154
## [137] 155 157 158 160 161 162 163 165 166 167 168 169 170 171 172 174 175
## [154] 177 179 180 181 182 183 184 185 186 187 188 190 191 192 193 194 195
## [171] 196 197 198 199 200 201 202 203 204 205 206 209 210 211 213 214 215
## [188] 216 217 219 220 221 222 223 226 227 228 230 231 232 233 235 237 238
## [205] 239 240 241 243 244 245 246 247 248 249 250 251 252 255 256 257 258
## [222] 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 277
## [239] 278 279 280 281 282 283 284 287 288 289 290 291 292 293 294 295 297
## [256] 298 299 300 303 304 305 306 307 308 310 311 312 313 316 317 318 319
## [273] 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 336 337
## [290] 339 340 341 342 343 344 345 346 347 348 350
### Which tweets contain a URL? (grepl returns one TRUE/FALSE per tweet)
grepl(pattern = "https", x = bitcoin.text)
##   [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [12]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [23]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
##  [34]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [45]  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
##  [67]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [78]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
##  [89]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE
## [100]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
## [111] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [122]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE
## [144] FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [155]  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [166]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE
## [177]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [188]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [199]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
## [210]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [221]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE
## [232]  TRUE  TRUE FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [243]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [254] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE
## [276] FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE
## [287]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE
## [298]  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [309] FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE
## [320]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [331]  TRUE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
## [342]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE
### Display the matched tweets themselves (value = TRUE), first few only
head(grep(pattern = "https", x = bitcoin.text, value = TRUE))
## [1] "Just signed up for WCX, the global digital currency exchange. Sign up &amp; earn 50 WCX tokens. @wcxofficial #bitcoin https://t.co/1tlAN1RAhs"
## [2] "Great Buy Trading Opportunity of #ETCBTC , Don't miss it!\n#Ethereum_Classic #Bitcoin \nhttps://t.co/XBtB1t4Ast"                              
## [3] "Arizona May Soon Allow Residents To Pay Taxes Using Bitcoin! via /r/#Bitcoin https://t.co/OKagqLkQj8 https://t.co/1HqlNeXgjx"                 
## [4] "CCs are unlikely to disappear completely via /r/#Bitcoin https://t.co/ibiokzKXKq https://t.co/1HqlNeXgjx"                                     
## [5] "Bitcoin mining incentives via /r/#Bitcoin https://t.co/bu1MWCzOCP https://t.co/1HqlNeXgjx"                                                    
## [6] "Just found out I have BCH. Wanna get rid of it. But I'm afraid. via /r/#Bitcoin https://t.co/xZeJDK30Fe https://t.co/1HqlNeXgjx"
### Which tweets do NOT contain a URL? (invert = TRUE flips the selection)
grep(pattern = "https", x = bitcoin.text, invert = TRUE)
##  [1]  31  33  48  50  66  94 106 108 111 112 116 118 134 139 144 148 151
## [18] 153 156 159 164 173 176 178 189 207 208 212 218 224 225 229 234 236
## [35] 242 253 254 275 276 285 286 296 301 302 309 314 315 335 338 349 351
## [52] 352
### Logical or/and
# Positions of tweets containing "blockchain" (case-sensitive match)
grep(pattern = "blockchain", x = bitcoin.text)
##  [1]  32  36  38  39  46  47  51  54  55  56  58  59  60  61  62  63  81
## [18]  82 101 106 109 114 122 123 124 125 126 127 128 129 130 131 132 133
## [35] 139 148 149 155 165 191 196 227 239 248 249 251 252 260 261 266 270
## [52] 281 282 284 295 307 310 312 316 317 319 322 341 342 346 349
# Positions of tweets containing "cryptocurrency" (case-sensitive match)
grep(pattern = "cryptocurrency", x = bitcoin.text)
##  [1]  29  30  32  36  39  47  50  51  57  65  68  71 103 106 108 109 114
## [18] 144 148 151 155 161 165 170 172 175 177 178 181 184 191 196 210 223
## [35] 225 227 228 239 250 252 255 259 260 264 266 273 274 275 281 282 284
## [52] 286 299 302 303 306 310 313 316 321 322 325 327 329 330 336 341 344
## [69] 345
### Mentioning blockchain OR cryptocurrency: grepl is case-sensitive by default,
### so ignore.case = TRUE is passed to match any capitalisation
grepl(
  pattern = "blockchain|cryptocurrency",
  x = bitcoin.text,
  ignore.case = TRUE
)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE
##  [34]  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE FALSE FALSE
##  [45] FALSE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE FALSE  TRUE  TRUE  TRUE
##  [56]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE  TRUE FALSE
##  [67] FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [78] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE  TRUE
## [111] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [122]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [133]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
## [144]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [155]  TRUE FALSE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE
## [166] FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE
## [177]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE
## [188]  TRUE FALSE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE
## [199]  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE
## [210]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [232] FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [243]  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
## [254] FALSE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE
## [265]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE  TRUE
## [276] FALSE  TRUE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE
## [298]  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE
## [309] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE  TRUE  TRUE FALSE  TRUE
## [320]  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
## [331] FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE  TRUE FALSE  TRUE  TRUE
## [342]  TRUE FALSE  TRUE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
# Mentioning blockchain AND cryptocurrency: combine two logical vectors with &
grepl(pattern = "blockchain", x = bitcoin.text, ignore.case = TRUE) &
  grepl(pattern = "cryptocurrency", x = bitcoin.text, ignore.case = TRUE)
##   [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [34] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
##  [56] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
##  [78] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE
## [111] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE
## [155]  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [166] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE
## [199] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE
## [232] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE
## [243] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE
## [265] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [298]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE
## [320] FALSE  TRUE  TRUE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
## [331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE
## [342] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
### Use grep/grepl as an index vector:
### screen names of the tweets that mention both terms
bitcoin$screen_name[
  grepl(pattern = "blockchain", x = bitcoin.text, ignore.case = TRUE) &
    grepl(pattern = "cryptocurrency", x = bitcoin.text, ignore.case = TRUE)
]
##  [1] "creamcoin"       "Remi_Vladuceanu" "cryptocyinvest" 
##  [4] "Medici_Bank"     "IDXHCI"          "coinspectator"  
##  [7] "1FoxyCrypto"     "harkorede911"    "abrahammia01"   
## [10] "Remi_Vladuceanu" "elena201232"     "myavikram"      
## [13] "elena201232"     "Dash_Fans_"      "CryptoCurrent"  
## [16] "hossein761"      "povr_"           "Remi_Vladuceanu"
## [19] "dangsaoha2"      "taisnot"         "1FoxyCrypto"    
## [22] "torsten_gartner" "Euro_Hyips"      "ricardokozak"   
## [25] "BitcoinNewsAus"  "Remi_Vladuceanu" "OfficialVenkat" 
## [28] "BitcoinNewsAus"  "Crypto_Kate"     "ridleyg"        
## [31] "Crypto_Kate"     "cryptonews_____" "xitcliff_msk"   
## [34] "crypt0_airdrop"  "trilodi"         "Remi_Vladuceanu"
## [37] "henyamaryn"
###
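# Regular expression quick reference
# .  - any single character
# +  - one or more repetitions of the preceding item
# *  - zero or more repetitions of the preceding item
# ?  - preceding item is optional; placed after + or * it makes the match non-greedy
# $  - end of the string (or line)
# [] - any one of the characters inside the brackets
# |  - alternation ("or")
#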

### Extract the URLs
# One pattern is enough: the former second alternative ("...$") repeated the
# first one with an end anchor, so it could never match anything the first
# alternative had not already matched.
# str_extract_all() returns one character vector per tweet; unlist() flattens
# them into a single vector of URLs.
tweet_url <- unlist(str_extract_all(bitcoin.text, 'https://[0-9a-zA-Z./]+'))
head(tweet_url)
## [1] "https://t.co/1tlAN1RAhs" "https://t.co/XBtB1t4Ast"
## [3] "https://t.co/OKagqLkQj8" "https://t.co/1HqlNeXgjx"
## [5] "https://t.co/ibiokzKXKq" "https://t.co/1HqlNeXgjx"
### Top 5 Most Mentioned @
# Collect every @handle found in the tweets into one flat character vector.
# (A single pattern suffices; the former "...|...$" alternative was redundant.)
tweet_mention <- unlist(str_extract_all(bitcoin.text, '@[0-9a-zA-Z_]+'))
# Frequency table, most frequent first. head() is used instead of [1:5] so that
# fewer than five distinct handles yields a shorter table, not NA entries.
# NOTE: the object keeps its original name although it holds the top five.
top10mention <- head(sort(table(tweet_mention), decreasing = TRUE), 5)
top10mention
## tweet_mention
##        @mijanya     @shineonmag        @St0rmar       @0rjiinal 
##               2               2               2               1 
## @aelfblockchain 
##               1
## Plot
# Labels now say "Top 5" to agree with the number of bars actually drawn
p <- plot_ly(x = names(top10mention), y = top10mention, name = "Top 5 Mentions", type = 'bar')
layout(p, title = "Top 5 Mentions (bitcoin)", xaxis = list(title = "@Name"), yaxis = list(title = "Frequency"))
### Top 5 Most Popular Hashtag #
# Collect every #hashtag found in the tweets into one flat character vector.
# (A single pattern suffices; the former "...|...$" alternative was redundant.)
# Matching is case-sensitive, so "#Bitcoin" and "#bitcoin" are counted separately.
tweet_hashtag <- unlist(str_extract_all(bitcoin.text, '#[0-9a-zA-Z_]+'))
# Frequency table, most frequent first. head() is used instead of [1:5] so that
# fewer than five distinct hashtags yields a shorter table, not NA entries.
# NOTE: the object keeps its original name although it holds the top five.
top10hashtag <- head(sort(table(tweet_hashtag), decreasing = TRUE), 5)
top10hashtag
## tweet_hashtag
##        #Bitcoin        #bitcoin #cryptocurrency     #blockchain 
##             180             172              64              62 
##         #crypto 
##              60
## Plot
# Labels now say "Top 5" to agree with the number of bars actually drawn
p <- plot_ly(x = names(top10hashtag), y = top10hashtag, name = "Top 5 Hashtags", type = 'bar')
layout(p, title = "Top 5 Hashtags (bitcoin)", xaxis = list(title = "#Hashtag"), yaxis = list(title = "Frequency"))
### Remove all URLs, hashtags, mentions, 'RT' markers and punctuation
# The URL / hashtag / mention patterns are the same ones used for extraction
# above, minus the redundant "...$" alternative.
bitcoin.text <- gsub('https://[0-9a-zA-Z./]+', '', bitcoin.text)
bitcoin.text <- gsub('#[0-9a-zA-Z_]+', '', bitcoin.text)
bitcoin.text <- gsub('@[0-9a-zA-Z_]+', '', bitcoin.text)
# Word boundaries restrict the removal to a standalone "RT"; without them the
# letters would also be cut out of longer words (e.g. "SMART" -> "SMA").
bitcoin.text <- gsub('\\bRT\\b', '', bitcoin.text)
bitcoin.text <- gsub('[[:punct:]]', '', bitcoin.text)
### Remove leading and trailing whitespace
# trimws() strips spaces, tabs and newlines; the earlier space-only patterns
# left trailing "\n" characters behind.
bitcoin.text <- trimws(bitcoin.text)
head(bitcoin.text)
## [1] "Just signed up for WCX the global digital currency exchange Sign up amp earn 50 WCX tokens"
## [2] "Great Buy Trading Opportunity of   Dont miss it"                                           
## [3] "Arizona May Soon Allow Residents To Pay Taxes Using Bitcoin via r"                         
## [4] "CCs are unlikely to disappear completely via r"                                            
## [5] "Bitcoin mining incentives via r"                                                           
## [6] "Just found out I have BCH Wanna get rid of it But Im afraid via r"
### nchar("simple text sample")  ### Number of characters
### strsplit("simple text sample",' ')  ### Splitting a string
### substr("simple text sample",1,3) ### Extracting a substring from a string
### paste("sample","text","sample", sep=' ')  ### Concatenating strings
### tolower("SimPle TeXt sAmpLe")  ### Converting letters to lower or upper-case
### toupper("SimPle TeXt sAmpLe")  ### Converting letters to lower or upper-case

2. Web scraping

if (!require("RCurl")) install.packages("RCurl", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)

library(RCurl) # load the required library

In the second project, we use pattern-matching functions to build a web scraper that crawls the press release headlines from the Government Information Service website.

### Download today's Hong Kong GIS press-release index page
today_gis <- getURL("http://www.info.gov.hk/gia/general/today.htm")

## Pull out every headline link fragment, then peel off the leading link
## remnant and the trailing closing tags. Inline markup inside a headline
## (e.g. <span>) is left in place.
all_news_title <- unlist(str_extract_all(today_gis, '.htm">.+?</a></li>'))
all_news_title <- gsub('</a></li>', '', gsub('.htm">', '', all_news_title))
head(all_news_title)
## [1] "<span>HAD opens temporary cold shelters</span>"                                                       
## [2] "<span>New mattress and bedding start up from San Francisco launches in Hong Kong (with photos)</span>"
## [3] "&quot;2016 Population By-census Thematic Report: Youths&quot; published"                              
## [4] "Cancellation of LegCo subcommittee meeting"                                                           
## [5] "Government appoints directors to HKEX Board"                                                          
## [6] "SCMA visits TWGHs hotline service centre for sexual minorities (with photos)"
#
# Extract the press-release headlines from one GIS daily index page.
#
# Args:
#   url: address of the index page to download.
# Returns:
#   Character vector with one headline per matched link, with the surrounding
#   link remnants and any inline tags (such as <span>) removed. Zero-length
#   when the page contains no matching link.
title_extraction <- function(url){
  nt_gis <- getURL(url)
  # Lazy quantifier (.+?) so several links on one line are not merged into a
  # single match; the dot before "htm" is escaped to match a literal period.
  all_news_title <- unlist(str_extract_all(nt_gis, '\\.htm">.+?</a></li>'))
  # Anchored sub() strips only the leading / trailing link remnants
  all_news_title <- sub('^\\.htm">', '', all_news_title)
  all_news_title <- sub('</a></li>$', '', all_news_title)
  # Some headlines are wrapped in markup (e.g. <span>...</span>); left in place
  # it would defeat start-anchored patterns such as '^CE '.
  all_news_title <- gsub('<[^>]+>', '', all_news_title)
  # Return visibly (the function previously ended on an assignment)
  all_news_title
}

# Every calendar day from 1 Jan 2018 through 11 Feb 2018 (step of one day)
d_seq <- seq(
  from = as.Date("2018-01-01"),
  to = as.Date("2018-02-11"),
  by = 1
)

# Render each date as "YYYYMM/DD", the path fragment used in the page URLs
date_seq <- format(d_seq, format = '%Y%m/%d')

#
# For each day, count the headlines that mention CE / FS / CS
# (space-delimited token at the start, middle or end; case-insensitive)
#
for (day in date_seq) {
  # Address of that day's index page
  url <- paste0("http://www.info.gov.hk/gia/general/", day, ".htm")
  day_nt <- title_extraction(url)
  ce <- sum(grepl('^CE | CE | CE$', day_nt, ignore.case = TRUE))
  fs <- sum(grepl('^FS | FS | FS$', day_nt, ignore.case = TRUE))
  cs <- sum(grepl('^CS | CS | CS$', day_nt, ignore.case = TRUE))
  print(paste0(
    day, ":",
    "Chief Executive -", ce,
    " | Financial Secretary -", fs,
    " | Chief Secretary for Administration-", cs
  ))
  # Wait two seconds before requesting the next page
  Sys.sleep(2)
}
## [1] "201801/01:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/02:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201801/03:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/04:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/05:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/06:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/07:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/08:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/09:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/10:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/11:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/12:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/13:Chief Executive -0 | Financial Secretary -1 | Chief Secretary for Administration-1"
## [1] "201801/14:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/15:Chief Executive -1 | Financial Secretary -2 | Chief Secretary for Administration-0"
## [1] "201801/16:Chief Executive -3 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201801/17:Chief Executive -1 | Financial Secretary -1 | Chief Secretary for Administration-1"
## [1] "201801/18:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/19:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201801/20:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/21:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/22:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/23:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/24:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/25:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/26:Chief Executive -3 | Financial Secretary -0 | Chief Secretary for Administration-2"
## [1] "201801/27:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201801/28:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/29:Chief Executive -0 | Financial Secretary -1 | Chief Secretary for Administration-0"
## [1] "201801/30:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201801/31:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/01:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/02:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201802/03:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201802/04:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/05:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201802/06:Chief Executive -2 | Financial Secretary -0 | Chief Secretary for Administration-1"
## [1] "201802/07:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/08:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/09:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/10:Chief Executive -1 | Financial Secretary -0 | Chief Secretary for Administration-0"
## [1] "201802/11:Chief Executive -0 | Financial Secretary -0 | Chief Secretary for Administration-1"