# Install any missing package from CRAN, then attach all three.
# library() is used for attaching because it stops with an error when a
# package is unavailable, whereas require() only returns FALSE and lets the
# script continue until the first missing function is called.
if (!requireNamespace("rtweet", quietly = TRUE)) {
  install.packages("rtweet", repos = "https://cran.cnr.berkeley.edu/", dependencies = TRUE)
}
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly", repos = "https://cran.cnr.berkeley.edu/", dependencies = TRUE)
}
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr", repos = "https://cran.cnr.berkeley.edu/", dependencies = TRUE)
}
# Attach in the order rtweet, plotly, stringr so that name masking between the
# packages is the same as with the previous require() calls.
library(rtweet)
library(plotly)
library(stringr)
First, we run a hashtag search for “bitcoin” on Twitter, which returns up to 500 English-language tweets (retweets excluded). The pattern-matching functions are then demonstrated one at a time.
# Search Twitter for up to 500 English "#bitcoin" tweets, leaving out retweets
bitcoin <- search_tweets(
  q = "#bitcoin",
  n = 500,
  include_rts = FALSE,
  lang = "en"
)
# Keep just the text column of the result for the pattern-matching examples
bitcoin.text <- bitcoin$text
### Which tweets contain a URL? (grep returns the positions of the matches)
grep(pattern = "https", x = bitcoin.text)
## [1] 1 2 3 4 5 6 8 9 10 11 12 13 14 15 16 17 18
## [18] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 38
## [35] 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 55 56
## [52] 57 58 61 63 64 65 66 67 68 71 72 73 74 75 76 77 78
## [69] 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
## [86] 96 97 99 100 101 102 103 104 105 106 107 108 110 111 112 115 117
## [103] 122 123 124 125 126 127 128 129 130 131 132 133 135 136 137 138 139
## [120] 140 141 142 143 144 145 146 147 148 149 152 153 156 157 158 160 161
## [137] 162 163 164 165 166 167 169 170 172 174 176 177 179 180 181 182 183
## [154] 184 185 186 187 188 190 196 197 198 199 202 203 204 205 206 208 210
## [171] 218 219 224 225 227 228 230 234 235 236 238 239 242 243 244 245 248
## [188] 251 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 270
## [205] 271 272 274 275 277 278 279 280 281 283 284 285 286 288 289 290 292
## [222] 293 295 297 299 300 301 302 303 304 305 306 308 309 311 312 313 314
## [239] 315 316 317 318 320 321 322 323 324 325 330 332 333 334 335 337 338
## [256] 339 340 341 342 343 344 347 349 350 351 353 354 355 356 357 358 359
## [273] 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376
## [290] 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393
## [307] 395 397
### Does each tweet contain a URL? (grepl returns one TRUE/FALSE per tweet)
grepl(pattern = "https", x = bitcoin.text)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [12] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [23] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [45] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [56] TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE
## [67] TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [78] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [89] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [100] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [111] TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [122] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [133] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [144] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [155] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE
## [166] TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
## [177] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [188] TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
## [199] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [210] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE
## [221] FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE
## [232] FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
## [243] TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE
## [254] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [265] TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE
## [276] FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [287] FALSE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE FALSE TRUE
## [298] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [309] TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [320] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [331] FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [342] TRUE TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE FALSE
## [353] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [364] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [375] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [386] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [397] TRUE
### Display the first few matching tweets (value = TRUE returns the text itself)
head(grep(pattern = "https", x = bitcoin.text, value = TRUE))
## [1] "Blockchain and business have a bright future together <U+0001F929>\nIf you want to learn more about the realworld impact of blockchain tech and cryptocurrency, this is your chance. <U+0001F44F><U+0001F3FC>\nJoin us on Friday for a talk on blockchain!\n\n#bitcoin #blockchain #coworking #cowork #cryptocurrency https://t.co/7UvITkkrDW"
## [2] "Text predictor for #AI #machinelearning \nhttps://t.co/sNMCp7lOgT\n#deeplearn #code #developer #python #bitcoin #btc #cryptocurrency #DigitalMineNetwork\no"
## [3] "The blog of our new project, here infos, and details about our #cryptocurrency related website\nhttps://t.co/j8PfbQ0nEU\n#bitcoin #blockchain #btc #DigitalMineNetwork o"
## [4] "Here a simple #python program that allows to integrate cryptocurrencies payments in websites \n#bitcoin #btc #cryptocurrency #programming #DigitalMineNetwork\nhttps://t.co/tWlqriqaiE o"
## [5] "My main website the home of my network\nhttps://t.co/S22LmLuAZ4\n#bitcoin #btc #cryptocurrency #blockchain #AI #DigitalMineNetwork o"
## [6] "G-point system #eGoldMining is there to incentivize the long term token holder the most.with payout structures,because of the fact that each user trades a lot and saves on an exchange\njoin: https://t.co/NsTlcCbaWt \n#eGold #EGM #ICO #Crowdsale #Bitcoin #Blockchain #ETH #Ethereum"
### Which tweets do NOT contain a URL? (invert = TRUE returns the non-matches)
grep(pattern = "https", x = bitcoin.text, invert = TRUE)
## [1] 7 35 36 37 54 59 60 62 69 70 98 109 113 114 116 118 119
## [18] 120 121 134 150 151 154 155 159 168 171 173 175 178 189 191 192 193
## [35] 194 195 200 201 207 209 211 212 213 214 215 216 217 220 221 222 223
## [52] 226 229 231 232 233 237 240 241 246 247 249 250 252 268 269 273 276
## [69] 282 287 291 294 296 298 307 310 319 326 327 328 329 331 336 345 346
## [86] 348 352 394 396
### Logical OR / AND
# Positions of tweets mentioning "blockchain", ignoring letter case
grep(pattern = "blockchain", x = bitcoin.text, ignore.case = TRUE)
## [1] 1 3 5 6 7 11 12 13 18 26 28 29 30 31 32 33 35
## [18] 36 37 40 41 46 47 48 49 51 52 53 56 59 63 65 66 71
## [35] 79 80 81 82 84 89 90 91 92 93 97 99 113 114 115 116 117
## [52] 118 119 120 121 126 127 128 129 130 131 133 136 139 142 147 152 153
## [69] 157 160 165 166 169 170 187 189 196 202 203 230 233 239 241 246 247
## [86] 248 261 264 271 272 278 281 282 283 292 295 300 302 303 312 315 316
## [103] 318 321 322 326 331 333 337 341 344 345 346 348 349 352 389 391 392
## [120] 396 397
# Positions of tweets mentioning "cryptocurrency", ignoring letter case
grep(pattern = "cryptocurrency", x = bitcoin.text, ignore.case = TRUE)
## [1] 1 2 3 4 5 11 12 16 23 27 28 43 48 49 50 52 56
## [18] 58 65 66 72 73 74 75 77 79 80 81 82 83 84 85 89 96
## [35] 97 103 104 105 110 114 124 126 127 133 148 149 153 164 166 176 179
## [52] 181 183 184 191 192 196 202 203 204 205 206 209 213 214 223 231 232
## [69] 233 238 246 247 248 254 255 261 263 264 271 272 274 277 279 281 287
## [86] 290 295 299 303 308 315 318 326 334 341 345 346 350 351 353 354 355
## [103] 356 386 389 390 397
### Tweets mentioning blockchain OR cryptocurrency ("|" is regex alternation)
grepl(
  pattern = "blockchain|cryptocurrency",
  x = bitcoin.text,
  ignore.case = TRUE
)
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## [12] TRUE TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [23] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [34] FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [45] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [56] TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [67] FALSE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## [78] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
## [89] TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE
## [100] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [111] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [122] FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE
## [133] TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [144] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE
## [155] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE TRUE
## [166] TRUE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## [177] FALSE FALSE TRUE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE
## [188] FALSE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE
## [199] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## [210] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [232] TRUE TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [243] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [254] TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [276] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE
## [287] TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE
## [298] FALSE TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE FALSE TRUE
## [309] FALSE FALSE FALSE TRUE FALSE FALSE TRUE TRUE FALSE TRUE FALSE
## [320] FALSE TRUE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [331] TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
## [342] FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE TRUE
## [353] TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## [397] TRUE
### Tweets mentioning blockchain AND cryptocurrency (element-wise & of two masks)
grepl(pattern = "blockchain", x = bitcoin.text, ignore.case = TRUE) &
  grepl(pattern = "cryptocurrency", x = bitcoin.text, ignore.case = TRUE)
## [1] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE
## [12] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [45] FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [56] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [78] FALSE TRUE TRUE TRUE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
## [89] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [100] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [111] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [122] FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [133] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [144] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [155] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [166] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [177] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [188] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [199] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [210] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [221] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [232] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [243] FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
## [254] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE
## [276] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [287] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [298] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [309] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE TRUE FALSE
## [320] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [331] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [342] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [353] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [364] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [375] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [386] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] TRUE
### Use a grepl mask as an index vector
# Screen names of the accounts whose tweet mentions both terms
mentions_blockchain <- grepl("blockchain", bitcoin.text, ignore.case = TRUE)
mentions_cryptocurrency <- grepl("cryptocurrency", bitcoin.text, ignore.case = TRUE)
bitcoin$screen_name[mentions_blockchain & mentions_cryptocurrency]
## [1] "iCampusMalaga" "digital_mine_" "digital_mine_"
## [4] "ActivosDig" "ActivosDig" "Espnlover365"
## [7] "coinspectator" "coinspectator" "coinspectator"
## [10] "WorknbDAO" "kind_kris" "ridecaribbean"
## [13] "CoinCapsAi" "CoinCapsAi" "CoinCapsAi"
## [16] "CoinCapsAi" "CoinCapsAi" "TheBlockchain"
## [19] "LTCTheRightCoin" "sarahjayanti982" "makesideproject"
## [22] "Lettley81" "Bitcoinikdotcom" "grabalaji"
## [25] "CryptoBlockCon" "EricTippetts" "GetMIO2"
## [28] "ICObello_com" "nataliatycennia" "sryyufriana09"
## [31] "sryyufriana09" "sryyufriana09" "ryrartist"
## [34] "DanNorberts" "Cryptowarrior88" "Cryptowarrior88"
## [37] "bc_bitcoin" "BelieversCrypto" "ErnestOnaiwu"
## [40] "Trade24F" "Smurf182" "coinpricenow"
## [43] "robelahommed" "morcano1" "morcano1"
## [46] "Minddeft" "Cryptocabulary"
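###
# Regular expression quick reference -
# . - any single character
# + - one or more repetitions of the preceding item
# * - zero or more repetitions of the preceding item
# ? - preceding item is optional; placed after + or * it makes the match non-greedy
# $ - end of the string
# [] - any one of the characters inside the brackets
# | - or (alternation)
#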
### Extract the URLs
# One pattern is enough: a URL at the very end of a tweet is already matched by
# "https://" followed by a run of URL characters, so no separate "...$"
# alternative is needed.
tweet_url <- str_extract_all(bitcoin.text, "https://[0-9a-zA-Z./]+")
# str_extract_all() returns one character vector per tweet; flatten them
tweet_url <- unlist(tweet_url)
head(tweet_url)
## [1] "https://t.co/7UvITkkrDW" "https://t.co/sNMCp7lOgT"
## [3] "https://t.co/j8PfbQ0nEU" "https://t.co/tWlqriqaiE"
## [5] "https://t.co/S22LmLuAZ4" "https://t.co/NsTlcCbaWt"
### Top 5 most-mentioned @accounts
# A mention is "@" followed by letters, digits or underscores
tweet_mention <- str_extract_all(bitcoin.text, "@[0-9a-zA-Z_]+")
tweet_mention <- unlist(tweet_mention)
# head() keeps at most five entries and, unlike `[1:5]`, does not pad the
# result with NA when fewer than five distinct accounts were mentioned.
# The variable keeps its existing name because later code refers to it.
top10mention <- head(sort(table(tweet_mention), decreasing = TRUE), 5)
top10mention
## tweet_mention
## @realDonaldTrump @coindesk @APompliano @bitstamp
## 4 3 2 2
## @bomblytics
## 2
## Plot
# top10mention holds five accounts, so the labels say "Top 5".
# categoryarray pins the bar order to the order of top10mention
# (most-mentioned first).
p <- plot_ly(
  x = names(top10mention),
  y = as.numeric(top10mention),
  name = "Top 5 Mention",
  type = "bar"
)
layout(
  p,
  title = "Top 5 Mention (bitcoin)",
  xaxis = list(
    title = "@Name",
    categoryorder = "array",
    categoryarray = names(top10mention)
  ),
  yaxis = list(title = "Frequency")
)
### Top 5 most popular #hashtags
# A hashtag is "#" followed by letters, digits or underscores
tweet_hashtag <- str_extract_all(bitcoin.text, "#[0-9a-zA-Z_]+")
tweet_hashtag <- unlist(tweet_hashtag)
# NOTE(review): counting is case-sensitive, so "#bitcoin" and "#Bitcoin" are
# tallied as separate hashtags.
# head() keeps at most five entries and, unlike `[1:5]`, does not pad the
# result with NA when fewer than five distinct hashtags were found.
# The variable keeps its existing name because later code refers to it.
top10hashtag <- head(sort(table(tweet_hashtag), decreasing = TRUE), 5)
top10hashtag
## tweet_hashtag
## #bitcoin #Bitcoin #crypto #blockchain
## 255 141 89 81
## #cryptocurrency
## 79
## Plot
# top10hashtag holds five hashtags, so the labels say "Top 5".
# categoryarray pins the bar order to the order of top10hashtag
# (most frequent first).
p <- plot_ly(
  x = names(top10hashtag),
  y = as.numeric(top10hashtag),
  name = "Top 5 Hashtag",
  type = "bar"
)
layout(
  p,
  title = "Top 5 Hashtag (bitcoin)",
  xaxis = list(
    title = "#Hashtag",
    categoryorder = "array",
    categoryarray = names(top10hashtag)
  ),
  yaxis = list(title = "Frequency")
)
### Remove all URLs, hashtags, mentions, the retweet marker 'RT' and punctuation
bitcoin.text <- gsub("https://[0-9a-zA-Z./]+", "", bitcoin.text)
bitcoin.text <- gsub("#[0-9a-zA-Z_]+", "", bitcoin.text)
bitcoin.text <- gsub("@[0-9a-zA-Z_]+", "", bitcoin.text)
# \\b limits the match to the standalone token "RT"; without it the letters
# "RT" would also be cut out of the middle of longer upper-case words.
bitcoin.text <- gsub("\\bRT\\b", "", bitcoin.text)
bitcoin.text <- gsub("[[:punct:]]", "", bitcoin.text)
### Remove leading and trailing whitespace
# trimws() strips spaces, tabs and line breaks, so tweets no longer keep
# trailing "\n" characters after their URLs and hashtags are removed.
bitcoin.text <- trimws(bitcoin.text)
head(bitcoin.text)
## [1] "Blockchain and business have a bright future together <U+0001F929>\nIf you want to learn more about the realworld impact of blockchain tech and cryptocurrency this is your chance <U+0001F44F><U+0001F3FC>\nJoin us on Friday for a talk on blockchain"
## [2] "Text predictor for \n\n \no"
## [3] "The blog of our new project here infos and details about our related website\n\n o"
## [4] "Here a simple program that allows to integrate cryptocurrencies payments in websites \n \n o"
## [5] "My main website the home of my network\n\n o"
## [6] "Gpoint system is there to incentivize the long term token holder the mostwith payout structuresbecause of the fact that each user trades a lot and saves on an exchange\njoin"
### nchar("simple text sample") ### Number of characters
### strsplit("simple text sample",' ') ### Splitting a string
### substr("simple text sample",1,3) ### Extracting a substring from a string
### paste("sample","text","sample", sep=' ') ### Concatenating strings
### tolower("SimPle TeXt sAmpLe") ### Converting letters to lower case
### toupper("SimPle TeXt sAmpLe") ### Converting letters to upper case
# Install httr from CRAN if it is missing, then attach it.
# library() stops with an error when the package is unavailable, whereas
# require() only returns FALSE and lets the script continue.
if (!requireNamespace("httr", quietly = TRUE)) {
  install.packages("httr", repos = "https://cran.cnr.berkeley.edu/", dependencies = TRUE)
}
library(httr)
In the second project, we use pattern-matching functions to build a web scraper that collects the press release headlines from the Government Information Service (GIS) website.
### Read today's press-release index from Hong Kong's GIS
today_gis <- GET("https://www.info.gov.hk/gia/general/today.htm")
# Report a failed request as a warning instead of silently parsing an error page
warn_for_status(today_gis)
today_gis <- content(today_gis, as = "text")
# Extract the titles from the web page: everything from `.htm">` up to the
# next `</a></li>`. The dot is escaped so that only a literal ".htm" matches,
# and the lazy `+?` stops each match at the first closing tag.
all_news_title <- str_extract_all(today_gis, '\\.htm">.+?</a></li>')[[1]]
# The leading and trailing markers are literal text, so they are removed with
# fixed = TRUE rather than as regular expressions.
all_news_title <- gsub('.htm">', "", all_news_title, fixed = TRUE) ### Removing heading
all_news_title <- gsub("</a></li>", "", all_news_title, fixed = TRUE) ### Removing trailing
all_news_title <- gsub("<span>|</span>", "", all_news_title) ### Removing span
head(all_news_title)
## character(0)
#
# Extract the headline titles from one GIS index page.
#
# Arguments:
#   url  Address of the page to download.
#
# Returns a character vector with one element per piece of text found between
# `.htm">` and `</a></li>`, with any <span> tags removed. The vector is empty
# when nothing matches or when the request fails (a warning is raised in the
# latter case). The value is returned visibly.
#
title_extraction <- function(url) {
  response <- GET(url)
  # A failed request yields no titles; warn rather than stop so that a caller
  # looping over many pages can carry on with the remaining ones.
  if (http_error(response)) {
    warning(
      "Request for ", url, " failed with HTTP status ", status_code(response),
      call. = FALSE
    )
    return(character(0))
  }
  page <- content(response, as = "text")
  # The dot is escaped so that only a literal ".htm" matches. The lazy `+?`
  # ends each match at the first `</a></li>`, so several links on the same
  # line are returned as separate titles instead of one merged string.
  titles <- str_extract_all(page, '\\.htm">.+?</a></li>')[[1]]
  titles <- gsub('.htm">', "", titles, fixed = TRUE)
  titles <- gsub("</a></li>", "", titles, fixed = TRUE)
  gsub("<span>|</span>", "", titles)
}
# One Date per day, from 1 January 2019 through 21 February 2019
d_seq <- seq(
  from = as.Date("2019-01-01"),
  to = as.Date("2019-02-21"),
  by = 1
)
# Render every date as "YYYYMM/DD", the form used in the web URL pattern
date_seq <- format(d_seq, format = "%Y%m/%d")
#
# How many headlines per day mention CE / FS / CS / LegCo?
#

# Count the headlines in `titles` that contain `term` as a whole word.
# \\b word boundaries also catch forms such as "CE's" or "CE:", which a
# pattern requiring surrounding spaces would miss, while still ignoring longer
# words that merely contain the letters.
count_mentions <- function(titles, term) {
  sum(grepl(paste0("\\b", term, "\\b"), titles))
}

# Build one single-row data frame per day and bind them once at the end,
# instead of growing the data frame with rbind() on every iteration.
daily_rows <- lapply(date_seq, function(day) {
  url <- paste0("https://www.info.gov.hk/gia/general/", day, ".htm")
  day_nt <- title_extraction(url)
  Sys.sleep(2) # pause between requests
  data.frame(
    day = day,
    ce = count_mentions(day_nt, "CE"),
    fs = count_mentions(day_nt, "FS"),
    cs = count_mentions(day_nt, "CS"),
    legco = count_mentions(day_nt, "LegCo"),
    stringsAsFactors = FALSE
  )
})
headline_dataset <- if (length(daily_rows) > 0) {
  do.call(rbind, daily_rows)
} else {
  data.frame()
}
Finally, the number of headlines per day that mention CE, CS, FS and LegCo is plotted as one line per term.
if (!require("plotly")) install.packages("plotly", repos="https://cran.cnr.berkeley.edu/", dependencies = TRUE)
require(plotly)
# Line chart of the daily counts: the CE series starts the plot and the other
# three columns are added as further traces on the same day axis.
p <- plot_ly(
  x = headline_dataset$day,
  y = headline_dataset$ce,
  name = "CE",
  type = "scatter",
  mode = "lines"
) %>%
  add_trace(y = headline_dataset$cs, name = "CS") %>%
  add_trace(y = headline_dataset$fs, name = "FS") %>%
  add_trace(y = headline_dataset$legco, name = "LegCo")
# Title and axis labels are applied when the figure is displayed
layout(
  p,
  title = "GIS Headline Mention of CE/CS/FS/LegCo",
  xaxis = list(title = "Day"),
  yaxis = list(title = "Number of hits")
)