Source: Piqsels.com
require(plotly) # load the required libraries
require(rtweet)
require(stringr)
First, we run a hashtag search for “#coronavirues” on Twitter, which returns up to 500 English tweets (retweets excluded). The pattern-matching functions are then demonstrated one by one.
# Twitter search: ask for 500 English tweets carrying the hashtag, no retweets
coronavirues <- search_tweets(
  q = "#coronavirues",
  n = 500,
  include_rts = FALSE,
  lang = "en"
)
# Keep the text column of the result for the pattern-matching examples
coronavirues.text <- coronavirues[["text"]]
### Which tweets contain a URL? (grep returns the positions of the matches)
grep(pattern = "https", x = coronavirues.text)
## [1] 1 2 4 5 6 9 10 12 19 20 22 25 27 29 34 37 38 41
## [19] 42 43 44 45 47 50 51 52 54 55 56 61 62 63 65 67 68 69
## [37] 70 71 78 79 82 85 89 91 92 95 100 101 102 103 104 106 108 113
## [55] 115 116 117 118 121 124 126 127 128 129 130 133 134 135 136 137 138 139
## [73] 140 141 142 143 144 145 146 148 149 151 152 153 154 157 159 162 165 167
## [91] 168 170 171 172 173 174 175 178 179 180 182 183 185 186 188 189 190 192
## [109] 194 197 203 204 205 209 210 211 212 213 215 219 222 225 235 237 238 242
## [127] 243 244 245 254 256 257 258 268 282 287 288 293 294 295 297 298 299 300
## [145] 301 304 305 306 308 309 310 314 317 318 319 320 321 322 323 324 325 326
## [163] 327 328 329 330 332 333 334 335 337 340 341 343 344 345 346 349 350 353
## [181] 355 356 358 359 360 361 362 363 366 367 368 369 372 374 377 378 379 380
## [199] 384 386 387 388 389 391 392 394 395 396 397 398 401 402 403 405 406 407
## [217] 408 409 410 411 413 414 416 417 418 423 424 425 426 427 428 429 430 431
## [235] 432 434 441 442 446 447 453 454 459 460 462 463 464 466 467 468 469 470
## [253] 472 474 477 478 479 480 481 482 484
### Does each tweet contain a URL? (grepl returns one TRUE/FALSE per tweet)
grepl(pattern = "https", x = coronavirues.text)
## [1] TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE TRUE TRUE FALSE TRUE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE TRUE FALSE FALSE
## [25] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [37] TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [49] FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [61] TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## [85] TRUE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE TRUE FALSE
## [97] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
## [109] FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [121] TRUE FALSE FALSE TRUE FALSE TRUE TRUE TRUE TRUE TRUE FALSE FALSE
## [133] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [145] TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [157] TRUE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE TRUE TRUE
## [169] FALSE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE
## [181] FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## [193] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [205] TRUE FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE FALSE TRUE FALSE
## [217] FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE TRUE FALSE FALSE
## [241] FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE TRUE TRUE
## [289] FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [301] TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## [313] FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [325] TRUE TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE
## [337] TRUE FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE
## [349] TRUE TRUE FALSE FALSE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE
## [361] TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE TRUE
## [373] FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## [385] FALSE TRUE TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE
## [397] TRUE TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE TRUE
## [409] TRUE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE FALSE
## [421] FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [433] FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [445] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## [457] FALSE FALSE TRUE TRUE FALSE TRUE TRUE TRUE FALSE TRUE TRUE TRUE
## [469] TRUE TRUE FALSE TRUE FALSE TRUE FALSE FALSE TRUE TRUE TRUE TRUE
## [481] TRUE TRUE FALSE TRUE
### Display the first few matched tweets (value = TRUE returns the text itself)
head(grep(pattern = "https", x = coronavirues.text, value = TRUE))
## [1] "If you want to protect yourself from #coronavirues, recite this verse after Morning prayer and then Maghreb (Evening) prayer, no virus could hit you. https://t.co/lyfmEIWCaV"
## [2] "Protect yourself from the #coronavirues disease-2019\n#Covid_19 #Qatar #Doha https://t.co/ZtnbU3qGZr"
## [3] "Early morning I purchased N95 at 500.\nNow again I have buy more , bilkul be nae Mill rahy Lahore ma ....! \nIn next days \nIt'll be 1000/mask \nOriginal price of mask is only 120\nShame on us \n#Coronaviruspakistan \n#coronavirues \n#coronavirus https://t.co/WXQ2qG5iT1"
## [4] "Mary Dil ke bat \nSsly this is so true. Mayn \n#Coronaviruspakistan \n#coronavirues \n#coronavirus https://t.co/74w7hjp6sd"
## [5] "<U+0001D416><U+0001D421><U+0001D41A><U+0001D42D> <U+0001D422><U+0001D41F> <U+0001D402><U+0001D428><U+0001D42B><U+0001D428><U+0001D427><U+0001D41A> <U+0001D422><U+0001D42C> <U+0001D41A> <U+0001D56F><U+0001D58A><U+0001D586><U+0001D599><U+0001D58D> <U+0001D586><U+0001D593><U+0001D58C><U+0001D58A><U+0001D591> <U+0001D41C><U+0001D428><U+0001D426><U+0001D41E><U+0001D42C> <U+0001D42D><U+0001D428> <U+0001D42D><U+0001D421><U+0001D428><U+0001D42C><U+0001D41E> <U+0001D430><U+0001D421><U+0001D428>'<U+0001D42F><U+0001D41E> <U+0001D41D><U+0001D428><U+0001D427><U+0001D41E> <U+0001D41B><U+0001D41A><U+0001D41D> <U+0001D42D><U+0001D428> <U+0001D42E><U+0001D42C> & <U+0001D42D><U+0001D421><U+0001D41E><U+0001D432> <U+0001D41A><U+0001D42B><U+0001D41E> <U+0001D429><U+0001D41A><U+0001D432><U+0001D422><U+0001D427><U+0001D420> <U+0001D42D><U+0001D421><U+0001D41E> <U+0001D429><U+0001D42B><U+0001D422><U+0001D41C><U+0001D41E> ...\n\n#Karma \n#coronavirues https://t.co/v6IXHufiEC"
## [6] "Are you ready to defense yourself from corona virus?<U+0001F608> #coronavirususa #coronavirues #CoronaVirusUpdates https://t.co/GAX9sY7uBU"
### Which tweets do NOT contain a URL? (invert = TRUE returns the non-matches)
grep(pattern = "https", x = coronavirues.text, invert = TRUE)
## [1] 3 7 8 11 13 14 15 16 17 18 21 23 24 26 28 30 31 32
## [19] 33 35 36 39 40 46 48 49 53 57 58 59 60 64 66 72 73 74
## [37] 75 76 77 80 81 83 84 86 87 88 90 93 94 96 97 98 99 105
## [55] 107 109 110 111 112 114 119 120 122 123 125 131 132 147 150 155 156 158
## [73] 160 161 163 164 166 169 176 177 181 184 187 191 193 195 196 198 199 200
## [91] 201 202 206 207 208 214 216 217 218 220 221 223 224 226 227 228 229 230
## [109] 231 232 233 234 236 239 240 241 246 247 248 249 250 251 252 253 255 259
## [127] 260 261 262 263 264 265 266 267 269 270 271 272 273 274 275 276 277 278
## [145] 279 280 281 283 284 285 286 289 290 291 292 296 302 303 307 311 312 313
## [163] 315 316 331 336 338 339 342 347 348 351 352 354 357 364 365 370 371 373
## [181] 375 376 381 382 383 385 390 393 399 400 404 412 415 419 420 421 422 433
## [199] 435 436 437 438 439 440 443 444 445 448 449 450 451 452 455 456 457 458
## [217] 461 465 471 473 475 476 483
### Logical or/and
# Positions of tweets containing "Wuhan" in any letter case
grep(pattern = "Wuhan", x = coronavirues.text, ignore.case = TRUE)
## [1] 35 54 63 95 97 135 157 170 172 198 209 247 294 330 417 441 469 480 482
# Positions of tweets containing "WHO" in any letter case; with
# ignore.case = TRUE this also matches the ordinary word "who" and any
# longer word that contains those three letters.
grep(pattern = "WHO", x = coronavirues.text, ignore.case = TRUE)
## [1] 31 39 49 93 98 127 139 147 149 156 178 180 181 218 221 246 248 256 258
## [20] 285 293 307 317 324 334 336 357 383 392 398 399 430 450 470 479
### Mentioning Wuhan or WHO ("|" inside the pattern means "or")
grepl(pattern = "Wuhan|WHO", x = coronavirues.text, ignore.case = TRUE)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE
## [37] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE FALSE
## [97] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [157] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE TRUE
## [181] TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE TRUE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## [325] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE FALSE TRUE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
## [397] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## [481] FALSE TRUE FALSE FALSE
# Mentioning Wuhan AND WHO: element-wise "&" of the two logical vectors
grepl(pattern = "Wuhan", x = coronavirues.text, ignore.case = TRUE) &
  grepl(pattern = "WHO", x = coronavirues.text, ignore.case = TRUE)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [13] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [25] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [37] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [49] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [61] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [73] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [85] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [97] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [109] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [121] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [133] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [145] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [157] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [169] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [181] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [193] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [205] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [217] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [229] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [241] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [253] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [265] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [277] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [289] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [301] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [313] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [325] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [337] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [349] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [361] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [373] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [385] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [397] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [409] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [421] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [433] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [445] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [457] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [469] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [481] FALSE FALSE FALSE FALSE
### Use grep/grepl as an index vector
# Screen names of the tweets that match both "Wuhan" and "WHO"
coronavirues[["screen_name"]][
  grepl(pattern = "Wuhan", x = coronavirues.text, ignore.case = TRUE) &
    grepl(pattern = "WHO", x = coronavirues.text, ignore.case = TRUE)
]
## character(0)
###
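# Regular Expression -
# . - any single character
# + - quantifier: one or more of the preceding element
# * - quantifier: zero or more of the preceding element
# ? - zero or one of the preceding element; placed after + or * it makes the match non-greedy
# $ - end of line
# [] - any one of the characters within the brackets
# | - or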
#
### Extract the urls
# A single alternative is sufficient: the character class is greedy and
# already runs to the end of the string when the URL is the last thing in
# the tweet, so an extra "...$" branch could never match anything new.
tweet_url <- str_extract_all(coronavirues.text, "https://[0-9a-zA-Z./]+")
# str_extract_all() returns one character vector per tweet; flatten them
tweet_url <- unlist(tweet_url)
head(tweet_url)
## [1] "https://t.co/lyfmEIWCaV" "https://t.co/ZtnbU3qGZr"
## [3] "https://t.co/WXQ2qG5iT1" "https://t.co/74w7hjp6sd"
## [5] "https://t.co/v6IXHufiEC" "https://t.co/GAX9sY7uBU"
### Top 5 Most Mentioned @
# Pull every @handle (letters, digits, underscore) out of each tweet
tweet_mention <- str_extract_all(coronavirues.text, "@[0-9a-zA-Z_]+")
tweet_mention <- unlist(tweet_mention)
# Frequency table sorted from most to least mentioned, cut to the top 5.
# head() is used instead of `[1:5]` so the result is not padded with NA
# when fewer than five distinct handles exist.
# NOTE: the variable keeps its "top10" name for compatibility; it holds 5.
top10mention <- head(sort(table(tweet_mention), decreasing = TRUE), 5)
top10mention
## tweet_mention
## @realDonaldTrump @VP @CDCgov @POTUS
## 25 11 9 7
## @ImranKhanPTI
## 4
## Plot
p <- plot_ly(
  x = names(top10mention),
  y = top10mention,
  name = "Top 5 Mention",
  type = "bar"
)
layout(
  p,
  title = "Top 5 Mention (#coronavirues)",
  xaxis = list(title = "@Name"),
  yaxis = list(title = "Frequency")
)
### Top 5 Most Popular Hashtag #
# Pull every #hashtag (letters, digits, underscore) out of each tweet
tweet_hashtag <- str_extract_all(coronavirues.text, "#[0-9a-zA-Z_]+")
tweet_hashtag <- unlist(tweet_hashtag)
# Frequency table sorted from most to least used, cut to the top 5.
# head() is used instead of `[1:5]` so the result is not padded with NA
# when fewer than five distinct hashtags exist.
# NOTE: the variable keeps its "top10" name for compatibility; it holds 5.
top10hashtag <- head(sort(table(tweet_hashtag), decreasing = TRUE), 5)
top10hashtag
## tweet_hashtag
## #coronavirues #CoronavirusOutbreak #coronavirususa
## 476 138 87
## #COVID #CoronaVirusUpdates
## 68 56
## Plot
p <- plot_ly(
  x = names(top10hashtag),
  y = top10hashtag,
  name = "Top 5 Hashtag",
  type = "bar"
)
layout(
  p,
  title = "Top 5 Hashtag (#coronavirues)",
  xaxis = list(title = "#Hashtag"),
  yaxis = list(title = "Frequency")
)
### Remove all URLs, mentions, hashtags, 'RT' markers and punctuation
# URLs, hashtags and mentions go first, while their "/", "#" and "@"
# characters are still present for the patterns to anchor on.
coronavirues.text <- gsub("https://[0-9a-zA-Z./]+", "", coronavirues.text)
coronavirues.text <- gsub("#[0-9a-zA-Z_]+", "", coronavirues.text)
coronavirues.text <- gsub("@[0-9a-zA-Z_]+", "", coronavirues.text)
# Word boundaries restrict the removal to the standalone token "RT";
# without them the letters would also be cut out of words such as "ALERT".
coronavirues.text <- gsub("\\bRT\\b", "", coronavirues.text)
coronavirues.text <- gsub("[[:punct:]]", "", coronavirues.text)
### Remove leading and trailing spaces (newline characters are left in place)
coronavirues.text <- gsub("^[ ]*", "", coronavirues.text)
coronavirues.text <- gsub("[ ]*$", "", coronavirues.text)
head(coronavirues.text)
## [1] "If you want to protect yourself from recite this verse after Morning prayer and then Maghreb Evening prayer no virus could hit you"
## [2] "Protect yourself from the disease2019\n"
## [3] "Well support our Chinese ppl at this crisis\nEvery single Indian is standby our Chinese people\nWeve been praying for speedy recovery of affected ppl from this epidemic\nThere shud be single person priority of world to tackle this menace\n<U+0001F64F><U+2764><U+FE0F>"
## [4] "Early morning I purchased N95 at 500\nNow again I have buy more bilkul be nae Mill rahy Lahore ma \nIn next days \nItll be 1000mask \nOriginal price of mask is only 120\nShame on us \n \n \n"
## [5] "Mary Dil ke bat \nSsly this is so true Mayn \n \n \n"
## [6] "<U+0001D416><U+0001D421><U+0001D41A><U+0001D42D> <U+0001D422><U+0001D41F> <U+0001D402><U+0001D428><U+0001D42B><U+0001D428><U+0001D427><U+0001D41A> <U+0001D422><U+0001D42C> <U+0001D41A> <U+0001D56F><U+0001D58A><U+0001D586><U+0001D599><U+0001D58D> <U+0001D586><U+0001D593><U+0001D58C><U+0001D58A><U+0001D591> <U+0001D41C><U+0001D428><U+0001D426><U+0001D41E><U+0001D42C> <U+0001D42D><U+0001D428> <U+0001D42D><U+0001D421><U+0001D428><U+0001D42C><U+0001D41E> <U+0001D430><U+0001D421><U+0001D428><U+0001D42F><U+0001D41E> <U+0001D41D><U+0001D428><U+0001D427><U+0001D41E> <U+0001D41B><U+0001D41A><U+0001D41D> <U+0001D42D><U+0001D428> <U+0001D42E><U+0001D42C> amp <U+0001D42D><U+0001D421><U+0001D41E><U+0001D432> <U+0001D41A><U+0001D42B><U+0001D41E> <U+0001D429><U+0001D41A><U+0001D432><U+0001D422><U+0001D427><U+0001D420> <U+0001D42D><U+0001D421><U+0001D41E> <U+0001D429><U+0001D42B><U+0001D422><U+0001D41C><U+0001D41E> \n\n \n"
### nchar("simple text sample") ### Number of characters
### strsplit("simple text sample",' ') ### Splitting a string
### substr("simple text sample",1,3) ### Extracting a substring from a string
### paste("sample","text","sample", sep=' ') ### Concatenating strings
### tolower("SimPle TeXt sAmpLe") ### Converting letters to lower or upper-case
### toupper("SimPle TeXt sAmpLe") ### Converting letters to lower or upper-case
require(httr) # load the required library
In the second project, we use pattern-matching functions to build a web scraper that crawls the press release headlines from the Government Information Service website.
### Read today's Hong Kong GIS page and pull out the press-release titles
# Download the page and take the response body as a single text string
today_gis <- content(
  GET("https://www.info.gov.hk/gia/general/today.htm"),
  as = "text"
)
# Each title sits between '.htm">' and the first following '</a></li>'
all_news_title <- str_extract_all(today_gis, '.htm">.+?</a></li>')[[1]]
# Strip, in order: the leading link remainder, the closing tags, span tags
all_news_title <- Reduce(
  function(titles, markup) gsub(markup, "", titles),
  c('.htm">', "</a></li>", "<span>|</span>"),
  all_news_title
)
head(all_news_title)
## [1] "SWD launches Child Care Centre Special Grant"
## [2] "Public hospitals daily update on COVID-19 cases"
## [3] "SCMA explains to ethnic minorities measures to fight COVID-19"
## [4] "Import of poultry meat and products from areas in Poland and Vietnam suspended"
## [5] "Public urged not to buy or consume slimming product with undeclared Western drug ingredients (with photos)"
## [6] "Announcement by Judiciary"
#
# Create a function for title extraction
#
#' Extract press-release headlines from a GIS index page
#'
#' Downloads the page at `url`, finds every fragment that runs from
#' `.htm">` to the next `</a></li>`, and strips the surrounding markup.
#'
#' @param url URL of the page to download (character scalar).
#' @return Character vector with one headline per matched list item;
#'   zero-length when the page contains no match.
title_extraction <- function(url) {
  page_html <- content(GET(url), as = "text")
  # Lazy ".+?" stops at the first closing "</a></li>", so several list items
  # that share one line are captured separately instead of being merged into
  # a single match; the dot before "htm" is escaped to match a literal ".".
  all_news_title <- str_extract_all(page_html, '\\.htm">.+?</a></li>')[[1]]
  all_news_title <- gsub('.htm">', "", all_news_title, fixed = TRUE)
  all_news_title <- gsub("</a></li>", "", all_news_title, fixed = TRUE)
  all_news_title <- gsub("<span>|</span>", "", all_news_title)
  # Return the vector explicitly rather than as an invisible assignment value
  all_news_title
}
# Every calendar day from 1 May 2019 through 31 December 2019
d_seq <- seq(
  from = as.Date("2019-05-01"),
  to = as.Date("2019-12-31"),
  by = "day"
)
# Render each date as "YYYYMM/DD", the form the page URLs are built from
date_seq <- format(d_seq, format = "%Y%m/%d")
#
# Headline containing CE/FS/CS?
#
# For every day in date_seq, download that day's headline page and count the
# headlines matching each pattern. The per-day rows are stored in a
# preallocated list and bound together once after the loop, instead of
# growing the data frame with rbind() on every iteration.
daily_counts <- vector("list", length(date_seq))
for (i in seq_along(date_seq)) {
  day <- date_seq[[i]]
  url <- paste0("http://www.info.gov.hk/gia/general/", day, ".htm")
  day_nt <- title_extraction(url)
  # Abbreviations are matched as separate space-delimited words (or at the
  # start/end of the headline); the spelled-out title is matched anywhere.
  ce <- sum(grepl('^CE | CE | CE$|Chief Executive', day_nt))
  fs <- sum(grepl('^FS | FS | FS$|Financial Secretary', day_nt))
  cs <- sum(grepl('^CS | CS | CS$|Chief Secretary', day_nt))
  pf <- sum(grepl('HKPF|Police', day_nt))
  rr <- sum(grepl('rioter|Rioter', day_nt))
  daily_counts[[i]] <- data.frame(
    day = day, ce = ce, fs = fs, cs = cs, pf = pf, rr = rr
  )
  Sys.sleep(2) # wait two seconds before the next request
}
# Seeding with an empty data frame keeps the result a data frame even when
# date_seq is empty.
headline_dataset <- do.call(rbind, c(list(data.frame()), daily_counts))
Aggregate daily data into weekly data
# Tag each daily row with its week-of-year number
headline_dataset$week <- as.integer(strftime(d_seq, "%W"))
# Sum every counter per week and stack the five results in long form
# (columns: week, count), in the order ce, cs, fs, pf, rr
headline_weekly <- do.call(
  rbind,
  lapply(
    c("ce", "cs", "fs", "pf", "rr"),
    function(counter) {
      setNames(
        aggregate(
          reformulate("week", response = counter),
          headline_dataset,
          sum
        ),
        c("week", "count")
      )
    }
  )
)
# Label the five equally sized stacked segments in the same order
headline_weekly$grp <- rep(
  c("CE", "CS", "FS", "PF", "RR"),
  each = nrow(headline_weekly) / 5
)
The weekly headline mentions of CE, CS, FS, PF, and RR are plotted as lines.
# Line chart of the weekly counts, with traces named by group
p <- plot_ly(
  data = headline_weekly,
  x = ~week,
  y = ~count,
  name = ~grp,
  type = 'scatter',
  mode = 'lines'
)
p <- layout(
  p,
  title = "GIS Headline Mention of CE/CS/FS/Police/Rioter",
  xaxis = list(title = "Week"),
  yaxis = list(title = "Number of hits")
)
p
Here is an example of animation using “markers”.
# Marker chart with one animation frame per week
q <- plot_ly(
  data = headline_weekly,
  x = ~week,
  y = ~count,
  name = ~grp,
  frame = ~week,
  type = 'scatter',
  mode = 'markers',
  marker = list(size = 10)
)
# Pin both axes to the full data range (range() yields c(min, max))
q <- layout(
  q,
  title = "GIS Headline Mention of CE/CS/FS/Police/Rioter (Animation)",
  xaxis = list(title = "Week", range = range(headline_weekly$week)),
  yaxis = list(title = "Number of hits", range = range(headline_weekly$count))
)
q <- animation_opts(q, easing = "linear", redraw = FALSE)
q