Source: Piqsels.com

require(plotly)  # load the required libraries
require(rtweet)  
require(stringr)

1. Tweet Mining

First, we conduct a hashtag search for “#COVIDVaccine” on Twitter, requesting 2000 tweets (no retweets) in English. The pattern matching functions are then demonstrated one by one as examples.

# Twitter search: request 2000 English-language tweets tagged #COVIDVaccine,
# leaving retweets out.
coronavirues <- search_tweets(
  q = "#COVIDVaccine",
  n = 2000,
  include_rts = FALSE,
  lang = "en"
)
# Keep the tweet text on its own for the pattern-matching examples.
coronavirues.text <- coronavirues[["text"]]

### Which tweets contain a URL? (grep returns the positions of the matches)
head(grep(pattern = "https", x = coronavirues.text))

## [1] 1 2 3 4 5 7

### Does each tweet contain a URL? (grepl returns one TRUE/FALSE per tweet)
head(grepl(pattern = "https", x = coronavirues.text))

## [1]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE

### Display all those matched (value = TRUE returns the text, not the position)
head(grep(pattern = "https", x = coronavirues.text, value = TRUE))

## [1] "Five centres will conduct vaccination drive 24*7 in #Gurgaon \nPrivate hospitals will be holding drive all days in a week from 9am to 5pm\n#CovidVaccine #VaccinationDrive \n@ICMRDELHI @DC_Gurugram \n@cmohry https://t.co/Zrgkiv0KRi"                                                                        
## [2] "@GregAbbott_TX COVID-19 has already taken the lives of over 44,000 people in Texas. Only 7% of the state’s population is fully vaccinated as of today. It's not Biden but you put Texas folks in danger. #COVID19 #COVIDIOT #CovidVaccine #TexasMaskMandate #Texas https://t.co/NRvu3oe9yc"                    
## [3] "@GregAbbott_TX COVID-19 has already taken the lives of over 44,000 people in Texas. Only 7% of the state’s population is fully vaccinated as of today. It's not Biden but you put Texas folks in danger. #COVID19 #COVIDIOT #CovidVaccine #TexasMaskMandate #Texas https://t.co/vDGtULFS0M"                    
## [4] "COVID-19 has already taken the lives of over 44,000 people in Texas. Only 7% of the state’s population is fully vaccinated as of today. It's not Biden but you put Texas folks in danger. #COVID19 #COVIDIOT #CovidVaccine #TexasMaskMandate #TexasMaskMandate https://t.co/hjg6jrW2xJ https://t.co/FLdMDGp0IX"
## [5] "Man dies after taking second dose of #COVID-19 vaccine in Thane.\n\n#CovidVaccine <U+0001F644>\n\nhttps://t.co/cFIb42AG58"                                                                                                                                                                                     
## [6] "New @NPF #COVID19 TF recs for Ad26.COV2.S #CovidVaccine . Take 1st  #vaccine offered. For #johnsonandjohnson 1 dose vax, age 60+ w 1+  comorbidity, consider holding #methotrexte x 2 weeks post #immunization if #psoriasis #psoriatic #arthritis are well-controlled @AADmember https://t.co/oFz1NCZOEB"

### Which tweets do NOT contain a URL? (invert = TRUE flips the selection)
head(grep(pattern = "https", x = coronavirues.text, invert = TRUE))

## [1]  6  8 11 17 30 32

### Logical or/and
# Positions of the tweets mentioning "side effect", ignoring letter case.
grep(pattern = "side effect", x = coronavirues.text, ignore.case = TRUE)

##  [1]   82  114  169  258  317  445  467  471  522  524  652  677  731  839  949
## [16] 1043 1071 1134 1212 1231 1255 1337 1363 1398 1627 1682 1728 1761 1791 1806
## [31] 1808 1842 1966

# Positions of the tweets mentioning "fever", ignoring letter case.
grep(pattern = "fever", x = coronavirues.text, ignore.case = TRUE)

## [1]    6   34   36  121 1505 1880

### Mentioning side effect OR fever (alternation with | inside one pattern)
head(grepl(pattern = "side effect|fever", x = coronavirues.text, ignore.case = TRUE))

## [1] FALSE FALSE FALSE FALSE FALSE  TRUE

### Mentioning side effect AND fever (two logical vectors combined with &)
mentions_side_effect <- grepl("side effect", coronavirues.text, ignore.case = TRUE)
mentions_fever <- grepl("fever", coronavirues.text, ignore.case = TRUE)
head(mentions_side_effect & mentions_fever)

## [1] FALSE FALSE FALSE FALSE FALSE FALSE

### Use grep/grepl as an index vector
# Screen names of the accounts whose tweet mentions either term.
side_effect_or_fever <- grepl("side effect", coronavirues.text, ignore.case = TRUE) |
  grepl("fever", coronavirues.text, ignore.case = TRUE)
coronavirues$screen_name[side_effect_or_fever]

##  [1] "DrEstherFreeman" "RelandHe"        "kelslanae"       "Rainbow_Ark"    
##  [5] "IshaxIsha"       "sommeronearth"   "DGoing5"         "lynngoheels"    
##  [9] "Anstald"         "JXDirector"      "ecarltonOT"      "mbebinger"      
## [13] "Teresa_FTW"      "muslimdoccymru"  "LauraIsNora"     "PSdxb"          
## [17] "justpastmidlife" "kenkendavies"    "KeiroConnect"    "jekawwass"      
## [21] "ScienceVsFN"     "Fastfashion25"   "JimPenceChalk"   "Jessica_1174"   
## [25] "debzella"        "SandroIsaack"    "ConsumerLab"     "covidtestskit"  
## [29] "SarahBethNews"   "Salamanderfs"    "rk_arvind99"     "b_auntie"       
## [33] "NotTheNewz"      "gypcy_2"         "NHSLincsCCG"     "LincsNHS"       
## [37] "ArunRane9"       "SakshiPost"      "IAmElizabethIta"

###
# Regular Expression - 
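# . - any single character
# + - one or more of the preceding item
# * - zero or more of the preceding item
# ? - zero or one of the preceding item; placed after + or * it makes the match non-greedy
# $ - end of line
# [] - any one of the characters within the brackets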
# | - or
#

### Extract the urls
# Inside [...] the characters "(", "|" and ")" are literals, not grouping or
# alternation, so listing them would cut a URL short at a parenthesis or pipe.
# The class therefore names only the two delimiters that end a URL here:
# a space or a newline.
tweet_url <- str_extract_all(coronavirues.text, 'http[^ \n]+')
tweet_url <- unlist(tweet_url)
head(tweet_url)

## [1] "https://t.co/Zrgkiv0KRi" "https://t.co/NRvu3oe9yc"
## [3] "https://t.co/vDGtULFS0M" "https://t.co/hjg6jrW2xJ"
## [5] "https://t.co/FLdMDGp0IX" "https://t.co/cFIb42AG58"

### Top 10 Most Mentioned @
# A mention is "@" followed by letters, digits or underscores.
tweet_mention <- str_extract_all(coronavirues.text, '@[0-9a-zA-Z_]+')
tweet_mention <- unlist(tweet_mention)
# head() returns at most 10 entries; indexing with [1:10] would pad the result
# with NA when fewer than 10 distinct mentions exist.
top10mention <- head(sort(table(tweet_mention), decreasing = TRUE), 10)
top10mention

## tweet_mention
##    @DollyParton @GovRonDeSantis    @MoHFW_INDIA    @cvspharmacy          @POTUS 
##              38              22              21              20              18 
##         @CDCgov    @nyphospital @drharshvardhan      @ArmoryNYC      @CVSHealth 
##              17              15              14              13              13

## Plot
# Bar chart of the mention counts held in top10mention. The title carries the
# hashtag used in the Twitter search (#COVIDVaccine).
p <- plot_ly(x = names(top10mention), y = top10mention, name = "Top 10 Mention", type = 'bar')
p <- layout(p, title = "Top 10 Mention (#COVIDVaccine)", xaxis = list(title = "@Name"), yaxis = list(title = "Frequency"))
p

### Top 10 Most Popular Hashtag #
# A hashtag is "#" followed by letters, digits or underscores.
tweet_hashtag <- str_extract_all(coronavirues.text, '#[0-9a-zA-Z_]+')
tweet_hashtag <- unlist(tweet_hashtag)
# head() returns at most 10 entries; indexing with [1:10] would pad the result
# with NA when fewer than 10 distinct hashtags exist.
top10hashtag <- head(sort(table(tweet_hashtag), decreasing = TRUE), 10)
top10hashtag

## tweet_hashtag
##     #CovidVaccine          #COVID19     #COVIDVaccine            #COVID 
##              1501               292               242               181 
##     #covidvaccine          #vaccine   #COVID19Vaccine     #COVIDvaccine 
##               164                99                85                76 
##      #coronavirus #COVIDVaccination 
##                68                47

### To Lower
# Fold the hashtags to lower case before counting so that spellings differing
# only in letter case are tallied together. head() avoids the NA padding that
# [1:10] produces when fewer than 10 distinct hashtags exist.
top10hashtag <- head(sort(table(tolower(tweet_hashtag)), decreasing = TRUE), 10)
top10hashtag

## 
##        #covidvaccine             #covid19               #covid 
##                 2008                  364                  229 
##             #vaccine      #covid19vaccine         #coronavirus 
##                  124                   95                   88 
##    #covidvaccination            #covid_19  #covid19vaccination 
##                   53                   50                   44 
## #largestvaccinedrive 
##                   40

## Plot
# Bar chart of the hashtag counts held in top10hashtag. The title carries the
# hashtag used in the Twitter search (#COVIDVaccine).
p <- plot_ly(x = names(top10hashtag), y = top10hashtag, name = "Top 10 Hashtag", type = 'bar')
p <- layout(p, title = "Top 10 Hashtag (#COVIDVaccine)", xaxis = list(title = "#Hashtag"), yaxis = list(title = "Frequency"))
p

### Remove all URLs, mentions, hashtags, 'RT' markers, punctuation and newlines
# URLs: inside [...] the characters "(", "|" and ")" are literals, so the class
# lists only the real delimiters (space and newline).
coronavirues.text <- gsub('http[^ \n]+', '', coronavirues.text)
coronavirues.text <- gsub('#[0-9a-zA-Z_]+', '', coronavirues.text)
coronavirues.text <- gsub('@[0-9a-zA-Z_]+', '', coronavirues.text)
# \\b limits the match to a stand-alone "RT"; a bare 'RT' pattern would also
# strip those two letters out of words such as "ALERT".
coronavirues.text <- gsub('\\bRT\\b', '', coronavirues.text)
coronavirues.text <- gsub('[[:punct:]]', '', coronavirues.text)
# Replace each newline with a space so that the words on either side of a line
# break are not glued together.
coronavirues.text <- gsub('\n', ' ', coronavirues.text)
### Remove leading and ending whitespace
coronavirues.text <- trimws(coronavirues.text)
head(coronavirues.text)

## [1] "Five centres will conduct vaccination drive 247 in  Private hospitals will be holding drive all days in a week from 9am to 5pm"                                                                                                          
## [2] "COVID19 has already taken the lives of over 44000 people in Texas Only 7 of the state’s population is fully vaccinated as of today Its not Biden but you put Texas folks in danger"                                                      
## [3] "COVID19 has already taken the lives of over 44000 people in Texas Only 7 of the state’s population is fully vaccinated as of today Its not Biden but you put Texas folks in danger"                                                      
## [4] "COVID19 has already taken the lives of over 44000 people in Texas Only 7 of the state’s population is fully vaccinated as of today Its not Biden but you put Texas folks in danger"                                                      
## [5] "Man dies after taking second dose of 19 vaccine in Thane <U+0001F644>"                                                                                                                                                                   
## [6] "Agree completely You can’t have a vaccine reaction if you don’t have access to the vaccine Listened to a fantastic presentation by  today about racial inequities in  access Actively soliciting vaccine reaction images from  community"

### nchar("simple text sample")  ### Number of characters
### strsplit("simple text sample",' ')  ### Splitting a string
### substr("simple text sample",1,3) ### Extracting a substring from a string
### paste("sample","text","sample", sep=' ')  ### Concatenating strings
### tolower("SimPle TeXt sAmpLe")  ### Converting letters to lower or upper-case
### toupper("SimPle TeXt sAmpLe")  ### Converting letters to lower or upper-case

2. Web scraping

require(httr) # load the required library

In the second project, we use pattern matching functions to develop a simple web scraper that crawls the press release headlines from the Government Information Service website.

### Read Today's Hong Kong's GIS
# Download today's index page and keep its body as text.
gis_response <- GET("https://www.info.gov.hk/gia/general/today.htm")
today_gis <- content(gis_response, as = "text")
## Extract the titles from the web page: everything from '.htm">' up to the
## nearest '</a></li>'.
all_news_title <- str_extract_all(today_gis, '.htm">.+?</a></li>')[[1]]
# Strip, in this order, the heading fragment, the tailing tags and any span tags.
for (markup in c('.htm">', '</a></li>', '<span>|</span>')) {
  all_news_title <- gsub(markup, '', all_news_title)
}
head(all_news_title)

## [1] "Office of The Ombudsman announces results of direct investigation into&nbsp;Government&#39;s mechanism for monitoring vaccines provided by&nbsp;private healthcare facilities"
## [2] "Lands Department achieves notable results of enhanced land control and enforcement"                                                                                           
## [3] "Key statistics on service demand of A&amp;E Departments and occupancy rates in public hospitals"

#' Extract the headlines from a GIS index page
#'
#' Downloads `url`, extracts every piece of text running from `.htm">` to the
#' nearest `</a></li>`, and strips that surrounding markup together with any
#' `<span>` tags.
#'
#' @param url Address of the page to download.
#' @return A character vector with one element per extracted headline
#'   (length zero when nothing matches).
title_extraction <- function(url){
  nt_gis <- GET(url)
  nt_gis <- content(nt_gis,as="text")
  # Non-greedy `.+?` stops at the first closing tag, so two headlines that
  # share a line are kept as separate matches instead of being merged.
  all_news_title <- str_extract_all(nt_gis,'.htm">.+?</a></li>')[[1]]
  all_news_title <- gsub('.htm">','',all_news_title)
  all_news_title <- gsub('</a></li>','',all_news_title)
  all_news_title <- gsub('<span>|</span>','',all_news_title)
  # Return the cleaned titles visibly rather than as the invisible value of
  # the last assignment.
  all_news_title
}

# Daily date sequence from 1 January 2020 to 30 June 2020 (inclusive)
d_seq <- seq(from = as.Date("2020-01-01"), to = as.Date("2020-06-30"), by = 1)

# Render each date as "YYYYMM/DD", the path fragment used in the web url
date_seq <- format(d_seq, "%Y%m/%d")

#
# Headline containing CE/FS/CS/SFH/SCS?
#
# One pattern per office: the abbreviation at the start, in the middle or at
# the end of a headline (delimited by spaces), or the full title.
office_patterns <- c(
  ce = '^CE | CE | CE$|Chief Executive',
  fs = '^FS | FS | FS$|Financial Secretary',
  cs = '^CS | CS | CS$|Chief Secretary',
  sfh = '^SFH | SFH | SFH$|Secretary for Food and Health',
  scs = '^SCS | SCS | SCS$|Secretary for the Civil Service'
)

# Build the one-row data frame of headline counts for a single day.
# `day` is one element of date_seq (the date part of the page address).
count_daily_mentions <- function(day) {
  url <- paste0("http://www.info.gov.hk/gia/general/", day, ".htm")
  day_nt <- title_extraction(url)
  hits <- vapply(
    office_patterns,
    function(pattern) sum(grepl(pattern, day_nt)),
    integer(1)
  )
  Sys.sleep(2)  # pause for two seconds between requests
  data.frame(day = day, as.list(hits))
}

# Collect every daily row first and bind them once; calling rbind() inside the
# loop copies the growing data frame again on every iteration.
daily_rows <- lapply(date_seq, count_daily_mentions)
headline_dataset <- if (length(daily_rows) > 0) {
  do.call(rbind, daily_rows)
} else {
  data.frame()
}

Aggregate the daily data into weekly data.

# Week number of each day, taken from the "%W" format of the date.
headline_dataset$week <- as.integer(strftime(d_seq, "%W"))

# Sum one count column of headline_dataset by week and return the result in
# long form: week, count, and a grp column holding `label`.
weekly_total <- function(column, label) {
  totals <- aggregate(reformulate("week", response = column), headline_dataset, sum)
  colnames(totals) <- c("week", "count")
  totals$grp <- label
  totals
}

headline_weekly_ce <- weekly_total("ce", "CE")
headline_weekly_cs <- weekly_total("cs", "CS")
headline_weekly_fs <- weekly_total("fs", "FS")
headline_weekly_sfh <- weekly_total("sfh", "SFH")
headline_weekly_scs <- weekly_total("scs", "SCS")

# Stack the five weekly tables into one long table.
headline_weekly <- rbind(
  headline_weekly_ce,
  headline_weekly_cs,
  headline_weekly_fs,
  headline_weekly_sfh,
  headline_weekly_scs
)

The headline mentions of CE, CS, FS, SFH, and SCS are plotted as lines.

# Line chart of the weekly counts, with traces named after the grp column.
p <- layout(
  plot_ly(
    data = headline_weekly,
    x = ~week,
    y = ~count,
    name = ~grp,
    type = 'scatter',
    mode = 'lines'
  ),
  title = "GIS Headline Mention of CE/CS/FS/SFH/SCS",
  xaxis = list(title = "Week"),
  yaxis = list(title = "Number of hits")
)
p

JMSC 6116 Lecture 5: Text Mining: Tweet Mining and Web Scraping

King-wa Fu

March 5, 2021

1. Tweet Mining

2. Web scraping