A brief demonstration of Tweet extraction and sentiment analysis
A search was done on a specific hashtag #bitcoin and 1000 tweets retrieved
A sentiment analysis was then performed on the tweets
Fri Dec 22 12:54:33 2017
R version 3.3.3 (2017-03-06) - Another Canoe - svn 72310
Package | Version |
---|---|
dplyr | 0.7.4 |
ggplot2 | 2.2.1 |
httr | 1.3.1 |
ROAuth | 0.9.6 |
stringr | 1.2.0 |
twitteR | 1.1.9 |
Minqing Hu and Bing Liu. "Mining and Summarizing Customer Reviews." Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2004), Aug 22-25, 2004, Seattle, Washington, USA
(Additional opinion words were added to the lists)
API keys and secrets assigned to variables in code below, but not displayed for security reasons
# Authenticate this session against the Twitter API. The four credential
# variables are assigned in code that is deliberately not displayed in this
# document.
setup_twitter_oauth(consumer_key = twitterAPIKey,
consumer_secret = twitterAPISecret,
access_token = twitterAccessToken,
access_secret = twitterTokenSecret)
## [1] "Using direct authentication"
Set the search string to #bitcoin, the number of tweets to 1000, and the language to English.
# Hashtag to search for
searchString <- "#bitcoin"
# Number of tweets to request
numberOfTweets <- 1000
# Run the search, restricted to English-language tweets
tweets <- searchTwitter(searchString, n = numberOfTweets, lang="en")
Save tweets to data frame
# Convert the list of tweets returned by the search into a data frame
tweetsDF <- twListToDF(tweets)

# Work on a copy, and encode the text to the native encoding
x <- tweetsDF
x$text <- enc2native(x$text)

# Extract the first URL found in each tweet (NA when there is none). This is
# done before cleaning because the cleaning below strips URLs from the text.
# The pattern is assembled with paste0() so that no line break ends up inside
# the regular expression itself.
url_pattern <- paste0(
  "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|",
  "(?:%[0-9a-fA-F][0-9a-fA-F]))+"
)
x$contentURL <- str_extract(x$text, url_pattern)

# Clean content of text
x$text <- gsub("^[[:space:]]*", "", x$text) # Remove leading whitespace
x$text <- gsub("[[:space:]]*$", "", x$text) # Remove trailing whitespace
x$text <- gsub(" +", " ", x$text) # Collapse runs of spaces
# Apostrophes are swapped for a placeholder while the text is converted to
# ASCII, and restored further down.
x$text <- gsub("'", "%%", x$text)
x$text <- iconv(x$text, "latin1", "ASCII", sub = "") # Drop non-ASCII bytes
# Remove escaped code points such as <U+A>. The match stops at the first ">"
# so that ordinary text between two such markers is kept.
x$text <- gsub("<[^>]*>", "", x$text)
x$text <- gsub("\\ \\. ", " ", x$text) # Replace orphaned full stops
x$text <- gsub(" {2,}", " ", x$text) # Replace repeated spaces with one
x$text <- gsub("%%", "'", x$text) # Restore apostrophes
# Remove URLs (including fragments cut off by truncation) but keep any words
# that follow them.
x$text <- gsub("https[^[:space:]]*", "", x$text)
# Tidy up the gaps left behind by the removals above
x$text <- gsub(" {2,}", " ", x$text)
x$text <- trimws(x$text)
x$text <- gsub("\\n", "-", x$text) # Replace line breaks with -
x$text <- gsub("--", "-", x$text) # Collapse - pairs from double line breaks
x$text <- gsub("&amp;", "&", x$text, fixed = TRUE) # Decode HTML ampersands

# Flag tweets that have no text left after cleaning
x$text[!nzchar(trimws(x$text))] <- "<no text>"

# Mark truncated tweets with a trailing ellipsis. %in% treats a missing
# truncated flag as "not truncated", and the vectorised form is also safe
# when the data frame has no rows.
isTruncated <- x$truncated %in% TRUE
x$text[isTruncated] <- sub("[[:space:]]*$", "...", x$text[isTruncated])
# Keep only the columns used in the rest of the analysis, then preview the
# first few rows
cleanTweets <- select(
  x,
  text, contentURL, favorited, favoriteCount,
  created, truncated, id, screenName,
  retweetCount, isRetweet, retweeted
)
head(cleanTweets)
## text
## 1 Majority of the $crypto markets are down today. You can do one of many things.-Drink bleach-Panic sell-HODL-Buy cr...
## 2 RT @zloadr: #Bitcoin mainstream #Adoption amplifies as #NYSE confirms launch of Bitcoin #ETF -#trading #finance #stocks -
## 3 RT @crypto_Pickle: $LINK Lets board this ship together FAM #Bitcoin #crypto #Shitcoins-The coin to #LINK every business #mewn
## 4 RT @Schuldensuehner: First stress test for #Bitcoin Future as underlying drops like a stone!
## 5 RT @FGordillo: Bitcoin and almost every other cryptocurrency crashed hard today
## 6 RT @Fisher85M: The #Blockchain Ecosystem {Infographic}-[@evankirstel] #MT-#fintech #banking #cryptocurrency #bitcoin #XRP #FinServ #AI #ML
## contentURL favorited favoriteCount created
## 1 https://t.co/rTWU8ATy9y FALSE 0 2017-12-22 10:54:20
## 2 https://t.co/aOx5 FALSE 0 2017-12-22 10:54:20
## 3 https://t. FALSE 0 2017-12-22 10:54:19
## 4 https://t.co/vfydaJi4AW FALSE 0 2017-12-22 10:54:18
## 5 https://t.co/LTpKYg11JG FALSE 0 2017-12-22 10:54:16
## 6 <NA> FALSE 0 2017-12-22 10:54:15
## truncated id screenName retweetCount isRetweet
## 1 TRUE 944159163764101120 DeCryptografter 0 FALSE
## 2 FALSE 944159160253460480 freakycrytpo 8 TRUE
## 3 FALSE 944159157254373376 IsufAmeti 12 TRUE
## 4 FALSE 944159153244852224 nantran2 56 TRUE
## 5 FALSE 944159146299088896 JacekSalaj 3 TRUE
## 6 FALSE 944159141727174656 itknowingness 110 TRUE
## retweeted
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
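Run through the tweets: extract the text, split it into words, flatten the resulting list, match the words against the positive and negative word lists, and score each tweet as the number of positive matches minus the number of negative matches.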
# Score a single tweet: lower-case the text, split it on whitespace, and
# return the number of words found in positiveWords minus the number found in
# negativeWords.
scoreTweet <- function(tweetText) {
  words <- unlist(str_split(tolower(tweetText), "\\s+"))
  sum(words %in% positiveWords) - sum(words %in% negativeWords)
}

# Score every tweet in one pass. vapply() checks that each score is a single
# integer and returns an empty vector when there are no tweets, where a
# 1:nrow() loop would wrongly iterate over c(1, 0).
cleanTweets$sentimentScore <- vapply(
  cleanTweets$text,
  scoreTweet,
  integer(1),
  USE.NAMES = FALSE
)
# Columns used for the plot
plotData <- cleanTweets[c("text", "sentimentScore", "favoriteCount")]

# Axis labels and title; the x label also reports the mean sentiment score
xLabel <- paste0("Sentiment Score. Mean sentiment: ",
                 round(mean(cleanTweets$sentimentScore), 2))
yLabel <- paste0("Number of Tweets (", nrow(cleanTweets), ")")
graphTitle <- paste0("Twitter Sentiment Analysis of ", searchString)

# Bar chart of the number of tweets at each sentiment score, one colour per
# score. Built with ggplot() + geom_bar() rather than qplot(), which is
# deprecated in current ggplot2 releases; the resulting chart is the same.
ggplot(plotData, aes(x = factor(sentimentScore),
                     fill = factor(sentimentScore))) +
  geom_bar() +
  labs(x = xLabel, y = yLabel, title = graphTitle) +
  theme(legend.position = "none")