library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(rtweet)
library(ggplot2)
library(igraph)
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
library(devtools)
install_github("dgrtwo/widyr")
## Skipping install of 'widyr' from a github remote, the SHA1 (d2a659e6) has not changed since last install.
## Use `force = TRUE` to force installation
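One small, optional refinement (a sketch, not part of the original workflow): the GitHub install could be guarded so it only runs when widyr is actually missing, which keeps repeated knits fast.
#Sketch: only install widyr from GitHub if it is not already available
if (!requireNamespace("widyr", quietly = TRUE)) {
  devtools::install_github("dgrtwo/widyr")
}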
library(widyr)
library(maps)
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
tweet_query <- "#bitcoin + #Bitcoin + bitcoin + Bitcoin + #BTC + #btc + #Btc"
tweets_raw <- search_tweets(tweet_query, n = 18000, lang = "en",
                            include_rts = FALSE, retryonratelimit = TRUE,
                            since = "2018-12-28")
## Searching for tweets...
## This may take a few seconds...
## Finished collecting tweets!
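Collecting 18,000 tweets takes a while and is subject to rate limits, so it can help to cache the raw result and reload it on later runs. A minimal sketch; the file name bitcoin_tweets.rds is just an illustration.
#Sketch: cache the raw tweets so the search doesn't have to be repeated
saveRDS(tweets_raw, "bitcoin_tweets.rds")
#On a later run, reload instead of hitting the API again:
#tweets_raw <- readRDS("bitcoin_tweets.rds")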
tweets <- tweets_raw %>%
  select(screen_name, text)
Let’s preview some tweets from the top tweeters and decide whether we should remove bots.
top_tweeters <- tweets %>%
  count(screen_name) %>%
  arrange(desc(n)) %>%
  filter(n > 50)
sample_top_tweets <- inner_join(tweets, top_tweeters, by = "screen_name")
print(sample_top_tweets)
## # A tibble: 948 x 3
## screen_name text n
## <chr> <chr> <int>
## 1 ElixiumCrypto "When you finally mine 0,0003 Bitcoin after one mo… 60
## 2 ElixiumCrypto "Every GPU's worst nightmare\n\n👉 https://t.co/HQa… 60
## 3 ElixiumCrypto "Working Hard or Hardly Working?\n\n👉 https://t.co… 60
## 4 ElixiumCrypto "This is governments trying to regulate bitcoin.\n… 60
## 5 ElixiumCrypto "Amazing bitcoin billboard near LAX!\n\n👉 https://… 60
## 6 ElixiumCrypto "There is no CEO. That's the point.\n\n👉 https://t… 60
## 7 ElixiumCrypto "Total Crypto MCap has had a strong rally today. 🔥… 60
## 8 ElixiumCrypto "Looking at the price when you invested only what … 60
## 9 ElixiumCrypto "Me in 60 years\n\n👉 https://t.co/6Abt3c8Qd8 👀\n\n… 60
## 10 ElixiumCrypto "This article is the metaphorical money-shot of my… 60
## # ... with 938 more rows
The top tweeter is ohiobitcoin, which doesn’t appear to be a bot: its tweets carry substantive information rather than simple alerts. We’ll leave the top tweeters in for now; a sketch of how they could be dropped follows.
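If we later decide the high-volume accounts are bots, they could be dropped with an anti-join against top_tweeters, using the 50-tweet threshold already defined above.
#Optional sketch (not run here, since we keep the top tweeters):
#tweets <- anti_join(tweets, top_tweeters, by = "screen_name")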
Next, create a new column, “word”, by splitting each tweet into individual words, then strip special characters and blanks.
#Reshape the tweet column into one row per word, indexed on username
tweets <- tweets %>%
  mutate(word = strsplit(as.character(text), " ")) %>%
  unnest(word)

#Remove URLs, retweet/via markers, mentions, and HTML entities first,
#then punctuation, digits, and stray whitespace
tweets$word <- gsub("http\\S+", "", tweets$word)
tweets$word <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", tweets$word)
tweets$word <- gsub("@\\w+", "", tweets$word)
tweets$word <- gsub("&amp;", "", tweets$word)
tweets$word <- gsub("[[:punct:]]", "", tweets$word)
tweets$word <- gsub("[[:digit:]]", "", tweets$word)
tweets$word <- gsub("[ \t]{2,}", "", tweets$word)
tweets$word <- gsub("^\\s+|\\s+$", "", tweets$word)
data("stop_words")
#Convert all words to lowercase for the stop-words anti-join
tweets$word <- tolower(tweets$word)
cleaned_tweet_words <- anti_join(tweets, stop_words, by = "word")
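As an aside, tidytext’s unnest_tokens() offers a more compact alternative to the manual strsplit/gsub cleanup above: it lowercases and strips punctuation as it tokenizes. A sketch under that assumption; the manual approach is what the rest of this analysis uses.
#Sketch: tokenize with tidytext instead of manual regex cleanup
tweet_words_alt <- tweets_raw %>%
  select(screen_name, text) %>%
  mutate(text = gsub("http\\S+", "", text)) %>%  #strip URLs first
  unnest_tokens(word, text) %>%                  #lowercases and drops punctuation
  anti_join(stop_words, by = "word")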
cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(35) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
## Selecting by n
#Get list of top words
top_words <- cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(100) %>%
  mutate(word = reorder(word, n)) %>%
  select(word)
## Selecting by n
cleaned_tweet_words <- anti_join(cleaned_tweet_words, top_words, by = "word")
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
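The coercion warning above comes from reorder() turning word into a factor inside top_words; since the column is only used for the anti-join, it can be converted back to character (or the reorder() step skipped). A sketch, meant to go before the anti_join on a future run:
#Sketch: keep word as character so anti_join doesn't have to coerce a factor
top_words <- mutate(top_words, word = as.character(word))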
## create lat/lng variables using all available tweet and profile geo-location data
tweets_raw <- lat_lng(tweets_raw)
## plot world boundaries
par(mar = c(0, 0, 0, 0))
maps::map("world", lwd = .25)
## plot lat and lng points onto world map
with(tweets_raw, points(lng, lat, pch = 20, cex = .75, col = rgb(0, .3, .7, .75)))
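Since ggplot2 is already loaded, the same lat/lng points could also be drawn with ggplot() and borders(), which makes the map easier to style or facet later. A minimal sketch under that assumption:
#Sketch: plot geocoded tweets with ggplot2 instead of base graphics
ggplot(tweets_raw, aes(x = lng, y = lat)) +
  borders("world", colour = "gray85", fill = "gray95") +
  geom_point(colour = "steelblue", alpha = 0.5, size = 1) +
  theme_void()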
#NRC tags words with eight emotions plus positive/negative
nrc <- get_sentiments("nrc")
#Bing has only two classes: positive/negative
bing <- get_sentiments("bing")
#Join each sentiment lexicon onto the tweet words
tweet_sentiment_nrc <- inner_join(tweets, nrc, by = "word")
tweet_sentiment_bing <- inner_join(tweets, bing, by = "word")
tweet_sentiment_nrc %>%
  count(word, sort = TRUE) %>%
  top_n(35) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Word",
       y = "Count",
       title = "Top Bitcoin Twitter Words (NRC lexicon) for 12-28-2018")
## Selecting by n
tweet_sentiment_nrc %>%
  count(sentiment, sort = TRUE) %>%
  top_n(35) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Sentiment",
       y = "Count",
       title = "Bitcoin Twitter Sentiment (NRC lexicon) for 12-28-2018")
## Selecting by n
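To see which words drive each NRC emotion, the word and sentiment counts can be combined and faceted. A sketch along the same lines as the plots above:
#Sketch: top contributing words per NRC sentiment
tweet_sentiment_nrc %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  top_n(8, n) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  facet_wrap(~ sentiment, scales = "free_y") +
  labs(x = "Word", y = "Count",
       title = "Top words per NRC sentiment for 12-28-2018")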
tweet_sentiment_bing %>%
  count(word, sort = TRUE) %>%
  top_n(35) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Word",
       y = "Count",
       title = "Top Bitcoin Twitter Words (Bing lexicon) for 12-28-2018")
## Selecting by n
tweet_sentiment_bing %>%
  count(sentiment, sort = TRUE) %>%
  top_n(35) %>%
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Sentiment",
       y = "Count",
       title = "Bitcoin Twitter Sentiment (Bing lexicon) for 12-28-2018")
## Selecting by n
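Finally, because the Bing lexicon is binary, a single net score (positive minus negative word counts) gives a quick summary of the day's tone. A minimal sketch using tidyr::spread, which is already loaded:
#Sketch: net Bing sentiment = positive word count minus negative word count
bing_score <- tweet_sentiment_bing %>%
  count(sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(net_sentiment = positive - negative)
bing_score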