Initialization

Load Libraries

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(rtweet)
library(ggplot2)
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
library(devtools)
install_github("dgrtwo/widyr")
## Skipping install of 'widyr' from a github remote, the SHA1 (d2a659e6) has not changed since last install.
##   Use `force = TRUE` to force installation
library(widyr)
library(maps)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
## 
##     crossing

Import and Preprocess BTC Twitter Data

# Search query: match tweets that mention ANY of the bitcoin terms.
# Twitter search treats whitespace between terms as a boolean AND and only the
# upper-case keyword OR as a disjunction; "+" is not an operator, so the terms
# are joined with OR to express "any of these".
tweet_query <- paste(
  c("#bitcoin", "#Bitcoin", "bitcoin", "Bitcoin", "#BTC", "#btc", "#Btc"),
  collapse = " OR "
)

# Pull up to 18,000 English-language original tweets (retweets excluded),
# waiting out the rate limit if it is hit. `since` is forwarded unchanged as an
# extra request parameter.
tweets_raw <- search_tweets(
  tweet_query,
  n = 18000,
  lang = "en",
  include_rts = FALSE,
  retryonratelimit = TRUE,
  since = "2018-12-28"
)
## Searching for tweets...
## This may take a few seconds...
## Finished collecting tweets!
# Keep only the author handle and the tweet body for the text analysis.
tweets <- select(tweets_raw, screen_name, text)

Examine top crypto tweets

Let’s preview some tweets from the most active accounts. Should we remove bots?

# Accounts that posted more than 50 tweets in the sample, most active first.
# count() already groups by the column it is given, so a preceding group_by()
# is redundant and would leave the result grouped; sort = TRUE orders by
# descending n.
top_tweeters <- tweets %>%
  count(screen_name, sort = TRUE) %>%
  filter(n > 50)

# Keep only the tweets written by those accounts, carrying each account's
# tweet count (n) alongside, and print a preview.
sample_top_tweets <- inner_join(tweets, top_tweeters, by = "screen_name")
print(sample_top_tweets)
## # A tibble: 948 x 3
##    screen_name   text                                                    n
##    <chr>         <chr>                                               <int>
##  1 ElixiumCrypto "When you finally mine 0,0003 Bitcoin after one mo…    60
##  2 ElixiumCrypto "Every GPU's worst nightmare\n\n👉 https://t.co/HQa…    60
##  3 ElixiumCrypto "Working Hard or Hardly Working?\n\n👉 https://t.co…    60
##  4 ElixiumCrypto "This is governments trying to regulate bitcoin.\n…    60
##  5 ElixiumCrypto "Amazing bitcoin billboard near LAX!\n\n👉 https://…    60
##  6 ElixiumCrypto "There is no CEO. That's the point.\n\n👉 https://t…    60
##  7 ElixiumCrypto "Total Crypto MCap has had a strong rally today. 🔥…    60
##  8 ElixiumCrypto "Looking at the price when you invested only what …    60
##  9 ElixiumCrypto "Me in 60 years\n\n👉 https://t.co/6Abt3c8Qd8 👀\n\n…    60
## 10 ElixiumCrypto "This article is the metaphorical money-shot of my…    60
## # ... with 938 more rows

The top tweeter is ohiobitcoin, which doesn’t seem to be a bot, as the account is tweeting substantive information rather than simple alerts. We’ll leave the top tweeters in for now.

Separate tweets into a word column and clean.

Create new column “word”; remove special characters and blanks.

#Reshape tweet column into rows indexed on username
# The tweet text is cleaned as a whole string and only then split into words:
#   * URLs, HTML-escaped ampersands, RT/via prefixes and @mentions are removed
#     BEFORE punctuation is stripped, because those patterns rely on ":", "/",
#     "&" and "@" still being present in order to match.
#   * The split is on any run of whitespace, so words separated by newlines
#     inside a tweet become separate tokens.
#   * Empty tokens left over from the cleaning are dropped.
# The original `text` column is kept unchanged next to the new `word` column.
tweets <- tweets %>%
  mutate(
    word = as.character(text),
    # Noise that is only recognisable with its punctuation intact.
    word = gsub("http[^[:space:]]*", " ", word),
    word = gsub("&amp;?", " ", word),
    word = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", " ", word),
    word = gsub("@\\w+", " ", word),
    # Remaining special characters and digits.
    word = gsub("[[:punct:]]", "", word),
    word = gsub("[[:digit:]]", "", word),
    # One list element per tweet, holding that tweet's words.
    word = strsplit(trimws(word), "[[:space:]]+")
  ) %>%
  unnest(word) %>%
  filter(!is.na(word), word != "")

Anti Join Stop Words

# Make the stop_words dataset available in the workspace.
data("stop_words")
# Lower-case every token so it can be compared against the stop-word list.
tweets <- mutate(tweets, word = tolower(word))
# Keep only the tokens that have no match in stop_words.
# NOTE(review): punctuation was stripped from `word` earlier, so stop words
# that contain an apostrophe presumably no longer match here - confirm whether
# that matters for the analysis.
cleaned_tweet_words <- tweets %>%
  anti_join(stop_words, by = "word")

# Horizontal bar chart of the 35 most frequent words left after stop-word
# removal.
cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(35) %>%
  # Order the bars by frequency instead of alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: after coord_flip() the x
  # aesthetic (word) is drawn on the vertical axis and y (n) on the horizontal
  # one, so x carries the word label and y the count label.
  labs(
    x = "Unique words",
    y = "Count",
    title = "Count of unique words found in tweets"
  )
## Selecting by n

#Get list of top words
# The 100 most frequent remaining words. `word` is deliberately left as a
# character column (no reorder()): it is only used as a join key below, and a
# factor key would be coerced back to character with a warning.
top_words <- cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(100) %>%
  select(word)
## Selecting by n
# Drop those dominant words from the cleaned token table.
cleaned_tweet_words <- anti_join(cleaned_tweet_words, top_words, by = "word")

EDA

Visualize Tweet Location

## derive lat/lng columns from the geo-location data carried by the tweets
tweets_raw <- tweets_raw %>%
  lat_lng()

## draw the world outline with all plot margins set to zero
par(mar = rep(0, 4))
maps::map(database = "world", lwd = 0.25)

## overlay one semi-transparent blue dot per tweet location
points(
  x = tweets_raw$lng,
  y = tweets_raw$lat,
  pch = 20,
  cex = 0.75,
  col = rgb(0, 0.3, 0.7, 0.75)
)

Merge tweet sentiment

# Load the two sentiment lexicons; "bing" has only two classes
# (positive/negative).
nrc <- get_sentiments(lexicon = "nrc")
bing <- get_sentiments(lexicon = "bing")

# Attach sentiment labels to the tweet tokens; tokens that do not appear in
# the respective lexicon are dropped by the inner join.
tweet_sentiment_nrc <- tweets %>%
  inner_join(nrc, by = "word")

tweet_sentiment_bing <- tweets %>%
  inner_join(bing, by = "word")

Visualize top words (nrc)

# Horizontal bar chart of the 35 most frequent tweet words that appear in the
# NRC lexicon.
tweet_sentiment_nrc %>%
  count(word, sort = TRUE) %>%
  top_n(35) %>%
  # Order the bars by frequency instead of alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: after coord_flip() the x
  # aesthetic (word) is on the vertical axis and y (n) on the horizontal one.
  labs(
    x = "Word",
    y = "Count",
    title = "Top Bitcoin Twitter Words  for 12-28-2018"
  )
## Selecting by n

Visualize sentiment (nrc)

# Horizontal bar chart of how many matched tokens fall into each NRC
# sentiment category.
tweet_sentiment_nrc %>%
  count(sentiment, sort = TRUE) %>%
  top_n(35) %>%
  # Order the bars by frequency instead of alphabetically.
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: after coord_flip() the x
  # aesthetic (sentiment) is on the vertical axis and y (n) on the horizontal
  # one.
  labs(
    x = "Sentiment",
    y = "Count",
    title = " Bitcoin Twitter Sentiment for 12-28-2018"
  )
## Selecting by n

Visualize top words (bing)

# Horizontal bar chart of the 35 most frequent tweet words that appear in the
# Bing lexicon.
tweet_sentiment_bing %>%
  count(word, sort = TRUE) %>%
  top_n(35) %>%
  # Order the bars by frequency instead of alphabetically.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: after coord_flip() the x
  # aesthetic (word) is on the vertical axis and y (n) on the horizontal one.
  labs(
    x = "Word",
    y = "Count",
    title = "Top Bitcoin Twitter Words  for 12-28-2018"
  )
## Selecting by n

Visualize sentiment (bing)

# Horizontal bar chart of how many matched tokens are labelled with each Bing
# sentiment class.
tweet_sentiment_bing %>%
  count(sentiment, sort = TRUE) %>%
  top_n(35) %>%
  # Order the bars by frequency instead of alphabetically.
  mutate(sentiment = reorder(sentiment, n)) %>%
  ggplot(aes(x = sentiment, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() names aesthetics, not screen positions: after coord_flip() the x
  # aesthetic (sentiment) is on the vertical axis and y (n) on the horizontal
  # one.
  labs(
    x = "Sentiment",
    y = "Count",
    title = " Bitcoin Twitter Sentiment for 12-28-2018"
  )
## Selecting by n