################# Harvest Tweets about Universal Basic Income ###############
# load packages
library(twitteR)
library(ROAuth)
library(ggplot2)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(scales)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(stringr)
library(wordcloud)
## Loading required package: RColorBrewer
library(syuzhet)
##
## Attaching package: 'syuzhet'
## The following object is masked from 'package:scales':
##
## rescale
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:twitteR':
##
## id, location
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
# Twitter API authorization: replace the placeholders below with your own
# credentials (real keys should never be committed to a script)
setup_twitter_oauth('YOUR_CONSUMER_KEY',
                    'YOUR_CONSUMER_SECRET',
                    'YOUR_ACCESS_TOKEN',
                    'YOUR_ACCESS_TOKEN_SECRET')
## [1] "Using direct authentication"
basicincome_tweets = searchTwitter("#basicincome", n=3200, lang="en") # Tweets including "#basicincome"
basicincome <- twListToDF(basicincome_tweets)
write.csv(basicincome, file="#basicincome_tweets1030.csv")
UBI_tweets = searchTwitter("#UBI", n=3200, lang="en") # Tweets including "#UBI"
## Warning in doRppAPICall("search/tweets", n, params = params,
## retryOnRateLimit = retryOnRateLimit, : 3200 tweets were requested but the
## API can only return 2100
UBI <- twListToDF(UBI_tweets)
write.csv(UBI, file="#UBI_tweets1030.csv")
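# The two searches can overlap, since a tweet may carry both hashtags. A
# minimal sketch for merging the harvests and dropping duplicate status ids;
# the name "all_tweets" is illustrative, not part of the original pipeline:
all_tweets <- rbind(basicincome, UBI)
all_tweets <- all_tweets[!duplicated(all_tweets$id), ]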
#################################################################################################
#################################################################################################
# Convert the timestamps of the tweets to the same time zone
basicincome$created <- ymd_hms(basicincome$created)
basicincome$created <- with_tz(basicincome$created, "America/New_York")
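# Note that with_tz() changes only the zone in which an instant is printed,
# not the instant itself. An illustrative check (12:00 UTC on this date
# prints as 08:00 EDT, i.e. UTC-4):
with_tz(ymd_hms("2017-10-30 12:00:00"), "America/New_York")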
# Show tweet volume by month
ggplot(data = basicincome, aes(x = month(created, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Month") + ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")

# Show tweet volume by weekday
ggplot(data = basicincome, aes(x = wday(created, label = TRUE))) +
  geom_bar(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Day of the Week") + ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")

# Show tweet volume by hour
# extract the time of day as seconds since midnight (units made explicit so
# the difftime is not auto-scaled to hours), then coerce to POSIXct for plotting
basicincome$timeonly <- as.numeric(difftime(basicincome$created,
                                            trunc(basicincome$created, "days"),
                                            units = "secs"))
class(basicincome$timeonly) <- "POSIXct"
ggplot(data = basicincome, aes(x = timeonly)) +
  geom_histogram(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Time") + ylab("Number of tweets") +
  scale_x_datetime(breaks = date_breaks("2 hours"),
                   labels = date_format("%H:00")) +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
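# A daily-volume view complements the month/weekday/hour breakdowns. A
# minimal sketch reusing the same fill gradient (assumes "created" has
# already been converted above):
ggplot(data = basicincome, aes(x = as.Date(created))) +
  geom_bar(aes(fill = ..count..)) +
  theme(legend.position = "none") +
  xlab("Date") + ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4")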

#################################################################################################
#################################################################################################
############################## Word cloud of frequent terms ##############################
nohandles <- str_replace_all(basicincome$text, "@\\w+", "") # remove Twitter handles (@username)
wordCorpus <- Corpus(VectorSource(nohandles))
wordCorpus <- tm_map(wordCorpus, removePunctuation) # remove punctuation
wordCorpus <- tm_map(wordCorpus, removeNumbers) # remove numbers
wordCorpus <- tm_map(wordCorpus, content_transformer(tolower)) # convert to lower case
wordCorpus <- tm_map(wordCorpus, removeWords, stopwords("english")) # remove common English stop words
wordCorpus <- tm_map(wordCorpus, removeWords, c("https", "http", "4yo")) # specify additional stop words as a character vector
wordCorpus <- tm_map(wordCorpus, stripWhitespace) # eliminate extra white space
pal <- brewer.pal(9,"YlGnBu")
pal <- pal[-(1:4)]
set.seed(123)
wordcloud(words = wordCorpus, scale=c(10,0.3), max.words=100, random.order=FALSE,
          rot.per=0.35, use.r.layout=FALSE, colors=pal)
## Warning in wordcloud(words = wordCorpus, scale = c(10, 0.3), max.words =
## 100, : basicincome could not be fit on page. It will not be plotted.
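# The warning above means the most frequent word was too large to place at
# scale = c(10, 0.3). A quick frequency check (a sketch using tm's
# TermDocumentMatrix) helps pick a smaller first scale value, e.g. c(4, 0.3):
tdm <- TermDocumentMatrix(wordCorpus)
term_freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
head(term_freq, 10)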

# Produce a word cloud of frequent twitter mentions
friends <- str_extract_all(basicincome$text, "@\\w+") # a list of handle vectors, one per tweet
namesCorpus <- Corpus(VectorSource(unlist(friends))) # flatten the list first, or each element is deparsed into one garbled document
set.seed(146)
wordcloud(words = namesCorpus, scale=c(3,0.5), max.words=50, random.order=FALSE,
          rot.per=0.10, use.r.layout=FALSE, colors=pal)

#################################################################################################
#################################################################################################
############################ sentiment analysis #################################################
# Generate sentiment score for each tweet
basicincome$clean_text <- str_replace_all(basicincome$text, "@\\w+", "")
Sentiment <- get_nrc_sentiment(basicincome$clean_text)
basicincome_senti <- cbind(basicincome, Sentiment)
# sum the eight NRC emotion columns across all tweets (selecting by name is
# more robust than the positional basicincome_senti[, c(19:26)])
sentimentTotals <- data.frame(colSums(basicincome_senti[, c("anger", "anticipation", "disgust", "fear",
                                                            "joy", "sadness", "surprise", "trust")]))
names(sentimentTotals) <- "count"
sentimentTotals <- cbind("sentiment" = rownames(sentimentTotals), sentimentTotals)
rownames(sentimentTotals) <- NULL
# Tweet volume in each sentiment category
ggplot(data = sentimentTotals, aes(x = sentiment, y = count)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiment") + ylab("Total Count") + ggtitle("Total Sentiment Score for All Tweets")

# Positive and negative sentiment over time
posnegtime <- basicincome_senti %>%
  group_by(created = cut(created, breaks="2 days")) %>%
  summarise(negative = mean(negative),
            positive = mean(positive)) %>% melt
## Using created as id variables
names(posnegtime) <- c("timestamp", "sentiment", "meanvalue")
posnegtime$sentiment = factor(posnegtime$sentiment, levels(posnegtime$sentiment)[c(2, 1)]) # put "positive" first so it maps to the green line colour below
ggplot(data = posnegtime, aes(x = as.Date(timestamp), y = meanvalue, group = sentiment)) +
  geom_line(size = 1.5, alpha = 0.7, aes(color = sentiment)) +
  geom_point(size = 0.3) +
  ylim(0, NA) +
  scale_colour_manual(values = c("springgreen4", "firebrick3")) +
  theme(legend.title=element_blank(), axis.title.x = element_blank()) +
  scale_x_date(breaks = date_breaks("3 months"),
               labels = date_format("%Y-%b")) +
  ylab("Average sentiment score") +
  ggtitle("Sentiment Over Time")

# Various sentiment types over time
basicincome_senti$month <- month(basicincome_senti$created, label = TRUE)
monthlysentiment <- basicincome_senti %>% group_by(month) %>%
  summarise(anger = mean(anger),
            anticipation = mean(anticipation),
            disgust = mean(disgust),
            fear = mean(fear),
            joy = mean(joy),
            sadness = mean(sadness),
            surprise = mean(surprise),
            trust = mean(trust)) %>% melt
## Using month as id variables
names(monthlysentiment) <- c("month", "sentiment", "meanvalue")
ggplot(data = monthlysentiment, aes(x = month, y = meanvalue, group = sentiment)) +
  geom_line(size = 2.5, alpha = 0.7, aes(color = sentiment)) +
  geom_point(size = 0.5) +
  ylim(0, NA) +
  theme(legend.title=element_blank(), axis.title.x = element_blank()) +
  ylab("Average sentiment score") +
  ggtitle("Sentiment During the Year")
## geom_path: Each group consists of only one observation. Do you need to
## adjust the group aesthetic?
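# The geom_path warning above is expected here: the standard search API only
# returns roughly the last week of tweets, so all of them fall into a single
# month and each sentiment "line" is a single point. A sketch that groups by
# day instead, so the lines have points to connect (same pipeline as above,
# shown for four of the eight emotions):
dailysentiment <- basicincome_senti %>%
  group_by(day = as.Date(created)) %>%
  summarise(anger = mean(anger), fear = mean(fear),
            joy = mean(joy), trust = mean(trust)) %>% melt
names(dailysentiment) <- c("day", "sentiment", "meanvalue")
ggplot(data = dailysentiment, aes(x = day, y = meanvalue, group = sentiment)) +
  geom_line(size = 1, alpha = 0.7, aes(color = sentiment)) +
  ylab("Average sentiment score")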
