The objective of this project is to begin developing a tool that can, in the majority of cases, predict a stock's closing price and trading volume from Twitter sentiment analysis.
Based on a sample historical database of tweets (a subset of tweets posted by Twitter users over the last week), the tool examines the correlation between a stock's Twitter sentiment score (the ratio of negatively to positively classified tweets) and its closing price and trading volume.
I chose to examine three of the most popular stocks in the market today: Facebook (FB), Amazon (AMZN) and Apple (AAPL). I ran routines to text-mine the Twitter search API for tweets about these stocks and performed both an emotion and a polarity analysis on them. Because the sample API database only contains about one week of history, I was able to obtain ten days of data (May 9 to 18, 2016), which cover eight trading days once the weekend is removed. I assigned each stock a daily sentiment value, computed simply as the ratio of negatively to positively classified tweets. I then extracted each stock's closing price and trading volume from the Yahoo Finance website, downloaded the data as one CSV file per stock, and joined the CSV files to the sentiment-score data frame on the trading date to produce a data frame ready for regression analysis.
I then performed regression analyses to answer two questions:
1. Is there a correlation between a stock's Twitter sentiment score and its closing price?
2. Is there a correlation between a stock's Twitter sentiment score and its trading volume?
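Both questions reduce to a simple linear regression of closing price (or trading volume) on the daily sentiment score. The sketch below shows the form of the analysis on a small simulated data frame (illustrative values only, not the project data); the actual per-stock fits appear near the end of the script.
# illustration only: simulated daily data, one row per trading day;
# Sentiment is the ratio of negatively to positively classified tweets
set.seed(42)
stockdf = data.frame(Sentiment = runif(8, 0.5, 2.0),
                     Close = rnorm(8, mean = 100, sd = 5),
                     Volume = rnorm(8, mean = 3e7, sd = 5e6))
summary(lm(Close ~ Sentiment, data = stockdf))   # question 1
summary(lm(Volume ~ Sentiment, data = stockdf))  # question 2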
# packages used below (twitteR, ROAuth, RCurl, sentiment, plyr, ggplot2, etc.) are installed and loaded via pacman in the setup chunks that follow
# authorisation
if (!require("pacman")) install.packages("pacman")
pacman::p_load(twitteR, ROAuth, RCurl)
api_key = "W2skqV4BOlxefxQm66XD5bYJJ"
api_secret = "e937ccSE2OiIdTcnjGZWrwgUqEJnJhVpEVzJR9LsSOxbRbBM11"
access_token = "560054161-J3VmL5Ss4qF7oEkxBqcxsHiZ5PCIH19U34s61Uoi"
access_token_secret = "VZDWV7ZYf8SKpttf2X8AZBQMIgIgkH8ARDjn9GaeWkjTV"
# Set SSL certs globally
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# set up the URLs
reqURL = "https://api.twitter.com/oauth/request_token"
accessURL = "https://api.twitter.com/oauth/access_token"
authURL = "https://api.twitter.com/oauth/authorize"
twitCred = OAuthFactory$new(consumerKey = api_key, consumerSecret = api_secret, requestURL = reqURL, accessURL = accessURL, authURL = authURL)
twitCred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
if (!require("pacman")) install.packages("pacman")
pacman::p_load(devtools, installr)
install.Rtools()
install_url("http://cran.r-project.org/src/contrib/Archive/Rstem/Rstem_0.4-1.tar.gz")
install_url("http://cran.r-project.org/src/contrib/Archive/sentiment/sentiment_0.2.tar.gz")
if (!require("pacman")) install.packages("pacman")
pacman::p_load(twitteR, sentiment, plyr, ggplot2, wordcloud, RColorBrewer, httpuv, RCurl, base64enc)
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
api_key = "W2skqV4BOlxefxQm66XD5bYJJ"
api_secret = "e937ccSE2OiIdTcnjGZWrwgUqEJnJhVpEVzJR9LsSOxbRbBM11"
access_token = "560054161-J3VmL5Ss4qF7oEkxBqcxsHiZ5PCIH19U34s61Uoi"
access_token_secret = "VZDWV7ZYf8SKpttf2X8AZBQMIgIgkH8ARDjn9GaeWkjTV"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
TextMine <- function (srchstr,frdate,todate)
{
##srchstr = "FB+Facebook Stock"
##frdate = "2016-05-08"
##todate = "2016-05-09"
# harvest up to 10,000 tweets matching the search string within the date range
some_tweets = searchTwitter(srchstr, n=10000, since=frdate, until=todate, lang="en")
# extract the text of each tweet
some_txt = sapply(some_tweets, function(x) x$getText())
# remove retweet entities
some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
# remove at people
some_txt = gsub("@\\w+", "", some_txt)
# remove punctuation
some_txt = gsub("[[:punct:]]", "", some_txt)
# remove numbers
some_txt = gsub("[[:digit:]]", "", some_txt)
# remove html links
some_txt = gsub("http\\w+", "", some_txt)
# collapse runs of spaces/tabs to a single space
some_txt = gsub("[ \t]{2,}", " ", some_txt)
some_txt = gsub("^\\s+|\\s+$", "", some_txt)
# define "tolower error handling" function
try.error = function(x)
{
# create missing value
y = NA
# tryCatch error
try_error = tryCatch(tolower(x), error=function(e) e)
# if not an error
if (!inherits(try_error, "error"))
y = tolower(x)
# result
return(y)
}
# lower case using try.error with sapply
some_txt = sapply(some_txt, try.error)
# remove NAs in some_txt
some_txt = some_txt[!is.na(some_txt)]
names(some_txt) = NULL
return(some_txt)
}
#from 5/15/2016 to 5/18/2016
stock = "FB+Facebook Stock"
fdate = as.character(as.Date("2016-05-15"))
tdate = as.character(as.Date("2016-05-18"))
emotfb = TextMine(stock, fdate, tdate)
#from 5/8/2016 to 5/18/2016
stock = "AAPL+Apple Stock"
fdate = as.character(as.Date("2016-05-08"))
tdate = as.character(as.Date("2016-05-18"))
emotaapl = TextMine(stock, fdate, tdate)
#from 5/8/2016 to 5/18/2016
stock = "AMZN+Amazon Stock"
fdate = as.character(as.Date("2016-05-08"))
tdate = as.character(as.Date("2016-05-18"))
emotamzn = TextMine(stock, fdate, tdate)
#from 5/8/2016 to 5/17/2016: mine tweets one day at a time (list elements 1-9)
tdate = "0000-00-00"
stock = "AAPL+Apple Stock"
offset = 0
stxtaapl = list()
while (tdate != "2016-05-17")
{
fdate = as.character(as.Date("2016-05-08") + offset)
tdate = as.character(as.Date("2016-05-09") + offset)
stxtrep = TextMine(stock, fdate, tdate)
offset = offset + 1
stxtaapl[[offset]] = stxtrep
}
#from 5/17/2016 to 5/18/2016: append day 10
fdate = as.character(as.Date("2016-05-17"))
tdate = as.character(as.Date("2016-05-18"))
stxtrep = TextMine(stock, fdate, tdate)
stxtaapl[[10]] = stxtrep
#from 5/8/2016 to 5/17/2016: mine tweets one day at a time (list elements 1-9)
tdate = "0000-00-00"
stock = "AMZN+Amazon Stock"
offset = 0
stxtamzn = list()
while (tdate != "2016-05-17")
{
fdate = as.character(as.Date("2016-05-08") + offset)
tdate = as.character(as.Date("2016-05-09") + offset)
stxtrep = TextMine(stock, fdate, tdate)
offset = offset + 1
stxtamzn[[offset]] = stxtrep
}
#from 5/17/2016 to 5/18/2016: append day 10
fdate = as.character(as.Date("2016-05-17"))
tdate = as.character(as.Date("2016-05-18"))
stxtrep = TextMine(stock, fdate, tdate)
stxtamzn[[10]] = stxtrep
#from 5/8/2016 to 5/17/2016: mine tweets one day at a time (list elements 1-9)
tdate = "0000-00-00"
stock = "FB+Facebook Stock"
offset = 0
stxtfb = list()
while (tdate != "2016-05-17")
{
fdate = as.character(as.Date("2016-05-08") + offset)
tdate = as.character(as.Date("2016-05-09") + offset)
stxtrep = TextMine(stock, fdate, tdate)
offset = offset + 1
stxtfb[[offset]] = stxtrep
}
#from 5/17/2016 to 5/18/2016: append day 10
fdate = as.character(as.Date("2016-05-17"))
tdate = as.character(as.Date("2016-05-18"))
stxtrep = TextMine(stock, fdate, tdate)
stxtfb[[10]] = stxtrep
################################################################################
EmotAnal = function(srchstr,some_txt)
{
# Perform Sentiment Analysis
# classify emotion
class_emo = classify_emotion(some_txt, algorithm="bayes", prior=1.0)
# get emotion best fit
emotion = class_emo[,7]
# drop tweets with no dominant emotion (alternatively, recode NA as "unknown")
emotion = emotion[!is.na(emotion)]
# data frame with results
sent_df = data.frame(emotion=emotion, stringsAsFactors=FALSE)
# sort data frame
sent_df = within(sent_df,
emotion <- factor(emotion, levels=names(sort(table(emotion), decreasing=TRUE))))
#dev.off()
# Let's do some plots of the obtained results
# plot distribution of emotions
ggplot(sent_df, aes(x=emotion)) +
geom_bar(aes(y=..count.., fill=emotion)) +
scale_fill_brewer(palette="Dark2") +
labs(x="emotion categories", y="number of tweets") +
ggtitle(paste("Sentiment Analysis of Tweets (classification by emotion)\n", srchstr)) +
theme(plot.title = element_text(size=12, face="bold"))
}
stock = "FB+Facebook Stock"
EmotAnal(stock, emotfb)
stock = "AMZN+Amazon Stock"
EmotAnal(stock, emotamzn)
stock = "AAPL+Apple Stock"
EmotAnal(stock, emotaapl)
############################################################################################
PolAnal <- function (some_txt)
{
# classify polarity
class_pol = classify_polarity(some_txt, algorithm="bayes")
# get polarity best fit
polarity = class_pol[,4]
neg = length(which(polarity == "negative"))
neu = length(which(polarity == "neutral"))
pos = length(which(polarity == "positive"))
# daily sentiment score: ratio of negatively to positively classified tweets
pol = neg/pos
return(pol)
}
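One caveat with this score: on a day with no positively classified tweets the ratio evaluates to Inf (or NaN when both counts are zero). A guarded variant, shown here only as a hypothetical alternative that was not used in this run, could return NA for such days:
PolAnalSafe <- function (some_txt)
{
# classify polarity exactly as above
class_pol = classify_polarity(some_txt, algorithm="bayes")
polarity = class_pol[,4]
neg = length(which(polarity == "negative"))
pos = length(which(polarity == "positive"))
# the negative-to-positive ratio is undefined without positive tweets
if (pos == 0) return(NA_real_)
return(neg/pos)
}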
tabfb = vector(mode = "numeric", length = 0)
for (i in 1:length(stxtfb)) {
tabfb[i] = PolAnal(stxtfb[[i]])
}
tabaapl = vector(mode = "numeric", length = 0)
for (i in 1:length(stxtaapl)) {
tabaapl[i] = PolAnal(stxtaapl[[i]])
}
tabamzn = vector(mode = "numeric", length = 0)
for (i in 1:length(stxtamzn)) {
tabamzn[i] = PolAnal(stxtamzn[[i]])
}
ShowPol <- function(srchstr, some_txt)
{
class_pol = classify_polarity(some_txt, algorithm="bayes")
# get polarity best fit
polarity = class_pol[,4]
# Create data frame with the results and obtain some general statistics
# data frame with results
sent_df2 = data.frame(text=some_txt, polarity=polarity, stringsAsFactors=FALSE)
#dev.next()
# plot distribution of polarity
ggplot(sent_df2, aes(x=polarity)) +
geom_bar(aes(y=..count.., fill=polarity)) +
scale_fill_brewer(palette="RdGy") +
labs(x="polarity categories", y="number of tweets") +
ggtitle(paste("Sentiment Analysis of Tweets (classification by polarity)\n", srchstr)) +
theme(plot.title = element_text(size=12, face="bold"))
}
stock = "AMZN+Amazon Stock"
ShowPol(stock,stxtamzn[[7]])
stock = "AAPL+Apple Stock"
ShowPol(stock,stxtaapl[[2]])
stock = "FB+Facebook Stock"
ShowPol(stock,stxtfb[[5]])
#############################################################################################
# build a data frame of daily FB sentiment scores, dated 5/9 through 5/18
Sentdf_fb <- data.frame(Date = character(length(tabfb)), Sentiment = numeric(length(tabfb)), stringsAsFactors = FALSE)
for (i in 1:length(tabfb)) {
Sentdf_fb$Date[i] = as.character(as.Date("2016-05-08") + i)
Sentdf_fb$Sentiment[i] = tabfb[i]
}
# drop the weekend (no trading on Saturday 5/14 and Sunday 5/15)
Sentdf_fb = subset(Sentdf_fb, Date != "2016-05-15" & Date != "2016-05-14")
Sentdf_amzn <- data.frame(Date = character(length(tabamzn)), Sentiment = numeric (length(tabamzn)), stringsAsFactors = FALSE)
for (i in 1:length(tabamzn)) {
Sentdf_amzn$Date[i] = as.character(as.Date("2016-05-08") + i)
Sentdf_amzn$Sentiment[i] = tabamzn[i]
}
Sentdf_amzn = subset(Sentdf_amzn, Date != "2016-05-15" & Date != "2016-05-14")
Sentdf_aapl <- data.frame(Date = character(length(tabaapl)), Sentiment = numeric (length(tabaapl)), stringsAsFactors = FALSE)
for (i in 1:length(tabaapl)) {
Sentdf_aapl$Date[i] = as.character(as.Date("2016-05-08") + i)
Sentdf_aapl$Sentiment[i] = tabaapl[i]
}
Sentdf_aapl = subset(Sentdf_aapl, Date != "2016-05-15" & Date != "2016-05-14")
#############################################################################################
#dev.next()
############################################################################################
# daily Close and Volume downloaded from Yahoo Finance as one CSV file per stock
apple = read.csv("apple.csv", header = TRUE, sep = ",", stringsAsFactors = F)
amazon = read.csv("amazon.csv", header = TRUE, sep = ",", stringsAsFactors = F)
facebook = read.csv("facebook.csv", header = TRUE, sep = ",", stringsAsFactors = F)
apple = subset(apple, select=c(Date,Close,Volume) )
amazon = subset(amazon, select=c(Date,Close,Volume) )
facebook = subset(facebook, select=c(Date,Close,Volume) )
# join price/volume data with the sentiment scores on the trading date
Sapple = join(apple, Sentdf_aapl, by = "Date")
Samazon = join(amazon, Sentdf_amzn, by = "Date")
Sfacebook = join(facebook, Sentdf_fb, by = "Date")
# exploratory scatter plots: sentiment vs. closing price (red) and trading volume (blue)
plot(Sfacebook$Sentiment, Sfacebook$Close, col="red")
plot(Sfacebook$Sentiment, Sfacebook$Volume, col="blue")
plot(Sapple$Sentiment, Sapple$Close, col="red")
plot(Sapple$Sentiment, Sapple$Volume, col="blue")
plot(Samazon$Sentiment, Samazon$Close, col="red")
plot(Samazon$Sentiment, Samazon$Volume, col="blue")
# regress trading volume on the sentiment score
m_apple = lm(Volume ~ Sentiment, data = Sapple)
summary(m_apple)
layout(matrix(1:4,2,2))
plot(m_apple)
layout(matrix(1:1,1,1))
plot(Sapple$Sentiment, Sapple$Volume)
abline(m_apple)
m_amazon = lm(Volume ~ Sentiment, data = Samazon)
summary(m_amazon)
layout(matrix(1:4,2,2))
plot(m_amazon)
layout(matrix(1:1,1,1))
plot(Samazon$Sentiment, Samazon$Volume)
abline(m_amazon)
m_facebook = lm(Volume ~ Sentiment, data = Sfacebook)
summary(m_facebook)
layout(matrix(1:4,2,2))
plot(m_facebook)
layout(matrix(1:1,1,1))
plot(Sfacebook$Sentiment, Sfacebook$Volume)
abline(m_facebook)
# regress closing price on the sentiment score
mc_apple = lm(Close ~ Sentiment, data = Sapple)
summary(mc_apple)
layout(matrix(1:4,2,2))
plot(mc_apple)
layout(matrix(1:1,1,1))
plot(Sapple$Sentiment, Sapple$Close)
abline(mc_apple)
mc_amazon = lm(Close ~ Sentiment, data = Samazon)
summary(mc_amazon)
layout(matrix(1:4,2,2))
plot(mc_amazon)
layout(matrix(1:1,1,1))
plot(Samazon$Sentiment, Samazon$Close)
abline(mc_amazon)
mc_facebook = lm(Close ~ Sentiment, data = Sfacebook)
summary(mc_facebook)
layout(matrix(1:4,2,2))
plot(mc_facebook)
layout(matrix(1:1,1,1))
plot(Sfacebook$Sentiment, Sfacebook$Close)
abline(mc_facebook)
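Before drawing conclusions, the goodness of fit of the six regressions above can be compared side by side by pulling the R-squared value out of each model summary; a minimal sketch using the models already fitted:
# collect R-squared for the volume and closing-price models of each stock
rsq = data.frame(Stock = c("AAPL", "AMZN", "FB"),
                 Volume_R2 = c(summary(m_apple)$r.squared,
                               summary(m_amazon)$r.squared,
                               summary(m_facebook)$r.squared),
                 Close_R2 = c(summary(mc_apple)$r.squared,
                              summary(mc_amazon)$r.squared,
                              summary(mc_facebook)$r.squared))
rsq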
III. Conclusion - Given the small R-squared values for all three stocks, the non-horizontal shape of the residuals-versus-fitted plots, and the presence of influential outliers, there is no adequate evidence that the data follow a linear model. We therefore cannot conclude that the stocks' Twitter sentiment scores for the week in question are related to their closing prices or trading volumes. There may be several reasons for this, the main one being the limited sample size. If the study were repeated with a larger sample of tweets covering at least a month of trading days, better conclusions and more insights could be drawn from the data.
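As a sketch of the proposed follow-up, the daily mining step could be wrapped so that it covers an arbitrary date range. Note that this assumes access to a tweet archive or a paid search endpoint, since the standard search API used here only returns roughly the last week of tweets; MineRange is a hypothetical helper, not part of the original run.
MineRange <- function (srchstr, startdate, enddate)
{
# one list element of cleaned tweet text per day in [startdate, enddate)
days = seq(as.Date(startdate), as.Date(enddate) - 1, by = "day")
lapply(seq_along(days), function(i) TextMine(srchstr, as.character(days[i]), as.character(days[i] + 1)))
}
# e.g. a month of Facebook tweets:
# stxtfb_month = MineRange("FB+Facebook Stock", "2016-05-01", "2016-06-01")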