The objective of this project is to begin developing a tool that can, in the majority of cases, predict a stock's closing price and trading volume from Twitter sentiment analysis.
Based on a sample historical database of tweets (a subset of tweets posted by Twitter users over the last week), the tool examines the correlation between a stock's Twitter sentiment score (the ratio of negatively to positively classified tweets) and its closing price and trading volume.
I chose to examine three of the most popular stocks in the market today: Facebook (FB), Amazon (AMZN) and Apple (AAPL). I ran routines to text-mine the Twitter search API for tweets about these stocks and performed both an emotion and a polarity analysis on them. Because the sample API database only contains about one week of history, I was able to obtain ten days of data (May 9 to 18, 2016), which cover eight trading days once the weekend is removed. I assigned each stock a daily sentiment value, computed simply as the ratio of negatively to positively classified tweets. I then extracted each stock's closing price and trading volume from the Yahoo Finance website, downloaded the data as one CSV file per stock, and joined the CSV files to the sentiment-score data frame on the trading date to produce a data frame ready for regression analysis.
I then performed regression analyses to answer two questions:
1. Is there a correlation between a stock's Twitter sentiment score and its closing price?
2. Is there a correlation between a stock's Twitter sentiment score and its trading volume?
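Both questions reduce to a simple linear regression of closing price (or trading volume) on the daily sentiment score. The sketch below shows the form of the analysis on a small simulated data frame (illustrative values only, not the project data); the actual per-stock fits appear near the end of the script.
# illustration only: simulated daily data, one row per trading day;
# Sentiment is the ratio of negatively to positively classified tweets
set.seed(42)
stockdf = data.frame(Sentiment = runif(8, 0.5, 2.0),
                     Close = rnorm(8, mean = 100, sd = 5),
                     Volume = rnorm(8, mean = 3e7, sd = 5e6))
summary(lm(Close ~ Sentiment, data = stockdf))   # question 1
summary(lm(Volume ~ Sentiment, data = stockdf))  # question 2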
# packages used below (twitteR, ROAuth, RCurl, sentiment, plyr, ggplot2, etc.) are installed and loaded via pacman in the setup chunks that follow
# authorisation
if (!require("pacman")) install.packages("pacman")
pacman::p_load(twitteR, ROAuth, RCurl)
api_key = "W2skqV4BOlxefxQm66XD5bYJJ"
api_secret = "e937ccSE2OiIdTcnjGZWrwgUqEJnJhVpEVzJR9LsSOxbRbBM11"
access_token = "560054161-J3VmL5Ss4qF7oEkxBqcxsHiZ5PCIH19U34s61Uoi"
access_token_secret = "VZDWV7ZYf8SKpttf2X8AZBQMIgIgkH8ARDjn9GaeWkjTV"
# Set SSL certs globally
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
# set up the URLs
reqURL = "https://api.twitter.com/oauth/request_token"
accessURL = "https://api.twitter.com/oauth/access_token"
authURL = "https://api.twitter.com/oauth/authorize"
twitCred = OAuthFactory$new(consumerKey = api_key, consumerSecret = api_secret, requestURL = reqURL, accessURL = accessURL, authURL = authURL)
twitCred$handshake(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl"))
if (!require("pacman")) install.packages("pacman")
pacman::p_load(devtools, installr)
install.Rtools()
install_url("http://cran.r-project.org/src/contrib/Archive/Rstem/Rstem_0.4-1.tar.gz")
install_url("http://cran.r-project.org/src/contrib/Archive/sentiment/sentiment_0.2.tar.gz")
if (!require("pacman")) install.packages("pacman")
pacman::p_load(twitteR, sentiment, plyr, ggplot2, wordcloud, RColorBrewer, httpuv, RCurl, base64enc)
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
api_key = "W2skqV4BOlxefxQm66XD5bYJJ"
api_secret = "e937ccSE2OiIdTcnjGZWrwgUqEJnJhVpEVzJR9LsSOxbRbBM11"
access_token = "560054161-J3VmL5Ss4qF7oEkxBqcxsHiZ5PCIH19U34s61Uoi"
access_token_secret = "VZDWV7ZYf8SKpttf2X8AZBQMIgIgkH8ARDjn9GaeWkjTV"
setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
TextMine <- function (srchstr,frdate,todate)
{
##srchstr = "FB+Facebook Stock"
##frdate = "2016-05-08"
##todate = "2016-05-09"
# harvest up to 10,000 tweets matching the search string within the date range
some_tweets = searchTwitter(srchstr, n=10000, since=frdate, until=todate, lang="en")
# extract the text of each tweet
some_txt = sapply(some_tweets, function(x) x$getText())
# remove retweet entities
some_txt = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
# remove at people
some_txt = gsub("@\\w+", "", some_txt)
# remove punctuation
some_txt = gsub("[[:punct:]]", "", some_txt)
# remove numbers
some_txt = gsub("[[:digit:]]", "", some_txt)
# remove html links
some_txt = gsub("http\\w+", "", some_txt)
# collapse runs of spaces/tabs to a single space
some_txt = gsub("[ \t]{2,}", " ", some_txt)
some_txt = gsub("^\\s+|\\s+$", "", some_txt)
# define "tolower error handling" function
try.error = function(x)
{
# create missing value
y = NA
# tryCatch error
try_error = tryCatch(tolower(x), error=function(e) e)
# if not an error
if (!inherits(try_error, "error"))
y = tolower(x)
# result
return(y)
}
# lower case using try.error with sapply
some_txt = sapply(some_txt, try.error)
# remove NAs in some_txt
some_txt = some_txt[!is.na(some_txt)]
names(some_txt) = NULL
return(some_txt)
}
#from 5/15/2016 to 5/18/2016
stock = "FB+Facebook Stock"
fdate = as.character(as.Date("2016-05-15"))
tdate = as.character(as.Date("2016-05-18"))
emotfb = TextMine(stock, fdate, tdate)
#from 5/8/2016 to 5/18/2016
stock = "AAPL+Apple Stock"
fdate = as.character(as.Date("2016-05-08"))
tdate = as.character(as.Date("2016-05-18"))
emotaapl = TextMine(stock, fdate, tdate)
#from 5/8/2016 to 5/18/2016
stock = "AMZN+Amazon Stock"
fdate = as.character(as.Date("2016-05-08"))
tdate = as.character(as.Date("2016-05-18"))
emotamzn = TextMine(stock, fdate, tdate)
#from 5/8/2016 to 5/17/2016: mine tweets one day at a time (list elements 1-9)
tdate = "0000-00-00"
stock = "AAPL+Apple Stock"
offset = 0
stxtaapl = list()
while (tdate != "2016-05-17")
{
fdate = as.character(as.Date("2016-05-08") + offset)
tdate = as.character(as.Date("2016-05-09") + offset)
stxtrep = TextMine(stock, fdate, tdate)
offset = offset + 1
stxtaapl[[offset]] = stxtrep
}
#from 5/17/2016 to 5/18/2016: append day 10
fdate = as.character(as.Date("2016-05-17"))
tdate = as.character(as.Date("2016-05-18"))
stxtrep = TextMine(stock, fdate, tdate)
stxtaapl[[10]] = stxtrep
#from 5/8/2016 to 5/17/2016: mine tweets one day at a time (list elements 1-9)
tdate = "0000-00-00"
stock = "AMZN+Amazon Stock"
offset = 0
stxtamzn = list()
while (tdate != "2016-05-17")
{
fdate = as.character(as.Date("2016-05-08") + offset)
tdate = as.character(as.Date("2016-05-09") + offset)
stxtrep = TextMine(stock, fdate, tdate)
offset = offset + 1
stxtamzn[[offset]] = stxtrep
}
#from 5/17/2016 to 5/18/2016: append day 10
fdate = as.character(as.Date("2016-05-17"))
tdate = as.character(as.Date("2016-05-18"))
stxtrep = TextMine(stock, fdate, tdate)
stxtamzn[[10]] = stxtrep
#from 5/8/2016 to 5/17/2016: mine tweets one day at a time (list elements 1-9)
tdate = "0000-00-00"
stock = "FB+Facebook Stock"
offset = 0
stxtfb = list()
while (tdate != "2016-05-17")
{
fdate = as.character(as.Date("2016-05-08") + offset)
tdate = as.character(as.Date("2016-05-09") + offset)
stxtrep = TextMine(stock, fdate, tdate)
offset = offset + 1
stxtfb[[offset]] = stxtrep
}
#from 5/17/2016 to 5/18/2016: append day 10
fdate = as.character(as.Date("2016-05-17"))
tdate = as.character(as.Date("2016-05-18"))
stxtrep = TextMine(stock, fdate, tdate)
stxtfb[[10]] = stxtrep
################################################################################
EmotAnal = function(srchstr,some_txt)
{
# Perform Sentiment Analysis
# classify emotion
class_emo = classify_emotion(some_txt, algorithm="bayes", prior=1.0)
# get emotion best fit
emotion = class_emo[,7]
# drop tweets with no dominant emotion (alternatively, recode NA as "unknown")
emotion = emotion[!is.na(emotion)]
# data frame with results
sent_df = data.frame(emotion=emotion, stringsAsFactors=FALSE)
# sort data frame
sent_df = within(sent_df,
emotion <- factor(emotion, levels=names(sort(table(emotion), decreasing=TRUE))))
#dev.off()
# Let's do some plots of the obtained results
# plot distribution of emotions
ggplot(sent_df, aes(x=emotion)) +
geom_bar(aes(y=..count.., fill=emotion)) +
scale_fill_brewer(palette="Dark2") +
labs(x="emotion categories", y="number of tweets") +
ggtitle(paste("Sentiment Analysis of Tweets (classification by emotion)\n", srchstr)) +
theme(plot.title = element_text(size=12, face="bold"))
}
stock = "FB+Facebook Stock"
EmotAnal(stock, emotfb)
stock = "AMZN+Amazon Stock"
EmotAnal(stock, emotamzn)
stock = "AAPL+Apple Stock"
EmotAnal(stock, emotaapl)
############################################################################################
PolAnal <- function (some_txt)
{
# classify polarity
class_pol = classify_polarity(some_txt, algorithm="bayes")
# get polarity best fit
polarity = class_pol[,4]
neg = length(which(polarity == "negative"))
neu = length(which(polarity == "neutral"))
pos = length(which(polarity == "positive"))
# daily sentiment score: ratio of negatively to positively classified tweets
pol = neg/pos
return(pol)
}
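One caveat with this score: on a day with no positively classified tweets the ratio evaluates to Inf (or NaN when both counts are zero). A guarded variant, shown here only as a hypothetical alternative that was not used in this run, could return NA for such days:
PolAnalSafe <- function (some_txt)
{
# classify polarity exactly as above
class_pol = classify_polarity(some_txt, algorithm="bayes")
polarity = class_pol[,4]
neg = length(which(polarity == "negative"))
pos = length(which(polarity == "positive"))
# the negative-to-positive ratio is undefined without positive tweets
if (pos == 0) return(NA_real_)
return(neg/pos)
}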
tabfb = vector(mode = "numeric", length = 0)
for (i in 1:length(stxtfb)) {
tabfb[i] = PolAnal(stxtfb[[i]])
}
tabaapl = vector(mode = "numeric", length = 0)
for (i in 1:length(stxtaapl)) {
tabaapl[i] = PolAnal(stxtaapl[[i]])
}
tabamzn = vector(mode = "numeric", length = 0)
for (i in 1:length(stxtamzn)) {
tabamzn[i] = PolAnal(stxtamzn[[i]])
}
ShowPol <- function(srchstr, some_txt)
{
class_pol = classify_polarity(some_txt, algorithm="bayes")
# get polarity best fit
polarity = class_pol[,4]
# Create data frame with the results and obtain some general statistics
# data frame with results
sent_df2 = data.frame(text=some_txt, polarity=polarity, stringsAsFactors=FALSE)
#dev.next()
# plot distribution of polarity
ggplot(sent_df2, aes(x=polarity)) +
geom_bar(aes(y=..count.., fill=polarity)) +
scale_fill_brewer(palette="RdGy") +
labs(x="polarity categories", y="number of tweets") +
ggtitle(paste("Sentiment Analysis of Tweets (classification by polarity)\n", srchstr)) +
theme(plot.title = element_text(size=12, face="bold"))
}
stock = "AMZN+Amazon Stock"
ShowPol(stock,stxtamzn[[7]])
stock = "AAPL+Apple Stock"
ShowPol(stock,stxtaapl[[2]])
stock = "FB+Facebook Stock"
ShowPol(stock,stxtfb[[5]])
#############################################################################################
# build a data frame of daily FB sentiment scores, dated 5/9 through 5/18
Sentdf_fb <- data.frame(Date = character(length(tabfb)), Sentiment = numeric(length(tabfb)), stringsAsFactors = FALSE)
for (i in 1:length(tabfb)) {
Sentdf_fb$Date[i] = as.character(as.Date("2016-05-08") + i)
Sentdf_fb$Sentiment[i] = tabfb[i]
}
# drop the weekend (no trading on Saturday 5/14 and Sunday 5/15)
Sentdf_fb = subset(Sentdf_fb, Date != "2016-05-15" & Date != "2016-05-14")
Sentdf_amzn <- data.frame(Date = character(length(tabamzn)), Sentiment = numeric (length(tabamzn)), stringsAsFactors = FALSE)
for (i in 1:length(tabamzn)) {
Sentdf_amzn$Date[i] = as.character(as.Date("2016-05-08") + i)
Sentdf_amzn$Sentiment[i] = tabamzn[i]
}
Sentdf_amzn = subset(Sentdf_amzn, Date != "2016-05-15" & Date != "2016-05-14")
Sentdf_aapl <- data.frame(Date = character(length(tabaapl)), Sentiment = numeric (length(tabaapl)), stringsAsFactors = FALSE)
for (i in 1:length(tabaapl)) {
Sentdf_aapl$Date[i] = as.character(as.Date("2016-05-08") + i)
Sentdf_aapl$Sentiment[i] = tabaapl[i]
}
Sentdf_aapl = subset(Sentdf_aapl, Date != "2016-05-15" & Date != "2016-05-14")
#############################################################################################
#dev.next()
############################################################################################
# daily Close and Volume downloaded from Yahoo Finance as one CSV file per stock
apple = read.csv("apple.csv", header = TRUE, sep = ",", stringsAsFactors = F)
amazon = read.csv("amazon.csv", header = TRUE, sep = ",", stringsAsFactors = F)
facebook = read.csv("facebook.csv", header = TRUE, sep = ",", stringsAsFactors = F)
apple = subset(apple, select=c(Date,Close,Volume) )
amazon = subset(amazon, select=c(Date,Close,Volume) )
facebook = subset(facebook, select=c(Date,Close,Volume) )
# join price/volume data with the sentiment scores on the trading date
Sapple = join(apple, Sentdf_aapl, by = "Date")
Samazon = join(amazon, Sentdf_amzn, by = "Date")
Sfacebook = join(facebook, Sentdf_fb, by = "Date")
# exploratory scatter plots: sentiment vs. closing price (red) and trading volume (blue)
plot(Sfacebook$Sentiment, Sfacebook$Close, col="red")
plot(Sfacebook$Sentiment, Sfacebook$Volume, col="blue")
plot(Sapple$Sentiment, Sapple$Close, col="red")
plot(Sapple$Sentiment, Sapple$Volume, col="blue")
plot(Samazon$Sentiment, Samazon$Close, col="red")
plot(Samazon$Sentiment, Samazon$Volume, col="blue")
# regress trading volume on the sentiment score
m_apple = lm(Volume ~ Sentiment, data = Sapple)
summary(m_apple)
layout(matrix(1:4,2,2))
plot(m_apple)
layout(matrix(1:1,1,1))
plot(Sapple$Sentiment, Sapple$Volume)
abline(m_apple)
m_amazon = lm(Volume ~ Sentiment, data = Samazon)
summary(m_amazon)
layout(matrix(1:4,2,2))
plot(m_amazon)
layout(matrix(1:1,1,1))
plot(Samazon$Sentiment, Samazon$Volume)
abline(m_amazon)
m_facebook = lm(Volume ~ Sentiment, data = Sfacebook)
summary(m_facebook)
layout(matrix(1:4,2,2))
plot(m_facebook)
layout(matrix(1:1,1,1))
plot(Sfacebook$Sentiment, Sfacebook$Volume)
abline(m_facebook)
# regress closing price on the sentiment score
mc_apple = lm(Close ~ Sentiment, data = Sapple)
summary(mc_apple)
layout(matrix(1:4,2,2))
plot(mc_apple)
layout(matrix(1:1,1,1))
plot(Sapple$Sentiment, Sapple$Close)
abline(mc_apple)
mc_amazon = lm(Close ~ Sentiment, data = Samazon)
summary(mc_amazon)
layout(matrix(1:4,2,2))
plot(mc_amazon)
layout(matrix(1:1,1,1))
plot(Samazon$Sentiment, Samazon$Close)
abline(mc_amazon)
mc_facebook = lm(Close ~ Sentiment, data = Sfacebook)
summary(mc_facebook)
layout(matrix(1:4,2,2))
plot(mc_facebook)
layout(matrix(1:1,1,1))
plot(Sfacebook$Sentiment, Sfacebook$Close)
abline(mc_facebook)
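Before drawing conclusions, the goodness of fit of the six regressions above can be compared side by side by pulling the R-squared value out of each model summary; a minimal sketch using the models already fitted:
# collect R-squared for the volume and closing-price models of each stock
rsq = data.frame(Stock = c("AAPL", "AMZN", "FB"),
                 Volume_R2 = c(summary(m_apple)$r.squared,
                               summary(m_amazon)$r.squared,
                               summary(m_facebook)$r.squared),
                 Close_R2 = c(summary(mc_apple)$r.squared,
                              summary(mc_amazon)$r.squared,
                              summary(mc_facebook)$r.squared))
rsq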
III. Conclusion - Given the small R-squared values for all three stocks, the non-horizontal shape of the residuals-versus-fitted plots, and the presence of influential outliers, there is no adequate evidence that the data follow a linear model. We therefore cannot conclude that the stocks' Twitter sentiment scores for the week in question are related to their closing prices or trading volumes. There may be several reasons for this, the main one being the limited sample size. If the study were repeated with a larger sample of tweets covering at least a month of trading days, better conclusions and more insights could be drawn from the data.
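As a sketch of the proposed follow-up, the daily mining step could be wrapped so that it covers an arbitrary date range. Note that this assumes access to a tweet archive or a paid search endpoint, since the standard search API used here only returns roughly the last week of tweets; MineRange is a hypothetical helper, not part of the original run.
MineRange <- function (srchstr, startdate, enddate)
{
# one list element of cleaned tweet text per day in [startdate, enddate)
days = seq(as.Date(startdate), as.Date(enddate) - 1, by = "day")
lapply(seq_along(days), function(i) TextMine(srchstr, as.character(days[i]), as.character(days[i] + 1)))
}
# e.g. a month of Facebook tweets:
# stxtfb_month = MineRange("FB+Facebook Stock", "2016-05-01", "2016-06-01")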