Analyze daily stock price changes (e.g., Tesla, Amazon) based on tweets from Twitter, then extend the stock price analysis (Tesla, Amazon, Microsoft) with a comparison against the S&P 500 Index. Finally, port the tweets to a graph data frame.
# Run one Twitter search with the settings shared by both companies:
# at most 1000 English-language tweets posted since 2021-09-01.
fetch_company_tweets <- function(query) {
  searchTwitter(query, n = 1000, lang = "en", since = "2021-09-01")
}

# Tesla: raw status list -> data frame -> one row per distinct tweet text.
tweet_list_TL <- fetch_company_tweets(
  "@AskTesla OR @TeslaInc OR @elonmusk OR @Tesla OR @TSLA OR @TSLAStock OR #TSLA OR #Tesla OR #TeslaInc"
)
tweetstl.df <- twListToDF(tweet_list_TL)
# distinct() keeps the first row for every distinct `text`, so repeated
# copies of the same retweet collapse into a single row.
tweetstl.nodups.df <- distinct(tweetstl.df, text, .keep_all = TRUE)

# Amazon: same three steps.
tweet_list_AMZN <- fetch_company_tweets(
  "@AmazonStock OR @AMZN OR @jeffBezos OR @Amazon OR @AMZNStock OR @Bezos OR #AMZN OR AMZN"
)
tweetsamzn.df <- twListToDF(tweet_list_AMZN)
tweetsamzn.nodups.df <- distinct(tweetsamzn.df, text, .keep_all = TRUE)
# Tidy the de-duplicated Tesla tweets:
#   * strip the "…" character from the tweet text,
#   * rename the `created` timestamp column to `Date`,
#   * reduce that timestamp to a calendar date.
tweetstl.nodups.df <- local({
  cleaned <- tweetstl.nodups.df
  cleaned[["text"]] <- gsub('…', '', cleaned[["text"]])
  cleaned <- plyr::rename(cleaned, c("created" = "Date"))
  cleaned[["Date"]] <- as.Date(cleaned[["Date"]])
  cleaned
})
# Show the first three rows as a sanity check.
head(tweetstl.nodups.df, 3)
## text
## 1 @dominooo0123 @theHodljesus @RiseCryptoRise @shaunuk4 @dogeflokiglobal @elonmusk Old photo that. I sold at 5k paid https://t.co/8XhLLUOWjl
## 2 RT @WholeMarsBlog: Breaking — Apple’s Car Project Loses Three More Key Engineers to Startups $AAPL <U+2066>@elonmusk<U+2069> https://t.co/YHdKI9wQIN
## 3 RT @elonmusk: Nothing is more permanent than a “temporary” government program
## favorited favoriteCount replyToSN Date truncated replyToSID
## 1 FALSE 0 dominooo0123 2021-12-08 TRUE 1468720967484641287
## 2 FALSE 0 <NA> 2021-12-08 FALSE <NA>
## 3 FALSE 0 <NA> 2021-12-08 FALSE <NA>
## id replyToUID
## 1 1468721387204456451 2976691108
## 2 1468721387082854403 <NA>
## 3 1468721386915041291 <NA>
## statusSource
## 1 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
## 2 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
## 3 <a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>
## screenName retweetCount isRetweet retweeted longitude latitude
## 1 xcryptobob 0 FALSE FALSE NA NA
## 2 ray_hessel 7 TRUE FALSE NA NA
## 3 ClarkCousinit1 4699 TRUE FALSE NA NA
# Tidy the de-duplicated Amazon tweets:
#   * strip the "…" character from the tweet text,
#   * rename the `created` timestamp column to `Date`,
#   * reduce that timestamp to a calendar date.
tweetsamzn.nodups.df <- local({
  cleaned <- tweetsamzn.nodups.df
  cleaned[["text"]] <- gsub('…', '', cleaned[["text"]])
  cleaned <- plyr::rename(cleaned, c("created" = "Date"))
  cleaned[["Date"]] <- as.Date(cleaned[["Date"]])
  cleaned
})
# Show the first three rows as a sanity check.
head(tweetsamzn.nodups.df, 3)
## text
## 1 @loganjm2 We'd like to look into this with you. At your earliest convenience, please reach us by phone here: https://t.co/hApLpMlfHN. -Josue
## 2 RT @mattwridley: Viral is up to #116 on Amazon.\n\nHelp us keep that momentum going by ordering it now if you haven't yet (or retweeting this
## 3 RT @Rod_Marchand: Unlike most other books on business, this book clearly defines the difficulties of life and the challenges of managing an
## favorited favoriteCount replyToSN Date truncated replyToSID
## 1 FALSE 0 loganjm2 2021-12-08 FALSE 1468715392818368516
## 2 FALSE 0 <NA> 2021-12-08 FALSE <NA>
## 3 FALSE 0 <NA> 2021-12-08 FALSE <NA>
## id replyToUID
## 1 1468721414882615304 14022452
## 2 1468721408490590213 <NA>
## 3 1468721398730432512 <NA>
## statusSource
## 1 <a href="https://www.sprinklr.com" rel="nofollow">Sprinklr</a>
## 2 <a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>
## 3 <a href="https://twitter.com/" rel="nofollow">Deepa pantha</a>
## screenName retweetCount isRetweet retweeted longitude latitude
## 1 AmazonHelp 0 FALSE FALSE NA NA
## 2 okuSalar 23 TRUE FALSE NA NA
## 3 deepa_pantha 1 TRUE FALSE NA NA
# Extract the text of every status object and drop every byte that cannot be
# represented in ASCII (iconv with sub = "" deletes unconvertible bytes).
ascii_tweet_text <- function(statuses) {
  sapply(statuses, function(status) {
    iconv(status$getText(), "latin1", "ASCII", sub="")
  })
}

# Tesla: ASCII-only tweet text, then the distinct texts used for scoring.
tweets_text_tl <- ascii_tweet_text(tweet_list_TL)
tweets_nodups_text_tl <- unique(tweets_text_tl)

# Amazon: ASCII-only tweet text, then the distinct texts used for scoring.
tweets_text_amzn <- ascii_tweet_text(tweet_list_AMZN)
tweets_nodups_text_amzn <- unique(tweets_text_amzn)
Next, we examine a word cloud of the tweets to see which words make up the returned tweets.
# Prepare a tweet corpus for the word cloud. The three steps run in order:
#   1. transform all text to lower case,
#   2. remove all punctuation,
#   3. remove all stop words (the default stopwords() list).
clean_tweet_corpus <- function(corpus) {
  lowered <- tm_map(corpus, content_transformer(tolower))
  unpunctuated <- tm_map(lowered, removePunctuation)
  tm_map(unpunctuated, function(doc) removeWords(doc, stopwords()))
}

# Build one corpus per company from the distinct, ASCII-only tweet texts and
# clean it up in preparation for the word cloud.
r_stats_text_corpus_ttl <- clean_tweet_corpus(
  Corpus(VectorSource(tweets_nodups_text_tl))
)
r_stats_text_corpus_amzn <- clean_tweet_corpus(
  Corpus(VectorSource(tweets_nodups_text_amzn))
)
## In the original run every tm_map() call on these corpora emitted:
## Warning in tm_map.SimpleCorpus(...): transformation drops documents

# Colour word clouds: words appearing at least 10 times, at most 150 words.
wordcloud(
  r_stats_text_corpus_ttl,
  min.freq = 10, max.words = 150,
  colors = brewer.pal(8, "Dark2")
)
wordcloud(
  r_stats_text_corpus_amzn,
  min.freq = 10, max.words = 150,
  colors = brewer.pal(8, "Dark2")
)
## Sentiment Analysis: For our sentiment analysis we use a function that scores each tweet against a published lexicon of positive and negative words. A score of 0 indicates the tweet is neutral, a score of 1 or more indicates it is positive, and a score of -1 or less indicates it is negative. The larger the absolute value of the score, the stronger the sentiment (the score is the count of positive words minus the count of negative words).
#' Score the sentiment of each sentence against positive/negative word lists.
#'
#' Each sentence is stripped of punctuation and digits, lower-cased and split
#' on whitespace. Its score is the number of words found in `pos.words` minus
#' the number of words found in `neg.words` (repeated words count every time).
#'
#' @param sentences Character vector (or list) of sentences to score.
#' @param pos.words Character vector of positive terms.
#' @param neg.words Character vector of negative terms. Terms are compared
#'   with single whitespace-delimited words, so a multi-word term can never
#'   match.
#' @param .progress Name of a plyr progress bar. With the default 'none' the
#'   function uses base R only; any other value is forwarded to
#'   `plyr::laply()`, which then requires the plyr package to be installed.
#' @return A data frame with one row per sentence and the columns `score`
#'   (integer) and `text` (the unmodified input sentence).
score.sentiment <- function(sentences, pos.words, neg.words, .progress='none')
{
  score_one <- function(sentence) {
    cleaned <- gsub('[[:punct:]]', '', sentence)
    # Control characters (newlines, tabs) separate words in tweets; replace
    # them with a space so "great\nday" is not fused into "greatday".
    cleaned <- gsub('[[:cntrl:]]', ' ', cleaned)
    cleaned <- gsub('\\d+', '', cleaned)
    words <- unlist(strsplit(tolower(cleaned), '\\s+'))
    # TRUE/FALSE sum as 1/0, so this is (#positive words) - (#negative words).
    sum(words %in% pos.words) - sum(words %in% neg.words)
  }

  if (identical(.progress, 'none')) {
    scores <- vapply(sentences, score_one, integer(1), USE.NAMES = FALSE)
  } else {
    # Namespace-qualified on purpose: attaching plyr here (as require(plyr)
    # used to do) masks dplyr verbs such as mutate() and summarise() for the
    # remainder of the session.
    scores <- plyr::laply(sentences, score_one, .progress = .progress)
  }

  data.frame(score = scores, text = sentences)
}
# Download the positive and negative word lexicons from the project's GitHub
# repository (see the references at the end of this document for their
# origin). Lines starting with ';' are treated as comments and skipped.
hu.liu.pos <- scan(
  'https://raw.githubusercontent.com/jayatveluri/DataAcquisition/main/positive-words.txt',
  what = 'character',
  comment.char = ';'
)
hu.liu.neg <- scan(
  'https://raw.githubusercontent.com/jayatveluri/DataAcquisition/main/negative-words.txt',
  what = 'character',
  comment.char = ';'
)

# The positive list is used as published; the negative list is extended with
# terms discovered during an initial review of the tweets.
pos.words <- hu.liu.pos
neg.words <- c(
  hu.liu.neg,
  'wait', 'waiting', 'hold', 'onhold' , 'on hold', 'asshole',
  'cancel','spam', 'spams', 'cancel', 'wtf'
)

# Score every distinct Tesla tweet.
tesla.scores <- score.sentiment(
  tweets_nodups_text_tl,
  pos.words,
  neg.words,
  .progress = 'none'
)
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:twitteR':
##
## id
## Loading required package: stringr
# Score every distinct Amazon tweet.
amazon.scores <- score.sentiment(
  tweets_nodups_text_amzn,
  pos.words,
  neg.words,
  .progress = 'none'
)

# Attach the tweet metadata (Date, screenName, ...) to each score by matching
# on the tweet text.
# NOTE(review): merge() keeps only rows whose `text` is identical on both
# sides. The scored text had all non-ASCII bytes removed, while the data-frame
# text only had "…" removed, so tweets containing other non-ASCII characters
# presumably drop out here - TODO confirm against the row counts.
tesla.scores.merge <- merge(x = tesla.scores, y = tweetstl.nodups.df, by = 'text')
amazon.scores.merge <- merge(x = amazon.scores, y = tweetsamzn.nodups.df, by = 'text')

# Distribution of sentiment scores per company.
hist(
  tesla.scores.merge$score,
  main = "Sentiment of tweets that mention Tesla",
  xlab = " ",
  col = "skyblue",
  border = "black"
)
hist(
  amazon.scores.merge$score,
  main = "Sentiment of tweets that mention Amazon",
  xlab = " ",
  col = "skyblue",
  border = "black"
)

## Scatter plot of tweet date vs sentiment score
with(
  tesla.scores.merge,
  plot(Date, score,
       xlab = "Date", ylab = "Sentiment Score",
       main = "Sentiment of tweets that mention Tesla by Date")
)
with(
  amazon.scores.merge,
  plot(Date, score,
       xlab = "Date", ylab = "Sentiment Score",
       main = "Sentiment of tweets that mention Amazon by Date")
)
## TESLA total evaluation: positive / negative / neutral
# Label every Tesla tweet by the sign of its score: sign() yields -1 / 0 / 1,
# which is shifted to index 1 / 2 / 3 of the label vector.
tlstat <- mutate(
  tesla.scores.merge,
  tweet = c('negative', 'neutral', 'positive')[sign(score) + 2]
)
# Count tweets per sentiment label and calendar date.
tlby.tweet <- dplyr::summarise(
  group_by(tlstat, tweet, Date),
  number = n()
)
## `summarise()` has grouped output by 'tweet'. You can override using the `.groups` argument.
tlby.tweet
## # A tibble: 3 x 3
## # Groups: tweet [3]
## tweet Date number
## <chr> <date> <int>
## 1 negative 2021-12-08 69
## 2 neutral 2021-12-08 155
## 3 positive 2021-12-08 91
# Tesla: number of positive, negative and neutral tweets over time, one
# coloured line (with point markers) per sentiment label.
ggplot(tlby.tweet, aes(Date, number, group = tweet, color = tweet)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(angle = 90, vjust = 1)
  )
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
## AMAZON total evaluation: positive / negative / neutral
# Label every Amazon tweet by the sign of its score: sign() yields -1 / 0 / 1,
# which is shifted to index 1 / 2 / 3 of the label vector.
amznstat <- mutate(
  amazon.scores.merge,
  tweet = c('negative', 'neutral', 'positive')[sign(score) + 2]
)
# Count tweets per sentiment label and calendar date.
amznby.tweet <- dplyr::summarise(
  group_by(amznstat, tweet, Date),
  number = n()
)
## `summarise()` has grouped output by 'tweet'. You can override using the `.groups` argument.
amznby.tweet
## # A tibble: 3 x 3
## # Groups: tweet [3]
## tweet Date number
## <chr> <date> <int>
## 1 negative 2021-12-08 131
## 2 neutral 2021-12-08 316
## 3 positive 2021-12-08 171
# Amazon: number of positive, negative and neutral tweets over time, one
# coloured line (with point markers) per sentiment label.
ggplot(amznby.tweet, aes(Date, number, group = tweet, color = tweet)) +
  geom_line(size = 2) +
  geom_point(size = 4) +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(angle = 90, vjust = 1)
  )
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?
# Load the Tesla price history and parse its Date column (month/day/2-digit
# year) so it can be matched against the tweet dates.
stock_prices_tl <- read.csv("https://raw.githubusercontent.com/jayatveluri/DataAcquisition/main/TSLA.csv")
stock_prices_tl[["Date"]] <- as.Date(stock_prices_tl[["Date"]], format = "%m/%d/%y")

# Left-join the scored tweets to the prices by date, then keep only the score
# and text of the first ten rows for display. Note that select() discards
# every stock price column the join just added.
tweet_stock_tl <- tesla.scores.merge %>%
  left_join(stock_prices_tl, by = "Date") %>%
  select(c('score', 'text')) %>%
  head(10)
tweet_stock_tl
## score
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 -1
## 7 -1
## 8 0
## 9 0
## 10 1
## text
## 1 $jar to 1$ due to the partnership with #Tesla\n#crypto #KuCoin $doge $doge $inx $btc $erth $link $rev $wbtc $fil https://t.co/FEXLdTwOiV
## 2 $JAR x #Tesla, Huge deal for $JAR you can still buy from @KuCoincom\n$ALGO $NANO $COMP $SXP $LAVA $NFT $CVC $GRT https://t.co/La2I9kLeil
## 3 @0xClaytonBigsby @elonmusk "Can Americans do the same thing with their debt that Congress does each year?" Leverage https://t.co/sl7N5oOVdT
## 4 @0xPete_ @elonmusk True. Atomic energy isn't the long time way to go but we should keep the plants running, that we https://t.co/7wLhNkGplQ
## 5 @4ThePlayer @JeffHut14 @eddie82s @PaulNeal123 @NecatiAslannn @elonmusk I have great concern for our planet and I'm https://t.co/XfTgNgBZDX
## 6 @4ThePlayer @PaulNeal123 @JeffHut14 @eddie82s @NecatiAslannn @elonmusk Is this the Greenpeace that protests spent f https://t.co/wZSNZ8ELVp
## 7 @Acers_ @elonmusk You were warned
## 8 @afneil All focus on @elonmusk @richardbranson @JeffBezos for a new planet to run away to.
## 9 @AidenRandd @melopacare @daniyalqaiserr @JanWaider @FenrirLow @SolLapse @toriimaddison1 @savnnhx @minato_mei https://t.co/mYOC2uxq5D
## 10 @andrewgmartin @ArmoAlice @elonmusk Well yeah lol \n\nBut many towns have man made lakes etc that could easily be used for this purpose.
# Load the Amazon price history and parse its Date column (month/day/2-digit
# year) so it can be matched against the tweet dates.
stock_prices_amzn <- read.csv("https://raw.githubusercontent.com/jayatveluri/DataAcquisition/main/AMZN.csv")
stock_prices_amzn[["Date"]] <- as.Date(stock_prices_amzn[["Date"]], format = "%m/%d/%y")

# Left-join the scored tweets to the prices by date, then keep only the score
# and text of the first ten rows for display. Note that select() discards
# every stock price column the join just added.
tweet_stock_amzn <- amazon.scores.merge %>%
  left_join(stock_prices_amzn, by = "Date") %>%
  select(c('score', 'text')) %>%
  head(10)
tweet_stock_amzn
## score
## 1 1
## 2 0
## 3 0
## 4 -1
## 5 1
## 6 0
## 7 1
## 8 0
## 9 1
## 10 0
## text
## 1 "Infinite Dendrogram" Vol. 14 Paperback - Now Available!\n\nBarnes & Noble: https://t.co/K28PL9r7nc\nRightStuf: https://t.co/xFkAykn39C
## 2 #ad Crocs Kids' Classic Clog for $9!!!!\n\nhttps://t.co/9rnXye4npq https://t.co/xd3MnsqJO0
## 3 #azure #aws #googlecloud @microsoft @google @oracle @ibm @ibmdata @amazonwebservices @amazon @verizon @tmobile @att https://t.co/Y68nNKNM3L
## 4 #NowPlaying : Lost In You by #RodStewart Listen Live at https://t.co/KgDuz5xICO Get this song here: https://t.co/9yyONxZysm
## 5 #NowPlaying: Do You Love Me by Brian Poole #listen at https://t.co/ZItnxQQbHc Buy this song here: https://t.co/TPWeZfyqBF
## 6 #NowPlaying: Monday Monday by Mama's & Papa's #listen at https://t.co/ZItnxQQbHc Buy this song here: https://t.co/JHbKuEyRvl
## 7 #NowPlaying: Thank U Very Much by Scaffold #listen at https://t.co/ZItnxQQbHc Buy this song here: https://t.co/MFoiGSXlnz
## 8 #overpackaging, @amazon, do we really need that much of packaging for such small item? https://t.co/Bf0FtPZKTb
## 9 #RARE #starwars #behindthescenes photo #autographed by #ILM #modelmaker Tom St. Amand available @beckettmedia authe https://t.co/urXhRLDAE7
## 10 #SpecialPromotion for this revengeful, passionate and plenty of action historical romance\nhttps://t.co/MaBWIgGElq https://t.co/gh3TyxFkcF
# Scatter plot of per-tweet sentiment score against the stock's daily change.
#
# tweet_stock_tl / tweet_stock_amzn were reduced to the `score` and `text`
# columns (and to ten rows), so they no longer contain `Daily.Change`.
# Plotting `tweet_stock_*$Daily.Change` therefore passed y = NULL, and plot()
# silently drew the score against the row index. The join is redone here so
# that the price column is actually available.
#
# scores: scored tweets with a `Date` column (e.g. tesla.scores.merge).
# prices: stock prices with a `Date` column; assumes the CSV provides a
#         "Daily Change" column that read.csv() exposes as `Daily.Change`
#         - TODO confirm against the CSV header.
# ylab:   y-axis label.
# Returns (invisibly) the rows that were plotted.
plot_score_vs_change <- function(scores, prices, ylab) {
  joined <- left_join(scores, prices, by = "Date")
  if (!"Daily.Change" %in% names(joined)) {
    stop("stock price data has no 'Daily.Change' column", call. = FALSE)
  }
  # Tweets posted on a date without price data have nothing to plot against.
  joined <- joined[!is.na(joined$Daily.Change), , drop = FALSE]
  if (nrow(joined) == 0) {
    warning("no tweet falls on a date with stock price data; plot skipped",
            call. = FALSE)
    return(invisible(joined))
  }
  plot(jitter(joined$score), joined$Daily.Change,
       xlab = "Sentiment Score", ylab = ylab)
  invisible(joined)
}

plot_score_vs_change(tesla.scores.merge, stock_prices_tl,
                     ylab = "Tesla Daily Change in Stock Price")
### Amazon Raw plot of sentiment score versus daily change in stock price
plot_score_vs_change(amazon.scores.merge, stock_prices_amzn,
                     ylab = "Amazon Daily Change in Stock Price")
## Conclusion of Amazon and Tesla Daily Stock Price Change With Tweets: As we observe in the raw plots above for both Amazon and Tesla, both stock prices rose on positive tweets (positive tweet scores), but Tesla's stock price rose more on positive tweets.
library(quantmod)
## Warning: package 'quantmod' was built under R version 4.1.2
## Loading required package: xts
## Warning: package 'xts' was built under R version 4.1.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.1.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Warning: package 'TTR' was built under R version 4.1.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(ggplot2)
library(magrittr)
library(broom)
## Warning: package 'broom' was built under R version 4.1.2
# Date window for the price download.
start <- as.Date("2021-09-01")
end <- as.Date("2021-12-06")
# Download daily prices for the three companies and the S&P 500 index from
# Yahoo Finance; getSymbols() auto-assigns one object per symbol.
getSymbols(
  Symbols = c("TSLA", "AMZN", "MSFT","^GSPC"),
  src = "yahoo",
  from = start,
  to = end
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "TSLA" "AMZN" "MSFT" "^GSPC"
# Combine the adjusted price column of each downloaded series into one xts
# object with readable column names and a Date index.
stocks <- as.xts(data.frame(
  A = TSLA[, "TSLA.Adjusted"],
  B = AMZN[, "AMZN.Adjusted"],
  C = MSFT[, "MSFT.Adjusted"],
  E = GSPC[, "GSPC.Adjusted"]
))
names(stocks) <- c("Tesla", "Amazon", "Microsoft", "S&P 500")
index(stocks) <- as.Date(index(stocks))
## Plotting
# Derive the period shown in the title from the data itself; the previous
# hard-coded title claimed "January 2020 - January 2021", which does not match
# the downloaded range.
price_period <- paste(
  format(min(index(stocks)), "%B %Y"),
  "-",
  format(max(index(stocks)), "%B %Y")
)
# One facet per series with its own y scale, since the price levels differ.
stocks_series <- tidy(stocks) %>%
  ggplot(aes(x = index, y = value, color = series)) +
  geom_line() +
  facet_grid(series ~ ., scales = "free") +
  labs(
    title = paste0(
      "Top Three US Tech Companies and S&P 500: Daily Stock Prices ",
      price_period
    ),
    subtitle = "End of Day Adjusted Prices",
    caption = " Source: Yahoo Finance"
  ) +
  xlab("Date") + ylab("Price") +
  scale_color_manual(values = c("Red", "Black", "DarkBlue", "Orange"))
stocks_series
## Conclusion: The stock prices of Tesla, Microsoft and Amazon (daily/monthly) move in line with the S&P 500 Index.
library("neo4r")
## Warning: package 'neo4r' was built under R version 4.1.2
library(magrittr)
# Connection settings for the Neo4j instance. Each value can be overridden
# through an environment variable so real credentials need not be committed
# to the script; when a variable is unset the previous hard-coded default is
# used.
con <- neo4j_api$new(
  url = Sys.getenv("NEO4J_URL", unset = "http://localhost:7474"),
  user = Sys.getenv("NEO4J_USER", unset = "neo4j"),
  password = Sys.getenv("NEO4J_PASSWORD", unset = "password")
)
## call_neo4j(con, type = "graph")
## TESLA GRAPH DATA FRAME
# Build a graph from the ten-row Tesla frame, whose columns are `score` and
# `text`.
# NOTE(review): presumably igraph's graph.data.frame(), which would read the
# first two columns as edge endpoints (score -> text) - confirm; no
# library(igraph) call is visible in this part of the file.
g_tl <- graph.data.frame(tweet_stock_tl)
# Overwrites the vertex attribute `score` with random integers from 1 to 5.
# NOTE(review): these are random draws, not sentiment scores, and no seed is
# set in the visible code, so the labels below change from run to run.
V(g_tl)$score <- sample(1:5, vcount(g_tl), replace=TRUE)
# NOTE(review): no visible code sets a `text` vertex attribute, so
# V(g_tl)$text may be NULL here - confirm what labels are actually drawn.
plot(g_tl, layout = layout.circle,vertex.label=V(g_tl)$text, edge.arrow.size=2)
# Second plot, labelled with the random `score` attribute.
# NOTE(review): `edges`, `n` and `directed` look like graph-construction
# arguments rather than plot options - verify they have any effect here.
plot(g_tl, layout = layout.circle,vertex.label=V(g_tl)$score, edges=c(1,2, 2,3, 3,1), n=3, directed=F)
## AMAZON GRAPH DATA FRAME
# Same steps for the ten-row Amazon frame; the random `score` attribute is
# drawn from 1 to 10 here instead of 1 to 5. The review notes on the Tesla
# statements apply equally to these.
g_amzn <- graph.data.frame(tweet_stock_amzn)
V(g_amzn)$score <- sample(1:10, vcount(g_amzn), replace=TRUE)
plot(g_amzn, layout = layout.circle,vertex.label=V(g_amzn)$text, edge.arrow.size=2)
plot(g_amzn, layout = layout.circle,vertex.label=V(g_amzn)$score, edges=c(1,2, 2,3, 3,1), n=3, directed=F)
##query1 = "USING PERIODIC COMMIT
##LOAD CSV WITH HEADERS FROM 'https://raw.githubusercontent.com/jayatveluri/DataAcquisition/main/TSLA.csv' AS row
##CREATE (:Stocks {Date: row.Date, Open: row.Open, High: row.High, Low: row.Low, Close: row.Close});"
##cypher(graphdb, query1)
References: 1) Yahoo Finance - stock prices. 2) http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html - positive and negative lexicons. 3) Twitter API (using OAuth 2.0).
Future work: 1) Extend the stock price analysis based on the Fed's reports and the jobs reports. 2) From the .csv files and tweets, first build a relational database in Neo4j and then plot the stock price changes.