The purpose is to look for a correlation between Twitter sentiment about Jeff Bezos and the Amazon (AMZN) stock price. AMZN price data can be downloaded from Yahoo Finance. Tweets are scraped using the search terms "#jeffbezos" and "@jeffBezos".
# Twitter API credentials. Replace each placeholder with your own value before
# running; avoid committing real keys to version control.
api_key             <- "use your api_key"
api_secret          <- "use your api_secret"
access_token        <- "use your access_token "
access_token_secret <- "use your access_token_secret"

# Authenticate this R session with the four credentials above.
setup_twitter_oauth(
  consumer_key    = api_key,
  consumer_secret = api_secret,
  access_token    = access_token,
  access_secret   = access_token_secret
)
## [1] "Using direct authentication"
# Maximum number of tweets requested for each search term.
numberOfTweets <- 3000

# Scrape English-language tweets containing "#jeffbezos" and "@jeffBezos".
tweets <- searchTwitter(
  searchString = "#jeffbezos", n = numberOfTweets, lang = "en"
)
tweets2 <- searchTwitter(
  searchString = "@jeffBezos", n = numberOfTweets, lang = "en"
)

# Convert both result lists to data frames and stack them.
tweetsDF <- twListToDF(tweets)
tweetsDF2 <- twListToDF(tweets2)
tweetsFullDF <- rbind(tweetsDF, tweetsDF2)

# A tweet that matches both search terms is returned by both searches. Keep
# only its first copy so it is not counted twice in the sentiment statistics.
# The guard leaves the data untouched if no `id` column is present.
if ("id" %in% names(tweetsFullDF)) {
  tweetsFullDF <- tweetsFullDF[!duplicated(tweetsFullDF$id), , drop = FALSE]
  rownames(tweetsFullDF) <- NULL
}

# Save the combined tweets to CSV.
write.csv(tweetsFullDF, "d:/twitter-sentiment/tweetsFullDF-N.csv")
Create subset of data
# Keep only the trading date and the closing price.
amzn <- amzn[c("Date", "Close")]
Convert factors to dates
# Turn the Date column into Date objects.
amzn[["Date"]] <- as.Date(amzn[["Date"]])
Remove white spaces
Replace apostrophes with %% (for later replacement)
Remove emojis and other Unicode characters
Remove additional Unicode parts that may have remained
Remove orphaned full-stops
Reduce double spaces to single spaces
Change %% back to apostrophes
Remove URL from tweet
Replace any line breaks with “-”
Remove double hyphens where there were two line breaks
Fix ampersand
Add string to empty values (when only a URL was posted)
Look for truncated tweets (the API only retrieves 140 characters)
and add ellipses
Write new data frame for cleaned tweets
# Clean the tweet text for sentiment analysis, working on a copy of the
# scraped data so the raw tweets stay untouched.
x <- tweetsFullDF
x$text <- enc2native(x$text)

# Trim leading / trailing whitespace and collapse runs of spaces.
x$text <- gsub("^[[:space:]]+", "", x$text)
x$text <- gsub("[[:space:]]+$", "", x$text)
x$text <- gsub(" +", " ", x$text)

# Protect apostrophes as %% while non-ASCII characters are stripped.
x$text <- gsub("'", "%%", x$text)
# Drop everything that cannot be represented in ASCII (emojis etc.).
x$text <- iconv(x$text, "latin1", "ASCII", sub = "")
# Remove leftover escapes such as <U+A>. The bracket-negated class stops at
# the first ">", so text lying between two escapes is kept (a greedy
# "<(.*)>" would delete it).
x$text <- gsub("<[^>]*>", "", x$text)
# Replace orphaned full stops with a space.
x$text <- gsub("\\ \\. ", " ", x$text)
# Reduce two or more spaces to a single space.
x$text <- gsub(" {2,}", " ", x$text)
# Change %% back to apostrophes.
x$text <- gsub("%%", "\'", x$text)

# Remove URLs (http or https) together with the whitespace in front of them,
# but keep any text that follows the URL.
x$text <- gsub("[[:space:]]*https?:[^[:space:]]*", "", x$text)

# Replace line breaks with "-" and squash the "--" left by double breaks.
x$text <- gsub("\\n", "-", x$text)
x$text <- gsub("--", "-", x$text)
# Decode the HTML-escaped ampersand.
x$text <- gsub("&amp;", "&", x$text, fixed = TRUE)

# Tweets that are empty after cleaning (e.g. only a URL was posted) get a
# placeholder. grepl() returns FALSE for NA, so NA entries are left alone.
x$text[grepl("^[[:space:]]*$", x$text)] <- "<no text>"

# Mark truncated tweets with a trailing ellipsis. Vectorised, so it also works
# for zero rows, and `%in% TRUE` treats a missing flag as "not truncated".
is_truncated <- x$truncated %in% TRUE
x$text[is_truncated] <- sub("[[:space:]]*$", "...", x$text[is_truncated])
# Keep only the cleaned text column.
cleanTweets <- select(x, "text")
# Run the sentiment analysis on the cleaned tweets.
sentiment <- analyzeSentiment(cleanTweets)
# Dictionary-based sentiment score according to the QDAP dictionary.
sentiment2 <- sentiment[["SentimentQDAP"]]
# Direction of that score (i.e. positive, neutral or negative).
sentiment3 <- convertToDirection(sentiment[["SentimentQDAP"]])
# Extract the calendar date (YYYY-MM-DD) from each tweet's creation timestamp.
date <- str_extract(x$created, "\\d{4}-\\d{2}-\\d{2}")
# The extracted strings are ISO formatted, so as.Date() needs no `format`.
# (A further as.Date(date, format = "%m/%d/%y") call is deliberately omitted:
# on an existing Date it changes nothing, and that format does not describe
# the data.)
date <- as.Date(date)
# Combine text, sentiment score, sentiment direction and date.
df <- cbind(cleanTweets, sentiment2, sentiment3, date)
# Drop rows that contain NA in any column.
df <- df[complete.cases(df), ]
# write.csv(df,"d:/data/JEFF-df-01.csv")
# Load the per-tweet sentiment table from CSV and drop its first column.
df <- read.csv("d:/data/JEFF-df-01.csv")
df <- df[, -1]
# Attach each day's mean sentiment score to every tweet of that day.
# mutate() keeps one row per tweet; summarise() would collapse to one per day.
df2 <- mutate(
  group_by(df, date),
  meanSentiment = mean(sentiment2, na.rm = TRUE)
)
# Count the tweets for every (date, sentiment direction) combination, i.e.
# how many positive, neutral and negative tweets there are per day.
freq <- dplyr::summarise(
  group_by(df, date, sentiment3),
  Freq = n()
)
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Reshape from long to wide: one row per date, one count column per
# sentiment direction.
freq2 <- spread(freq, key = sentiment3, value = Freq)
# Show the wide table as an editable widget, then preview its first rows.
DT::datatable(freq2, editable = TRUE)
head(freq2)
## # A tibble: 6 x 4
## # Groups: date [6]
## date negative neutral positive
## <chr> <int> <int> <int>
## 1 2021-08-20 1 9 4
## 2 2021-08-21 15 41 61
## 3 2021-08-22 5 52 79
## 4 2021-08-23 8 58 74
## 5 2021-08-24 11 34 37
## 6 2021-08-25 12 33 39
# Bar chart of daily tweet counts, filled by sentiment direction.
# Columns are mapped by name through `data = freq` instead of `freq$...`
# inside aes(): `$` bypasses ggplot2's data masking and leaks the expression
# "freq$sentiment3" into the legend title.
p1 <- ggplot(
  data = freq,
  mapping = aes(x = date, y = Freq, fill = sentiment3)
) +
  geom_bar(stat = "identity") +
  ylab("Sentiment Frequency") +
  xlab("Date") +
  theme(axis.text.x = element_text(angle = 90))
p1
# Load the cleaned AMZN price table and drop its first column.
AMZ <- read.csv("d:/twitter-sentiment/AMZ_CLEAN.csv")
AMZ <- AMZ[, -1]
# Standardise the closing price: subtract the mean of all closes and divide by
# their standard deviation.
mu <- mean(AMZ$close)
sd <- sd(AMZ$close)
AMZ2 <- AMZ
AMZ2$AMZScore <- (AMZ2$close - mu) / sd
head(AMZ2)
## date close AMZScore
## 1 2011-01-03 184.22 -0.9109478
## 2 2011-01-04 185.01 -0.9101420
## 3 2011-01-05 187.42 -0.9076837
## 4 2011-01-06 185.86 -0.9092749
## 5 2011-01-07 185.49 -0.9096523
## 6 2011-01-10 184.68 -0.9104786
# Plot the standardised closing price for rows 2676 to 2682 of the price table.
# NOTE(review): the row numbers are hard-coded; presumably they are the
# trading days that overlap the scraped tweets. Confirm against the data.
AMZ02 <- AMZ2[2676:2682, ]
p02 <- ggplot(AMZ02, aes(x = date, y = AMZScore, group = 1)) +
  geom_line() +
  geom_point() +
  ggtitle("Twitter sentiment AMZN stock") +
  xlab("date") +
  ylab("AMZScore") +
  theme(axis.text.x = element_text(angle = 90))
p02
# Line-and-point plot of the mean daily sentiment score.
p01 <- ggplot(df2, aes(x = date, y = meanSentiment, group = 1)) +
  geom_line() +
  geom_point() +
  ggtitle("Twitter sentiment Jeff Bezos") +
  xlab("Date") +
  ylab("meanSentiment") +
  theme(axis.text.x = element_text(angle = 90))
p01
# Show the price plot above the sentiment plot for visual comparison.
grid.arrange(p02, p01)
Based on the data collected and the analysis of Twitter sentiment regarding Jeff Bezos and the
closing price of Amazon stock (AMZN) from Yahoo Finance, the two series do not follow the same trend,
and the sentiment score does not appear to have predictive properties for the price.
When the average Twitter sentiment score for Jeff Bezos decreases (2021-08-27), the AMZN closing price increases.
Likewise, when the Twitter sentiment score for Jeff Bezos decreases (2021-08-24), the AMZN closing price increases.
Scraped Twitter data is general in nature, while fluctuations in Amazon's share price
on Yahoo Finance are more influenced by economic fundamentals.
Further investigation would include collecting more data over a longer timeframe.