library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(rtweet)
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
Getting AAPL data
##Creating the connection with rtweet
create_token(
  app = "607",
  consumer_key = "KmfIjPiLdO1mZdam7fgEA9XXf",
  consumer_secret = "trP9eQKvDjzzERU2oyRHcJKqUlv3R8hq0ABWRCYy5V3L23lGmY",
  access_token = "480323958-6kQk1ST8tm0CA2qLShYtnozd2Y3rJSsePg5ejqci",
  access_secret = "Zj4q5qqm7pcBpQ4ZOT2TaFBIcZeJ2g1b3bDE3SgMrS1rY"
)
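Note that the call above embeds the API credentials directly in the script. A safer pattern, sketched below with illustrative environment-variable names (not part of the original code), is to keep the secrets out of the document entirely:
#Sketch: read the Twitter credentials from environment variables (names are placeholders)
create_token(
  app = "607",
  consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY"),
  consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
  access_token = Sys.getenv("TWITTER_ACCESS_TOKEN"),
  access_secret = Sys.getenv("TWITTER_ACCESS_SECRET")
)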
dates <- seq.Date(from = as.Date("2019-11-07"), to = as.Date("2019-12-06"), by = 1)
##Apple tweets
df_aapl <- data.frame()
for (i in seq_along(dates)) {
  df_temp <- search_tweets("@$AAPL OR @AAPLNews", n = 10000,
                           lang = "en", include_rts = FALSE,
                           retryonratelimit = TRUE)
  df_aapl <- rbind(df_aapl, df_temp)
}
aapl <- df_aapl
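As written, every pass through the loop above issues the same query, since the loop index is never used. If the intent was one search per day, the date can be folded into the query with Twitter's since:/until: search operators, as in the sketch below (keep in mind the standard search endpoint only reaches back roughly a week, so older dates in the range would return nothing):
#Sketch: restrict each search to a single day using since:/until: operators
df_aapl <- data.frame()
for (i in seq_along(dates)) {
  q <- sprintf("@$AAPL OR @AAPLNews since:%s until:%s", dates[i], dates[i] + 1)
  df_temp <- search_tweets(q, n = 10000, lang = "en",
                           include_rts = FALSE, retryonratelimit = TRUE)
  df_aapl <- rbind(df_aapl, df_temp)
}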
#Encode to utf8
aapl$text <- enc2utf8(aapl$text)
#Extract the calendar date (YYYY-MM-DD) from the created_at timestamp
date <- str_extract(aapl$created_at, "\\d{4}-\\d{2}-\\d{2}")
aapl$Date <- as.Date(date)
#Keep only the columns we need
aapl <- subset(aapl, select = c(Date, text))
#Write as a CSV file; this file has been uploaded to GitHub and is read back in below
write.csv(aapl, file = "aapl.csv", row.names = FALSE, fileEncoding = "UTF-8")
Getting price data
#Set the dates for which we need the prices
start <- as.Date("2019-11-27")
end <- as.Date("2019-12-06")
getSymbols("AAPL", src = "yahoo", from = start, to = end)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "AAPL"
price_aapl <- as.data.frame(AAPL)
#Extract the dates from the xts index
price_aapl$Date <- as.Date(index(AAPL))
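The returns.csv file used in the last section is not built in this document, so its layout is an assumption; a minimal sketch of how daily close-to-close returns could be derived from the AAPL object (simple arithmetic returns on the adjusted close) is:
#Sketch: arithmetic daily returns on the adjusted close, saved for the final merge
ret_aapl <- dailyReturn(Ad(AAPL), type = "arithmetic")
returns <- data.frame(Date = index(ret_aapl), AAPL = as.numeric(ret_aapl))
write.csv(returns, file = "returns.csv", row.names = FALSE)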
#read the file into R
aapl<-read.csv('https://raw.githubusercontent.com/zahirf/Data607/master/aapl.csv', stringsAsFactors = FALSE)
class(aapl$Date)
## [1] "character"
#date is read in as character, so we convert
aapl$Date<-as.Date(aapl$Date)
#Clean up the text
aapl$text <- gsub("http.*", "", aapl$text) # remove URLs (and any text following them)
aapl$text <- gsub("&amp;", "&", aapl$text) # replace the &amp; HTML entity with &
aapl$text <- gsub("\\$aapl ", "", aapl$text, ignore.case = TRUE) # remove the $AAPL cashtag ($ must be escaped in a regex)
aapl$text <- gsub("^[[:space:]]*", "", aapl$text) # remove leading whitespace
aapl$text <- gsub("[[:space:]]*$", "", aapl$text) # remove trailing whitespace
aapl$text <- gsub(" +", " ", aapl$text) # collapse repeated spaces
aapl$text <- iconv(aapl$text, "latin1", "ASCII", sub = "") # drop non-ASCII characters (emojis etc.)
aapl$text <- gsub("\\n", "", aapl$text) # remove line breaks
aapl$text <- gsub("[[:punct:]]", "", aapl$text) # remove punctuation
glimpse(aapl$text)
## chr [1:31620] "Felt bearish for next week In AAPL GOOGL SPY puts" ...
Sentiment Analysis
#Run sentiment analysis
sentiment_aapl <- analyzeSentiment(aapl$text,
                                   language = "english",
                                   removeStopwords = TRUE, stemming = TRUE)
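analyzeSentiment() returns one row per tweet with word counts and scores from several dictionaries; SentimentGI, the General Inquirer score, is the one carried forward below. An optional sanity check, not part of the original pipeline, using the package's convertToDirection():
#Sketch: inspect the distribution of GI scores and their discrete direction
summary(sentiment_aapl$SentimentGI)
table(convertToDirection(sentiment_aapl$SentimentGI))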
Build dataframe
#Build the final dataframe for analysis: date, word count and GI sentiment per tweet
df_aapl <- data.frame(Date = aapl$Date,
                      Count = sentiment_aapl$WordCount,
                      MeanSentiment = sentiment_aapl$SentimentGI)
#Remove rows with NA values
df_aapl <- df_aapl[complete.cases(df_aapl), ]
#Frequency of sentiment words by date
count_aapl <- df_aapl %>%
  group_by(Date) %>%
  summarise(Count = sum(Count))
#Mean sentiment by date
mean_aapl <- df_aapl %>%
  group_by(Date) %>%
  summarise(Mean = mean(MeanSentiment))
Binding the data together
#Read in the daily AAPL returns and bind them, with the word counts, to the mean sentiment
ret <- read.csv("returns.csv", stringsAsFactors = FALSE)
mean_aapl <- cbind(mean_aapl, ret$AAPL, count_aapl$Count)
colnames(mean_aapl)[3:4] <- c("Returns", "Count")
write.csv(mean_aapl, file = "data_aapl.csv", row.names = FALSE, fileEncoding = "UTF-8")
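The cbind() above relies on mean_aapl, ret and count_aapl each having one row per date in the same order. If returns.csv also carries a Date column (an assumption, since its construction is not shown here), the same table could be built more defensively with explicit joins on Date:
#Sketch: merge by Date instead of by row position (assumes ret has a Date column)
data_aapl <- mean_aapl[, c("Date", "Mean")] %>%
  inner_join(count_aapl, by = "Date") %>%
  inner_join(ret %>% transmute(Date = as.Date(Date), Returns = AAPL), by = "Date")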