library(SentimentAnalysis)
##
## Attaching package: 'SentimentAnalysis'
## The following object is masked from 'package:base':
##
## write
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Version 0.4-0 included new data defaults. See ?getSymbols.
library(rtweet)
##
## Attaching package: 'rtweet'
## The following object is masked from 'package:purrr':
##
## flatten
Getting MSFT data
## Creating the connection with rtweet
# Credentials are read from environment variables (for example set in
# ~/.Renviron) so that API secrets never live in the script itself.
# NOTE(review): the keys that used to be hard-coded here were exposed in
# source and should be revoked and regenerated.
twitter_env <- c(
  "TWITTER_CONSUMER_KEY",
  "TWITTER_CONSUMER_SECRET",
  "TWITTER_ACCESS_TOKEN",
  "TWITTER_ACCESS_SECRET"
)
# Fail early with a clear message instead of sending empty credentials.
missing_env <- twitter_env[!nzchar(Sys.getenv(twitter_env))]
if (length(missing_env) > 0) {
  stop(
    "Missing Twitter credentials in environment: ",
    paste(missing_env, collapse = ", "),
    call. = FALSE
  )
}
create_token(
  app = "607",
  consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY"),
  consumer_secret = Sys.getenv("TWITTER_CONSUMER_SECRET"),
  access_token = Sys.getenv("TWITTER_ACCESS_TOKEN"),
  access_secret = Sys.getenv("TWITTER_ACCESS_SECRET")
)
# One entry per calendar day of the study window.
dates <- seq.Date(from = as.Date("2019-11-07"), to = as.Date("2019-12-06"), by = 1)
## Microsoft tweets
# Collect each batch in a pre-allocated list and bind once at the end.
# The earlier version initialised `df_msf`, assigned to `df_apl` and read
# `df_msft`, so it failed on an undefined object and never accumulated.
# NOTE(review): the query does not use dates[i], so every pass issues the
# same search and the combined result will contain repeated tweets --
# confirm whether a per-day restriction was intended.
tweet_batches <- vector("list", length(dates))
for (i in seq_along(dates)) {
  tweet_batches[[i]] <- search_tweets(
    "@$MSFT OR @MSFTNews",
    n = 10000,
    lang = 'en',
    include_rts = FALSE,
    retryonratelimit = TRUE
  )
}
df_msft <- do.call(rbind, tweet_batches)
msft <- df_msft
# Encode the tweet text as UTF-8
msft$text <- enc2utf8(msft$text)
# Pull the YYYY-MM-DD part out of the creation timestamp and store it as a
# Date. A second as.Date(date, format = "%m/%d/%y") call used to follow; it
# was a no-op on a Date object and its format did not match the data, so it
# has been dropped.
date <- msft$created_at
date <- str_extract(date, "\\d{4}-\\d{2}-\\d{2}")
date <- as.Date(date)
msft$Date <- date
# Keep only the columns we need
msft <- subset(msft, select = c(Date, text))
# Write the raw tweets to a CSV file.
# NOTE(review): the original comment said the file "has been upload at" but
# the location was left out -- fill in where the file is hosted.
write.csv(msft, file = "msftraw.csv", row.names = FALSE, fileEncoding = "UTF-8")
Getting price
# Date window for which prices are requested
start <- as.Date("2019-11-27")
end <- as.Date("2019-12-06")
# Download MSFT quotes from Yahoo; with the default auto.assign behaviour
# the result is stored in an object named MSFT.
getSymbols(
  "MSFT",
  src = "yahoo",
  from = start,
  to = end
)
## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
##
## This message is shown once per session and may be disabled by setting
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
## [1] "MSFT"
# Turn the downloaded MSFT series into a plain data frame
price_msft <- as.data.frame(MSFT)
# Extract the dates from the index of MSFT into a regular column
price_msft[["Date"]] <- as.Date(index(MSFT))
# Load the tweet data from GitHub, keeping strings as character vectors
msft <- read.csv(
  'https://raw.githubusercontent.com/zahirf/Data607/master/msft.csv',
  stringsAsFactors = FALSE
)
class(msft$Date)
## [1] "character"
# Date comes in as character, so convert it to Date class
msft[["Date"]] <- as.Date(msft[["Date"]])
# Clean up the tweet text before scoring it.
# Remove URLs only up to the next whitespace. The previous "http.*" pattern
# deleted the rest of the tweet after a link (and already covered https).
msft$text <- gsub("http\\S*", "", msft$text)
# Decode the HTML-escaped ampersand; the previous pattern replaced "&" with
# "&" and therefore did nothing. The "&" itself is stripped with the other
# punctuation below.
msft$text <- gsub("&amp;", "&", msft$text, fixed = TRUE)
# Remove the $MSFT cashtag. "$" must be escaped, otherwise it is the regex
# end-of-string anchor and the pattern never matches. Matching ignores case
# because cashtags may be written in either case.
msft$text <- gsub("\\$msft\\b", "", msft$text, ignore.case = TRUE)
# Drop emojis and any other non-ASCII bytes
msft$text <- iconv(msft$text, "latin1", "ASCII", sub = "")
# Replace line breaks with a space so neighbouring words are not glued
# together (e.g. "Yearly\nDelivered" used to become "YearlyDelivered").
msft$text <- gsub("[\r\n]+", " ", msft$text)
# Remove punctuation
msft$text <- gsub("[[:punct:]]", "", msft$text)
# Normalise whitespace last, since the steps above can leave new gaps
msft$text <- gsub("[[:space:]]+", " ", msft$text)
msft$text <- trimws(msft$text)
glimpse(msft$text)
## chr [1:55089] "251 ProfitPacked Option Trades YearlyDelivered to Your Inbox DailyFREE " ...
Sentiment Analysis
# Run analyzeSentiment on the cleaned tweet text, with English stop-word
# removal and stemming switched on
sentiment_msft <- analyzeSentiment(
  msft$text,
  language = "english",
  removeStopwords = TRUE,
  stemming = TRUE
)
Build dataframe
# Build the final per-tweet data frame for analysis.
# data.frame() with named columns keeps Date as a Date. The earlier
# cbind() of bare vectors produced a numeric matrix, which silently turned
# the dates into day counts.
df_msft <- data.frame(
  Date = msft$Date,
  Count = sentiment_msft$WordCount,
  MeanSentiment = sentiment_msft$SentimentGI
)
# Remove rows with NA in any column
df_msft <- df_msft[complete.cases(df_msft), ]
# Sum of the WordCount values by date
count_msft <- df_msft %>%
  group_by(Date) %>%
  summarise(Count = sum(Count))
# Mean of the SentimentGI scores by date
mean_msft <- df_msft %>%
  group_by(Date) %>%
  summarise(Mean = mean(MeanSentiment))
Binding the data together
# Read the returns and attach them, with the daily counts, to the daily
# mean sentiment.
ret <- read.csv("returns.csv", stringsAsFactors = FALSE)
# The columns are combined by position, so guard against a missing MSFT
# column or a length mismatch that cbind() would otherwise drop or recycle
# silently.
# NOTE(review): rows of returns.csv are assumed to be in the same date
# order as mean_msft -- confirm, or join on a date key if one exists.
stopifnot(
  !is.null(ret$MSFT),
  length(ret$MSFT) == nrow(mean_msft),
  length(count_msft$Count) == nrow(mean_msft)
)
# Name the new columns at bind time instead of renaming by index afterwards
mean_msft <- cbind(mean_msft, Returns = ret$MSFT, Count = count_msft$Count)
write.csv(mean_msft, file = "data_msft.csv", row.names = FALSE, fileEncoding = "UTF-8")