Final Project Microsoft data

library(SentimentAnalysis)

## 
## Attaching package: 'SentimentAnalysis'

## The following object is masked from 'package:base':
## 
##     write

library(tidyverse)

## -- Attaching packages ---------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(quantmod)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## Loading required package: TTR

## Version 0.4-0 included new data defaults. See ?getSymbols.

library(rtweet)

## 
## Attaching package: 'rtweet'

## The following object is masked from 'package:purrr':
## 
##     flatten

Getting msft data

## Creating the connection with rtweet
# API credentials are read from environment variables (for example set in
# ~/.Renviron) instead of being written into the script, so secrets are never
# published together with the analysis. Any keys that were previously embedded
# in this file should be treated as leaked and regenerated.
twitter_env <- c(
  consumer_key = "TWITTER_CONSUMER_KEY",
  consumer_secret = "TWITTER_CONSUMER_SECRET",
  access_token = "TWITTER_ACCESS_TOKEN",
  access_secret = "TWITTER_ACCESS_SECRET"
)
twitter_creds <- Sys.getenv(unname(twitter_env), unset = "", names = FALSE)
names(twitter_creds) <- names(twitter_env)

# Fail early with a clear message rather than sending empty credentials.
missing_vars <- twitter_env[!nzchar(twitter_creds)]
if (length(missing_vars) > 0) {
  stop(
    "Missing Twitter API credentials; set environment variable(s): ",
    paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}

create_token(
  app = "607",
  consumer_key = twitter_creds[["consumer_key"]],
  consumer_secret = twitter_creds[["consumer_secret"]],
  access_token = twitter_creds[["access_token"]],
  access_secret = twitter_creds[["access_secret"]]
)

dates <- seq.Date(from = as.Date("2019-11-07"), to = as.Date("2019-12-06"), by = 1)

## Microsoft tweets
# One search is issued per entry in `dates`. Each batch is stored in a
# preallocated list and the batches are row-bound once at the end, so the
# result is always available as `df_msft` for the steps that follow.
# NOTE(review): the query does not reference dates[i], so every iteration
# sends the same search and batches may overlap. Presumably the search was
# meant to be run once per day or bounded by date -- confirm.
tweet_batches <- vector("list", length(dates))

for (i in seq_along(dates)) {
  tweet_batches[[i]] <- search_tweets(
    "@$MSFT OR @MSFTNews",
    n = 10000,
    lang = "en",
    include_rts = FALSE,
    retryonratelimit = TRUE
  )
}

df_msft <- do.call(rbind, tweet_batches)

# Work on a copy of the collected tweets
msft <- df_msft
# Encode the tweet text as UTF-8
msft$text <- enc2utf8(msft$text)
# Keep only the YYYY-MM-DD part of created_at and store it as a Date.
# A single as.Date() call is enough: the extracted string is already in the
# default ISO format, so no custom format string is needed.
msft$Date <- as.Date(str_extract(msft$created_at, "\\d{4}-\\d{2}-\\d{2}"))
# Keep only the columns we need
msft <- subset(msft, select = c(Date, text))
# Write the raw tweets to a CSV file
write.csv(msft, file = "msftraw.csv", row.names = FALSE, fileEncoding = "UTF-8")

Getting price data

# Setting the date range for which we need the prices
# NOTE(review): the tweet date sequence starts on 2019-11-07 while prices
# start on 2019-11-27 -- confirm that the shorter price window is intended.
start <- as.Date("2019-11-27")
end <- as.Date("2019-12-06")
# auto.assign = TRUE is stated explicitly because the following code relies on
# getSymbols() creating the `MSFT` object in the workspace, and quantmod
# announces that this default is going to change.
getSymbols("MSFT", src = "yahoo", from = start, to = end, auto.assign = TRUE)

## 'getSymbols' currently uses auto.assign=TRUE by default, but will
## use auto.assign=FALSE in 0.5-0. You will still be able to use
## 'loadSymbols' to automatically load data. getOption("getSymbols.env")
## and getOption("getSymbols.auto.assign") will still be checked for
## alternate defaults.
## 
## This message is shown once per session and may be disabled by setting 
## options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.

## [1] "MSFT"

# Turn the downloaded price series into a data frame and append its index
# as an explicit Date column
price_msft <- cbind(as.data.frame(MSFT), Date = as.Date(index(MSFT)))

# Load the tweet data set from its CSV copy on GitHub
msft_url <- "https://raw.githubusercontent.com/zahirf/Data607/master/msft.csv"
msft <- read.csv(file = msft_url, stringsAsFactors = FALSE)
# Inspect how the Date column was parsed
class(msft$Date)

## [1] "character"

# Date is read in as character, so we convert it
msft$Date <- as.Date(msft$Date)

# Clean raw tweet text for sentiment scoring.
#
# text: character vector of tweets.
# Returns a character vector of the same length containing only ASCII
# letters, digits and single spaces (NA elements stay NA).
clean_tweet_text <- function(text) {
  # Remove each URL only up to the next whitespace, so that words following
  # a link are kept
  text <- gsub("https?://[^[:space:]]+", "", text)
  # Decode the HTML entity for "&"
  text <- gsub("&amp;", "&", text, fixed = TRUE)
  # Remove the cashtag; "$" must be escaped, otherwise it is the regex
  # end-of-string anchor and the pattern can never match
  text <- gsub("\\$msft\\b", "", text, ignore.case = TRUE)
  # Turn line breaks into spaces so that words on adjacent lines are not
  # glued together
  text <- gsub("[\r\n]+", " ", text)
  # Drop emojis and every other non-ASCII character
  text <- iconv(text, "latin1", "ASCII", sub = "")
  # Remove punctuation
  text <- gsub("[[:punct:]]", "", text)
  # Normalise whitespace last, after all removals, so nothing stray remains
  text <- gsub("[[:space:]]+", " ", text)
  trimws(text)
}

# Clean up the text
msft$text <- clean_tweet_text(msft$text)
glimpse(msft$text)

##  chr [1:55089] "251 ProfitPacked Option Trades YearlyDelivered to Your Inbox DailyFREE " ...

Sentiment Analysis

# Score every cleaned tweet, with stop-word removal and stemming enabled
sentiment_msft <- analyzeSentiment(
  msft$text,
  stemming = TRUE,
  removeStopwords = TRUE,
  language = "english"
)

Build dataframe

# Build the final data frame for analysis.
# data.frame() is used instead of cbind(): cbind() on bare vectors builds a
# numeric matrix, which silently turns the Date column into day counts.
df_msft <- data.frame(
  Date = msft$Date,
  Count = sentiment_msft$WordCount,
  MeanSentiment = sentiment_msft$SentimentGI
)
# Remove rows that contain NA values
df_msft <- df_msft[complete.cases(df_msft), ]
# Sum of the per-tweet word counts by date
count_msft <- df_msft %>%
  group_by(Date) %>%
  summarise(Count = sum(Count))
# Mean sentiment score by date
mean_msft <- df_msft %>%
  group_by(Date) %>%
  summarise(Mean = mean(MeanSentiment))

Binding the data together

# Read the returns that are attached to the daily sentiment summary
ret <- read.csv("returns.csv", stringsAsFactors = FALSE)
# The columns below are combined purely by position, so every input must
# provide exactly one value per row of mean_msft. Fail loudly instead of
# letting a shorter column be recycled into misaligned rows.
stopifnot(
  nrow(mean_msft) == length(ret$MSFT),
  nrow(mean_msft) == nrow(count_msft)
)
# NOTE(review): this assumes the rows of returns.csv are in the same date
# order as mean_msft -- confirm, or join on a date column if one is available.
mean_msft <- cbind(mean_msft, Returns = ret$MSFT, Count = count_msft$Count)
write.csv(mean_msft, file = "data_msft.csv", row.names = FALSE, fileEncoding = "UTF-8")

Final Project Microsoft data

Farhana Zahir

12/7/2019