1. Build a model that accurately predicts stock sentiment: how do opinions in public tweets affect stock performance?
2. Forecast/predict stock movements if the machine learning performance is strong.
rm(list=ls())
getwd()
setwd("C:/du lieu/DataScienceProjects/Text Mining")
install.packages(c("tm", "textstem", "tidytext", "wordcloud", "textdata", "stringr", "randomForest", "ipred", "e1071", "pROC", "DescTools", "caret", "tidyverse", "stopwords", "corpus"))
library(stringr)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(textstem)
## Loading required package: koRpus.lang.en
## Loading required package: koRpus
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:tm':
##
## readTagged
## The following object is masked from 'package:readr':
##
## tokenize
library(tidytext)
library(textdata)
library(wordcloud)
## Loading required package: RColorBrewer
library(DescTools)
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(syuzhet)
library(e1071)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(caret)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v purrr 0.3.4
## v tidyr 1.2.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x randomForest::combine() masks dplyr::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
## x randomForest::margin() masks ggplot2::margin()
## x koRpus::tokenize() masks readr::tokenize()
library(stopwords)
##
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
##
## stopwords
library(corpus)
Import the csv file and view its initial structure
tweets <- read.csv("Data.csv")
Out of 5000 rows, only 1300 were manually labelled.
We will subset the original data to keep only those manually labelled rows.
tweets_labelled <- tweets[(tweets$sentiment == "positive" | tweets$sentiment == "negative" | tweets$sentiment == "neutral"),]
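The same subset can be written with dplyr's filter(), which avoids repeating the column name (a sketch, equivalent to the line above):
tweets_labelled <- tweets %>%
  filter(sentiment %in% c("positive", "negative", "neutral"))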
View(tweets_labelled)
head(tweets_labelled)
## id created_at
## 1 77522 2020-04-15 01:03:46+00:00
## 2 661634 2020-06-25 06:20:06+00:00
## 3 413231 2020-06-04 15:41:45+00:00
## 4 760262 2020-07-03 19:39:35+00:00
## 5 830153 2020-07-09 14:39:14+00:00
## 6 27027 2020-04-12 21:52:56+00:00
## text
## 1 RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Monarch Tokens✅\n\nUS Stock Market Crashes & what we can LEARN from them PT3!\n\nRETWEET, WATCH video…
## 2 #SriLanka surcharge on fuel removed!\n⛽📉\nThe surcharge of Rs.26 imposed on diesel and petrol has been revoked with effect from midnight on June 23 says Power, Energy and Transport Minister Mahinda. Amaraweera -Adaderana-\n\n#lka #FuelPrices #taxes #economy #stocks #StockMarket
## 3 Net issuance increases to fund fiscal programs > yields spike higher > risk off: #stocks and #EMFX correct lower > #Fed comes in with #YCC > stocks to new all time highs with 20% unemployment and -25% GDP. Fin.
## 4 RT @bentboolean: How much of Amazon's traffic is served by Fastly? Help us find out by running this tool from your IP address: https://t.co…
## 5 $AMD Ryzen 4000 desktop CPUs looking ‘great’ and on track to launch in 2020 https://t.co/y7yYvXOVYJ #madtweets #stocks #cnbc #AMD
## 6 RT @QuantTrend: Reduce your portfolio RISK! GOLD is a perfect tail HEDGE!\n\nCentral banks balance sheet expansion & large fiscal deficits &…
## sentiment
## 1 positive
## 2 negative
## 3 positive
## 4 positive
## 5 positive
## 6 positive
Check the structure and summary of the data
str(tweets_labelled)
## 'data.frame': 1300 obs. of 4 variables:
## $ id : int 77522 661634 413231 760262 830153 27027 472959 392845 313771 267894 ...
## $ created_at: chr "2020-04-15 01:03:46+00:00" "2020-06-25 06:20:06+00:00" "2020-06-04 15:41:45+00:00" "2020-07-03 19:39:35+00:00" ...
## $ text : chr "RT @RobertBeadles: Yo💥\nEnter to WIN 1,000 Monarch Tokens✅\n\nUS Stock Market Crashes & what we can LE"| __truncated__ "#SriLanka surcharge on fuel removed!\n⛽📉\nThe surcharge of Rs.26 imposed on diesel and petrol has been rev"| __truncated__ "Net issuance increases to fund fiscal programs > yields spike higher > risk off: #stocks and #EMFX correc"| __truncated__ "RT @bentboolean: How much of Amazon's traffic is served by Fastly? Help us find out by running this tool from y"| __truncated__ ...
## $ sentiment : chr "positive" "negative" "positive" "positive" ...
summary(tweets_labelled)
## id created_at text sentiment
## Min. : 11 Length:1300 Length:1300 Length:1300
## 1st Qu.:239501 Class :character Class :character Class :character
## Median :482694 Mode :character Mode :character Mode :character
## Mean :476896
## 3rd Qu.:713023
## Max. :937521
DATA EXPLORATION:
Abstract(tweets_labelled)
## ------------------------------------------------------------------------------
## tweets_labelled
##
## data frame: 1300 obs. of 4 variables
## 1300 complete cases (100.0%)
##
## Nr ColName Class NAs Levels
## 1 id integer .
## 2 created_at character .
## 3 text character .
## 4 sentiment character .
Transform target variable sentiment to factor
tweets_labelled$sentiment <- factor(tweets_labelled$sentiment)
Plotting the distribution of target variable
plot(tweets_labelled$sentiment,
main = "Stock Sentiment",
xlab = "Sentiment")
Out of 1300 labelled tweets, just over 500 are positive, about 350 negative, and more than 400 neutral.
Checking the column names:
colnames(tweets_labelled)
## [1] "id" "created_at" "text" "sentiment"
B.1: Text Preprocessing
Remove special characters. Note that this also strips the # from hashtags and the @ from user mentions.
tweets_labelled <- tweets_labelled %>% mutate(new_text_noSpecial = gsub("[^0-9A-Za-z///' ]", "" ,text ,ignore.case = TRUE))
Transform to all lower cases
tweets_labelled <- tweets_labelled %>% mutate(new_text_lower = tolower(new_text_noSpecial))
Remove numbers
tweets_labelled <- tweets_labelled %>% mutate(new_text_noNumbers = gsub('[[:digit:]]','',new_text_lower))
Remove the stopwords. Note that the first pattern, "...", is not escaped, so as a regular expression it matches any three characters followed by a word boundary; this truncates the tail of many words (e.g., "market" becomes "mar", "stocks" becomes "sto"), which is visible in the term lists further down.
stopwords_regex = paste(c("...",stopwords('en')), collapse = '\\b|\\b')
tweets_labelled <- tweets_labelled %>% mutate(new_text_noStopWords = gsub(stopwords_regex,'',new_text_noNumbers))
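A sketch of a safer construction: wrap the whole stopword alternation in word boundaries and strip literal ellipses separately (the toy string is a hypothetical example):
stopwords_regex_safe <- paste0('\\b(', paste(stopwords('en'), collapse = '|'), ')\\b')
x <- gsub('\\.{3}', '', "the market is strong... today") # drop literal "..." first
gsub(stopwords_regex_safe, '', x) # then drop whole stopwords only; roughly " market  strong today"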
Remove @username mentions. (Since the special-character step above already stripped the @ symbols, this pattern has nothing left to match here; in a fresh pipeline, mentions would be removed before stripping special characters.)
tweets_labelled <- tweets_labelled %>% mutate(new_text_noTags = gsub('@\\w*','',new_text_noStopWords))
Remove URLs from the text. (Two caveats: the greedy pattern 'http.*\\s' removes everything from the first "http" through the last whitespace, and a URL at the very end of a tweet, with no trailing space, is not matched at all; 'http\\S+' would be a more precise pattern.)
tweets_labelled <- tweets_labelled %>% mutate(new_text_noURL = gsub('http.*\\s','',new_text_noTags))
Replace words within the text. If there are typos or common abbreviations, we can map them to the intended words; here we expand "fb" to "facebook". (Without word boundaries, this also rewrites "fb" inside longer strings; '\\bfb\\b' would be safer.)
tweets_labelled <- tweets_labelled %>% mutate(new_text_noTypos = gsub('fb','facebook',new_text_noURL))
Remove extra white space (this would include space, tab, vertical tab, newline, form feed, carriage return):
tweets_labelled <- tweets_labelled %>% mutate(new_text_noSpaces = gsub('\\s+',' ',new_text_noTypos))
Apply lemmatization
tweets_labelled <- tweets_labelled %>% mutate(new_text_Lemma = lemmatize_strings(new_text_noSpaces))
Apply stemming
tweets_labelled <- tweets_labelled %>% mutate(new_text_Stem = stem_strings(new_text_noSpaces))
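To see how the two transformations differ, compare them on a toy sentence (illustrative; exact output depends on textstem's dictionaries):
lemmatize_strings("traders are buying stocks") # roughly "trader be buy stock"
stem_strings("traders are buying stocks")      # roughly "trader ar buy stock"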
Transform the tweets_labelled data frame to a corpus. We need a corpus because the DocumentTermMatrix() function takes one as input.
corp <- Corpus(VectorSource(tweets_labelled$new_text_Lemma))
class(corp)
## [1] "SimpleCorpus" "Corpus"
length(corp)
## [1] 1300
inspect(corp[1])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1
##
## [1] robertbead yoen mona token st mar cras le ptretw wa vi
B.2: Tf-Idf
Once we create the Tf-Idf matrix, we can use it as input to predictive models.
my_tfidf <- DocumentTermMatrix(corp, control = list(weighting = weightTfIdf))
## Warning in TermDocumentMatrix.SimpleCorpus(x, control): custom functions are
## ignored
## Warning in weighting(x): empty document(s): 238 350 470 605 685 700 886 962 1211
my_tfidf
## <<DocumentTermMatrix (documents: 1300, terms: 4417)>>
## Non-/sparse entries: 9075/5733025
## Sparsity : 100%
## Maximal term length: 78
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
View high-level information about our document term matrix
inspect(my_tfidf)
## <<DocumentTermMatrix (documents: 1300, terms: 4417)>>
## Non-/sparse entries: 9075/5733025
## Sparsity : 100%
## Maximal term length: 78
## Weighting : term frequency - inverse document frequency (normalized) (tf-idf)
## Sample :
## Terms
## Docs earni have invest mar mark sha sto stockmar tar trad
## 1030 0 0 0 0 0 0 0 0 0 0
## 106 0 0 0 0 0 0 0 0 0 0
## 120 0 0 0 0 0 0 0 0 0 0
## 124 0 0 0 0 0 0 0 0 0 0
## 129 0 0 0 0 0 0 0 0 0 0
## 26 0 0 0 0 0 0 0 0 0 0
## 38 0 0 0 0 0 0 0 0 0 0
## 71 0 0 0 0 0 0 0 0 0 0
## 82 0 0 0 0 0 0 0 0 0 0
## 88 0 0 0 0 0 0 0 0 0 0
Dimension Reduction
The minimum document frequency lets us specify the minimum number of documents a term must appear in. Let's set the minimum to 5.
my_tfidf5 <- DocumentTermMatrix(x = corp,
control = list(bounds = list(global = c(5, Inf))))
my_tfidf5
## <<DocumentTermMatrix (documents: 1300, terms: 306)>>
## Non-/sparse entries: 3855/393945
## Sparsity : 99%
## Maximal term length: 14
## Weighting : term frequency (tf)
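Note that this call omits the weighting argument, so my_tfidf5 is weighted by plain term frequency (as the printout above confirms), despite its name. A sketch that keeps tf-idf weighting while applying the same document-frequency bound (my_tfidf5_w is a hypothetical name; it may emit the same "custom functions are ignored" warning seen earlier):
my_tfidf5_w <- DocumentTermMatrix(x = corp,
                                  control = list(weighting = weightTfIdf,
                                                 bounds = list(global = c(5, Inf))))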
Let’s take a look at the terms in our my_tfidf5
Terms(my_tfidf5)
## [1] "mar" "econ" "ene" "sto"
## [5] "stockmar" "hello" "hig" "increa"
## [9] "look" "portfo" "expec" "holdi"
## [13] "mill" "quar" "pri" "anot"
## [17] "decl" "glo" "have" "busin"
## [21] "invest" "earni" "bankni" "fut"
## [25] "welc" "interest" "someth" "analy"
## [29] "equ" "trad" "bitc" "dem"
## [33] "futu" "larg" "posit" "prem"
## [37] "recov" "eur" "lat" "pro"
## [41] "technicalanaly" "fol" "lev" "nas"
## [45] "resista" "supp" "bill" "bull"
## [49] "netf" "rec" "estima" "capi"
## [53] "sta" "exp" "ple" "coronavi"
## [57] "daytrad" "fina" "investme" "mark"
## [61] "resea" "resu" "sil" "acco"
## [65] "pay" "mak" "optionstrad" "sea"
## [69] "advertis" "faceb" "leadlagrep" "light"
## [73] "opti" "inv" "inves" "investm"
## [77] "poi" "gold" "rat" "goo"
## [81] "open" "unl" "contin" "los"
## [85] "datasw" "detecnoticea" "informationsto" "nasdaqdatasw"
## [89] "zeitge" "serv" "sheepknowm" "recess"
## [93] "tak" "tar" "availa" "break"
## [97] "high" "cal" "laks" "dol"
## [101] "ama" "gene" "pande" "ret"
## [105] "sec" "sup" "guida" "managem"
## [109] "dis" "quarte" "buy" "talk"
## [113] "hist" "annou" "forexsign" "gbp"
## [117] "volatil" "direct" "indicat" "wee"
## [121] "consen" "opt" "aro" "maxp"
## [125] "amer" "beli" "peo" "inst"
## [129] "com" "str" "prof" "rall"
## [133] "vol" "bear" "cry" "brokera"
## [137] "micro" "can" "foreca" "chin"
## [141] "gett" "succ" "ale" "negat"
## [145] "neut" "sentim" "hea" "techni"
## [149] "sha" "onl" "indus" "alw"
## [153] "themotleyf" "divide" "noth" "doll"
## [157] "premar" "sou" "daytra" "stockstotr"
## [161] "clo" "tre" "mom" "aver"
## [165] "cent" "recommendat" "sell" "comp"
## [169] "upd" "seekingal" "towa" "follow"
## [173] "tha" "tra" "sharemar" "patie"
## [177] "gro" "hold" "morn" "inter"
## [181] "liquid" "momen" "mov" "stockstowa"
## [185] "bec" "financ" "targ" "every"
## [189] "recei" "memb" "mon" "divid"
## [193] "performa" "coinb" "cryptocurre" "entrepren"
## [197] "ether" "foxbusin" "robinh" "retirem"
## [201] "financi" "pfi" "vacc" "lik"
## [205] "ent" "bou" "trendl" "jpmor"
## [209] "reve" "cha" "technol" "cre"
## [213] "sen" "stimu" "cou" "finis"
## [217] "thur" "produ" "correct" "bot"
## [221] "bra" "rep" "conti" "for"
## [225] "lead" "press" "retu" "dri"
## [229] "per" "softwareengin" "rev" "insi"
## [233] "harmon" "curr" "pennysto" "confir"
## [237] "everyth" "relea" "upda" "richlights"
## [241] "fact" "num" "compan" "wit"
## [245] "northmantra" "anal" "differ" "fundament"
## [249] "sign" "rea" "bet" "wan"
## [253] "wallstr" "cont" "try" "winn"
## [257] "grow" "act" "week" "yester"
## [261] "elliottw" "success" "forexsig" "forextrad"
## [265] "mor" "downl" "possi" "watch"
## [269] "fint" "soc" "contra" "opportun"
## [273] "optionsf" "tue" "previ" "chan"
## [277] "sho" "pat" "resour" "rati"
## [281] "consu" "equit" "iss" "stocktrad"
## [285] "fri" "potent" "numb" "longt"
## [289] "tomor" "yout" "with" "cor"
## [293] "res" "econo" "savsnt" "alre"
## [297] "priv" "tictoct" "stockmarketn" "rai"
## [301] "indi" "adammanc" "pullb" "stockmark"
## [305] "spend" "pre"
If the data were bigger, the Tf-Idf matrix could get quite large, since it has a column for each term. Let's remove the sparse terms, i.e., those that appear in only a few documents. The threshold 0.99 means a column may have zeros in at most 99% of its entries; lower the number to keep fewer terms, raise it to keep more.
my_tfidf_small <- removeSparseTerms(my_tfidf5, 0.99)
my_tfidf_small
## <<DocumentTermMatrix (documents: 1300, terms: 67)>>
## Non-/sparse entries: 2201/84899
## Sparsity : 97%
## Maximal term length: 11
## Weighting : term frequency (tf)
Let's find the terms that occur at least 25 times, and at least 50 times, in our corpus:
findFreqTerms(x = my_tfidf_small, lowfreq = 25)
## [1] "mar" "econ" "sto" "stockmar" "have" "busin"
## [7] "invest" "earni" "analy" "equ" "trad" "futu"
## [13] "nas" "coronavi" "mark" "opti" "investm" "tar"
## [19] "str" "sha" "upd"
findFreqTerms(x = my_tfidf_small, lowfreq = 50)
## [1] "mar" "sto" "stockmar" "have" "invest" "earni" "trad"
## [8] "mark"
Transform the my_tfidf_small into a data frame
stock_data_frame <- as.data.frame(as.matrix(my_tfidf_small))
stock_data_frame <- cbind(sentiment_dependent = tweets_labelled$sentiment, stock_data_frame)
Now that the data is stored as a document-term data frame, we can use it in predictive modeling. The sentiment_dependent column is the dependent variable, while the term columns are the independent variables.
Word Cloud to visualize our corpus:
set.seed(2)
wordcloud(corp, # corpus object
random.order = FALSE, # most frequent in center
colors = brewer.pal(8, "Dark2"), # color schema
max.words = 150) # top 150 terms
Extract top 25 stock tickers:
ticker_pattern <- str_extract(tweets_labelled$text, "[$][A-Z]+")
top_ticker <- sort(table(x = ticker_pattern), decreasing = TRUE)
top25_ticker <- head(top_ticker, n = 25)
top25_ticker <- as.data.frame(top25_ticker)
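One caveat: str_extract() returns only the first ticker in each tweet. To count every ticker mention, str_extract_all() can be used instead (a sketch; all_tickers is a new name):
all_tickers <- unlist(str_extract_all(tweets_labelled$text, "[$][A-Z]+"))
head(sort(table(all_tickers), decreasing = TRUE), n = 25) # counts of all mentions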
Plot top 25 stocks:
library(ggplot2)
ggplot(data = top25_ticker, aes(x = x, y = Freq)) +
geom_bar(stat = "identity", fill = "#30b3a2") +
coord_flip( ) +
labs(title = "Top 25 Stock Tickers", x = "Ticker", y = "Frequency") +
geom_text(
aes(label = Freq, y = Freq + 25),
position = position_dodge(0.75),
vjust = 0.25)
D.1: Machine Learning Classification Method - Random Forest
D.1.1: Base Model
Reducing our dependent variable sentiment to 2 classes (positive, negative) for better performance. We transform the sentiment column so that it only has 2 levels: positive or negative; all neutral tweets will be omitted. First, let's create a copy of our data frame.
bi_stockDf <- stock_data_frame
Subsetting only the rows with positive and negative sentiment. (Comparing the whole data frame to a string, rather than the sentiment_dependent column, yields a logical matrix and introduces all-NA rows, which is why we drop incomplete cases next; a cleaner alternative is sketched below.)
data_2class <- bi_stockDf[(bi_stockDf == "positive" | bi_stockDf == "negative"),]
Removing empty rows
data_2class_full<- data_2class[complete.cases(data_2class), ]
Reducing the 3 factor levels of the dependent variable to 2: converting to character and back to factor drops the now-empty neutral level.
data_2class_full$sentiment_dependent <- as.character(data_2class_full$sentiment_dependent)
data_2class_full$sentiment_dependent <- as.factor(data_2class_full$sentiment_dependent)
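For reference, a sketch of the more direct subsetting mentioned above, indexing the sentiment column itself and dropping the unused level (data_2class_alt is a hypothetical name):
data_2class_alt <- bi_stockDf[bi_stockDf$sentiment_dependent %in% c("positive", "negative"), ]
data_2class_alt$sentiment_dependent <- droplevels(data_2class_alt$sentiment_dependent)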
Let's split our data into training and testing sets with an 85/15 split.
set.seed(2)
sub <- createDataPartition(y = data_2class_full$sentiment_dependent, # target variable
p = 0.85,#85% will be used for training, the remaining 15% for testing
list = FALSE)
Let's subset the rows of data_2class_full to create the training data frame; rows not in sub will belong to the testing data frame.
bi_train <- data_2class_full[sub, ]
bi_test <- data_2class_full[-sub, ]
Desc(bi_train$sentiment_dependent)
## ------------------------------------------------------------------------------
## bi_train$sentiment_dependent (factor - dichotomous)
##
## length n NAs unique
## 745 745 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## negative 296 39.7% 36.3% 43.3%
## positive 449 60.3% 56.7% 63.7%
##
## ' 95%-CI (Wilson)
The frequency of positive (60.3%) is slightly higher than negative (39.7%). Class imbalance doesn't seem to be an issue here.
PlotMiss(x = data_2class_full,
main = "Missing Values by Variable")
# There should be no NAs after our transformation.
Initialize the random seed:
set.seed(2)
rf_mod <- randomForest(formula = sentiment_dependent ~. , # use all other variables to predict sentiment
data = bi_train%>%select(-`break`), # training data
importance = TRUE, # obtain variable importance
ntree = 500) # number of trees in forest
rf_mod
##
## Call:
## randomForest(formula = sentiment_dependent ~ ., data = bi_train %>% select(-`break`), importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 38.52%
## Confusion matrix:
## negative positive class.error
## negative 63 233 0.7871622
## positive 54 395 0.1202673
Variable Importance Plot: We can view the most important variables in the Random Forest model using the varImpPlot() function
varImpPlot(x = rf_mod, # randomForest object
main = "Variable Importance Plot") # title
Training Performance: We use the predict() function to generate class predictions for our training set. Note that calling predict() on a randomForest object without newdata returns out-of-bag (OOB) predictions, which is why the accuracy below matches 1 minus the OOB error rate.
base.RFpreds <- predict(object = rf_mod, # RF model
type = "class") # class predictions
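A sketch contrasting the two kinds of training-set predictions (resub_preds is a hypothetical name; the second call scores the exact rows the trees were fit on, so it is over-optimistic):
oob_preds   <- predict(rf_mod, type = "class")                      # out-of-bag predictions
resub_preds <- predict(rf_mod, newdata = bi_train, type = "class")  # resubstitution predictions
mean(oob_preds == bi_train$sentiment_dependent)   # equals 1 - OOB error (0.6148 here)
mean(resub_preds == bi_train$sentiment_dependent) # typically close to 1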
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and performance measures for our model applied to the training dataset (train).
RF_btrain_conf <- confusionMatrix(data = base.RFpreds, # predictions
reference = bi_train$sentiment_dependent, # actual
positive = "positive",
mode = "everything")
RF_btrain_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 63 54
## positive 233 395
##
## Accuracy : 0.6148
## 95% CI : (0.5788, 0.6499)
## No Information Rate : 0.6027
## P-Value [Acc > NIR] : 0.2628
##
## Kappa : 0.1032
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8797
## Specificity : 0.2128
## Pos Pred Value : 0.6290
## Neg Pred Value : 0.5385
## Precision : 0.6290
## Recall : 0.8797
## F1 : 0.7335
## Prevalence : 0.6027
## Detection Rate : 0.5302
## Detection Prevalence : 0.8430
## Balanced Accuracy : 0.5463
##
## 'Positive' Class : positive
##
Testing Performance: We use the predict() function to generate class predictions for our testing set.
base.teRFpreds <- predict(object = rf_mod, # RF model
newdata = bi_test, # testing data
type = "class")
Convert sentiment_dependent in the full stock_data_frame to a factor; we will need it later as the three-class reference.
stock_data_frame$sentiment_dependent <- as.factor(stock_data_frame$sentiment_dependent)
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and performance measures for our model applied to the testing dataset (test).
RF_btest_conf <- confusionMatrix(data = base.teRFpreds, # predictions
reference = bi_test$sentiment_dependent, # actual
positive = "positive",
mode = "everything")
RF_btest_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 11 8
## positive 41 71
##
## Accuracy : 0.626
## 95% CI : (0.5372, 0.7089)
## No Information Rate : 0.6031
## P-Value [Acc > NIR] : 0.3295
##
## Kappa : 0.1237
##
## Mcnemar's Test P-Value : 4.844e-06
##
## Sensitivity : 0.8987
## Specificity : 0.2115
## Pos Pred Value : 0.6339
## Neg Pred Value : 0.5789
## Precision : 0.6339
## Recall : 0.8987
## F1 : 0.7435
## Prevalence : 0.6031
## Detection Rate : 0.5420
## Detection Prevalence : 0.8550
## Balanced Accuracy : 0.5551
##
## 'Positive' Class : positive
##
The model does a much better job at predicting the positive class (recall ≈ 0.90) than the negative class (specificity ≈ 0.21), and its accuracy is only marginally above the no-information rate (p = 0.33). Whether it beats a bagging model would require fitting one for comparison.
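Since pROC is already loaded, we can also summarize threshold-independent test performance with an ROC curve and its AUC. A minimal sketch (rf_probs and rf_roc are new names):
rf_probs <- predict(rf_mod, newdata = bi_test, type = "prob")[, "positive"]
rf_roc <- roc(response = bi_test$sentiment_dependent, # actual labels
              predictor = rf_probs,                   # predicted P(positive)
              levels = c("negative", "positive"))     # controls, cases
auc(rf_roc) # area under the ROC curve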
GOODNESS OF FIT
To assess whether the model is balanced, underfitting, or overfitting, we compare its performance on the training and testing sets. We can use the cbind() function to compare them side by side.
Overall
cbind(Training = RF_btrain_conf$overall,
Testing = RF_btest_conf$overall)
## Training Testing
## Accuracy 6.147651e-01 6.259542e-01
## Kappa 1.032040e-01 1.236860e-01
## AccuracyLower 5.787502e-01 5.371789e-01
## AccuracyUpper 6.498676e-01 7.088988e-01
## AccuracyNull 6.026846e-01 6.030534e-01
## AccuracyPValue 2.627703e-01 3.295115e-01
## McnemarPValue 8.019777e-26 4.844104e-06
Class-Level
cbind(Training = RF_btrain_conf$byClass,
Testing = RF_btest_conf$byClass
)
## Training Testing
## Sensitivity 0.8797327 0.8987342
## Specificity 0.2128378 0.2115385
## Pos Pred Value 0.6289809 0.6339286
## Neg Pred Value 0.5384615 0.5789474
## Precision 0.6289809 0.6339286
## Recall 0.8797327 0.8987342
## F1 0.7335190 0.7434555
## Prevalence 0.6026846 0.6030534
## Detection Rate 0.5302013 0.5419847
## Detection Prevalence 0.8429530 0.8549618
## Balanced Accuracy 0.5462853 0.5551363
options(scipen = 999)
D.1.2: Tuned Model
HYPERPARAMETER TUNING
We will tune the number of variables randomly sampled as candidates at each split (m, the mtry argument).
We use the tuneRF() function from the randomForest package. The output is a plot, from which we choose the mtry with the smallest OOB error. By setting doBest = TRUE, tuneRF() automatically fits an RF model with the best mtry.
set.seed(2) # initialize random seed
tuneR <- tuneRF(x = bi_train[, -1], # use all variables except the 1st (sentiment_dependent) as predictors
y = bi_train$sentiment_dependent, # use sentiment_dependent as the target variable
ntreeTry = 500, # 500 trees in the forest
doBest = TRUE) # use the best m (mtry) value to create an RF model
## mtry = 8 OOB error = 39.33%
## Searching left ...
## mtry = 4 OOB error = 39.73%
## -0.01023891 0.05
## Searching right ...
## mtry = 16 OOB error = 40.94%
## -0.04095563 0.05
mtry = 8 gives the lowest OOB error.
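For reference, tuneRF()'s search can also be controlled explicitly; a sketch with its main tuning arguments spelled out (stepFactor and improve shown at their defaults):
tuneRF(x = bi_train[, -1],
       y = bi_train$sentiment_dependent,
       ntreeTry = 500,
       stepFactor = 2,  # multiply/divide mtry by this factor at each step
       improve = 0.05,  # minimum relative OOB improvement to keep searching
       doBest = FALSE)  # FALSE returns the OOB-error table instead of a fitted model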
View basic model information
tuneR
##
## Call:
## randomForest(x = x, y = y, mtry = res[which.min(res[, 2]), 1])
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 38.52%
## Confusion matrix:
## negative positive class.error
## negative 64 232 0.7837838
## positive 55 394 0.1224944
View variable importance for the tuned model
varImpPlot(tuneR)
sto and trad are the 2 most important variables
Training Performance: We use the predict() function to generate class predictions for our training set.
tune.trRFpreds <- predict(object = tuneR, # tuned RF model
type = "class") # class predictions
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and performance measures for our model applied to the training dataset (train).
RF_ttrain_conf <- confusionMatrix(data = tune.trRFpreds, # predictions
reference = bi_train$sentiment_dependent, # actual
positive = "positive",
mode = "everything")
RF_ttrain_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 64 55
## positive 232 394
##
## Accuracy : 0.6148
## 95% CI : (0.5788, 0.6499)
## No Information Rate : 0.6027
## P-Value [Acc > NIR] : 0.2628
##
## Kappa : 0.1044
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.8775
## Specificity : 0.2162
## Pos Pred Value : 0.6294
## Neg Pred Value : 0.5378
## Precision : 0.6294
## Recall : 0.8775
## F1 : 0.7330
## Prevalence : 0.6027
## Detection Rate : 0.5289
## Detection Prevalence : 0.8403
## Balanced Accuracy : 0.5469
##
## 'Positive' Class : positive
##
Testing Performance: We use the predict() function to generate class predictions for our testing set.
tune.teRFpreds <- predict(object = tuneR, # tuned RF model
newdata = bi_test, # testing data
type = "class")
We can use the confusionMatrix() function from the caret package to obtain a confusion matrix and performance measures for our model applied to the testing dataset (test).
RF_ttest_conf <- confusionMatrix(data = tune.teRFpreds, # predictions
reference = bi_test$sentiment_dependent, # actual
positive = "positive",
mode = "everything")
RF_ttest_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 11 8
## positive 41 71
##
## Accuracy : 0.626
## 95% CI : (0.5372, 0.7089)
## No Information Rate : 0.6031
## P-Value [Acc > NIR] : 0.3295
##
## Kappa : 0.1237
##
## Mcnemar's Test P-Value : 0.000004844
##
## Sensitivity : 0.8987
## Specificity : 0.2115
## Pos Pred Value : 0.6339
## Neg Pred Value : 0.5789
## Precision : 0.6339
## Recall : 0.8987
## F1 : 0.7435
## Prevalence : 0.6031
## Detection Rate : 0.5420
## Detection Prevalence : 0.8550
## Balanced Accuracy : 0.5551
##
## 'Positive' Class : positive
##
GOODNESS OF FIT
To assess whether the model is balanced, underfitting, or overfitting, we compare its performance on the training and testing sets. We can use the cbind() function to compare them side by side.
Overall
cbind(Training = RF_ttrain_conf$overall,
Testing = RF_ttest_conf$overall)
## Training Testing
## Accuracy 0.6147651006711409849003757699393 0.625954198473
## Kappa 0.1043535084008092167495718172177 0.123686006826
## AccuracyLower 0.5787501876571911063606989955588 0.537178866741
## AccuracyUpper 0.6498675860931428793776376551250 0.708898846994
## AccuracyNull 0.6026845637583893022792835836299 0.603053435115
## AccuracyPValue 0.2627703263981236991853052131773 0.329511533085
## McnemarPValue 0.0000000000000000000000002783966 0.000004844104
Class-Level
cbind(Training = RF_ttrain_conf$byClass,
Testing = RF_ttest_conf$byClass)
## Training Testing
## Sensitivity 0.8775056 0.8987342
## Specificity 0.2162162 0.2115385
## Pos Pred Value 0.6293930 0.6339286
## Neg Pred Value 0.5378151 0.5789474
## Precision 0.6293930 0.6339286
## Recall 0.8775056 0.8987342
## F1 0.7330233 0.7434555
## Prevalence 0.6026846 0.6030534
## Detection Rate 0.5288591 0.5419847
## Detection Prevalence 0.8402685 0.8549618
## Balanced Accuracy 0.5468609 0.5551363
D.1.3: Feature Selection
Let's try feature selection to reduce model training time and complexity, for easier interpretation and possibly improved accuracy.
Let’s first subset the most important variables.
FS_Vars <-importance(x = rf_mod)
FS_Vars
## negative positive MeanDecreaseAccuracy MeanDecreaseGini
## mar -2.784410479 -3.50963230 -4.2046440 2.4310107
## econ 0.625123341 12.00979817 8.7223171 2.3006247
## sto 1.875828428 -2.31785797 -0.6974700 4.6525470
## stockmar 3.558115745 -2.65489383 0.5193733 2.3679024
## hig -4.853716602 -0.17530775 -3.3053070 1.5159050
## look 1.879015410 1.17048732 1.8774243 1.5005360
## mill 0.474940635 -2.21929919 -1.3583480 1.8067329
## anot -4.822830703 1.15074405 -2.0117834 1.3388384
## have 2.603544067 1.03802415 2.2472601 4.1276600
## busin 8.166787629 -0.81790076 5.0313761 2.3846098
## invest 5.316792610 -5.82635429 -0.6765620 2.7419676
## earni -2.614407721 -3.98201490 -4.3093583 1.5263246
## analy -3.600248900 -6.43739257 -6.6906258 1.1916317
## equ -9.013011101 4.39363318 -1.9991875 1.7038612
## trad 7.316511991 -1.01116824 5.3294013 4.5847394
## bitc 5.690162670 4.06274735 6.3073567 1.9950448
## futu -6.277418305 17.22908459 9.2456157 2.3381413
## posit 2.351936897 0.03946368 1.6377981 2.4194519
## eur -6.886047647 6.21258672 1.2453349 1.1515346
## pro 0.508756597 5.28725242 4.1450166 1.7364872
## lev -5.538834991 -2.71085630 -4.9675415 1.0318473
## nas -4.649522597 -4.39791803 -6.1382909 1.2971945
## resista -2.546447865 2.25927028 0.2177671 0.9881684
## supp -6.529607844 2.84515043 -2.1118638 1.6091228
## bull 11.099508697 8.66983181 12.1114850 3.3293148
## rec -1.684773525 -0.59539173 -1.2798025 1.2780544
## exp -0.441171133 -2.25381181 -1.8767190 1.1693196
## coronavi -1.291921856 -4.37214673 -4.0235079 1.9866070
## daytrad -5.512342420 11.27781577 5.0252921 1.5086057
## fina 7.957395623 7.14896249 10.3272171 2.9862391
## mark 0.251166322 7.68908174 5.7719282 3.0996167
## resea -2.858908269 -3.63975385 -4.1283670 0.8435115
## resu -0.907228222 -6.90909470 -5.9689034 1.2300167
## optionstrad 2.846404496 1.02439962 2.1496776 0.5223776
## faceb 11.961097112 1.42772407 8.4935155 1.6707535
## opti 3.435526821 1.64270250 3.0432619 2.2367396
## inv 2.850604893 4.65952077 5.0369747 2.2491142
## inves -5.503259640 11.17222122 5.0182183 1.0316223
## investm -5.220115476 14.14267239 8.4998938 2.1585335
## poi -0.155479900 1.49552451 0.9825917 1.7512931
## rat 0.018347017 -3.22407316 -2.3722712 1.4365826
## tar 5.947641618 -7.46833842 -1.1236748 1.3976800
## ama -1.205191263 2.49065411 1.2317092 2.4570262
## ret -1.189196250 -3.62998030 -3.3014998 1.5400635
## buy -4.120390094 -6.76005660 -6.8785436 1.3097667
## wee 0.739961580 2.04332384 2.0138873 1.5056357
## opt -3.562017951 -2.36665554 -3.4953458 1.1574351
## com -0.088522415 -1.39501088 -1.0573145 1.5938141
## str 0.427849306 -5.52507871 -3.3584488 1.4699785
## cry -1.916994761 5.13626639 1.9449523 1.4182489
## micro -3.283108140 0.14542019 -1.7917710 1.5184508
## can 9.703089010 10.25527455 11.7932745 3.0948515
## sha 11.954954591 13.50572538 15.6751493 4.1621046
## clo 6.109998529 3.47282380 6.0702867 2.2903388
## sell 7.564823600 5.14681864 7.5077984 2.2608539
## comp -0.605608948 -1.31175923 -1.3387712 1.7177340
## upd -2.400338398 4.42357253 1.8156016 1.4992714
## tra -0.870838875 2.81376491 1.3210779 1.2716585
## gro -4.539932585 -3.23669473 -4.6508827 0.9984049
## morn -2.391723124 2.88282131 0.9887291 1.6399219
## inter 14.075117541 -1.57855277 8.2827958 2.4611588
## stockstowa -2.951797057 -1.81820511 -3.0015462 0.5366252
## financ -1.853583250 4.86348885 2.2040749 1.5960183
## mon 0.006652518 5.32680738 3.5452578 2.0466248
## bou -2.626125450 -0.05113928 -1.4458100 0.9272763
## cha -0.491763843 2.31419526 1.1549019 1.3717510
Subsetting the top 20 predictors based on the Gini index (MeanDecreaseGini):
FS_20 <- rownames(as.data.frame(FS_Vars) %>% arrange(-MeanDecreaseGini))[1:20]
Analysis
set.seed(2)
rf_FS <- randomForest(formula = sentiment_dependent~ . ,
data = bi_train%>% select(sentiment_dependent,all_of(FS_20)),
importance = TRUE,
ntree = 500)
View the basic output
rf_FS
##
## Call:
## randomForest(formula = sentiment_dependent ~ ., data = bi_train %>% select(sentiment_dependent, all_of(FS_20)), importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 38.52%
## Confusion matrix:
## negative positive class.error
## negative 35 261 0.88175676
## positive 26 423 0.05790646
Training Performance
FS.RFpreds <- predict(object = rf_FS,
type = "class")
RF_FS_conf <- confusionMatrix(data = FS.RFpreds,
reference = bi_train$sentiment_dependent,
positive = "positive",
mode = "everything")
RF_FS_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 35 26
## positive 261 423
##
## Accuracy : 0.6148
## 95% CI : (0.5788, 0.6499)
## No Information Rate : 0.6027
## P-Value [Acc > NIR] : 0.2628
##
## Kappa : 0.0698
##
## Mcnemar's Test P-Value : <0.0000000000000002
##
## Sensitivity : 0.9421
## Specificity : 0.1182
## Pos Pred Value : 0.6184
## Neg Pred Value : 0.5738
## Precision : 0.6184
## Recall : 0.9421
## F1 : 0.7467
## Prevalence : 0.6027
## Detection Rate : 0.5678
## Detection Prevalence : 0.9181
## Balanced Accuracy : 0.5302
##
## 'Positive' Class : positive
##
Testing Performance
FS.teRFpreds <- predict(object = rf_FS,
newdata = bi_test,
type = "class")
RF_btest_FS_conf <- confusionMatrix(data = FS.teRFpreds,
reference = bi_test$sentiment_dependent,
positive = "positive",
mode = "everything")
RF_btest_FS_conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 7 3
## positive 45 76
##
## Accuracy : 0.6336
## 95% CI : (0.545, 0.716)
## No Information Rate : 0.6031
## P-Value [Acc > NIR] : 0.2673
##
## Kappa : 0.1121
##
## Mcnemar's Test P-Value : 0.000000003262
##
## Sensitivity : 0.9620
## Specificity : 0.1346
## Pos Pred Value : 0.6281
## Neg Pred Value : 0.7000
## Precision : 0.6281
## Recall : 0.9620
## F1 : 0.7600
## Prevalence : 0.6031
## Detection Rate : 0.5802
## Detection Prevalence : 0.9237
## Balanced Accuracy : 0.5483
##
## 'Positive' Class : positive
##
Performance is roughly the same as the base model
GOODNESS OF FIT
options(scipen = 30)
Overall
cbind(Training = RF_FS_conf$overall,
Testing = RF_btest_FS_conf$overall)
## Training
## Accuracy 0.6147651006711409849003757699392735958099365234375
## Kappa 0.0697750301279512763485968207533005625009536743164
## AccuracyLower 0.5787501876571911063606989955587778240442276000977
## AccuracyUpper 0.6498675860931428793776376551249995827674865722656
## AccuracyNull 0.6026845637583893022792835836298763751983642578125
## AccuracyPValue 0.2627703263981236991853052131773438304662704467773
## McnemarPValue 0.0000000000000000000000000000000000000000002140169
## Testing
## Accuracy 0.633587786259542
## Kappa 0.112115221688788
## AccuracyLower 0.544979615778526
## AccuracyUpper 0.716010484285256
## AccuracyNull 0.603053435114504
## AccuracyPValue 0.267298899981495
## McnemarPValue 0.000000003261965
options(scipen = 9)
Class-Level
cbind(Training = RF_FS_conf$byClass,
Testing = RF_btest_FS_conf$byClass)
## Training Testing
## Sensitivity 0.9420935 0.9620253
## Specificity 0.1182432 0.1346154
## Pos Pred Value 0.6184211 0.6280992
## Neg Pred Value 0.5737705 0.7000000
## Precision 0.6184211 0.6280992
## Recall 0.9420935 0.9620253
## F1 0.7466902 0.7600000
## Prevalence 0.6026846 0.6030534
## Detection Rate 0.5677852 0.5801527
## Detection Prevalence 0.9181208 0.9236641
## Balanced Accuracy 0.5301684 0.5483204
D.1.4: Utilizing Random Forest Probability Calculation To Keep A Bigger Sample Size.
In this section, we keep all 3 classes in the data, since removing the neutral rows shrinks the sample considerably (876 obs. compared to 1300 obs.). We still want all 3 classes available for evaluation.
data_3classes <- bi_stockDf[complete.cases(bi_stockDf), ]
Let’s predict on the testing data with 3 classes and use probability mode.
base.teRf3class <- predict(object = rf_mod, # RF Base model
newdata = data_3classes, # testing data
type = "prob" )#probability mode instead of class mode
View(base.teRf3class)
Let's set thresholds for our predictions. After trying various combinations, we map the predicted probability of the positive class to 3 labels: below 25%: negative; from 25% to 85%: neutral; above 85%: positive.
base.teRf3class <- as.data.frame(base.teRf3class) %>%
mutate(prediction=ifelse(positive > 0.85, "positive",
ifelse(positive < 0.25, "negative","neutral")))
View(base.teRf3class)
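The same mapping reads a bit more clearly with dplyr's case_when(); a sketch equivalent to the nested ifelse() above:
base.teRf3class <- as.data.frame(base.teRf3class) %>%
  mutate(prediction = case_when(positive > 0.85 ~ "positive", # high P(positive)
                                positive < 0.25 ~ "negative", # low P(positive)
                                TRUE            ~ "neutral")) # everything in between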
The Confusion Matrix Performance
base.teRf3class$prediction <- as.factor(base.teRf3class$prediction)
data_3classes$sentiment_dependent <- as.factor(data_3classes$sentiment_dependent)
RF_CFM_3class <- confusionMatrix(data = base.teRf3class$prediction, # predictions
reference = stock_data_frame$sentiment_dependent, # actual original
positive = "positive",
mode = "everything")
RF_CFM_3class$overall
## Accuracy Kappa AccuracyLower AccuracyUpper AccuracyNull
## 4.346154e-01 1.057624e-01 4.074582e-01 4.620689e-01 4.061538e-01
## AccuracyPValue McnemarPValue
## 1.988980e-02 1.993520e-59
RF_CFM_3class$byClass
## Sensitivity Specificity Pos Pred Value Neg Pred Value Precision
## Class: negative 0.1379310 0.9884454 0.8135593 0.7582595 0.8135593
## Class: neutral 0.4198113 0.6073059 0.3409962 0.6838046 0.3409962
## Class: positive 0.6420455 0.5077720 0.4714882 0.6746988 0.4714882
## Recall F1 Prevalence Detection Rate
## Class: negative 0.1379310 0.2358722 0.2676923 0.03692308
## Class: neutral 0.4198113 0.3763214 0.3261538 0.13692308
## Class: positive 0.6420455 0.5437049 0.4061538 0.26076923
## Detection Prevalence Balanced Accuracy
## Class: negative 0.04538462 0.5631882
## Class: neutral 0.40153846 0.5135586
## Class: positive 0.55307692 0.5749087
As anticipated, with 3 classes in the target variable the performance is not as good as the two-class base model.
D.2: Lexicons
Define a pre-processing function that bundles the cleaning steps above, then apply it to the raw tweet text and save the output to my_data_clean.
pre_processing_data_frame_fct <- function(text_column){
text_column <- tolower(text_column) # bring all the words to lower case
text_column <- gsub('[[:digit:]]','',text_column) # remove numbers
text_column <- gsub(paste(stopwords('en'), collapse = '\\b|\\b'),'',text_column) # remove stopwords
text_column <- gsub('[[:punct:]]','',text_column) # remove punctuation
text_column <- gsub('\\s+',' ',text_column) # remove white space
text_column <- lemmatize_strings(text_column) # lemmatize text
corp <- Corpus(VectorSource(text_column)) # transform to corpus
return(corp)
}
my_data_clean <- pre_processing_data_frame_fct(tweets_labelled$text)
my_data_clean
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 1300
Transform the clean data into a term document matrix
my_tdm <- TermDocumentMatrix(my_data_clean)
This gives a row for each term-document combination and the number of times each term appears in each document
tidy_frame <- tidy(my_tdm)
head(tidy_frame)
## # A tibble: 6 x 3
## term document count
## <chr> <chr> <dbl>
## 1 amp 1 1
## 2 can 1 1
## 3 crash 1 1
## 4 enter 1 1
## 5 learn 1 1
## 6 market 1 1
str(tidy_frame) #
## tibble [18,218 x 3] (S3: tbl_df/tbl/data.frame)
## $ term : chr [1:18218] "amp" "can" "crash" "enter" ...
## $ document: chr [1:18218] "1" "1" "1" "1" ...
## $ count : num [1:18218] 1 1 1 1 1 1 1 1 1 1 ...
Categorical sentiment assignment: Bing
sentiment_bing <- get_sentiments("bing")
sentiment_bing
## # A tibble: 6,786 x 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ... with 6,776 more rows
Loughran
sentiment_loughran <- get_sentiments("loughran")
sentiment_loughran
## # A tibble: 3,850 x 2
## word sentiment
## <chr> <chr>
## 1 abandon negative
## 2 abandoned negative
## 3 abandoning negative
## 4 abandonment negative
## 5 abandonments negative
## 6 abandons negative
## 7 abdicated negative
## 8 abdicates negative
## 9 abdicating negative
## 10 abdication negative
## # ... with 3,840 more rows
Numerical value sentiment assignment: Afinn
sentiment_afinn <- get_sentiments("afinn")
sentiment_afinn
## # A tibble: 2,477 x 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ... with 2,467 more rows
Feature engineering: recode each lexicon's sentiment labels as numeric scores so we can later merge them. Bing:
sentiment_bing <- sentiment_bing %>% rename(score_bing = sentiment) %>%
mutate(score_bing = ifelse(score_bing == "negative", -1, 1))
head(sentiment_bing)
## # A tibble: 6 x 2
## word score_bing
## <chr> <dbl>
## 1 2-faces -1
## 2 abnormal -1
## 3 abolish -1
## 4 abominable -1
## 5 abominably -1
## 6 abominate -1
tail(sentiment_bing)
## # A tibble: 6 x 2
## word score_bing
## <chr> <dbl>
## 1 zealous -1
## 2 zealously -1
## 3 zenith 1
## 4 zest 1
## 5 zippy 1
## 6 zombie -1
Loughran
unique(sentiment_loughran$sentiment)
## [1] "negative" "positive" "uncertainty" "litigious" "constraining"
## [6] "superfluous"
sentiment_loughran <- sentiment_loughran %>% rename(score_loughran = sentiment) %>%
filter(score_loughran %in% c("negative", "positive")) %>%
mutate(score_loughran = ifelse(score_loughran == "negative", -1, 1))
head(sentiment_loughran)
## # A tibble: 6 x 2
## word score_loughran
## <chr> <dbl>
## 1 abandon -1
## 2 abandoned -1
## 3 abandoning -1
## 4 abandonment -1
## 5 abandonments -1
## 6 abandons -1
tail(sentiment_loughran)
## # A tibble: 6 x 2
## word score_loughran
## <chr> <dbl>
## 1 ,82774 1
## 2 ,82791 1
## 3 ,82792 1
## 4 ,83670 1
## 5 ,83671 1
## 6 ,83672 1
Afinn
sentiment_afinn <- sentiment_afinn %>% rename(score_afinn = value)
head(sentiment_afinn)
## # A tibble: 6 x 2
## word score_afinn
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
tail(sentiment_afinn)
## # A tibble: 6 x 2
## word score_afinn
## <chr> <dbl>
## 1 youthful 2
## 2 yucky -2
## 3 yummy 3
## 4 zealot -2
## 5 zealots -2
## 6 zealous 2
Put the scores from all 3 dictionaries in 1 data frame
sentiments <- full_join(sentiment_bing, sentiment_loughran, by = c("word" = "word"))
sentiments <- full_join(sentiments, sentiment_afinn, by = c("word" = "word"))
head(sentiments)
## # A tibble: 6 x 4
## word score_bing score_loughran score_afinn
## <chr> <dbl> <dbl> <dbl>
## 1 2-faces -1 NA NA
## 2 abnormal -1 -1 NA
## 3 abolish -1 -1 NA
## 4 abominable -1 NA NA
## 5 abominably -1 NA NA
## 6 abominate -1 NA NA
tail(sentiments)
## # A tibble: 6 x 4
## word score_bing score_loughran score_afinn
## <chr> <dbl> <dbl> <dbl>
## 1 yearning NA NA 1
## 2 yeees NA NA 2
## 3 yes NA NA 1
## 4 yucky NA NA -2
## 5 yummy NA NA 3
## 6 zealots NA NA -2
We can use the tidy frame to bring the sentiment scores in: the tidy_frame and sentiments data sets share a key column (term in one, word in the other).
head(tidy_frame)
## # A tibble: 6 x 3
## term document count
## <chr> <chr> <dbl>
## 1 amp 1 1
## 2 can 1 1
## 3 crash 1 1
## 4 enter 1 1
## 5 learn 1 1
## 6 market 1 1
head(sentiments)
## # A tibble: 6 x 4
## word score_bing score_loughran score_afinn
## <chr> <dbl> <dbl> <dbl>
## 1 2-faces -1 NA NA
## 2 abnormal -1 -1 NA
## 3 abolish -1 -1 NA
## 4 abominable -1 NA NA
## 5 abominably -1 NA NA
## 6 abominate -1 NA NA
Merge the tidy_frame and sentiments data sets, joining tidy_frame's term column to sentiments' word column. Keep all the rows from tidy_frame and bring in the matching scores from sentiments.
my_sentiments <- left_join(tidy_frame, sentiments, by = c("term" = "word"))
my_sentiments <- my_sentiments %>% arrange(document, term) # sort the data by the document and term columns
head(my_sentiments)
## # A tibble: 6 x 6
## term document count score_bing score_loughran score_afinn
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ’ 1 1 NA NA NA
## 2 … 1 1 NA NA NA
## 3 amp 1 1 NA NA NA
## 4 can 1 1 NA NA NA
## 5 crash 1 1 -1 NA -2
## 6 enter 1 1 NA NA NA
str(my_sentiments)
## tibble [18,218 x 6] (S3: tbl_df/tbl/data.frame)
## $ term : chr [1:18218] "’" "…" "amp" "can" ...
## $ document : chr [1:18218] "1" "1" "1" "1" ...
## $ count : num [1:18218] 1 1 1 1 1 1 1 1 1 1 ...
## $ score_bing : num [1:18218] NA NA NA NA -1 NA NA NA NA NA ...
## $ score_loughran: num [1:18218] NA NA NA NA NA NA NA NA NA NA ...
## $ score_afinn : num [1:18218] NA NA NA NA -2 NA NA NA NA NA ...
Bring the labelled sentiment column into my_sentiments: in the initial dataset, tweets_labelled, create a column holding the row number; this equals the document number, and we use it as the key to join the sentiment labels onto the data frame of sentiment scores.
tweets_labelled <- tweets_labelled %>% mutate(document = row_number())
my_sentiments <- my_sentiments %>% mutate(document = as.integer(document))
my_sentiments <- full_join(my_sentiments, tweets_labelled %>% select(document, sentiment),
by = c("document" = "document"))
Replace all NAs in the dictionary columns with 0
my_sentiments <- my_sentiments %>% mutate_at(vars(score_bing, score_loughran, score_afinn), ~ if_else(is.na(.), 0, .))
head(my_sentiments)
## # A tibble: 6 x 7
## term document count score_bing score_loughran score_afinn sentiment
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 ’ 1 1 0 0 0 positive
## 2 … 1 1 0 0 0 positive
## 3 amp 1 1 0 0 0 positive
## 4 can 1 1 0 0 0 positive
## 5 crash 1 1 -1 0 -2 positive
## 6 enter 1 1 0 0 0 positive
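As an aside, mutate_at() is superseded in newer versions of dplyr; the equivalent call with across() would be (a sketch; my_sentiments_filled is a hypothetical name):
my_sentiments_filled <- my_sentiments %>%
  mutate(across(c(score_bing, score_loughran, score_afinn),
                ~ if_else(is.na(.x), 0, .x))) # replace NA scores with 0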
Create a wordcloud. To do so, we need the list of words and how many times each shows up; we can use the term and count columns from my_sentiments. Currently my_sentiments holds counts per document, so let's sum count over all documents for each term.
cloud_data <- my_sentiments %>% group_by(term) %>% summarise(counts = sum(count))
cloud_data <- cloud_data %>% filter(!is.na(term)) # there is an NA in the term column; let's remove this row, otherwise you will have issues plotting the wordcloud
head(cloud_data %>% arrange(-counts)) # these are the most common words
## # A tibble: 6 x 2
## term counts
## <chr> <dbl>
## 1 stock 560
## 2 spx 269
## 3 aapl 182
## 4 trade 181
## 5 spy 170
## 6 amzn 159
wordcloud(words=cloud_data$term, freq=cloud_data$counts, random.order=FALSE, colors=brewer.pal(7, "Greens"), max.words = 70, min.freq = 20)
Calculate the lexicon scores per document per word: multiply the number of times each word shows up in a document by that word's score in each dictionary.
my_sentiments <- my_sentiments %>% mutate(score_bing = count * score_bing,
score_loughran = count * score_loughran,
score_afinn = count * score_afinn)
head(my_sentiments)
## # A tibble: 6 x 7
## term document count score_bing score_loughran score_afinn sentiment
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <fct>
## 1 ’ 1 1 0 0 0 positive
## 2 … 1 1 0 0 0 positive
## 3 amp 1 1 0 0 0 positive
## 4 can 1 1 0 0 0 positive
## 5 crash 1 1 -1 0 -2 positive
## 6 enter 1 1 0 0 0 positive
As of now, we have a score for each of the 3 dictionaries per word-document combination; what we need is a score per dictionary per document. To get there, we sum up the scores within each document.
my_sentiments <- my_sentiments %>% group_by(document, sentiment) %>%
summarise(sum_score_bing = sum(score_bing),
sum_score_loughran = sum(score_loughran),
sum_score_afinn = sum(score_afinn))
## `summarise()` has grouped output by 'document'. You can override using the
## `.groups` argument.
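The message about grouped output can be silenced by setting .groups explicitly; the call above could have been written as:
my_sentiments <- my_sentiments %>% group_by(document, sentiment) %>%
  summarise(sum_score_bing = sum(score_bing),
            sum_score_loughran = sum(score_loughran),
            sum_score_afinn = sum(score_afinn),
            .groups = "drop") # drop all grouping from the result (the original keeps grouping by document)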
head(my_sentiments)
## # A tibble: 6 x 5
## # Groups: document [6]
## document sentiment sum_score_bing sum_score_loughran sum_score_afinn
## <int> <fct> <dbl> <dbl> <dbl>
## 1 1 positive 0 0 2
## 2 2 negative -2 -1 -1
## 3 3 positive 0 0 -3
## 4 4 positive 0 0 2
## 5 5 positive 0 0 0
## 6 6 positive 0 0 -1
tail(my_sentiments)
## # A tibble: 6 x 5
## # Groups: document [6]
## document sentiment sum_score_bing sum_score_loughran sum_score_afinn
## <int> <fct> <dbl> <dbl> <dbl>
## 1 1295 neutral 1 0 0
## 2 1296 positive 3 0 2
## 3 1297 positive 0 0 1
## 4 1298 neutral 0 0 1
## 5 1299 positive 1 0 2
## 6 1300 negative 0 0 1
As we did with the ML classification models, we can remove the neutral class for better performance.
my_sentiments_2 <- my_sentiments
sentiment_lex <- my_sentiments_2[(my_sentiments_2$sentiment == "positive" | my_sentiments_2$sentiment == "negative"),]
sentiment_lex_full<- sentiment_lex[complete.cases(sentiment_lex), ]
Since the sentiment factor still has 3 levels, reconvert it to a factor so that R keeps only the 2 classes that remain:
sentiment_lex_full$sentiment <- as.character(sentiment_lex_full$sentiment)
sentiment_lex_full$sentiment <- as.factor(sentiment_lex_full$sentiment)
View(sentiment_lex_full)
We subset sentiment_lex_full to extract the 3 lexicon score columns, then map each total score to a predicted label. Note that a total score of exactly 0 is mapped to positive here.
sents_sub_2 <- sentiment_lex_full[ ,(ncol(sentiment_lex_full)-2):ncol(sentiment_lex_full)]
sents_sub_2 <- sents_sub_2 %>% mutate(sum_score_bing = ifelse(sum_score_bing >= 0, "positive", "negative")) %>%
mutate(sum_score_loughran = ifelse(sum_score_loughran >= 0, "positive", "negative")) %>%
mutate(sum_score_afinn = ifelse(sum_score_afinn >= 0, "positive", "negative"))
sents_sub_2 <- data.frame(lapply(X = sents_sub_2,
FUN = as.factor))
Loughran’s performance
loughran_cm_2 <- confusionMatrix(data = sents_sub_2$sum_score_loughran, reference = sentiment_lex_full$sentiment , positive = "positive", mode = "everything")
loughran_cm_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 143 37
## positive 205 491
##
## Accuracy : 0.7237
## 95% CI : (0.6928, 0.7531)
## No Information Rate : 0.6027
## P-Value [Acc > NIR] : 4.505e-14
##
## Kappa : 0.3714
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9299
## Specificity : 0.4109
## Pos Pred Value : 0.7055
## Neg Pred Value : 0.7944
## Precision : 0.7055
## Recall : 0.9299
## F1 : 0.8023
## Prevalence : 0.6027
## Detection Rate : 0.5605
## Detection Prevalence : 0.7945
## Balanced Accuracy : 0.6704
##
## 'Positive' Class : positive
##
Bing’s performance
bing_cm_2 <- confusionMatrix(data = sents_sub_2$sum_score_bing, reference = sentiment_lex_full$sentiment , positive = "positive", mode = "everything")
bing_cm_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 172 46
## positive 176 482
##
## Accuracy : 0.7466
## 95% CI : (0.7164, 0.7751)
## No Information Rate : 0.6027
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4348
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9129
## Specificity : 0.4943
## Pos Pred Value : 0.7325
## Neg Pred Value : 0.7890
## Precision : 0.7325
## Recall : 0.9129
## F1 : 0.8128
## Prevalence : 0.6027
## Detection Rate : 0.5502
## Detection Prevalence : 0.7511
## Balanced Accuracy : 0.7036
##
## 'Positive' Class : positive
##
Afinn performance
afinn_cm_2 <- confusionMatrix(data = sents_sub_2$sum_score_afinn, reference = sentiment_lex_full$sentiment , positive = "positive", mode = "everything")
afinn_cm_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction negative positive
## negative 157 26
## positive 191 502
##
## Accuracy : 0.7523
## 95% CI : (0.7223, 0.7806)
## No Information Rate : 0.6027
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4372
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9508
## Specificity : 0.4511
## Pos Pred Value : 0.7244
## Neg Pred Value : 0.8579
## Precision : 0.7244
## Recall : 0.9508
## F1 : 0.8223
## Prevalence : 0.6027
## Detection Rate : 0.5731
## Detection Prevalence : 0.7911
## Balanced Accuracy : 0.7010
##
## 'Positive' Class : positive
##
An alternative that keeps the full sample would be to map tweets_labelled's neutral sentiments to the positive class instead of dropping them, and rerun the same comparison for all 3 lexicons.
Comparing 3 lexicons:
cbind(loughran = loughran_cm_2$overall,
bing = bing_cm_2$overall,
afinn = afinn_cm_2$overall)
## loughran bing afinn
## Accuracy 7.237443e-01 7.465753e-01 7.522831e-01
## Kappa 3.714062e-01 4.348190e-01 4.372447e-01
## AccuracyLower 6.928437e-01 7.164006e-01 7.223058e-01
## AccuracyUpper 7.531315e-01 7.750811e-01 7.805525e-01
## AccuracyNull 6.027397e-01 6.027397e-01 6.027397e-01
## AccuracyPValue 4.504932e-14 2.273674e-19 7.464263e-21
## McnemarPValue 6.959334e-27 4.804795e-18 8.662444e-29
cbind(loughran = loughran_cm_2$byClass,
bing = bing_cm_2$byClass,
afinn = afinn_cm_2$byClass)
## loughran bing afinn
## Sensitivity 0.9299242 0.9128788 0.9507576
## Specificity 0.4109195 0.4942529 0.4511494
## Pos Pred Value 0.7054598 0.7325228 0.7243867
## Neg Pred Value 0.7944444 0.7889908 0.8579235
## Precision 0.7054598 0.7325228 0.7243867
## Recall 0.9299242 0.9128788 0.9507576
## F1 0.8022876 0.8128162 0.8222768
## Prevalence 0.6027397 0.6027397 0.6027397
## Detection Rate 0.5605023 0.5502283 0.5730594
## Detection Prevalence 0.7945205 0.7511416 0.7910959
## Balanced Accuracy 0.6704219 0.7035658 0.7009535