This script introduces some ideas on term matching in the context of market sentiment analysis.

  1. Load Libraries
library(rvest)
library(tm)
library(ggplot2)
library(dplyr)
  1. Prepare Term Document Matrix
# Read in the most recent agrimoney.com overview page and extract the text of
# the paragraph, headline-link, feature-text and nested table-cell nodes.
mw <- read_html("http://www.agrimoney.com/")
info <- mw %>%
  html_nodes("p , .brownlinklarge , .featuretextmedium , td td td td td") %>%
  html_text()

# Replace each run of tabs / line breaks with a single space. Deleting them
# outright (replacement "") would glue the words on either side of a break
# together into one bogus term in the matrix below.
info <- gsub("[\t\r\n]+", " ", info)

# Create the text corpus (each element of `info` becomes a document) and the
# term-document matrix used for analysis, dropping punctuation, numbers and
# the default stop words.
corpus <- Corpus(VectorSource(info))

tdm <- TermDocumentMatrix(corpus,
                          control = list(removePunctuation = TRUE,
                                         removeNumbers = TRUE,
                                         stopwords = TRUE))
  1. Examine top 20 words
# Convert the term-document matrix to a dense matrix (terms in rows), total
# each term's count across all documents, and show the 20 most frequent terms.
m <- as.matrix(tdm)
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
head(v, n = 20)
##    futures production      hedge     output      cocoa       crop 
##         10          5          4          4          3          3 
##       fall      funds    harvest       high       hurt       last 
##          3          3          3          3          3          3 
##     prices     record     stocks      sugar      wheat       will 
##          3          3          3          3          3          3 
##  argentine    bearish 
##          2          2

Sentiment Analysis

  1. Read in Positive and Negative sentiment terms
# Load the sentiment lexicon; empty cells are read in as NA.
senterms <- read.csv(file = "senterms.csv", na.strings = "")

# Turn the named term-count vector into a data frame with the counts in
# column `v` and the term itself (taken from the row names) in column `term`.
v <- as.data.frame(v)
v[["term"]] <- rownames(v)
  1. Summarise the overall webpage text by counting positive and negative terms and their ratio to the total word count (ignoring common “stop” words)
# One-row summary of the whole page: total term count, the counts of terms
# found in the negative / positive lexicon columns, and each count's share of
# the total (rounded to 4 decimals). Inside summarise(), `v` and `term` are
# the data-frame columns, and later columns reuse the ones computed before.
v %>%
  summarise(
    total_words = sum(v),
    neg = sum(v[term %in% senterms$negative]),
    pos = sum(v[term %in% senterms$positive]),
    negratio = round(neg / total_words, 4),
    posratio = round(pos / total_words, 4)
  )
##   total_words neg pos negratio posratio
## 1         281  19   7   0.0676   0.0249
  1. Look at correlations for a specific word; in this example we look at the terms that “cocoa” is associated with
# Choose the term of interest and the minimum correlation to report
toi <- "cocoa"
corlimit <- 0.2

# Find the terms associated with `toi` at or above `corlimit`, then shape the
# result into a data frame with the correlation in `corr` and the associated
# term (taken from the row names) in `terms`.
assoc <- findAssocs(tdm, toi, corlimit)
toicor <- data.frame(assoc)
colnames(toicor)[1] <- "corr"
toicor[["terms"]] <- rownames(toicor)
toicor
##              corr        terms
## african      0.89      african
## amid         0.89         amid
## far          0.89          far
## lows         0.89         lows
## new          0.89          new
## plumb        0.89        plumb
## throw        0.89        throw
## towel        0.89        towel
## uncertainty  0.89  uncertainty
## week         0.89         week
## west         0.89         west
## extend       0.61       extend
## losses       0.61       losses
## futures      0.48      futures
## funds        0.47        funds
## production   0.46   production
## concerns     0.42     concerns
## handsomely   0.42   handsomely
## headway      0.42      headway
## helped       0.42       helped
## outperformed 0.42 outperformed
## post         0.42         post
## hedge        0.39        hedge
## weather      0.27      weather
  1. Summarise the positive and negative terms among the correlated terms, as we did for the overall document
# Count the distinct entries of `candidates` matched by any of `words`.
#
# `words` are joined into a single regular expression of alternatives, so a
# sentiment word also matches when it appears as a substring of a candidate.
# Missing and blank entries are dropped first: `senterms` is read with
# na.strings = "", so its columns can contain NA, and paste() would otherwise
# turn each NA into a literal "NA" alternative in the pattern. With no usable
# words left the count is 0 (an empty pattern would match every candidate).
count_term_matches <- function(words, candidates) {
  words <- as.character(words)
  words <- words[!is.na(words) & nzchar(words)]
  if (length(words) == 0L) {
    return(0L)
  }
  pattern <- paste(words, collapse = "|")
  length(unique(grep(pattern, candidates, value = TRUE)))
}

# One-row summary of the correlated terms: how many there are, how many match
# a negative / positive sentiment word, and each count's share of the total
# (rounded to 4 decimals). `terms` is the column of `toicor`.
summarise(toicor,
          total = length(terms),
          neg = count_term_matches(senterms$negative, terms),
          pos = count_term_matches(senterms$positive, terms),
          negratio = round(neg / total, 4),
          posratio = round(pos / total, 4))
##   total neg pos negratio posratio
## 1    24   2   2   0.0833   0.0833
  1. View Correlation Graph
# Dot plot of the associated terms (y axis) against their correlation with
# the term of interest (x axis).
ggplot(data = toicor, mapping = aes(x = corr, y = terms)) +
  geom_point() +
  labs(x = paste0("Correlation with the term ", "\"", toi, "\""))