# This script introduces some ideas on term matching in the context of market sentiment analysis.
library(rvest)
library(tm)
library(ggplot2)
library(dplyr)
# Read in the most recent agrimoney.com overview page and extract the text of
# every node matched by the CSS selector below.
mw <- read_html("http://www.agrimoney.com/")
info <- mw %>%
  html_nodes("p , .brownlinklarge , .featuretextmedium , td td td td td") %>%
  html_text()
# Replace break characters (tabs, carriage returns, newlines) with a single
# space rather than deleting them: deleting fuses the words on either side of
# a line break into one token, which distorts the term counts computed later.
info <- gsub("[\t\r\n]+", " ", info)
# Build a text corpus from the scraped strings and a term-document matrix
# with punctuation removal, number removal and stopword filtering switched on
tdm_control <- list(
  removePunctuation = TRUE,
  removeNumbers = TRUE,
  stopwords = TRUE
)
corpus <- Corpus(VectorSource(info))
tdm <- TermDocumentMatrix(corpus, control = tdm_control)
# Sum each term's row of the matrix and list the 20 largest totals
m <- as.matrix(tdm)
v <- m %>%
  rowSums() %>%
  sort(decreasing = TRUE)
head(v, n = 20)
## futures production hedge output cocoa crop
## 10 5 4 4 3 3
## fall funds harvest high hurt last
## 3 3 3 3 3 3
## prices record stocks sugar wheat will
## 3 3 3 3 3 3
## argentine bearish
## 2 2
# Load the sentiment lexicon; blank cells are read as NA
senterms <- read.csv("senterms.csv", na.strings = "")
# Turn the named frequency vector into a data frame (counts in column `v`,
# the term itself in column `term`)
v <- as.data.frame(v)
v[["term"]] <- rownames(v)
# Total word count, the counts belonging to terms listed as negative/positive
# in the lexicon, and each of those as a share of the total
v %>%
  summarise(
    total_words = sum(v),
    neg = sum(v[term %in% senterms$negative]),
    pos = sum(v[term %in% senterms$positive]),
    negratio = round(neg / total_words, 4),
    posratio = round(pos / total_words, 4)
  )
## total_words neg pos negratio posratio
## 1 281 19 7 0.0676 0.0249
# Choose a term of interest and list the terms associated with it
toi <- "cocoa"   # term of interest
corlimit <- 0.2  # lower bound on the correlation for a term to be reported
assoc <- findAssocs(tdm, terms = toi, corlimit = corlimit)
# One row per associated term: the correlation goes in `corr`, and the row
# names (the terms) are copied into a regular column `terms`
toicor <- data.frame(assoc)
colnames(toicor)[1] <- "corr"
toicor[["terms"]] <- rownames(toicor)
toicor
## corr terms
## african 0.89 african
## amid 0.89 amid
## far 0.89 far
## lows 0.89 lows
## new 0.89 new
## plumb 0.89 plumb
## throw 0.89 throw
## towel 0.89 towel
## uncertainty 0.89 uncertainty
## week 0.89 week
## west 0.89 west
## extend 0.61 extend
## losses 0.61 losses
## futures 0.48 futures
## funds 0.47 funds
## production 0.46 production
## concerns 0.42 concerns
## handsomely 0.42 handsomely
## headway 0.42 headway
## helped 0.42 helped
## outperformed 0.42 outperformed
## post 0.42 post
## hedge 0.39 hedge
## weather 0.27 weather
# Count how many distinct entries of `terms` match at least one word of
# `lexicon`. Matching is by regular expression, so a lexicon word also matches
# any term that merely contains it.
count_lexicon_hits <- function(lexicon, terms) {
  # Blank csv cells are read as NA (na.strings = ""); paste() would turn them
  # into the literal alternative "NA" in the pattern, so drop them first.
  words <- as.character(lexicon)
  words <- words[!is.na(words) & nzchar(words)]
  # An empty alternation would match every term; report no hits instead.
  if (length(words) == 0) {
    return(0L)
  }
  pattern <- paste(words, collapse = "|")
  length(unique(grep(pattern, terms, value = TRUE)))
}

# Sentiment breakdown of the terms associated with the term of interest:
# number of terms, how many match the negative / positive lexicon, and each
# of those as a share of the total.
summarise(toicor,
  total = length(terms),
  neg = count_lexicon_hits(senterms$negative, terms),
  pos = count_lexicon_hits(senterms$positive, terms),
  negratio = round(neg / total, 4),
  posratio = round(pos / total, 4))
## total neg pos negratio posratio
## 1 24 2 2 0.0833 0.0833
# Dot plot: one point per associated term, positioned by its correlation
# with the term of interest
corr_axis_label <- paste0("Correlation with the term ", "\"", toi, "\"")
ggplot(toicor, aes(x = corr, y = terms)) +
  geom_point() +
  labs(x = corr_axis_label)