Term Analysis Ideas

This script introduces some ideas on term matching in the context of market sentiment analysis.

Sentiment Analysis

Read in Positive and Negative sentiment terms

# Load the sentiment term lists; blank cells are read as NA.
senterms <- read.csv(file = "senterms.csv", na.strings = "")

# Convert v to a data frame and copy its row names into a `term` column so
# that terms can be matched against the sentiment lists.
v <- as.data.frame(v)
v[["term"]] <- rownames(v)

Summarise the overall webpage text in terms of positive/negative terms and their ratio to the overall text (ignoring common “stop” words)

# One-row summary of the document: the overall total, the totals for terms
# found in the negative / positive sentiment lists, and each of those as a
# share of the overall total (rounded to 4 decimals).
# Terms are matched exactly via %in%. The ratio columns reuse the `neg`, `pos`
# and `total_words` values computed earlier in the same summarise() call.
summarise(
  v,
  total_words = sum(v),
  neg = sum(v[term %in% senterms$negative]),
  pos = sum(v[term %in% senterms$positive]),
  negratio = round(neg / total_words, 4),
  posratio = round(pos / total_words, 4)
)

##   total_words neg pos negratio posratio
## 1         281  19   7   0.0676   0.0249

Look at a specific word correlation; in this example we look at the terms that “cocoa” is associated with.

# Term of interest, and the lower bound on correlation passed to findAssocs().
toi <- "cocoa"
corlimit <- 0.2

# Collect the associations for `toi` into a data frame: the first column is
# renamed to `corr`, and the row names are copied into a `terms` column.
toicor <- data.frame(findAssocs(tdm, toi, corlimit))
colnames(toicor)[1] <- "corr"
toicor[["terms"]] <- rownames(toicor)
toicor

##              corr        terms
## african      0.89      african
## amid         0.89         amid
## far          0.89          far
## lows         0.89         lows
## new          0.89          new
## plumb        0.89        plumb
## throw        0.89        throw
## towel        0.89        towel
## uncertainty  0.89  uncertainty
## week         0.89         week
## west         0.89         west
## extend       0.61       extend
## losses       0.61       losses
## futures      0.48      futures
## funds        0.47        funds
## production   0.46   production
## concerns     0.42     concerns
## handsomely   0.42   handsomely
## headway      0.42      headway
## helped       0.42       helped
## outperformed 0.42 outperformed
## post         0.42         post
## hedge        0.39        hedge
## weather      0.27      weather

Summarise the positive and negative correlated terms, as we did with the overall document.

# One-row summary of the terms correlated with `toi`: how many there are, how
# many distinct ones match a negative / positive sentiment term, and each of
# those counts as a share of the total (rounded to 4 decimals).
# Matching is by regular expression: the sentiment terms are joined with "|",
# so a sentiment term also matches when it occurs inside a longer term.
# na.omit() removes NA entries (blank CSV cells are read as NA) before the
# pattern is built; otherwise paste() would turn each NA into the literal
# alternative "NA" in the regex.
# The ratio columns reuse `neg`, `pos` and `total` from earlier in the call.
summarise(
  toicor,
  total = length(terms),
  neg = length(unique(grep(paste(na.omit(senterms$negative), collapse = "|"),
                           terms, value = TRUE))),
  pos = length(unique(grep(paste(na.omit(senterms$positive), collapse = "|"),
                           terms, value = TRUE))),
  negratio = round(neg / total, 4),
  posratio = round(pos / total, 4)
)

##   total neg pos negratio posratio
## 1    24   2   2   0.0833   0.0833

View Correlation Graph

# Point plot with one row per associated term (y axis) positioned at its
# correlation value (x axis); the x-axis label quotes the term of interest.
ggplot(toicor, aes(x = corr, y = terms)) +
  geom_point() +
  xlab(paste0("Correlation with the term ", "\"", toi, "\""))