First load some libraries.
library(syuzhet)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyr)
Then define some functions.
# I redefine Syuzhet's get_percentage_values function to match a bug I submitted, and to allow it to return percents on documents less than 200 sentences long; that cutoff was arbitrary
get_percentage_values <- function(raw_values, bins = 100){
if(!is.numeric(raw_values)) stop("Input must be a numeric vector")
#if(length(raw_values)/bins < 2){
# stop("Input vector needs to be twice as long as value number to make percentage based segmentation viable")
#}
chunks <- split(raw_values, cut(1:length(raw_values),100))
means = sapply(chunks, mean)
names(means) = 1:100
return(means)
}
get_sentiment_vector = function(location) {
document = get_text_as_string(location %>% as.character)
poa_v = get_sentences(document)
if (length(poa_v) < 100) {
#just punt on the short ones for simplicity
return(c())
}
sentiment_vector <- get_sentiment(poa_v, method="bing")
}
arcsFromFile = function(destination) {
destination %>% get_sentiment_vector %>% arcs_from_vector()
}
arcs_from_vector = function(sentiment_vector) {
# Fail on docs shorter than 100 sentences for simplicity
if (length(sentiment_vector)<100 ) {return(data.frame())}
percent_vals <- get_percentage_values(c(sentiment_vector))
mymean = function(x) {mean(x,na.rm=T)}
# A rolling average
rollingAverage = rollapply(percent_vals,width=30,FUN=mymean,fill=NA)
# A loess average
loessAverage = predict(loess(percent_vals ~ c(1:100)),span=0.3)
# The fourier transformed values
ft_values <- get_transformed_values(sentiment_vector, low_pass_size = 3,x_reverse_len = 100, scale_vals = TRUE,scale_range = FALSE)
data.frame(Fourier = ft_values,
loessAverage,rollingAverage,
percentage=1:length(ft_values)) %>%
gather(method,normalizedScore,Fourier,rollingAverage,loessAverage)
}
Create an artificial plot that gets happier.
# Create an artificial plot that gets happier as time goes on.
a = arcs_from_vector(seq(-5,5,length.out=400))
ggplot(a %>% filter(method!="Fourier")) + aes(x=percentage,y=normalizedScore,color=method) +
geom_line() + labs(title="With a plot that ascends straightforwardly,\nloess or average smoothing works fine.")
## Warning: Removed 29 rows containing missing values (geom_path).
Things get weird in the fourier smoothing.
ggplot(a) + aes(x=percentage,y=normalizedScore,color=method) +
geom_line() + labs(title="on a straightforwardly upward plot,\nsyuzhet's fourier filter imposes an assumption of cyclicality")
## Warning: Removed 29 rows containing missing values (geom_path).
This is true on real data as well: look at how it treats state of the unions since 1960.
The function is defined so that you can swap this out with any other folder filled with .txt
files: see what this looks like on your sources.
SOTUS = data.frame(filename=list.files("~/HDA15/data/SOTUS",full.names = T)) %>% mutate(short_name = gsub(".*/([^/]+).txt","\\1",filename))
# Remove this line if your filenames aren't years.
SOTUS = SOTUS %>% filter(as.numeric(short_name) > 1960)
h = SOTUS %>% group_by(filename,short_name) %>% do(arcsFromFile(.$filename)) %>%ungroup
ggplot(h) + aes(x=percentage,y=normalizedScore,color=method) +
geom_line() + facet_wrap(~short_name) +
geom_line(data = h %>% filter(percentage %in% c(1,100)),lty=2) + labs(title="Syuzhet tries to make State of the Unions\nlook cyclical:\ndashed lines show overall trend in Fourier vs. loess smoothing")
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).
## Warning: Removed 29 rows containing missing values (geom_path).