library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidytext)
library(ndjson)
## 
## Attaching package: 'ndjson'
## The following object is masked from 'package:purrr':
## 
##     flatten
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(syuzhet)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

setwd("~/projects/sentiment_analysis")

theme_set(theme_light())
selected <- read_csv("selected.csv")
## Parsed with column specification:
## cols(
##   created_at = col_character(),
##   timestamp_ms = col_double(),
##   text = col_character(),
##   user.location = col_character()
## )

Clean data

# Convert the millisecond epoch timestamp to a datetime, truncated to the minute
selected$timestamp_ms <- format(as.POSIXct(selected$timestamp_ms / 1000, origin = "1970-01-01", tz = "Europe/Dublin"), "%Y-%m-%d %H:%M")

selected$timestamp_ms <- ymd_hm(selected$timestamp_ms)
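
The same conversion can be done in one step with lubridate, avoiding the round trip through a character string; a minimal sketch of an equivalent, assuming timestamp_ms still holds the raw millisecond value:

# One-step alternative: milliseconds -> datetime, floored to the minute
selected <- selected %>%
  mutate(timestamp_ms = floor_date(as_datetime(timestamp_ms / 1000, tz = "Europe/Dublin"), unit = "minute"))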

# Remove links ("http.*" also matches https and drops everything from the link onwards)
selected$clean_text <- gsub("http.*", "", selected$text)

# To lowercase
selected$clean_text <- tolower(selected$clean_text)

# Remove the retweet marker "rt" (word boundaries so words like "shirt" are untouched)
selected$clean_text <- gsub("\\brt\\b", "", selected$clean_text)

# Remove @mentions
selected$clean_text <- gsub("@\\w+", "", selected$clean_text)

# Remove punctuation
selected$clean_text <- gsub("[[:punct:]]", "", selected$clean_text)

# Collapse runs of spaces and tabs into a single space
selected$clean_text <- gsub("[ \t]{2,}", " ", selected$clean_text)

# Trim leading and trailing whitespace
selected$clean_text <- trimws(selected$clean_text)

token_selected <- selected %>% 
  unnest_tokens(word, clean_text, drop = FALSE)
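
A quick look at the most frequent tokens is a useful sanity check before splitting by player (stop words are still included at this point):

# Most common tokens across all tweets
token_selected %>%
  count(word, sort = TRUE) %>%
  head(10)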

Separate by player and then stack

sterling <- selected %>% filter(str_detect(clean_text, "sterling")) %>% mutate(source = "sterling")
rashford <- selected %>% filter(str_detect(clean_text, "rashford")) %>% mutate(source = "rashford")
fred <- selected %>% filter(str_detect(clean_text, "fred")) %>% mutate(source = "fred")

stacked <- rbind(sterling, rashford, fred)
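
Note that a tweet mentioning more than one player appears in more than one group; a quick count shows how the stacked data splits:

# Tweets per player group (a single tweet can appear in several)
stacked %>% count(source)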

Visualise some data

Number of tweets per minute that mention Sterling/Rashford/Fred

Half-time and full-time are clearly visible

events <- tibble(
  time = as.POSIXct(c("2019-12-07 17:30:00", "2019-12-07 18:19:00",
                      "2019-12-07 18:34:00", "2019-12-07 19:23:00")),
  text = c("Kick Off", "Half Time", "Second Half", "Full Time")
)

ggplot(selected) +
  aes(x = timestamp_ms) +
  geom_bar(fill = "#0c4c8a") +
  labs(x = "Time", y = "Number of tweets per minute",
       title = "Twitter mentions of Sterling, Rashford, and Fred",
       subtitle = "During the Manchester derby 07 Dec 19",
       caption = "Data obtained via rtweet and Twitter API") +
  geom_vline(data = events, aes(xintercept = time), col = "red", linetype = "dashed") +
  geom_text(data = events, aes(x = time, y = 0, label = text),
            size = 3, vjust = -47, hjust = -0.1, colour = "red")
## Warning: Removed 240 rows containing non-finite values (stat_count).

General sentiment

Get the NRC sentiment scores for the data

selected_sentiment <- get_nrc_sentiment(selected$text)
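
get_nrc_sentiment() returns one row per tweet with a count column for each of the eight NRC emotions plus negative and positive; the first few rows give a feel for the structure:

# Inspect the per-tweet emotion counts
head(selected_sentiment)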

# Calculate the total score for each sentiment
selected_sentiment_score <- data.frame(colSums(selected_sentiment[, ]))

names(selected_sentiment_score) <- "Score"
selected_sentiment_score <- cbind(sentiment = rownames(selected_sentiment_score), selected_sentiment_score)
rownames(selected_sentiment_score) <- NULL
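
The same totals can be produced with a small tidyverse pipeline that avoids the rowname juggling; an equivalent sketch:

# Alternative: sum each emotion column and pivot to long format
selected_sentiment_score <- selected_sentiment %>%
  summarise_all(sum) %>%
  pivot_longer(everything(), names_to = "sentiment", values_to = "Score")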

Graph overall sentiment

selected_sentiment_score_emotions <- selected_sentiment_score %>%
  filter(sentiment != "negative") %>% 
  filter(sentiment != "positive")

ggplot(selected_sentiment_score_emotions, aes(sentiment, Score)) +
  geom_col(aes(fill = sentiment)) +
  labs(x = "Emotion", y = "NRC Sentiment Score",
       title = "Twitter mentions of Sterling, Rashford, and Fred",
       subtitle = "During the Manchester derby 07 Dec 19",
       caption = "Data obtained via rtweet and Twitter API") +
  theme(legend.position = "none")

Look at player sentiment over time

stacked <- stacked %>% 
  mutate(sent_score = get_sentiment(clean_text, method = "afinn"))
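
Before plotting, a quick per-player summary of the AFINN scores gives a feel for the overall differences:

# Mean AFINN score and tweet count per player
stacked %>%
  group_by(source) %>%
  summarise(mean_sent = mean(sent_score), n_tweets = n())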

stacked %>% 
  group_by(source, timestamp_ms) %>% 
  summarise(mean_sent = mean(sent_score)) %>% 
  ggplot(aes(timestamp_ms, mean_sent)) +
  geom_point(aes(colour = source), alpha = 0.1) +
  geom_smooth(aes(colour = source)) +
  labs(x = "Time", y = "AFINN Sentiment Score",
       title = "Twitter mentions of Sterling, Rashford, and Fred",
       subtitle = "During the Manchester derby 07 Dec 19",
       caption = "Data obtained via rtweet and Twitter API") +
  geom_vline(data = events, aes(xintercept = time), col = "red", linetype = "dashed") +
  geom_text(data = events, aes(x = time, y = 0, label = text), size = 3, vjust = 28, hjust = -0.1)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Word clouds

stacked_clean <- stacked %>% 
  unnest_tokens(word, clean_text, drop = FALSE)

rashford_cloud <- stacked_clean %>% 
  anti_join(stop_words) %>%
  filter(source == "rashford") %>% 
  count(word) %>% 
  top_n(50, n)
## Joining, by = "word"
fred_cloud <- stacked_clean %>% 
  anti_join(stop_words) %>%
  filter(source == "fred") %>% 
  count(word) %>% 
  top_n(50, n)
## Joining, by = "word"
sterling_cloud <- stacked_clean %>% 
  anti_join(stop_words) %>%
  filter(source == "sterling") %>% 
  count(word) %>% 
  top_n(50, n)
## Joining, by = "word"
set.seed(99)

wordcloud(rashford_cloud$word, rashford_cloud$n, colors = "red")

wordcloud(fred_cloud$word, fred_cloud$n, colors = "red")

wordcloud(sterling_cloud$word, sterling_cloud$n, colors = "lightblue")
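
As an alternative to three separate clouds, wordcloud's comparison.cloud() highlights which words are distinctive to each player in a single plot; a minimal sketch, assuming the stacked_clean tokens built above:

# Build a word-by-player count matrix and draw a comparison cloud
term_matrix <- stacked_clean %>%
  anti_join(stop_words, by = "word") %>%
  count(source, word) %>%
  pivot_wider(names_from = source, values_from = n, values_fill = list(n = 0)) %>%
  column_to_rownames("word") %>%
  as.matrix()

comparison.cloud(term_matrix, max.words = 100, colors = c("red", "darkgreen", "steelblue"))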