library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidytext)
library(ndjson)
##
## Attaching package: 'ndjson'
## The following object is masked from 'package:purrr':
##
## flatten
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(syuzhet)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Work from the project directory so the relative data path below resolves.
# NOTE(review): setwd() ties this script to one machine's directory layout.
setwd("~/projects/sentiment_analysis")
# Use the light ggplot2 theme for every plot drawn afterwards.
theme_set(theme_light())
# Load the collected tweets. The column types are declared explicitly (they
# are the types readr previously guessed) so that a malformed file cannot
# silently change the schema and no guessed specification is printed.
selected <- read_csv(
  "selected.csv",
  col_types = cols(
    created_at = col_character(),
    timestamp_ms = col_double(),
    text = col_character(),
    user.location = col_character()
  )
)
# Convert the epoch-millisecond stamps to date-times truncated to the minute,
# so that tweets sharing a minute share a timestamp.
# The value stays a POSIXct in the Europe/Dublin zone throughout: formatting
# to a two-digit-year string and re-parsing it would relabel Dublin wall-clock
# time as UTC, which shifts every instant by an hour during summer time.
selected$timestamp_ms <- floor_date(
  as.POSIXct(
    selected$timestamp_ms / 1000,
    origin = "1970-01-01",
    tz = "Europe/Dublin"
  ),
  unit = "minute"
)
# Normalise raw tweet text for matching and tokenising.
#
# text: character vector of raw tweet texts.
# Returns a character vector of the same length: lower-cased, with links,
# the retweet marker, @mentions and punctuation removed, runs of spaces/tabs
# collapsed to a single space, and surrounding whitespace trimmed.
clean_tweet_text <- function(text) {
  # Drop each link only; "http.*" would also delete the rest of the tweet.
  text <- gsub("http\\S*", "", text)
  text <- tolower(text)
  # Drop the stand-alone retweet marker; a bare "rt" pattern would also eat
  # the letters inside words such as "start" or "part".
  text <- gsub("\\brt\\b", "", text)
  # Drop @UserName mentions (before punctuation removal strips the "@").
  text <- gsub("@\\w+", "", text)
  text <- gsub("[[:punct:]]", "", text)
  # Collapse whitespace runs to ONE space; replacing them with "" would glue
  # neighbouring words together.
  text <- gsub("[ \t]+", " ", text)
  # Remove leading and trailing whitespace.
  trimws(text)
}

selected$clean_text <- clean_tweet_text(selected$text)
# One row per word of each cleaned tweet; the `clean_text` column is kept
# alongside the new `word` column.
token_selected <- unnest_tokens(
  selected,
  output = word,
  input = clean_text,
  drop = FALSE
)
Separate by player and then stack
# Tweets whose cleaned text mentions `player`, tagged with that name in a
# new `source` column.
mentions_of <- function(player) {
  hits <- filter(selected, str_detect(clean_text, player))
  mutate(hits, source = player)
}

sterling <- mentions_of("sterling")
rashford <- mentions_of("rashford")
fred <- mentions_of("fred")

# Stack the per-player subsets; a tweet naming several players appears once
# per player.
stacked <- rbind(sterling, rashford, fred)
Half-time and full-time are clearly visible
# Key match moments used to annotate the time-series plots. The timezone is
# stated explicitly so the markers do not move with the machine's locale.
events <- tibble(
  time = as.POSIXct(
    c(
      "2019-12-07 17:30:00",
      "2019-12-07 18:19:00",
      "2019-12-07 18:34:00",
      "2019-12-07 19:23:00"
    ),
    tz = "Europe/Dublin"
  ),
  text = c("Kick Off", "Half Time", "Second Half", "Full Time")
)

# Tweets per minute, with a dashed marker and a label at each match event.
ggplot(selected, aes(x = timestamp_ms)) +
  geom_bar(fill = "#0c4c8a") +
  # Draw the markers from `events` (one line per event). Mapping a constant
  # inside aes() without `data` would inherit `selected` and redraw the same
  # line once for every tweet row.
  geom_vline(
    data = events,
    aes(xintercept = time),
    colour = "red",
    linetype = "dashed"
  ) +
  # NOTE(review): the large negative vjust pushes the labels up from y = 0;
  # it is tuned to the rendered figure size.
  geom_text(
    data = events,
    aes(x = time, y = 0, label = text),
    size = 3,
    vjust = -47,
    hjust = -.1,
    colour = "red"
  ) +
  labs(
    x = "Time",
    y = "Number of tweets per minute",
    title = "Twitter mentions of Sterling, Rashford, and Fred",
    subtitle = "During the Manchester derby 07 Dec 19",
    caption = "Data obtained via rtweet and Twitter API"
  )
## Warning: Removed 240 rows containing non-finite values (stat_count).
Get nrc sentiment of the data
# Score every raw tweet text with syuzhet's NRC scorer (one row per tweet,
# one column per sentiment category).
selected_sentiment <- get_nrc_sentiment(selected$text)

# Calculate the total score for each sentiment across all tweets and store it
# as a two-column table (`sentiment`, `Score`) with default row names.
sentiment_totals <- colSums(selected_sentiment)
selected_sentiment_score <- data.frame(
  sentiment = names(sentiment_totals),
  Score = unname(sentiment_totals)
)
Graph overall sentiment
# Keep only the emotion categories: drop the two overall polarity rows.
selected_sentiment_score_emotions <- selected_sentiment_score %>%
  filter(!sentiment %in% c("negative", "positive"))

# Bar chart of the total score per emotion.
ggplot(selected_sentiment_score_emotions, aes(sentiment, Score)) +
  # Refer to the column by name; `data$column` inside aes() bypasses the
  # layer's data and breaks if the data is later filtered or facetted.
  geom_col(aes(fill = sentiment)) +
  # Axis titles follow the mapping: emotions on x, their scores on y.
  labs(
    x = "Emotion",
    y = "NRC Sentiment Score",
    title = "Twitter mentions of Sterling, Rashford, and Fred",
    subtitle = "During the Manchester derby 07 Dec 19",
    caption = "Data obtained via rtweet and Twitter API"
  ) +
  # Bars are already labelled on the x axis, so the fill legend is redundant.
  theme(legend.position = "none")
Look at player sentiment over time
# Score each player-tagged tweet's cleaned text with the AFINN method.
stacked <- stacked %>%
  mutate(sent_score = get_sentiment(clean_text, method = "afinn"))

# Mean AFINN score per player per timestamp, with a smoothed trend per player
# and the match events marked.
stacked %>%
  group_by(source, timestamp_ms) %>%
  summarise(mean_sent = mean(sent_score)) %>%
  ungroup() %>%
  ggplot(aes(timestamp_ms, mean_sent)) +
  # alpha is a fixed setting, so it belongs outside aes(); inside aes() it is
  # treated as a data mapping and produces a spurious legend entry instead of
  # 10% opacity.
  geom_point(aes(colour = source), alpha = 0.1) +
  geom_smooth(aes(colour = source)) +
  # One dashed marker per row of `events`; a constant mapped inside aes()
  # without `data` would be redrawn once per summarised row.
  geom_vline(
    data = events,
    aes(xintercept = time),
    colour = "red",
    linetype = "dashed"
  ) +
  # NOTE(review): the large vjust pushes the labels down from y = 0; it is
  # tuned to the rendered figure size.
  geom_text(
    data = events,
    aes(x = time, y = 0, label = text),
    size = 3,
    vjust = 28,
    hjust = -0.1
  ) +
  labs(
    x = "Time",
    y = "AFINN Sentiment Score",
    title = "Twitter mentions of Sterling, Rashford, and Fred",
    subtitle = "During the Manchester derby 07 Dec 19",
    caption = "Data obtained via rtweet and Twitter API"
  )
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Word cloud
# One row per word of every player-tagged tweet.
stacked_clean <- stacked %>%
  unnest_tokens(word, clean_text, drop = FALSE)

# Most frequent non-stop-words in the tweets tagged with one player.
#
# tokens:  one-row-per-word table with `word` and `source` columns.
# player:  value of `source` to keep.
# n_words: how many top words to return (ties at the cut-off are all kept,
#          so the result can have more rows than `n_words`).
# Returns a table with columns `word` and `n` (the word's count).
top_player_words <- function(tokens, player, n_words = 50) {
  tokens %>%
    filter(source == player) %>%
    # Join key stated explicitly so the match cannot change if another
    # shared column name appears, and no "Joining, by" message is printed.
    anti_join(stop_words, by = "word") %>%
    count(word) %>%
    top_n(n_words, n)
}

rashford_cloud <- top_player_words(stacked_clean, "rashford")
fred_cloud <- top_player_words(stacked_clean, "fred")
sterling_cloud <- top_player_words(stacked_clean, "sterling")
# Fix the random seed once so the three word layouts are reproducible, then
# draw one cloud per player in a fixed order (rashford, fred, sterling).
set.seed(99)
cloud_specs <- list(
  list(freq = rashford_cloud, colour = "red"),
  list(freq = fred_cloud, colour = "red"),
  list(freq = sterling_cloud, colour = "light blue")
)
for (spec in cloud_specs) {
  wordcloud(spec$freq$word, spec$freq$n, colors = spec$colour)
}