In this notebook we are going to analyze tweets from March Madness 2018
Use regular expressions to clean the tweet text
Become familiar with some natural language processing tools
# Install (if missing) and attach every package this notebook relies on.
# requireNamespace() only checks availability; library() then attaches the
# package and fails loudly if the install did not succeed, whereas require()
# would merely return FALSE.
# "tidytext" is included because unnest_tokens() and stop_words, used for the
# word cloud further down, are provided by that package.
required_packages <- c(
  "tidyverse", "syuzhet", "cleanNLP", "magrittr", "wordcloud", "tidytext"
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  library(pkg, character.only = TRUE)
}
# Load the raw tweets. tweet_id is read directly as text: the IDs are integers
# of up to 18 digits, which cannot be represented exactly as doubles, so
# parsing them as numbers and converting afterwards would corrupt them.
tweets <- read_csv(
  "data/march_madness.csv",
  col_types = cols(tweet_id = col_character())
)
# Set the links column aside so it can be re-attached as the last column
# after the sentiment columns have been added
links <- tweets$links
tweets$links <- NULL
# Inspect the first rows (head() shows six by default)
head(tweets)
# Pattern of everything that gets blanked out of the tweet text: t.co and
# http links, pic.twitter.com links, the characters &, < and >, the letters
# "RT", ".com" fragments, any non-alphanumeric character, remaining URLs,
# hashtags / mentions, newlines and double quotes.
# NOTE(review): alternatives are tried left to right, so `[^[:alnum:]]` will
# usually consume "#" / "@" before the hashtag/mention alternative is
# reached — confirm this is the intended cleaning.
replace_reg <- 'https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|(pic.twitter.com/[A-Za-z\\d]+)|&|<|>|RT|(.*.)\\.com(.*.)\\S+\\s|[^[:alnum:]]|(http|https)\\S+\\s*|(#|@)\\S+\\s*|\\n|\\"'

# Step 1: replace every pattern match with a single space
tweets$text <- str_replace_all(tweets$text, replace_reg, " ")
# Step 2: substitute a space for anything iconv() cannot convert from ASCII
tweets$text <- iconv(tweets$text, from = "ASCII", to = "UTF-8", sub = " ")

# Preview the cleaned text column
head(tweets["text"])
# Score every tweet with the NRC lexicon; the result is appended column-wise
nrc_data <- get_nrc_sentiment(tweets$text)

# Add the NRC columns plus a single Bing sentiment score per tweet
tweets <- tweets %>%
  bind_cols(nrc_data) %>%
  mutate(
    sentiment_bing = get_sentiment(
      char_v = text,
      method = "bing",
      language = "english"
    )
  )

# Re-attach the links column that was set aside earlier, so it ends up last
tweets[["links"]] <- links

# Persist the scored tweets and preview columns 3 to 10
write_csv(tweets, "data/march_madness_sent.csv")
head(tweets[, 3:10])
# Reload the tweets from the sentiment-scored file. tweet_id is read as text
# so the long numeric IDs keep every digit; they serve as document ids in the
# annotation step, where rounded (and therefore duplicated) ids would be
# harmful.
tweets <- read_csv(
  "data/sentiment_march_madness.csv",
  col_types = cols(tweet_id = col_character())
)
# Preview columns 12 to 21
head(tweets[12:21])
# Initialise the udpipe backend of cleanNLP
cnlp_init_udpipe()

# Annotate the tweet text, using the tweet ids as document ids and passing
# every column except the first two along as metadata
doc <- cnlp_annotate(
  input = tweets$text,
  as_strings = TRUE,
  doc_ids = tweets$tweet_id,
  meta = tweets[-(1:2)]
)

# Count the tokens in each (id, sid) group — sid is presumably the sentence
# id within a document; verify against the cleanNLP token table
tokens <- doc %>%
  cnlp_get_token() %>%
  group_by(id, sid) %>%
  summarize(sent_len = n())

# Deciles of the per-group token counts
quantile(tokens$sent_len, probs = seq(0, 1, by = 0.1))
# Plot the Bing sentiment score of every tweet, in row order, as a line chart.
# seq_along() supplies the x positions so that an empty score vector gives an
# empty x vector instead of c(1, 0), which would not match the length of y.
qplot(
  x = seq_along(tweets$sentiment_bing),
  y = tweets$sentiment_bing,
  geom = "line",
  xlab = "Narrative Time",
  ylab = "Emotional Valence",
  main = "Tweets Sentiment Trajectory"
)
# Most frequent noun lemmas across the annotated tweets: keep tokens tagged
# NOUN, count occurrences per lemma, keep the 80 highest counts (top_n() keeps
# ties, so more than 80 rows are possible) and return the lemmas sorted by
# descending count.
tweets_entities <- cnlp_get_token(doc) %>%
  filter(upos == "NOUN") %>%
  group_by(lemma) %>%
  summarize(count = n()) %>%
  top_n(n = 80, count) %>%
  arrange(desc(count)) %>%
  use_series(lemma)
# Display as a one-column table; tibble() replaces the deprecated
# data_frame() alias and matches the tibble() call used for the word cloud.
tibble(tweets_entities)
# Build "start => word" strings from the dependency relations of the annotated tweets.
# NOTE(review): neither left_join() passes `by`, so both join on whatever columns the
# two tables share — confirm these are the intended keys.
tweets_summary <- cnlp_get_dependency(doc, get_token = TRUE) %>%
left_join(cnlp_get_document(doc)) %>%
# Keep id, rename `word` to `start` and `lemma_target` to `word`
select(id = id, start = word, word = lemma_target) %>%
# word_frequency is not defined in this file; presumably a lookup table shipped with
# one of the attached packages (cleanNLP?) — verify
left_join(word_frequency) %>%
# Keep only rows whose `frequency` is below 0.0001
filter(frequency < 0.0001) %>%
# %$% exposes the selected columns to sprintf() by name
select(id, start, word) %$%
sprintf("%s => %s", start, word)
# NOTE(review): data_frame() is a deprecated alias of tibble()
data_frame(tweets_summary)
# Row positions of tweets with at least one anger hit; show the first two.
# tibble() replaces the deprecated data_frame() alias.
angry_tweets <- which(tweets$anger > 0)
tibble(tweet = tweets$text[angry_tweets][1:2])
# Row positions of tweets with at least one joy hit; show the 5th to 7th
joy_tweets <- which(tweets$joy > 0)
tibble(tweet = tweets$text[joy_tweets][5:7])
# Share of each emotion among all emotion hits in the data set.
# The emotion columns are selected by name rather than by position (11:18),
# so the computation no longer breaks silently if columns are added, dropped
# or reordered upstream.
emotion_cols <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)
# prop.table() divides every cell by the grand total; the column sums are
# therefore each emotion's share of all hits
value <- as.double(colSums(prop.table(tweets[, emotion_cols])))
# Order the factor levels by share so the bar chart is drawn sorted
emotion <- factor(
  emotion_cols,
  levels = emotion_cols[order(value, decreasing = FALSE)]
)
# tibble() replaces the deprecated data_frame() alias
emotions <- tibble(emotion, percent = value * 100)
head(emotions)
# Horizontal bar chart of the percentage attributed to each emotion,
# one colour per emotion
ggplot(emotions, aes(x = emotion, y = percent, fill = emotion)) +
  geom_col() +
  scale_fill_brewer(palette = "RdYlGn") +
  coord_flip() +
  labs(x = "Emotion", y = "Percentage")
# Words specific to this data set that should be left out of the word cloud,
# on top of the standard stop-word list
remove_words <- c(
  "twitter", "chicago", "loyola", "ramblers", "loyolaramblers", "school",
  "gonna", "university", "luc", "loyolachicago", "ramblersmbb", "ncaa", "ve",
  "basketball", "umichbball", "marchmadness2018", "marchmadness", "final",
  "marchmaddness", "goblue", "finalfour", "sisterjean", "ncaatournament",
  "ncaatournament2018", "didn", "city", "hey", "day", "college", "games",
  "tourney", "march", "game"
)
# Prepend the custom words (labelled with the "SMART" lexicon) to stop_words.
# tibble() replaces the deprecated data_frame() alias.
my_stop_words <- bind_rows(
  tibble(word = remove_words, lexicon = "SMART"),
  stop_words
)
Error in bind_rows(data_frame(word = remove_words, lexicon = c("SMART")), :
could not find function "bind_rows"
# Split the tweet text into one word per row, then drop stop words and any
# token that contains no lowercase letter
twt_text <- tibble(text = tweets$text) %>%
  unnest_tokens(word, text) %>%
  filter(!(word %in% my_stop_words$word)) %>%
  filter(str_detect(word, "[a-z]"))
# Word cloud settings: minimum word frequency, maximum number of words drawn,
# and the range of font sizes
min_freq <- 80
max_words <- 100
fig_scale <- c(3, 0.5)

# Count the remaining words and draw the cloud; %$% exposes the `word` and
# `n` columns of the count table to wordcloud()
twt_text %>%
  anti_join(my_stop_words) %>%
  count(word) %$%
  wordcloud(
    word, n,
    scale = fig_scale,
    min.freq = min_freq,
    max.words = max_words
  )
Error in twt_text %>% anti_join(my_stop_words) %>% count(word) %>% with(wordcloud(word, :
could not find function "%>%"
# Read the sentiment-scored tweets through a project-relative path instead of
# a machine-specific absolute Dropbox path, so the notebook runs on any
# machine; this is the same data/sentiment_march_madness.csv file that is
# loaded with read_csv() earlier in the notebook.
mydata <- read.csv(file.path("data", "sentiment_march_madness.csv"))
# Per-column summary statistics
summary(mydata)
tweet_id text username
Min. :3.542e+16 : 1273 @LALATE : 81
1st Qu.:9.774e+17 : 1245 @RamblersMBB : 30
Median :9.777e+17 : 197 @SkywayChicago : 27
Mean :9.753e+17 : 51 @chicagomargaret: 21
3rd Qu.:9.777e+17 : 35 @sschrimp : 18
Max. :9.824e+17 SisterJean: 15 @loyolaforus : 16
(Other) :17371 (Other) :19994
fullname date datetime
LALATE : 81 2018-03-25:10708 2018-03-25T00:21:10Z: 16
Loyola Basketball: 31 2018-03-23: 2976 2018-03-25T00:21:31Z: 16
Steve Timble : 27 2018-03-24: 2274 2018-03-25T00:21:09Z: 15
Margaret Holt : 21 2018-03-26: 1504 2018-03-25T00:21:35Z: 15
Mark : 21 2018-03-18: 1099 2018-03-25T00:21:08Z: 14
Steve : 19 2018-03-27: 241 2018-03-25T00:21:11Z: 14
(Other) :19987 (Other) : 1385 (Other) :20097
verified reply retweets favorite
Min. :0.00000 Min. : 0.0000 Min. : 0.000 Min. : 0.0
1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0
Median :0.00000 Median : 0.0000 Median : 0.000 Median : 1.0
Mean :0.06192 Mean : 0.3467 Mean : 3.146 Mean : 15.8
3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 3.0
Max. :1.00000 Max. :591.0000 Max. :5143.000 Max. :32180.0
anger anticipation disgust fear joy
Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000
Median :0.0000 Median :0.0000 Median :0.00000 Median :0.0000 Median :0.000
Mean :0.1342 Mean :0.4359 Mean :0.07143 Mean :0.1612 Mean :0.421
3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.000
Max. :4.0000 Max. :7.0000 Max. :3.00000 Max. :6.0000 Max. :8.000
sadness surprise trust negative positive
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.1122 Mean :0.1798 Mean :0.4806 Mean :0.2395 Mean :0.6676
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :5.0000 Max. :4.0000 Max. :7.0000 Max. :6.0000 Max. :9.0000
sentiment_bing links
Min. :-5.0000 @RamblersMBB : 1139
1st Qu.: 0.0000 #LoyolaChicago : 1027
Median : 0.0000 #SisterJean : 778
Mean : 0.5141 https://twitter.com#SisterJean: 231
3rd Qu.: 1.0000 #LoyolaChicago; #MarchMadness : 208
Max. :11.0000 (Other) :16117
NA's : 687
This dataset is a compilation of tweets that were collected during March Madness. The dataset looks at the number of tweets as well as the number of favorites, replies, and retweets. Additionally, this dataset looks at the different sentiments included in the tweets, which resemble some of the feelings people were having while tweeting about Loyola and the games.
# Embed screenshots 1 to 5 from the imgs/ folder in the rendered notebook
knitr::include_graphics("imgs/screenshot1.png")
knitr::include_graphics("imgs/screenshot2.png")
knitr::include_graphics("imgs/screenshot3.png")
knitr::include_graphics("imgs/screenshot4.png")
knitr::include_graphics("imgs/screenshot5.png")
The first plot shows the accounts with the greatest number of retweets, while the second plot shows the top five accounts with the most tweets overall. The third plot shows the number of tweets per day. The line on this plot peaks on the days that the March Madness games were played. The fourth plot shows the number of tweets by datetime, and again there is a peak during the time in which the game was played. Finally, the fifth plot shows the number of tweets by sentiment. Overall, most of the tweets had a good sentiment, whether it was positive, anticipation, or joy. There were far fewer tweets with a bad sentiment, such as anger or disgust. There is also a significant number of tweets that contained a fearful or sad sentiment, which makes sense for those fans who were afraid the team would not win a game, and for those who were sad when the team did lose in the Final Four.
Overall, there was a very positive image of Loyola on Twitter during March Madness. Many of the tweets had an overall positive sentiment, which makes it reasonable to assume that the majority of people tweeting about Loyola were supporting the team. Additionally, due to the sheer number of tweets, it is safe to say that Loyola had a significant presence on Twitter during March Madness. This makes sense because Loyola did make it into the Final Four, and they were one of the favorite teams considering nobody really expected them to even be in the tournament, so it seems logical that there are so many tweets about Loyola.
# Print the per-column summary statistics of mydata
summary(mydata)
tweet_id text username
Min. :3.542e+16 : 1273 @LALATE : 81
1st Qu.:9.774e+17 : 1245 @RamblersMBB : 30
Median :9.777e+17 : 197 @SkywayChicago : 27
Mean :9.753e+17 : 51 @chicagomargaret: 21
3rd Qu.:9.777e+17 : 35 @sschrimp : 18
Max. :9.824e+17 SisterJean: 15 @loyolaforus : 16
(Other) :17371 (Other) :19994
fullname date datetime
LALATE : 81 2018-03-25:10708 2018-03-25T00:21:10Z: 16
Loyola Basketball: 31 2018-03-23: 2976 2018-03-25T00:21:31Z: 16
Steve Timble : 27 2018-03-24: 2274 2018-03-25T00:21:09Z: 15
Margaret Holt : 21 2018-03-26: 1504 2018-03-25T00:21:35Z: 15
Mark : 21 2018-03-18: 1099 2018-03-25T00:21:08Z: 14
Steve : 19 2018-03-27: 241 2018-03-25T00:21:11Z: 14
(Other) :19987 (Other) : 1385 (Other) :20097
verified reply retweets favorite
Min. :0.00000 Min. : 0.0000 Min. : 0.000 Min. : 0.0
1st Qu.:0.00000 1st Qu.: 0.0000 1st Qu.: 0.000 1st Qu.: 0.0
Median :0.00000 Median : 0.0000 Median : 0.000 Median : 1.0
Mean :0.06192 Mean : 0.3467 Mean : 3.146 Mean : 15.8
3rd Qu.:0.00000 3rd Qu.: 0.0000 3rd Qu.: 0.000 3rd Qu.: 3.0
Max. :1.00000 Max. :591.0000 Max. :5143.000 Max. :32180.0
anger anticipation disgust fear joy
Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.000
Median :0.0000 Median :0.0000 Median :0.00000 Median :0.0000 Median :0.000
Mean :0.1342 Mean :0.4359 Mean :0.07143 Mean :0.1612 Mean :0.421
3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:1.000
Max. :4.0000 Max. :7.0000 Max. :3.00000 Max. :6.0000 Max. :8.000
sadness surprise trust negative positive
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.1122 Mean :0.1798 Mean :0.4806 Mean :0.2395 Mean :0.6676
3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :5.0000 Max. :4.0000 Max. :7.0000 Max. :6.0000 Max. :9.0000
sentiment_bing links
Min. :-5.0000 @RamblersMBB : 1139
1st Qu.: 0.0000 #LoyolaChicago : 1027
Median : 0.0000 #SisterJean : 778
Mean : 0.5141 https://twitter.com#SisterJean: 231
3rd Qu.: 1.0000 #LoyolaChicago; #MarchMadness : 208
Max. :11.0000 (Other) :16117
NA's : 687
This shows that the maximum value for positive is 9, which is the highest among all of the sentiments. Joy and trust are also quite high, at 8 and 7 respectively. Since these are all good sentiments, it is safe to say that they helped to create a positive image of Loyola on Twitter.
I would say that Loyola’s marketing team should continue to use some of the hashtags that were created in relation to Loyola’s March Madness run. Furthermore, they should pick two or three specific hashtags and encourage students to use those when tweeting about Loyola. This could help consolidate many of their marketing efforts into one. Also, I would recommend that the marketing team turn some of the most popular tweets or hashtags into a new marketing campaign and make promotional materials including them. Since many people now know Loyola through the basketball program, it would be wise to use these tweets to catch the attention of non-Loyola people; once the team has their attention, it can share more information about things other than basketball with them.
# Embed screenshot 6 from the imgs/ folder in the rendered notebook
knitr::include_graphics("imgs/screenshot6.png")
This graph depicts the top drivers of joy. According to this graph, the top drivers are positive and anticipation. This makes sense because those who were tweeting with a positive sentiment, an anticipatory sentiment, or both would also have had a joyful sentiment. All of these emotions seem to go together, so it seems accurate that these are the top predictors of joy.
# Embed screenshot 7 from the imgs/ folder in the rendered notebook
knitr::include_graphics("imgs/screenshot7.png")
This graph shows how the number of tweets that were favorited and the number of replies compare in relation to the date. The two highest peaks were on March 28 and April 1, which were both game days for Loyola. This makes complete sense, because people would be more likely to tweet either during the games or immediately after the team won.
# Embed screenshot 8 from the imgs/ folder in the rendered notebook
knitr::include_graphics("imgs/screenshot8.png")
This word cloud displays the most common hashtags used in the tweets from the dataset. The most popular hashtags included #FinalFour and #LoyolaChicago. The greatest number of tweets came during the Final Four game that Loyola played in, so it makes sense that #FinalFour is one of the most common hashtags in addition to #LoyolaChicago.