The FIFA World Cup is an international sporting event followed by millions of soccer fans around the world. In this project, we build some visualizations and apply natural language processing to a dataset of tweets about the tournament.
# Point the session at the folder that holds the Kaggle FIFA files, then list
# its contents so the available inputs show up in the report.
data_dir <- "~/Documents/kaggle_data/fifa"
setwd(data_dir)
list.files()
## [1] "FIFA.csv" "fifa.RData" "fifa.Rmd" "fifa.nb.html"
## [5] "fifa_cache"
library(tidyverse)
library(tidytext)
library(visNetwork)
# Load the raw tweet export (path is relative to the working directory) and
# print a column-by-column overview of what was read.
fifa <- read_csv("FIFA.csv")
glimpse(fifa)
## Observations: 530,000
## Variables: 16
## $ ID <dbl> 1.013597e+18, 1.013597e+18, 1.013597e+18, 1.0...
## $ lang <chr> "en", "en", "en", "en", "en", "en", "en", "en...
## $ Date <dttm> 2018-07-02 01:35:45, 2018-07-02 01:35:44, 20...
## $ Source <chr> "Twitter for Android", "Twitter for Android",...
## $ len <int> 140, 139, 107, 142, 140, 140, 140, 138, 138, ...
## $ Orig_Tweet <chr> "RT @Squawka: Only two goalkeepers have saved...
## $ Tweet <chr> "Only two goalkeepers have saved three penalt...
## $ Likes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ RTs <int> 477, 1031, 488, 0, 477, 153, 4, 1, 2199, 5146...
## $ Hashtags <chr> "WorldCup,POR,ENG", "WorldCup", "worldcup", "...
## $ UserMentionNames <chr> "Squawka Football", "FC Barcelona,Ivan Rakiti...
## $ UserMentionID <chr> "Squawka", "FCBarcelona,ivanrakitic,HNS_CFF",...
## $ Name <chr> "Cayleb", "Febri Aditya", "??", "Frida Carril...
## $ Place <chr> "Accra", "Bogor", NA, "Zapopan, Jalisco", NA,...
## $ Followers <int> 861, 667, 65, 17, 137, 29, 208, 7, 1, 158, 34...
## $ Friends <int> 828, 686, 67, 89, 216, 283, 338, 9, 6, 245, 3...
The data contains 530,000 observations and 16 features. We only need a subset of them for this analysis, so the remaining features are dropped.
# Keep English-language tweets only and retain eight of the original columns,
# then inspect the reduced table.
fifa <- fifa %>%
  filter(lang == "en") %>%
  select(
    Source, len, Tweet, Hashtags,
    RTs, UserMentionNames, Name, Place
  )
glimpse(fifa)
## Observations: 530,000
## Variables: 8
## $ Source <chr> "Twitter for Android", "Twitter for Android",...
## $ len <int> 140, 139, 107, 142, 140, 140, 140, 138, 138, ...
## $ Tweet <chr> "Only two goalkeepers have saved three penalt...
## $ Hashtags <chr> "WorldCup,POR,ENG", "WorldCup", "worldcup", "...
## $ RTs <int> 477, 1031, 488, 0, 477, 153, 4, 1, 2199, 5146...
## $ UserMentionNames <chr> "Squawka Football", "FC Barcelona,Ivan Rakiti...
## $ Name <chr> "Cayleb", "Febri Aditya", "??", "Frida Carril...
## $ Place <chr> "Accra", "Bogor", NA, "Zapopan, Jalisco", NA,...
# One row per token: split each tweet into words, discard stop words, and keep
# only tokens that contain at least one lower-case letter a-z.
fifa_tidy <- fifa %>%
  unnest_tokens(words, Tweet) %>%
  filter(
    !(words %in% stop_words$word),
    str_detect(words, "[a-z]")
  )
# Horizontal bar chart of the 20 most frequent tokens across all tweets.
fifa_tidy %>%
  count(words, sort = TRUE) %>%
  top_n(20, wt = n) %>%
  ggplot(aes(x = reorder(words, n), y = n)) +
  geom_col(fill = "#AAB7B8") +
  coord_flip() +
  labs(x = "", y = "", title = "Top words in tweets") +
  theme_bw()
# Top 10 tokens per client, restricted to sources whose name starts with
# "Twitter for"; one facet per source, each with its own axes.
fifa_tidy %>%
  filter(str_detect(Source, "^Twitter for")) %>%
  count(Source, words, sort = TRUE) %>%
  group_by(Source) %>%
  top_n(10, wt = n) %>%
  ggplot(aes(x = reorder(words, n), y = n, fill = Source)) +
  geom_col() +
  facet_wrap(~Source, scales = "free", ncol = 2) +
  coord_flip() +
  labs(x = "", y = "", title = "Top words of tweets in Each source") +
  theme_bw() +
  theme(legend.position = "none")
### 2.2 Wordcloud of words
# Word cloud built from the 500 most frequent tokens.
fifa_tidy %>%
  count(words, sort = TRUE) %>%
  top_n(500, wt = n) %>%
  wordcloud2::wordcloud2()
# Attach the Bing sentiment label to every token that appears in that lexicon.
# The token column is renamed to `word` so it matches the lexicon's join key;
# tokens without a match are dropped by the inner join.
fifa_tidy_sentiment <- fifa_tidy %>%
  rename(word = words) %>%
  inner_join(get_sentiments("bing"), by = "word")
# Ten most frequent words for each Bing sentiment, one facet per sentiment.
fifa_tidy_sentiment %>%
  group_by(word, sentiment) %>%
  summarise(total = n()) %>%
  # group_by() replaces the grouping left over from summarise().
  group_by(sentiment) %>%
  arrange(desc(total)) %>%
  # No `wt` given: top_n() ranks by the last column, which is `total` here.
  top_n(10) %>%
  ggplot(aes(x = reorder(word, total), y = total, fill = sentiment)) +
  geom_col() +
  facet_wrap(~sentiment, scales = "free") +
  theme_bw() +
  coord_flip() +
  theme(legend.position = "none")
# Comparison cloud of the Bing sentiment words. The counts are reshaped into a
# word-by-sentiment matrix (0 where a word never carries that sentiment)
# because comparison.cloud() takes a matrix as input.
fifa_tidy_sentiment %>%
  group_by(word, sentiment) %>%
  summarise(total = n()) %>%
  arrange(desc(total)) %>%
  reshape2::acast(
    word ~ sentiment,
    value.var = "total",
    fill = 0
  ) %>%
  wordcloud::comparison.cloud(
    colors = c("#F8766D", "#00BFC4"),
    max.words = 350
  )
## Warning in wordcloud::comparison.cloud(., colors = c("#F8766D",
## "#00BFC4"), : perseverance could not be fit on page. It will not be
## plotted.
# Attach NRC sentiment categories to the tokens. The token column is renamed to
# `word` to match the lexicon's join key; unmatched tokens are dropped.
fifa_all_sens <- fifa_tidy %>%
  rename(word = words) %>%
  inner_join(get_sentiments("nrc"), by = "word")
# Ten most frequent words within each NRC sentiment category, one facet each.
fifa_all_sens %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  # No `wt` given: top_n() ranks by the last column, which is `n` here.
  top_n(10) %>%
  ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free", ncol = 3) +
  coord_flip() +
  labs(
    x = "",
    y = "",
    title = "The top 10 words under each sentiment category"
  ) +
  theme_bw() +
  theme(legend.position = "none")
# tf-idf of each word, with every NRC sentiment category acting as one
# "document". Shows the 15 highest tf-idf words per category.
#
# The bars are drawn from `tf_idf` itself. Previously the chart selected rows
# by tf-idf and labelled the axis "tf-idf", but plotted and ordered the bars
# by the raw count `n`, so the figure did not show what its axis claimed.
fifa_all_sens %>%
  count(word, sentiment) %>%
  bind_tf_idf(word, sentiment, n) %>%
  group_by(sentiment) %>%
  # Rank explicitly by tf_idf rather than relying on it being the last column.
  top_n(15, wt = tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = reorder(word, tf_idf), y = tf_idf, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~sentiment, ncol = 3, scales = "free") +
  coord_flip()
# Bigram frequency table: tokenise tweets into two-word sequences, split each
# into its two words (w1, w2), drop pairs where either word is a stop word,
# and count the remaining pairs, most frequent first.
fifa_ngram <- fifa %>%
  unnest_tokens(bigram, Tweet, token = "ngrams", n = 2) %>%
  select(bigram) %>%
  separate(bigram, c("w1", "w2"), sep = " ") %>%
  filter(!(w1 %in% stop_words$word)) %>%
  filter(!(w2 %in% stop_words$word)) %>%
  count(w1, w2, sort = TRUE)
# Word cloud of bigrams: glue the two words back into one label and pass the
# (bigram, n) table to wordcloud2.
fifa_ngram %>%
  unite(bigram, w1, w2, sep = " ") %>%
  wordcloud2::wordcloud2()
# Sentiment contribution of the words that directly follow "worldcup".
#
# `fifa_ngram` is already a frequency table (one row per w1/w2 pair, with the
# pair's frequency in `n`). Counting rows again therefore gave 1 for every
# word, and `contribution` collapsed to the bare AFINN score. The contribution
# is now weighted by the real bigram frequency: n * score.
#
# NOTE(review): newer tidytext/textdata releases name the AFINN column `value`
# instead of `score` — confirm against the installed version.
fifa_ngram %>%
  filter(w1 == "worldcup") %>%
  inner_join(get_sentiments("afinn"), by = c(w2 = "word")) %>%
  mutate(contribution = n * score) %>%
  arrange(desc(abs(contribution))) %>%
  mutate(w2 = reorder(w2, contribution)) %>%
  ggplot(aes(w2, contribution, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip()
# Network data for the most common bigrams: drop rows with missing values,
# keep pairs seen more than 4000 times, build a graph whose edges run
# w1 -> w2, and convert it to the nodes/edges list that visNetwork consumes.
# `section` (row index %/% 10) travels along as an extra edge attribute.
big_graph <- fifa_ngram %>%
  na.omit() %>%
  mutate(section = row_number() %/% 10) %>%
  filter(n > 4000) %>%
  igraph::graph_from_data_frame() %>%
  toVisNetworkData()
# Render the bigram network with nearest-neighbour highlighting enabled.
visNetwork(nodes = big_graph$nodes, edges = big_graph$edges) %>%
  visOptions(highlightNearest = TRUE)