Introduction to this project

The FIFA World Cup is an international sporting event followed by millions of soccer fans around the world. In this project, we visualize tweets about the tournament and apply some natural language processing to them.

1. Read and glimpse the data

  # Point the session at the folder holding the tweet export, then list its
  # contents to confirm the CSV is there.
  setwd("~/Documents/kaggle_data/fifa")
  dir()
## [1] "FIFA.csv"     "fifa.RData"   "fifa.Rmd"     "fifa.nb.html"
## [5] "fifa_cache"
  library(tidyverse)
  library(tidytext)
  library(visNetwork)
  
  # Load the raw tweet export and print a compact per-column summary.
  # NOTE(review): ID is parsed as a double (~1e18, beyond 2^53), so the tweet
  # IDs are not exact; harmless here because ID is dropped in the next step.
  fifa <- read_csv(file = "FIFA.csv")
  fifa %>% glimpse()
## Observations: 530,000
## Variables: 16
## $ ID               <dbl> 1.013597e+18, 1.013597e+18, 1.013597e+18, 1.0...
## $ lang             <chr> "en", "en", "en", "en", "en", "en", "en", "en...
## $ Date             <dttm> 2018-07-02 01:35:45, 2018-07-02 01:35:44, 20...
## $ Source           <chr> "Twitter for Android", "Twitter for Android",...
## $ len              <int> 140, 139, 107, 142, 140, 140, 140, 138, 138, ...
## $ Orig_Tweet       <chr> "RT @Squawka: Only two goalkeepers have saved...
## $ Tweet            <chr> "Only two goalkeepers have saved three penalt...
## $ Likes            <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ RTs              <int> 477, 1031, 488, 0, 477, 153, 4, 1, 2199, 5146...
## $ Hashtags         <chr> "WorldCup,POR,ENG", "WorldCup", "worldcup", "...
## $ UserMentionNames <chr> "Squawka Football", "FC Barcelona,Ivan Rakiti...
## $ UserMentionID    <chr> "Squawka", "FCBarcelona,ivanrakitic,HNS_CFF",...
## $ Name             <chr> "Cayleb", "Febri Aditya", "??", "Frida Carril...
## $ Place            <chr> "Accra", "Bogor", NA, "Zapopan, Jalisco", NA,...
## $ Followers        <int> 861, 667, 65, 17, 137, 29, 208, 7, 1, 158, 34...
## $ Friends          <int> 828, 686, 67, 89, 216, 283, 338, 9, 6, 245, 3...

The data contain 530,000 observations and 16 features. We only need some of them, so the remaining features are dropped.

  # Keep English-language tweets and narrow the table to the eight columns
  # used in the rest of the analysis, then re-inspect the result.
  fifa <- fifa %>%
    filter(lang == "en") %>%
    select(
      Source, len, Tweet, Hashtags,
      RTs, UserMentionNames, Name, Place
    )
  fifa %>% glimpse()
## Observations: 530,000
## Variables: 8
## $ Source           <chr> "Twitter for Android", "Twitter for Android",...
## $ len              <int> 140, 139, 107, 142, 140, 140, 140, 138, 138, ...
## $ Tweet            <chr> "Only two goalkeepers have saved three penalt...
## $ Hashtags         <chr> "WorldCup,POR,ENG", "WorldCup", "worldcup", "...
## $ RTs              <int> 477, 1031, 488, 0, 477, 153, 4, 1, 2199, 5146...
## $ UserMentionNames <chr> "Squawka Football", "FC Barcelona,Ivan Rakiti...
## $ Name             <chr> "Cayleb", "Febri Aditya", "??", "Frida Carril...
## $ Place            <chr> "Accra", "Bogor", NA, "Zapopan, Jalisco", NA,...

2. EDA

2.1 Top words in tweets

  # One row per token: split every tweet into words, then keep only tokens
  # that are not stop words and that contain at least one letter a-z.
  fifa_tidy <- fifa %>%
    unnest_tokens(words, Tweet) %>%
    filter(
      !(words %in% stop_words$word),
      str_detect(words, "[a-z]")
    )

  # Horizontal bar chart of the top 20 tokens by frequency across all tweets.
  fifa_tidy %>%
    count(words, sort = TRUE) %>%
    top_n(20, wt = n) %>%
    ggplot(aes(x = reorder(words, n), y = n)) +
    geom_col(fill = "#AAB7B8") +
    coord_flip() +
    theme_bw() +
    labs(x = "", y = "", title = "Top words in tweets")

  # Top 10 tokens per client, restricted to sources whose name starts with
  # "Twitter for"; one facet per source, each with its own axes.
  fifa_tidy %>%
    filter(str_detect(Source, "^Twitter for")) %>%
    count(Source, words, sort = TRUE) %>%
    group_by(Source) %>%
    top_n(10, wt = n) %>%
    ggplot(aes(x = reorder(words, n), y = n, fill = Source)) +
    geom_col() +
    facet_wrap(~Source, scales = "free", ncol = 2) +
    coord_flip() +
    theme_bw() +
    theme(legend.position = "none") +
    labs(x = "", y = "", title = "Top words of tweets in Each source")

### 2.2 Wordcloud of words

  # Interactive word cloud built from the top 500 tokens by frequency.
  fifa_tidy %>%
    count(words, sort = TRUE) %>%
    top_n(500, wt = n) %>%
    wordcloud2::wordcloud2()
  # Tag tokens with the Bing lexicon; the inner join keeps only tokens that
  # appear in the lexicon and adds its `sentiment` column.
  fifa_tidy_sentiment <- fifa_tidy %>%
    rename(word = words) %>%
    inner_join(get_sentiments("bing"), by = "word")
  
  # Ten most frequent words within each Bing sentiment, one facet per
  # sentiment. top_n() is given no weight, so it ranks by the last column,
  # which is `total` here.
  fifa_tidy_sentiment %>%
    group_by(word, sentiment) %>%
    summarise(total = n()) %>%
    group_by(sentiment) %>%
    arrange(desc(total)) %>%
    top_n(10) %>%
    ggplot(aes(x = reorder(word, total), y = total, fill = sentiment)) +
    geom_col() +
    facet_wrap(~sentiment, scales = "free") +
    coord_flip() +
    theme_bw() +
    theme(legend.position = "none")

  # Comparison cloud: build a word x sentiment matrix of counts (0 where a
  # word does not occur under a sentiment) and draw up to 350 words.
  fifa_tidy_sentiment %>%
    group_by(word, sentiment) %>%
    summarise(total = n()) %>%
    arrange(desc(total)) %>%
    reshape2::acast(word ~ sentiment, value.var = "total", fill = 0) %>%
    wordcloud::comparison.cloud(
      max.words = 350,
      colors = c("#F8766D", "#00BFC4")
    )
## Warning in wordcloud::comparison.cloud(., colors = c("#F8766D",
## "#00BFC4"), : perseverance could not be fit on page. It will not be
## plotted.

2.3 More sentiments in tweets

  # Tag tokens with the NRC lexicon; the inner join keeps only tokens present
  # in the lexicon and adds its `sentiment` column.
  fifa_all_sens <- fifa_tidy %>%
    rename(word = words) %>%
    inner_join(get_sentiments("nrc"), by = "word")

  # Top 10 words per NRC sentiment category, three facets per row.
  # top_n() is given no weight, so it ranks by the last column (`n`).
  fifa_all_sens %>%
    count(word, sentiment, sort = TRUE) %>%
    group_by(sentiment) %>%
    top_n(10) %>%
    ggplot(aes(x = reorder(word, n), y = n, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, scales = "free", ncol = 3) +
    coord_flip() +
    theme_bw() +
    theme(legend.position = "none") +
    labs(
      x = "",
      y = "",
      title = "The top 10 words under each sentiment category"
    )

  # Treat each NRC sentiment category as a "document" and show, per category,
  # the 15 words with the highest tf-idf.
  # The bars plot and order by tf_idf itself; previously the raw count `n`
  # was drawn under a "tf-idf" axis label, so the statistic computed by
  # bind_tf_idf() never reached the chart.
  fifa_all_sens %>%
    count(word, sentiment) %>%
    bind_tf_idf(word, sentiment, n) %>%
    group_by(sentiment) %>%
    # Rank explicitly by tf_idf instead of relying on it being the last column.
    top_n(15, wt = tf_idf) %>%
    ungroup() %>%
    ggplot(aes(x = reorder(word, tf_idf), y = tf_idf, fill = sentiment)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~sentiment, ncol = 3, scales = "free") +
    coord_flip() +
    labs(x = NULL, y = "tf-idf")

2.4 N-grams

  # Bigram frequency table: split tweets into two-word sequences, separate
  # them into w1/w2, drop pairs where either word is a stop word, and count
  # each remaining pair (most frequent first).
  fifa_ngram <- fifa %>%
    unnest_tokens(bigram, Tweet, token = "ngrams", n = 2) %>%
    select(bigram) %>%
    separate(bigram, c("w1", "w2"), sep = " ") %>%
    filter(
      # `%in%` is FALSE for NA, so missing tokens would pass the stop-word
      # test and end up counted as an "NA NA" bigram; drop them here.
      !is.na(w1), !is.na(w2),
      !w1 %in% stop_words$word,
      !w2 %in% stop_words$word
    ) %>%
    count(w1, w2, sort = TRUE)

  # Interactive word cloud over every counted bigram: w1 and w2 are glued
  # back into a single space-separated label, sized by the count column.
  fifa_ngram %>%
    unite(bigram, w1, w2, sep = " ") %>%
    wordcloud2::wordcloud2()
  # Sentiment contribution of the words that follow "worldcup": each
  # following word's AFINN score is weighted by how often the bigram occurs.
  # fifa_ngram already holds one row per (w1, w2) pair with its frequency in
  # `n`, so re-counting rows always yielded 1 and discarded that frequency;
  # the weight is therefore taken from `n` directly. This also avoids the
  # auto-generated `nn` column, whose name depends on the dplyr version.
  # NOTE(review): recent AFINN lexicon releases may expose the rating as
  # `value` rather than `score` -- confirm against the installed version.
  fifa_ngram %>%
    filter(w1 == "worldcup") %>%
    inner_join(get_sentiments("afinn"), by = c(w2 = "word")) %>%
    mutate(
      contribution = n * score,
      w2 = reorder(w2, contribution)
    ) %>%
    arrange(desc(abs(contribution))) %>%
    ggplot(aes(w2, contribution, fill = contribution > 0)) +
    geom_col(show.legend = FALSE) +
    coord_flip()

  # Network of frequent bigrams: keep complete rows seen more than 4000
  # times, turn w1 -> w2 into graph edges, and convert to visNetwork's
  # nodes/edges format. `section` is carried along as an extra edge column.
  big_graph <- fifa_ngram %>%
    na.omit() %>%
    mutate(section = row_number() %/% 10) %>%
    filter(n > 4000) %>%
    igraph::graph_from_data_frame() %>%
    toVisNetworkData()
  # Render the interactive graph, highlighting a node's neighbours on click.
  visNetwork(nodes = big_graph$nodes, edges = big_graph$edges) %>%
    visOptions(highlightNearest = TRUE)