# load twitter library - the rtweet package is now recommended over twitteR
library(rtweet)
# plotting and pipes - tidyverse!
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# text mining library
library(tidytext)
# plotting packages
library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
climate_tweets <- search_tweets(q = "#climatechange", n = 10000,
                                lang = "en",
                                include_rts = FALSE)
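# note: search_tweets() requires an authenticated token. A minimal sketch using
# rtweet's create_token() (the pre-1.0 interface; every key value below is a
# placeholder you must replace with your own Twitter app credentials):
# token <- create_token(app = "my_twitter_app",
#                       consumer_key = "XXXXX", consumer_secret = "XXXXX",
#                       access_token = "XXXXX", access_secret = "XXXXX")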

# check data to see if there are emojis
head(climate_tweets$text)
## [1] "Planet/Climate change takes millenials if not epochs of time #ClimateChange \nClimate Change is being used by the United Nations Sustainable Development Gang #UNSDG to push their Agenda21\n#cdnpoli #elxn43 #polcan\n \nhttps://t.co/jr919AHJwt\n\nI Choose CANADA!\nhttps://t.co/TV9gk0UBzC"                                           
## [2] "@LeonardRoxon Planet/Climate change takes millenials if not epochs of time #ClimateChange \nClimate Change is being used by the United Nations Sustainable Development Gang to push their Agenda21\n#cdnpoli #elxn43 #polcan\n \nhttps://t.co/jr919AHJwt\n\nChoose CANADA \nhttps://t.co/TV9gk0UBzC"                                      
## [3] "TURDeau has been convinced &amp; convinced cdns he is doing it supposedly for a greater good. Scaring humanity into believing the PlanetChange LIE #ClimateChange Goal of achieving United Nations Sustainable Development Goals #UNSDG Gang by lowering CDN'S standard of living #cdnpoli https://t.co/5RlY5Rx8L0"                       
## [4] "Where r the jews in all this #unsdg 1 world, new world order, power currupts absolutely, big climate change lie, not planet change but  ClimateChange? #ClimateChange I care about humanity more than u know, starts at homele$$ne$$ No excuse justify deceiving cdns #ppc drill water https://t.co/RmkndpnsXv"                           
## [5] "@Historian_Matt @theJagmeetSingh Told you jagmeet Singh is now a globalist new world order coolaid drinking PlanetChange #ClimateChange in our lifetime believer.\n #cdnpoli #elxn43 #prorep #EqualSenate #2termPM #ndp ppc2019 #PPC2019"                                                                                                 
## [6] "@Tenenbrae @MandysTake TURDeau has been convinced &amp; convinced cdns he is doing it supposedly for a greater good. Scaring humanity into believing the PlanetChange LIE #ClimateChange Goal of achieving United Nations Sustainable Development Goals #UNSDG Gang by lowering CDN'S standard of living #cdnpoli https://t.co/BKHPk0SrKU"
# find tweets that use the words climate and change together
climate_tweets <- search_tweets(q = "climate+change", n = 10000, lang = "en",
                                include_rts = FALSE)
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
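# the warnings above mean the search hit Twitter's API rate limit mid-request.
# A sketch of two options rtweet provides: retryonratelimit = TRUE waits and
# resumes the search, and rate_limit() reports your remaining quota
# climate_tweets <- search_tweets(q = "climate+change", n = 10000, lang = "en",
#                                 include_rts = FALSE, retryonratelimit = TRUE)
# rate_limit()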
# remove urls - the commented tidyverse call below fails because gsub() is
# evaluated immediately instead of being passed as a function; mutate() works:
# climate_tweets <- climate_tweets %>%
#   mutate(stripped_text = gsub("http.*", "", text))

# remove http elements manually - the "http.*" pattern also matches "https",
# so one gsub() call covers both
climate_tweets$stripped_text <- gsub("http.*", "", climate_tweets$text)
# note the words that are recognized as unique by R
a_list_of_words <- c("Dog", "dog", "dog", "cat", "cat", ",")
unique(a_list_of_words)
## [1] "Dog" "dog" "cat" ","
## [1] "Dog" "dog" "cat" ","
# remove punctuation and convert to lowercase - unnest_tokens() does both
climate_tweets_clean <- climate_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(word, stripped_text)
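# a variant (not used below) that keeps a tweet id so each word can be traced
# back to its tweet; tweet_id is a made-up column name for illustration:
# climate_tweets_clean_ids <- climate_tweets %>%
#   dplyr::select(stripped_text) %>%
#   mutate(tweet_id = row_number()) %>%
#   unnest_tokens(word, stripped_text)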
# plot the top 15 words -- notice any issues?
climate_tweets_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
## Selecting by n

# load list of stop words - from the tidytext package
data("stop_words")
# view first 6 words
head(stop_words)
## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART
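# stop_words bundles several lexicons; to inspect them, or to use just one
# (a less aggressive option), subset by the lexicon column:
# stop_words %>% count(lexicon)
# smart_stops <- stop_words %>% filter(lexicon == "SMART")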
nrow(climate_tweets_clean)
## [1] 211562
# remove stop words from your list of words
cleaned_tweet_words <- climate_tweets_clean %>%
  anti_join(stop_words)
## Joining, by = "word"
# there should be fewer words now
nrow(cleaned_tweet_words)
## [1] 104715
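# the search terms themselves ("climate", "change") will dominate the counts;
# a sketch of dropping them with a custom stop word list (custom_stop_words is
# a made-up name):
# custom_stop_words <- dplyr::tibble(word = c("climate", "change"),
#                                    lexicon = "custom")
# cleaned_tweet_words <- cleaned_tweet_words %>%
#   anti_join(custom_stop_words, by = "word")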
# plot the top 15 words again -- stop words removed
cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets",
       subtitle = "Stop words removed from the list")
## Selecting by n
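# note: top_n() is superseded in current dplyr; the modern equivalent, which
# also silences the "Selecting by n" message, is
# cleaned_tweet_words %>% count(word, sort = TRUE) %>% slice_max(n, n = 15)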

# library(devtools)
# install_github("dgrtwo/widyr")
library(widyr)

# tokenize tweets into bigrams (pairs of adjacent words); unnest_tokens()
# again lowercases and strips punctuation
climate_tweets_paired_words <- climate_tweets %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(paired_words, stripped_text, token = "ngrams", n = 2)

climate_tweets_paired_words %>%
  count(paired_words, sort = TRUE)
## # A tibble: 109,095 x 2
##    paired_words       n
##    <chr>          <int>
##  1 climate change  7367
##  2 change is        904
##  3 of the           561
##  4 about climate    544
##  5 on climate       544
##  6 the climate      537
##  7 in the           526
##  8 of climate       497
##  9 is a             465
## 10 change and       398
## # … with 109,085 more rows
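# widyr (loaded above) can count co-occurrence of words anywhere within the
# same tweet, not just adjacent pairs; a sketch assuming the tweet-id table
# from the commented example earlier:
# word_pairs <- climate_tweets_clean_ids %>%
#   pairwise_count(word, tweet_id, sort = TRUE)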
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
## 
##     crossing
# split each bigram into two columns so stop words can be filtered per word
climate_tweets_separated_words <- climate_tweets_paired_words %>%
  separate(paired_words, c("word1", "word2"), sep = " ")

climate_tweets_filtered <- climate_tweets_separated_words %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
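# note: ngram tokenization yields NA for tweets shorter than two words, and
# NAs survive the %in% filters above; if sum(is.na(...)) shows any, drop them:
# climate_tweets_filtered <- climate_tweets_filtered %>%
#   filter(!is.na(word1), !is.na(word2))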

# new bigram counts:
climate_words_counts <- climate_tweets_filtered %>%
  count(word1, word2, sort = TRUE)

head(climate_words_counts)
## # A tibble: 6 x 3
##   word1   word2        n
##   <chr>   <chr>    <int>
## 1 climate change    7367
## 2 greta   thunberg   230
## 3 global  warming    227
## 4 fight   climate    120
## 5 jason   momoa       96
## 6 change  deniers     95
library(igraph)
library(ggraph)

# plot climate change word network
climate_words_counts %>%
  filter(n >= 24) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(color = "darkslategray4", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.8, size = 3) +
  labs(title = "Word Network: Tweets using the hashtag - Climate Change",
       subtitle = "Text mining twitter data ",
       x = "", y = "")