Part 3 - Collecting and visualizing Tweets

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# text mining library
library(tidytext)
# plotting packages
library(igraph)
## Warning: package 'igraph' was built under R version 3.6.3
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## load rtweet package
library(rtweet)
library(httpuv)
## Warning: package 'httpuv' was built under R version 3.6.3
# Load the previously collected tweets from rt.csv.
# NOTE(review): the working directory is a hard-coded absolute path; it is
# kept because later relative paths may depend on it, but a project-relative
# path would make the script portable.
setwd("C:/Users/zxu3/Documents/R/sentiment2")
# encoding = "UTF-8": the file appears to hold UTF-8 bytes (the preview shows
# sequences such as "â\200\231" where a curly apostrophe belongs); marking the
# strings as UTF-8 keeps that mojibake out of the tokens built later.
# stringsAsFactors = FALSE: keep text columns as character on R < 4.0.
rt <- read.csv(file = "rt.csv", encoding = "UTF-8", stringsAsFactors = FALSE)
# Preview the first six rows.
head(rt)
##                user_id            status_id          created_at     screen_name
## 1 x1270707291466203137 x1306706202651369473 2020-09-17 21:26:50 J11235843718981
## 2            x38228827 x1306705942503923717 2020-09-17 21:25:48     CorpCounsel
## 3 x1176823965366607872 x1306705821737385984 2020-09-17 21:25:19 MarkBak91766022
## 4 x1300247746537021446 x1306705618108063748 2020-09-17 21:24:31         MNMAGA1
## 5 x1220313283267190784 x1306703535799631872 2020-09-17 21:16:14       Jewelrese
## 6  x876569845801795585 x1306702232553811968 2020-09-17 21:11:04      smarty7725
##                                                                                                                                                                                                                                           text
## 1                                                                                                 @RealCandaceO how about snoop doggy dogg wine lmfao...get aunt jemima off the bottle but snoop can have his ugly face on the wine bottle lol
## 2 One of the main challenges is that dropping a brandĆ¢\200ā€whether itĆ¢\200\231s Aunt Jemima syrup, the Washington Redskins football team name or Eskimo Pie ice cream barsĆ¢\200ā€opens the possibility of someone else claiming it. https://t.co/nsRlYc0wlp
## 3                                                                                                                                                              @RealCandaceO @DonaldJTrumpJr Aunt Jemima woke up and moved into the Log Cabin.
## 4                                @_RyanWinkler @MinnesotaDFL Fun to consider that theyâ\200\231ve cancelled Aunt Jemima and Uncle Ben, but Cuties on Netflix is here to stay. \n\nThe Left really has its priorities straight. #VoteRedToSaveAmerica
## 5                                                                                                                                                                                           skipped aunt Jemima &amp; all of them <U+0001F92D>
## 6             @RealCandaceO Nobody but nobody is getting my cream of wheat Guy or my aunt Jemima. Geez! This is iconic history! Leave are logos alone it our history. And by the way, leave our monuments alone too. My Lord helps we beg you!
##                source display_text_width   reply_to_status_id
## 1 Twitter for Android                126 x1306578811220361217
## 2      Hootsuite Inc.                230                     
## 3     Twitter Web App                 49 x1306578811220361217
## 4 Twitter for Android                173 x1306618688892735490
## 5 Twitter for Android                 39                     
## 6  Twitter for iPhone                210 x1306578811220361217
##      reply_to_user_id reply_to_screen_name is_quote is_retweet favorite_count
## 1 x878247600096509952         RealCandaceO    FALSE      FALSE              0
## 2                                             FALSE      FALSE              0
## 3 x878247600096509952         RealCandaceO    FALSE      FALSE              0
## 4          x116496709         _RyanWinkler    FALSE      FALSE              1
## 5                                             FALSE      FALSE              0
## 6 x878247600096509952         RealCandaceO    FALSE      FALSE              0
##   retweet_count quote_count reply_count             hashtags symbols
## 1             0          NA          NA                           NA
## 2             0          NA          NA                           NA
## 3             0          NA          NA                           NA
## 4             0          NA          NA VoteRedToSaveAmerica      NA
## 5             0          NA          NA                           NA
## 6             0          NA          NA                           NA
##                    urls_url               urls_t.co
## 1                                                  
## 2 law.com/corpcounsel/20â\200¦ https://t.co/nsRlYc0wlp
## 3                                                  
## 4                                                  
## 5                                                  
## 6                                                  
##                                                                                      urls_expanded_url
## 1                                                                                                     
## 2 https://www.law.com/corpcounsel/2020/09/16/the-challenges-of-walking-away-from-offensive-trademarks/
## 3                                                                                                     
## 4                                                                                                     
## 5                                                                                                     
## 6                                                                                                     
##   media_url media_t.co media_expanded_url media_type ext_media_url
## 1                                                                 
## 2                                                                 
## 3                                                                 
## 4                                                                 
## 5                                                                 
## 6                                                                 
##   ext_media_t.co ext_media_expanded_url ext_media_type
## 1                                                   NA
## 2                                                   NA
## 3                                                   NA
## 4                                                   NA
## 5                                                   NA
## 6                                                   NA
##                mentions_user_id        mentions_screen_name lang
## 1           x878247600096509952                RealCandaceO   en
## 2                                                             en
## 3 x878247600096509952 x39344374 RealCandaceO DonaldJTrumpJr   en
## 4          x116496709 x15900384   _RyanWinkler MinnesotaDFL   en
## 5                                                             en
## 6           x878247600096509952                RealCandaceO   en
##   quoted_status_id quoted_text quoted_created_at quoted_source
## 1                                                             
## 2                                                             
## 3                                                             
## 4                                                             
## 5                                                             
## 6                                                             
##   quoted_favorite_count quoted_retweet_count quoted_user_id quoted_screen_name
## 1                    NA                   NA                                  
## 2                    NA                   NA                                  
## 3                    NA                   NA                                  
## 4                    NA                   NA                                  
## 5                    NA                   NA                                  
## 6                    NA                   NA                                  
##   quoted_name quoted_followers_count quoted_friends_count quoted_statuses_count
## 1                                 NA                   NA                    NA
## 2                                 NA                   NA                    NA
## 3                                 NA                   NA                    NA
## 4                                 NA                   NA                    NA
## 5                                 NA                   NA                    NA
## 6                                 NA                   NA                    NA
##   quoted_location quoted_description quoted_verified retweet_status_id
## 1                                                 NA                NA
## 2                                                 NA                NA
## 3                                                 NA                NA
## 4                                                 NA                NA
## 5                                                 NA                NA
## 6                                                 NA                NA
##   retweet_text retweet_created_at retweet_source retweet_favorite_count
## 1           NA                 NA             NA                     NA
## 2           NA                 NA             NA                     NA
## 3           NA                 NA             NA                     NA
## 4           NA                 NA             NA                     NA
## 5           NA                 NA             NA                     NA
## 6           NA                 NA             NA                     NA
##   retweet_retweet_count retweet_user_id retweet_screen_name retweet_name
## 1                    NA              NA                  NA           NA
## 2                    NA              NA                  NA           NA
## 3                    NA              NA                  NA           NA
## 4                    NA              NA                  NA           NA
## 5                    NA              NA                  NA           NA
## 6                    NA              NA                  NA           NA
##   retweet_followers_count retweet_friends_count retweet_statuses_count
## 1                      NA                    NA                     NA
## 2                      NA                    NA                     NA
## 3                      NA                    NA                     NA
## 4                      NA                    NA                     NA
## 5                      NA                    NA                     NA
## 6                      NA                    NA                     NA
##   retweet_location retweet_description retweet_verified place_url place_name
## 1               NA                  NA               NA                     
## 2               NA                  NA               NA                     
## 3               NA                  NA               NA                     
## 4               NA                  NA               NA                     
## 5               NA                  NA               NA                     
## 6               NA                  NA               NA                     
##   place_full_name place_type country country_code geo_coords coords_coords
## 1                                                      NA NA         NA NA
## 2                                                      NA NA         NA NA
## 3                                                      NA NA         NA NA
## 4                                                      NA NA         NA NA
## 5                                                      NA NA         NA NA
## 6                                                      NA NA         NA NA
##               bbox_coords
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
##                                                       status_url
## 1 https://twitter.com/J11235843718981/status/1306706202651369473
## 2     https://twitter.com/CorpCounsel/status/1306705942503923717
## 3 https://twitter.com/MarkBak91766022/status/1306705821737385984
## 4         https://twitter.com/MNMAGA1/status/1306705618108063748
## 5       https://twitter.com/Jewelrese/status/1306703535799631872
## 6      https://twitter.com/smarty7725/status/1306702232553811968
##                        name      location
## 1 J112358437189887641562819              
## 2         Corporate Counsel  New York, NY
## 3                Mark Baker              
## 4                   MN MAGA              
## 5                Dirt queen Silicon Beach
## 6              Martha Denny  Arizona, USA
##                                                                   description
## 1 https://t.co/QrqjZYZ7Q5 \n\n$t3t6t9 \n\nj112358437189887641562819@gmail.com
## 2                     The business magazine and website for in-house counsel.
## 3                                                         mean metal machines
## 4                             MAGA FOREVER â\200¢ Dog lover â\200¢ <U+271D><U+FE0F>
## 5                                                                      bright
## 6                                                                            
##                       url protected followers_count friends_count listed_count
## 1 https://t.co/dAmgWraxDk     FALSE               0           118            0
## 2 https://t.co/njvbBDmELo     FALSE           30672           188          664
## 3                             FALSE               4            63            0
## 4                             FALSE             124           358            0
## 5                             FALSE              11            37            0
## 6                             FALSE             199           491            1
##   statuses_count favourites_count  account_created_at verified
## 1            224              584 2020-06-10 13:20:16    FALSE
## 2          23472              400 2009-05-06 17:26:10    FALSE
## 3            521               94 2019-09-25 11:42:38    FALSE
## 4            617              225 2020-08-31 01:43:29    FALSE
## 5           3510               67 2020-01-23 11:52:26    FALSE
## 6           3458              400 2017-06-18 22:38:31    FALSE
##               profile_url                  profile_expanded_url account_lang
## 1 https://t.co/dAmgWraxDk https://join.robinhood.com/jasons9456           NA
## 2 https://t.co/njvbBDmELo      https://www.law.com/corpcounsel/           NA
## 3                                                                         NA
## 4                                                                         NA
## 5                                                                         NA
## 6                                                                         NA
##                                                     profile_banner_url
## 1                                                                     
## 2            https://pbs.twimg.com/profile_banners/38228827/1591375423
## 3 https://pbs.twimg.com/profile_banners/1176823965366607872/1577305978
## 4                                                                     
## 5 https://pbs.twimg.com/profile_banners/1220313283267190784/1581175257
## 6                                                                     
##                             profile_background_url
## 1                                                 
## 2 http://abs.twimg.com/images/themes/theme1/bg.png
## 3                                                 
## 4                                                 
## 5                                                 
## 6                                                 
##                                                               profile_image_url
## 1   http://pbs.twimg.com/profile_images/1270707421225390081/blfWSkWI_normal.jpg
## 2   http://pbs.twimg.com/profile_images/1182753016845426688/FANWOew4_normal.png
## 3   http://pbs.twimg.com/profile_images/1209935079683870720/62O0YDHK_normal.jpg
## 4   http://pbs.twimg.com/profile_images/1300248019280027648/eDVEU3lF_normal.jpg
## 5   http://pbs.twimg.com/profile_images/1226163834181701632/zOcpSMuu_normal.jpg
## 6 http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png
# Strip URLs from the tweet text before tokenizing.
# Match only the URL itself (scheme plus the following run of non-space
# characters). The previous pattern "http.*" also deleted every word that
# came after a link. "https?" covers both http and https in a single pass.
rt$stripped_text <- gsub("https?://[^[:space:]]+", "", rt$text)


# Tokenize the URL-free text into one word per row. With its default word
# tokenizer, unnest_tokens() lowercases the text and drops punctuation.
rt_clean <- unnest_tokens(
  dplyr::select(rt, stripped_text),
  output = word,
  input = stripped_text
)


# unique() compares strings exactly: "Dog" and "dog" count as different
# values, and the lone comma is kept as an entry of its own.
a_list_of_words <- c("Dog", rep("dog", 2), rep("cat", 2), ",")
unique(a_list_of_words)
## [1] "Dog" "dog" "cat" ","
## [1] "Dog" "dog" "cat" ","

# Rebuild the word-level table: keep only the stripped text column and
# split it into one lowercase, punctuation-free word per row.
rt_clean <- rt %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(output = word, input = stripped_text)


# plot the top 15 words -- notice any issues?
# The words are mapped to x and the counts to y; coord_flip() only swaps
# where the axes are drawn, so the labels must follow the aesthetics:
# x = words, y = count. (xlab(NULL) was dropped: labs(x = ...) overrode it.)
rt_clean %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
## Selecting by n

# Bring the stop-word table shipped with tidytext into the workspace
data(stop_words)
# Peek at its first six rows
head(stop_words, n = 6)
## # A tibble: 6 x 2
##   word      lexicon
##   <chr>     <chr>  
## 1 a         SMART  
## 2 a's       SMART  
## 3 able      SMART  
## 4 about     SMART  
## 5 above     SMART  
## 6 according SMART
# Drop every token that also appears in the stop-word list. No `by` is
# given, so the join uses the columns the two tables have in common.
cleaned_tweet_words <- anti_join(rt_clean, stop_words)
## Joining, by = "word"
# The row count should have dropped now that the stop words are gone
dim(cleaned_tweet_words)[1]
## [1] 11330
# Bar chart of the 15 most frequent words once stop words are removed
# (top_n() selects by the last column, n, and keeps ties).
cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(mapping = aes(x = word, y = n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(
    title = "Count of unique words found in tweets",
    subtitle = "Stop words removed from the list",
    x = "Unique words",
    y = "Count"
  )
## Selecting by n

#install.packages("devtools")
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 3.6.3
#install_github("dgrtwo/widyr")
library(widyr)

# Split the URL-free text into bigrams (two-word sequences), one per row
rt_paired_words <- rt %>%
  dplyr::select(stripped_text) %>%
  unnest_tokens(
    output = paired_words,
    input = stripped_text,
    token = "ngrams",
    n = 2
  )

# Tally the bigrams, most frequent first
count(rt_paired_words, paired_words, sort = TRUE)
## # A tibble: 15,186 x 2
##    paired_words     n
##    <chr>        <int>
##  1 aunt jemima    807
##  2 u 0001f62d     140
##  3 0001f62d u     134
##  4 jemima and     105
##  5 uncle ben       88
##  6 and uncle       66
##  7 in the          61
##  8 jemima syrup    58
##  9 of aunt         53
## 10 jemima is       42
## # ... with 15,176 more rows
# Redraw the top-15 bar chart of stop-word-free tokens: count each word,
# keep the 15 most frequent, and order the bars by frequency.
cleaned_tweet_words %>%
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(subtitle = "Stop words removed from the list",
       title = "Count of unique words found in tweets",
       x = "Unique words",
       y = "Count")
## Selecting by n

## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
## 
##     crossing
## # A tibble: 6 x 3
##   word1         word2             n
##   <chr>         <chr>         <int>
## 1 aunt          jemima          807
## 2 uncle         ben              88
## 3 jemima        syrup            58
## 4 maple         syrup            38
## 5 aunt          jemimaâ          31
## 6 alissawalsh21 mattwalshblog    19