# Part 3 - Collecting and visualizing Tweets
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# text mining library
library(tidytext)
# plotting packages
library(igraph)
## Warning: package 'igraph' was built under R version 3.6.3
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggraph)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.3
## load rtweet package
library(rtweet)
library(httpuv)
## Warning: package 'httpuv' was built under R version 3.6.3
# Load the previously collected tweets from rt.csv.
# NOTE(review): setwd() with a user-specific absolute path ties this script to
# one machine. It is kept so that any relative paths used elsewhere keep
# resolving, but a project-relative path would be more portable.
setwd("C:/Users/zxu3/Documents/R/sentiment2")
# The file appears to hold UTF-8 text: read without an explicit encoding, the
# curly quotes and dashes in the tweets come back as garbled byte sequences
# that later leak into the tokens. stringsAsFactors = FALSE keeps the text
# columns as character vectors on R < 4.0 as well.
rt <- read.csv(file = "rt.csv", encoding = "UTF-8", stringsAsFactors = FALSE)
# Preview the first six tweets.
head(rt)
## user_id status_id created_at screen_name
## 1 x1270707291466203137 x1306706202651369473 2020-09-17 21:26:50 J11235843718981
## 2 x38228827 x1306705942503923717 2020-09-17 21:25:48 CorpCounsel
## 3 x1176823965366607872 x1306705821737385984 2020-09-17 21:25:19 MarkBak91766022
## 4 x1300247746537021446 x1306705618108063748 2020-09-17 21:24:31 MNMAGA1
## 5 x1220313283267190784 x1306703535799631872 2020-09-17 21:16:14 Jewelrese
## 6 x876569845801795585 x1306702232553811968 2020-09-17 21:11:04 smarty7725
## text
## 1 @RealCandaceO how about snoop doggy dogg wine lmfao...get aunt jemima off the bottle but snoop can have his ugly face on the wine bottle lol
## 2 One of the main challenges is that dropping a brandĆ¢\200āwhether itĆ¢\200\231s Aunt Jemima syrup, the Washington Redskins football team name or Eskimo Pie ice cream barsĆ¢\200āopens the possibility of someone else claiming it. https://t.co/nsRlYc0wlp
## 3 @RealCandaceO @DonaldJTrumpJr Aunt Jemima woke up and moved into the Log Cabin.
## 4 @_RyanWinkler @MinnesotaDFL Fun to consider that theyâ\200\231ve cancelled Aunt Jemima and Uncle Ben, but Cuties on Netflix is here to stay. \n\nThe Left really has its priorities straight. #VoteRedToSaveAmerica
## 5 skipped aunt Jemima & all of them <U+0001F92D>
## 6 @RealCandaceO Nobody but nobody is getting my cream of wheat Guy or my aunt Jemima. Geez! This is iconic history! Leave are logos alone it our history. And by the way, leave our monuments alone too. My Lord helps we beg you!
## source display_text_width reply_to_status_id
## 1 Twitter for Android 126 x1306578811220361217
## 2 Hootsuite Inc. 230
## 3 Twitter Web App 49 x1306578811220361217
## 4 Twitter for Android 173 x1306618688892735490
## 5 Twitter for Android 39
## 6 Twitter for iPhone 210 x1306578811220361217
## reply_to_user_id reply_to_screen_name is_quote is_retweet favorite_count
## 1 x878247600096509952 RealCandaceO FALSE FALSE 0
## 2 FALSE FALSE 0
## 3 x878247600096509952 RealCandaceO FALSE FALSE 0
## 4 x116496709 _RyanWinkler FALSE FALSE 1
## 5 FALSE FALSE 0
## 6 x878247600096509952 RealCandaceO FALSE FALSE 0
## retweet_count quote_count reply_count hashtags symbols
## 1 0 NA NA NA
## 2 0 NA NA NA
## 3 0 NA NA NA
## 4 0 NA NA VoteRedToSaveAmerica NA
## 5 0 NA NA NA
## 6 0 NA NA NA
## urls_url urls_t.co
## 1
## 2 law.com/corpcounsel/20â\200¦ https://t.co/nsRlYc0wlp
## 3
## 4
## 5
## 6
## urls_expanded_url
## 1
## 2 https://www.law.com/corpcounsel/2020/09/16/the-challenges-of-walking-away-from-offensive-trademarks/
## 3
## 4
## 5
## 6
## media_url media_t.co media_expanded_url media_type ext_media_url
## 1
## 2
## 3
## 4
## 5
## 6
## ext_media_t.co ext_media_expanded_url ext_media_type
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## mentions_user_id mentions_screen_name lang
## 1 x878247600096509952 RealCandaceO en
## 2 en
## 3 x878247600096509952 x39344374 RealCandaceO DonaldJTrumpJr en
## 4 x116496709 x15900384 _RyanWinkler MinnesotaDFL en
## 5 en
## 6 x878247600096509952 RealCandaceO en
## quoted_status_id quoted_text quoted_created_at quoted_source
## 1
## 2
## 3
## 4
## 5
## 6
## quoted_favorite_count quoted_retweet_count quoted_user_id quoted_screen_name
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## quoted_name quoted_followers_count quoted_friends_count quoted_statuses_count
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## quoted_location quoted_description quoted_verified retweet_status_id
## 1 NA NA
## 2 NA NA
## 3 NA NA
## 4 NA NA
## 5 NA NA
## 6 NA NA
## retweet_text retweet_created_at retweet_source retweet_favorite_count
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## retweet_retweet_count retweet_user_id retweet_screen_name retweet_name
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## retweet_followers_count retweet_friends_count retweet_statuses_count
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## retweet_location retweet_description retweet_verified place_url place_name
## 1 NA NA NA
## 2 NA NA NA
## 3 NA NA NA
## 4 NA NA NA
## 5 NA NA NA
## 6 NA NA NA
## place_full_name place_type country country_code geo_coords coords_coords
## 1 NA NA NA NA
## 2 NA NA NA NA
## 3 NA NA NA NA
## 4 NA NA NA NA
## 5 NA NA NA NA
## 6 NA NA NA NA
## bbox_coords
## 1 NA NA NA NA NA NA NA NA
## 2 NA NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA NA
## 4 NA NA NA NA NA NA NA NA
## 5 NA NA NA NA NA NA NA NA
## 6 NA NA NA NA NA NA NA NA
## status_url
## 1 https://twitter.com/J11235843718981/status/1306706202651369473
## 2 https://twitter.com/CorpCounsel/status/1306705942503923717
## 3 https://twitter.com/MarkBak91766022/status/1306705821737385984
## 4 https://twitter.com/MNMAGA1/status/1306705618108063748
## 5 https://twitter.com/Jewelrese/status/1306703535799631872
## 6 https://twitter.com/smarty7725/status/1306702232553811968
## name location
## 1 J112358437189887641562819
## 2 Corporate Counsel New York, NY
## 3 Mark Baker
## 4 MN MAGA
## 5 Dirt queen Silicon Beach
## 6 Martha Denny Arizona, USA
## description
## 1 https://t.co/QrqjZYZ7Q5 \n\n$t3t6t9 \n\nj112358437189887641562819@gmail.com
## 2 The business magazine and website for in-house counsel.
## 3 mean metal machines
## 4 MAGA FOREVER â\200¢ Dog lover â\200¢ <U+271D><U+FE0F>
## 5 bright
## 6
## url protected followers_count friends_count listed_count
## 1 https://t.co/dAmgWraxDk FALSE 0 118 0
## 2 https://t.co/njvbBDmELo FALSE 30672 188 664
## 3 FALSE 4 63 0
## 4 FALSE 124 358 0
## 5 FALSE 11 37 0
## 6 FALSE 199 491 1
## statuses_count favourites_count account_created_at verified
## 1 224 584 2020-06-10 13:20:16 FALSE
## 2 23472 400 2009-05-06 17:26:10 FALSE
## 3 521 94 2019-09-25 11:42:38 FALSE
## 4 617 225 2020-08-31 01:43:29 FALSE
## 5 3510 67 2020-01-23 11:52:26 FALSE
## 6 3458 400 2017-06-18 22:38:31 FALSE
## profile_url profile_expanded_url account_lang
## 1 https://t.co/dAmgWraxDk https://join.robinhood.com/jasons9456 NA
## 2 https://t.co/njvbBDmELo https://www.law.com/corpcounsel/ NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## profile_banner_url
## 1
## 2 https://pbs.twimg.com/profile_banners/38228827/1591375423
## 3 https://pbs.twimg.com/profile_banners/1176823965366607872/1577305978
## 4
## 5 https://pbs.twimg.com/profile_banners/1220313283267190784/1581175257
## 6
## profile_background_url
## 1
## 2 http://abs.twimg.com/images/themes/theme1/bg.png
## 3
## 4
## 5
## 6
## profile_image_url
## 1 http://pbs.twimg.com/profile_images/1270707421225390081/blfWSkWI_normal.jpg
## 2 http://pbs.twimg.com/profile_images/1182753016845426688/FANWOew4_normal.png
## 3 http://pbs.twimg.com/profile_images/1209935079683870720/62O0YDHK_normal.jpg
## 4 http://pbs.twimg.com/profile_images/1300248019280027648/eDVEU3lF_normal.jpg
## 5 http://pbs.twimg.com/profile_images/1226163834181701632/zOcpSMuu_normal.jpg
## 6 http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png
# Remove URLs from the tweet text before tokenising.
# Only the URL itself (the scheme plus the run of non-whitespace characters
# that follows it) is deleted. The previous pattern "http.*" also discarded
# every word that came after the first link in a tweet. "https?" covers both
# http and https links in a single pass, so no second substitution is needed.
rt$stripped_text <- gsub("https?://[^[:space:]]+", "", rt$text)
# Split the URL-free tweet text into one token per row (column `word`).
# unnest_tokens() takes care of stripping punctuation and lowercasing.
# Only the stripped_text column is carried in, so no tweet id is kept.
rt_clean <- unnest_tokens(
  tbl = dplyr::select(rt, stripped_text),
  output = word,
  input = stripped_text
)
# Why lowercasing and punctuation removal matter: unique() compares strings
# exactly, so "Dog" and "dog" count as two different words and the lone ","
# is treated as a word of its own.
a_list_of_words <- c(
  "Dog", "dog", "dog",
  "cat", "cat",
  ","
)
unique(a_list_of_words)
## [1] "Dog" "dog" "cat" ","
## [1] "Dog" "dog" "cat" ","
# Rebuild the one-word-per-row token table from the URL-free tweet text.
# unnest_tokens() handles punctuation removal and lowercasing; the result has
# a single `word` column and carries no tweet id.
rt_clean <- unnest_tokens(
  dplyr::select(rt, stripped_text),
  output = word,
  input = stripped_text
)
# Plot the 15 most frequent tokens -- notice any issues? (Stop words have not
# been removed yet at this point.)
# coord_flip() draws the `x` aesthetic (word) on the vertical axis and the `y`
# aesthetic (n) on the horizontal axis, but labs() is still keyed by
# aesthetic, so `x` must carry the word label and `y` the count label.
rt_clean %>%
  count(word, sort = TRUE) %>%
  # rank explicitly by the count column instead of relying on top_n()'s guess
  top_n(15, n) %>%
  # order the bars by frequency rather than alphabetically
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
## Selecting by n

# Bring the stop_words lexicon table (shipped with tidytext) into the session.
data(list = "stop_words")
# Peek at the first six entries.
head(stop_words, n = 6L)
## # A tibble: 6 x 2
## word lexicon
## <chr> <chr>
## 1 a SMART
## 2 a's SMART
## 3 able SMART
## 4 about SMART
## 5 above SMART
## 6 according SMART
# Keep only the tokens that have no match in the stop-word lexicon.
cleaned_tweet_words <- anti_join(rt_clean, stop_words)
# Row count after filtering -- should be lower than the unfiltered token count.
nrow(cleaned_tweet_words)
## [1] 11330
# Top-15 bar chart again, this time from the stop-word-free tokens.
# The ranking table is built inline and handed straight to ggplot().
ggplot(
  cleaned_tweet_words %>%
    count(word, sort = TRUE) %>%
    top_n(15) %>%
    mutate(word = reorder(word, n)),
  aes(x = word, y = n)
) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(
    title = "Count of unique words found in tweets",
    subtitle = "Stop words removed from the list",
    x = "Unique words",
    y = "Count"
  )
## Selecting by n

#install.packages("devtools")
library(devtools)
## Warning: package 'devtools' was built under R version 3.6.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 3.6.3
#install_github("dgrtwo/widyr")
library(widyr)
# Tokenise the URL-free tweet text into bigrams (pairs of adjacent words),
# one pair per row in the `paired_words` column.
rt_paired_words <- unnest_tokens(
  dplyr::select(rt, stripped_text),
  paired_words,
  stripped_text,
  token = "ngrams",
  n = 2
)
# Frequency table of the bigrams, most common first.
count(rt_paired_words, paired_words, sort = TRUE)
## # A tibble: 15,186 x 2
## paired_words n
## <chr> <int>
## 1 aunt jemima 807
## 2 u 0001f62d 140
## 3 0001f62d u 134
## 4 jemima and 105
## 5 uncle ben 88
## 6 and uncle 66
## 7 in the 61
## 8 jemima syrup 58
## 9 of aunt 53
## 10 jemima is 42
## # ... with 15,176 more rows
# Top-15 single-word bar chart from the stop-word-free tokens. This repeats
# the chart drawn earlier from cleaned_tweet_words.
# NOTE(review): the word1/word2 count table printed after this chunk is not
# produced by this code -- presumably the step that splits and filters the
# bigrams belongs here instead; confirm against the original document.
ggplot(
  data = mutate(
    top_n(count(cleaned_tweet_words, word, sort = TRUE), 15),
    word = reorder(word, n)
  ),
  mapping = aes(x = word, y = n)
) +
  geom_col() +
  xlab(NULL) +
  coord_flip() +
  labs(
    y = "Count",
    x = "Unique words",
    title = "Count of unique words found in tweets",
    subtitle = "Stop words removed from the list"
  )
## Selecting by n

##
## Attaching package: 'tidyr'
## The following object is masked from 'package:igraph':
##
## crossing
## # A tibble: 6 x 3
## word1 word2 n
## <chr> <chr> <int>
## 1 aunt jemima 807
## 2 uncle ben 88
## 3 jemima syrup 58
## 4 maple syrup 38
## 5 aunt jemimaâ 31
## 6 alissawalsh21 mattwalshblog 19
