911 5b-15 FOMO - TAKE 1

Practice scraping tweets and making a word cloud, using tweets about 911 on Fox.

Lissie Bates-Haus, Ph.D. (https://github.com/lbateshaus), UMass Amherst DACSS MS Student (https://www.umass.edu/sbs/data-analytics-and-computational-social-science-program/ms)
2022-04-28

Load Libraries:
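Since the load itself isn't echoed here, this is a sketch of the libraries inferred from the functions used below (read_csv, unnest_tokens, stop_words, write_as_csv, wordcloud2):

library(tidyverse)   # read_csv, dplyr verbs, ggplot2
library(tidytext)    # unnest_tokens and the stop_words data
library(rtweet)      # write_as_csv (and the original tweet scraping)
library(wordcloud2)  # wordcloud2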

Please note: this code is almost entirely from my project 911 5b-12 TAKE 4. New code will be cited as necessary.

911 on Fox Tweets

Load tweets from where I scraped them earlier:

setwd("~/DACCS R/Text as Data/911Fox Project")
#Already scraped tweets so just loading in the csv

tweets15 <- read_csv("tweets15.csv")
head(tweets15)
# A tibble: 6 × 90
  user_id     status_id   created_at          screen_name text  source
  <chr>       <chr>       <dttm>              <chr>       <chr> <chr> 
1 x16821010   x151906833… 2022-04-26 21:38:08 YourFuture… "See… Twitt…
2 x143646307  x151906765… 2022-04-26 21:35:26 _DreaJay    "Thi… Twitt…
3 x101831715  x151906657… 2022-04-26 21:31:07 NutCase_    "oh … Twitt…
4 x101831715  x151648219… 2022-04-19 18:21:45 NutCase_    "\"i… Twitt…
5 x101831715  x151647394… 2022-04-19 17:48:57 NutCase_    "thi… Twitt…
6 x2870355517 x151906326… 2022-04-26 21:18:00 pignapoke_  "Fin… Twitt…
# … with 84 more variables: display_text_width <dbl>,
#   reply_to_status_id <chr>, reply_to_user_id <chr>,
#   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
#   favorite_count <dbl>, retweet_count <dbl>, quote_count <lgl>,
#   reply_count <lgl>, hashtags <chr>, symbols <lgl>, urls_url <chr>,
#   urls_t.co <chr>, urls_expanded_url <chr>, media_url <chr>,
#   media_t.co <chr>, media_expanded_url <chr>, media_type <chr>, …

Separate the data into only post-5b-15 tweets.

#copy over tweet df to a working df

workingTweets15 <- tweets15

Narrow down to just the post-airing tweets:

#Narrowing down my working df

nrow(workingTweets15)
[1] 10000
#format = "%Y-%m-%d %H:%M:%S"
# start date: April 25, 12 midnight UTC (Greenwich Mean Time)
start_date <- as.POSIXct('2022-04-25 00:00:00', tz="UTC")

workingTweets15 <- workingTweets15 %>% filter(created_at >= start_date)

nrow(workingTweets15)
[1] 4819

Note: MAKE SURE YOU SET THE PROPER DATE
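A quick guard here (hypothetical, not something I ran) would be to assert that the cutoff actually held:

# sanity check: nothing left in the working df should predate the cutoff
stopifnot(min(workingTweets15$created_at) >= start_date)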

So that carved us down a bit - let’s check the data.

Explore Common Words

# tokenize the tweet text: one row per word
tweetWords <- workingTweets15 %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text)

head(tweetWords)
# A tibble: 6 × 1
  word     
  <chr>    
1 seeing   
2 madeleine
3 on       
4 911onfox 
5 is       
6 so       

Attempt to plot the top 30 words:

# plot the top 30 words
tweetWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")

Deal with Stop Words

A majority of these seem to be stop words, so let’s fix that!

data("stop_words")
# how many words do you have including the stop words?
nrow(tweetWords)
[1] 51641
tweetsClean <- tweetWords %>%
  anti_join(stop_words) %>%
  filter(word != "rt")  # drop the "rt" retweet marker

# how many words after removing the stop words?
nrow(tweetsClean)
[1] 26447

Replot top 30

# plot the top 30 words -- notice any issues?
tweetsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")

I still want to get things like https and t.co and 911onfox out of here:

#this strips the URLs (the https and t.co tokens) out of the text

nrow(tweetsClean)
[1] 26447
# cleanup
tweetsClean <- workingTweets15 %>%
  mutate(text = gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
                     "", text)) %>%  # strip URLs from the raw text
  filter(created_at >= start_date) %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(word != "rt")  # drop rows where the token is exactly "rt" (retweet marker)
nrow(tweetsClean)
[1] 23852

Replot top 30

# plot the top 30 words -- notice any issues?
tweetsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")

From here I want to remove numbers and words that start with numbers. How do I do that?
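One option (a sketch with stringr that I didn't end up running) is to filter on a regex anchored to the start of the token:

# drop tokens that are numbers or start with a number (e.g. "1", "911onfox")
tweetsClean %>%
  filter(!str_detect(word, "^[0-9]"))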

#I'm going to try gsub - that worked but left an empty cell!

nrow(tweetsClean)
[1] 23852
#tweetsCloud <- tweetsClean %>% slice(-("911onfox"))  # doesn't work: slice() takes row positions, not values

#going to try subset

tweetsCloud <- subset(tweetsClean, word!="911onfox" & word!="episode" & word!="911lonestar" 
                      & word!="hewitt" & word!="i'm" & word!="it's" & word!="1" 
                      &  word!="chim" & word!="gonna" & word!="tonight" 
                      & word!="shes" & word!="im")       #IT LOOKS LIKE THAT WORKED!!



#(the one-at-a-time subset() calls I tried first are consolidated above --
# every time I run the word cloud I see more words to take out)

nrow(tweetsCloud)
[1] 18103

Replot top 60

# plot the top 60 words
tweetsCloud %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(60) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")

What I figured out from my last project is that I have an encoding problem here, which is why I'm still getting words that aren't meaningful. While it's probably not the correct way to manage this, I'm going to write my top 80 words to a CSV, pull it up in Excel, and take a look at it there. Like I said, probably not ideal, but this will give me roughly 60 clean words for my word cloud.

#Put the top 80 words into their own dataframe

top80_15 <- tweetsCloud %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(80) %>%
  mutate(word = reorder(word, n))

head(top80_15)
# A tibble: 6 × 2
  word                   n
  <fct>              <int>
1 buck                 427
2 eddie                419
3 love                 304
4 gayfirefightershow   258
5 karen                217
6 henren               197
write_as_csv(top80_15, "top80_15.csv")

From here, I'll go to Excel and clean it up! Note: this is the qualitative part of the analysis.

Deleted from top80_15:

3, 9, amp, can’t, doesn’t, don’t, he’s, i’m, it’s, she’s, that’s, y’all

Words I Combined

cry + crying -> crying
bobby + nash -> bobby
girl + girls -> girls
kid + kids -> kids
watch + watching -> watching
scene + scenes -> scene
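For what it's worth, these merges could also be done in R instead of Excel; a sketch with dplyr's recode(), mapping each variant above onto its keeper and re-summing the counts:

# map variants onto their canonical form, then re-sum the counts
top80_15 %>%
  mutate(word = recode(as.character(word),  # as.character: reorder() made word a factor
                       cry = "crying", nash = "bobby", girl = "girls",
                       kid = "kids", watch = "watching", scenes = "scene")) %>%
  group_by(word) %>%
  summarise(n = sum(n)) %>%
  arrange(desc(n))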

All of these look to me like words that should have been caught by the stop word list? My guess is the encoding problem is why they weren't: the tweets use curly apostrophes (i’m) while stop_words has straight ones (i'm), so anti_join never matches them.
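A sketch of the upstream fix (again, not something I ran here): normalize the apostrophes before the anti_join so the contractions match.

# swap curly apostrophes (U+2019) for straight ones, then re-apply the stop list
tweetsClean <- tweetsClean %>%
  mutate(word = str_replace_all(word, "\u2019", "'")) %>%
  anti_join(stop_words)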

Now to import the CSV:

top62 <- read_csv("top62_15.csv")
head(top62)
# A tibble: 6 × 2
  word                   n
  <chr>              <dbl>
1 buck                 427
2 eddie                419
3 love                 304
4 gayfirefightershow   258
5 karen                217
6 crying               198

Finally, word cloud???

Plot Top 62

wordcloud2(data = top62, color = "random-dark")

Brief attempt to change the colors:

# or a vector of colors; the vector must be the same length as the input data
library(RColorBrewer)  # loaded but unused here -- the palette below is base R color names
wordcloud2(top62, color = rep_len(c("mediumblue", "darkorchid", "seagreen", "firebrick", "deeppink", "goldenrod"), nrow(top62)))

And now to export it as a PNG!

# install webshot
library(webshot)
#webshot::install_phantomjs()

# Make the graph
my_graph <- wordcloud2(top62, color=rep_len( c("mediumblue","darkorchid", "seagreen", "firebrick", "deeppink", "goldenrod"), nrow(top62) ) )

my_graph
# save it in html
library("htmlwidgets")
saveWidget(my_graph,"tmp.html",selfcontained = F)

# and in png or pdf
webshot("tmp.html","wordcloud15a.png", delay =5, vwidth = 1000, vheight=800)

I’m not loving this, so I’m going to pare it down to 55 and see if I like that better.

Deleted:

scene, watching, car, feel, anymore, week

Combined: mom + mother -> mom

top55 <- read_csv("top55_15.csv")
head(top55)
# A tibble: 6 × 2
  word                   n
  <chr>              <dbl>
1 buck                 427
2 eddie                419
3 love                 304
4 gayfirefightershow   258
5 mom                  226
6 karen                217
wordcloud2(top55, size = .75, color = rep_len(c("mediumblue", "mediumpurple", "darkgreen", "orchid", "deeppink", "yellowgreen"), nrow(top55)))
# install webshot
library(webshot)
#webshot::install_phantomjs()  # already installed above, no need to rerun

# Make the graph (from top55 this time, not top62)
my_graph <- wordcloud2(top55, size = .8, color = rep_len(c("mediumblue", "darkorchid", "seagreen", "firebrick", "deeppink", "goldenrod"), nrow(top55)))

my_graph
# save it in html
library("htmlwidgets")
saveWidget(my_graph,"tmp.html",selfcontained = F)

# and in png or pdf
webshot("tmp.html","wordcloud15c.png", delay =5, vwidth = 1000, vheight=800)