Practice in scraping tweets and making a word cloud, using tweets about 911 on Fox.
Load Libraries:
#all necessary libraries here
library(rtweet)
library(twitteR)
library(leaflet)
library(quanteda)
library(readr)
library(httr)
library(tidytext)
library(tidyverse)
library(quanteda.textmodels)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(dplyr)
#Copied over the whole doc from 5b-12
Please note: this code is almost entirely from my project 911 5b-12 TAKE 4. New code will be cited as necessary.
Load tweets from where I scraped them earlier:
setwd("~/DACCS R/Text as Data/911Fox Project")
#Already scraped tweets so just loading in the csv
tweets14 <- read_csv("tweets14.csv")
head(tweets14)
# A tibble: 6 × 90
user_id status_id created_at screen_name text source
<chr> <chr> <dttm> <chr> <chr> <chr>
1 x11344572023… x1516527… 2022-04-19 21:21:23 thiamcore "edd… Twitt…
2 x11344572023… x1515825… 2022-04-17 22:51:33 thiamcore "edd… Twitt…
3 x11344572023… x1514022… 2022-04-12 23:28:33 thiamcore "edd… Twitt…
4 x14665225508… x1516527… 2022-04-19 21:21:19 Tereredlim… "Sti… Twitt…
5 x74037324162… x1516525… 2022-04-19 21:12:36 CammiGarre… "Chi… Twitt…
6 x411865835 x1516522… 2022-04-19 21:02:48 nailah_ais… "@91… Twitt…
# … with 84 more variables: display_text_width <dbl>,
# reply_to_status_id <chr>, reply_to_user_id <chr>,
# reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
# favorite_count <dbl>, retweet_count <dbl>, quote_count <lgl>,
# reply_count <lgl>, hashtags <chr>, symbols <lgl>, urls_url <chr>,
# urls_t.co <chr>, urls_expanded_url <chr>, media_url <chr>,
# media_t.co <chr>, media_expanded_url <chr>, media_type <chr>, …
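For the record, the scraping step itself isn't shown in this doc. A minimal sketch of what it likely looked like with rtweet's search_tweets (the query string and arguments below are placeholders, not the original call):
# hypothetical reconstruction of the earlier scrape -- the query is a guess
tweets14 <- search_tweets(
  q = "#911onFOX",         # assumed search term, not recorded in this doc
  n = 10000,               # matches the 10,000 rows in tweets14.csv
  include_rts = TRUE,      # "rt" tokens show up later, so retweets were kept
  retryonratelimit = TRUE  # wait out rate limits instead of stopping early
)
write_as_csv(tweets14, "tweets14.csv")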
#copy over tweet df to a working df
workingTweets14 <- tweets14
#Narrowing down my working df
nrow(workingTweets14)
[1] 10000
#format = "%Y-%m-%d %H:%M:%s"
# show start date march 29 12 midnight Greenwich Mean Time
start_date <- as.POSIXct('2022-04-18 00:00:00', tz="UTC")
workingTweets14 <- workingTweets14 %>% filter(created_at >= start_date)
nrow(workingTweets14)
[1] 5178
Note: MAKE SURE YOU SET THE PROPER DATE
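A way to make that mistake harder (just a sketch, not what I did here) is to derive the cutoff from the data instead of typing it:
# keep roughly the last two days relative to the newest tweet in the pull;
# the two-day window is an arbitrary example
start_date <- max(tweets14$created_at) - as.difftime(2, units = "days")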
So that carved us down a bit - let’s check the data.
tweetWords <- workingTweets14 %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text)
head(tweetWords)
# A tibble: 6 × 1
word
<chr>
1 eddie
2 diaz
3 911
4 911
5 on
6 fox
Attempt to plot the top 30 words:
# plot the top 30 words
tweetWords %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() + # labs() still refers to the original aesthetics after the flip
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
A majority of these seem to be stop words, so let’s fix that!
# how many words before removing the stop words?
nrow(tweetWords)
[1] 62897
tweetsClean <- tweetWords %>%
  anti_join(stop_words) %>%
  filter(word != "rt") # drop the "rt" retweet marker
# how many words after removing the stop words?
nrow(tweetsClean)
[1] 32501
Replot top 30
# plot the top 30 words -- notice any issues?
tweetsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
I still want to get tokens like "https", "t.co", and "911onfox" out of here:
#the gsub below should get https out, I think
# word count before this round of cleanup:
nrow(tweetsClean)
[1] 32501
# cleanup
tweetsClean <- workingTweets14 %>%
  mutate(text = gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)",
                     "", text)) %>% # strip URLs before tokenizing
  filter(created_at >= start_date) %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  filter(word != "rt") # drop the "rt" retweet marker
nrow(tweetsClean)
[1] 28424
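Aside: a lighter stringr pattern catches most of these URLs too. A sketch only (the gsub() above is what actually ran; urlFree14 is a throwaway name for this example):
# drop anything that looks like an http(s) URL, including t.co links
urlFree14 <- workingTweets14 %>%
  mutate(text = str_remove_all(text, "https?://\\S+"))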
Replot top 30
# plot the top 30 words -- notice any issues?
tweetsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(30) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
From here I want to remove numbers and words that start with numbers. How do I do that?
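A minimal sketch of one answer, using str_detect from stringr; this is the idea, not what I actually ran (my actual fix follows below):
# drop every token that starts with a digit ("911onfox", "1", "911lonestar", ...)
tweetsClean %>%
  filter(!str_detect(word, "^[0-9]"))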
#I'm going to try gsub - that worked but left an empty cell!
nrow(tweetsClean)
[1] 28424
#tweetsCloud <- tweetsClean %>% slice(-("911onfox")) this doesn't work
#going to try subset instead -- one !word %in% filter covers the whole list
#every time I run the word cloud I see more words to take out, so this list grew over several passes
tweetsCloud <- subset(tweetsClean,
                      !word %in% c("911onfox", "episode", "911lonestar",
                                   "hewitt", "i'm", "it's", "1",
                                   "chim", "gonna", "tonight",
                                   "shes", "im")) #IT LOOKS LIKE THAT WORKED!!
nrow(tweetsCloud)
[1] 22230
Replot top 60
# plot the top 60 words
tweetsCloud %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(60) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
What I figured out from my last project is that I have an encoding problem here, which is why I’m still getting words that aren’t meaningful. While it’s probably not the correct way to manage this, I’m going to write my top 80 words to a csv, pull it up in Excel, and take a look at it there. Like I said, probably not ideal, but it will give me a hand-vetted word list for my wordcloud.
#Put the top 80 words into their own dataframe
top80 <- tweetsCloud %>%
  dplyr::count(word, sort = TRUE) %>%
  top_n(80) %>%
  mutate(word = reorder(word, n))
head(top80)
# A tibble: 6 × 2
word n
<fct> <int>
1 eddie 749
2 buck 718
3 gayfirefightershow 369
4 lucy 296
5 chris 282
6 love 240
write_as_csv(top80, "top80.csv")
From here, I’ll go to Excel and clean it up! Note: this is the qualitative part of the analysis.
Deleted from top80: "i’m", "don’t", "amp", "it’s", "he’s", "that’s".
Most of these look like words that should have been caught by the stop-word removal. The likely reason they survived is the encoding problem mentioned above: the tweets use curly apostrophes (i’m) while the stop_words lexicon uses straight ones (i'm), so anti_join never matches them. "amp" is a different artifact: it’s what’s left of HTML-escaped ampersands (&amp;) after tokenizing.
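If the curly-apostrophe theory is right, the upstream fix would be to normalize the text before tokenizing; a sketch (untested here):
# replace curly apostrophes (U+2019) with straight ones so tokens like
# "i'm" actually match the stop_words lexicon during anti_join
workingTweets14 %>%
  mutate(text = gsub("\u2019", "'", text)) %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)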
In addition, I’m going to make the call to combine the words “stop” and “sign” due to details of the episode. We’ll see how the wordcloud handles it!
Now to import the csv
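(The actual import line didn’t make it into this writeup; a minimal sketch, assuming the cleaned file came back out of Excel as top76.csv, which is a guess at the name:)
# read the hand-cleaned word list back in -- "top76.csv" is an assumed filename
top76 <- read_csv("top76.csv")
head(top76)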
# A tibble: 6 × 2
word n
<chr> <dbl>
1 eddie 749
2 buck 718
3 christopher 408
4 gayfirefightershow 369
5 lucy 296
6 stop sign 292
Finally, word cloud???
Plot Top 76
wordcloud2(data=top76, color = "random-dark")
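A quick aside: wordcloud2 has a few cosmetic knobs beyond color. A sketch with arbitrary example values (none of these were used for the final cloud):
# smaller size scales words down so long tags like "gayfirefightershow" fit;
# shape and backgroundColor are purely cosmetic
wordcloud2(data = top76, color = "random-dark",
           size = 0.7, shape = "circle", backgroundColor = "white")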
I’m going to pare down a few words here: consolidated “chris” and “christopher” to “christopher”; consolidated “save” and “saved” to “saved”. Removed: scene, scenes, season, watch, watching.
Reload the file:
# A tibble: 6 × 2
word n
<chr> <dbl>
1 eddie 749
2 buck 718
3 christopher 408
4 gayfirefightershow 369
5 lucy 296
6 stop sign 292
Brief attempt to change the colors:
# or a vector of colors; the vector must be the same length as the input data
wordcloud2(top69, color = rep_len(c("mediumblue", "darkorchid", "seagreen",
                                    "firebrick", "deeppink", "goldenrod",
                                    "forestgreen"), nrow(top69)))
And now to export it as a png!
# install webshot
library(webshot)
webshot::install_phantomjs()
# Make the graph
my_graph <- wordcloud2(top69, color = rep_len(c("mediumblue", "darkorchid",
                                                "seagreen", "firebrick",
                                                "deeppink", "goldenrod"),
                                              nrow(top69)))
my_graph
# save it in html
library(htmlwidgets)
saveWidget(my_graph, "tmp.html", selfcontained = FALSE)
# and in png or pdf
webshot("tmp.html", "wordcloud14a.png", delay = 5, vwidth = 1000, vheight = 800)