Practice in scraping tweets and making a word cloud using Tweets about 911 on Fox.
Load Libraries:
#all necessary libraries here
library(rtweet)
library(twitteR)
library(leaflet)
library(quanteda)
library(readr)
library(httr)
library(tidytext)
library(tidyverse)
library(quanteda.textmodels)
library(tm)
library(wordcloud)
library(RColorBrewer)
library(wordcloud2)
library(dplyr)
#Copied over the whole doc from 5b-12
Please note: this code is almost entirely from my project 911 5b-12 TAKE 4. New code will be cited as necessary.
Load tweets from where I scraped them earlier:
# Work from the project folder; the relative file names used in this script
# are resolved against it.
setwd("~/DACCS R/Text as Data/911Fox Project")
# The tweets were scraped in an earlier session, so read the saved CSV back
# in rather than scraping again.
tweets15 <- read_csv(file = "tweets15.csv")
# Look at the first rows to confirm the import.
tweets15 %>% head()
# A tibble: 6 × 90
user_id status_id created_at screen_name text source
<chr> <chr> <dttm> <chr> <chr> <chr>
1 x16821010 x151906833… 2022-04-26 21:38:08 YourFuture… "See… Twitt…
2 x143646307 x151906765… 2022-04-26 21:35:26 _DreaJay "Thi… Twitt…
3 x101831715 x151906657… 2022-04-26 21:31:07 NutCase_ "oh … Twitt…
4 x101831715 x151648219… 2022-04-19 18:21:45 NutCase_ "\"i… Twitt…
5 x101831715 x151647394… 2022-04-19 17:48:57 NutCase_ "thi… Twitt…
6 x2870355517 x151906326… 2022-04-26 21:18:00 pignapoke_ "Fin… Twitt…
# … with 84 more variables: display_text_width <dbl>,
# reply_to_status_id <chr>, reply_to_user_id <chr>,
# reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
# favorite_count <dbl>, retweet_count <dbl>, quote_count <lgl>,
# reply_count <lgl>, hashtags <chr>, symbols <lgl>, urls_url <chr>,
# urls_t.co <chr>, urls_expanded_url <chr>, media_url <chr>,
# media_t.co <chr>, media_expanded_url <chr>, media_type <chr>, …
# Leave the imported data untouched; all narrowing happens on a working copy.
workingTweets15 <- tweets15
# Row count before any narrowing, as a baseline.
workingTweets15 %>% nrow()
[1] 10000
# Keep only tweets created on or after 2022-04-25 00:00:00 UTC.
# (Timestamp layout: "%Y-%m-%d %H:%M:%S".)
start_date <- as.POSIXct("2022-04-25 00:00:00", tz = "UTC")
workingTweets15 <- dplyr::filter(workingTweets15, created_at >= start_date)
# Row count after the date cut.
nrow(workingTweets15)
[1] 4819
Note: MAKE SURE YOU SET THE PROPER DATE
So that carved us down a bit - let’s check the data.
# Split the tweet text into one token per row, stored in column `word`.
tweetWords <- unnest_tokens(
  dplyr::select(workingTweets15, text),
  word,
  text
)
head(tweetWords)
# A tibble: 6 × 1
word
<chr>
1 seeing
2 madeleine
3 on
4 911onfox
5 is
6 so
Attempt to plot the top 30 words:
# Bar chart of the 30 most frequent tokens (ties at the cut-off are kept).
tweetWords %>%
  dplyr::count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of relying on "last column" selection.
  slice_max(n, n = 30) %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() labels aesthetics, not screen positions: `x` is the word axis and
  # `y` the count axis even though coord_flip() swaps where they are drawn.
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
A majority of these seem to be stop words, so let’s fix that!
[1] 51641
# Drop every token that appears in `stop_words`, plus the retweet marker "rt".
tweetsClean <- tweetWords %>%
  # Name the join key so the match is explicit and the "Joining, by" message
  # is not printed.
  anti_join(stop_words, by = "word") %>%
  filter(word != "rt")
# how many words are left after removing the stop words?
nrow(tweetsClean)
[1] 26447
Replot top 30
# Bar chart of the 30 most frequent tokens after stop-word removal
# (ties at the cut-off are kept) -- notice any issues?
tweetsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of relying on "last column" selection.
  slice_max(n, n = 30) %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() labels aesthetics, not screen positions: `x` is the word axis and
  # `y` the count axis even though coord_flip() swaps where they are drawn.
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
I still want to get things like https and t.co and 911onfox out of here:
# Token count before the URL clean-up (which should get "https" out), for
# comparison with the count afterwards.
tweetsClean %>% nrow()
[1] 26447
# cleanup: rebuild the token table from the tweets, normalising the text
# before it is tokenised.
# workingTweets15 is already restricted to start_date, so no date filter is
# repeated here.
tweetsClean <- workingTweets15 %>%
  mutate(
    # Strip URLs (http, https, ftp).
    text = gsub("\\s?(f|ht)(tp)(s?)(://)([^\\.]*)[\\.|/](\\S*)", "", text),
    # Turn typographic apostrophes (U+2018/U+2019) into ASCII ones so that
    # contractions match the straight-apostrophe entries in stop_words.
    text = gsub("[\u2018\u2019]", "'", text),
    # Decode the HTML-escaped ampersand so it does not surface as the
    # token "amp".
    text = gsub("&amp;", "&", text, fixed = TRUE)
  ) %>%
  dplyr::select(text) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  filter(word != "rt") # drops only the exact token "rt" (retweet marker)
nrow(tweetsClean)
[1] 23852
Replot top 30
# Bar chart of the 30 most frequent tokens after the URL clean-up
# (ties at the cut-off are kept) -- notice any issues?
tweetsClean %>%
  dplyr::count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of relying on "last column" selection.
  slice_max(n, n = 30) %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() labels aesthetics, not screen positions: `x` is the word axis and
  # `y` the count axis even though coord_flip() swaps where they are drawn.
  # The title no longer ends in a dangling ", ".
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
From here I want to remove numbers and words that start with numbers. How do I do that?
# An earlier gsub() attempt blanked the word but left an empty cell behind.
# Record the row count before trying a different approach.
tweetsClean %>% nrow()
[1] 23852
# A first try with slice(-("911onfox")) did not work, so rows are removed with
# subset() instead. Every run of the word cloud surfaced more tokens to take
# out; they are all listed in one place here.
# !is.na(word) keeps the NA handling of a chain of `!=` tests, for which
# subset() drops rows whose comparison is NA.
tweetsCloud <- subset(
  tweetsClean,
  !is.na(word) &
    !(word %in% c("911onfox", "episode", "911lonestar", "hewitt", "i'm",
                  "it's", "1", "chim", "gonna", "tonight", "shes", "im"))
)
nrow(tweetsCloud)
[1] 18103
Replot top 60
# Bar chart of the 60 most frequent tokens (ties at the cut-off are kept).
tweetsCloud %>%
  dplyr::count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of relying on "last column" selection.
  slice_max(n, n = 60) %>%
  # Order the factor levels by frequency so the bars come out sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  # labs() labels aesthetics, not screen positions: `x` is the word axis and
  # `y` the count axis even though coord_flip() swaps where they are drawn.
  # The title no longer ends in a dangling ", ".
  labs(x = "Unique words",
       y = "Count",
       title = "Count of unique words found in tweets")
What I figured out from my last project is that I have an encoding problem here, which is why I’m still getting words that aren’t meaningful. While it’s probably not the correct way to manage this, I’m going to write my top 80 words to a CSV, open it in Excel, and clean it up there. Like I said, probably not ideal, but once the junk entries are removed this should leave me with roughly 60 words for my word cloud.
# Put the 80 most frequent words (ties at the cut-off kept) into their own
# data frame.
top80_15 <- tweetsCloud %>%
  dplyr::count(word, sort = TRUE) %>%
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of relying on "last column" selection.
  slice_max(n, n = 80) %>%
  # reorder() turns `word` into a factor whose levels are ordered by n.
  mutate(word = reorder(word, n))
head(top80_15)
# A tibble: 6 × 2
word n
<fct> <int>
1 buck 427
2 eddie 419
3 love 304
4 gayfirefightershow 258
5 karen 217
6 henren 197
# Write the top-80 table to disk so it can be cleaned by hand.
write_as_csv(top80_15, "top80_15.csv")
From here, I’ll go to excel and clean it up! Note: this is the qualitative part of the analysis
Deleted from top80_15:
3 9 amp can’t doesn’t don’t he’s i’m it’s she’s that’s y’all
Words I Combined
Cry + Crying -> Crying; bobby + nash -> bobby; girl + girls -> girls; kids + kid -> kids; watch + watching -> watching; scene + scenes -> scene
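All of these look to me like they would have been removed with stop words? (Most likely the contractions survived because the tweets use typographic apostrophes (’), which do not match the straight apostrophes (') in the stop-word list; "amp" is what is left of the HTML entity "&amp;".)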
Now to import the csv
# A tibble: 6 × 2
word n
<chr> <dbl>
1 buck 427
2 eddie 419
3 love 304
4 gayfirefightershow 258
5 karen 217
6 crying 198
Finally, word cloud???
Plot Top 62
# NOTE(review): top62 is not created in the visible code; presumably it is the
# hand-cleaned CSV read back in -- confirm.
wordcloud2(data = top62, color = "random-dark")
Brief attempt to change the colors:
# `color` can also be a vector of colours, which must be as long as the input
# data; rep_len() repeats the six colours until there is one per row.
library(RColorBrewer)
wordcloud2(
  top62,
  color = rep_len(
    c("mediumblue", "darkorchid", "seagreen", "firebrick", "deeppink",
      "goldenrod"),
    nrow(top62)
  )
)
And now to export it as a png!
# Export the word cloud as a PNG: save the widget as HTML, then screenshot
# that page with webshot.
library(webshot)
# webshot relies on PhantomJS; uncomment to install it if it is missing.
#webshot::install_phantomjs()
# Make the graph
my_graph <- wordcloud2(
  top62,
  color = rep_len(
    c("mediumblue", "darkorchid", "seagreen", "firebrick", "deeppink",
      "goldenrod"),
    nrow(top62)
  )
)
my_graph
# save it in html
library("htmlwidgets")
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
saveWidget(my_graph, "tmp.html", selfcontained = FALSE)
# and in png or pdf; `delay` waits before the screenshot is taken
webshot("tmp.html", "wordcloud15a.png", delay = 5, vwidth = 1000,
        vheight = 800)
I’m not loving this, so I’m going to pare it down to 55 and see if I like that better.
Deleted:
Scene, Watching, car, feel, anymore, week. Combined: mom + mother -> mom
# A tibble: 6 × 2
word n
<chr> <dbl>
1 buck 427
2 eddie 419
3 love 304
4 gayfirefightershow 258
5 mom 226
6 karen 217
# Slightly smaller cloud of the 55-word table with a six-colour palette
# repeated to one colour per row.
wordcloud2(
  top55,
  size = .75,
  color = rep_len(
    c("mediumblue", "mediumpurple", "darkgreen", "orchid", "deeppink",
      "yellowgreen"),
    nrow(top55)
  )
)
# Export the pared-down word cloud as a PNG: save the widget as HTML, then
# screenshot that page with webshot.
library(webshot)
# webshot relies on PhantomJS; install it only when it is missing instead of
# on every run of the script.
if (!webshot::is_phantomjs_installed()) {
  webshot::install_phantomjs()
}
# Make the graph
# NOTE(review): this section pares the list down to 55 words, so the export is
# built from top55; the previous version re-used top62 here, which looked like
# a copy-paste leftover -- confirm the intended table, size and palette.
my_graph <- wordcloud2(
  top55,
  size = .8,
  color = rep_len(
    c("mediumblue", "darkorchid", "seagreen", "firebrick", "deeppink",
      "goldenrod"),
    nrow(top55)
  )
)
my_graph
# save it in html
library("htmlwidgets")
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
saveWidget(my_graph, "tmp.html", selfcontained = FALSE)
# and in png or pdf; `delay` waits before the screenshot is taken
webshot("tmp.html", "wordcloud15c.png", delay = 5, vwidth = 1000,
        vheight = 800)