# Packages used throughout this analysis
library(rtweet)
library(dplyr)
library(ggplot2)
library(tidytext)

# Pull up to 1,000 recent English-language tweets, excluding retweets
edgecompute <- search_tweets("Edge Computing", n = 1000, include_rts = FALSE, lang = "en")
# Keep only tweets that carry no hashtags
eh <- subset(edgecompute, is.na(edgecompute$hashtags))
Then, create a separate data frame containing the number of organic tweets, retweets, and replies. These numbers are easy to find: they are simply the row counts of the three respective subsets, as in the sketch below.
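Those subsets are not constructed elsewhere in this walkthrough, so here is a minimal sketch, assuming the classic (pre-1.0) rtweet columns is_retweet and reply_to_status_id. Note that because the search above used include_rts = FALSE, the retweets subset will stay empty unless you re-run the search with include_rts = TRUE.

# Organic tweets: neither retweets nor replies
edge_organic <- subset(edgecompute, is_retweet == FALSE & is.na(reply_to_status_id))
# Retweets (empty here, since the search excluded them)
edge_retweets <- subset(edgecompute, is_retweet == TRUE)
# Replies to another tweet
edge_replies <- subset(edgecompute, !is.na(reply_to_status_id))
tweet_types <- data.frame(
  category = c("Organic", "Retweets", "Replies"),
  count = c(nrow(edge_organic), nrow(edge_retweets), nrow(edge_replies))
)

The donut-chart steps that follow work the same way on tweet_types. Next, build a similar summary by source, i.e. the Twitter client each tweet was posted from: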
# Count tweets per source (the client used to post)
edge_sources <- edgecompute %>%
  select(source) %>%
  group_by(source) %>%
  summarize(count = n())
# Drop rarely used clients to keep the donut chart readable
edge_sources <- subset(edge_sources, count > 11)

data <- data.frame(
  category = edge_sources$source,
  count = edge_sources$count
)
# Compute each slice's share and its cumulative bounds for the donut chart
data$fraction <- data$count / sum(data$count)
data$percentage <- data$count / sum(data$count) * 100
data$ymax <- cumsum(data$fraction)
data$ymin <- c(0, head(data$ymax, n = -1))
# Round the percentages and build the legend labels
data$percentage <- round(data$percentage, 2)
data$Source <- paste(data$category, data$percentage, "%")
ggplot(data, aes(ymax = ymax, ymin = ymin, xmax = 4, xmin = 3, fill = Source)) +
  geom_rect() +
  coord_polar(theta = "y") + # Remove this line to see the stacked bar the donut is built from
  xlim(c(2, 4)) + # The empty space from 2 to 3 creates the hole in the middle
  theme_void() +
  theme(legend.position = "right")
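To keep the chart, ggplot2's ggsave() writes the most recently displayed plot to disk; the file name below is just an example:

ggsave("edge_tweet_sources.png", width = 7, height = 5) # saves the last plot shown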
## Most frequent words found in edge computing tweets
edgecompute$text <- gsub("https\\S*", "", edgecompute$text)
edgecompute$text <- gsub("@\\S*", "", edgecompute$text)
# Tokenize the cleaned text into one word per row
tweets <- edgecompute %>%
  select(text) %>%
  unnest_tokens(word, text)
# Remove common stop words using tidytext's built-in list
tweets <- tweets %>%
  anti_join(stop_words)
## Joining, by = "word"
tweets %>% # Bar chart of the most frequent words found in the tweets
  count(word, sort = TRUE) %>%
  top_n(15) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(y = "Count",
       x = "Unique words",
       title = "Most frequent words found in the tweets related to edge computing",
       subtitle = "Stop words removed from the list")
## Selecting by n