library(readr)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(extrafont)
## Registering fonts with R
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Read the speech from disk; read_lines() returns one character element per
# line of the file.
MLK_Speech <- read_lines("I_have_a_dream_MLK.txt")
# Pair every line of text with its position in the file. seq_along() is used
# instead of 1:length() so that an empty file yields zero rows rather than the
# sequence c(1, 0).
MLK_text <- data.frame(
  line = seq_along(MLK_Speech),
  text = MLK_Speech,
  stringsAsFactors = FALSE
)
# Drop blank lines by inspecting their content instead of assuming that every
# even-numbered row is blank, then renumber the remaining lines from 1.
MLK_text <- MLK_text %>%
  filter(nzchar(trimws(text))) %>%
  mutate(line = row_number())
# Break each line of text into individual words, one row per word, stored in
# a new `word` column.
MLK_text <- MLK_text %>%
  unnest_tokens(word, text)
# Remove stop words by dropping every row whose word appears in the
# `stop_words` table. The join key is spelled out so the result does not
# depend on dplyr guessing the shared columns, and no "Joining, by" message
# is printed.
tidy_MLK <- MLK_text %>%
  anti_join(stop_words, by = "word")
# Tabulate how often each remaining word occurs, most frequent first
count(tidy_MLK, word, sort = TRUE)
## # A tibble: 214 x 2
## word n
## <chr> <int>
## 1 freedom 13
## 2 ring 12
## 3 dream 11
## 4 day 9
## 5 negro 8
## 6 free 5
## 7 white 5
## 8 faith 4
## 9 hundred 4
## 10 mountain 4
## # ... with 204 more rows
# Bar chart of the words used more than twice in the speech, ordered by how
# often they occur.
# - reorder(word, n) turns `word` into a factor ordered by count, so the bars
#   are sorted by frequency rather than alphabetically.
# - geom_col() takes bar lengths straight from `n`; it is the direct
#   equivalent of geom_bar(stat = "identity").
# - coord_flip() swaps the axes so the words are listed along the vertical
#   axis.
# - labs() sets both axis labels and the title in one place (a separate
#   xlab() call placed before it would simply be overridden).
tidy_MLK %>%
  count(word, sort = TRUE) %>%
  filter(n > 2) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    x = "Words used more than 2 times",
    y = "word frequency",
    title = "I have a Dream Speech, MLK Jr. (1963)"
  )

# Replace the token table with a per-word frequency table (columns `word` and
# `n`), most frequent first; `word` becomes a factor ordered by count.
tidy_MLK <- tidy_MLK %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n))
# View() opens a data viewer and is only useful in an interactive session, so
# skip it when the script is run non-interactively (e.g. via Rscript or while
# knitting).
if (interactive()) {
  View(tidy_MLK)
}
# Word cloud of the words that occur at least twice, capped at 200 words and
# coloured with the 8-colour "Dark2" palette. random.order = FALSE plots the
# words in decreasing order of frequency; rot.per = 0.35 sets the proportion
# of words drawn rotated by 90 degrees.
# NOTE(review): the layout appears to involve random placement, so it may
# differ between runs; call set.seed() first if a reproducible figure is
# needed.
wordcloud(
  words = tidy_MLK$word,
  freq = tidy_MLK$n,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)
