library(readr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidytext)
library(extrafont)
## Registering fonts with R
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
# Read the speech transcript: one character element per line of the file.
MLK_Speech <- read_lines("I_have_a_dream_MLK.txt")

# Keep only the lines that actually contain text. Blank lines are detected by
# content instead of by assuming they occupy every even row, so an irregular
# blank-line layout in the file cannot drop real text or keep empty rows.
has_text <- trimws(MLK_Speech) != ""

# One row per non-blank line. Lines are numbered after filtering so `line`
# runs 1..n without gaps; seq_len() also yields an empty frame (instead of
# the bogus c(1, 0) that 1:0 would give) when the file has no text at all.
MLK_text <- data.frame(
  line = seq_len(sum(has_text)),
  text = MLK_Speech[has_text],
  stringsAsFactors = FALSE
)

# Split each line of text into individual words: one row per word, with the
# line number it came from kept alongside it.
MLK_text <- MLK_text %>%
  unnest_tokens(word, text)

# Drop stop words by anti-joining against tidytext's `stop_words` table.
# The join key is spelled out so the join does not depend on whichever column
# names the two tables happen to share, and no "Joining, by" message is
# emitted.
tidy_MLK <- MLK_text %>%
  anti_join(stop_words, by = "word")
# Tally how often each remaining word occurs, most frequent first.
# The result is not assigned, so it is printed when run at top level.
count(tidy_MLK, word, sort = TRUE)
## # A tibble: 214 x 2
##    word         n
##    <chr>    <int>
##  1 freedom     13
##  2 ring        12
##  3 dream       11
##  4 day          9
##  5 negro        8
##  6 free         5
##  7 white        5
##  8 faith        4
##  9 hundred      4
## 10 mountain     4
## # ... with 204 more rows
# Bar chart of every word used more than twice in the speech, ordered from
# most to least frequent.
tidy_MLK %>%
  count(word, sort = TRUE) %>%
  filter(n > 2) %>%
  # Turn `word` into a factor ordered by its count so the bars are sorted.
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  # geom_col() draws bar heights straight from `n` (the counts are already
  # computed), replacing geom_bar(stat = "identity").
  geom_col(fill = "steelblue") +
  # Flip to horizontal bars; the x label below therefore appears on the
  # vertical axis next to the words.
  coord_flip() +
  labs(
    x = "Words used more than 2 times",
    y = "word frequency",
    title = "I have a Dream Speech, MLK Jr. (1963)"
  )

# Collapse tidy_MLK to one row per distinct word with its count `n`, most
# frequent first. reorder() makes `word` a factor whose levels follow `n`.
# Note: from here on tidy_MLK holds word counts, not one row per token.
tidy_MLK <- tidy_MLK %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n))

# View() opens a data viewer, which only makes sense when someone is at the
# console; skip it when the script runs non-interactively (Rscript, knitr).
if (interactive()) {
  View(tidy_MLK)
}

# Word cloud of the counted words, coloured with the "Dark2" palette.
wordcloud(
  words = tidy_MLK$word,
  freq = tidy_MLK$n,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)