DATA607_Presentation

library(readr)
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidytext)
library(extrafont)

## Registering fonts with R

library(wordcloud)

## Loading required package: RColorBrewer

library(RColorBrewer)

# Read the speech from disk; read_lines() returns one string per line of the file.
MLK_Speech <- read_lines("I_have_a_dream_MLK.txt")

# Pair each line of text with its position in the file.
# seq_along() is used instead of 1:length() so that an empty file produces
# zero rows rather than the sequence c(1, 0).
MLK_text <- data.frame(
  line = seq_along(MLK_Speech),
  text = MLK_Speech,
  stringsAsFactors = FALSE
)

# Drop blank lines by inspecting the text itself rather than assuming that
# every even-numbered row is blank; this keeps all real text even when the
# file does not strictly alternate between text and blank lines.
MLK_text <- MLK_text %>%
  filter(trimws(text) != "") %>%
  mutate(line = row_number()) # renumber the remaining lines from 1

# Tokenize: unnest_tokens() splits the `text` column into one row per word,
# stored in a new `word` column.
# NOTE: MLK_text is overwritten here and holds word tokens (not whole lines)
# from this point on.
MLK_text <- MLK_text %>%
  unnest_tokens(word, text)

# Remove stop words: anti_join() keeps only the rows of MLK_text whose `word`
# has no match in `stop_words`. The join key is spelled out so the result does
# not depend on whichever column names the two tables happen to share, and so
# dplyr does not emit its "Joining, by = ..." message.
tidy_MLK <- MLK_text %>%
  anti_join(stop_words, by = "word")

# Tally how often each remaining word occurs, most frequent first.
# The result is not assigned, so it is printed at top level.
count(tidy_MLK, word, sort = TRUE)

## # A tibble: 214 x 2
##    word         n
##    <chr>    <int>
##  1 freedom     13
##  2 ring        12
##  3 dream       11
##  4 day          9
##  5 negro        8
##  6 free         5
##  7 white        5
##  8 faith        4
##  9 hundred      4
## 10 mountain     4
## # ... with 204 more rows

# Bar chart of the words that occur more than twice in the speech,
# with the bars sorted by frequency.
# reorder() turns `word` into a factor ordered by its count `n`, which is what
# sorts the bars; coord_flip() then lays the bars out horizontally so the
# words are readable along the vertical axis.
# geom_col() takes the bar length directly from `n`.
tidy_MLK %>%
  count(word, sort = TRUE) %>%
  filter(n > 2) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    x = "Words used more than 2 times",
    y = "word frequency",
    title = "I have a Dream Speech, MLK Jr. (1963)"
  )

# Collapse the token table into a frequency table: one row per distinct word
# with its count `n`, sorted from most to least frequent.
# NOTE: this overwrites tidy_MLK. From here on it holds word counts, not
# individual tokens, so re-running the earlier count() steps against it would
# count every word exactly once.
tidy_MLK <- tidy_MLK %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n))

# View() opens a spreadsheet-style viewer, which only makes sense when someone
# is at the console; skip it when the script is run non-interactively
# (Rscript, knitting) where no viewer may be available.
if (interactive()) {
  View(tidy_MLK)
}

# Word cloud of the speech: words used fewer than 2 times are left out, at
# most 200 words are drawn, the most frequent words are placed first
# (random.order = FALSE) and roughly 35% of the words are rotated.
wordcloud(
  words = tidy_MLK$word,
  freq = tidy_MLK$n,
  min.freq = 2,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)

DATA607_Presentation_TextMining

Don Padmaperuma

11/5/2019