The goal of this milestone report is to demonstrate that exploratory data analysis has been completed and that sufficient progress has been made to build the prediction algorithm next.
Setting the working directory to the en_US folder.
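The call itself is not shown in the report; a minimal sketch, assuming the Coursera dataset was unzipped locally (the path below is a placeholder):
setwd("~/data/final/en_US")  # placeholder path; point this at your local en_US folder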
Loading the necessary libraries.
library(tm)
library(dplyr)
library(tidytext)
library(ggplot2)
library(ggpubr)
library(wordcloud)
library(tidyr)
Reading the files.
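The reading code is hidden in the report; a minimal sketch that would reproduce the corpus summary below, assuming the three en_US text files are the only files in the working directory (the object name docs is hypothetical):
docs <- VCorpus(DirSource("."))  # read every file in the directory into a volatile corpus
docs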
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
We can use the meta function to see the filename of each document in the corpus; the first two lines of each document are printed alongside it.
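A minimal sketch of that inspection, assuming the hypothetical docs corpus from above (meta stores the filename under the "id" tag, and content returns the lines of a plain-text document):
for (i in seq_along(docs)) {
  print(meta(docs[[i]], "id"))    # filename of the i-th document
  print(content(docs[[i]])[1:2])  # its first two lines
}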
## [1] "en_US.blogs.txt"
## [1] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”."
## [2] "We love you Mr. Brown."
## [1] "en_US.news.txt"
## [1] "He wasn't home alone, apparently."
## [2] "The St. Louis plant had to close. It would die of old age. Workers had been making cars there since the onset of mass automotive production in the 1920s."
## [1] "en_US.twitter.txt"
## [1] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long."
## [2] "When you meet someone special... you'll know. Your heart will beat more rapidly and you'll smile for no reason."
All the datasets are now loaded.
Converting each text into a data frame with one line per row.
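The character vectors used below are created off-screen; a minimal sketch, assuming the hypothetical docs corpus and the document order listed above:
blogs_text   <- content(docs[[1]])   # en_US.blogs.txt
news_text    <- content(docs[[2]])   # en_US.news.txt
twitter_text <- content(docs[[3]])   # en_US.twitter.txt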
blogs_textdf <- tibble(line = 1:length(blogs_text), text = blogs_text)
news_textdf <- tibble(line = 1:length(news_text), text = news_text)
twitter_textdf <- tibble(line = 1:length(twitter_text), text = twitter_text)
head(blogs_textdf)
## # A tibble: 6 × 2
## line text
## <int> <chr>
## 1 1 In the years thereafter, most of the Oil fields and platforms were name…
## 2 2 We love you Mr. Brown.
## 3 3 Chad has been awesome with the kids and holding down the fort while I w…
## 4 4 so anyways, i am going to share some home decor inspiration that i have…
## 5 5 With graduation season right around the corner, Nancy has whipped up a …
## 6 6 If you have an alternative argument, let's hear it! :)
Tokenizing each text data frame into a tidy format with one word per row.
blogs_tidy <- blogs_textdf %>%
unnest_tokens(word, text)
news_tidy <- news_textdf %>%
unnest_tokens(word, text)
twitter_tidy <- twitter_textdf %>%
unnest_tokens(word, text)
head(blogs_tidy)
## # A tibble: 6 × 2
## line word
## <int> <chr>
## 1 1 in
## 2 1 the
## 3 1 years
## 4 1 thereafter
## 5 1 most
## 6 1 of
Summary table of character, word, and line counts for the full datasets.
data_summary <- data.frame(
Dataset = c("Blogs", "News", "Twitter"),
Character_Count = c(sum(nchar(blogs_text)), sum(nchar(news_text)), sum(nchar(twitter_text))),
Word_Count = c(nrow(blogs_tidy), nrow(news_tidy), nrow(twitter_tidy)),
Line_Count = c(nrow(blogs_textdf), nrow(news_textdf), nrow(twitter_textdf))
)
data_summary
## Dataset Character_Count Word_Count Line_Count
## 1 Blogs 206824505 37546806 899288
## 2 News 203223159 34762658 1010242
## 3 Twitter 162096031 30096649 2360148
The full datasets are large, so we draw a random sample of 100,000 lines from each source to keep the analysis tractable, setting a seed for reproducibility.
set.seed(1984)
blogs_text <- sample(blogs_text, 100000)
news_text <- sample(news_text, 100000)
twitter_text <- sample(twitter_text, 100000)
Now let’s make the tidy dataframe (one word per row) for the new sample data.
blogs_textdf <- tibble(line = 1:length(blogs_text), text = blogs_text)
news_textdf <- tibble(line = 1:length(news_text), text = news_text)
twitter_textdf <- tibble(line = 1:length(twitter_text), text = twitter_text)
blogs_tidy <- blogs_textdf %>%
unnest_tokens(word, text)
news_tidy <- news_textdf %>%
unnest_tokens(word, text)
twitter_tidy <- twitter_textdf %>%
unnest_tokens(word, text)
Let’s get the word frequencies in each dataset. First we need to remove stop words and tokens containing digits to get a meaningful analysis. Here are the first ten tokens of the blogs sample before filtering:
## # A tibble: 10 × 2
## line word
## <int> <chr>
## 1 1 my
## 2 1 seventh
## 3 1 grade
## 4 1 math
## 5 1 teacher
## 6 1 was
## 7 1 yet
## 8 1 another
## 9 1 victim
## 10 1 of
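The filtering code is hidden in the report; a minimal sketch using tidytext's stop_words lexicon, which would also produce the three join messages below (one per anti_join):
blogs_tidy <- blogs_tidy %>%
  anti_join(stop_words) %>%      # remove common English stop words
  filter(!grepl("\\d", word))    # remove tokens containing digits
news_tidy <- news_tidy %>%
  anti_join(stop_words) %>%
  filter(!grepl("\\d", word))
twitter_tidy <- twitter_tidy %>%
  anti_join(stop_words) %>%
  filter(!grepl("\\d", word))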
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## # A tibble: 6 × 2
## line word
## <int> <chr>
## 1 1 seventh
## 2 1 grade
## 3 1 math
## 4 1 teacher
## 5 1 victim
## 6 1 evil
Now let’s get the word counts in descending order.
blogs_wcount <- blogs_tidy %>%
count(word, sort=TRUE)
news_wcount <- news_tidy %>%
count(word, sort=TRUE)
twitter_wcount <- twitter_tidy %>%
count(word, sort=TRUE)
blogs_wcount
## # A tibble: 96,573 × 2
## word n
## <chr> <int>
## 1 time 9990
## 2 people 6650
## 3 day 5931
## 4 love 5127
## 5 life 4714
## 6 it’s 4389
## 7 world 3370
## 8 i’m 3234
## 9 book 3213
## 10 don’t 3162
## # ℹ 96,563 more rows
news_wcount
## # A tibble: 88,266 × 2
## word n
## <chr> <int>
## 1 time 5690
## 2 people 4812
## 3 city 3775
## 4 school 3625
## 5 percent 3589
## 6 game 3497
## 7 day 3196
## 8 million 3102
## 9 home 3049
## 10 county 3009
## # ℹ 88,256 more rows
twitter_wcount
## # A tibble: 54,265 × 2
## word n
## <chr> <int>
## 1 love 4449
## 2 day 3840
## 3 rt 3837
## 4 time 3287
## 5 lol 2921
## 6 people 2156
## 7 follow 2061
## 8 happy 1994
## 9 tonight 1844
## 10 night 1770
## # ℹ 54,255 more rows
plt1 <- ggplot(data=blogs_wcount[1:15,], aes(x=word, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Word Frequencies in Blogs dataset") +
xlab("Word") +
ylab("Count")
plt2 <- ggplot(data=news_wcount[1:15,], aes(x=word, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Word Frequencies in News dataset") +
xlab("Word") +
ylab("Count")
plt3 <- ggplot(data=twitter_wcount[1:15,], aes(x=word, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Word Frequencies in Twitter dataset") +
xlab("Word") +
ylab("Count")
plt1
Let’s display the most common bigrams in all datasets.
blogs_bigram <- blogs_textdf %>%
unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words$word) & !is.na(word1) & !grepl("\\d", word1)) %>%
filter(!(word2 %in% stop_words$word) & !is.na(word2) & !grepl("\\d", word2)) %>%
mutate(bigram=paste(word1, word2, sep = " ")) %>%
count(bigram, sort=TRUE)
news_bigram <- news_textdf %>%
unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words$word) & !is.na(word1) & !grepl("\\d", word1)) %>%
filter(!(word2 %in% stop_words$word) & !is.na(word2) & !grepl("\\d", word2)) %>%
mutate(bigram=paste(word1, word2, sep = " ")) %>%
count(bigram, sort=TRUE)
twitter_bigram <- twitter_textdf %>%
unnest_tokens(bigram, text, token = 'ngrams', n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words$word) & !is.na(word1) & !grepl("\\d", word1)) %>%
filter(!(word2 %in% stop_words$word) & !is.na(word2) & !grepl("\\d", word2)) %>%
mutate(bigram=paste(word1, word2, sep = " ")) %>%
count(bigram, sort=TRUE)
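The three pipelines above differ only in their input and the value of n, so as an aside they could be collapsed into a single helper; a sketch (the function count_ngrams is hypothetical, not part of the original analysis, and names its output column ngram rather than bigram or trigram):
count_ngrams <- function(textdf, n) {
  words <- paste0("word", 1:n)
  textdf %>%
    unnest_tokens(ngram, text, token = "ngrams", n = n) %>%
    separate(ngram, words, sep = " ") %>%
    # keep only n-grams whose words are all non-stop, non-NA, and digit-free
    filter(if_all(all_of(words),
                  ~ !(.x %in% stop_words$word) & !is.na(.x) & !grepl("\\d", .x))) %>%
    unite(ngram, all_of(words), sep = " ") %>%
    count(ngram, sort = TRUE)
}
blogs_bigram_alt <- count_ngrams(blogs_textdf, 2)  # same counts as blogs_bigram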
blogs_bigram
## # A tibble: 404,248 × 2
## bigram n
## <chr> <int>
## 1 weeks ago 168
## 2 ice cream 165
## 3 jesus christ 139
## 4 south africa 135
## 5 social media 128
## 6 real life 127
## 7 olive oil 120
## 8 blog post 116
## 9 months ago 113
## 10 feel free 101
## # ℹ 404,238 more rows
blogs_bigram %>%
{wordcloud(.$bigram, .$n, scale=c(2, .25), max.words = 50, colors=c('dodgerblue', 'salmon', 'seagreen'))}
news_bigram %>%
{wordcloud(.$bigram, .$n, scale=c(3, .375), max.words = 50, colors=c('dodgerblue', 'salmon', 'seagreen'))}
twitter_bigram %>%
{wordcloud(.$bigram, .$n, scale=c(2, .5), max.words = 40, colors=c('dodgerblue', 'salmon', 'seagreen'))}
Let’s display the most common bigrams with bar plots.
plt4 <- ggplot(data=blogs_bigram[1:15,], aes(x=bigram, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Bigram Frequencies in Blogs dataset") +
xlab("Bigrams") +
ylab("Count") +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
plt5 <- ggplot(data=news_bigram[1:15,], aes(x=bigram, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Bigram Frequencies in News dataset") +
xlab("Bigrams") +
ylab("Count") +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
plt6 <- ggplot(data=twitter_bigram[1:15,], aes(x=bigram, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Bigram Frequencies in Twitter dataset") + xlab("Bigrams") + ylab("Count") +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
plt4
blogs_trigram <- blogs_textdf %>%
unnest_tokens(trigram, text, token = 'ngrams', n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words$word) & !is.na(word1) & !grepl("\\d", word1)) %>%
filter(!(word2 %in% stop_words$word) & !is.na(word2) & !grepl("\\d", word2)) %>%
filter(!(word3 %in% stop_words$word) & !is.na(word3) & !grepl("\\d", word3)) %>%
mutate(trigram=paste(word1, word2, word3, sep = " ")) %>%
count(trigram, sort=TRUE)
news_trigram <- news_textdf %>%
unnest_tokens(trigram, text, token = 'ngrams', n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words$word) & !is.na(word1) & !grepl("\\d", word1)) %>%
filter(!(word2 %in% stop_words$word) & !is.na(word2) & !grepl("\\d", word2)) %>%
filter(!(word3 %in% stop_words$word) & !is.na(word3) & !grepl("\\d", word3)) %>%
mutate(trigram=paste(word1, word2, word3, sep = " ")) %>%
count(trigram, sort=TRUE)
twitter_trigram <- twitter_textdf %>%
unnest_tokens(trigram, text, token = 'ngrams', n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words$word) & !is.na(word1) & !grepl("\\d", word1)) %>%
filter(!(word2 %in% stop_words$word) & !is.na(word2) & !grepl("\\d", word2)) %>%
filter(!(word3 %in% stop_words$word) & !is.na(word3) & !grepl("\\d", word3)) %>%
mutate(trigram=paste(word1, word2, word3, sep = " ")) %>%
count(trigram, sort=TRUE)
blogs_trigram
## # A tibble: 166,493 × 2
## trigram n
## <chr> <int>
## 1 amazon services llc 56
## 2 amp lt span 38
## 3 amazon eu associates 28
## 4 amazon.ca amazon.co.uk amazon.de 28
## 5 amazon.co.uk amazon.de amazon.fr 28
## 6 amazon.com amazon.ca amazon.co.uk 28
## 7 amazon.de amazon.fr amazon.it 28
## 8 associates programmes designed 28
## 9 earn advertising fees 28
## 10 eu associates programmes 28
## # ℹ 166,483 more rows
blogs_trigram %>%
{wordcloud(.$trigram, .$n, scale=c(2, .25), max.words = 50, colors=c('dodgerblue', 'salmon', 'seagreen'))}
news_trigram %>%
{wordcloud(.$trigram, .$n, scale=c(2, .25), max.words = 40, colors=c('dodgerblue', 'salmon', 'seagreen'))}
twitter_trigram %>%
{wordcloud(.$trigram, .$n, scale=c(2, .25), max.words = 50, colors=c('dodgerblue', 'salmon', 'seagreen'))}
plt4 <- ggplot(data=blogs_trigram[1:15,], aes(x=trigram, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Bigram Frequencies in Blogs dataset") +
xlab("Trigrams") +
ylab("Count") +
theme(axis.text.x = element_text(angle=60, vjust=1, hjust=1))
plt5 <- ggplot(data=news_trigram[1:15,], aes(x=trigram, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Bigram Frequencies in News dataset") +
xlab("Trigrams") +
ylab("Count") +
theme(axis.text.x = element_text(angle=75, vjust=1, hjust=1))
plt6 <- ggplot(data=twitter_trigram[1:15,], aes(x=trigram, y=n, fill=n)) +
geom_bar(stat="identity") +
ggtitle("Bigram Frequencies in Twitter dataset") +
xlab("Trigrams") +
ylab("Count") +
theme(axis.text.x = element_text(angle=45, vjust=1, hjust=1))
plt4