# Exploratory Data Analysis of Text Data

library(stringi)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readr)
# Location of the Coursera/SwiftKey corpus archive and its local cache name.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_path <- "Coursera-SwiftKey.zip"

# Download only when the archive is not already cached. mode = "wb" writes the
# zip as binary so it is never subject to text-mode translation.
if (!file.exists(zip_path)) {
  download.file(url, zip_path, method = "auto", mode = "wb")
}

# Extract independently of the download step: a cached archive that was never
# unpacked (or whose extracted files were removed) must still be extracted,
# otherwise the reads of final/en_US/*.txt below fail.
if (!dir.exists("final/en_US")) {
  unzip(zip_path)
}

# Read one corpus file as UTF-8 text, returning a character vector with one
# element per line.
#
# The file is opened in binary mode so the platform's text-mode translation
# cannot alter or cut short the input, and skipNul = TRUE drops embedded NUL
# bytes instead of letting them silently terminate the line being read (with
# warn = FALSE that truncation would otherwise go unnoticed).
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", warn = FALSE, skipNul = TRUE)
}

blogs <- read_corpus("final/en_US/en_US.blogs.txt")
news <- read_corpus("final/en_US/en_US.news.txt")
twitter <- read_corpus("final/en_US/en_US.twitter.txt")
# Per-source overview of the corpora: number of lines, total word count
# (stringi word-boundary count summed over all lines) and on-disk size in MB.
corpora <- list(blogs, news, twitter)
corpus_files <- c(
  "final/en_US/en_US.blogs.txt",
  "final/en_US/en_US.news.txt",
  "final/en_US/en_US.twitter.txt"
)

summary_table <- data.frame(
  Source = c("Blogs", "News", "Twitter"),
  Lines = lengths(corpora),
  Words = vapply(
    corpora,
    function(lines) sum(stri_count_words(lines)),
    integer(1)
  ),
  # file.info() is vectorised over paths; bytes -> mebibytes.
  File_Size_MB = file.info(corpus_files)$size / 1024^2
)

summary_table

##    Source   Lines    Words File_Size_MB
## 1   Blogs  899288 37546250     200.4242
## 2    News 1010242 34762395     196.2775
## 3 Twitter 2360148 30093372     159.3641

# Draw a fixed-size random sample of lines from each source so the n-gram
# analysis stays cheap. The seed makes the draw reproducible; the sources are
# sampled in the order blogs, news, twitter.
set.seed(123)

lines_per_source <- 5000
blog_sample <- sample(blogs, size = lines_per_source)
news_sample <- sample(news, size = lines_per_source)
tweet_sample <- sample(twitter, size = lines_per_source)

sample_text <- c(blog_sample, news_sample, tweet_sample)

# One row per sampled line, in a single `text` column for tidytext.
text_df <- data.frame(text = sample_text)
# Unigram frequencies of the sampled text with stop words removed, sorted by
# descending count.
#
# Typographic single quotes (U+2018/U+2019) are normalised to the ASCII
# apostrophe before tokenising: the stop_words lexicon spells contractions
# with "'", so tokens such as "it’s" would otherwise escape the anti_join and
# show up among the most frequent "content" words.
# NOTE: the console output recorded below this block predates this
# normalisation; re-run to refresh it.
words <- text_df %>%
  mutate(text = stri_replace_all_regex(text, "[\u2018\u2019]", "'")) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

head(words, 20)

##      word    n
## 1    time 1007
## 2  people  709
## 3     day  634
## 4    love  516
## 5       2  414
## 6       1  412
## 7    home  376
## 8       3  371
## 9  school  343
## 10   week  337
## 11   life  334
## 12  world  306
## 13   it’s  282
## 14   game  280
## 15  night  274
## 16   city  268
## 17     10  267
## 18      5  264
## 19      4  246
## 20   feel  242

# Horizontal bar chart of the 20 most frequent words, longest bar on top.
# slice_max(..., with_ties = FALSE) returns exactly 20 rows; the superseded
# top_n() keeps every row tied with the 20th, which can put more than 20 bars
# on a chart titled "Top 20".
words %>%
  slice_max(n, n = 20, with_ties = FALSE) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  )

# Bigram frequencies of the sampled text (stop words kept), sorted by
# descending count.
# Lines with fewer than two tokens can produce an NA bigram; those rows are
# dropped so NA is never counted as if it were a real bigram.
bigrams <- text_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)

head(bigrams, 20)

##      bigram    n
## 1    of the 2019
## 2    in the 1875
## 3    to the 1027
## 4    on the  921
## 5   for the  845
## 6     to be  749
## 7    at the  605
## 8   and the  594
## 9      in a  570
## 10 with the  479
## 11 from the  441
## 12     is a  440
## 13   it was  432
## 14    for a  411
## 15    it is  394
## 16  will be  390
## 17    i was  386
## 18    and i  382
## 19   with a  382
## 20   i have  371

# Horizontal bar chart of the 20 most frequent bigrams, longest bar on top.
# slice_max(..., with_ties = FALSE) returns exactly 20 rows; the superseded
# top_n() keeps every row tied with the 20th, which can put more than 20 bars
# on a chart titled "Top 20".
bigrams %>%
  slice_max(n, n = 20, with_ties = FALSE) %>%
  ggplot(aes(x = reorder(bigram, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Bigrams",
    x = "Bigram",
    y = "Frequency"
  )

# Trigram frequencies of the sampled text (stop words kept), sorted by
# descending count.
# Lines with fewer than three tokens can produce an NA trigram; those rows are
# dropped so NA is never counted as if it were a real trigram.
trigrams <- text_df %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  filter(!is.na(trigram)) %>%
  count(trigram, sort = TRUE)

head(trigrams, 20)

##           trigram   n
## 1      one of the 160
## 2        a lot of 126
## 3     going to be  83
## 4         to be a  73
## 5       i want to  66
## 6        it was a  66
## 7      as well as  65
## 8      be able to  65
## 9     some of the  64
## 10      i have to  57
## 11 thanks for the  56
## 12     the end of  56
## 13    you want to  55
## 14      this is a  54
## 15 the first time  52
## 16    a couple of  51
## 17     out of the  49
## 18    the rest of  49
## 19     there is a  49
## 20        it is a  46

# Horizontal bar chart of the 20 most frequent trigrams, longest bar on top.
# slice_max(..., with_ties = FALSE) returns exactly 20 rows; the superseded
# top_n() keeps every row tied with the 20th (low trigram counts tie often),
# which can put more than 20 bars on a chart titled "Top 20".
trigrams %>%
  slice_max(n, n = 20, with_ties = FALSE) %>%
  ggplot(aes(x = reorder(trigram, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Most Frequent Trigrams",
    x = "Trigram",
    y = "Frequency"
  )

# Exploratory Data Analysis of Text Data

# Makhabbat