Exploratory Data Analysis - Data Science Specialization Capstone Project

Summary

This EDA is intended to investigate and understand the distribution of words, the frequencies of words and word pairs, and the relationships between the words in the corpora (English language files). The objective of the capstone project is to develop an application to predict the next word based on the corpora.

Summary and Distributions

There are three files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. The first step is to read those files and determine some basic statistics and the distribution of the words of each file. The summary information of the three files will be presented in a table and the distributions in histograms.

The distributions of the words for ‘blogs’ and ‘news’ aren’t normal; they resemble a Poisson or Gamma distribution because of their long right tail (positively skewed).

Frequency of words

Because the blogs, news and twitter files are large, a binomial sample will be drawn from each file and the resulting samples will be combined in order to get the frequencies of the words (1-gram, 2-gram and 3-gram). The quanteda library will be used for tokenization and for cleaning tasks such as removing numbers, punctuation, symbols, hyphens and stopwords, and making sure the language is English; the top ten of each n-gram will be presented.

The top three 1-grams are: one, just, and said; the top three 2-grams are: right now, year old, and last year. Finally, the top three 3-grams (shown in stemmed form) are: happi mother day, new year citi, and let us know.

Next Steps

After the Exploratory Analysis, the next steps include:

Splitting the data into training set and testing set.

Defining a model to predict the next word.

Testing and measuring the model performance.

Developing a Shiny application for end users in which, after writing a word, the app will predict the next one.

Appendix

The code used for this assignment.

# libraries
library(dplyr)
library(ggplot2)
library(quanteda)
library(stringi)

## Summary and Distributions
# Read every line of a UTF-8 text file.
# The connection is closed through on.exit(), so it is released even when
# readLines() fails part-way through.
# NOTE(review): if readLines() warns about embedded nuls on any of these
# files, consider skipNul = TRUE - confirm against the actual data first.
read_corpus_lines <- function(path) {
        con <- file(path, "r")
        on.exit(close(con), add = TRUE)
        readLines(con, encoding = "UTF-8")
}

# Build the one-row summary for a corpus file: file name, size on disk in MB,
# number of lines, and total / min / max / mean words per line.
# The per-line word counts are computed once and reused, instead of running
# stri_count_words() over the whole file for every statistic.
summarise_corpus_file <- function(path, lines) {
        words.per.line <- stri_count_words(lines)
        data.frame(
                File = path,
                Size = round(file.info(path)$size / 1024 / 1024, 2),
                Lines = length(lines),
                Words = sum(words.per.line),
                Min_Words = min(words.per.line),
                Max_Words = max(words.per.line),
                Mean_Words = round(mean(words.per.line), 2)
        )
}

# blogs file
us.blogs <- read_corpus_lines("en_US.blogs.txt")
blog.info <- summarise_corpus_file("en_US.blogs.txt", us.blogs)

# news file
us.news <- read_corpus_lines("en_US.news.txt")
news.info <- summarise_corpus_file("en_US.news.txt", us.news)

# twitter file
us.twitt <- read_corpus_lines("en_US.twitter.txt")
twitt.info <- summarise_corpus_file("en_US.twitter.txt", us.twitt)

# Interactive table with one row per file
DT::datatable(rbind(blog.info, news.info, twitt.info))

# Distributions
# Number of words in each line of each file; these vectors feed the histograms.
dist.blogs <- stri_count_words(us.blogs)
dist.news <- stri_count_words(us.news)
dist.twitt <- stri_count_words(us.twitt)

# In every histogram the x axis is the number of words in a line and the bar
# height is how many lines fall in that bin, so the axes are labelled that way.
ggplot(as.data.frame(dist.blogs), aes(dist.blogs)) +
        geom_histogram(binwidth = 300, fill = "blue") +
        labs(x = "Words per line",
             y = "Number of lines",
             title = "Distribution of the Blogs") +
        theme(plot.title = element_text(hjust = .5))

ggplot(as.data.frame(dist.news), aes(dist.news)) +
        geom_histogram(binwidth = 100, fill = "green") +
        labs(x = "Words per line",
             y = "Number of lines",
             title = "Distribution of the News") +
        theme(plot.title = element_text(hjust = .5))

# Twitter lines are short, so a bin of roughly one word is used here.
ggplot(as.data.frame(dist.twitt), aes(dist.twitt)) +
        geom_histogram(binwidth = .99) +
        labs(x = "Words per line",
             y = "Number of lines",
             title = "Distribution of the Twitter") +
        theme(plot.title = element_text(hjust = .5))


## Frequency of words
# Fix the RNG seed so the random samples below, and therefore the top n-grams
# derived from them, are the same on every run.
set.seed(1234)

# Probability that any given line is kept in the sample.
sample.prob <- 0.1

# blog sampling: one Bernoulli draw per line, keep the lines that drew a 1
binom.blog <- rbinom(length(us.blogs), 1, sample.prob)
sample.blog <- us.blogs[binom.blog == 1]

# news sampling
binom.news <- rbinom(length(us.news), 1, sample.prob)
sample.news <- us.news[binom.news == 1]

# twitter sampling
binom.twitt <- rbinom(length(us.twitt), 1, sample.prob)
sample.twitt <- us.twitt[binom.twitt == 1]

# combined samples
total.sample <- c(sample.blog, sample.news, sample.twitt)

# tokenization
# Split the sampled lines into word tokens, dropping numbers, punctuation and
# symbols. Hyphenated words are split with `split_hyphens`, the name that
# replaced `remove_hyphens` in quanteda 2.0.
total.tokens <- tokens(total.sample, what = "word",
                       remove_numbers = TRUE, remove_punct = TRUE,
                       remove_symbols = TRUE, split_hyphens = TRUE)
# Drop stopwords (stopwords() with its default language setting).
total.tokens <- tokens_remove(total.tokens, stopwords())
# Reduce every remaining token to its English stem.
total.tokens <- tokens_wordstem(total.tokens, language = "english")

# The n-grams are built explicitly with tokens_ngrams() before counting:
# dfm() no longer accepts an `ngrams` argument for tokens input in current
# quanteda, so relying on it would yield single-word counts for every n.
# topfeatures() is called with its default number of features.

# ngram-1
ngram_1 <- topfeatures(dfm(total.tokens, verbose = FALSE))

# ngram-2
ngram_2 <- topfeatures(dfm(tokens_ngrams(total.tokens, n = 2L),
                           verbose = FALSE))

# ngram-3
ngram_3 <- topfeatures(dfm(tokens_ngrams(total.tokens, n = 3L),
                           verbose = FALSE))

# Turn a named frequency vector into a two-column data frame.
# The term column is a factor whose levels follow the order of the input
# vector, so that order is preserved when the terms are used as a plot axis.
# The frequency column is the vector itself (its names become the row names).
top_features_frame <- function(top, term_col, freq_col) {
        terms <- names(top)
        setNames(
                data.frame(factor(terms, levels = terms), top),
                c(term_col, freq_col)
        )
}

# One data frame per n-gram size
ngram1.df <- top_features_frame(ngram_1, "words1", "freq1")
ngram2.df <- top_features_frame(ngram_2, "words2", "freq2")
ngram3.df <- top_features_frame(ngram_3, "words3", "freq3")

# Theme tweaks shared by the three bar charts: centred title and x-axis labels
# rotated by 90 degrees so the n-grams do not overlap.
ngram.theme <- theme(
        plot.title = element_text(hjust = .5),
        axis.text.x = element_text(angle = 90, hjust = 1)
)

# Bar chart of the top 1-grams
ggplot(data = ngram1.df, mapping = aes(x = words1, y = freq1)) +
        geom_col(fill = "cadetblue") +
        labs(title = "Distribution of ngram-1",
             x = "One-Word",
             y = "Frequency") +
        ngram.theme

# Bar chart of the top 2-grams
ggplot(data = ngram2.df, mapping = aes(x = words2, y = freq2)) +
        geom_col(fill = "cadetblue") +
        labs(title = "Distribution of ngram-2",
             x = "Two-Words",
             y = "Frequency") +
        ngram.theme

# Bar chart of the top 3-grams
ggplot(data = ngram3.df, mapping = aes(x = words3, y = freq3)) +
        geom_col(fill = "cadetblue") +
        labs(title = "Distribution of ngram-3",
             x = "Three-Words",
             y = "Frequency") +
        ngram.theme