This is the Milestone Report for the Coursera Data Science Capstone project. The goal of the capstone is to create a predictive text model by applying NLP techniques to a large text corpus used as training data. This milestone report describes the features of the training data found in our exploratory data analysis and summarizes our plans for building the predictive model.
First, we list all the files in the /final/en_US dataset folder. The data sets consist of text from three different sources: 1) News, 2) Blogs, and 3) Twitter feeds.
# Load the libraries used throughout this report
library(tm)        # corpus handling and text cleaning
library(RWeka)     # n-gram tokenization
library(ggplot2)   # plots
library(dplyr)     # %>% pipe
path <- file.path("./projectData/final", "en_US")
files <- list.files(path, recursive = TRUE)
# Let's open a file connection, read the file's lines, and close the connection.
con <- file("./projectData/final/en_US/en_US.twitter.txt", "r")
lineTwitter<-readLines(con, skipNul = TRUE)
close(con)
con <- file("./projectData/final/en_US/en_US.blogs.txt", "r")
lineBlogs<-readLines(con, skipNul = TRUE)
close(con)
con <- file("./projectData/final/en_US/en_US.news.txt", "r")
lineNews<-readLines(con, skipNul = TRUE)
close(con)
We examined the data sets and summarize our findings below.
# Structure of the raw data (character vectors)
str(lineTwitter)
## chr [1:2360148] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long." ...
str(lineBlogs)
## chr [1:899288] "In the years thereafter, most of the Oil fields and platforms were named after pagan “gods”." ...
str(lineNews)
## chr [1:1010242] "He wasn't home alone, apparently." ...
# Keep one example observation from each source so we can follow the transformations
exampleNews <- lineNews[4]
exampleBlog <- lineBlogs[4]
exampleTwit <- lineTwitter[4]
exampleTwit
## [1] "So Tired D; Played Lazer Tag & Ran A LOT D; Ughh Going To Sleep Like In 5 Minutes ;)"
exampleBlog
## [1] "so anyways, i am going to share some home decor inspiration that i have been storing in my folder on the puter. i have all these amazing images stored away ready to come to life when we get our home."
exampleNews
## [1] "The Alaimo Group of Mount Holly was up for a contract last fall to evaluate and suggest improvements to Trenton Water Works. But campaign finance records released this week show the two employees donated a total of $4,500 to the political action committee (PAC) Partners for Progress in early June. Partners for Progress reported it gave more than $10,000 in both direct and in-kind contributions to Mayor Tony Mack in the two weeks leading up to his victory in the mayoral runoff election June 15."
# File sizes
lineBlogs.size <- file.info("./projectData/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
lineNews.size <- file.info("./projectData/final/en_US/en_US.news.txt")$size / 1024 ^ 2
lineTwitter.size <- file.info("./projectData/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# Count the words in each line of the files
lineBlogs.words <- stringi::stri_count_words(lineBlogs)
lineNews.words <- stringi::stri_count_words(lineNews)
lineTwitter.words <- stringi::stri_count_words(lineTwitter)
| data_source | file.size.MB | num.lines | num.words | mean.num.words |
|---|---|---|---|---|
| blogs | 200.4242 | 899288 | 37546239 | 41.75107 |
| news | 196.2775 | 1010242 | 34762395 | 34.40997 |
| twitter | 159.3641 | 2360148 | 30093413 | 12.75065 |
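For completeness, the summary table above can be assembled from the objects computed in the previous chunks; the following is a minimal sketch (the data frame name and the knitr::kable call are illustrative, assuming knitr is available, and may differ from how the table was actually produced):
# Build the summary table from the sizes, line counts, and word counts computed above
repoSummary <- data.frame(
    data_source    = c("blogs", "news", "twitter"),
    file.size.MB   = c(lineBlogs.size, lineNews.size, lineTwitter.size),
    num.lines      = c(length(lineBlogs), length(lineNews), length(lineTwitter)),
    num.words      = c(sum(lineBlogs.words), sum(lineNews.words), sum(lineTwitter.words)),
    mean.num.words = c(mean(lineBlogs.words), mean(lineNews.words), mean(lineTwitter.words))
)
knitr::kable(repoSummary)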
Let's clean the data by removing special characters, punctuation, numbers, excess whitespace, URLs, and stopwords. We will also convert the text to lower case. Finally, we will randomly sample 2% of the data for the exploratory analysis. The 2% sample size was chosen after a 5% sample produced vectors too large to work with.
# Randomly Sample the Datasets
set.seed(3456)
data.sample <- c(sample(lineBlogs, length(lineBlogs) * 0.02),
sample(lineNews, length(lineNews) * 0.02),
sample(lineTwitter, length(lineTwitter) * 0.02))
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(data.sample))
# str(corpus)
# inspect(corpus[1])
# Helper: replace everything matching a pattern with a space
cleanspace <- function(x, pattern) {
    gsub(pattern, " ", x)
}
# content_transformer keeps the documents as PlainTextDocuments after each step
corpus <- tm_map(corpus, content_transformer(cleanspace), "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
corpus <- tm_map(corpus, content_transformer(cleanspace), "@[^\\s]+")                      # strip handles
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
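As a quick check that the cleaning worked as intended, we can peek at one document in the cleaned corpus (this inspection is illustrative and its output is not shown in the report):
# Inspect the first cleaned document
writeLines(as.character(corpus[[1]]))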
We will search for the most frequent words in the data. Here we list the most common bigrams and trigrams.
# Free memory
rm(lineTwitter)
rm(lineBlogs)
rm(lineNews)
# Let's create some helper functions: term frequencies and n-gram tokenizers
freq <- function(x) {
freq <- sort(rowSums(as.matrix(x)), decreasing = TRUE)
return(data.frame(word = names(freq), freq = freq))
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
#quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
# Let's look at the term-document matrix
TermDocumentMatrix(corpus)
## <<TermDocumentMatrix (terms: 85729, documents: 85391)>>
## Non-/sparse entries: 1007589/7319477450
## Sparsity : 100%
## Maximal term length: 91
## Weighting : term frequency (tf)
# Get frequencies of the most common bigrams and trigrams in the data sample
#freq1 <- freq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
# freq4 <- freq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = quadgram)), 0.9999))
# save RDS version
#saveRDS(freq1, "freq1.rds")
saveRDS(freq2, "freq2.rds")
saveRDS(freq3, "freq3.rds")
# saveRDS(freq4, "freq4.rds")
# load RDS for future use
# freq1 <- readRDS("freq1.rds")
# freq2 <- readRDS("freq2.rds")
# freq3 <- readRDS("freq3.rds")
Plot of the 10 most frequent bigrams
freq2[1:10,] %>%
ggplot(aes(reorder(word, -freq), freq)) +
labs(x = "10 most frequent bigrams", y = "Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue"))
Plot of the 10 most frequent trigrams
freq3[1:10,] %>%
ggplot(aes(reorder(word, -freq), freq)) +
labs(x = "10 most frequent trigrams", y = "Frequency") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_bar(stat = "identity", fill = I("blue"))
We will use an n-gram model in the final capstone project. We will first use the trigram model to predict the next word; if no matching trigram is found, we will back off to the bigram model. We dismissed the quadgram model because it did not add significant information, and the unigram model because of memory allocation and speed. With this algorithm we will deploy our Shiny app.
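To illustrate the planned backoff strategy, here is a minimal sketch of a lookup built on the freq3 and freq2 tables computed above; the function name predict_next and its internals are our own illustration, not code from the final model:
# Illustrative backoff lookup: try trigrams first, then back off to bigrams.
# Assumes freq2/freq3 store n-grams as space-separated strings in the "word" column.
predict_next <- function(phrase, freq3, freq2) {
    tokens <- tolower(unlist(strsplit(phrase, "\\s+")))
    n <- length(tokens)
    # 1) Trigrams whose first two words match the last two input words
    if (n >= 2) {
        prefix <- paste(tokens[n - 1], tokens[n])
        hits <- freq3[grepl(paste0("^", prefix, " "), freq3$word), ]
        if (nrow(hits) > 0) {
            best <- as.character(hits$word[which.max(hits$freq)])
            return(tail(strsplit(best, " ")[[1]], 1))
        }
    }
    # 2) Back off to bigrams whose first word matches the last input word
    hits <- freq2[grepl(paste0("^", tokens[n], " "), freq2$word), ]
    if (nrow(hits) > 0) {
        best <- as.character(hits$word[which.max(hits$freq)])
        return(tail(strsplit(best, " ")[[1]], 1))
    }
    NA_character_  # no matching n-gram found
}
# Example usage: predict_next("thanks for", freq3, freq2)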