Introduction

In the first steps of the project we will build n-gram models and perform an exploratory data analysis.

Data loading and summarizing

Read the English data files. The link to the training data is here: https://rpubs.com/wujohn1990/courseta_data_science_capstone_week2

# NOTE(review): setwd() ties the script to one machine's directory layout. It
# is kept because the relative file paths in this report resolve against it.
setwd("~/Capstone_project")
# Read each English corpus file as a character vector, one element per line.
# skipNul = TRUE skips embedded NUL bytes; warn = FALSE silences the warnings
# readLines() raises for those and for a missing final end-of-line.
# FALSE is spelled out because the shorthand F is an ordinary variable that
# can be reassigned.
blogs <- readLines("en_US.blogs.txt", skipNul = TRUE, warn = FALSE)
news <- readLines("en_US.news.txt", skipNul = TRUE, warn = FALSE)
twitter <- readLines("en_US.twitter.txt", skipNul = TRUE, warn = FALSE)

Create summary of data

# Load the precomputed per-file summary table from disk and display it.
# NOTE(review): this variable shares its name with base::summary().
summary <- readRDS(file = "repo_summary.rds")
print(summary)
##   f_names   f_size f_lines    n_char  n_words pct_n_char pct_lines pct_words
## 1   blogs 200.4242  899288 208361438 37334131       0.54      0.27      0.53
## 2    news 196.2775   77259  15683765  2643969       0.04      0.02      0.04
## 3 twitter 159.3641 2360148 162385042 30373585       0.42      0.71      0.43

Sampling data

# Fix the RNG seed so the same lines are drawn on every run.
set.seed(1977)
# Draw 100 lines without replacement from each source.
usblogs <- sample(blogs, 100, replace = FALSE)
usnews <- sample(news, 100, replace = FALSE)
# Sample tweets from `twitter`. This previously sampled `news` a second time,
# so no Twitter text ever reached the corpus.
ustwitter <- sample(twitter, 100, replace = FALSE)
invisible(gc())
# Combine the three samples into one character vector.
raw_text <- c(usblogs, usnews, ustwitter)
invisible(gc())

Build the corpus

library(tm)
# Wrap the sampled lines as a tm vector source, then build an in-memory
# (volatile) corpus from that source.
raw_source <- VectorSource(x = raw_text)
invisible(gc())
raw_corpus <- VCorpus(x = raw_source)
invisible(gc())

Clean the corpus

# Normalise the text of every document in a tm corpus.
#
# Args:
#   corpus: a tm corpus, e.g. the result of VCorpus().
# Returns:
#   The corpus after all transformations below have been applied with tm_map().
clean_corpus <- function(corpus) {
  # Apply a plain character -> character function to every document.
  map_text <- function(corp, fn) tm_map(corp, content_transformer(fn))

  # Drop non-ASCII bytes first so every later step only sees ASCII text.
  corpus <- map_text(
    corpus,
    function(x) iconv(x, "latin1", "ASCII", sub = "")
  )
  corpus <- map_text(corpus, tolower) # convert to lower case
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation
  # Remove whole ordinals (4th, 1st, 22nd, ...) while the digits are still
  # attached. The previous gsub(" th", "", x) also hit ordinary words
  # (" the" -> "e", " this" -> "is") and glued them onto the preceding word.
  corpus <- map_text(
    corpus,
    function(x) gsub("\\b[[:digit:]]+(st|nd|rd|th)\\b", "", x)
  )
  corpus <- map_text(
    corpus,
    function(x) gsub("[[:digit:]]", "", x)
  ) # remove numbers
  # Remove URLs. Punctuation is already gone at this point, so a URL has
  # collapsed into a single alphanumeric run that starts with "http".
  corpus <- map_text(
    corpus,
    function(x) gsub("http[[:alnum:]]*", "", x)
  )
  # Collapse runs of three or more identical letters down to two.
  corpus <- map_text(
    corpus,
    function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x)
  )
  # Collapse whitespace last: the removals above leave gaps behind, which an
  # earlier stripWhitespace pass would not clean up.
  corpus <- tm_map(corpus, stripWhitespace)
  gc()
  corpus
}

# Run the cleaning step on the raw corpus and persist the result to disk.
corpus <- clean_corpus(corpus = raw_corpus)
save(list = "corpus", file = "corpus.RData")

Build N-Gram model

Unigram Model. The top 10 word frequencies and word coverage

Word Coverage

# Persist `d1` to disk. `d1` is created in code not shown in this section;
# presumably the unigram frequency table -- TODO confirm.
save(list = "d1", file = "d1.RData")

Bigram Model. The top 10 word frequencies and word coverage

Word Coverage

# Persist `d2` to disk. `d2` is created in code not shown in this section;
# presumably the bigram frequency table -- TODO confirm.
save(list = "d2", file = "d2.RData")

Trigram Model. The top 10 word frequencies and the word coverage line are also plotted

Word Coverage

Next steps