In the first steps of the project we will build n-gram models and perform an exploratory data analysis.
Read the English data files. The link to the training data is here: https://rpubs.com/wujohn1990/courseta_data_science_capstone_week2
# Work from the project directory; the relative file names used below resolve
# against it.
setwd("~/Capstone_project")
# Read each English source file as a character vector with one element per
# line. skipNul = TRUE drops embedded NUL bytes; warn = FALSE silences the
# warnings readLines would otherwise emit (e.g. a missing final newline).
blogs <- readLines("en_US.blogs.txt", skipNul = TRUE, warn = FALSE)
news <- readLines("en_US.news.txt", skipNul = TRUE, warn = FALSE)
twitter <- readLines("en_US.twitter.txt", skipNul = TRUE, warn = FALSE)
Create a summary of the data
# Load the previously saved per-file summary table and display it.
# NOTE(review): the variable name `summary` is the same as base::summary();
# calls to summary() still resolve to the function.
summary <- readRDS(file = "repo_summary.rds")
summary
## f_names f_size f_lines n_char n_words pct_n_char pct_lines pct_words
## 1 blogs 200.4242 899288 208361438 37334131 0.54 0.27 0.53
## 2 news 196.2775 77259 15683765 2643969 0.04 0.02 0.04
## 3 twitter 159.3641 2360148 162385042 30373585 0.42 0.71 0.43
# Draw a reproducible random sample of 100 lines (without replacement) from
# each of the three sources.
set.seed(1977)
usblogs <- sample(blogs, 100, replace = FALSE)
usnews <- sample(news, 100, replace = FALSE)
# The Twitter sample must come from `twitter`, not `news`, so that all three
# sources are represented in the corpus.
ustwitter <- sample(twitter, 100, replace = FALSE)
invisible(gc())
# Concatenate the three samples into a single character vector.
raw_text <- c(usblogs, usnews, ustwitter)
invisible(gc())
library(tm)
# Wrap the sampled lines as a tm vector source, then build a volatile corpus
# (VCorpus) from it. gc() is invoked after each step, discarding its report.
raw_source <- VectorSource(x = raw_text)
invisible(gc())
raw_corpus <- VCorpus(x = raw_source)
invisible(gc())
# Clean a tm corpus before n-gram extraction.
#
# Args:
#   corpus: a tm corpus (for example the VCorpus built from the sampled lines).
#
# Returns:
#   The corpus with every document lower-cased and stripped of punctuation,
#   ordinals, digits, URLs and non-ASCII characters, with runs of three or
#   more identical letters shortened to two and whitespace collapsed.
clean_corpus <- function(corpus) {
  # Build a content transformer that applies gsub(pattern, replacement, x).
  substitute_text <- function(pattern, replacement) {
    content_transformer(function(x) gsub(pattern, replacement, x))
  }

  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  # Remove whole ordinals ("4th", "2nd", ...) while the digits are still
  # attached. Deleting every " th" after the digits are gone would also eat
  # the start of ordinary words ("in the" -> "ine").
  corpus <- tm_map(corpus, substitute_text("[[:digit:]]+(st|nd|rd|th)\\b", ""))
  corpus <- tm_map(corpus, substitute_text("[[:digit:]]", ""))
  # Punctuation is already gone, so a URL is one alphanumeric run that starts
  # with "http".
  corpus <- tm_map(corpus, substitute_text("http[[:alnum:]]*", ""))
  # Drop every character that has no ASCII representation.
  corpus <- tm_map(
    corpus,
    content_transformer(function(x) iconv(x, "latin1", "ASCII", sub = ""))
  )
  # Shorten three or more repeats of a letter to two ("sooooo" -> "soo").
  corpus <- tm_map(
    corpus,
    substitute_text("([[:alpha:]])\\1{2,}", "\\1\\1")
  )
  # Collapse whitespace last: the deletions above leave runs of spaces behind.
  tm_map(corpus, stripWhitespace)
}
# Run the cleaning pipeline on the raw corpus and write the result to disk.
corpus <- clean_corpus(corpus = raw_corpus)
save(corpus, file = "corpus.RData")
Unigram Model. The top 10 word frequencies and word coverage
# Write `d1` to d1.RData.
# NOTE(review): `d1` is not created in the code visible here (presumably the
# unigram result) - confirm it exists before this line runs.
save(d1, file = "d1.RData")
Bigram Model. The top 10 word frequencies and word coverage
# Write `d2` to d2.RData.
# NOTE(review): `d2` is not created in the code visible here (presumably the
# bigram result) - confirm it exists before this line runs.
save(d2, file = "d2.RData")
Trigram Model. The top 10 word frequencies and the word coverage line are also plotted