## Overview
After downloading the data, we explore it before doing anything else, to see whether it contains anything interesting before cleaning.
## Load the libraries and data
library(tm)
## Loading required package: NLP
blogs <- readLines("./final/en_US/en_US.blogs.txt")
news <- readLines("./final/en_US/en_US.news.txt")
## Warning in readLines("./final/en_US/en_US.news.txt"): incomplete final line
## found on './final/en_US/en_US.news.txt'
twitter <- readLines("./final/en_US/en_US.twitter.txt")
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 167155 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 268547 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 1759032 appears to
## contain an embedded nul
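The embedded-nul and incomplete-final-line warnings are harmless for this exploration. If we wanted to read the files without them, readLines() accepts skipNul = TRUE and warn = FALSE; a minimal sketch (not used in the rest of this report):
twitter <- readLines("./final/en_US/en_US.twitter.txt", skipNul = TRUE, warn = FALSE)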
## Word counts and summary of the data
max(nchar(blogs))
## [1] 40835
max(nchar(news))
## [1] 5760
max(nchar(twitter))
## [1] 213
summary(blogs)
## Length Class Mode
## 899288 character character
summary(news)
## Length Class Mode
## 77259 character character
summary(twitter)
## Length Class Mode
## 2360148 character character
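The summaries above only give the number of lines in each file. A rough word count, simply splitting each line on whitespace with base R, could be computed as below; this is an approximation rather than an exact token count:
sum(sapply(strsplit(blogs, "\\s+"), length))
sum(sapply(strsplit(news, "\\s+"), length))
sum(sapply(strsplit(twitter, "\\s+"), length))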
The datasets are quite large, so we take a subset of each one to use as a training set.
## Subset the data
blog.sample <- sample(blogs,10000)
news.sample <- sample(news,10000)
twitter.sample <- sample(twitter,10000)
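Note that sample() draws a different subset every time the report is run. If we want the sample (and everything downstream) to be reproducible, a fixed seed could be set before the three sample() calls above; the value below is arbitrary:
set.seed(1234)  # any fixed seed makes the sampling reproducible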
## Create corpus
sampledata <- c(blog.sample,news.sample,twitter.sample)
writeLines(sampledata,"output/sampledata.txt")
sample_data <- readLines("./output/sampledata.txt")
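writeLines() will fail if the output directory does not exist yet, so it may need to be created first; a small guard, assuming we keep the same path:
if (!dir.exists("output")) dir.create("output")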
We take 10,000 lines from each of the files and combine them into one file.
## Cleaning the Data
We need to remove punctuation, numbers, stop words and extra white space. We also have to convert capital letters to lower case.
sample_data <- removePunctuation(sample_data)
sample_data <- tolower(sample_data)
sample_data <- removeNumbers(sample_data)
sample_data <- stemDocument(sample_data)
sample_data <- removeWords(sample_data, c('the', stopwords("english")))
sample_data <- stripWhitespace(sample_data)
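To illustrate what these steps do, here is the same pipeline applied to a single made-up sentence (the example text is ours, not taken from the corpus):
x <- "The 3 quick Brown Foxes were jumping over 2 lazy dogs!"
x <- removePunctuation(x)
x <- tolower(x)
x <- removeNumbers(x)
x <- stemDocument(x)
x <- removeWords(x, c('the', stopwords("english")))
x <- stripWhitespace(x)
x  # lower-case, stemmed text with punctuation, numbers and stop words removed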
## Tokenization
Next, we separate the words from each other, which will make it easier to build the model.
sample_data <- paste0(unlist(sample_data), collapse=" ")
sample_data <- strsplit(sample_data, " ", fixed=TRUE)[[1L]]
sample_data <- sample_data[sample_data != ""]
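A quick sanity check on the resulting token vector (the exact output depends on the random sample):
length(sample_data)    # total number of tokens in the sample
head(sample_data, 10)  # first few tokens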
## Unigram
one_grams <- as.data.frame(sample_data)
one_grams <- sort(table(one_grams), decreasing = T)
## Bigram
bigrams <- vapply(ngrams(sample_data, 2L), paste, "", collapse=" ")
two_grams <- as.data.frame(bigrams)
two_grams <- sort(table(two_grams), decreasing = T)
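Before plotting, the most frequent terms can be inspected directly; the counts depend on the random sample:
head(one_grams, 10)  # most frequent single words
head(two_grams, 10)  # most frequent word pairs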
## Histogram
Top 5 unigrams
unigram_top5 <- one_grams[1:5]
barplot(unigram_top5)
Top 5 bigrams
bigram_top5 <- two_grams[1:5]
barplot(bigram_top5)
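The default bar labels can be hard to read for bigrams; an optional tweak rotates the labels and adds a title using base graphics arguments:
barplot(bigram_top5, las = 2, cex.names = 0.7, main = "Top 5 bigrams")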