The goal of this project is to create a predictive text model using the en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt files. This report presents a brief exploratory analysis of the data to show that I'm on track toward completing the final Capstone Project.
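For completeness, the sketch below shows one way the SwiftKey dataset could be downloaded and unzipped before reading it in. The URL is the dataset link distributed in the course materials (substitute the current link from the course page if it has changed), and the ~/Desktop destination is only an assumption chosen to match the paths used later in this report.
# Download and unzip the SwiftKey dataset (destination paths are assumptions matching the readLines() calls below)
zip_url  <- "https://d396qusza40orc.cloudfront.net/dstraining/data/Coursera-SwiftKey.zip"
zip_file <- path.expand("~/Desktop/Coursera-SwiftKey.zip")
if (!file.exists(zip_file)) {
  download.file(zip_url, destfile = zip_file, mode = "wb")
}
unzip(zip_file, exdir = path.expand("~/Desktop"))  # extracts the final/en_US/ folder used below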
blogs <- readLines("~/Desktop/final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/Desktop/final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("~/Desktop/final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
library(stringi)
blogs_size <- file.info("~/Desktop/final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
news_size <- file.info("~/Desktop/final/en_US/en_US.news.txt")$size / 1024 ^ 2
twitter_size <- file.info("~/Desktop/final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
# Get words in files
blogs_words <- stri_count_words(blogs)
news_words <- stri_count_words(news)
twitter_words <- stri_count_words(twitter)
# Summary of the data sets
data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(blogs_size, news_size, twitter_size),
           num.lines = c(length(blogs), length(news), length(twitter)),
           num.words = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
           mean.num.words = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
## source file.size.MB num.lines num.words mean.num.words
## 1 blogs 200.4242 899288 37546250 41.75109
## 2 news 196.2775 1010242 34762395 34.40997
## 3 twitter 159.3641 2360148 30093413 12.75065
Before the exploratory analysis I'm going to clean the data. Because the full corpora are large, I'm going to use a random 1% sample of each file for this analysis.
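Since sample() draws at random, fixing a seed first makes the 1% sample (and therefore the figures below) reproducible. This is a minimal sketch; the seed value is arbitrary.
set.seed(1234)  # arbitrary seed so the same 1% sample is drawn on every run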
library(tm)
## Loading required package: NLP
data.sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(data.sample))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, content_transformer(tolower))  # content_transformer() keeps documents as PlainTextDocuments
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
Now that the data has been cleaned of punctuation, special characters, numbers, stop words, and URLs, it's time to visualize it with some graphs.
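Before plotting, it can be worth spot-checking a couple of cleaned documents to confirm the transformations behaved as expected. This is a minimal sketch; the document indices are arbitrary.
inspect(corpus[[1]])       # print one cleaned document
as.character(corpus[[2]])  # or pull out the raw text of another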
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
options(mc.cores = 1)  # use a single core; RWeka's Java-based tokenizers can fail with tm's parallel processing
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
makePlot <- function(data, label) {
  ggplot(data[1:20, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency of words") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("orange"))
}
# Getting the frequencies of most common n-grams in data
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus), 0.9999))
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.9999))
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
makePlot(freq1, "20 Most Common Unigrams (single words)")
makePlot(freq2, "20 Most Common Bigrams (two-word phrases)")
makePlot(freq3, "20 Most Common Trigrams (three-word phrases)")
This concludes the first part of the capstone project. The next step is to create a Shiny app that uses the same data to predict the next word as the user types.
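As a preview of how these n-gram tables could drive the prediction model behind the Shiny app, the sketch below implements a naive back-off lookup over the freq2 and freq3 data frames built above. The helper name predictNextWord and the back-off logic are my own illustration under these assumptions, not the final algorithm.
# Sketch of a naive next-word lookup over the bigram/trigram frequency tables.
# predictNextWord() is a hypothetical helper for illustration only.
predictNextWord <- function(phrase, bigrams = freq2, trigrams = freq3, n = 3) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  # Try the trigram table first: match rows starting with the last two words typed
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigrams[grepl(paste0("^", prefix, " "), trigrams$word), ]
    if (nrow(hits) > 0) return(head(sub(".* ", "", hits$word), n))
  }
  # Back off to the bigram table: match on the last word only
  prefix <- tail(words, 1)
  hits <- bigrams[grepl(paste0("^", prefix, " "), bigrams$word), ]
  head(sub(".* ", "", hits$word), n)  # tables are already sorted by frequency
}
predictNextWord("happy new")  # returns up to 3 candidate next words from the sampled corpus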