The goal of this project is to demonstrate that we’ve become familiar with the data and that we are on track to create our prediction algorithm. This report (to be submitted on RPubs (http://rpubs.com/)) explains our exploratory analysis and our goals for the eventual app and algorithm. The document is intentionally concise: it explains only the major features of the data we’ve identified and briefly summarizes our plans for creating the prediction algorithm and Shiny app, in a way that is understandable to a non-data-scientist manager. We make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that we have downloaded the data and successfully loaded it.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on our plans for creating a prediction algorithm and a Shiny app.
library(R.utils) # various R programming facilities
library(ggplot2) # ggplot plotting package
library(ngram) # R package for constructing n-grams (“tokenizing”), as well as generating new text based on the n-gram structure of a given text input (“babbling”)
library(dplyr) # data manipulation utilities
library(slam) # sparse lightweight arrays and matrices
library(tidytext) # tidy text mining package
library(textmineR) # functions for text mining and topic modeling
library(tau) # text analysis utilities
library(stringi) # character string processing package
library(tm) # text mining package
library(RWeka) # R interface to Weka, a collection of machine learning algorithms for data mining tasks written in Java
library(SnowballC) # an R interface to the C 'libstemmer' library, which implements Porter's word-stemming algorithm for collapsing words to a common root to aid comparison of vocabulary
blogs <- "final/en_US/en_US.blogs.txt"
news <- "final/en_US/en_US.news.txt"
twitter <- "final/en_US/en_US.twitter.txt"
First, we read en_US.blogs.txt:
blog_line <- readLines(blogs, encoding = "UTF-8", skipNul = TRUE)
num_blog_lines <- sapply(blogs,countLines) # count number of lines in file
num_blog_words <- wordcount(blog_line, sep = " ", count.function = sum) # count number of words in file
print(paste("Number of lines - blogs: ",num_blog_lines))
## [1] "Number of lines - blogs: 899288"
print(paste("Number of words - blogs: ",num_blog_words))
## [1] "Number of words - blogs: 37334131"
Next, we read en_US.news.txt:
news_line <- readLines(news, encoding = "UTF-8", skipNul = TRUE)
num_news_lines <- sapply(news,countLines) # count number of lines in file
num_news_words <- wordcount(news_line, sep = " ", count.function = sum) # count number of words in file
print(paste("Number of lines - news: ",num_news_lines))
## [1] "Number of lines - news: 1010242"
print(paste("Number of words - news: ",num_news_words))
## [1] "Number of words - news: 34372530"
Finally, we read en_US.twitter.txt:
twitter_line <- readLines(twitter, encoding = "UTF-8", skipNul = TRUE)
num_twitter_lines <- sapply(twitter,countLines) # count number of lines in file
num_twitter_words <- wordcount(twitter_line, sep = " ", count.function = sum) # count number of words in file
print(paste("Number of lines - twitter: ",num_twitter_lines))
## [1] "Number of lines - twitter: 2360148"
print(paste("Number of words - twitter: ",num_twitter_words))
## [1] "Number of words - twitter: 30373583"
Because the .txt files are large, we draw a 5% random sample of lines from each source. Setting a seed makes the sampling reproducible.
set.seed(42)
data.sample <- c(blog_line[sample(1:length(blog_line), length(blog_line) * 0.05)],
                 news_line[sample(1:length(news_line), length(news_line) * 0.05)],
                 twitter_line[sample(1:length(twitter_line), length(twitter_line) * 0.05)])
We then remove the blog_line, news_line, and twitter_line objects to free up memory, since we don’t need them anymore:
rm(list = c("blog_line", "news_line", "twitter_line"))
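Optionally, an explicit garbage-collection call encourages R to return the freed memory to the operating system:
gc() # optional: run garbage collection and report memory usage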
Next, we build a corpus from the sample and clean it with the tm package: convert to lower case; remove numbers, punctuation, and English stop words; stem each document; and strip extra whitespace.
sample_corpus <- VCorpus(VectorSource(data.sample))
sample_corpus <- tm_map(sample_corpus, content_transformer(tolower))
sample_corpus <- tm_map(sample_corpus, removeNumbers)
sample_corpus <- tm_map(sample_corpus, removePunctuation)
sample_corpus <- tm_map(sample_corpus, removeWords, stopwords())
sample_corpus <- tm_map(sample_corpus, stemDocument)
sample_corpus <- tm_map(sample_corpus, stripWhitespace)
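As a quick sanity check (a minimal sketch, not part of the pipeline itself), we can inspect a few cleaned documents to confirm the transformations behaved as expected:
# Peek at the first three cleaned documents
lapply(sample_corpus[1:3], as.character)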
n_grams_plot <- function(n, data) {
  options(mc.cores = 1) # RWeka tokenizers can misbehave with parallel workers; force a single core
  # Build an n-gram tokenizer
  token <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  # Create a term-document matrix of n-grams
  ngrams_matrix <- TermDocumentMatrix(data, control = list(tokenize = token))
  # Collapse counts across documents and reshape for easy visualization
  ngrams_matrix <- as.matrix(rollup(ngrams_matrix, 2, na.rm = TRUE, FUN = sum))
  ngrams_matrix <- data.frame(word = rownames(ngrams_matrix), freq = ngrams_matrix[, 1])
  # Keep the 15 most frequent n-grams
  ngrams_matrix <- ngrams_matrix[order(-ngrams_matrix$freq), ][1:15, ]
  ngrams_matrix$word <- factor(ngrams_matrix$word, as.character(ngrams_matrix$word))
  # Plot the frequencies
  ggplot(ngrams_matrix, aes(x = word, y = freq)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
    xlab(paste0(n, "-grams")) +
    ylab("Frequency") +
    ggtitle(paste0(n, "-gram Frequencies")) +
    theme(plot.title = element_text(hjust = 0.5))
}
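To make the tokenizer step concrete, here is what RWeka's NGramTokenizer produces on a toy sentence when min and max are both 2 (illustrative only):
# Bigram tokenization of a toy string
NGramTokenizer("one of the best", Weka_control(min = 2, max = 2))
## expected output: "one of" "of the" "the best"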
Let’s plot the frequency distribution of 1-grams (unigrams):
n_grams_plot(n=1, data=sample_corpus)
Let’s plot the frequency distribution of 2-grams (bigrams):
n_grams_plot(n=2, data=sample_corpus)
Let’s plot the frequency distribution of 3-grams (trigrams):
n_grams_plot(n=3, data=sample_corpus)
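Looking ahead, our plan for the prediction algorithm is to use n-gram frequency tables like these for next-word prediction with a simple backoff: look up the longest matching prefix and suggest its most frequent continuations. The sketch below is illustrative only; predict_next and its bigram_freq input (a word/freq table like those built above) are hypothetical names, not part of this report's code. The eventual Shiny app will wrap such a lookup behind a text box that shows the top suggestions as the user types.
# Illustrative next-word lookup from a bigram frequency table
# (hypothetical sketch; bigram_freq has columns word and freq)
predict_next <- function(last_word, bigram_freq, k = 3) {
  # keep bigrams whose first token matches the last observed word
  hits <- bigram_freq[grepl(paste0("^", last_word, " "), bigram_freq$word), ]
  # order by frequency and keep the top k
  hits <- hits[order(-hits$freq), ][seq_len(min(k, nrow(hits))), ]
  # return the second token of each surviving bigram
  sub("^\\S+\\s+", "", hits$word)
}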