The goal of this project is to demonstrate that we have become familiar with the data and that we are on track to create our prediction algorithm. This report (to be published on RPubs (http://rpubs.com/)) explains our exploratory analysis and our goals for the eventual app and algorithm. The document is kept concise: it explains only the major features of the data we identified and briefly summarizes our plans for creating the prediction algorithm and Shiny app in a way that is understandable to a non-data-scientist manager. We make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to: demonstrate that we have downloaded the data and successfully loaded it; create a basic report of summary statistics about the data sets; report any interesting findings amassed so far; and get feedback on our plans for creating the prediction algorithm and Shiny app.
library(R.utils) # various R programming facilities
library(ngram) # R package for constructing n-grams (“tokenizing”), as well as generating new text based on the n-gram structure of a given text input (“babbling”)
library(dplyr) # data manipulation utilities
library(tidytext) # tidy text mining package
library(textmineR) # functions for text mining and topic modeling
library(tau) # text analysis utilities
library(stringi) # character string processing package
library(tm) # text mining package
library(RWeka) # a collection of machine learning algorithms for data mining tasks written in Java
library(SnowballC) # an R interface to the C 'libstemmer' library that implements Porter's word-stemming algorithm, collapsing words to a common root to aid comparison of vocabulary
blogs <- "final/en_US/en_US.blogs.txt"
news <- "final/en_US/en_US.news.txt"
twitter <- "final/en_US/en_US.twitter.txt"
Read in the en_US.blogs.txt file…
blog_line <- readLines(blogs, encoding = "UTF-8", skipNul = TRUE)
num_blog_lines <- sapply(blogs,countLines) # count number of lines in file
num_blog_words <- wordcount(blog_line, sep = " ", count.function = sum) # count number of words in file
print(paste("Number of lines - blogs: ",num_blog_lines))
## [1] "Number of lines - blogs: 899288"
print(paste("Number of words - blogs: ",num_blog_words))
## [1] "Number of words - blogs: 37334131"
Sample 90,000 lines from the en_US.blogs.txt file and clean the sample into a corpus…
set.seed(42) # fix the random seed so the sample is reproducible
sampleBlogLines <- blog_line[sample(1:length(blog_line),90000)] # sample 90000 lines from `blog_line`
blog_corpus = VCorpus(VectorSource(sampleBlogLines)) # build a volatile corpus from the sampled lines
blog_corpus = tm_map(blog_corpus, content_transformer(tolower)) # convert to lower case
blog_corpus = tm_map(blog_corpus, removeNumbers) # drop digits
blog_corpus = tm_map(blog_corpus, removePunctuation) # drop punctuation
blog_corpus = tm_map(blog_corpus, removeWords, stopwords()) # remove English stop words
blog_corpus = tm_map(blog_corpus, stemDocument) # stem words to a common root
blog_corpus = tm_map(blog_corpus, stripWhitespace) # collapse extra whitespace
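As a quick sanity check on the cleaning pipeline, one might inspect a couple of processed documents (an optional check, not shown in the original output)…
lapply(blog_corpus[1:2], as.character) # print the first two cleaned documents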
Summarize the en_US.blogs.txt sample corpus by creating a TermDocumentMatrix object and converting it to a data frame…
blog_tdm <- as.data.frame(tidy(TermDocumentMatrix(blog_corpus)))
blog_freq_df <- blog_tdm %>%
  select(term, count) %>% # drop the document column
  group_by(term) %>% # aggregate word counts by `term`
  summarise(total = sum(count)) %>%
  arrange(desc(total)) # order terms by descending frequency
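With the frequency table in hand, a bar plot of the most common terms illustrates the distribution; here is a minimal sketch using base R graphics (the same pattern applies to the news and twitter tables built below)…
top_blog <- head(blog_freq_df, 20) # twenty most frequent stemmed terms
barplot(top_blog$total, names.arg = top_blog$term, las = 2,
        main = "Top 20 terms - blogs sample", ylab = "frequency")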
Read in the en_US.news.txt file…
news_line <- readLines(news, encoding = "UTF-8", skipNul = TRUE)
num_news_lines <- sapply(news,countLines) # count number of lines in file
num_news_words <- wordcount(news_line, sep = " ", count.function = sum) # count number of words in file
print(paste("Number of lines - news: ",num_news_lines))
## [1] "Number of lines - news: 1010242"
print(paste("Number of words - news: ",num_news_words))
## [1] "Number of words - news: 34372530"
Sample 90,000 lines from the en_US.news.txt file and clean the sample into a corpus…
set.seed(42)
sampleNewsLines <- news_line[sample(1:length(news_line),90000)] # sample 90000 lines from `news_line`
news_corpus = VCorpus(VectorSource(sampleNewsLines))
news_corpus = tm_map(news_corpus, content_transformer(tolower))
news_corpus = tm_map(news_corpus, removeNumbers)
news_corpus = tm_map(news_corpus, removePunctuation)
news_corpus = tm_map(news_corpus, removeWords, stopwords())
news_corpus = tm_map(news_corpus, stemDocument)
news_corpus = tm_map(news_corpus, stripWhitespace)
Summarize the en_US.news.txt sample corpus by creating a TermDocumentMatrix object and converting it to a data frame…
news_tdm <- as.data.frame(tidy(TermDocumentMatrix(news_corpus)))
news_freq_df <- news_tdm %>%
  select(term, count) %>% # drop the document column
  group_by(term) %>% # aggregate word counts by `term`
  summarise(total = sum(count)) %>%
  arrange(desc(total)) # order terms by descending frequency
Read in the en_US.twitter.txt file…
twitter_line <- readLines(twitter, encoding = "UTF-8", skipNul = TRUE)
num_twitter_lines <- sapply(twitter,countLines) # count number of lines in file
num_twitter_words <- wordcount(twitter_line, sep = " ", count.function = sum) # count number of words in file
print(paste("Number of lines - twitter: ",num_twitter_lines))
## [1] "Number of lines - twitter: 2360148"
print(paste("Number of words - twitter: ",num_twitter_words))
## [1] "Number of words - twitter: 30373583"
Sample 90,000 lines from the en_US.twitter.txt file and clean the sample into a corpus…
set.seed(42)
sampleTwitterLines <- twitter_line[sample(1:length(twitter_line),90000)] # sample 90000 lines from `twitter_line`
twitter_corpus = VCorpus(VectorSource(sampleTwitterLines))
twitter_corpus = tm_map(twitter_corpus, content_transformer(tolower))
twitter_corpus = tm_map(twitter_corpus, removeNumbers)
twitter_corpus = tm_map(twitter_corpus, removePunctuation)
twitter_corpus = tm_map(twitter_corpus, removeWords, stopwords())
twitter_corpus = tm_map(twitter_corpus, stemDocument)
twitter_corpus = tm_map(twitter_corpus, stripWhitespace)
Summarize the en_US.twitter.txt sample corpus by creating a TermDocumentMatrix object and converting it to a data frame…
twitter_tdm <- as.data.frame(tidy(TermDocumentMatrix(twitter_corpus)))
twitter_freq_df <- twitter_tdm %>%
  select(term, count) %>% # drop the document column
  group_by(term) %>% # aggregate word counts by `term`
  summarise(total = sum(count)) %>%
  arrange(desc(total)) # order terms by descending frequency
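The per-file counts computed above can be collected into a single summary table (a small sketch that only reuses variables already defined in this report)…
file_summary <- data.frame( # hypothetical helper, not part of the original report
  source = c("blogs", "news", "twitter"),
  lines = c(num_blog_lines, num_news_lines, num_twitter_lines),
  words = c(num_blog_words, num_news_words, num_twitter_words)
)
print(file_summary)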
Tokenize the blog sample corpus into unigrams, bigrams, and trigrams using RWeka's NGramTokenizer…
UnigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 1, max = 1))
}
unigrams <- DocumentTermMatrix(blog_corpus, control = list(tokenize = UnigramTokenizer)) # one-word terms
BigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
bigrams <- DocumentTermMatrix(blog_corpus, control = list(tokenize = BigramTokenizer)) # two-word terms
TrigramTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
trigrams <- DocumentTermMatrix(blog_corpus, control = list(tokenize = TrigramTokenizer)) # three-word terms
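These n-gram matrices feed directly into the planned prediction algorithm: ranking n-grams by total frequency provides the raw material for a back-off style next-word model. A minimal sketch of that ranking step, assuming the slam package (a dependency of tm) is available…
library(slam) # sparse-matrix utilities used internally by tm
bigram_freq <- sort(col_sums(bigrams), decreasing = TRUE) # total count of each bigram across documents
trigram_freq <- sort(col_sums(trigrams), decreasing = TRUE) # total count of each trigram across documents
head(bigram_freq, 10) # the ten most common bigrams in the blog sample
The eventual app could then look up the most frequent trigram beginning with the user's last two words and fall back to bigrams, and then unigrams, when no match is found.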