Capstone Progress Report

This report summarizes the data cleaning and exploratory data analysis performed while text mining a corpus of Twitter, blog, and news data, with the goal of developing a predictive algorithm for natural language processing.

Load the necessary libraries.

library(dplyr)      # data manipulation (filter)
library(tm)         # text mining framework
library(ggplot2)    # plotting
library(RWeka)      # n-gram tokenizers
library(SnowballC)  # stemming

# Extract swiftkey.zip and read in the three files for your chosen language
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8") 
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8") 
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8") 

# Basic info on the data set

summary(twitter)
##    Length     Class      Mode 
##   2360148 character character
summary(blogs)
##    Length     Class      Mode 
##    899288 character character
summary(news)
##    Length     Class      Mode 
##     77259 character character
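
Line counts alone understate the size of each file. As an optional sketch, word counts could also be computed, assuming the stringi package is available (it is not among the libraries loaded above):

library(stringi)                 # assumed to be installed; not part of the original analysis
sum(stri_count_words(twitter))   # total words in the Twitter file
sum(stri_count_words(blogs))     # total words in the blogs file
sum(stri_count_words(news))      # total words in the news file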

Create a random sample for modeling, then save it and clear your workspace.

set.seed(39)
sampleTwitter <- twitter[sample(1:length(twitter),10000)]
sampleNews <- news[sample(1:length(news),10000)]
sampleBlogs <- blogs[sample(1:length(blogs),10000)]
sampleData <- c(sampleTwitter,sampleNews,sampleBlogs)
if (!dir.exists("./sample")) dir.create("./sample")  # make sure the output folder exists
writeLines(sampleData, "./sample/sampleData.txt")
rm(twitter,news,blogs,sampleTwitter,sampleNews,sampleBlogs,sampleData)

Create a corpus from your sample and begin cleaning it: convert to lower case, then remove punctuation, numbers, special characters, stop words, and profanity.

library(tm)
library(SnowballC)
cname <- file.path(".", "sample")
docs <- Corpus(DirSource(cname))
docs <- tm_map(docs, content_transformer(tolower))
# Replace slashes, @ signs, and pipe characters with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeWords, stopwords("en"))   
# Remove profanity using a bad-words list
googlebadwords <- read.delim("google_badwords.txt", sep = ":", header = FALSE)
googlebadwords <- googlebadwords[, 1]
docs <- tm_map(docs, removeWords, googlebadwords)
# Stem the cleaned corpus
NewDoc <- tm_map(docs, stemDocument)
# Helper: build a data frame of terms and their frequencies from a term-document matrix
freq_frame <- function(tdm){
  
  Freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
  freq_frame <- data.frame(word=names(Freq), Freq=Freq)
  return(freq_frame)
}
# Tokenize the sample into unigrams, bigrams, and trigrams using RWeka's
# NGramTokenizer together with tm's TermDocumentMatrix
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
# Rebuild as a VCorpus so the custom RWeka tokenizers work with TermDocumentMatrix
NewDoc <- VCorpus(VectorSource(NewDoc))
# Unigrams
Jimboterms1a <- TermDocumentMatrix(NewDoc)
Jimboterms1 <- removeSparseTerms(Jimboterms1a, 0.99)
freq1_frame <- freq_frame(Jimboterms1)
freq1_frame <- filter(freq1_frame, Freq > 1500)

# Bigrams
Jimboterms2a <- TermDocumentMatrix(NewDoc, control = list(tokenize = BigramTokenizer))
Jimboterms2 <- removeSparseTerms(Jimboterms2a, 0.999)
freq2_frame <- freq_frame(Jimboterms2)
freq2_frame <- filter(freq2_frame, Freq > 100)

# Trigrams
Jimboterms3a <- TermDocumentMatrix(NewDoc, control = list(tokenize = TrigramTokenizer))
Jimboterms3 <- removeSparseTerms(Jimboterms3a, 0.9999)
freq3_frame <- freq_frame(Jimboterms3)
freq3_frame <- filter(freq3_frame, Freq > 15)

Generate plots of the most common words, as sketched below.

[Plots of the most common words appeared here.]
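
As a sketch of how these plots could be generated with ggplot2, using the freq1_frame, freq2_frame, and freq3_frame tables built above (plot_freq is a hypothetical helper introduced here for illustration):

# Sketch: horizontal bar charts of the most frequent terms in each n-gram table
plot_freq <- function(freq_df, title) {
  ggplot(freq_df, aes(x = reorder(word, Freq), y = Freq)) +
    geom_col() +
    coord_flip() +
    labs(x = NULL, y = "Frequency", title = title)
}
plot_freq(freq1_frame, "Most common unigrams")
plot_freq(freq2_frame, "Most common bigrams")
plot_freq(freq3_frame, "Most common trigrams")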

Predictive Analysis and Shiny App

The next steps are to build a predictive model based on word frequencies and word association. I will read in a larger portion of the data to create a bigger training set. The planned basis for the algorithm is a random forest model. Once the model is built, I will deploy it via a Shiny app.
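
As a rough sketch of the kind of frequency-based lookup the model will build on (not the final random forest implementation; predict_next is a hypothetical helper), a simple bigram lookup over freq2_frame might look like:

# Sketch only: return the most frequent word that follows current_word in the
# bigram table (rows are already sorted by frequency, highest first)
predict_next <- function(current_word, bigram_freqs) {
  matches <- grepl(paste0("^", current_word, " "), bigram_freqs$word)
  candidates <- bigram_freqs[matches, ]
  if (nrow(candidates) == 0) return(NA_character_)
  sub(".* ", "", as.character(candidates$word[1]))
}
predict_next("happy", freq2_frame)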