This report summarizes the data cleaning and exploratory data analysis performed while text mining a corpus of Twitter, blog, and news data, with the goal of developing a predictive algorithm for Natural Language Processing.
library(dplyr)
library(tm)
library(ggplot2)
library(RWeka)
library(SnowballC)
# Extract the SwiftKey zip file and read in the three files for your chosen language
twitter<-readLines("en_US.twitter.txt",warn=FALSE,encoding="UTF-8")
blogs<-readLines("en_US.blogs.txt",warn=FALSE,encoding="UTF-8")
news<-readLines("en_US.news.txt",warn=FALSE,encoding="UTF-8")
#Basic Info on the data set
summary(twitter)
##    Length     Class      Mode
##   2360148 character character
summary(blogs)
##    Length     Class      Mode
##    899288 character character
summary(news)
##    Length     Class      Mode
##     77259 character character
Create a random sample for modeling, then save it and clear your workspace.
set.seed(39)
sampleTwitter <- twitter[sample(1:length(twitter),10000)]
sampleNews <- news[sample(1:length(news),10000)]
sampleBlogs <- blogs[sample(1:length(blogs),10000)]
sampleData <- c(sampleTwitter,sampleNews,sampleBlogs)
if (!dir.exists("./sample")) dir.create("./sample")
writeLines(sampleData, "./sample/sampleData.txt")
rm(twitter,news,blogs,sampleTwitter,sampleNews,sampleBlogs,sampleData)
Create a corpus from your sample and clean it: convert to lower case; remove punctuation, special characters, numbers, extra whitespace, stop words, and profanity; then stem the documents.
cname <- file.path(".", "sample")
docs <- Corpus(DirSource(cname))
docs <- tm_map(docs, content_transformer(tolower))
# Replace slashes, @ symbols, and pipes with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, removeWords, stopwords("en"))
# Remove profanity using Google's bad-words list
googlebadwords <- read.delim("google_badwords.txt", sep = ":", header = FALSE)
googlebadwords <- googlebadwords[, 1]
docs <- tm_map(docs, removeWords, googlebadwords)
# Stem the cleaned corpus
NewDoc <- tm_map(docs, stemDocument)
# Helper: build a word-frequency data frame from a term-document matrix
freq_frame <- function(tdm){
  Freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  data.frame(word = names(Freq), Freq = Freq)
}
# Tokenize the sample into unigrams, bigrams, and trigrams using the tm and RWeka packages.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
# Re-wrap as a VCorpus so TermDocumentMatrix works with the RWeka tokenizers
NewDoc <- VCorpus(VectorSource(NewDoc))
# Unigram term-document matrix; drop sparse terms, then keep the most frequent words
Jimboterms1a <- TermDocumentMatrix(NewDoc)
Jimboterms1 <- removeSparseTerms(Jimboterms1a, 0.99)
freq1_frame <- freq_frame(Jimboterms1)
freq1_frame <- filter(freq1_frame, Freq > 1500)
# Bigram term-document matrix
Jimboterms2a <- TermDocumentMatrix(NewDoc, control = list(tokenize = BigramTokenizer))
Jimboterms2 <- removeSparseTerms(Jimboterms2a, 0.999)
freq2_frame <- freq_frame(Jimboterms2)
freq2_frame <- filter(freq2_frame, Freq > 100)
# Trigram term-document matrix
Jimboterms3a <- TermDocumentMatrix(NewDoc, control = list(tokenize = TrigramTokenizer))
Jimboterms3 <- removeSparseTerms(Jimboterms3a, 0.9999)
freq3_frame <- freq_frame(Jimboterms3)
freq3_frame <- filter(freq3_frame, Freq > 15)
Generate plots of the most common unigrams, bigrams, and trigrams, as sketched in the code below.
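The following is a minimal ggplot2 sketch of how the filtered frequency tables above can be turned into bar charts; the object names uniplot, biplot, and triplot are illustrative choices rather than fixed parts of the analysis.
library(ggplot2)
# Bar chart of the most frequent unigrams (Freq > 1500 after filtering above)
uniplot <- ggplot(freq1_frame, aes(x = reorder(word, Freq), y = Freq)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Common Unigrams", x = "Word", y = "Frequency")
# Bar chart of the most frequent bigrams (Freq > 100)
biplot <- ggplot(freq2_frame, aes(x = reorder(word, Freq), y = Freq)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Common Bigrams", x = "Bigram", y = "Frequency")
# Bar chart of the most frequent trigrams (Freq > 15)
triplot <- ggplot(freq3_frame, aes(x = reorder(word, Freq), y = Freq)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Common Trigrams", x = "Trigram", y = "Frequency")
uniplot
biplot
triplot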
# Predictive Analysis and Shiny App
The next steps are to build a predictive model based on word frequencies and word associations. I will read in more of the data to create a larger training set. The basis for the algorithm is a random forest model. After construction, I will deploy the algorithm via a Shiny app.
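As a rough illustration of how the frequency tables above could feed next-word prediction, here is a hypothetical bigram lookup; the function predict_next_word is my own sketch and is not the random forest model planned for the final app.
# Illustrative only: a simple frequency-based next-word lookup built from the
# bigram table above (freq2_frame)
predict_next_word <- function(current_word, bigrams = freq2_frame, n = 3) {
  # Keep bigrams whose first word matches the input
  matches <- bigrams[grepl(paste0("^", current_word, " "), bigrams$word), ]
  # Order by frequency and return the most likely following words
  matches <- matches[order(-matches$Freq), ]
  candidates <- sub(paste0("^", current_word, " "), "", matches$word)
  head(candidates, n)
}
# Example: most likely words to follow "new" in the sample
predict_next_word("new")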