Goal of this assignment

to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm.

My steps in the text data analysis are listed below:

Set the working directory

setwd("C:/Users/mllrg/Desktop/final/en_US")

load libraries

* This will also be completed throughout this analysis

library(tm)
library(NLP)
library(SnowballC)
library(syuzhet)

read the text file

    # blogs
blogsFileName <- "en_US.blogs.txt"
con <- file(blogsFileName, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

# news
newsFileName <- "en_US.news.txt"
con <- file(newsFileName, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)

close(con)

# twitter
twitterFileName <- "en_US.twitter.txt"
con <- file(twitterFileName, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

rm(con)

Summary and Sampling of data

library(stringi)
library(kableExtra)

assigning a sample size of 0.01, due to the large amount of original data

# assign sample size
sampleSize = 0.01

# file size
fileSizeMB <- round(file.info(c(blogsFileName,
                                newsFileName,
                                twitterFileName))$size / 1024 ^ 2)

# num lines per file
numLines <- sapply(list(blogs, news, twitter), length)

# num characters per file
numChars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)

# num words per file
numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]

# words per line
wpl <- lapply(list(blogs, news, twitter), function(x) stri_count_words(x))

# words per line summary
wplSummary = sapply(list(blogs, news, twitter),
             function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(wplSummary) = c('WPL.Min', 'WPL.Mean', 'WPL.Max')

summary <- data.frame(
    File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
    FileSize = paste(fileSizeMB, " MB"),
    Lines = numLines,
    Characters = numChars,
    Words = numWords,
    t(rbind(round(wplSummary)))
)

kable(summary,
      row.names = FALSE,
      align = c("l", rep("r", 7)),
      caption = "") %>% kable_styling(position = "left")
File FileSize Lines Characters Words WPL.Min WPL.Mean WPL.Max
en_US.blogs.txt 200 MB 899288 206824505 37570839 0 42 6726
en_US.news.txt 196 MB 77259 15639408 2651432 1 35 1123
en_US.twitter.txt 159 MB 2360148 162096241 30451170 1 13 47

Exploratory Analysis

library(ggplot2)
library(gridExtra)

Plot the data for words per line

plot1 <- qplot(wpl[[1]],
               geom = "histogram",
               main = "US Blogs",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)

plot2 <- qplot(wpl[[2]],
               geom = "histogram",
               main = "US News",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)

plot3 <- qplot(wpl[[3]],
               geom = "histogram",
               main = "US Twitter",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 1)

plotList = list(plot1, plot2, plot3)
do.call(grid.arrange, c(plotList, list(ncol = 1)))

# remove variables for memory purposes
rm(plot1, plot2, plot3)

Clean data

# set seed for reproducability
set.seed(600000)

# sample all three data sets
sampleBlogs <- sample(blogs, length(blogs) * sampleSize, replace = FALSE)
sampleNews <- sample(news, length(news) * sampleSize, replace = FALSE)
sampleTwitter <- sample(twitter, length(twitter) * sampleSize, replace = FALSE)

# remove all non-English characters from the sampled data
sampleBlogs <- iconv(sampleBlogs, "latin1", "ASCII", sub = "")
sampleNews <- iconv(sampleNews, "latin1", "ASCII", sub = "")
sampleTwitter <- iconv(sampleTwitter, "latin1", "ASCII", sub = "")

# combine all three data sets into a single data set and write to disk
sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)
sampleDataFileName <- "en_US.sample.txt"
con <- file(sampleDataFileName, open = "w")
writeLines(sampleData, con)
close(con)

# get number of lines and words from the sample data set
sampleDataLines <- length(sampleData);
sampleDataWords <- sum(stri_count_words(sampleData))

Clean data further and Build the Corpus

library(tm)

# download bad words file
badWordsURL <- "https://www.cs.cmu.edu/~biglou/resources/"
badWordsFile <- "bad-words.txt"
if (!file.exists('data')) {
    dir.create('data')
}
if (!file.exists(badWordsFile)) {
    tempFile <- tempfile()
    download.file(badWordsURL, tempFile)
    unzip(tempFile, exdir = "data")
    unlink(tempFile)
}

buildCorpus <- function (dataSet) {
    docs <- VCorpus(VectorSource(dataSet))
    toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
    
    # remove URL, Twitter handles and email patterns
    docs <- tm_map(docs, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
    docs <- tm_map(docs, toSpace, "@[^\\s]+")
    docs <- tm_map(docs, toSpace, "\\b[A-Z a-z 0-9._ - ]*[@](.*?)[.]{1,3} \\b")
    
    # remove profane words from the sample data set
    con <- file(badWordsFile, open = "r")
    profanity <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
    close(con)
    profanity <- iconv(profanity, "latin1", "ASCII", sub = "")
    docs <- tm_map(docs, removeWords, profanity)
    
    docs <- tm_map(docs, tolower)
    docs <- tm_map(docs, removeWords, stopwords("english"))
    docs <- tm_map(docs, removePunctuation)
    docs <- tm_map(docs, removeNumbers)
    docs <- tm_map(docs, stripWhitespace)
    docs <- tm_map(docs, PlainTextDocument)
    return(docs)
}

# build the corpus and write to disk (RDS)
corpus <- buildCorpus(sampleData)
saveRDS(corpus, file = "en_US.corpus.rds")

# convert corpus to a dataframe and write lines/words to disk (text)
corpusText <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
con <- file("en_US.corpus.txt", open = "w")
writeLines(corpusText$text, con)
close(con)

kable(head(corpusText$text, 10),
      row.names = FALSE,
      col.names = NULL,
      align = c("l"),
      caption = "First 10 Documents") %>% kable_styling(position = "left")
First 10 Documents
joe paterno
page passenger list steam ship ionian arrived montreal glasgow th may found james listed age labourer born scotland however also salvation army written next name majority passengers first two pages possibly mean massive recruitment drive invaded canada spread word
im going talk main character luke kincaid alike dislike
amazing prizes won thanks generous sponsors penny black stampalot crimson cloud crafts u love
nineteen years ago baby almost weeks pregnant time baby first seemed well visit antenatal clinic changed everything blood pressure high protein sent hospital admitted immediately dawn next morning caesarean section tiny lbs oz daughter fighting life special care
mark henry defeated randy orton become new world heavyweight champion worlds greatest slam wow incredible match two good mark company years receiving biggest push date earned title people stunned
taste nice eaten pickled green chilies bird eye chilies
scuffed feet stared ground debating take
now home unpacked laundry done put away think wrap order chances last trip magic kingdom learned quite bit take show road
everyone sits back see results television starting pm
# remove variables for memory purposes
rm(sampleData)

Construct visuals for word frequency

library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.1.3
library(RColorBrewer)

Plot the 10 most frequent words

tdm <- TermDocumentMatrix(corpus)
tdm<-tdm[1:12000, 1:12000]
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordFreq <- data.frame(word = names(freq), freq = freq)

# plot the top 10 most frequent words
g <- ggplot (wordFreq[1:10,], aes(x = reorder(wordFreq[1:10,]$word, -wordFreq[1:10,]$fre),
                                  y = wordFreq[1:10,]$fre ))
g <- g + geom_bar( stat = "Identity" , fill = I("grey50"))
g <- g + geom_text(aes(label = wordFreq[1:10,]$fre), vjust = -0.20, size = 3)
g <- g + xlab("")
g <- g + ylab("Word Frequencies")
g <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
               axis.text.x = element_text(hjust = 0.5, vjust = 0.5, angle = 45),
               axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
g <- g + ggtitle("10 Most Frequent Words")
print(g)

Construct a Word Cloud

# construct word cloud
suppressWarnings (
    wordcloud(words = wordFreq$word,
              freq = wordFreq$freq,
              min.freq = 1,
              max.words = 100,
              random.order = FALSE,
              rot.per = 0.35, 
              colors=brewer.pal(8, "Dark2"))
)

# remove variables for memory purposes
rm(tdm, freq, wordFreq, g)

Capstone Project, coming up next….

In the next steps of this project, and now that the exploratory analysis is complete, I will create a prediction model and then integrate that model into a Shiny app. The app will dynamic, as it will predict the next word that will occur.