Goal of this assignment

The goal is to demonstrate that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm.

My steps in the text data analysis are listed below:

Set the working directory

# NOTE(review): hard-coded, machine-specific absolute path. The relative file
# names used later in this script are resolved against this directory.
setwd("C:/Users/mllrg/Desktop/final/en_US")

load libraries

* Additional libraries are loaded later in the analysis, at the point where they are first needed.

library(tm)
library(NLP)
library(SnowballC)
library(syuzhet)

read the text file

    # blogs
blogsFileName <- "en_US.blogs.txt"
newsFileName <- "en_US.news.txt"
twitterFileName <- "en_US.twitter.txt"

# Read every line of a UTF-8 text file into a character vector.
#
# Args:
#   fileName: path of the file to read.
#
# Returns: character vector with one element per line.
#
# The connection is opened in binary mode ("rb"), not text mode ("r"). In text
# mode on Windows an embedded Ctrl-Z (0x1A) byte can be treated as end-of-file,
# which silently truncates the read. The summary table in this report (a
# 196 MB news file yielding only 77,259 lines) is consistent with that
# truncation -- NOTE(review): re-knit and confirm the news line count.
readTextLines <- function(fileName) {
    con <- file(fileName, open = "rb")
    # close the connection even if readLines() fails
    on.exit(close(con), add = TRUE)
    readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- readTextLines(blogsFileName)

# news
news <- readTextLines(newsFileName)

# twitter
twitter <- readTextLines(twitterFileName)

Summary and Sampling of data

library(stringi)
library(kableExtra)

A sample size of 0.01 (1% of the lines in each file) is used because the original data set is very large.

# assign sample size (fraction of the lines of each file that is sampled later)
sampleSize <- 0.01

# the three sources, always handled in the order blogs, news, twitter
corpora <- list(blogs, news, twitter)

# file size in MB
fileSizeMB <- round(file.info(c(blogsFileName,
                                newsFileName,
                                twitterFileName))$size / 1024 ^ 2)

# num lines per file
numLines <- vapply(corpora, length, integer(1))

# num characters per file
numChars <- vapply(corpora, function(x) sum(nchar(x)), integer(1))

# num words per file (the "Words" entry of stri_stats_latex, selected by name
# instead of by position)
numWords <- sapply(corpora, stri_stats_latex)["Words", ]

# words per line; computed once and reused for the summary below and for the
# histograms later in the report
wpl <- lapply(corpora, stri_count_words)

# words per line summary (min / mean / max), one column per source
wplSummary <- vapply(wpl,
                     function(counts) summary(counts)[c("Min.", "Mean", "Max.")],
                     numeric(3))
rownames(wplSummary) <- c("WPL.Min", "WPL.Mean", "WPL.Max")

# NOTE(review): this variable shadows the name of base::summary(); calls to
# summary() still resolve to the function, but a different name would be clearer.
summary <- data.frame(
    File = c(blogsFileName, newsFileName, twitterFileName),
    # paste() already inserts one space between its arguments
    FileSize = paste(fileSizeMB, "MB"),
    Lines = numLines,
    Characters = numChars,
    Words = numWords,
    t(round(wplSummary))
)

kable(summary,
      row.names = FALSE,
      align = c("l", rep("r", 7)),
      caption = "") %>% kable_styling(position = "left")

# the list only held references to the three vectors; drop the extra name
rm(corpora)


File	FileSize	Lines	Characters	Words	WPL.Min	WPL.Mean	WPL.Max
en_US.blogs.txt	200 MB	899288	206824505	37570839	0	42	6726
en_US.news.txt	196 MB	77259	15639408	2651432	1	35	1123
en_US.twitter.txt	159 MB	2360148	162096241	30451170	1	13	47

Exploratory Analysis

library(ggplot2)
library(gridExtra)

Plot the data for words per line

# Histogram of the number of words per line for one source.
#
# Args:
#   counts:   integer vector of word counts, one per line.
#   title:    plot title.
#   binwidth: histogram bin width, in words.
#
# Returns: a ggplot object.
#
# Built with ggplot() because qplot() is deprecated in current ggplot2.
plotWordsPerLine <- function(counts, title, binwidth) {
    ggplot(data.frame(words = counts), aes(x = words)) +
        geom_histogram(binwidth = binwidth) +
        labs(title = title, x = "Words per Line", y = "Frequency")
}

# wpl holds the counts in the order blogs, news, twitter
plotList <- list(
    plotWordsPerLine(wpl[[1]], "US Blogs", 5),
    plotWordsPerLine(wpl[[2]], "US News", 5),
    plotWordsPerLine(wpl[[3]], "US Twitter", 1)
)

# stack the three histograms in a single column
grid.arrange(grobs = plotList, ncol = 1)

Clean data

# fix the RNG state so the same lines are drawn on every run
set.seed(600000)

# draw a random subset (without replacement) of each source; the subset size is
# the sampleSize fraction of the number of lines
sampleBlogs <- sample(blogs, size = sampleSize * length(blogs), replace = FALSE)
sampleNews <- sample(news, size = sampleSize * length(news), replace = FALSE)
sampleTwitter <- sample(twitter, size = sampleSize * length(twitter), replace = FALSE)

# drop everything that cannot be represented in ASCII (unconvertible bytes are
# replaced by the empty string)
toAscii <- function(lines) {
    iconv(lines, from = "latin1", to = "ASCII", sub = "")
}
sampleBlogs <- toAscii(sampleBlogs)
sampleNews <- toAscii(sampleNews)
sampleTwitter <- toAscii(sampleTwitter)

# merge the three samples (blogs, news, twitter order) and save them as text
sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)
sampleDataFileName <- "en_US.sample.txt"
writeLines(sampleData, sampleDataFileName)

# size of the combined sample: number of lines and total number of words
sampleDataLines <- length(sampleData)
sampleDataWords <- sum(stri_count_words(sampleData))

Clean data further and Build the Corpus

library(tm)

# download bad words file
badWordsURL <- "https://www.cs.cmu.edu/~biglou/resources/"
badWordsFile <- "bad-words.txt"
# NOTE(review): nothing in the visible script reads from data/; the directory
# is still created in case other code relies on it.
if (!file.exists('data')) {
    dir.create('data')
}
# The word list is a plain text file, not an archive, so it is fetched by its
# full URL and saved directly under the name that buildCorpus() reads from the
# working directory. The existence check and the download target are therefore
# the same path, and the file is only downloaded once.
if (!file.exists(badWordsFile)) {
    download.file(paste0(badWordsURL, badWordsFile),
                  destfile = badWordsFile,
                  mode = "wb")
}

# Build a cleaned tm corpus from a character vector of text lines.
#
# Args:
#   dataSet:       character vector; each element becomes one document.
#   profanityFile: path to a text file listing words to remove. Defaults to the
#                  global badWordsFile, so existing calls buildCorpus(x) behave
#                  as before.
#
# Returns: a VCorpus whose documents are lower-cased and stripped of URLs,
#   e-mail addresses, Twitter handles, the listed profane words, English stop
#   words, punctuation, numbers and redundant whitespace.
buildCorpus <- function(dataSet, profanityFile = badWordsFile) {
    docs <- VCorpus(VectorSource(dataSet))
    toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

    # Remove URLs, e-mail addresses and Twitter handles.
    # * POSIX classes are used because gsub() runs with the default regex
    #   engine, where "\\s" inside a bracket expression is not a whitespace
    #   class.
    # * Each pattern stops at the next whitespace, so one match cannot swallow
    #   the remainder of a line.
    # * E-mail addresses are removed before handles; otherwise the handle
    #   pattern consumes the "@" and leaves the local part behind.
    docs <- tm_map(docs, toSpace, "(f|ht)tps?://[^[:space:]]+")
    docs <- tm_map(docs, toSpace, "[[:alnum:]._-]+@[[:alnum:].-]+")
    docs <- tm_map(docs, toSpace, "@[^[:space:]]+")

    # Lower-case first so the case-sensitive word removal below also catches
    # capitalised forms. content_transformer() keeps the documents as text
    # documents instead of collapsing them to bare character vectors.
    docs <- tm_map(docs, content_transformer(tolower))

    # remove profane words from the sample data set
    profanity <- readLines(profanityFile, encoding = "UTF-8", skipNul = TRUE)
    profanity <- iconv(profanity, "latin1", "ASCII", sub = "")
    profanity <- tolower(trimws(profanity))
    # blank entries would only add empty alternatives to the removal pattern
    profanity <- unique(profanity[nzchar(profanity)])
    if (length(profanity) > 0) {
        docs <- tm_map(docs, removeWords, profanity)
    }

    # stop words go before punctuation removal: the stop word list contains
    # apostrophes (e.g. "don't"), which would no longer match afterwards
    docs <- tm_map(docs, removeWords, stopwords("english"))
    docs <- tm_map(docs, removePunctuation)
    docs <- tm_map(docs, removeNumbers)
    docs <- tm_map(docs, stripWhitespace)
    docs
}

# run the cleaning pipeline on the combined sample and persist the result (RDS)
corpus <- buildCorpus(sampleData)
saveRDS(corpus, file = "en_US.corpus.rds")

# flatten the content of every document into a one-column data frame and dump
# the cleaned lines to a plain text file
corpusText <- data.frame(
    text = unlist(sapply(corpus, "[", "content")),
    stringsAsFactors = FALSE
)
writeLines(corpusText$text, "en_US.corpus.txt")

# preview: the first ten cleaned documents as a left-aligned table
kable_styling(
    kable(head(corpusText$text, 10),
          row.names = FALSE,
          col.names = NULL,
          align = c("l"),
          caption = "First 10 Documents"),
    position = "left"
)

First 10 Documents
joe paterno
page passenger list steam ship ionian arrived montreal glasgow th may found james listed age labourer born scotland however also salvation army written next name majority passengers first two pages possibly mean massive recruitment drive invaded canada spread word
im going talk main character luke kincaid alike dislike
amazing prizes won thanks generous sponsors penny black stampalot crimson cloud crafts u love
nineteen years ago baby almost weeks pregnant time baby first seemed well visit antenatal clinic changed everything blood pressure high protein sent hospital admitted immediately dawn next morning caesarean section tiny lbs oz daughter fighting life special care
mark henry defeated randy orton become new world heavyweight champion worlds greatest slam wow incredible match two good mark company years receiving biggest push date earned title people stunned
taste nice eaten pickled green chilies bird eye chilies
scuffed feet stared ground debating take
now home unpacked laundry done put away think wrap order chances last trip magic kingdom learned quite bit take show road
everyone sits back see results television starting pm

# free the combined sample: by this point it has been written to disk and
# turned into the corpus, and the remaining visible code does not use it
rm(sampleData)

Construct visuals for word frequency

library(wordcloud)

## Warning: package 'wordcloud' was built under R version 4.1.3

## Loading required package: RColorBrewer

## Warning: package 'RColorBrewer' was built under R version 4.1.3

library(RColorBrewer)

Plot the 10 most frequent words

tdm <- TermDocumentMatrix(corpus)

# Total count of every term over ALL documents.
# The term-document matrix is stored as sparse triplets (i = term index,
# j = document index, v = count), so the totals are summed straight from the
# triplets. This avoids two problems:
# * as.matrix(tdm) would build a dense terms x documents matrix;
# * cutting the matrix to its first 12000 rows/columns would keep only the
#   alphabetically first terms and the first documents, which skews the
#   frequencies and fails when the corpus is smaller than that.
termIndex <- sort(unique(tdm$i))
termTotals <- as.vector(tapply(tdm$v, tdm$i, sum))
names(termTotals) <- Terms(tdm)[termIndex]

freq <- sort(termTotals, decreasing = TRUE)
wordFreq <- data.frame(word = names(freq), freq = freq)

rm(termIndex, termTotals)

# plot the top 10 most frequent words
# head() copes with fewer than 10 rows (indexing 1:10 would add NA rows), and
# the aesthetics name the columns directly instead of relying on `$fre`
# partial matching against the outer data frame.
topWords <- head(wordFreq, 10)
g <- ggplot(topWords, aes(x = reorder(word, -freq), y = freq))
g <- g + geom_bar(stat = "identity", fill = "grey50")
g <- g + geom_text(aes(label = freq), vjust = -0.20, size = 3)
g <- g + xlab("")
g <- g + ylab("Word Frequencies")
g <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
               axis.text.x = element_text(hjust = 0.5, vjust = 0.5, angle = 45),
               axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
g <- g + ggtitle("10 Most Frequent Words")
print(g)
rm(topWords)

Construct a Word Cloud

# draw a word cloud of at most 100 words, most frequent words placed first,
# with roughly a third of the words rotated; warnings raised while drawing
# are silenced
cloudPalette <- brewer.pal(8, "Dark2")
suppressWarnings(
    wordcloud(
        words = wordFreq$word,
        freq = wordFreq$freq,
        min.freq = 1,
        max.words = 100,
        random.order = FALSE,
        rot.per = 0.35,
        colors = cloudPalette
    )
)
rm(cloudPalette)

# the frequency objects and the bar plot are no longer needed; release them
rm(list = c("tdm", "freq", "wordFreq", "g"))

Capstone Project, coming up next….

In the next steps of this project, now that the exploratory analysis is complete, I will create a prediction model and then integrate that model into a Shiny app. The app will be dynamic: it will predict the next word that will occur.

Milestone Report - Capstone

Colleen

4/29/2022