The goal of this report is to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm.
# NOTE(review): machine-specific absolute path. Every relative file name used
# below (the three en_US.*.txt inputs and all files written by this script) is
# resolved against this directory, so the script only runs where it exists.
setwd("C:/Users/mllrg/Desktop/final/en_US")
library(tm)
library(NLP)
library(SnowballC)
library(syuzhet)
# Read every line of a text file.
#
# The connection is opened in binary mode ("rb"): a text-mode read on Windows
# stops at the first SUB (0x1A) control byte, which silently truncates files
# that contain one. readLines() accepts LF, CRLF and CR line endings whatever
# mode the connection is opened in, so binary mode loses nothing.
#
# path: name of the file to read.
# Returns a character vector with one element per line.
readTextFile <- function(path) {
  con <- file(path, open = "rb")
  # close the connection even if readLines() fails
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# blogs
blogsFileName <- "en_US.blogs.txt"
blogs <- readTextFile(blogsFileName)

# news
newsFileName <- "en_US.news.txt"
news <- readTextFile(newsFileName)

# twitter
twitterFileName <- "en_US.twitter.txt"
twitter <- readTextFile(twitterFileName)
library(stringi)
library(kableExtra)
A sample size of 0.01 (1% of the lines in each file) is assigned because of the large amount of original data.
# assign sample size (fraction of the lines of each file kept when sampling)
sampleSize <- 0.01

# file size on disk, rounded to whole megabytes
fileSizeMB <- round(file.info(c(blogsFileName,
                                newsFileName,
                                twitterFileName))$size / 1024 ^ 2)

# num lines per file
numLines <- vapply(list(blogs, news, twitter), length, integer(1))

# num characters per file (summed as doubles so large files cannot overflow)
numChars <- vapply(list(blogs, news, twitter),
                   function(x) sum(as.numeric(nchar(x))),
                   numeric(1))

# num words per file; the "Words" entry is selected by name rather than by
# its position in the stri_stats_latex() result
numWords <- vapply(list(blogs, news, twitter),
                   function(x) stri_stats_latex(x)[["Words"]],
                   numeric(1))

# words per line (kept: the histograms further down read it)
wpl <- lapply(list(blogs, news, twitter), stri_count_words)

# words per line summary, derived from wpl instead of counting the words of
# every line a second time
wplSummary <- vapply(wpl,
                     function(x) c(min(x), mean(x), max(x)),
                     numeric(3))
rownames(wplSummary) <- c("WPL.Min", "WPL.Mean", "WPL.Max")

# one row per file; the WPL.* columns come from the transposed summary matrix
summary <- data.frame(
  File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  # paste() already inserts one space, so the unit must not carry its own
  FileSize = paste(fileSizeMB, "MB"),
  Lines = numLines,
  Characters = numChars,
  Words = numWords,
  t(round(wplSummary))
)

kable(summary,
      row.names = FALSE,
      align = c("l", rep("r", 7)),
      caption = "") %>% kable_styling(position = "left")
| File | FileSize | Lines | Characters | Words | WPL.Min | WPL.Mean | WPL.Max |
|---|---|---|---|---|---|---|---|
| en_US.blogs.txt | 200 MB | 899288 | 206824505 | 37570839 | 0 | 42 | 6726 |
| en_US.news.txt | 196 MB | 77259 | 15639408 | 2651432 | 1 | 35 | 1123 |
| en_US.twitter.txt | 159 MB | 2360148 | 162096241 | 30451170 | 1 | 13 | 47 |
library(ggplot2)
library(gridExtra)
# Histogram of words-per-line counts for one source file.
#
# Built with ggplot() + geom_histogram() because qplot() is deprecated in
# current ggplot2 releases.
#
# counts:   numeric vector with the number of words on each line.
# title:    plot title.
# binwidth: histogram bin width, in words.
# Returns a ggplot object.
wplHistogram <- function(counts, title, binwidth) {
  ggplot(data.frame(words = counts), aes(x = words)) +
    geom_histogram(binwidth = binwidth) +
    labs(title = title, x = "Words per Line", y = "Frequency")
}

# blogs, news and twitter, in the order wpl was built
plotList <- list(
  wplHistogram(wpl[[1]], "US Blogs", 5),
  wplHistogram(wpl[[2]], "US News", 5),
  wplHistogram(wpl[[3]], "US Twitter", 1)
)

# stack the three histograms in a single column
grid.arrange(grobs = plotList, ncol = 1)

# remove variables for memory purposes; the list is the only object that
# holds the plots, so dropping it is what actually releases them
rm(plotList)
# fix the RNG state so the same lines are drawn on every run
set.seed(600000)

# For each source: draw the sample without replacement, then drop every
# character that has no ASCII representation. The three draws happen in the
# order blogs, news, twitter.
sampleBlogs <- iconv(
  sample(blogs, size = length(blogs) * sampleSize, replace = FALSE),
  from = "latin1", to = "ASCII", sub = ""
)
sampleNews <- iconv(
  sample(news, size = length(news) * sampleSize, replace = FALSE),
  from = "latin1", to = "ASCII", sub = ""
)
sampleTwitter <- iconv(
  sample(twitter, size = length(twitter) * sampleSize, replace = FALSE),
  from = "latin1", to = "ASCII", sub = ""
)

# merge the three samples into one vector and persist it as plain text
sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)
sampleDataFileName <- "en_US.sample.txt"
con <- file(sampleDataFileName, open = "w")
writeLines(text = sampleData, con = con)
close(con)

# size of the combined sample: total words and total lines
sampleDataWords <- sum(stri_count_words(sampleData))
sampleDataLines <- length(sampleData)
library(tm)
# download bad words file
#
# badWordsURL is the directory that hosts the list, so the file name has to
# be appended to form the download URL. The list is a plain text file (not an
# archive) and is saved directly as badWordsFile in the working directory,
# which is the path both the existence check below and buildCorpus() use.
badWordsURL <- "https://www.cs.cmu.edu/~biglou/resources/"
badWordsFile <- "bad-words.txt"
if (!file.exists(badWordsFile)) {
  download.file(paste0(badWordsURL, badWordsFile),
                destfile = badWordsFile,
                mode = "wb")
}
# Build a cleaned tm corpus from raw text lines.
#
# Cleaning steps, in order: blank out URLs, e-mail addresses and Twitter
# handles; lower-case; remove profane words; remove English stop words;
# remove punctuation and numbers; collapse repeated whitespace.
#
# dataSet:       character vector, one document per element.
# profanityFile: path of a text file with one profane word per line.
#                Defaults to the script-level badWordsFile.
# Returns a VCorpus with one cleaned document per element of dataSet.
buildCorpus <- function (dataSet, profanityFile = badWordsFile) {
  docs <- VCorpus(VectorSource(dataSet))
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

  # URLs: match up to the next whitespace only. A greedy ".*" would swallow
  # everything up to the last ".word" on the line.
  docs <- tm_map(docs, toSpace, "(f|ht)tps?://[^[:space:]]+")
  # E-mail addresses must go before handles, otherwise the handle pattern
  # removes the "@domain" half and the address can no longer be recognised.
  docs <- tm_map(docs, toSpace, "[[:alnum:]._-]+@[[:alnum:].-]+")
  # Twitter handles. A POSIX class is used because "\\s" inside a bracket
  # expression is read by the default regex engine as the two literal
  # characters backslash and "s".
  docs <- tm_map(docs, toSpace, "@[^[:space:]]+")

  # Load the profanity list; readLines() opens and closes the file itself.
  profanity <- readLines(profanityFile, encoding = "UTF-8", skipNul = TRUE)
  profanity <- iconv(profanity, "latin1", "ASCII", sub = "")
  profanity <- tolower(trimws(profanity))
  # drop blank entries, which would add an empty alternative to the pattern
  profanity <- unique(profanity[nzchar(profanity)])

  # Lower-case first so capitalised profanity is removed as well.
  # content_transformer() keeps every element a text document, so no
  # conversion back to PlainTextDocument is needed afterwards.
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeWords, profanity)
  # Stop words go before punctuation: the list contains forms such as
  # "don't" that only match while the apostrophe is still present.
  docs <- tm_map(docs, removeWords, stopwords("english"))
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, stripWhitespace)
  docs
}
# clean the sampled lines and keep the resulting corpus on disk as RDS
corpus <- buildCorpus(sampleData)
saveRDS(corpus, file = "en_US.corpus.rds")

# pull the text of every document into a one-column data frame
corpusText <- data.frame(
  text = unlist(sapply(corpus, "[", "content")),
  stringsAsFactors = FALSE
)

# also store the cleaned text as plain lines
con <- file("en_US.corpus.txt", open = "w")
writeLines(text = corpusText$text, con = con)
close(con)

# show the first ten cleaned documents
corpusText$text %>%
  head(10) %>%
  kable(row.names = FALSE,
        col.names = NULL,
        align = c("l"),
        caption = "First 10 Documents") %>%
  kable_styling(position = "left")
| joe paterno |
| page passenger list steam ship ionian arrived montreal glasgow th may found james listed age labourer born scotland however also salvation army written next name majority passengers first two pages possibly mean massive recruitment drive invaded canada spread word |
| im going talk main character luke kincaid alike dislike |
| amazing prizes won thanks generous sponsors penny black stampalot crimson cloud crafts u love |
| nineteen years ago baby almost weeks pregnant time baby first seemed well visit antenatal clinic changed everything blood pressure high protein sent hospital admitted immediately dawn next morning caesarean section tiny lbs oz daughter fighting life special care |
| mark henry defeated randy orton become new world heavyweight champion worlds greatest slam wow incredible match two good mark company years receiving biggest push date earned title people stunned |
| taste nice eaten pickled green chilies bird eye chilies |
| scuffed feet stared ground debating take |
| now home unpacked laundry done put away think wrap order chances last trip magic kingdom learned quite bit take show road |
| everyone sits back see results television starting pm |
# the combined sample is not read again below; drop it to free memory
rm(list = "sampleData")
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.1.3
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 4.1.3
library(RColorBrewer)
tdm <- TermDocumentMatrix(corpus)

# Total count of every term over the whole corpus, summed straight from the
# sparse triplet representation (tdm$i = term index, tdm$v = count). This
# avoids building a dense terms-by-documents matrix, so the matrix no longer
# has to be cut down to a fixed 12000 x 12000 corner: that cut fails on a
# smaller corpus and otherwise keeps only the alphabetically first terms and
# the first documents.
termTotals <- tapply(tdm$v, tdm$i, sum)
freq <- as.vector(termTotals)
names(freq) <- Terms(tdm)[as.integer(names(termTotals))]
freq <- sort(freq, decreasing = TRUE)
wordFreq <- data.frame(word = names(freq), freq = freq)

# plot the top 10 most frequent words
# Columns are referenced by name inside aes() so the mappings are evaluated
# against the plotted data frame.
top10 <- head(wordFreq, 10)
g <- ggplot(top10, aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "grey50") +
  geom_text(aes(label = freq), vjust = -0.20, size = 3) +
  xlab("") +
  ylab("Word Frequencies") +
  theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
        axis.text.x = element_text(hjust = 0.5, vjust = 0.5, angle = 45),
        axis.text.y = element_text(hjust = 0.5, vjust = 0.5)) +
  ggtitle("10 Most Frequent Words")
print(g)

# construct word cloud
# warnings are suppressed for the wordcloud() call only
suppressWarnings(
  wordcloud(words = wordFreq$word,
            freq = wordFreq$freq,
            min.freq = 1,
            max.words = 100,
            random.order = FALSE,
            rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
)

# remove variables for memory purposes
rm(tdm, termTotals, freq, wordFreq, top10, g)
Now that the exploratory analysis is complete, the next steps of this project are to create a prediction model and then integrate that model into a Shiny app. The app will be dynamic, as it will predict the next word that will occur.