Coursera Capstone Project Milestone Report

Jonathan Yong

2025-07-29

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks within the document.
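A code chunk in the source .Rmd is delimited with three backticks; the chunk below is only a toy illustration (its label and code are not part of this analysis):

```{r example-chunk}
# a trivial embedded chunk: mean speed from the built-in cars data set
mean(cars$speed)
```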

library(knitr)
rm(list = ls(all.names = TRUE))
# setwd() to the directory containing this report before knitting

trainURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
trainDataFile <- "data/Coursera-SwiftKey.zip"

if (!file.exists("data")) {
  dir.create("data")
}

if (!file.exists("data/final/en_US")) {
  tempFile <- tempfile()
  download.file(trainURL, tempFile)
  unzip(tempFile, exdir = "data")
  unlink(tempFile)
}

blogs

blogsFileName <- "data/final/en_US/en_US.blogs.txt"
con <- file(blogsFileName, open = "r")
blogs <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

news

newsFileName <- "data/final/en_US/en_US.news.txt"
con <- file(newsFileName, open = "r")
news <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

twitter

twitterFileName <- "data/final/en_US/en_US.twitter.txt"
con <- file(twitterFileName, open = "r")
twitter <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

rm(con)

library(stringi)
library(kableExtra)

assign sample size

sampleSize = 0.01

file size (in MB)

fileSizeMB <- round(file.info(c(blogsFileName, newsFileName, twitterFileName))$size / 1024 ^ 2)

num lines per file

numLines <- sapply(list(blogs, news, twitter), length)

num characters per file

numChars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)

num words per file (the word count is the 4th element returned by stri_stats_latex)

numWords <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]

words per line

wpl <- lapply(list(blogs, news, twitter), function(x) stri_count_words(x))

words per line summary

wplSummary = sapply(list(blogs, news, twitter),
                    function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(wplSummary) = c('WPL.Min', 'WPL.Mean', 'WPL.Max')

summary <- data.frame(
  File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
  FileSize = paste(fileSizeMB, "MB"),
  Lines = numLines,
  Characters = numChars,
  Words = numWords,
  t(rbind(round(wplSummary)))
)

kable(summary,
      row.names = FALSE,
      align = c("l", rep("r", 7)),
      caption = "") %>% kable_styling(position = "left")

library(ggplot2)
library(gridExtra)

plot1 <- qplot(wpl[[1]], geom = "histogram", main = "US Blogs", xlab = "Words per Line", ylab = "Frequency", binwidth = 5)

plot2 <- qplot(wpl[[2]], geom = "histogram", main = "US News", xlab = "Words per Line", ylab = "Frequency", binwidth = 5)

plot3 <- qplot(wpl[[3]], geom = "histogram", main = "US Twitter", xlab = "Words per Line", ylab = "Frequency", binwidth = 1)

plotList = list(plot1, plot2, plot3)
do.call(grid.arrange, c(plotList, list(ncol = 1)))

free up some memory

rm(plot1, plot2, plot3)

set seed for reproducibility

set.seed(660067)

sample all three data sets

sampleBlogs <- sample(blogs, length(blogs) * sampleSize, replace = FALSE)
sampleNews <- sample(news, length(news) * sampleSize, replace = FALSE)
sampleTwitter <- sample(twitter, length(twitter) * sampleSize, replace = FALSE)

remove all non-English characters from the sampled data

sampleBlogs <- iconv(sampleBlogs, "latin1", "ASCII", sub = "")
sampleNews <- iconv(sampleNews, "latin1", "ASCII", sub = "")
sampleTwitter <- iconv(sampleTwitter, "latin1", "ASCII", sub = "")

combine all three data sets into a single data set and write to disk

sampleData <- c(sampleBlogs, sampleNews, sampleTwitter)
sampleDataFileName <- "data/final/en_US/en_US.sample.txt"
con <- file(sampleDataFileName, open = "w")
writeLines(sampleData, con)
close(con)

get number of lines and words from the sample data set

sampleDataLines <- length(sampleData)
sampleDataWords <- sum(stri_count_words(sampleData))
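As a quick sanity check, these two counts can be displayed with kable; the small helper data frame sampleSummary below is only an illustrative addition built from the variables just computed:

sampleSummary <- data.frame(
  File = "en_US.sample.txt",
  Lines = sampleDataLines,
  Words = sampleDataWords
)
kable(sampleSummary, row.names = FALSE, align = c("l", "r", "r")) %>% kable_styling(position = "left")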

remove variables no longer needed to free up memory

rm(blogs, news, twitter, sampleBlogs, sampleNews, sampleTwitter)

library(tm)

download bad words file

badWordsURL <- "http://www.idevelopment.info/data/DataScience/uploads/full-list-of-bad-words_text-file_2018_07_30.zip"
badWordsFile <- "data/full-list-of-bad-words_text-file_2018_07_30.txt"

if (!file.exists("data")) {
  dir.create("data")
}
if (!file.exists(badWordsFile)) {
  tempFile <- tempfile()
  download.file(badWordsURL, tempFile)
  unzip(tempFile, exdir = "data")
  unlink(tempFile)
}

buildCorpus <- function (dataSet) {
docs <- VCorpus(VectorSource(dataSet))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# remove URL, Twitter handles and email patterns
docs <- tm_map(docs, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
docs <- tm_map(docs, toSpace, "@[^\\s]+")
docs <- tm_map(docs, toSpace, "\\b[A-Z a-z 0-9._ - ]*[@](.*?)[.]{1,3} \\b")

# remove profane words from the sample data set
con <- file(badWordsFile, open = "r")
profanity <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
profanity <- iconv(profanity, "latin1", "ASCII", sub = "")
docs <- tm_map(docs, removeWords, profanity)

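# convert to lowercase, strip English stop words, punctuation, numbers
# and extra whitespace, then restore plain text documents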
docs <- tm_map(docs, tolower)
docs <- tm_map(docs, removeWords, stopwords("english"))
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
docs <- tm_map(docs, PlainTextDocument)
return(docs)

}

build the corpus and write to disk (RDS)

corpus <- buildCorpus(sampleData)
saveRDS(corpus, file = "data/final/en_US/en_US.corpus.rds")

convert corpus to a dataframe and write lines/words to disk (text)

corpusText <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
con <- file("data/final/en_US/en_US.corpus.txt", open = "w")
writeLines(corpusText$text, con)
close(con)

kable(head(corpusText$text, 10),
      row.names = FALSE,
      col.names = NULL,
      align = c("l"),
      caption = "First 10 Documents") %>% kable_styling(position = "left")

remove variables no longer needed to free up memory

rm(sampleData)

library(wordcloud)
library(RColorBrewer)

tdm <- TermDocumentMatrix(corpus)
freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
wordFreq <- data.frame(word = names(freq), freq = freq)

plot the top 10 most frequent words

g <- ggplot(wordFreq[1:10,], aes(x = reorder(word, -freq), y = freq))
g <- g + geom_bar(stat = "identity", fill = I("grey50"))
g <- g + geom_text(aes(label = freq), vjust = -0.20, size = 3)
g <- g + xlab("")
g <- g + ylab("Word Frequencies")
g <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
               axis.text.x = element_text(hjust = 0.5, vjust = 0.5, angle = 45),
               axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
g <- g + ggtitle("10 Most Frequent Words")
print(g)

construct word cloud

suppressWarnings(
  wordcloud(words = wordFreq$word,
            freq = wordFreq$freq,
            min.freq = 1,
            max.words = 100,
            random.order = FALSE,
            rot.per = 0.35,
            colors = brewer.pal(8, "Dark2"))
)

remove variables no longer needed to free up memory

rm(tdm, freq, wordFreq, g)

library(RWeka)

unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
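As a quick illustration of what RWeka's NGramTokenizer returns, applying the bigram tokenizer to a short made-up sentence yields overlapping two-word sequences:

bigramTokenizer("to be or not to be")
# expected output: "to be" "be or" "or not" "not to" "to be"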

create term document matrix for the corpus

unigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = unigramTokenizer))

eliminate sparse terms (keep only unigrams appearing in at least roughly 1% of the sample documents) and get frequencies of the most common unigrams

unigramMatrixFreq <- sort(rowSums(as.matrix(removeSparseTerms(unigramMatrix, 0.99))), decreasing = TRUE)
unigramMatrixFreq <- data.frame(word = names(unigramMatrixFreq), freq = unigramMatrixFreq)

generate plot

g <- ggplot(unigramMatrixFreq[1:20,], aes(x = reorder(word, -freq), y = freq))
g <- g + geom_bar(stat = "identity", fill = I("grey50"))
g <- g + geom_text(aes(label = freq), vjust = -0.20, size = 3)
g <- g + xlab("")
g <- g + ylab("Frequency")
g <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
               axis.text.x = element_text(hjust = 1.0, angle = 45),
               axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
g <- g + ggtitle("20 Most Common Unigrams")
print(g)

create term document matrix for the corpus

bigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer))

eliminate sparse terms (keep only bigrams appearing in at least roughly 0.1% of the sample documents) and get frequencies of the most common bigrams

bigramMatrixFreq <- sort(rowSums(as.matrix(removeSparseTerms(bigramMatrix, 0.999))), decreasing = TRUE)
bigramMatrixFreq <- data.frame(word = names(bigramMatrixFreq), freq = bigramMatrixFreq)

generate plot

g <- ggplot(bigramMatrixFreq[1:20,], aes(x = reorder(word, -freq), y = freq))
g <- g + geom_bar(stat = "identity", fill = I("grey50"))
g <- g + geom_text(aes(label = freq), vjust = -0.20, size = 3)
g <- g + xlab("")
g <- g + ylab("Frequency")
g <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
               axis.text.x = element_text(hjust = 1.0, angle = 45),
               axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
g <- g + ggtitle("20 Most Common Bigrams")
print(g)

create term document matrix for the corpus

trigramMatrix <- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))

eliminate sparse terms (keep only trigrams appearing in at least roughly 0.01% of the sample documents) and get frequencies of the most common trigrams

trigramMatrixFreq <- sort(rowSums(as.matrix(removeSparseTerms(trigramMatrix, 0.9999))), decreasing = TRUE)
trigramMatrixFreq <- data.frame(word = names(trigramMatrixFreq), freq = trigramMatrixFreq)

generate plot

g <- ggplot(trigramMatrixFreq[1:20,], aes(x = reorder(word, -freq), y = freq))
g <- g + geom_bar(stat = "identity", fill = I("grey50"))
g <- g + geom_text(aes(label = freq), vjust = -0.20, size = 3)
g <- g + xlab("")
g <- g + ylab("Frequency")
g <- g + theme(plot.title = element_text(size = 14, hjust = 0.5, vjust = 0.5),
               axis.text.x = element_text(hjust = 1.0, angle = 45),
               axis.text.y = element_text(hjust = 0.5, vjust = 0.5))
g <- g + ggtitle("20 Most Common Trigrams")
print(g)