The goal of this milestone project is simply to demonstrate that you have become comfortable working with the data and that you are on track to create your prediction algorithm. Please submit a report on RPubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. The document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
The motivation for this project is to demonstrate that the data have been downloaded and successfully loaded, to create a basic report of summary statistics about the data sets, to report any interesting findings amassed so far, and to get feedback on the plans for creating the prediction algorithm and Shiny app.
The data provided for NLP (natural language processing) consist of three corpora (blog posts, news articles, and Twitter messages) in each of four languages: German (de_DE), Russian (ru_RU), English (en_US), and Finnish (fi_FI). This analysis uses the English files.
# Load libraries and suppress messages for ease of reading report
suppressMessages(library(dplyr))
suppressMessages(library(ggplot2))
suppressMessages(library(LaF))
suppressMessages(library(quanteda))
suppressMessages(library(RColorBrewer))
suppressMessages(library(RWeka))
suppressMessages(library(SnowballC))
suppressMessages(library(tau))
suppressMessages(library(tm))
suppressMessages(library(wordcloud))
# Download and extract data
source_file <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destination_file <- "Coursera-SwiftKey.zip"
download.file(source_file, destination_file)
unzip(destination_file)
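The archive is large, so it helps to guard the download and extraction so they only run when the files are not already on disk. A minimal sketch, reusing the source_file and destination_file objects defined above:

# Download and extract only if the archive / extracted folder is not already present
if (!file.exists(destination_file)) {
  download.file(source_file, destination_file)
}
if (!dir.exists("final")) {
  unzip(destination_file)
}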
# Unzip file
unzip(destination_file, list = TRUE )
## Name Length Date
## 1 final/ 0 2014-07-22 10:10:00
## 2 final/de_DE/ 0 2014-07-22 10:10:00
## 3 final/de_DE/de_DE.twitter.txt 75578341 2014-07-22 10:11:00
## 4 final/de_DE/de_DE.blogs.txt 85459666 2014-07-22 10:11:00
## 5 final/de_DE/de_DE.news.txt 95591959 2014-07-22 10:11:00
## 6 final/ru_RU/ 0 2014-07-22 10:10:00
## 7 final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
## 8 final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
## 9 final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
## 10 final/en_US/ 0 2014-07-22 10:10:00
## 11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
## 12 final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
## 13 final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
## 14 final/fi_FI/ 0 2014-07-22 10:10:00
## 15 final/fi_FI/fi_FI.news.txt 94234350 2014-07-22 10:11:00
## 16 final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
## 17 final/fi_FI/fi_FI.twitter.txt 25331142 2014-07-22 10:10:00
# Load the en_US data
dataBlogs <- readLines("./final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
dataNews <- readLines("./final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
dataTwitter <- readLines("./final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Convert to ASCII; characters with no ASCII equivalent become <xx> byte escapes
dataNews <- iconv(dataNews, 'UTF-8', 'ASCII', "byte")
dataBlogs <- iconv(dataBlogs, 'UTF-8', 'ASCII', "byte")
dataTwitter <- iconv(dataTwitter, 'UTF-8', 'ASCII', "byte")
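Converting with sub = "byte" leaves hexadecimal markers such as <e2><80><99> in place of non-ASCII characters. If those markers are unwanted in later steps, they can be stripped with a pattern like the following (an optional cleanup sketch, not run here):

# Remove the <xx> byte escapes left behind by iconv(..., sub = "byte")
dataBlogs   <- gsub("<[0-9a-f]{2}>", "", dataBlogs)
dataNews    <- gsub("<[0-9a-f]{2}>", "", dataNews)
dataTwitter <- gsub("<[0-9a-f]{2}>", "", dataTwitter)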
# Assess the size (in megabytes) of all 3 files - blogs, news and Twitter
dataBlogs.filesizeMB <- round(file.size("./final/en_US/en_US.blogs.txt") / 1024^2, 1)
dataNews.filesizeMB <- round(file.size("./final/en_US/en_US.news.txt") / 1024^2, 1)
dataTwitter.filesizeMB <- round(file.size("./final/en_US/en_US.twitter.txt") / 1024^2, 1)
# Count the characters per line and record the length of the longest line in each file
dataBlogs.lineLengths <- nchar(dataBlogs)
dataBlogs.longestLine <- max(dataBlogs.lineLengths)
dataNews.lineLengths <- nchar(dataNews)
dataNews.longestLine <- max(dataNews.lineLengths)
dataTwitter.lineLengths <- nchar(dataTwitter)
dataTwitter.longestLine <- max(dataTwitter.lineLengths)
# Combine the summaries into a data frame
dataframe.blogs <- c(dataBlogs.filesizeMB, length(dataBlogs.lineLengths), dataBlogs.longestLine)
dataframe.news <- c(dataNews.filesizeMB, length(dataNews.lineLengths), dataNews.longestLine)
dataframe.twitter <- c(dataTwitter.filesizeMB, length(dataTwitter.lineLengths), dataTwitter.longestLine)
info <- data.frame(rbind(dataframe.blogs, dataframe.news, dataframe.twitter))
names(info) <- c("File Size (MB)", "Line Count", "Longest Line (chars)")
row.names(info) <- c("Blogs", "News", "Twitter")
# Display the summary table
info
##         File Size (MB) Line Count Longest Line (chars)
## Blogs            200.4     899288                40844
## News             196.3    1010242                11384
## Twitter          159.4    2360148                  589
# Maximum number of characters in a single line of each file
# (summary() rounds to four significant digits, hence the slight difference from the table above)
summary(nchar(dataBlogs))[6]
## Max.
## 40840
summary(nchar(dataNews))[6]
## Max.
## 11380
summary(nchar(dataTwitter))[6]
## Max.
## 589
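The table above reports line counts and line lengths; if actual word counts are wanted, they can be approximated by splitting each line on whitespace. A sketch (slow on the full files, so it may be preferable to run it on the samples drawn below):

# Approximate word counts by splitting each line on whitespace
dataBlogs.words   <- sum(lengths(strsplit(dataBlogs, "\\s+")))
dataNews.words    <- sum(lengths(strsplit(dataNews, "\\s+")))
dataTwitter.words <- sum(lengths(strsplit(dataTwitter, "\\s+")))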
# Because the files are large, work with a random sample of about 5% of the lines in each file
dataBlogs_sample_size <- round(.05 * length(dataBlogs), 0)
dataNews_sample_size <- round(.05 * length(dataNews), 0)
dataTwitter_sample_size <- round(.05 * length(dataTwitter), 0)
# Draw approximately 5% of the lines from each file
dataBlogs_sample <- sample_lines("./final/en_US/en_US.blogs.txt", n = dataBlogs_sample_size, nlines = NULL)
dataNews_sample <- sample_lines("./final/en_US/en_US.news.txt", n = dataNews_sample_size , nlines = NULL)
dataTwitter_sample <- sample_lines("./final/en_US/en_US.twitter.txt", n = dataTwitter_sample_size, nlines = NULL)
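The samples above are drawn without a fixed seed, so they will differ from run to run. Setting a seed before sampling makes the report reproducible (a sketch for the blogs file, assuming LaF::sample_lines draws its line numbers from R's random number generator; the seed value is arbitrary and the other two calls follow the same pattern):

# Fix the RNG seed so the same lines are sampled on every knit
set.seed(2014)
dataBlogs_sample <- sample_lines("./final/en_US/en_US.blogs.txt", n = dataBlogs_sample_size)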
# Determine word frequency for each of the 3 files
dataBlogs_word_freq <- dfm(dataBlogs_sample, verbose = FALSE)
dataNews_word_freq <- dfm(dataNews_sample, verbose = FALSE)
dataTwitter_word_freq <- dfm(dataTwitter_sample, verbose = FALSE)
docfreq(dataBlogs_word_freq)[1:11]
## folks hanging out
## 131 100 4548
## in the park
## 16949 27534 204
## applauded free show
## 5 532 752
## of precisely-coordinated
## 20470 1
docfreq(dataNews_word_freq)[1:11]
## for traders treasuries represented an obvious
## 13924 19 3 64 5462 56
## safe haven from stocks and
## 156 19 6587 79 26525
docfreq(dataTwitter_word_freq)[1:11]
## drunk ppl are very loud bruhh too fucking late
## 126 257 7533 1224 125 1 2827 352 399
## lmao i'm
## 423 6082
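Note that recent quanteda releases expect a tokens object rather than a raw character vector as the input to dfm(); if the calls above warn or error, the equivalent is (a minimal sketch):

# With newer quanteda, tokenize first and then build the document-feature matrix
dataBlogs_word_freq   <- dfm(tokens(dataBlogs_sample))
dataNews_word_freq    <- dfm(tokens(dataNews_sample))
dataTwitter_word_freq <- dfm(tokens(dataTwitter_sample))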
require(tm)
require(SnowballC)
require(RWeka)
require(slam)
## Loading required package: slam
require(ggplot2)
# Set CleanR function: remove numbers and punctuation, convert to lower case,
# collapse extra whitespace and stem each document
CleanR <- function(corpus){
  tm_map(corpus, removeNumbers) %>%
    tm_map(removePunctuation) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(stripWhitespace) %>%
    tm_map(stemDocument)
}
# Combine the blogs, news and Twitter data into one vector and save to RData
all <- c(dataBlogs, dataNews, dataTwitter)
save(all, file="all.RData")
all.sample <- sample(all, round(0.02*length(all)))
save(all.sample, file="sample-2p.RData")
# Create the corpus
corpus <- Corpus(VectorSource(all.sample))
corpus <- tm_map(corpus, content_transformer(removePunctuation), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(removeNumbers), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(tolower), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(stripWhitespace), lazy = TRUE)
corpus <- tm_map(corpus, content_transformer(PlainTextDocument), lazy = TRUE)
# Read the list of profane words (Google's "bad words" list) for profanity filtering
bad.word <- read.delim(file = "googlebadwords.txt", sep = ":", header = FALSE)
bad.word_new <- gsub("[*()]", "", bad.word[, 1])  # strip wildcard characters from the list
corpus <- tm_map(corpus, removeWords, bad.word_new)
# Save the corpus for next phase of capstone
save(corpus, file="WorkingCorpus.RData")
# Define unigram, bigram and trigram tokenizer functions via RWeka
unigram_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram_token <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Build the unigram term-document matrix
options(stringsAsFactors = FALSE)
options(mc.cores = 1)
unigram <- TermDocumentMatrix(corpus, control=list(tokenize=unigram_token))
unigram.good <- rollup(unigram, 2, na.rm=TRUE, FUN = sum)
# Sort with decreasing frequency
unigram.tf <- findFreqTerms(unigram.good, lowfreq = 3)
unigram.tf <- sort(rowSums(as.matrix(unigram.good[unigram.tf, ])), decreasing = TRUE)
unigram.tf <- data.frame(word = names(unigram.tf), frequency = unigram.tf)
head(unigram.tf, 10)
## word frequency
## the the 95210
## and and 48111
## for for 22235
## that that 20585
## you you 18712
## with with 14113
## was was 12374
## this this 10862
## have have 10500
## are are 9856
# Plot the top 10 unigram frequencies in decreasing order
g <- ggplot(data = head(unigram.tf, 10), aes(x = reorder(word, -frequency), y = frequency))
g <- g + geom_bar(stat = "identity", fill = "red", colour = "black")
g <- g + geom_text(aes(label = frequency), vjust = -0.1)
g <- g + labs(x = "word")
g <- g + theme(axis.text.x = element_text(angle = 45, hjust = 1))
g
# Bigram counts on the blogs sample using tau::textcnt
bi.gram.dataBlogs <- textcnt(dataBlogs_sample, n = 2, method = "string")
bi.gram.dataBlogs <- bi.gram.dataBlogs[order(bi.gram.dataBlogs, decreasing = TRUE)]
bi.gram.dataBlogs[1:3] # top three, 2-Word combinations
## of the in the to the
## 9420 7817 4386
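The bigram_token and trigram_token tokenizers defined earlier could be applied to the cleaned corpus in the same way as the unigram code above to obtain 2- and 3-word frequency tables. A sketch for the bigram case (the trigram case is analogous):

# Bigram term-document matrix, mirroring the unigram pipeline above
bigram <- TermDocumentMatrix(corpus, control = list(tokenize = bigram_token))
bigram.good <- rollup(bigram, 2, na.rm = TRUE, FUN = sum)
bigram.tf <- findFreqTerms(bigram.good, lowfreq = 3)
bigram.tf <- sort(rowSums(as.matrix(bigram.good[bigram.tf, ])), decreasing = TRUE)
bigram.tf <- data.frame(word = names(bigram.tf), frequency = bigram.tf)
head(bigram.tf, 10)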
# Build a VCorpus from each sample (VectorSource takes the character vector directly)
blogs_corpus <- VCorpus(VectorSource(dataBlogs_sample))
news_corpus <- VCorpus(VectorSource(dataNews_sample))
twitter_corpus <- VCorpus(VectorSource(dataTwitter_sample))
rm(dataBlogs_sample); rm(dataNews_sample); rm(dataTwitter_sample)  # free memory
# Clean each corpus with the CleanR function defined above
blogs_corpus <- CleanR(blogs_corpus)
news_corpus <- CleanR(news_corpus)
twitter_corpus <- CleanR(twitter_corpus)
# Word clouds of the most frequent terms in the cleaned blog, news and Twitter corpora
pal <- brewer.pal(8, "Accent")
wordcloud(blogs_corpus, max.words = 90, random.order = FALSE, colors = pal)
wordcloud(news_corpus, max.words = 90, random.order = FALSE, colors = pal)
wordcloud(twitter_corpus, max.words = 90, random.order = FALSE, colors = pal)
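Like the working corpus saved earlier, the frequency tables can be saved so the prediction phase does not have to repeat the tokenization (a sketch; the bigram and trigram tables would be saved the same way once built):

# Save the unigram frequency table for the prediction phase of the capstone
save(unigram.tf, file = "unigram-frequencies.RData")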