The goal of this project is to demonstrate that you have become familiar with the data and that you are on track to create your prediction algorithm. Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.

The motivation for this project is to:

1. Demonstrate that you've downloaded the data and have successfully loaded it in.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings that you have amassed so far.
4. Get feedback on your plans for creating a prediction algorithm and Shiny app.
library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
destfile = "./Coursera-SwiftKey.zip"
if(!file.exists(destfile)){
url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
file <- basename(url)
download.file(url, file, method="curl")
unzip(file)
}
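As a quick sanity check that the archive unpacked where the rest of the script expects it (the final/en_US path is the layout used throughout this report):
# List the English-language source files
list.files("final/en_US")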
# Blogs (English)
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
blogs_size <- file.info("final/en_US/en_US.blogs.txt")$size / 1024 ^ 2
blogs_words <- stri_count_words(blogs)
# News (English)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
news_size <- file.info("final/en_US/en_US.news.txt")$size / 1024 ^ 2
news_words <- stri_count_words(news)
# Twitter (English)
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
twitter_size <- file.info("final/en_US/en_US.twitter.txt")$size / 1024 ^ 2
twitter_words <- stri_count_words(twitter)
# Summarise each source: file size (MB), line count, total word count, and mean words per line
summary_en <- data.frame(data_source = c("blogs", "news", "twitter"),
                         file_size_MB = c(blogs_size, news_size, twitter_size),
                         line_counts = c(length(blogs), length(news), length(twitter)),
                         words_counts = c(sum(blogs_words), sum(news_words), sum(twitter_words)),
                         num_of_words_per_line_mean = c(mean(blogs_words), mean(news_words), mean(twitter_words)))
summary_en
## data_source file_size_MB line_counts words_counts num_of_words_per_line_mean
## 1 blogs 200.4242 899288 37546250 41.75109
## 2 news 196.2775 1010242 34762395 34.40997
## 3 twitter 159.3641 2360148 30093413 12.75065
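For reference, the word counts above come from stringi's stri_count_words(), which returns one count per input line; a minimal illustration (the example strings below are made up for this note):
# Each element of the input vector gets its own word count: here 2 and 4
stri_count_words(c("Hello world!", "One two three four"))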
Because the full data set is quite large, we take a 1% random sample of each source to keep processing time manageable, and build a corpus from the sample using the tm package.
set.seed(111)
data_sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
# Create corpus and clean the data
corpus <- VCorpus(VectorSource(data_sample))
print(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 42695
Before removing any unwanted words, we convert the documents to lowercase, wrapping tolower in content_transformer() so each document remains a PlainTextDocument. After that, we remove English stopwords, punctuation, numbers, and extra whitespace.
corpus <- tm_map(corpus, content_transformer(tolower))  # keeps documents as PlainTextDocument
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
print(corpus)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 42695
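To spot-check the cleaning, we can peek at a single document (shown as a quick sketch; the exact text depends on the random sample drawn above):
# Print the content of the first cleaned document
writeLines(as.character(corpus[[1]]))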
options(mc.cores = 1)  # run tokenization single-threaded (common workaround for RWeka/parallel issues in tm)
# Term frequencies from a TermDocumentMatrix, sorted from most to least frequent
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# RWeka tokenizers for two- and three-word sequences
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
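As a quick illustration of what these tokenizers produce (the example sentence is made up for this note, and the output shown in the comment is only indicative):
# bigram() splits a sentence into overlapping two-word sequences,
# roughly: "the quick", "quick brown", "brown fox"
bigram("the quick brown fox")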
corpus_dtm1 <- TermDocumentMatrix(corpus)
print(corpus_dtm1)
## <<TermDocumentMatrix (terms: 58101, documents: 42695)>>
## Non-/sparse entries: 509117/2480113078
## Sparsity : 100%
## Maximal term length: 74
## Weighting : term frequency (tf)
freq1 <- getFreq(removeSparseTerms(corpus_dtm1, 0.9999))  # drop extremely rare terms before tallying
ggplot(freq1[1:10, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "lightyellow", colour = "yellow") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("10 Most Common Unigrams") +
  xlab("Words") +
  ylab("Frequency")
corpus_dtm2 <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
print(corpus_dtm2)
## <<TermDocumentMatrix (terms: 434381, documents: 42695)>>
## Non-/sparse entries: 515330/18545381465
## Sparsity : 100%
## Maximal term length: 84
## Weighting : term frequency (tf)
freq2 <- getFreq(removeSparseTerms(corpus_dtm2, 0.9999))
ggplot(freq2[1:10, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "tan", colour = "orange") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("10 Most Common Bigrams") +
  xlab("Words") +
  ylab("Frequency")
corpus_dtm3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
print(corpus_dtm3)
## <<TermDocumentMatrix (terms: 472734, documents: 42695)>>
## Non-/sparse entries: 476883/20182901247
## Sparsity : 100%
## Maximal term length: 97
## Weighting : term frequency (tf)
freq3 <- getFreq(removeSparseTerms(corpus_dtm3, 0.9999))
ggplot(freq3[1:10, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "pink", colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("10 Most Common Trigrams") +
  xlab("Words") +
  ylab("Frequency")