The goal of this milestone is to demonstrate familiarity with the data and to show that we are on track to create the prediction algorithm. This report, published on RPubs (http://rpubs.com), explains the exploratory analysis and the goals for the eventual app and algorithm. It is kept concise: it covers only the major features of the data identified so far and briefly summarizes the plan for the prediction algorithm and Shiny app in a way that should be understandable to a non-data-scientist manager, using tables and plots to illustrate important summaries of the data set.

Getting and Cleaning Data

Loading the libraries required to perform the analysis:

library("pander")
library("ggplot2")
library("NLP")
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library("openNLP")
library("tm")
library("RWeka")
library("qdap")
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## 
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## 
## The following object is masked from 'package:base':
## 
##     Filter
# Store the working directory (used to build file paths below)
cur_directory <- getwd()

archiveFile <-"Coursera-SwiftKey.zip"
subDir <- "final"

if(!file.exists(archiveFile)) {
  archiveURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url = archiveURL, destfile = archiveFile, method = "curl")
}

if(!file.exists(subDir)) {
  unzip(archiveFile)
} else {
  print("File Already Downloaded & Unzipped!")
}
## [1] "File Already Downloaded & Unzipped!"

Summary statistics about the capstone data sets:

Get file names:

# Get the file names of the Capstone data sets (excluding the profanity list)
myfiles <- list.files("final/en_US/")
file_name <- myfiles[!grepl("badwords", myfiles)]

print(file_name)
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"

Get the size of each file:

#Get File Size for each file of the Capstone Data
us_blogs_size <- system("ls -lh final/en_US/en_US.blogs.txt | awk '{print $5}'", intern=TRUE)
us_news_size <- system("ls -lh final/en_US/en_US.news.txt | awk '{print $5}'", intern=TRUE)
us_twitter_size <- system("ls -lh final/en_US/en_US.twitter.txt | awk '{print $5}'", intern=TRUE)

#Size of the all files
size <- c(us_blogs_size,us_news_size,us_twitter_size)

print(size)
## [1] "200M" "196M" "159M"
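
The ls call above assumes a Unix-like shell. A portable alternative using base R's file.size() could look like this (a sketch reusing the file_name vector defined above):

# Portable alternative: file sizes in megabytes via base R (no shell required)
round(file.size(paste0("final/en_US/", file_name)) / 1024^2, 1)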

Get line counts (a 1,000-line sample of each file is loaded):

# Load a 1,000-line sample from each file (the full files are too large to process in memory)
blogs_data <- readLines(file(paste0(cur_directory, "/final/en_US/en_US.blogs.txt"), "rb"), n = 1000, skipNul = TRUE)
news_data <- readLines(file(paste0(cur_directory, "/final/en_US/en_US.news.txt"), "rb"), n = 1000, skipNul = TRUE)
twitter_data <- readLines(file(paste0(cur_directory, "/final/en_US/en_US.twitter.txt"), "rb"), n = 1000, skipNul = TRUE)

# Get the line count of each sample
us_blogs_lc <- length(blogs_data)
us_news_lc <- length(news_data)
us_twitter_lc <- length(twitter_data)

# Format the counts with a thousands separator
us_blogs_lc <- format(us_blogs_lc, big.mark = ",")
us_news_lc <- format(us_news_lc, big.mark = ",")
us_twitter_lc <- format(us_twitter_lc, big.mark = ",")

# Line counts of the three samples
lines_count <- c(us_blogs_lc,us_news_lc,us_twitter_lc)

print(lines_count)
## [1] "1,000" "1,000" "1,000"
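
The counts above refer only to the 1,000-line samples loaded into memory. The full files' line counts could be obtained with wc -l, in the same style as the word counts below (a sketch; the us_*_full_lc names are introduced here for illustration):

# Full-file line counts via the shell (Unix-like systems), analogous to the wc -w calls below
us_blogs_full_lc <- system("wc -l final/en_US/en_US.blogs.txt | awk '{print $1}'", intern=TRUE)
us_news_full_lc <- system("wc -l final/en_US/en_US.news.txt | awk '{print $1}'", intern=TRUE)
us_twitter_full_lc <- system("wc -l final/en_US/en_US.twitter.txt | awk '{print $1}'", intern=TRUE)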

Get word counts from the full files:

#Get Word Count for each file of the Capstone Data
us_blogs_wc <- system("wc -w final/en_US/en_US.blogs.txt | awk '{print $1}'", intern=TRUE)
us_news_wc <- system("wc -w final/en_US/en_US.news.txt | awk '{print $1}'", intern=TRUE)
us_twitter_wc <- system("wc -w final/en_US/en_US.twitter.txt | awk '{print $1}'", intern=TRUE)

# Convert the counts to numeric
us_blogs_wc <- as.numeric(us_blogs_wc)
us_news_wc <- as.numeric(us_news_wc)
us_twitter_wc <- as.numeric(us_twitter_wc)

# Format the counts with a thousands separator
us_blogs_wc <- format(us_blogs_wc, big.mark=",")
us_news_wc <- format(us_news_wc, big.mark=",")
us_twitter_wc <- format(us_twitter_wc, big.mark=",")

#Word Count Data from files
words_count <- c(us_blogs_wc,us_news_wc,us_twitter_wc)

print(words_count)
## [1] "37,334,690" "34,372,720" "30,374,206"
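
For comparison with the full-file counts above, the word counts of the 1,000-line samples themselves can be approximated in base R by splitting on whitespace (a rough sketch; sample_wc is a name introduced here):

# Approximate word counts of the in-memory samples (whitespace-delimited tokens)
sample_wc <- sapply(list(blogs = blogs_data, news = news_data, twitter = twitter_data),
                    function(x) sum(lengths(strsplit(x, "\\s+"))))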

Store the summary data in a data frame:

# Capstone summary data frame
capstone.data <- data.frame(file_name, size, lines_count, words_count)

Capstone Files

file_name           size   lines_count   words_count
------------------  -----  ------------  ------------
en_US.blogs.txt     200M   1,000         37,334,690
en_US.news.txt      196M   1,000         34,372,720
en_US.twitter.txt   159M   1,000         30,374,206
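
The table above is presumably rendered with the pander package loaded earlier; a minimal sketch of such a call:

# Render the summary data frame as a formatted table with a caption
pander(capstone.data, caption = "Capstone Files")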

Tokenization

We combine the three samples and split the text into sentences:

# Combine the three samples into one character vector and split it into sentences
combined_data <- c(blogs_data, news_data, twitter_data)
combined_data <- sent_detect(combined_data, language = "en", model = NULL)

# Build a corpus and apply the cleaning transformations
corpus <- VCorpus(VectorSource(combined_data))
corpus <- tm_map(corpus, removeNumbers)                      # drop digits
corpus <- tm_map(corpus, stripWhitespace)                    # collapse repeated whitespace
corpus <- tm_map(corpus, content_transformer(tolower))       # lower-case everything
corpus <- tm_map(corpus, removeWords, stopwords("english"))  # remove English stop words
corpus <- tm_map(corpus, removePunctuation)                  # strip punctuation
corpus <- tm_map(corpus, stemDocument)                       # stem words to their roots

# Load the profanity list (one word per line) and remove those words from the corpus
badwordsvector <- readLines(paste0(cur_directory, "/final/en_US/badwords.txt"), warn = FALSE)
corpus <- tm_map(corpus, removeWords, badwordsvector)
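
To sanity-check the cleaning steps, the first processed document can be inspected (a quick sketch):

# Peek at the first cleaned document to verify the transformations
content(corpus[[1]])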

Then we convert the corpus to a data frame for further analysis:

corpus_dt <- data.frame(text = unlist(sapply(corpus, `[`, "content")), stringsAsFactors = FALSE)
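
A quick check of the resulting data frame (sketch):

# Dimensions and a preview of the cleaned text
dim(corpus_dt)
head(corpus_dt$text, 3)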

Finally, we tokenize with the RWeka library:

one_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 1, max = 1))
two_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 2, max = 2))
t_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 3, max = 3))
four_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 4, max = 4))
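
A quick look at a few of the resulting tokens (sketch):

# Preview a handful of bigram tokens
head(two_gram_token, 5)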

Exploratory Analysis

Distribution of word frequencies:

one_gram_dt <- data.frame(table(one_gram_token))
two_gram_dt <- data.frame(table(two_gram_token))
t_gram_dt <- data.frame(table(t_gram_token))
four_gram_dt <- data.frame(table(four_gram_token))


one_gram_sorted <- one_gram_dt[order(one_gram_dt$Freq,decreasing = TRUE),]
two_gram_sorted <- two_gram_dt[order(two_gram_dt$Freq,decreasing = TRUE),]
t_gram_sorted <- t_gram_dt[order(t_gram_dt$Freq,decreasing = TRUE),]
four_gram_sorted <- four_gram_dt[order(four_gram_dt$Freq,decreasing = TRUE),]

Get the top 10 of each n-gram (subset):

one_gram_top10 <- one_gram_sorted[1:10,]
colnames(one_gram_top10) <- c("Word","Frequency")

two_gram_top10 <- two_gram_sorted[1:10,]
colnames(two_gram_top10) <- c("Bigram","Frequency")

t_gram_top10 <- t_gram_sorted[1:10,]
colnames(t_gram_top10) <- c("Trigram","Frequency")

four_gram_top10 <- four_gram_sorted[1:10,]
colnames(four_gram_top10) <- c("Four-gram","Frequency")

Top 10 1-grams (single words):

Top 10 2-grams (word pairs):
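
The bar charts referenced by the captions above can be produced with ggplot2 (loaded earlier); a minimal sketch for the unigram chart, reusing the one_gram_top10 data frame built above:

# Sketch: horizontal bar chart of the 10 most frequent unigrams
ggplot(one_gram_top10, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 10 1-grams", x = "Word", y = "Frequency")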

Conclusion

Modelling and prediction are the next tasks: building the n-gram prediction model and the Shiny app outlined above.

Future Work

To create our prediction model we will use Katz back-off (http://en.wikipedia.org/wiki/Katz%27s_back-off_model), a generative n-gram language model that estimates the conditional probability of a word given its history, backing off to shorter histories when an n-gram has not been observed often enough.
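
In outline (the standard formulation; C denotes n-gram counts, d a discount factor, k a count threshold, and alpha the back-off weight that redistributes the discounted probability mass):

$$
P_{bo}(w_i \mid w_{i-n+1} \ldots w_{i-1}) =
\begin{cases}
d \; \dfrac{C(w_{i-n+1} \ldots w_i)}{C(w_{i-n+1} \ldots w_{i-1})} & \text{if } C(w_{i-n+1} \ldots w_i) > k \\
\alpha \; P_{bo}(w_i \mid w_{i-n+2} \ldots w_{i-1}) & \text{otherwise}
\end{cases}
$$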