The goal of this project is simply to demonstrate that you have become familiar with the data and that you are on track to create your prediction algorithm. Please submit a report on RPubs (http://rpubs.com) that explains your exploratory analysis and your goals for the eventual app and algorithm. This document should be concise, explain only the major features of the data you have identified, and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data-scientist manager. You should make use of tables and plots to illustrate important summaries of the data set.
Loading the data and libraries required for the analysis:
library("pander")
library("ggplot2")
library("NLP")
##
## Attaching package: 'NLP'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library("openNLP")
library("tm")
library("RWeka")
library("qdap")
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
##
## Attaching package: 'qdap'
##
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, as.TermDocumentMatrix
##
## The following object is masked from 'package:base':
##
## Filter
archiveFile <- "Coursera-SwiftKey.zip"
subDir <- "final"
if (!file.exists(archiveFile)) {
  archiveURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url = archiveURL, destfile = archiveFile, method = "curl")
}
if (!file.exists(subDir)) {
  unzip(archiveFile)
} else {
  print("File Already Downloaded & Unzipped!")
}
## [1] "File Already Downloaded & Unzipped!"
Summary statistics about the capstone data sets:
Get file names:
# Get the file names of the capstone data files (excluding the bad-words list)
myfiles <- list.files("final/en_US/")
file_name <- myfiles[!grepl("badwords", myfiles)]
print(file_name)
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.twitter.txt"
Get the size of each file:
# Get the file size of each capstone data file (ls/awk, so Unix-like systems only)
us_blogs_size <- system("ls -lh final/en_US/en_US.blogs.txt | awk '{print $5}'", intern=TRUE)
us_news_size <- system("ls -lh final/en_US/en_US.news.txt | awk '{print $5}'", intern=TRUE)
us_twitter_size <- system("ls -lh final/en_US/en_US.twitter.txt | awk '{print $5}'", intern=TRUE)
#Size of the all files
size <- c(us_blogs_size,us_news_size,us_twitter_size)
print(size)
## [1] "200M" "196M" "159M"
Get line counts from the loaded samples (only the first 1,000 lines of each file are read here):
# Get the working directory
cur_directory <- getwd()
# Load a 1,000-line sample of each file (the full files are large, so we explore a sample)
blogs_data <- readLines(file(paste0(cur_directory, "/final/en_US/en_US.blogs.txt"), "rb"), n = 1000, skipNul = TRUE)
news_data <- readLines(file(paste0(cur_directory, "/final/en_US/en_US.news.txt"), "rb"), n = 1000, skipNul = TRUE)
twitter_data <- readLines(file(paste0(cur_directory, "/final/en_US/en_US.twitter.txt"), "rb"), n = 1000, skipNul = TRUE)
# Get the line count of each loaded sample
us_blogs_lc <- length(blogs_data)
us_news_lc <- length(news_data)
us_twitter_lc <- length(twitter_data)
# Format the counts with a thousands separator
us_blogs_lc <- format(us_blogs_lc, big.mark = ",")
us_news_lc <- format(us_news_lc, big.mark = ",")
us_twitter_lc <- format(us_twitter_lc, big.mark = ",")
#Line Count Data from files
lines_count <- c(us_blogs_lc,us_news_lc,us_twitter_lc)
print(lines_count)
## [1] "1,000" "1,000" "1,000"
Get word counts from the full files:
#Get Word Count for each file of the Capstone Data
us_blogs_wc <- system("wc -w final/en_US/en_US.blogs.txt | awk '{print $1}'", intern=TRUE)
us_news_wc <- system("wc -w final/en_US/en_US.news.txt | awk '{print $1}'", intern=TRUE)
us_twitter_wc <- system("wc -w final/en_US/en_US.twitter.txt | awk '{print $1}'", intern=TRUE)
# Convert the counts to numeric
us_blogs_wc <- as.numeric(us_blogs_wc)
us_news_wc <- as.numeric(us_news_wc)
us_twitter_wc <- as.numeric(us_twitter_wc)
#format the data
us_blogs_wc <- format(us_blogs_wc, big.mark=",")
us_news_wc <- format(us_news_wc, big.mark=",")
us_twitter_wc <- format(us_twitter_wc, big.mark=",")
#Word Count Data from files
words_count <- c(us_blogs_wc,us_news_wc,us_twitter_wc)
print(words_count)
## [1] "37,334,690" "34,372,720" "30,374,206"
Store all summaries in a data frame:
# Capstone summary data frame
capstone.data <- data.frame(file_name, size, lines_count, words_count)
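The summary table below is presumably rendered with pander, which is loaded above but not otherwise shown; a minimal call would be:
# Render the summary data frame as a markdown table
pander(capstone.data)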
| file_name | size | lines_count | words_count |
|---|---|---|---|
| en_US.blogs.txt | 200M | 1,000 | 37,334,690 |
| en_US.news.txt | 196M | 1,000 | 34,372,720 |
| en_US.twitter.txt | 159M | 1,000 | 30,374,206 |
Next, we combine the three samples and split the text into sentences:
combined_data <- c(blogs_data, news_data, twitter_data)
combined_data <- sent_detect(combined_data, language = "en", model = NULL)
Now we create the main corpus and clean it: we remove numbers, extra whitespace, English stop words, punctuation and profanity (bad words), convert the text to lowercase, and stem the documents:
corpus <- VCorpus(VectorSource(combined_data))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stemDocument)
badwords <- readLines("final/en_US/badwords.txt", skipNul = TRUE)
corpus <- tm_map(corpus, removeWords, badwords)
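As a quick sanity check (a sketch, not part of the analysis above), the first cleaned documents can be inspected to confirm the transformations worked:
# Look at the content of the first two cleaned documents
content(corpus[[1]])
content(corpus[[2]])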
Then we convert the corpus to a data frame for further analysis:
corpus_dt <- data.frame(text = unlist(sapply(corpus, `[`, "content")), stringsAsFactors = FALSE)
Finally, we tokenize the text into n-grams with the RWeka library:
one_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 1, max = 1))
two_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 2, max = 2))
three_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 3, max = 3))
four_gram_token <- NGramTokenizer(corpus_dt$text, Weka_control(min = 4, max = 4))
Distribution of word frequencies:
one_gram_dt <- data.frame(table(one_gram_token))
two_gram_dt <- data.frame(table(two_gram_token))
three_gram_dt <- data.frame(table(three_gram_token))
four_gram_dt <- data.frame(table(four_gram_token))
one_gram_sorted <- one_gram_dt[order(one_gram_dt$Freq, decreasing = TRUE), ]
two_gram_sorted <- two_gram_dt[order(two_gram_dt$Freq, decreasing = TRUE), ]
three_gram_sorted <- three_gram_dt[order(three_gram_dt$Freq, decreasing = TRUE), ]
four_gram_sorted <- four_gram_dt[order(four_gram_dt$Freq, decreasing = TRUE), ]
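To visualise the distribution of word frequencies, a sketch of a frequency plot with ggplot2 (already loaded above); the column name one_gram_token follows from the table() call above:
# Plot the 15 most frequent unigrams in the sample
top_unigrams <- head(one_gram_sorted, 15)
ggplot(top_unigrams, aes(x = reorder(one_gram_token, Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Unigram", y = "Frequency", title = "Top 15 unigrams in the sample")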
Modelling and prediction are the next tasks to be completed.
To build the prediction model we plan to use Katz back-off (http://en.wikipedia.org/wiki/Katz%27s_back-off_model), a generative n-gram language model that estimates the conditional probability of a word given its history, backing off to shorter n-grams whenever the longer history has not been observed in the training data.
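As an illustration of the back-off idea, here is a simplified sketch (it omits the Good-Turing discounting of full Katz back-off); predict_next_word is a hypothetical helper, and the column names follow from the table() calls above:
# Simplified back-off sketch: try the trigram table first, then bigrams,
# then fall back to the most frequent unigram. Full Katz back-off would
# additionally apply Good-Turing discounting to the counts.
predict_next_word <- function(phrase) {
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)

  # Try trigrams whose first two words match the end of the input
  if (length(words) == 2) {
    pattern <- paste0("^", paste(words, collapse = " "), " ")
    hits <- three_gram_sorted[grepl(pattern, three_gram_sorted$three_gram_token), ]
    if (nrow(hits) > 0) {
      return(tail(strsplit(as.character(hits$three_gram_token[1]), " ")[[1]], 1))
    }
  }

  # Back off to bigrams whose first word matches the last input word
  pattern <- paste0("^", tail(words, 1), " ")
  hits <- two_gram_sorted[grepl(pattern, two_gram_sorted$two_gram_token), ]
  if (nrow(hits) > 0) {
    return(tail(strsplit(as.character(hits$two_gram_token[1]), " ")[[1]], 1))
  }

  # Final fallback: the most frequent unigram overall
  as.character(one_gram_sorted$one_gram_token[1])
}

# Example usage
predict_next_word("thanks for the")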