Data Science Specialization SwiftKey Capstone Project Overview

The goal of this capstone is to mimic the experience of a data scientist by applying the techniques learned across all nine specialization courses to build a Data Product and a presentation for SwiftKey.
For Week 2, the main objectives are to build a sample corpus, construct the 2-gram and 3-gram term-document matrices, and perform exploratory analysis of the word frequencies.

Downloading And Unzipping Data

The data is available to be downloaded from
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

dataUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(dataUrl,destfile = "swiftkeydata.zip",method = "curl")
unzip("swiftkeydata.zip")

The archive contains text files for several locales; this analysis uses the three English (en_US) working files: blogs, news, and Twitter.
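
Listing the extracted English files should show the three sources. A quick check (assuming the archive unpacks into final/en_US, consistent with the folder path used later):

list.files("final/en_US")
# expected: en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt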

Importing Libraries

library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)

Reading And Getting Basic Information About The Data

The en_US folder contains three working files. For each file, the total word and line counts are calculated:

# Reading All Three Files
blogFile <- readLines("en_US.blogs.txt",encoding = "UTF-8",skipNul = TRUE)
newsFile <- readLines("en_US.news.txt",encoding = "UTF-8",skipNul = TRUE)
twitterFile <- readLines("en_US.Twitter.txt",encoding = "UTF-8",skipNul = TRUE)

# Counting words and lines in each file
wordsBlog <- sum(stri_count_words(blogFile))
linesBlog <- NROW(blogFile)
sizeBlog <- file.size("en_US.blogs.txt")   # size on disk in bytes (file.size expects a path, not the text)
wordsNews <- sum(stri_count_words(newsFile))
linesNews <- NROW(newsFile)
wordsTwitter <- sum(stri_count_words(twitterFile))
linesTwitter <- NROW(twitterFile)
# Storing the results in a data frame
textNLines <- data.frame(en_US.blogs   = c(wordsBlog, linesBlog),
                         en_US.news    = c(wordsNews, linesNews),
                         en_US.Twitter = c(wordsTwitter, linesTwitter),
                         row.names = c("Total Words", "Total Lines"))
textNLines
##             en_US.blogs en_US.news en_US.Twitter
## Total Words    37546239    2674536      30093413
## Total Lines      899288     899288       2360148
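
The distribution of words per line can also be summarized to compare line lengths across the three sources; for example:

# Words-per-line summaries (five-number summary plus mean)
summary(stri_count_words(blogFile))
summary(stri_count_words(newsFile))
summary(stri_count_words(twitterFile))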

Creating Sample Data

As the original files (blogs, news, Twitter) are extremely large, a smaller sample is generated to study the data. The first 35,000 lines of each file are read and written out to form the sample corpus.

# Generating samples: read the first 35,000 lines of each file
sampleBlogFile <- readLines("en_US.blogs.txt", n = 35000, encoding = "UTF-8", skipNul = TRUE)
sampleNewsFile <- readLines("en_US.news.txt", n = 35000, encoding = "UTF-8", skipNul = TRUE)
sampleTwitterFile <- readLines("en_US.Twitter.txt", n = 35000, encoding = "UTF-8", skipNul = TRUE)

# Saving the samples to new files
write.table(sampleBlogFile, "sampleblog.txt", row.names = FALSE)
write.table(sampleNewsFile, "samplenews.txt", row.names = FALSE)
write.table(sampleTwitterFile, "sampletwitter.txt", row.names = FALSE)
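
Taking the first 35,000 lines is the simplest approach, but it may over-represent whatever happens to appear at the start of each file. A more representative sample could be drawn at random from the full files read earlier; a minimal sketch (the sampleBlogRandom, sampleNewsRandom, and sampleTwitterRandom names are illustrative):

set.seed(1234)                                   # fixed seed for reproducibility
sampleBlogRandom    <- sample(blogFile, 35000)   # 35,000 randomly chosen lines
sampleNewsRandom    <- sample(newsFile, 35000)
sampleTwitterRandom <- sample(twitterFile, 35000)
# These could then be written out with writeLines() in place of the files above.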

Generating Corpus From Sample Data

# folderPath points to the folder containing only the three sample files written above;
# DirSource() reads every file in that directory into the corpus
folderPath <- "C:/Users/Super/Desktop/swiftkeycapstone/final/en_US/TextMining"
corpusData <- VCorpus(DirSource(folderPath))
inspect(corpusData)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 8087197
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 7185028
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 2478937

Cleaning Corpus Data

# Helper transformations (applied below with tm_map)
removeURL      <- function(x) gsub("http[[:alnum:]]*", "", x)        # strip tokens beginning with http
removeSign     <- function(x) gsub("[[:punct:]]", "", x)             # strip all punctuation
removeNum      <- function(x) gsub("[[:digit:]]", "", x)             # strip digits
removeapo      <- function(x) gsub("'", "", x)                       # strip apostrophes
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")  # drop non-ASCII characters
removerepeat   <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x)  # collapse letters repeated 3+ times to 2
removeSpace    <- function(x) gsub("\\s+", " ", x)                   # collapse runs of whitespace
removeTh       <- function(x) gsub(" th", "", x)                     # crude "the" removal; merges words (e.g. "of the" -> "ofe"), visible in the n-gram output below
removeHyphen   <- function(x) gsub("-", " ", x)                      # hyphens to spaces
removeColon    <- function(x) gsub(":", " ", x)                      # colons to spaces
removeComma    <- function(x) gsub("'", "", x)                       # duplicate of removeapo (strips apostrophes)
removeC        <- function(x) gsub(",", "", x)                       # strip commas
removeDc       <- function(x) gsub(", ", " ", x)                     # comma followed by space to single space
removeEqual    <- function(x) gsub("=", " ", x)                      # equals signs to spaces
removeBar      <- function(x) gsub("_", "", x)                       # strip underscores

# Apply the transformations in sequence (note: removeSign already strips most punctuation, so removePunctuation later is largely redundant)
corpusData <- tm_map(corpusData,content_transformer(removeDc))
corpusData <- tm_map(corpusData,content_transformer(removeC))
corpusData <- tm_map(corpusData,content_transformer(removeBar))
corpusData <- tm_map(corpusData,content_transformer(removeEqual))
corpusData <- tm_map(corpusData,content_transformer(removeComma))
corpusData <- tm_map(corpusData,content_transformer(removeColon))
corpusData <- tm_map(corpusData,content_transformer(removeHyphen))
corpusData <- tm_map(corpusData,content_transformer(removeURL))
corpusData <- tm_map(corpusData,content_transformer(removeSign))
corpusData <- tm_map(corpusData,content_transformer(removeNum))
corpusData <- tm_map(corpusData,content_transformer(removeapo))
corpusData <- tm_map(corpusData,content_transformer(removeNonASCII))
corpusData <- tm_map(corpusData,content_transformer(removerepeat))
corpusData <- tm_map(corpusData,content_transformer(removeSpace))
corpusData <- tm_map(corpusData,content_transformer(removeTh))
corpusData <- tm_map(corpusData,content_transformer(removePunctuation))
corpusData <- tm_map(corpusData,removeNumbers)
corpusData <- tm_map(corpusData,removeWords,stopwords("english"))
corpusData <- tm_map(corpusData,stripWhitespace)
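
To spot-check the cleaning, the first few lines of one cleaned document can be printed; a minimal sketch:

head(content(corpusData[[1]]), 3)   # first three lines of the first (cleaned) document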

The sample corpus is now clean. Next it is tokenized with RWeka into unigrams, bigrams, and trigrams to analyze the frequency of the words.

Exploratory Analysis

UniGram

A 1-gram (unigram) is a single word (token) from the corpus.

library(RWeka)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
dtmUni <- TermDocumentMatrix(corpusData, control = list(tokenize = unigram))
matrixUni <- as.data.frame(as.matrix(dtmUni))
sortUni <- sort(rowSums(matrixUni), decreasing = TRUE)
dfUni <- data.frame(word = names(sortUni), freq = sortUni)
head(dfUni)
##      word  freq
## the   the 17333
## one   one 14895
## ofe   ofe 14108
## ine   ine 13134
## will will  9469
## said said  8886
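
Since ggplot2 is already loaded, the most frequent unigrams can be visualized with a bar chart. A minimal sketch using dfUni from above (the same pattern works for dfBigram and dfTrigram below):

ggplot(head(dfUni, 20), aes(x = reorder(word, freq), y = freq)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(x = "Unigram", y = "Frequency", title = "Top 20 Unigrams in the Sample Corpus")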

BiGram

A 2-gram (bigram) is a contiguous sequence of two words from the corpus.

bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtmBigram <- TermDocumentMatrix(corpusData, control = list(tokenize = bigram))
matrixBigram <- as.data.frame(as.matrix(dtmBigram))
sortBigram <- sort(rowSums(matrixBigram), decreasing = TRUE)
dfBigram <- data.frame(word = names(sortBigram), freq = sortBigram)
head(dfBigram)
##            word freq
## i dont   i dont 1370
## one ofe one ofe 1116
## i can     i can 1006
## i just   i just  935
## i will   i will  844
## i love   i love  816

TriGram

A 3-gram (trigram) is a contiguous sequence of three words from the corpus.

trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtmTrigram <- TermDocumentMatrix(corpusData, control = list(tokenize = trigram))
matrixTrigram <- as.data.frame(as.matrix(dtmTrigram))
sortTrigram <- sort(rowSums(matrixTrigram), decreasing = TRUE)
dfTrigram <- data.frame(word = names(sortTrigram), freq = sortTrigram)
head(dfTrigram)
##                            word freq
## i dont know         i dont know  293
## fore first time fore first time  188
## i feel like         i feel like  137
## i dont want         i dont want  129
## ate end ofe         ate end ofe  126
## i know i               i know i  121

WordClouds
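
The wordcloud package loaded above can plot the most frequent unigrams directly from dfUni; a minimal sketch:

set.seed(1234)
wordcloud(words = dfUni$word, freq = dfUni$freq, max.words = 100,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))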

Next Steps: Predictive Algorithm And Shiny App

Several steps remain for the prediction and Shiny parts of this project. A larger corpus will be created from the original blogs, news, and Twitter files and tokenized into 2-grams and 3-grams; if needed, the corpus can be split into multiple parts, the 2-gram and 3-gram matrices built independently, and the results recombined. A prediction algorithm will then compare the user's input against the 2-gram and 3-gram tables, and the code will be optimized for faster processing. Finally, a Shiny interface will let a non-data-scientist interact with the model and predict the next word given a string of previous words.
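
As a preview of the prediction step, the bigram table built above can already drive a naive next-word lookup. A minimal sketch (the predictNextWord helper is illustrative, not part of the final app; the input is assumed to be a single cleaned, lowercase word):

predictNextWord <- function(previousWord, n = 3) {
  pattern <- paste0("^", previousWord, " ")             # bigrams starting with the previous word
  matches <- dfBigram[grepl(pattern, dfBigram$word), ]  # rows are already sorted by frequency
  head(sub(pattern, "", matches$word), n)               # most frequent continuations
}
predictNextWord("love")   # candidate words that most often follow "love" in the sample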