The goal of this capstone is to mimic the experience of a working Data Scientist by applying techniques learned across all 9 specialization courses to create a Data Product and a presentation for SwiftKey.
For week 2 the main objectives are to build a sample corpus, construct the 2-gram and 3-gram term-document matrices, and perform exploratory analysis on the words.
The data is available to be downloaded from
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
dataUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(dataUrl,destfile = "swiftkeydata.zip",method = "curl")
unzip("swiftkeydata.zip")
The zip archive is extracted and contains three working files per language. The following packages are loaded for the analysis:
library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
The en_US folder contains three working files (blogs, news, and twitter). We now calculate basic summary statistics for each file:
# Reading All Three Files
blogFile <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsFile <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitterFile <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Calculating word counts, line counts and file size
wordsBlog <- sum(stri_count_words(blogFile))
linesBlog <- NROW(blogFile)
sizeBlog <- file.size("en_US.blogs.txt")
wordsNews <- sum(stri_count_words(newsFile))
linesNews <- NROW(newsFile)
wordsTwitter <- sum(stri_count_words(twitterFile))
linesTwitter <- NROW(twitterFile)
# Storing the results in a data frame
textNLines <- data.frame(en_US.blogs = c(wordsBlog, linesBlog),
                         en_US.news = c(wordsNews, linesNews),
                         en_US.Twitter = c(wordsTwitter, linesTwitter),
                         row.names = c("Total Words", "Total Lines"))
textNLines
##             en_US.blogs en_US.news en_US.Twitter
## Total Words    37546239    2674536      30093413
## Total Lines      899288     899288       2360148
As the original blogs, news, and twitter files are extremely large, a small sample will be generated to study the data. A portion of the contents of each file is sampled to create the sample corpus.
# Generating Samples: read the first 35,000 lines of each file
sampleBlogFile <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE, n = 35000)
sampleNewsFile <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, n = 35000)
sampleTwitterFile <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, n = 35000)
# Saving Samples in new files
write.table(sampleBlogFile, "sampleblog.txt", row.names = FALSE)
write.table(sampleNewsFile, "samplenews.txt", row.names = FALSE)
write.table(sampleTwitterFile, "sampletwitter.txt", row.names = FALSE)
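Reading only the first 35,000 lines biases the sample toward the beginning of each file; a random sample is usually more representative. A minimal sketch of that alternative, assuming the full files are already loaded above as blogFile, newsFile and twitterFile and keeping the same sample size of 35,000 lines:
# Draw a random sample of 35,000 lines from each source (seed chosen arbitrarily)
set.seed(1234)
sampleSize <- 35000
sampleBlogFile <- sample(blogFile, sampleSize)
sampleNewsFile <- sample(newsFile, sampleSize)
sampleTwitterFile <- sample(twitterFile, sampleSize)
# writeLines() stores the text as plain lines, without the header row and quoting added by write.table()
writeLines(sampleBlogFile, "sampleblog.txt")
writeLines(sampleNewsFile, "samplenews.txt")
writeLines(sampleTwitterFile, "sampletwitter.txt")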
folderPath <- "C:/Users/Super/Desktop/swiftkeycapstone/final/en_US/TextMining"
corpusData <- VCorpus(DirSource(folderPath))
inspect(corpusData)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 8087197
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 7185028
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2478937
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)              # strip URLs starting with http
removeSign <- function(x) gsub("[[:punct:]]", "", x)                  # strip all punctuation characters
removeNum <- function(x) gsub("[[:digit:]]", "", x)                   # strip digits
removeapo <- function(x) gsub("'", "", x)                             # strip apostrophes
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")   # drop non-ASCII characters
removerepeat <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x) # collapse letters repeated 3+ times to 2
removeSpace <- function(x) gsub("\\s+", " ", x)                       # collapse runs of whitespace to one space
removeTh <- function(x) gsub(" th", "", x)                            # drop the substring " th" (also shortens words such as " the")
removeHyphen <- function(x) gsub("-", " ", x)                         # replace hyphens with spaces
removeColon <- function(x) gsub(":", " ", x)                          # replace colons with spaces
removeComma <- function(x) gsub("'", "", x)                           # strip apostrophes (same pattern as removeapo)
removeC <- function(x) gsub(",", "", x)                               # strip commas
removeDc <- function(x) gsub(", ", " ", x)                            # replace comma-space with a single space
removeEqual <- function(x) gsub("=", " ", x)                          # replace equals signs with spaces
removeBar <- function(x) gsub("_", "", x)                             # strip underscores
corpusData <- tm_map(corpusData,content_transformer(removeDc))
corpusData <- tm_map(corpusData,content_transformer(removeC))
corpusData <- tm_map(corpusData,content_transformer(removeBar))
corpusData <- tm_map(corpusData,content_transformer(removeEqual))
corpusData <- tm_map(corpusData,content_transformer(removeComma))
corpusData <- tm_map(corpusData,content_transformer(removeColon))
corpusData <- tm_map(corpusData,content_transformer(removeHyphen))
corpusData <- tm_map(corpusData,content_transformer(removeURL))
corpusData <- tm_map(corpusData,content_transformer(removeSign))
corpusData <- tm_map(corpusData,content_transformer(removeNum))
corpusData <- tm_map(corpusData,content_transformer(removeapo))
corpusData <- tm_map(corpusData,content_transformer(removeNonASCII))
corpusData <- tm_map(corpusData,content_transformer(removerepeat))
corpusData <- tm_map(corpusData,content_transformer(removeSpace))
corpusData <- tm_map(corpusData,content_transformer(removeTh))
corpusData <- tm_map(corpusData,content_transformer(removePunctuation))
corpusData <- tm_map(corpusData,removeNumbers)
corpusData <- tm_map(corpusData,removeWords,stopwords("english"))
corpusData <- tm_map(corpusData,stripWhitespace)
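To check that the transformations behaved as expected, the start of a cleaned document can be previewed; a minimal sketch, assuming the first document in corpusData is the blog sample:
# Preview the first few lines of the first cleaned document
writeLines(head(content(corpusData[[1]]), 5))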
The sample corpus is now clean. It will be tokenized using RWeka into three n-gram categories to further analyze word frequencies.
A 1-gram is a contiguous sequence of a single word from the corpus.
library(RWeka)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
dtmUni <- TermDocumentMatrix(corpusData, control = list(tokenize = unigram))
matrixUni <- as.data.frame(as.matrix(dtmUni))
sortUni <- sort(rowSums(matrixUni), decreasing = TRUE)
dfUni <- data.frame(word = names(sortUni), freq = sortUni)
head(dfUni)
## word freq
## the the 17333
## one one 14895
## ofe ofe 14108
## ine ine 13134
## will will 9469
## said said 8886
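Since ggplot2 and wordcloud are loaded above, the unigram frequencies can also be explored visually; a minimal sketch using the dfUni data frame built above (the cut-offs of 15 and 100 words are arbitrary choices):
# Bar chart of the 15 most frequent unigrams
ggplot(head(dfUni, 15), aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Word", y = "Frequency", title = "Top 15 Unigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Word cloud of the 100 most frequent unigrams
wordcloud(words = as.character(dfUni$word), freq = dfUni$freq, max.words = 100,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))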
2-gram is a contiguous sequence of two words from the corpus.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtmBigram <- TermDocumentMatrix(corpusData, control = list(tokenize = bigram))
matrixBigram <- as.data.frame(as.matrix(dtmBigram))
sortBigram <- sort(rowSums(matrixBigram), decreasing = TRUE)
dfBigram <- data.frame(word = names(sortBigram), freq = sortBigram)
head(dfBigram)
## word freq
## i dont i dont 1370
## one ofe one ofe 1116
## i can i can 1006
## i just i just 935
## i will i will 844
## i love i love 816
3-gram is a contiguous sequence of three words from the corpus.
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtmTrigram <- TermDocumentMatrix(corpusData, control = list(tokenize = trigram))
matrixTrigram <- as.data.frame(as.matrix(dtmTrigram))
sortTrigram <- sort(rowSums(matrixTrigram), decreasing = TRUE)
dfTrigram <- data.frame(word = names(sortTrigram), freq = sortTrigram)
head(dfTrigram)
## word freq
## i dont know i dont know 293
## fore first time fore first time 188
## i feel like i feel like 137
## i dont want i dont want 129
## ate end ofe ate end ofe 126
## i know i i know i 121
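The same kind of frequency plot can be drawn for the 2-gram and 3-gram tables; a minimal sketch using the dfBigram and dfTrigram data frames built above:
# Bar chart of the 15 most frequent bigrams
ggplot(head(dfBigram, 15), aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Bigram", y = "Frequency", title = "Top 15 Bigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of the 15 most frequent trigrams
ggplot(head(dfTrigram, 15), aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Trigram", y = "Frequency", title = "Top 15 Trigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))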
A number of steps remain for the prediction/Shiny part of this project. A larger corpus will be created from the original blogs, news and twitter files and tokenized into 2-grams and 3-grams; it may be necessary to split the corpus into multiple parts, build the 2-gram and 3-gram matrices independently, and then recombine them into one. A prediction algorithm will be created by comparing the input text to the 2-gram and 3-gram matrices, and the code will be optimized for faster processing. The Shiny interface will allow a non-data scientist to interact with the program and attempt to predict the next word given a string of previous words.
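As a rough illustration of the comparison idea, below is a minimal sketch of a next-word lookup that checks the trigram table first and backs off to the bigram table. It assumes the dfBigram and dfTrigram frequency tables built above (already sorted by decreasing frequency); the helper name predictNextWord is hypothetical:
# Hypothetical helper: return the most frequent continuation of the last one or two words
predictNextWord <- function(input, dfBigram, dfTrigram) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  n <- length(tokens)
  if (n == 0) return(NA_character_)
  # Try the trigram table first, using the last two words as the prefix
  if (n >= 2) {
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- dfTrigram[grepl(paste0("^", prefix, " "), dfTrigram$word), ]
    if (nrow(hits) > 0)
      return(strsplit(as.character(hits$word[1]), " ")[[1]][3])
  }
  # Back off to the bigram table, using only the last word
  hits <- dfBigram[grepl(paste0("^", tokens[n], " "), dfBigram$word), ]
  if (nrow(hits) > 0)
    return(strsplit(as.character(hits$word[1]), " ")[[1]][2])
  NA_character_
}
# Example: predictNextWord("I dont", dfBigram, dfTrigram) returns "know",
# since "i dont know" is the most frequent trigram starting with "i dont"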