The goal of this capstone is to mimic the experience of a working Data Scientist by applying techniques learned across all 9 specialization courses to create a Data Product and a presentation for SwiftKey.
For week 2 the main objectives are to build a sample corpus, construct the 2-gram and 3-gram term-document matrices, and perform exploratory analysis on the words.
The data is available to be downloaded from
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
dataUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(dataUrl,destfile = "swiftkeydata.zip",method = "curl")
unzip("swiftkeydata.zip")
The zip archive is extracted and contains three working files per language. The following packages are loaded for the analysis:
library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(SnowballC)
The en_US folder contains three working files (blogs, news, and twitter). We now calculate basic summary statistics for each file:
# Reading All Three Files
blogFile <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
newsFile <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitterFile <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Calculating word counts, line counts and file size
wordsBlog <- sum(stri_count_words(blogFile))
linesBlog <- NROW(blogFile)
sizeBlog <- file.size("en_US.blogs.txt")
wordsNews <- sum(stri_count_words(newsFile))
linesNews <- NROW(newsFile)
wordsTwitter <- sum(stri_count_words(twitterFile))
linesTwitter <- NROW(twitterFile)
# Storing the results in a data frame
textNLines <- data.frame(en_US.blogs = c(wordsBlog, linesBlog),
                         en_US.news = c(wordsNews, linesNews),
                         en_US.Twitter = c(wordsTwitter, linesTwitter),
                         row.names = c("Total Words", "Total Lines"))
textNLines
##             en_US.blogs en_US.news en_US.Twitter
## Total Words    37546239    2674536      30093413
## Total Lines      899288     899288       2360148
As the original blogs, news, and twitter files are extremely large, a small sample will be generated to study the data. A portion of the contents of each file is sampled to create the sample corpus.
# Generating Samples: read the first 35,000 lines of each file
sampleBlogFile <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE, n = 35000)
sampleNewsFile <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE, n = 35000)
sampleTwitterFile <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE, n = 35000)
# Saving Samples in new files
write.table(sampleBlogFile, "sampleblog.txt", row.names = FALSE)
write.table(sampleNewsFile, "samplenews.txt", row.names = FALSE)
write.table(sampleTwitterFile, "sampletwitter.txt", row.names = FALSE)
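Reading only the first 35,000 lines biases the sample toward the beginning of each file; a random sample is usually more representative. A minimal sketch of that alternative, assuming the full files are already loaded above as blogFile, newsFile and twitterFile and keeping the same sample size of 35,000 lines:
# Draw a random sample of 35,000 lines from each source (seed chosen arbitrarily)
set.seed(1234)
sampleSize <- 35000
sampleBlogFile <- sample(blogFile, sampleSize)
sampleNewsFile <- sample(newsFile, sampleSize)
sampleTwitterFile <- sample(twitterFile, sampleSize)
# writeLines() stores the text as plain lines, without the header row and quoting added by write.table()
writeLines(sampleBlogFile, "sampleblog.txt")
writeLines(sampleNewsFile, "samplenews.txt")
writeLines(sampleTwitterFile, "sampletwitter.txt")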
folderPath <- "C:/Users/Super/Desktop/swiftkeycapstone/final/en_US/TextMining"
corpusData <- VCorpus(DirSource(folderPath))
inspect(corpusData)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 8087197
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 7185028
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 2478937
removeURL <- function(x) gsub("http[[:alnum:]]*", "", x)              # strip URLs starting with http
removeSign <- function(x) gsub("[[:punct:]]", "", x)                  # strip all punctuation characters
removeNum <- function(x) gsub("[[:digit:]]", "", x)                   # strip digits
removeapo <- function(x) gsub("'", "", x)                             # strip apostrophes
removeNonASCII <- function(x) iconv(x, "latin1", "ASCII", sub = "")   # drop non-ASCII characters
removerepeat <- function(x) gsub("([[:alpha:]])\\1{2,}", "\\1\\1", x) # collapse letters repeated 3+ times to 2
removeSpace <- function(x) gsub("\\s+", " ", x)                       # collapse runs of whitespace to one space
removeTh <- function(x) gsub(" th", "", x)                            # drop the substring " th" (also shortens words such as " the")
removeHyphen <- function(x) gsub("-", " ", x)                         # replace hyphens with spaces
removeColon <- function(x) gsub(":", " ", x)                          # replace colons with spaces
removeComma <- function(x) gsub("'", "", x)                           # strip apostrophes (same pattern as removeapo)
removeC <- function(x) gsub(",", "", x)                               # strip commas
removeDc <- function(x) gsub(", ", " ", x)                            # replace comma-space with a single space
removeEqual <- function(x) gsub("=", " ", x)                          # replace equals signs with spaces
removeBar <- function(x) gsub("_", "", x)                             # strip underscores
corpusData <- tm_map(corpusData,content_transformer(removeDc))
corpusData <- tm_map(corpusData,content_transformer(removeC))
corpusData <- tm_map(corpusData,content_transformer(removeBar))
corpusData <- tm_map(corpusData,content_transformer(removeEqual))
corpusData <- tm_map(corpusData,content_transformer(removeComma))
corpusData <- tm_map(corpusData,content_transformer(removeColon))
corpusData <- tm_map(corpusData,content_transformer(removeHyphen))
corpusData <- tm_map(corpusData,content_transformer(removeURL))
corpusData <- tm_map(corpusData,content_transformer(removeSign))
corpusData <- tm_map(corpusData,content_transformer(removeNum))
corpusData <- tm_map(corpusData,content_transformer(removeapo))
corpusData <- tm_map(corpusData,content_transformer(removeNonASCII))
corpusData <- tm_map(corpusData,content_transformer(removerepeat))
corpusData <- tm_map(corpusData,content_transformer(removeSpace))
corpusData <- tm_map(corpusData,content_transformer(removeTh))
corpusData <- tm_map(corpusData,content_transformer(removePunctuation))
corpusData <- tm_map(corpusData,removeNumbers)
corpusData <- tm_map(corpusData,removeWords,stopwords("english"))
corpusData <- tm_map(corpusData,stripWhitespace)
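To check that the transformations behaved as expected, the start of a cleaned document can be previewed; a minimal sketch, assuming the first document in corpusData is the blog sample:
# Preview the first few lines of the first cleaned document
writeLines(head(content(corpusData[[1]]), 5))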
The sample corpus is now clean. It will be tokenized using RWeka into three n-gram categories to further analyze word frequencies.
A 1-gram is a contiguous sequence of a single word from the corpus.
library(RWeka)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
dtmUni <- TermDocumentMatrix(corpusData, control = list(tokenize = unigram))
matrixUni <- as.data.frame(as.matrix(dtmUni))
sortUni <- sort(rowSums(matrixUni), decreasing = TRUE)
dfUni <- data.frame(word = names(sortUni), freq = sortUni)
head(dfUni)
## word freq
## the the 17333
## one one 14895
## ofe ofe 14108
## ine ine 13134
## will will 9469
## said said 8886
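Since ggplot2 and wordcloud are loaded above, the unigram frequencies can also be explored visually; a minimal sketch using the dfUni data frame built above (the cut-offs of 15 and 100 words are arbitrary choices):
# Bar chart of the 15 most frequent unigrams
ggplot(head(dfUni, 15), aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Word", y = "Frequency", title = "Top 15 Unigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Word cloud of the 100 most frequent unigrams
wordcloud(words = as.character(dfUni$word), freq = dfUni$freq, max.words = 100,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))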
2-gram is a contiguous sequence of two words from the corpus.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtmBigram <- TermDocumentMatrix(corpusData, control = list(tokenize = bigram))
matrixBigram <- as.data.frame(as.matrix(dtmBigram))
sortBigram <- sort(rowSums(matrixBigram), decreasing = TRUE)
dfBigram <- data.frame(word = names(sortBigram), freq = sortBigram)
head(dfBigram)
## word freq
## i dont i dont 1370
## one ofe one ofe 1116
## i can i can 1006
## i just i just 935
## i will i will 844
## i love i love 816
3-gram is a contiguous sequence of three words from the corpus.
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtmTrigram <- TermDocumentMatrix(corpusData, control = list(tokenize = trigram))
matrixTrigram <- as.data.frame(as.matrix(dtmTrigram))
sortTrigram <- sort(rowSums(matrixTrigram), decreasing = TRUE)
dfTrigram <- data.frame(word = names(sortTrigram), freq = sortTrigram)
head(dfTrigram)
## word freq
## i dont know i dont know 293
## fore first time fore first time 188
## i feel like i feel like 137
## i dont want i dont want 129
## ate end ofe ate end ofe 126
## i know i i know i 121
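The same kind of frequency plot can be drawn for the 2-gram and 3-gram tables; a minimal sketch using the dfBigram and dfTrigram data frames built above:
# Bar chart of the 15 most frequent bigrams
ggplot(head(dfBigram, 15), aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Bigram", y = "Frequency", title = "Top 15 Bigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bar chart of the 15 most frequent trigrams
ggplot(head(dfTrigram, 15), aes(x = reorder(word, -freq), y = freq)) +
  geom_col() +
  labs(x = "Trigram", y = "Frequency", title = "Top 15 Trigrams") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))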
A number of steps remain for the prediction/Shiny part of this project. A larger corpus will be created from the original blogs, news and twitter files and tokenized into 2-grams and 3-grams; it may be necessary to split the corpus into multiple parts, build the 2-gram and 3-gram matrices independently, and then recombine them into one. A prediction algorithm will be created by comparing the input text to the 2-gram and 3-gram matrices, and the code will be optimized for faster processing. The Shiny interface will allow a non-data scientist to interact with the program and attempt to predict the next word given a string of previous words.
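As a rough illustration of the comparison idea, below is a minimal sketch of a next-word lookup that checks the trigram table first and backs off to the bigram table. It assumes the dfBigram and dfTrigram frequency tables built above (already sorted by decreasing frequency); the helper name predictNextWord is hypothetical:
# Hypothetical helper: return the most frequent continuation of the last one or two words
predictNextWord <- function(input, dfBigram, dfTrigram) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  n <- length(tokens)
  if (n == 0) return(NA_character_)
  # Try the trigram table first, using the last two words as the prefix
  if (n >= 2) {
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- dfTrigram[grepl(paste0("^", prefix, " "), dfTrigram$word), ]
    if (nrow(hits) > 0)
      return(strsplit(as.character(hits$word[1]), " ")[[1]][3])
  }
  # Back off to the bigram table, using only the last word
  hits <- dfBigram[grepl(paste0("^", tokens[n], " "), dfBigram$word), ]
  if (nrow(hits) > 0)
    return(strsplit(as.character(hits$word[1]), " ")[[1]][2])
  NA_character_
}
# Example: predictNextWord("I dont", dfBigram, dfTrigram) returns "know",
# since "i dont know" is the most frequent trigram starting with "i dont"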