The final goal of this capstone project is to build a model that predicts the next word in a sentence (or clause) from the words that precede it. The goal of this milestone report, however, is simply to demonstrate that we are comfortable working with the data. Concretely, in this report we:
1. Demonstrate that we have successfully downloaded and read the data,
2. Create a basic report of summary statistics for the data,
3. Give some feedback on our plans for creating a prediction algorithm.
The dataset provided for this project is downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The full dataset is multilingual; we use only the English sub-datasets, namely “en_US.twitter.txt”, “en_US.news.txt” and “en_US.blogs.txt”.
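All of the code below relies on a handful of packages, so we load them up front: stringi for text statistics, tm for the corpus, RWeka for n-gram tokenization, and ggplot2, wordcloud and RColorBrewer for the plots.
library(stringi)       # stri_stats_general()
library(tm)            # VCorpus(), tm_map() and the cleaning transformations
library(RWeka)         # NGramTokenizer() for bigrams and trigrams
library(ggplot2)       # frequency bar charts
library(wordcloud)     # word cloud of the top unigrams
library(RColorBrewer)  # brewer.pal() colour palette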
if(!file.exists("./data")){
  dir.create("./data")
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  # download in binary mode and unpack into ./data
  download.file(url, destfile="./data/Coursera-SwiftKey.zip", mode = "wb")
  unzip(zipfile="./data/Coursera-SwiftKey.zip", exdir="./data")
}
dataBlogs <- readLines("./data/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
dataNews <- readLines("./data/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("./data/en_US/en_US.news.txt", encoding = "UTF-8", :
## incomplete final line found on './data/en_US/en_US.news.txt'
dataTwitter <- readLines("./data/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
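The “incomplete final line” warning on the news file appears to be caused by stray control bytes inside en_US.news.txt. A common workaround (shown here as an optional fix, not part of the original run) is to read the file through a binary connection:
con <- file("./data/en_US/en_US.news.txt", open = "rb")
dataNews <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)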
stri_stats_general(dataBlogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_general(dataNews)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
stri_stats_general(dataTwitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
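Character counts alone understate the size of the corpus; a quick per-file word count, using stringi’s stri_count_words() (a small extension, not in the original report), gives a better sense of scale:
wordCounts <- sapply(list(blogs = dataBlogs, news = dataNews, twitter = dataTwitter),
                     function(x) sum(stri_count_words(x)))
wordCounts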
Sample the data and create the corpus
# fix the RNG seed so the sample (and hence this report) is reproducible
set.seed(1234)
subdataBlogs <- sample(dataBlogs, size = 1000)
subdataNews <- sample(dataNews, size = 1000)
subdataTwitter <- sample(dataTwitter, size = 1000)
sampledData <- c(subdataBlogs, subdataNews, subdataTwitter)
corpus <- VCorpus(VectorSource(sampledData))
Remove stopwords, punctuation, whitespace, numbers, etc. from the corpus
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# replace a set of special characters with spaces; a character class avoids
# the invalid-regex errors that unescaped *, ? and ) would otherwise cause
corpus <- tm_map(corpus, toSpace, "[/@$:*&!?_#-]")
corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case everything
corpus <- tm_map(corpus, removeNumbers)                 # drop digits
corpus <- tm_map(corpus, removePunctuation)             # drop remaining punctuation
corpus <- tm_map(corpus, removeWords, stopwords())      # drop English stopwords
corpus <- tm_map(corpus, stemDocument)                  # stem words to their roots
corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated spaces
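To sanity-check the transformations, it helps to peek at a couple of cleaned documents (a quick check, not part of the original report):
writeLines(content(corpus[[1]]))  # content() returns the text of one cleaned document
writeLines(content(corpus[[2]]))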
Create the term-document matrices
dtm1 <- TermDocumentMatrix(corpus)  # unigrams (default tokenizer)
# RWeka tokenizers for bigrams and trigrams
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm2 <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtm3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
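The bigram and trigram matrices grow quickly with sample size. If memory becomes an issue, tm’s removeSparseTerms() can drop terms that appear in almost no documents (an optional step, not used in the original report):
# keep only terms present in at least ~0.1% of documents
dtm2small <- removeSparseTerms(dtm2, 0.999)
dtm3small <- removeSparseTerms(dtm3, 0.999)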
freq1 <- rowSums(as.matrix(dtm1))
freq1 <- sort(freq1, decreasing = TRUE)
dfFreq1 <- data.frame(word = names(freq1), freq = freq1)
# reorder() keeps the bars in frequency order instead of alphabetical order
ggplot(dfFreq1[1:20, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "red", colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("word") + ggtitle("1-gram Frequency")
freq2 <- rowSums(as.matrix(dtm2))
freq2 <- sort(freq2, decreasing = TRUE)
dfFreq2 <- data.frame(word = names(freq2), freq = freq2)
ggplot(dfFreq2[1:20, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "red", colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("word") + ggtitle("2-gram Frequency")
freq3 <- rowSums(as.matrix(dtm3))
freq3 <- sort(freq3, decreasing = TRUE)
dfFreq3 <- data.frame(word = names(freq3), freq = freq3)
ggplot(dfFreq3[1:20, ], aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity", fill = "red", colour = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("word") + ggtitle("3-gram Frequency")
wordcloud(dfFreq1$word[1:100], dfFreq1$freq[1:100], colors=brewer.pal(8, "Dark2"))
## Warning in wordcloud(dfFreq1$word[1:100], dfFreq1$freq[1:100], colors =
## brewer.pal(8, : will could not be fit on page. It will not be plotted.
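Looking ahead to the prediction task, the n-gram frequency tables above already support a crude baseline: look up the last two words of a phrase in the trigram table, back off to the bigram table, then fall back to the most frequent unigram. A minimal sketch (a hypothetical helper, assuming dfFreq1–dfFreq3 as built above, with n-grams stored as space-separated strings):
# predict the next word from the end of `phrase` via simple backoff
predictNext <- function(phrase) {
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2)
  # try trigrams starting with the last two words (tables are sorted by freq)
  hits <- grep(paste0("^", paste(words, collapse = " "), " "), dfFreq3$word, value = TRUE)
  if (length(hits) > 0) return(sub(".* ", "", hits[1]))
  # back off to bigrams starting with the last word
  hits <- grep(paste0("^", tail(words, 1), " "), dfFreq2$word, value = TRUE)
  if (length(hits) > 0) return(sub(".* ", "", hits[1]))
  # final fallback: the single most frequent unigram
  as.character(dfFreq1$word[1])
}
predictNext("thanks for the")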
From observing the dataset and browsing the course discussion forums, there are a number of issues to solve to achieve the final goal of this project:
For further analysis, text modelling, and text prediction, we need to carry out the following studies: