This is the first part of the Capstone project of the Data Science Specialization. The purpose of this project is to demonstrate the ability to work with data and to write a prediction algorithm on that data.
The main purpose of the current assignment is to:
The data were provided by SwiftKey (https://swiftkey.com/) and downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The archive contains text files from three different sources; the three English files, en_US.blogs.txt, en_US.twitter.txt, and en_US.news.txt, were unarchived and saved under the ./data folder.
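For reference, the snippet below is a minimal sketch of how the archive could be downloaded and unpacked directly from R (the file paths inside the archive are assumptions based on its usual layout):

# Minimal sketch (paths inside the zip are assumed): download and unpack the data
if (!dir.exists("./data")) dir.create("./data")
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipfile <- "./data/Coursera-SwiftKey.zip"
if (!file.exists(zipfile)) download.file(url, destfile = zipfile, mode = "wb")
# Extract only the English files, dropping the folder structure inside the archive
unzip(zipfile, exdir = "./data", junkpaths = TRUE,
      files = c("final/en_US/en_US.blogs.txt",
                "final/en_US/en_US.news.txt",
                "final/en_US/en_US.twitter.txt"))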
Initial exploratory analysis is performed on the input files. The following features are computed for each file and summarized in the table below:
File                 Number_Of_Char   Number_Of_Words   File_Size (MB)
en_US.blogs.txt           208361438          36893516         210.1600
en_US.twitter.txt         162384825          29430648         167.1053
en_US.news.txt             15683765           2579113         205.8119
In its raw form, the data contains features that need to be removed before building the prediction algorithm, such as numbers, punctuation, and other non-word characters. Thus, the next step in the process is cleaning the data.
First, since the prediction algorithm will predict the next word, only the words themselves are kept: all characters other than letters and spaces are removed.
Second, for further cleaning, the tm package is used: the data is concatenated into a single object and converted to a corpus.
Third, the following additional cleaning steps are performed:
The result is clean data in the form of a long vector of text lines, each a sequence of words, which will be used in the prediction algorithm. The toy example below illustrates the effect of these cleaning steps on a single line.
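As a rough illustration (a toy example only, not taken from the corpus), the snippet below applies the same kinds of transformations to one made-up line; the full pipeline over the corpus is in the Appendix:

# Toy example: effect of the cleaning steps on one made-up line
x <- "Check out http://example.com -- 2 GREAT deals!!!"
x <- gsub("[^ a-zA-Z]", "", x)   # keep only letters and spaces
x <- tolower(x)                  # convert to lower case
x <- gsub("\\s+", " ", x)        # collapse repeated white space
x
# [1] "check out httpexamplecom great deals"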
Once the data is clean, exploratory analysis is performed on it. For estimating the next word, the most important feature of the data is the sequence of words in the text. Thus, the next step is to find:
For each group, a bar plot of the 10 most frequent words or phrases in the text data is shown. The code used to obtain these graphs is given in the Appendix below.
The appendix contains all the code used to create the results in this report.
library(NLP)
library(tm)
library(SnowballC)
library(ngram)
library(ggplot2)
library(RWekajars)
library(RWeka)
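# Note: RWeka and RWekajars load Weka through rJava, so a working Java installation is required.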
file_blog <- "./data/en_US.blogs.txt"
file_twit <- "./data/en_US.twitter.txt"
file_news <- "./data/en_US.news.txt"
con_blog <- file(file_blog, "r")
con_twit <- file(file_twit, "r")
con_news <- file(file_news, "r")
blog <- readLines(con_blog)
twit <- readLines(con_twit)
news <- readLines(con_news)
close(con_blog)
close(con_twit)
close(con_news)
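# Note (assumption): if any of these files contain embedded nul characters,
# readLines() may warn or drop content when the connection is opened in text
# mode; readLines(con, skipNul = TRUE) is one way to work around this.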
# Number of characters
nc_blog <- sum(nchar(blog), na.rm = TRUE)
nc_twit <- sum(nchar(twit), na.rm = TRUE)
nc_news <- sum(nchar(news), na.rm = TRUE)
# Number of words
library(qdap)
nw_blog <- wc(blog)
nw_twit <- wc(twit)
nw_news <- wc(news)
nw_blog_all <- sum(nw_blog, na.rm = TRUE)
nw_twit_all <- sum(nw_twit, na.rm = TRUE)
nw_news_all <- sum(nw_news, na.rm = TRUE)
# File sizes in MB (megabytes)
sz_blog <- file.info(file_blog)$size/10^6
sz_twit <- file.info(file_twit)$size/10^6
sz_news <- file.info(file_news)$size/10^6
# Consolidate data into a data.frame
file_data <- data.frame(Number_Of_Char = c(nc_blog, nc_twit, nc_news),
Number_Of_Words = c(nw_blog_all, nw_twit_all, nw_news_all),
File_Size = c(sz_blog, sz_twit, sz_news),
row.names = c("en_US.blogs.txt", "en_US.twitter.txt", "en_US.news.txt"))
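# Print the summary table shown above
file_data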
# Remove all characters other than letters
blog_clean <- gsub("[^ a-zA-Z]", "", blog)
twit_clean <- gsub("[^ a-zA-Z]", "", twit)
news_clean <- gsub("[^ a-zA-Z]", "", news)
save(blog_clean, twit_clean, news_clean, file = "CleanData.RData")
load(file = "CleanData.RData")
# Sample 10,000 lines of each dataset - for faster run
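set.seed(1234)  # fix the RNG seed so the sample is reproducible (the seed value is an arbitrary choice)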
blog_clean_samp <- blog_clean[sample(1:length(blog_clean),10000)]
twit_clean_samp <- twit_clean[sample(1:length(twit_clean),10000)]
news_clean_samp <- news_clean[sample(1:length(news_clean),10000)]
# Concatenate all samples into a single one
clean_data_samp <- c(blog_clean_samp, twit_clean_samp, news_clean_samp)
clean_data_samp <- VectorSource(clean_data_samp)
corpus_data_samp <- Corpus(clean_data_samp)
save(corpus_data_samp, file = "CorpusData.RData")
load(file = "CorpusData.RData")
# Convert all letters to lower case
corpus_data_Samp_clean <- tm_map(corpus_data_samp, content_transformer(tolower))
# Remove numbers
corpus_data_Samp_clean <- tm_map(corpus_data_Samp_clean, content_transformer(removeNumbers))
# Remove white spaces
corpus_data_Samp_clean <- tm_map(corpus_data_Samp_clean, stripWhitespace)
# Stemming
corpus_data_Samp_clean <- tm_map(corpus_data_Samp_clean, stemDocument)
# Tokenization functions
wt2 <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
wt3 <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
wt4 <- function(x) NGramTokenizer(x, Weka_control(min=4, max=4))
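# Example: wt2("this is a test") returns c("this is", "is a", "a test")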
# Most frequent single words
one_wt <- TermDocumentMatrix(corpus_data_Samp_clean)
one_wt <- removeSparseTerms(one_wt, 0.99)
one_wt_sort <- sort(rowSums(as.matrix(one_wt)), decreasing = TRUE)
one_wt_df <- data.frame(phrase = names(one_wt_sort), count = one_wt_sort)
ggplot(one_wt_df[1:10,], aes(x = phrase, y = count)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Phrase") + ylab("Frequency") +
ggtitle("First 10 Most-used Words") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Most frequent 2-word groups (bigrams)
two_wt <- TermDocumentMatrix(corpus_data_Samp_clean, control=list(tokenize=wt2))
two_wt <- removeSparseTerms(two_wt, 0.99)
two_wt_sort <- sort(rowSums(as.matrix(two_wt)), decreasing = TRUE)
two_wt_df <- data.frame(phrase = names(two_wt_sort), count = two_wt_sort)
ggplot(two_wt_df[1:10,], aes(x = phrase, y = count)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Phrase") + ylab("Frequency") +
ggtitle("First 10 Most-used 2-word Groups") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Most frequent 3-word groups (trigrams)
three_wt <- TermDocumentMatrix(corpus_data_Samp_clean, control=list(tokenize=wt3))
three_wt <- removeSparseTerms(three_wt, 0.999)
three_wt_sort <- sort(rowSums(as.matrix(three_wt)), decreasing = TRUE)
three_wt_df <- data.frame(phrase = names(three_wt_sort), count = three_wt_sort)
ggplot(three_wt_df[1:10,], aes(x = phrase, y = count)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Phrase") + ylab("Frequency") +
ggtitle("First 10 Most-used 3-word Groups") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Most frequent 4-word groups (4-grams)
four_wt <- TermDocumentMatrix(corpus_data_Samp_clean, control=list(tokenize=wt4))
four_wt <- removeSparseTerms(four_wt, 0.9999)
four_wt_sort <- sort(rowSums(as.matrix(four_wt)), decreasing = TRUE)
four_wt_df <- data.frame(phrase = names(four_wt_sort), count = four_wt_sort)
ggplot(four_wt_df[1:10,], aes(x = phrase, y = count)) +
geom_bar(stat = "identity", fill = "blue") +
xlab("Phrase") + ylab("Frequency") +
ggtitle("First 10 Most-used 4-word Groups") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
save(one_wt_df, two_wt_df, three_wt_df, four_wt_df, file = "word_groups.Rdata")
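As a preview of how these n-gram tables could feed the prediction algorithm, the sketch below (one possible approach assumed for illustration, not the final algorithm) looks up the most frequent 3-word group that starts with the last two words typed:

# Sketch only (one possible approach): predict the next word by looking up the
# most frequent 3-gram whose first two words match the (cleaned, stemmed) input.
predict_next_word <- function(last_two_words, three_gram_df = three_wt_df) {
  prefix <- paste0("^", tolower(last_two_words), " ")
  matches <- three_gram_df[grepl(prefix, three_gram_df$phrase), ]
  if (nrow(matches) == 0) return(NA_character_)
  best <- as.character(matches$phrase[which.max(matches$count)])
  tail(strsplit(best, " ")[[1]], 1)  # return the third word of the best 3-gram
}
# Hypothetical usage: predict_next_word("one of")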