The objective of the Data Science Capstone is to demonstrate the data analysis and predictive modeling skills learned during the Data Science Specialization. Specifically, these skills will be applied to the analysis of text data and to natural language processing.
This milestone report will accomplish the following tasks:
NOTE: All exploratory analysis and prediction modeling steps performed throughout the Capstone are based on the data set obtained from the following link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
First, the original training data files will be examined.
library(R.utils) # R Utilities pacakge
library(stringr) # String manipulation pacakge
library(stringi) # String manipulation pacakge
library(tm) # Text mining package
library(SnowballC) # Required for stemming
library(RWeka) # R interface to Weka
library(ggplot2) # Graph plotting package
library(dplyr)
# Load the three raw training files and summarise each one (size on disk,
# number of lines, number of words) in `summary_alltextdata`.

# Read every line of a text file as UTF-8.
#
# The file is opened in binary mode ("rb") and embedded NUL bytes are skipped,
# exactly as the per-file code this helper replaces did.
#
# Args:
#   path: path of the file to read.
# Returns:
#   A character vector with one element per line of the file.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Build the one-row summary of a file that has already been loaded.
#
# Args:
#   filename: file name reported in the summary.
#   filepath: path used to look up the size on disk.
#   lines:    character vector holding the content of the file.
# Returns:
#   A one-row data frame with the columns file_name, file_size, line_count
#   and word_count.
summarise_text_file <- function(filename, filepath, lines) {
  data.frame(
    file_name = filename,
    file_size = file.size(filepath),
    line_count = length(lines),
    word_count = sum(stri_count_words(lines))
  )
}

folder <- "data"
blogsfile <- "en_US.blogs.txt"
newsfile <- "en_US.news.txt"
twitterfile <- "en_US.twitter.txt"

blogspath <- file.path(folder, blogsfile)
newspath <- file.path(folder, newsfile)
twitterpath <- file.path(folder, twitterfile)

# Report a missing input up front with a message that names the file, rather
# than failing later inside file().
for (path in c(blogspath, newspath, twitterpath)) {
  if (!file.exists(path)) {
    stop("Training data file not found: ", path, call. = FALSE)
  }
}

# Load the blogs file
blogsdata <- read_text_lines(blogspath)
# Load the news file
newsdata <- read_text_lines(newspath)
# Load the twitter file
twitterdata <- read_text_lines(twitterpath)

# Assemble the summary in a single rbind() instead of growing it step by step.
summary_alltextdata <- rbind(
  summarise_text_file(blogsfile, blogspath, blogsdata),
  summarise_text_file(newsfile, newspath, newsdata),
  summarise_text_file(twitterfile, twitterpath, twitterdata)
)
print(summary_alltextdata)
## file_name file_size line_count word_count
## 1 en_US.blogs.txt 210160014 899288 37546246
## 2 en_US.news.txt 205811889 1010242 34762395
## 3 en_US.twitter.txt 167105338 2360148 30093410
The training data files contain an extremely large amount of text, which makes processing them computationally intensive and time-consuming. Hence, a subset of the data is obtained by randomly sampling lines from the original text content.
# Draw a random sample of lines from each source, summarise the samples in
# `summary_allsampledata`, and pool them into `allsampledata`.

# Fix the random seed so that re-running the report draws the same lines and
# therefore reproduces the same counts and plots.
set.seed(1234)

# Number of lines to draw from each source.
samplesize <- 1000

# Draw `size` lines at random, without replacement.
#
# Args:
#   lines: character vector to sample from.
#   size:  number of lines wanted.
# Returns:
#   A character vector of sampled lines; all of `lines` (shuffled) when fewer
#   than `size` lines are available.
sample_lines <- function(lines, size) {
  # sample.int() avoids the 1:length(x) pitfall on empty input, and min()
  # keeps sample() from erroring when the source is smaller than `size`.
  lines[sample.int(length(lines), min(size, length(lines)))]
}

# Build the one-row summary of a sample.
#
# Args:
#   samplename: label reported in the summary.
#   lines:      the sampled lines.
# Returns:
#   A one-row data frame with the columns sample_name, sample_line_count and
#   sample_word_count.
summarise_sample <- function(samplename, lines) {
  data.frame(
    sample_name = samplename,
    sample_line_count = length(lines),
    sample_word_count = sum(stri_count_words(lines))
  )
}

# Create a sample from the original blogs data
sample_blogsdata <- sample_lines(blogsdata, samplesize)
# Create a sample from the original news data
sample_newsdata <- sample_lines(newsdata, samplesize)
# Create a sample from the original twitter data
sample_twitterdata <- sample_lines(twitterdata, samplesize)

summary_allsampledata <- rbind(
  summarise_sample("sample blogs", sample_blogsdata),
  summarise_sample("sample news", sample_newsdata),
  summarise_sample("sample twitter", sample_twitterdata)
)

allsampledata <- c(sample_blogsdata, sample_newsdata, sample_twitterdata)

# The full data sets and the per-source samples are no longer needed; drop
# them to free memory before the corpus is built.
rm(blogsdata, newsdata, twitterdata)
rm(sample_blogsdata, sample_newsdata, sample_twitterdata)
print(summary_allsampledata)
## sample_name sample_line_count sample_word_count
## 1 sample blogs 1000 42087
## 2 sample news 1000 34024
## 3 sample twitter 1000 12791
The text content contains dirty data, which can play havoc with the results. The ‘tm’ package provides very useful transformation functions that greatly simplify data cleaning work.
# Build the corpus from the pooled sample and clean it.
#
# VCorpus() is requested explicitly: in current tm releases Corpus() may
# return a SimpleCorpus, for which custom tokenizers (such as the RWeka n-gram
# tokenizer) are ignored when a document-term matrix is built.
textdata <- VCorpus(VectorSource(allsampledata))
rm(allsampledata)
## Begin data cleansing
# Lowercase first so that stopword matching is case-insensitive. tolower() is
# a base function, not a tm transformation, so it must be wrapped in
# content_transformer() to keep each element a text document. This also makes
# a trailing conversion back to PlainTextDocument unnecessary.
textdata <- tm_map(textdata, content_transformer(tolower)) # Converting to lowercase
# Stopwords are removed while apostrophes are still present; otherwise
# contractions in the stopword list (e.g. "don't", "i'm") would never match.
textdata <- tm_map(textdata, removeWords, stopwords("english")) # Removing 'stopwords'
textdata <- tm_map(textdata, removePunctuation) # Removing punctuation
# Drop everything that is not a letter, whitespace or an apostrophe.
textdata <- tm_map(textdata, content_transformer(
  function(x)
    stri_replace_all_regex(x,"[^\\p{L}\\s[']]+","")
  )
)
textdata <- tm_map(textdata, removeNumbers) # Removing numbers
textdata <- tm_map(textdata, stemDocument) # Removing common word endings (e.g., "ing", "es")
# Whitespace is collapsed last because the removal steps above leave gaps.
textdata <- tm_map(textdata, stripWhitespace) # Stripping whitespace
Now that the sample text data has been transformed and cleansed, we can perform exploratory analysis of the data to gain an understanding of the distribution and frequency of words, as well as the relationships between the words in the corpora. The ‘RWeka’ package is used to create N-gram models for the sample text data.
# Plot the most frequent uni-, bi- and tri-grams of the cleaned corpus.

# Count the n-grams of one order and keep the most frequent ones.
#
# Args:
#   corpus: the cleaned tm corpus.
#   n:      n-gram order (1 = uni-grams, 2 = bi-grams, ...).
#   top:    maximum number of n-grams to keep.
# Returns:
#   A data frame with the columns `word` and `freq`, sorted by decreasing
#   frequency, holding at most `top` rows.
top_ngram_freq <- function(corpus, n, top = 20) {
  tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  dtm <- DocumentTermMatrix(corpus, control = list(tokenize = tokenizer))
  freq <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
  # head() returns fewer rows when fewer than `top` terms exist, whereas
  # indexing with 1:20 would pad the result with NA rows.
  head(data.frame(word = names(freq), freq = freq), top)
}

# Draw a bar chart of n-gram frequencies, most frequent first.
#
# Args:
#   df:    data frame with the columns `word` and `freq`.
#   label: n-gram name used in the title and on the x axis.
# Returns:
#   The ggplot object.
plot_ngram_freq <- function(df, label) {
  # reorder() on the negated frequency orders the bars from most to least
  # frequent.
  ggplot(df, aes(x = reorder(word, -freq), y = freq)) +
    geom_bar(stat = "identity") +
    ggtitle(paste(label, "by word frequencies")) +
    xlab(label) + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# The position of each label in this vector is its n-gram order.
ngram_labels <- c("Uni-grams", "Bi-grams", "Tri-grams")
for (ngram_order in seq_along(ngram_labels)) {
  df_wordfreq <- top_ngram_freq(textdata, ngram_order)
  # Plot graph
  g <- plot_ngram_freq(df_wordfreq, ngram_labels[ngram_order])
  print(g)
}
This milestone report is created by applying text mining techniques published in the following articles:
Text Predictions with R: http://rstudio-pubs-static.s3.amazonaws.com/39014_76f8487a8fb84ed7849e96846847c295.html
Basic Text Mining in R: https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html
A Gentle Introduction to Text Mining using R: https://eight2late.wordpress.com/2015/05/27/a-gentle-introduction-to-text-mining-using-r/
It was observed that executing the various transformation functions can be computationally intensive and time-consuming due to the large amount of text in the training data.
It was therefore decided to work with a smaller random sample of the original training data in order to reduce the computational resources and processing time required. While this may reduce the effectiveness and accuracy of the final text prediction model, it was nevertheless sufficient to demonstrate the key concepts intended for this exercise.