Summary

The goal is to build a simple model of the relationships between words as a first step toward a predictive text application. A basic n-gram model will be used to predict the next word from the previous 1, 2, or 3 words. The model will also need to handle unseen n-grams, i.e. cases where a particular word sequence was not observed in the training data.
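For illustration, the kind of lookup the final model will perform can be sketched as a simple back-off against n-gram frequency tables. The tables used below (trigram_freq with columns w1, w2, w3, frequency; bigram_freq with columns w1, w2, frequency) are hypothetical placeholders to be built in a later step.

# Sketch only: back-off lookup against hypothetical n-gram frequency tables
predict_next_word <- function(phrase, trigram_freq, bigram_freq) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)

  # try the trigram table first, keyed on the last two words
  if (length(words) == 2) {
    hits <- subset(trigram_freq, w1 == words[1] & w2 == words[2])
    if (nrow(hits) > 0) return(hits$w3[which.max(hits$frequency)])
  }

  # back off to the bigram table, keyed on the last word only
  hits <- subset(bigram_freq, w1 == tail(words, 1))
  if (nrow(hits) > 0) return(hits$w2[which.max(hits$frequency)])

  NA_character_  # unseen n-gram: no prediction yet
}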

Download text files

The three text files are:

- a collection of tweets
- a collection of blog entries
- a collection of news items

Download text files from: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
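A minimal sketch of fetching and unpacking the archive (run once; the destination file name is illustrative):

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip")
}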

Load required library packages

Load the text mining, string processing, and plotting packages.

library(knitr)
library(tm)
library(stringi)
library(quanteda)
library(ggplot2)
library(RWeka)

Setting the environment

Read the files and get their sizes (in MB)

setwd("C:\\Users\\Ciru\\Documents\\Coursera\\Data Science Specialization\\Capstone Project\\Coursera-SwiftKey\\final\\en_US")

list.files(pattern = "^en_US.*txt$")
## [1] "en_US.blogs.txt"   "en_US.news.txt"    "en_US.twitter.txt"
size <- round(file.info(c("en_US.blogs.txt", 
                          "en_US.news.txt", 
                          "en_US.twitter.txt"))$size/1024/1024, 2)
size
## [1] 200.42 196.28 159.36
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)

# to be used in data munging step
profanity <- read.table("profanelist.txt")

Data Summary

The internal structure of the files:

#str(blogs)
#str(twitter)
#str(news)

summary(blogs)
##    Length     Class      Mode 
##    899288 character character
summary(twitter)
##    Length     Class      Mode 
##   2360148 character character
summary(news)
##    Length     Class      Mode 
##     77259 character character

Determine the number of lines, characters, and words in each file, combine the counts into a data frame, and plot them.

## Number of lines in each file
line_count <- c(length(blogs), 
           length(news), 
           length(twitter))

## Number of characters in each file
char_count <- c(sum(nchar(blogs)), 
          sum(nchar(news)), 
          sum(nchar(twitter)))

## Number of words
word_count <- c(sum(stri_count_words(blogs)), 
           sum(stri_count_words(news)), 
           sum(stri_count_words(twitter)))

filesstats <- cbind(line_count, char_count, word_count)
colnames(filesstats) <- c("Lines", "Characters", "Words")
rownames(filesstats) <- c("Blogs", "News", "Twitter")

filesstats
##           Lines Characters    Words
## Blogs    899288  206824505 37546246
## News      77259   15639408  2674536
## Twitter 2360148  162096241 30093410
# create a data frame to store counts
df <- data.frame(text.source = c("blogs", "news", "twitter"),
                 char.count = NA, line.count = NA, word.count = NA)

df$char.count <- char_count
df$line.count <- line_count 
df$word.count <- word_count

# plot the counts
g_char <-   ggplot(df, aes(x = factor(text.source), y = char.count/1e+06)) +
            geom_bar(stat = "identity") +
            labs(y = "No. of characters (millions)", x = "Text File", title = "Number of characters per text file") 
g_char

g_lines <-  ggplot(df, aes(x = factor(text.source), y = line.count/1e+06)) +
            geom_bar(stat = "identity") +
            labs(y = "No. of lines (millions)", x = "Text File", title = "Number of lines per text file") 
g_lines

g_words <-  ggplot(df, aes(x = factor(text.source), y = word.count/1e+06)) +
            geom_bar(stat = "identity") +
            labs(y = "No. of words (millions)", x = "Text File", title = "Number of words per text file") 
g_words

Random sampling

Take a random sample of 10,000 lines from each file.

# put files into a list
my.list <- list(blogs = blogs, news = news, twitter = twitter)

# create a new, empty list to store random selections
sample.list <- list(blog = NA, news = NA, twitter = NA)

# create a data frame for samples
sample.df <- data.frame(text.source = c("blog", "news", "twitter"),
                 line.count = NA, word.count = NA)

# take a random sample of 10,000 lines from each file
sampleblogs <- sample(blogs, 10000)
samplenews <- sample(news, 10000)
sampletwitter <- sample(twitter, 10000)
sample.list <- list(blogs = sampleblogs, news = samplenews, twitter = sampletwitter)

# get counts of sample.list
samplewordcount <- c(sum(stri_count_words(sampleblogs)), 
                    sum(stri_count_words(samplenews)), 
                    sum(stri_count_words(sampletwitter)))

sample.df$line.count <- sapply(sample.list, length)
sample.df$word.count <- samplewordcount
sample.df
##   text.source line.count word.count
## 1        blog      10000     414927
## 2        news      10000     344727
## 3     twitter      10000     127214

Data munging

- Remove: hash tags (#), Twitter handles (@), URLs, profane language, numbers, punctuation, high-frequency (stop) words, and extra white space
- Convert: all text to lower case
- Create: a corpus (tm Corpus class) from the sample text

# replace strings that match the patterns below with blanks
removeURL <- function(x) gsub("http[s]?://\\S+", "", x)  # match full http/https URLs
removeHashTags <- function(x) gsub("#\\S+", "", x)
removeTwitterHandles <- function(x) gsub("@\\S+", "", x)

# create the corpus
text.corpus <- tm::Corpus(VectorSource(sample.list))

# Transformations
text.corpus <- tm::tm_map(text.corpus, content_transformer(removeHashTags))
text.corpus <- tm::tm_map(text.corpus, content_transformer(removeTwitterHandles))
text.corpus <- tm::tm_map(text.corpus, content_transformer(removeURL))
text.corpus <- tm::tm_map(text.corpus, content_transformer(tolower))
text.corpus <- tm::tm_map(text.corpus, removeNumbers)
text.corpus <- tm::tm_map(text.corpus, stripWhitespace)
text.corpus <- tm::tm_map(text.corpus, removePunctuation)
text.corpus <- tm::tm_map(text.corpus, removeWords, stopwords("english"))
# profanity filtering, currently disabled (read.table returns a data frame, so pass its first column)
#text.corpus <- tm::tm_map(text.corpus, removeWords, profanity$V1)

Construct a Document Term Matrix (DTM)

For each text file

blog_dtm <- DocumentTermMatrix(text.corpus[1])
news_dtm <- DocumentTermMatrix(text.corpus[2])
twitter_dtm <- DocumentTermMatrix(text.corpus[3])

Term frequency

Determine the frequency of unique words

# Assign word counts from each DTM to a data frame
# (each DTM holds a single document, so $v aligns with $dimnames$Terms)
freq.blog_df <- data.frame(word = blog_dtm$dimnames$Terms, frequency = blog_dtm$v)
freq.news_df <- data.frame(word = news_dtm$dimnames$Terms, frequency = news_dtm$v)
freq.twitter_df <- data.frame(word = twitter_dtm$dimnames$Terms, frequency = twitter_dtm$v)

# Order by descending frequency
freq.blog_df <- plyr::arrange(freq.blog_df, -frequency)
freq.news_df <- plyr::arrange(freq.news_df, -frequency)
freq.twitter_df <- plyr::arrange(freq.twitter_df, -frequency)

Top 20 words in each file

Plot the top 20 most frequent terms in each corpus

n <- 20L # variable to set top n words

# isolate top n words by decreasing frequency
blog.top <- freq.blog_df[1:n, ]
news.top <- freq.news_df[1:n, ]
twitter.top <- freq.twitter_df[1:n, ]

# reorder levels so charts plot in order of frequency
blog.top$word <- reorder(blog.top$word, blog.top$frequency)
news.top$word <- reorder(news.top$word, news.top$frequency)
twitter.top$word <- reorder(twitter.top$word, twitter.top$frequency)

# plots
g.blog.top <- ggplot(blog.top, aes(x = word, y = frequency)) +
              geom_bar(stat = "identity", colour="white", fill="deepskyblue4") + coord_flip() +
              labs(title = "Top 20 Most Frequent Terms: Blog Corpus")
g.blog.top

g.news.top <- ggplot(news.top, aes(x = word, y = frequency)) +
              geom_bar(stat = "identity", colour="white", fill="firebrick2") + coord_flip() +
              labs(title = "Top 20 Most Frequent Terms: News Corpus")
g.news.top

g.twitter.top <- ggplot(twitter.top, aes(x = word, y = frequency)) +
              geom_bar(stat = "identity", colour="white", fill="darkolivegreen2") + coord_flip() +
              labs(title = "Top 20 Most Frequent Terms: Twitter Corpus")
g.twitter.top

Next steps

Develop a natural language prediction algorithm and display the results via a Shiny app.
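As a preliminary step toward that algorithm, n-gram frequency tables can be built from the samples. A minimal sketch using the quanteda package loaded above, applied to the blog sample (object names such as toks and bigram_dfm are illustrative):

# Sketch: bigram frequencies from the blog sample with quanteda
toks <- quanteda::tokens(sampleblogs, remove_punct = TRUE, remove_numbers = TRUE)
toks <- quanteda::tokens_tolower(toks)
bigrams <- quanteda::tokens_ngrams(toks, n = 2, concatenator = " ")
bigram_dfm <- quanteda::dfm(bigrams)

# 20 most frequent bigrams; the same steps apply to the news and twitter samples
quanteda::topfeatures(bigram_dfm, 20)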