Introduction

The Capstone project goal is to build a predictive natural language text model based upon three provided datasets: (1) Twitter, (2) Blogs, and (3) News.

Loading and analyzing the datasets

Basic statistics (lines and characters) of the Twitter dataset:

fileName <- "Coursera-SwiftKey.zip"
urlName <- "http://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(urlName,fileName)
unzip(fileName)

library(stringi)  # for stri_stats_general()

con1 <- file("final/en_US/en_US.twitter.txt")
twitter_data <- readLines(con1)
## Warning in readLines(con1): line 167155 appears to contain an embedded nul
## Warning in readLines(con1): line 268547 appears to contain an embedded nul
## Warning in readLines(con1): line 1274086 appears to contain an embedded nul
## Warning in readLines(con1): line 1759032 appears to contain an embedded nul
close(con1)
stri_stats_general(twitter_data)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096031   134082634

Basic statistics (lines and characters) of the Blogs dataset:

con2 <- file("final/en_US/en_US.blogs.txt") 
blog_data <- readLines(con2)
close(con2)
stri_stats_general(blog_data)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539

Basic statistics (lines and characters) of the News dataset:

con3 <- file("final/en_US/en_US.news.txt") 
news_data <- readLines(con3)
close(con3)
stri_stats_general(news_data)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     1010242     1010242   203223154   169860866

Sampling

Creating a sample file from the first 35,000 lines (about 1.5%) of the Twitter dataset

if (!dir.exists(file.path(".", "samples"))) {
  dir.create(file.path(".", "samples"))
}
twitter_sample <- twitter_data[1:35000]
sam1 <- file("./samples/twitter.sample")
writeLines(twitter_sample, sam1)
close(sam1)

Creating a sample file from the first 110,000 lines (about 12%) of the Blogs dataset

blogs_sample <- blog_data[1:110000]
sam2 <- file("./samples/blogs.sample")
writeLines(blogs_sample, sam2)
close(sam2)

Creating a sample file from the first 120,000 lines (about 12%) of the News dataset

news_sample <- news_data[1:120000]
sam3 <- file("./samples/news.sample")
writeLines(news_sample, sam3)
close(sam3)

Data Exploration

Creating a corpus from the sample files of the three datasets and cleaning it (whitespace, stop words, punctuation, numbers, profanity)

# top profanity words in the US
profanity_list <- c("shit", "fuck", "damn", "bitch", "crap", "piss", "dick", "darn", "cock", "pussy", "asshole", "fag", "bastard", "slut", "douche");

sampleDir <- paste(getwd(), "/samples", sep = "")
target_dir <- file.path(sampleDir)
corpus <- VCorpus(DirSource(target_dir))
corpus1 <- tm_map(corpus, stripWhitespace)
corpus1 <- tm_map(corpus1, removeWords, stopwords("english"))
corpus1 <- tm_map(corpus1, removePunctuation)
corpus1 <- tm_map(corpus1, removeNumbers)
corpus1 <- tm_map(corpus1, removeWords, profanity_list)

Creating a frequency plot for uni-grams

library(dplyr)    # for the %>% pipe
library(ggplot2)  # for the frequency plots

dtm1 <- DocumentTermMatrix(corpus1)
freq1 <- sort(colSums(as.matrix(dtm1)), decreasing = TRUE)
df1 <- data.frame(word = names(freq1), freq = freq1)

subset(df1, freq > 1000) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity", fill = "maroon") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Creating a word cloud plot for uni-grams
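The word cloud is drawn from the same df1 frequency table; a minimal sketch, assuming the wordcloud and RColorBrewer packages are installed, could look like this:

library(wordcloud)
library(RColorBrewer)

set.seed(1234)  # fix the layout for reproducibility
wordcloud(words = df1$word, freq = df1$freq, min.freq = 1000,
          max.words = 100, random.order = FALSE, colors = brewer.pal(8, "Dark2"))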

Creating a frequency plot for bi-grams

library(RWeka)  # for NGramTokenizer()

BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
dtm2 <- DocumentTermMatrix(corpus1, control = list(tokenize = BigramTokenizer))
freq2 <- sort(colSums(as.matrix(dtm2)), decreasing = TRUE)
df2 <- data.frame(word = names(freq2), freq = freq2)

subset(df2, freq > 100) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity", fill = "purple") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Creating a word cloud plot for bi-grams
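The bi-gram word cloud can be drawn the same way from df2, keeping, for example, bi-grams that occur more than 100 times (same assumed wordcloud package as above):

set.seed(1234)
wordcloud(words = df2$word, freq = df2$freq, min.freq = 100,
          max.words = 50, random.order = FALSE, colors = brewer.pal(8, "Dark2"))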

Creating a frequency plot for tri-grams

TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
dtm3 <- DocumentTermMatrix(corpus1, control = list(tokenize = TrigramTokenizer))
freq3 <- sort(colSums(as.matrix(dtm3)), decreasing = TRUE)
df3 <- data.frame(word = names(freq3), freq = freq3)

subset(df3, freq > 10) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Creating a word cloud plot for tri-grams
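Analogously for tri-grams, using the df3 frequencies and the same cut-off as the bar plot (again a sketch with the assumed wordcloud package):

set.seed(1234)
wordcloud(words = df3$word, freq = df3$freq, min.freq = 10,
          max.words = 50, random.order = FALSE, colors = brewer.pal(8, "Dark2"))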

Conclusion

By analyzing the resulting n-grams and identifying the most frequent patterns in the combined Twitter, Blogs, and News data, we will be able to build a predictive text model: given the words already typed, it will suggest the continuation with the highest probability of occurrence in the sample datasets.
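As an illustration only, and not part of the exploration code above, a crude next-word lookup on the tri-gram table could work as sketched below; predict_next is a hypothetical helper name and df3 is the tri-gram frequency table built earlier.

# Hypothetical sketch: return the most frequent tri-gram completion for a
# two-word prefix, using the df3 frequency table from the exploration step.
predict_next <- function(prefix, trigram_df) {
  # prefix is treated as a simple regular expression anchored at the start
  matches <- trigram_df[grepl(paste0("^", prefix, " "), trigram_df$word), ]
  if (nrow(matches) == 0) return(NA_character_)
  best <- as.character(matches$word[which.max(matches$freq)])
  tail(strsplit(best, " ")[[1]], 1)  # last token of the best matching tri-gram
}

predict_next("thanks for", df3)  # most likely third word, if the prefix occurs in df3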