## Introduction

This report summarizes an exploratory analysis of the text data and lays the groundwork for building a predictive text model.
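The code in this report relies on several packages. The original setup chunk is not shown, so the following loading step is an assumption inferred from the functions used below:

library(stringi)      # stri_count_words()
library(dplyr)        # the %>% pipe
library(knitr)        # kable()
library(kableExtra)   # kable_styling()
library(tm)           # VCorpus(), tm_map()
library(RWeka)        # NGramTokenizer(), Weka_control()
library(wordcloud)    # wordcloud()
library(RColorBrewer) # brewer.pal()
library(ggplot2)      # ggplot()
library(plotly)       # ggplotly()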

fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if (!file.exists("Coursera-SwiftKey.zip")){

  download.file(fileUrl, destfile = "Coursera-SwiftKey.zip")

}

unzip("Coursera-SwiftKey.zip")

 

file.list = c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")

text <- list(blogs = "", news = "", twitter = "")

 

data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"),

                                                              c("file size, Mb", "lines", "words")))

for (i in 1:3) {

  con <- file(file.list[i], "rb")

  text[[i]] <- readLines(con, encoding = "UTF-8",skipNul = TRUE)

  close(con)

  data.summary[i,1] <- round(file.info(file.list[i])$size / 1024^2, 2)

  data.summary[i,2] <- length(text[[i]])

  data.summary[i,3] <- sum(stri_count_words(text[[i]]))

}

## Summary

data.summary %>%
  kable() %>%
  kable_styling()

|         | file size, Mb |   lines |    words |
|:--------|--------------:|--------:|---------:|
| blogs   |        200.42 |  899288 | 37546239 |
| news    |        196.28 | 1010242 | 34762395 |
| twitter |        159.36 | 2360148 | 30093413 |

These data sets are very large, so the analysis proceeds with a small sample (0.5%) of each one; the three samples are then combined into a single data set used for the rest of the analysis.

## Create the sample

set.seed(1234)  # for reproducibility

blogs   <- sample(text$blogs,   0.005 * length(text$blogs))    # 0.5% of each source
news    <- sample(text$news,    0.005 * length(text$news))
twitter <- sample(text$twitter, 0.005 * length(text$twitter))
sample_data <- c(blogs, news, twitter)

sumWords <- sum(stri_count_words(sample_data))
sumWords
## [1] 517755
writeLines(sample_data, "sample_data.txt")  # save the sample for later reuse

## Create Corpus

### Cleaning the Data

sample_data <- iconv(sample_data, 'UTF-8', 'ASCII', sub = "")  # drop non-ASCII characters (without sub, iconv returns NA for unconvertible lines)

corpus <- VCorpus(VectorSource(sample_data))  # one document per sampled line

corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%      # lower-case first so stopword matching works
  tm_map(removeWords, stopwords("english")) %>% # remove English stopwords
  tm_map(removeNumbers) %>%                     # remove numbers
  tm_map(removePunctuation) %>%                 # remove punctuation marks
  tm_map(stripWhitespace)                       # collapse repeated whitespace

## Build Tokenizers

Split the cleaned text into n-grams: single words (unigrams), word pairs (bigrams), and word triples (trigrams).
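RWeka's NGramTokenizer works on character vectors rather than tm corpora, so the cleaned corpus is first flattened back to plain text. This bridging step is an assumption (the original passes the corpus object directly):

corpus_text <- unlist(lapply(corpus, as.character))  # one character string per document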

### Unigram

unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))
unigram.df <- data.frame(table(unigram))                              # count each unigram
unigram.df <- unigram.df[order(unigram.df$Freq, decreasing = TRUE), ] # most frequent first

wordcloud(unigram.df$unigram, unigram.df$Freq, max.words = 125, random.order = TRUE, colors = brewer.pal(8, "Dark2"))
text(x = 0.5, y = 0, "Word Cloud of Unigram")

### Frequency of Unigrams

## Unigram Frequency
plotUni <- ggplot(head(unigram.df, 25), aes(reorder(unigram, Freq), Freq)) +
  geom_bar(stat = "identity", fill = "brown") + coord_flip() +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("Most frequently used words - Unigrams")

ggplotly(plotUni)

### Bigram

bigram <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))
bigram.df <- data.frame(table(bigram))
bigram.df <- bigram.df[order(bigram.df$Freq, decreasing = TRUE), ]

wordcloud(bigram.df$bigram, bigram.df$Freq, max.words = 125, random.order = TRUE, colors = brewer.pal(8, "Dark2"))
text(x = 0.5, y = 0, "Word Cloud of Bigram")

### Frequency of Bigrams

## Bigram Frequency
plotBi <- ggplot(head(bigram.df, 25), aes(reorder(bigram, Freq), Freq)) +
  geom_bar(stat = "identity", fill = "seagreen") + coord_flip() +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Most frequently used words - Bigrams")

ggplotly(plotBi)

### Trigram

trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))
trigram.df <- data.frame(table(trigram))
trigram.df <- trigram.df[order(trigram.df$Freq, decreasing = TRUE), ]

wordcloud(trigram.df$trigram, trigram.df$Freq, max.words = 125, random.order = TRUE, colors = brewer.pal(8, "Dark2"))
text(x = 0.5, y = 0, "Word Cloud of Trigram")

### Frequency of Trigrams

## Trigram Frequency
plotTri <- ggplot(head(trigram.df, 25), aes(reorder(trigram, Freq), Freq)) +
  geom_bar(stat = "identity", fill = "blue") + coord_flip() +
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Most frequently used words - Trigrams")

ggplotly(plotTri)

## Further Analysis

The next steps are to develop and test a next-word prediction algorithm built on the n-gram frequencies above, and to deploy it as a Shiny app.
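As an illustration only (the actual algorithm is yet to be designed and tested), a naive backoff predictor over the frequency tables built above might look like the hypothetical predict_next below: look up the most frequent trigram matching the last two words, back off to bigrams, then fall back to the single most frequent word.

predict_next <- function(w1, w2) {
  # The *.df tables are already sorted by Freq, so the first match is the most frequent.
  hits <- trigram.df[startsWith(as.character(trigram.df$trigram), paste0(w1, " ", w2, " ")), ]
  if (nrow(hits) > 0) {
    return(tail(strsplit(as.character(hits$trigram[1]), " ")[[1]], 1))  # last word of top trigram
  }
  # Back off: bigrams that start with the last observed word
  hits <- bigram.df[startsWith(as.character(bigram.df$bigram), paste0(w2, " ")), ]
  if (nrow(hits) > 0) {
    return(tail(strsplit(as.character(hits$bigram[1]), " ")[[1]], 1))
  }
  as.character(unigram.df$unigram[1])  # fall back to the overall most frequent word
}

predict_next("happy", "new")  # hypothetical usage; could return "year"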