This report summarizes an exploratory analysis of the text data and outlines the plan for building a predictive model.
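The analysis relies on the following packages, which are assumed to be installed; this chunk simply loads them before any data are read.

library(stringi)      # stri_count_words
library(dplyr)        # %>% pipe
library(knitr)        # kable
library(kableExtra)   # kable_styling
library(tm)           # Corpus, tm_map, stopwords
library(RWeka)        # NGramTokenizer, Weka_control
library(wordcloud)    # wordcloud
library(RColorBrewer) # brewer.pal
library(ggplot2)      # ggplot
library(plotly)       # ggplotly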
fileUrl <-"https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")){
download.file(fileUrl, destfile = "Coursera-SwiftKey.zip")
}
unzip("Coursera-SwiftKey.zip")
file.list <- c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
data.summary <- matrix(0, nrow = 3, ncol = 3, dimnames = list(c("blogs", "news", "twitter"),
c("file size, Mb", "lines", "words")))
for (i in 1:3) {
con <- file(file.list[i], "rb")
text[[i]] <- readLines(con, encoding = "UTF-8",skipNul = TRUE)
close(con)
data.summary[i,1] <- round(file.info(file.list[i])$size / 1024^2, 2)
data.summary[i,2] <- length(text[[i]])
data.summary[i,3] <- sum(stri_count_words(text[[i]]))
}
data.summary %>%
kable() %>%
kable_styling()
| | file size, Mb | lines | words |
|---|---|---|---|
| blogs | 200.42 | 899288 | 37546239 |
| news | 196.28 | 1010242 | 34762395 |
| twitter | 159.36 | 2360148 | 30093413 |
However, these data sets are very large, so we proceed with a small sample (0.5%) of each data set and combine the samples into a single data set for the analysis.
## Create the sample
set.seed(1234)
blogs <- sample(text$blogs, 0.005*length(text$blogs))
news <- sample(text$news, 0.005*length(text$news))
twitter <- sample(text$twitter, 0.005*length(text$twitter))
sample_data <- c(blogs, news, twitter)
sumWords <- sum(stri_count_words(sample_data))
sumWords
## [1] 517755
writeLines(sample_data, "sample_data.txt")
## Cleaning the Data
sample_data <- iconv(sample_data, 'UTF-8', 'ASCII', sub = "") # drop non-ASCII characters instead of producing NA lines
corpus <- Corpus(VectorSource(sample_data))
corpus <- corpus %>%
tm_map(content_transformer(tolower)) %>% # lower-case first (content_transformer keeps the documents as plain text)
tm_map(removeWords, stopwords("english")) %>% # remove English stopwords
tm_map(removeNumbers) %>% # remove numbers
tm_map(removePunctuation) %>% # remove punctuation marks
tm_map(stripWhitespace) # collapse extra whitespace
Next, the cleaned text is split into individual words and word sequences (unigrams, bigrams, and trigrams).
unigram <- NGramTokenizer(corpus, Weka_control(min = 1, max = 1))
unigram.df <- data.frame(table(unigram))
unigram.df <- unigram.df[order(unigram.df$Freq, decreasing = TRUE),]
wordcloud(unigram.df$unigram, unigram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=0, "Word Cloud of Unigram")
## Frequency of Unigrams
# Plot area set up
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
## Unigram Frequency
plotUni <- ggplot(head(unigram.df,25), aes(reorder(unigram,Freq), Freq)) +
geom_bar(stat="identity",fill = "brown") + coord_flip() +
xlab("Unigrams") + ylab("Frequency") +
ggtitle("Most frequently used words - Unigrams")
ggplotly(plotUni)
bigram <- NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
bigram.df <- data.frame(table(bigram))
bigram.df <- bigram.df[order(bigram.df$Freq, decreasing = TRUE),]
wordcloud(bigram.df$bigram, bigram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=0, "Word Cloud of Bigram")
## Frequency of Bigrams
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
## Bigram Frequency
plotBi <- ggplot(head(bigram.df,25), aes(reorder(bigram,Freq), Freq)) +
geom_bar(stat="identity",fill = "seagreen") + coord_flip() +
xlab("Bigrams") + ylab("Frequency") +
ggtitle("Most frequently used words - Bigrams")
ggplotly(plotBi)
trigram <- NGramTokenizer(corpus, Weka_control(min = 3, max = 3))
trigram.df <- data.frame(table(trigram))
trigram.df <- trigram.df[order(trigram.df$Freq, decreasing = TRUE),]
wordcloud(trigram.df$trigram, trigram.df$Freq, max.words=125, random.order=TRUE, colors=brewer.pal(8, "Dark2"))
text(x=0.5, y=0, "Word Cloud of Trigram")
## Frequency of Trigrams
layout(matrix(c(1,1,2,3), 2, 2, byrow = TRUE))
## Trigram Frequency
plotTri <- ggplot(head(trigram.df,25), aes(reorder(trigram,Freq), Freq)) +
geom_bar(stat="identity",fill = "blue") + coord_flip() +
xlab("Trigrams") + ylab("Frequency") +
ggtitle("Most frequently used words - Trigrams")
ggplotly(plotTri)
Planned next steps for the prediction algorithm and Shiny app:

- Build a prediction model based on the n-gram word frequencies explored above (see the sketch below)
- Integrate the model into a Shiny app that provides an interactive interface (see the skeleton after the sketch)
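As a rough illustration of the planned n-gram approach (not a final implementation), the sketch below predicts the next word by backing off from the trigram table to the bigram table and finally to the most frequent unigrams. The function name predict_next_word and the regular-expression lookup are assumptions made for this sketch; it relies on the trigram.df, bigram.df, and unigram.df tables built above, which are already sorted by frequency.

# Sketch of a back-off predictor over the n-gram frequency tables built above.
# predict_next_word() is a hypothetical helper, not the final model.
predict_next_word <- function(phrase, n = 3) {
  words <- tail(strsplit(tolower(phrase), "\\s+")[[1]], 2) # last two words of the (cleaned) phrase
  if (length(words) == 2) { # try trigrams first
    hits <- trigram.df[grepl(paste0("^", words[1], " ", words[2], " "), trigram.df$trigram), ]
    if (nrow(hits) > 0) return(head(sapply(strsplit(as.character(hits$trigram), " "), `[`, 3), n))
  }
  if (length(words) >= 1) { # back off to bigrams
    hits <- bigram.df[grepl(paste0("^", tail(words, 1), " "), bigram.df$bigram), ]
    if (nrow(hits) > 0) return(head(sapply(strsplit(as.character(hits$bigram), " "), `[`, 2), n))
  }
  head(as.character(unigram.df$unigram), n) # fall back to the most frequent unigrams
}
predict_next_word("thanks for the")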
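For the Shiny integration, a bare-bones skeleton could look like the following; the input and output names (phrase, prediction) are placeholders, and the call to predict_next_word() assumes the sketch above.

# Minimal Shiny skeleton (assumed layout; the final app will differ)
library(shiny)
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  verbatimTextOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderPrint({
    req(input$phrase) # wait until the user has typed something
    predict_next_word(input$phrase)
  })
}
shinyApp(ui = ui, server = server)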