This report provides a brief overview of the exploratory analysis of the text data used in the Capstone project for the Data Science Specialization, along with a description of plans for the word prediction algorithm.
Install the following packages if they are not already installed:
install.packages("stringi")
install.packages("tm")
install.packages("wordcloud")
install.packages("RColorBrewer")
install.packages("RWeka")
The following packages will be used in this project:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringi)
library(knitr)
library(tm)
## Warning: package 'tm' was built under R version 3.6.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.3
## Loading required package: RColorBrewer
library(RColorBrewer)
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.6.3
filename <- "Coursera-SwiftKey.zip"
if (!file.exists(filename)) {
  fileURL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(fileURL, filename)
}
# the archive extracts to a directory named "final"
if (!file.exists("final")) {
  unzip(filename)
}
The data consist of text from three different sources: blogs, news, and Twitter feeds, provided in four different languages: German, English (US), Finnish, and Russian. For the remainder of this project, we will use only the English (US) dataset.
files <- c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt")
text <- list(blogs = "", news = "", twitter = "")
summary <- matrix(0, nrow = 3, ncol = 3,
                  dimnames = list(c("blogs", "news", "twitter"),
                                  c("File size (Mb)", "Lines", "Words")))
for (i in 1:3) {
  # read in binary mode and skip embedded nuls, which otherwise truncate the news file
  con <- file(files[i], "rb")
  text[[i]] <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
  close(con)
  summary[i, 1] <- round(file.info(files[i])$size / 1024^2, 2)  # file size in Mb
  summary[i, 2] <- length(text[[i]])                            # line count
  summary[i, 3] <- sum(stri_count_words(text[[i]]))             # word count
}
A summary of the data is given in the table below:
kable(summary)
|         | File size (Mb) |   Lines |    Words |
|:--------|---------------:|--------:|---------:|
| blogs   |         200.42 |  899288 | 37546239 |
| news    |         196.28 | 1010242 | 34762395 |
| twitter |         159.36 | 2360148 | 30093413 |
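As a quick sanity check on these figures, the average words per line can be read directly off the summary matrix built above (an extra step beyond the original chunk); tweets should come out much shorter than blog or news lines:
# average words per line for each source
round(summary[, "Words"] / summary[, "Lines"], 1)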
These datasets are rather large, so for the remainder of the report we will use a small fraction of the data (0.5%) to perform the analysis. The three parts will be combined into a single dataset and used to generate the corpus.
set.seed(720)
blogs_data <- sample(text$blogs, 0.005*length(text$blogs))
news_data <- sample(text$news, 0.005*length(text$news))
twitter_data <- sample(text$twitter, 0.005*length(text$twitter))
combined_data <- c(blogs_data, news_data, twitter_data)
total_words <- sum(stri_count_words(combined_data))
Number of words in the new combined dataset:
total_words
## [1] 510925
We will first remove unnecessary content from the dataset, such as non-ASCII characters (e.g. emoticons), punctuation, and numbers.
# drop non-ASCII characters rather than turning whole lines into NA (sub = "")
combined_data <- iconv(combined_data, "UTF-8", "ASCII", sub = "")
corpus <- Corpus(VectorSource(combined_data))
corpus <- corpus %>%
tm_map(tolower) %>%
tm_map(PlainTextDocument) %>%
tm_map(removePunctuation) %>%
tm_map(removeNumbers) %>%
tm_map(stripWhitespace)
## Warning in tm_map.SimpleCorpus(., tolower): transformation drops documents
## Warning in tm_map.SimpleCorpus(., PlainTextDocument): transformation drops
## documents
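The warnings above are generally harmless: tm_map() warns whenever it is handed a plain function rather than one wrapped in content_transformer(). As a sketch, a warning-free variant of the same pipeline would wrap the base functions (the PlainTextDocument step becomes unnecessary in this form):
corpus <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(stripWhitespace)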
We will now calculate word frequencies and visualize the most frequent words as a word cloud.
term_doc_matrix <- TermDocumentMatrix(corpus)
term_doc_matrix <- as.matrix(term_doc_matrix)
word_freq <- sort(rowSums(term_doc_matrix), decreasing=TRUE)
df <- data.frame(word = names(word_freq), freq = word_freq)
wordcloud(df$word, df$freq, min.freq = 300, random.order = TRUE, rot.per = .25, colors = brewer.pal(8, "Dark2"))
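Because word_freq is sorted in decreasing order, the same data frame also supports a quick coverage check, for example (a hypothetical extra step, not part of the chunks above): how many unique words are needed to cover 50% and 90% of all word instances in the sample?
coverage <- cumsum(df$freq) / sum(df$freq)  # cumulative share covered by the top-ranked words
c(words_for_50pct = which(coverage >= 0.5)[1],
  words_for_90pct = which(coverage >= 0.9)[1])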
Now we will use the RWeka package to create unigrams, bigrams, and trigrams.
# NGramTokenizer expects a character vector, so extract the text from the corpus first
corpus_text <- sapply(corpus, as.character)
unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))
bigram <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))
trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))
unigram_data <- data.frame(table(unigram))
unigram_data <- unigram_data[order(unigram_data$Freq, decreasing = TRUE),]
ggplot(unigram_data[1:20,], aes(x = reorder(unigram, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Unigrams") + ylab("Frequency") +
  ggtitle("Top 20 Unigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
bigram_data <- data.frame(table(bigram))
bigram_data <- bigram_data[order(bigram_data$Freq, decreasing = TRUE),]
ggplot(bigram_data[1:20,], aes(x = reorder(bigram, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Bigrams") + ylab("Frequency") +
  ggtitle("Top 20 Bigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
trigram_data <- data.frame(table(trigram))
trigram_data <- trigram_data[order(trigram_data$Freq, decreasing = TRUE),]
ggplot(trigram_data[1:20,], aes(x = reorder(trigram, -Freq), y = Freq)) +
  geom_bar(stat = "identity") +
  xlab("Trigrams") + ylab("Frequency") +
  ggtitle("Top 20 Trigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
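These frequency tables are the starting point for the planned word prediction algorithm: given the last one or two words typed, look up the most frequent n-gram that extends them, backing off to shorter n-grams when no match is found. A minimal sketch of the trigram lookup, assuming the trigram_data table above (the function name predict_next is hypothetical, and a real model would add backoff and smoothing):
predict_next <- function(w1, w2, trigrams = trigram_data) {
  prefix <- paste(w1, w2, "")  # trigrams beginning with "w1 w2 "
  matches <- trigrams[startsWith(as.character(trigrams$trigram), prefix), ]
  if (nrow(matches) == 0) return(NA_character_)  # a real model would back off to bigrams here
  best <- as.character(matches$trigram[which.max(matches$Freq)])
  tail(strsplit(best, " ")[[1]], 1)  # return the last word of the most frequent match
}
predict_next("one", "of")  # e.g. may return "the"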