knitr::opts_chunk$set(echo=T)
suppressPackageStartupMessages(library(downloader))
suppressPackageStartupMessages(library(ngram))
suppressPackageStartupMessages(library(tm))
suppressPackageStartupMessages(library(wordcloud2))
suppressPackageStartupMessages(library(htmlwidgets))
suppressPackageStartupMessages(library(webshot))
suppressPackageStartupMessages(library(ggplot2))
This is a milestone report for the capstone project analyzing the SwiftKey dataset.
The purpose of this report is to summarize the datasets, perform exploratory analysis of the English data, and outline the plan for building the word prediction model.
The SwiftKey data contains blog entries, news entries, and Twitter feeds in four languages: German, English, Russian, and Finnish. The English dataset is processed with reference to the other three datasets so that foreign-language words embedded in the English text can be identified.
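This matching step is not shown in the code of this report; a minimal sketch, assuming a simple non-ASCII test rather than an actual lookup against the German, Russian, and Finnish files, could look like this (the helper name is_foreign is hypothetical):
# Assumption: treat tokens that cannot be converted to plain ASCII as likely
# foreign-language words; iconv() returns NA when the conversion fails
is_foreign <- function(tokens) is.na(iconv(tokens, from="UTF-8", to="ASCII"))
is_foreign(c("house", "stra\u00dfe"))  # FALSE TRUE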
# Download and extract data
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("../Coursera-SwiftKey.zip")) { download.file(Url, destfile="../Coursera-SwiftKey.zip", mode="wb") }
if(!file.exists("../final")) { unzip(zipfile="../Coursera-SwiftKey.zip", exdir="../") }
# Load data
blogs <- readLines("../final/en_US/en_US.blogs.txt", skipNul=T, encoding="UTF-8")
news <- readLines("../final/en_US/en_US.news.txt", skipNul=T, encoding="UTF-8")
twits <- readLines("../final/en_US/en_US.twitter.txt", skipNul=T, encoding="UTF-8")
# Size of the en_US datasets in kilobytes (file.info() returns bytes; divide by 1024)
blogs_size <- file.info("../final/en_US/en_US.blogs.txt")$size / 1024
news_size <- file.info("../final/en_US/en_US.news.txt")$size / 1024
twits_size <- file.info("../final/en_US/en_US.twitter.txt")$size / 1024
# Basic summary of each dataset
blogs_summ <- summary(blogs)
news_summ <- summary(news)
twits_summ <- summary(twits)
# Summary of number of characters in each dataset
blogs_cc <- summary(nchar(blogs))
news_cc <- summary(nchar(news))
twits_cc <- summary(nchar(twits))
# Number of lines in each dataset
blogs_lc <- length(blogs)
news_lc <- length(news)
twits_lc <- length(twits)
# Number of words in each dataset
blogs_wc <- wordcount(blogs)
news_wc <- wordcount(news)
twits_wc <- wordcount(twits)
# Report the summaries
rbind(blogs_summ, news_summ, twits_summ)
## Length Class Mode
## blogs_summ "899288" "character" "character"
## news_summ "77259" "character" "character"
## twits_summ "2360148" "character" "character"
rbind(blogs_cc, news_cc, twits_cc)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## blogs_cc 1 47 156 229.98695 329 40833
## news_cc 2 111 186 202.42830 270 5760
## twits_cc 2 37 64 68.68054 100 140
data.frame(c("Blogs", "News", "Twits"),
"File size"=c(blogs_size, news_size, twits_size),
"Line count"=c(blogs_lc, news_lc, twits_lc),
"Word count"=c(blogs_wc, news_wc, twits_wc),
row.names=1)
## File.size Line.count Word.count
## Blogs 205234.4 899288 37334131
## News 200988.2 77259 2643969
## Twits 163188.8 2360148 30373583
A smaller random subset of each dataset is created to ease the subsequent processing steps. A corpus is built from these subsets and then cleaned: URLs, non-alphabetic characters, punctuation, numbers, English stop words, profanity, and extra whitespace are removed, and the text is converted to lower case.
The list of profane words is obtained from Google; entries containing symbols that might interfere with regular expressions are removed from the list.
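This symbol filtering is done before the list is read below; a minimal sketch of such a filter, assuming a hypothetical raw file badwords_raw.txt and a rule that keeps only entries made of letters, apostrophes, hyphens, and spaces, is:
# Assumption: drop list entries containing characters that are special in
# regular expressions, since removeWords() builds a regex from the words
bad_raw <- readLines("./badwords_raw.txt")  # Hypothetical raw download
bad_clean <- bad_raw[!grepl("[^a-zA-Z' -]", bad_raw)]
writeLines(bad_clean, "./badwords.txt")     # File read by the chunk below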
# Create corpus from dataset subsets
set.seed(1234)  # Fix the RNG seed so the sampled subsets are reproducible
subset_size <- 10000
blogs_sub <- sample(blogs, subset_size)
news_sub <- sample(news, subset_size)
twits_sub <- sample(twits, subset_size)
corp <- VCorpus(VectorSource(c(blogs_sub, news_sub, twits_sub)))
# Clean the corpus
delPat <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
profane_words <- read.delim("./badwords.txt", header=F)[,1]
corp <- tm_map(x=corp, FUN=delPat, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corp <- tm_map(x=corp, FUN=delPat, "[^a-zA-Z ]")
corp <- tm_map(x=corp, FUN=removePunctuation)
corp <- tm_map(x=corp, FUN=removeNumbers)
corp <- tm_map(x=corp, FUN=content_transformer(FUN=tolower)) # Functions are nested to avoid error
corp <- tm_map(x=corp, FUN=removeWords, stopwords("en"))
corp <- tm_map(x=corp, FUN=removeWords, profane_words)
# corp <- tm_map(x=corp, FUN=stemDocument) # Stemming produces less intuitive n-grams
corp <- tm_map(x=corp, FUN=stripWhitespace)
corp <- tm_map(x=corp, FUN=PlainTextDocument)
# Save the corpus for future reference
saveRDS(corp, file="./corp.rds")
In Natural Language Processing, an n-gram is a contiguous sequence of n items from a given sequence of text or speech. The sentences and phrases in the corpus are broken into single words, word pairs, and word triples (unigrams, bigrams, and trigrams) through tokenization:
# Define tokenization functions
bigram <- function(x) unlist(lapply(ngrams(words(x), 2), paste, collapse=" "), use.names=FALSE)
trigram <- function(x) unlist(lapply(ngrams(words(x), 3), paste, collapse=" "), use.names=FALSE)
# Remove sparse terms to aid generalization; tune the sparse parameter to keep each matrix at a manageable size
docTermMat1 <- removeSparseTerms(x=TermDocumentMatrix(corp), sparse=0.99)
docTermMat2 <- removeSparseTerms(x=TermDocumentMatrix(corp, control=list(tokenize=bigram)), sparse=0.9993)
docTermMat3 <- removeSparseTerms(x=TermDocumentMatrix(corp, control=list(tokenize=trigram)), sparse=0.99995)
# Create frequency dataframes
freq1 <- sort(rowSums(as.matrix(docTermMat1)), decreasing=T)
freq2 <- sort(rowSums(as.matrix(docTermMat2)), decreasing=T)
freq3 <- sort(rowSums(as.matrix(docTermMat3)), decreasing=T)
df1 <- data.frame(grm=names(freq1), frq=freq1)
df2 <- data.frame(grm=names(freq2), frq=freq2)
df3 <- data.frame(grm=names(freq3), frq=freq3)
The frequencies of the most common unigrams, bigrams, and trigrams in the corpus are visualized with word clouds and frequency bar charts:
# Word clouds
cloud1 = wordcloud2(data=df1, size=0.6, fontWeight=700, shape='circle', color='random-dark')
cloud2 = wordcloud2(data=df2, size=1.2, fontWeight=600, shape='circle', color='random-dark')
cloud3 = wordcloud2(data=df3, size=0.7, fontWeight=500, shape='circle', color='random-dark')
saveWidget(widget=cloud1, file="cloud1.html", selfcontained=F)
saveWidget(widget=cloud2, file="cloud2.html", selfcontained=F)
saveWidget(widget=cloud3, file="cloud3.html", selfcontained=F)
webshot(url="cloud1.html", file="cloud1.png", vwidth=700, vheight=500, delay=10)
webshot(url="cloud2.html", file="cloud2.png", vwidth=700, vheight=500, delay=10)
webshot(url="cloud3.html", file="cloud3.png", vwidth=700, vheight=500, delay=10)
# Frequency bar charts
ggplot(df1[1:30, ], aes(reorder(grm, -frq), frq)) +
labs(title="30 Most Common Unigrams Frequency", x="Unigrams", y="Frequency") +
theme(axis.text.x=element_text(angle=45, size=12, hjust=1)) +
geom_bar(stat="identity", fill="darkblue")
ggplot(df2[1:30, ], aes(reorder(grm, -frq), frq)) +
labs(title="30 Most Common Bigrams Frequency", x="Bigrams", y="Frequency") +
theme(axis.text.x=element_text(angle=45, size=12, hjust=1)) +
geom_bar(stat="identity", fill="darkblue")
ggplot(df3[1:30, ], aes(reorder(grm, -frq), frq)) +
labs(title="30 Most Common Trigrams Frequency", x="Trigrams", y="Frequency") +
theme(axis.text.x=element_text(angle=45, size=12, hjust=1)) +
geom_bar(stat="identity", fill="darkblue")
The exploratory data analysis above characterizes the English dataset and the frequencies of its most common n-grams. The plan to continue the project is as follows:
- Use n-gram models to predict the next word in a sequence of words (see the sketch after this list).
- Create a Shiny app that lets users type text and displays the predicted next word.
- Deploy the app and present a reproducible pitch.
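As a rough preview of the prediction step, the sketch below looks up the last one or two words of the input in the trigram (df3) and bigram (df2) frequency tables built above, backing off from trigrams to bigrams when no match is found. The function name predict_next, the backoff rule, and the example phrase are illustrative assumptions, not the final model.
# Sketch: propose candidate next words from the n-gram frequency tables;
# df2 and df3 are already sorted by decreasing frequency, so the first
# matches are the most frequent candidates
predict_next <- function(input, n=3) {
  toks <- tail(strsplit(tolower(input), "\\s+")[[1]], 2)
  if (length(toks) == 2) {
    # Try trigrams whose first two words match the end of the input
    hits <- df3[grepl(paste0("^", toks[1], " ", toks[2], " "), df3$grm), ]
    if (nrow(hits) > 0)
      return(head(sapply(strsplit(as.character(hits$grm), " "), `[`, 3), n))
  }
  # Back off to bigrams whose first word matches the last input word
  hits <- df2[grepl(paste0("^", tail(toks, 1), " "), df2$grm), ]
  head(sapply(strsplit(as.character(hits$grm), " "), `[`, 2), n)
}
predict_next("happy new")  # Up to 3 candidates; empty if no n-gram matches
Note that because stop words were removed during cleaning, these tables contain only content words; the final model may need to keep stop words so that natural phrases can be completed. In the planned Shiny app, a function of this kind would sit behind the text input and supply the predicted word shown to the user.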