The goal of this report is to present what we have discovered so far by exploring the data and to show that we are on track to build our prediction algorithm.
library(knitr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(doParallel, quietly = TRUE)
library(stringi, quietly = TRUE)
library(formattable, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(tm, quietly = TRUE)
The dataset can be downloaded as a zip file from the link used in the code below.
if(!file.exists('data/')) dir.create('data/')
lnk <- 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
destfile <- 'Coursera-SwiftKey.zip'
if(!file.exists(paste0('data/', destfile))) {
  download.file(lnk, destfile = paste0('data/', destfile))
}
if(!file.exists('data/final')) {
  ## Unzip the dataset
  #'@ unzip(paste0('data/', destfile), exdir = 'data/final/de_DE', list = TRUE)
  # Error in unzip(paste0("data/", destfile), exdir = "data/final/de_DE", :
  #   'exdir' does not exist
  unzip(paste0('data/', destfile), exdir = 'data')
}
## List the contents of the zip file
unzip(paste0('data/', destfile), list = TRUE)
# Since I'm using RStudio Cloud, I need to remove the unused language folders to free up space
unlink("data/final/de_DE", recursive = TRUE)
unlink("data/final/fi_FI", recursive = TRUE)
unlink("data/final/ru_RU", recursive = TRUE)
# List the US English text files
txt_files <- list.files("data/final/en_US")
# Read the three datasets (blogs, news, twitter)
for (i in 1:3) {
  conn <- file(paste0('data/final/en_US/', txt_files[i]), open = "rb")
  nam <- gsub("en_US.", '', gsub('.txt', '', txt_files[i]))
  assign(nam, readLines(conn, encoding = "UTF-8"))
  close(conn)
}
# Gather basic information about each file/dataset
list_data <- list(blogs, news, twitter)
dataset_info_function <- function(i) {
  fileName <- gsub('.txt', '', txt_files[i])
  fileSize <- paste(round(file.info(paste0('data/final/en_US/', txt_files[i]))$size / 1024^2, digits = 2), "MB")
  other_info <- sapply(list_data[i], stri_stats_general)
  WordCount <- sapply(list_data[i], stri_stats_latex)[4, ]
  df <- cbind.data.frame(fileName, fileSize, t(other_info), WordCount)
  df <- df %>% mutate(across(where(is.numeric), comma))
  return(df)
}
datasets_info<-plyr::ldply(1:3,dataset_info_function,.progress = "text")
knitr::kable(datasets_info)
| fileName | fileSize | Lines | LinesNEmpty | Chars | CharsNWhite | WordCount |
|---|---|---|---|---|---|---|
| en_US.blogs | 200.42 MB | 899,288.00 | 899,288.00 | 206,824,382.00 | 170,389,539.00 | 37,570,839.00 |
| en_US.news | 196.28 MB | 1,010,242.00 | 1,010,242.00 | 203,223,154.00 | 169,860,866.00 | 34,494,539.00 |
| en_US.twitter | 159.36 MB | 2,360,148.00 | 2,360,148.00 | 162,096,031.00 | 134,082,634.00 | 30,451,128.00 |
Because the original files are massive, we will analyze only a small sample of the data. The sample will be stripped of special characters, extra white space, punctuation, and other noise. We will also remove profanity, using the bad-words list from Google's badwordslist archive (https://code.google.com/archive/p/badwordslist/downloads).
set.seed(2980)
smpl_Blogs <- blogs[sample(1:length(blogs), 12000, replace=FALSE)]
smpl_Twitter <- twitter[sample(1:length(twitter), 12000, replace=FALSE)]
smpl_News <- news[sample(1:length(news), 12000, replace=FALSE)]
# Drop non-ASCII characters from the sampled data
smpl_Blogs <- iconv(smpl_Blogs,"UTF-8", "ASCII", sub = "")
smpl_Twitter <- iconv(smpl_Twitter,"UTF-8", "ASCII", sub = "")
smpl_News <- iconv(smpl_News,"UTF-8", "ASCII", sub = "")
# Combine the cleaned samples into a list and drop the full datasets to free memory
smpl_data <- list(smpl_Blogs, smpl_Twitter, smpl_News)
rm(blogs, news, twitter, list_data)
corpus <- list()
dtMatrix <- list()
# badwords.txt: the bad-words list downloaded from the Google badwordslist archive linked above
profanity <- readLines("badwords.txt", n = 457)
removeProfanity <- content_transformer(function(x) {
  for (i in 1:length(profanity)) {
    # Strip stray "(" characters from the pattern, then remove the bad word
    a <- gsub("\\(", "", profanity[i])
    x <- gsub(a, "", x)
  }
  return(x)
})
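Looping gsub() over roughly 457 patterns makes this transformer slow on larger samples. A hedged alternative sketch (not used above) is to collapse the cleaned bad-word list into a single alternation pattern so each document needs only one gsub() pass; the name removeProfanityFast is illustrative only.
# Sketch: one combined regex instead of 457 separate substitutions.
# Assumes the entries contain no other regex metacharacters that would need escaping.
profanity_clean <- gsub("\\(", "", profanity)
profanity_pattern <- paste(profanity_clean, collapse = "|")
removeProfanityFast <- content_transformer(function(x) gsub(profanity_pattern, "", x))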
for (i in 1:length(smpl_data)) {
  corpus[[i]] <- Corpus(VectorSource(smpl_data[[i]]))
  # Wrap tolower in content_transformer() so the corpus structure is preserved
  corpus[[i]] <- tm_map(corpus[[i]], content_transformer(tolower))
  corpus[[i]] <- tm_map(corpus[[i]], removePunctuation)
  corpus[[i]] <- tm_map(corpus[[i]], removeNumbers)
  corpus[[i]] <- tm_map(corpus[[i]], stripWhitespace)
  corpus[[i]] <- tm_map(corpus[[i]], removeProfanity)
  corpus[[i]] <- tm_map(corpus[[i]], removeWords, stopwords("english"))
  corpus[[i]] <- tm_map(corpus[[i]], stemDocument)
  dtMatrix[[i]] <- DocumentTermMatrix(corpus[[i]],
                                      control = list(wordLengths = c(0, Inf)))
}
rm(smpl_data)
We can see what each corpus looks like using the wordcloud package. Below are word clouds for the US English blogs, Twitter, and news samples; a combined corpus of all three could be visualized the same way.
library(wordcloud, quietly = TRUE)
library(slam, quietly = TRUE)
# Set random seed for reproducibility
set.seed(2980)
# Arrange the plots in 1 row x 3 columns
par(mfrow = c(1, 3))
Headings <- c("Word Cloud - US English Blogs",
              "Word Cloud - US English Twitter",
              "Word Cloud - US English News")
# Plot a word cloud (up to 100 words) for each document-term matrix
for (i in 1:length(corpus)) {
  wordcloud(words = colnames(dtMatrix[[i]]), freq = slam::col_sums(dtMatrix[[i]]),
            scale = c(3, 1), max.words = 100, random.order = FALSE, rot.per = 0.45,
            use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
  title(Headings[i])
}
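Beyond the word clouds, the most frequent stems can also be listed directly from each document-term matrix with tm's findFreqTerms(); the frequency cut-off of 500 below is an arbitrary illustration, not a value used elsewhere in this report.
# Print stems that appear at least 500 times in each sample (threshold is illustrative)
for (i in 1:length(dtMatrix)) {
  cat(Headings[i], ":\n")
  print(findFreqTerms(dtMatrix[[i]], lowfreq = 500))
}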
The charts below show how often words and word combinations occur in each sample: the most frequent unigrams, bigrams, and trigrams.
library(dplyr, quietly = TRUE)
library(qdap, quietly = TRUE)
library(rJava, quietly = TRUE)  # .jinit(parameters="-Xmx128g")
library(RWeka, quietly = TRUE)
library(ggplot2, quietly = TRUE)
# Define a function that builds unigram, bigram and trigram frequency tables from a text sample
# and plots them together with the ggplot2 and gridExtra packages
plot.Grams <- function(x = smpl_Blogs, subTitle = "Blogs", N = 10) {
  # Use RWeka to get unigram tokens
  Tokenizer1 <- RWeka::NGramTokenizer(x, Weka_control(min = 1, max = 1))
  Gram.1 <- data.frame(table(Tokenizer1))
  Gram.1 <- Gram.1[order(Gram.1$Freq, decreasing = TRUE), ]
  colnames(Gram.1) <- c("Word", "Freq")
  Gram.1 <- head(Gram.1, N)
  g1 <- ggplot(Gram.1, aes(x = reorder(Word, Freq), y = Freq)) +
    geom_bar(stat = "identity", fill = "green") +
    ggtitle(paste("Unigrams", "-", subTitle)) +
    xlab("Unigrams") + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  # Use RWeka to get bigram tokens (delimiters include a space so tokens split on whitespace)
  Tokenizer2 <- RWeka::NGramTokenizer(x,
      Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
  Gram.2 <- data.frame(table(Tokenizer2))
  Gram.2 <- Gram.2[order(Gram.2$Freq, decreasing = TRUE), ]
  colnames(Gram.2) <- c("Word", "Freq")
  Gram.2 <- head(Gram.2, N)
  g2 <- ggplot(Gram.2, aes(x = reorder(Word, Freq), y = Freq)) +
    geom_bar(stat = "identity", fill = "blue") +
    ggtitle(paste("Bigrams", "-", subTitle)) +
    xlab("Bigrams") + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  # Use RWeka to get trigram tokens (tokenize x, not smpl_Blogs, so every sample works)
  Tokenizer3 <- RWeka::NGramTokenizer(x,
      Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
  Gram.3 <- data.frame(table(Tokenizer3))
  Gram.3 <- Gram.3[order(Gram.3$Freq, decreasing = TRUE), ]
  colnames(Gram.3) <- c("Word", "Freq")
  Gram.3 <- head(Gram.3, N)
  g3 <- ggplot(Gram.3, aes(x = reorder(Word, Freq), y = Freq)) +
    geom_bar(stat = "identity", fill = "darkgreen") +
    ggtitle(paste("Trigrams", "-", subTitle)) +
    xlab("Trigrams") + ylab("Frequency") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  # Arrange the three plots in 1 row x 3 columns
  gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}
plot.Grams(x = smpl_Blogs, subTitle = "Blogs", N = 12)
plot.Grams(x = smpl_Twitter, subTitle = "Twitter", N = 12)
plot.Grams(x = smpl_News, subTitle = "News", N = 12)
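Looking ahead to the prediction algorithm itself, one simple starting point is to reshape trigram counts (a frequency table in the same form as Gram.3 inside plot.Grams) into a prefix-to-next-word lookup. The sketch below is illustrative only: build_lookup and predict_next_word are hypothetical helpers, not part of the project code, and a real model would still need smoothing and back-off to shorter n-grams.
# Hedged sketch: split each trigram into a two-word prefix and its continuation,
# then predict by choosing the most frequent continuation of a given prefix.
build_lookup <- function(trigram_df) {
  parts <- strsplit(as.character(trigram_df$Word), " ")
  data.frame(prefix   = sapply(parts, function(p) paste(p[1:2], collapse = " ")),
             nextWord = sapply(parts, function(p) p[3]),
             Freq     = trigram_df$Freq,
             stringsAsFactors = FALSE)
}
predict_next_word <- function(lookup, prefix) {
  hits <- lookup[lookup$prefix == prefix, ]
  if (nrow(hits) == 0) return(NA_character_)  # back-off to bigrams would go here
  hits$nextWord[which.max(hits$Freq)]
}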