The goal of this project is to present what we have discovered so far by exploring the data and to demonstrate that we are on track to create our prediction algorithm.

The motivation for this project is to:

  1. Demonstrate that we have downloaded the data and have successfully loaded it in.
  2. Create a basic report of summary statistics about the data sets.
  3. Report any interesting findings that we amassed so far.
  4. Get feedback on our plans for creating a prediction algorithm and Shiny app.
library(knitr, quietly = TRUE)
library(dplyr, quietly = TRUE)
library(doParallel, quietly = TRUE)
library(stringi, quietly = TRUE)
library(formattable, quietly = TRUE)
library(SnowballC, quietly = TRUE)
library(tm, quietly = TRUE)
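
If any of these packages are not yet installed, a one-time setup along these lines is assumed (the package list simply mirrors the library() and namespace calls used throughout this report):

# One-time install of the packages used in this report (assumed; adjust as needed)
install.packages(c("knitr", "dplyr", "doParallel", "stringi", "formattable",
                   "SnowballC", "tm", "plyr", "wordcloud", "slam",
                   "qdap", "rJava", "RWeka", "ggplot2", "gridExtra"))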

Basic details about the files and data

The dataset can be downloaded as a zipped file from the Coursera-SwiftKey URL used in the code below.

Read the files from the working directory and summarize properties

if(!file.exists('data/')) dir.create('data/')
lnk <- 'https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip'
destfile <- 'Coursera-SwiftKey.zip'
if(!file.exists(paste0('data/', destfile))) {
  download.file(lnk, destfile = paste0('data/', destfile))
}
if(!file.exists('data/final')) {
  ## Unzip the dataset into data/ (creates data/final/ with one folder per locale)
  unzip(paste0('data/', destfile), exdir = 'data')
}
## List the contents of the zip file
unzip(paste0('data/', destfile), list = TRUE)
# Since I'm using RStudio Cloud, I need to remove unused folders to free up some space

unlink("data/final/de_DE", recursive = TRUE)
unlink("data/final/fi_FI", recursive = TRUE)
unlink("data/final/ru_RU", recursive = TRUE)
# List the English files (list.files returns them alphabetically: blogs, news, twitter)
txt_files <- list.files("data/final/en_US")


# Read the 3 English datasets (blogs, news, twitter) in binary mode
for (i in 1:3) {
  conn <- file(paste0('data/final/en_US/', txt_files[i]), open = "rb")
  nam <- gsub("en_US.", '', gsub('.txt', '', txt_files[i]))
  assign(nam, readLines(conn, encoding = "UTF-8"))
  close(conn)
}

# Summarize basic info for each file/dataset
list_data <- list(blogs, news, twitter)
dataset_info_function <- function(i){
  fileName <- gsub('.txt', '', txt_files[i])
  fileSize <- paste(round(file.info(paste0('data/final/en_US/', txt_files[i]))$size / 1024^2, digits = 2), "MB")
  other_info <- sapply(list_data[i], stri_stats_general)
  WordCount <- sapply(list_data[i], stri_stats_latex)[4, ]

  df <- cbind.data.frame(fileName, fileSize, t(other_info), WordCount)
  df <- df %>% mutate(across(where(is.numeric), comma))
  return(df)
}

datasets_info <- plyr::ldply(1:3, dataset_info_function, .progress = "text")
knitr::kable(datasets_info)
fileName       fileSize        Lines     LinesNEmpty           Chars     CharsNWhite      WordCount
en_US.blogs    200.42 MB    899,288.00     899,288.00  206,824,382.00  170,389,539.00  37,570,839.00
en_US.news     196.28 MB  1,010,242.00   1,010,242.00  203,223,154.00  169,860,866.00  34,494,539.00
en_US.twitter  159.36 MB  2,360,148.00   2,360,148.00  162,096,031.00  134,082,634.00  30,451,128.00

Data Cleanup and Analysis

We’ll only analyze a small sample of the data because the original files are massive. The sample will be stripped of special characters, punctuation, numbers, and extra white space; English stop words will be removed and the remaining words stemmed. We’ll also remove profanity, using the bad-words list from Google (https://code.google.com/archive/p/badwordslist/downloads).
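
To put "small sample" in perspective: given the line counts in the summary table above, the 12,000-line samples drawn below cover roughly 1.3% of the blogs file, 1.2% of the news file, and 0.5% of the Twitter file. A quick check of that arithmetic:

# Sampling fraction (in percent) implied by 12,000 lines per source
# (line counts taken from the summary table above)
round(12000 / c(blogs = 899288, news = 1010242, twitter = 2360148) * 100, 2)
##   blogs    news twitter 
##    1.33    1.19    0.51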

set.seed(2980)
smpl_Blogs <- blogs[sample(1:length(blogs), 12000, replace=FALSE)]
smpl_Twitter <- twitter[sample(1:length(twitter), 12000, replace=FALSE)]
smpl_News <- news[sample(1:length(news), 12000, replace=FALSE)]

# Clean the sampled data
smpl_Blogs <- iconv(smpl_Blogs,"UTF-8", "ASCII", sub = "")
smpl_Twitter <- iconv(smpl_Twitter,"UTF-8", "ASCII", sub = "")
smpl_News <- iconv(smpl_News,"UTF-8", "ASCII", sub = "")
# Collect the cleaned samples in a list and drop the full datasets to free memory
smpl_data <- list(smpl_Blogs, smpl_Twitter, smpl_News)
rm(blogs, news, twitter, list_data)
corpus <- list()
dtMatrix <- list()
# Load the profanity list (457 entries) and define a transformer that removes each entry
profanity <- readLines("badwords.txt", n = 457)
removeProfanity <- content_transformer(function(x) {
  for (i in 1:length(profanity)) {
    # Drop "(" from the pattern so it is not treated as an unbalanced regex group
    a <- gsub("\\(", "", profanity[i])
    x <- gsub(a, "", x)
  }
  return(x)
})
# Build a corpus for each sample, clean it, and create a document-term matrix
for (i in 1:length(smpl_data)) {
  corpus[[i]] <- Corpus(VectorSource(smpl_data[[i]]))
  corpus[[i]] <- tm_map(corpus[[i]], content_transformer(tolower))
  corpus[[i]] <- tm_map(corpus[[i]], removePunctuation)
  corpus[[i]] <- tm_map(corpus[[i]], removeNumbers)
  corpus[[i]] <- tm_map(corpus[[i]], stripWhitespace)
  corpus[[i]] <- tm_map(corpus[[i]], removeProfanity)
  corpus[[i]] <- tm_map(corpus[[i]], removeWords, stopwords("english"))
  corpus[[i]] <- tm_map(corpus[[i]], stemDocument)
  dtMatrix[[i]] <- DocumentTermMatrix(corpus[[i]],
                                      control = list(wordLengths = c(0, Inf)))
}
rm(smpl_data)
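
As a quick sanity check before plotting, the most frequent terms in each document-term matrix can be listed directly. A minimal sketch for the blogs sample (dtMatrix[[1]]), reusing the slam::col_sums helper that also drives the word clouds below:

# Sketch: top 10 most frequent (stemmed) terms in the blogs sample
term_freq <- slam::col_sums(dtMatrix[[1]])
head(sort(term_freq, decreasing = TRUE), 10)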

Plot Sampled Corpus Data with Word Cloud

We can see what each corpus looks like using the wordcloud package. The code below plots a word cloud for each of the three US English samples (blogs, Twitter, and news) side by side; a combination of the three could be plotted in the same way.

library(wordcloud, quietly = TRUE)
library(slam, quietly = TRUE)
# Set random seed for reproducibility
set.seed(2980)
# Set Plotting in 1 row 3 columns
par(mfrow=c(1, 3))
Headings= c("Word Cloud - US English Blogs",
            "Word Cloud - US English Twitter",        
            "Word Cloud - US English News")
# Iterate each corpus and dtMatrix and plot word cloud (Max = 100)
for (i in 1:length(corpus)) {
    wordcloud(words = colnames(dtMatrix[[i]]), freq = slam::col_sums(dtMatrix[[i]]), 
        scale = c(3, 1), max.words = 100, random.order = FALSE, rot.per = 0.45, 
        use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))
    title(Headings[i])
}

Plots based on the cleaned data

We’ve made charts showing how often individual words and word combinations occur in each sample. Unigram, bigram, and trigram frequencies are displayed in this section.

library(dplyr, quietly = TRUE)
library(qdap, quietly = TRUE)
library(rJava, quietly = TRUE)  # .jinit(parameters = "-Xmx128g")
library(RWeka, quietly = TRUE)
library(ggplot2, quietly = TRUE)
# Define a function to make Unigram, Bigram and Trigram from the corpus
# And then Plot them together with ggplot2 and gridExtra packages
plot.Grams <- function (x=smpl_Blogs, subTitle="Blogs", N=10) {
    # Use RWeka to get unigram token
    Tokenizer1 <- RWeka::NGramTokenizer(x, Weka_control(min = 1, max = 1))
    Gram.1 <- data.frame(table(Tokenizer1))
    Gram.1 <- Gram.1[order(Gram.1$Freq, decreasing = TRUE),]
    colnames(Gram.1) <- c("Word", "Freq")
    Gram.1 <- head(Gram.1, N) 
    g1 <- ggplot(Gram.1, aes(x=reorder(Word, Freq),y=Freq)) + 
            geom_bar(stat="identity", fill="green") + 
            ggtitle(paste("Unigrams", "-", subTitle)) + 
            xlab("Unigrams") + ylab("Frequency") + 
            theme(axis.text.x=element_text(angle=90, hjust=1))
    # Use RWeka to get bigram token
    Tokenizer2 <- RWeka::NGramTokenizer(x, 
        Weka_control(min = 2, max = 2, delimiters = " \\r\\n\\t.,;:\"()?!"))
    Gram.2 <- data.frame(table(Tokenizer2))
    Gram.2 <- Gram.2[order(Gram.2$Freq, decreasing = TRUE),]
    colnames(Gram.2) <- c("Word", "Freq")
    Gram.2 <- head(Gram.2, N) 
    g2 <- ggplot(Gram.2, aes(x=reorder(Word, Freq),y=Freq)) + 
            geom_bar(stat="identity", fill="blue") + 
            ggtitle(paste("Bigrams", "-", subTitle)) + 
            xlab("Bigrams") + ylab("Frequency") + 
            theme(axis.text.x=element_text(angle=90, hjust=1))
    
    # Use RWeka to get trigram token (tokenize x, not smpl_Blogs)
    Tokenizer3 <- RWeka::NGramTokenizer(x, 
        Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
    Gram.3 <- data.frame(table(Tokenizer3))
    Gram.3 <- Gram.3[order(Gram.3$Freq, decreasing = TRUE),]
    colnames(Gram.3) <- c("Word", "Freq")
    Gram.3 <- head(Gram.3, N) 
    g3 <- ggplot(Gram.3, aes(x=reorder(Word, Freq),y=Freq)) + 
            geom_bar(stat="identity", fill="darkgreen") + 
            ggtitle(paste("Trigrams", "-", subTitle)) + 
            xlab("Trigrams") + ylab("Frequency") + 
            theme(axis.text.x=element_text(angle=90, hjust=1))
    
    # Put three plots into 1 row 3 columns
    gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}
plot.Grams(x = smpl_Blogs, subTitle = "Blogs", N = 12)

plot.Grams(x = smpl_Twitter, subTitle = "Twitter", N = 12)

plot.Grams(x = smpl_News, subTitle = "News", N = 12)
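
Looking ahead to the prediction algorithm mentioned in the motivation, the same trigram counts can be reshaped into a simple (prefix, prediction) lookup table. The sketch below is illustrative only, assuming the blogs sample and the tokenizer settings used above; the column names and the two-word-prefix split are our own choices, not a final design:

# Hypothetical next step: trigram counts as a (prefix -> next word) lookup table
trigrams <- RWeka::NGramTokenizer(smpl_Blogs,
    Weka_control(min = 3, max = 3, delimiters = " \\r\\n\\t.,;:\"()?!"))
trigram_freq <- as.data.frame(table(trigrams), stringsAsFactors = FALSE)
colnames(trigram_freq) <- c("trigram", "freq")
trigram_freq$prefix <- sub("\\s+\\S+$", "", trigram_freq$trigram)      # first two words
trigram_freq$prediction <- sub("^.*\\s+", "", trigram_freq$trigram)    # last word
# For a given two-word prefix, the most frequent completion is the candidate prediction
head(trigram_freq[order(-trigram_freq$freq), c("prefix", "prediction", "freq")], 10)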