The goal of this project is to demonstrate that you have become familiar with the data and that you are on track to create your prediction algorithm.

Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm.

This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager.

You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.

2. Create a basic report of summary statistics about the data sets.

3. Report any interesting findings that you amassed so far.

4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Review criteria

1. Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?

2. Has the data scientist made basic plots, such as histograms, to illustrate features of the data?

3. Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

Load the necessary packages

library(knitr)
library(stringi)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.5
library(ggplot2)
library(gridExtra)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.5
library(SnowballC)
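
The package start-up warnings and messages above are harmless; if desired, they could also be suppressed globally in the knitted report with a setup chunk, e.g.:

# Optional (sketch): hide package messages/warnings in the knitted output
knitr::opts_chunk$set(message = FALSE, warning = FALSE)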

Download the datasets

We first download the dataset archive to the working directory and unzip it.

path <- getwd()
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Download the SwiftKey archive and extract it into ./final/
download.file(url, file.path(path, "Coursera-SwiftKey.zip"))
unzip("Coursera-SwiftKey.zip")
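
Since the archive is large, re-knitting the report is faster if the download and extraction are skipped when the files are already on disk; a minimal guard, assuming the same file and folder names as above:

# Only download/extract if the archive or the extracted folder is missing
if (!file.exists(file.path(path, "Coursera-SwiftKey.zip"))) {
  download.file(url, file.path(path, "Coursera-SwiftKey.zip"))
}
if (!dir.exists("final")) {
  unzip("Coursera-SwiftKey.zip")
}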

Load the datasets

Let’s now load the three datasets into the environment.

# Open each file in binary mode and read it as UTF-8
file_blogs <- file("./final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(file_blogs, encoding = "UTF-8")
close(file_blogs)

file_news <- file("./final/en_US/en_US.news.txt", "rb")
news <- readLines(file_news, encoding = "UTF-8")
close(file_news)

file_twitter <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(file_twitter, encoding = "UTF-8")
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 167155 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 268547 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 1274086
## contains a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 1759032
## contains a null character
close(file_twitter)
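
The warnings above come from embedded null characters in the Twitter file and are harmless here; alternatively, readLines() can be told to drop them silently with skipNul = TRUE, e.g.:

# Alternative (sketch): silently skip embedded nulls while reading
file_twitter <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(file_twitter, encoding = "UTF-8", skipNul = TRUE)
close(file_twitter)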

Summary

We now build a table summarizing the most important characteristics of the three datasets.

MB = 1024^2

# file size
fileSizeMB <- round(file.info(c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt"))$size / MB)
 
# num lines per file
number_of_lines <- sapply(list(blogs, news, twitter), length)

# num characters per file
number_of_chars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)

# num words per file
number_of_words <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]

# words per line
words_per_line <- lapply(list(blogs, news, twitter), function(x) stri_count_words(x))

# words per line summary
words_per_line_summary = sapply(list(blogs, news, twitter),
             function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(words_per_line_summary) = c("Min words/line", "Mean words/line", "Max words/line")

summary <- data.frame(
    File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
    FileSize = paste(fileSizeMB, "MB"),
    Lines = number_of_lines,
    Characters = number_of_chars,
    Words = number_of_words,
    t(round(words_per_line_summary)),
    check.names = FALSE
)

kable(summary,
      row.names = FALSE,
      align = c("l", rep("r", 7)),
      caption = "") %>% kable_styling(position = "left")
| File              | FileSize | Lines   | Characters | Words    | Min words/line | Mean words/line | Max words/line |
|:------------------|---------:|--------:|-----------:|---------:|---------------:|----------------:|---------------:|
| en_US.blogs.txt   | 200 MB   |  899288 |  206824505 | 37570839 |              0 |              42 |           6726 |
| en_US.news.txt    | 196 MB   | 1010242 |  203223159 | 34494539 |              1 |              34 |           1796 |
| en_US.twitter.txt | 159 MB   | 2360148 |  162096031 | 30451128 |              1 |              13 |             47 |

Plot the data

We now plot the distribution of words per line in the three datasets.

# Plot of the blogs data
plot1 <- qplot(words_per_line[[1]],
               geom = "histogram",
               main = "US Blogs",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)

# Plot of the news data
plot2 <- qplot(words_per_line[[2]],
               geom = "histogram",
               main = "US News",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)

# Plot of the twitter data
plot3 <- qplot(words_per_line[[3]],
               geom = "histogram",
               main = "US Twitter",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 1)

plotList = list(plot1, plot2, plot3)
do.call(grid.arrange, c(plotList, list(ncol = 1)))
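
The blog and news distributions have very long right tails (maxima of 6726 and 1796 words per line), so restricting the x-axis makes the bulk of the distribution easier to see; a minimal sketch reusing the plots above (the 250-word cut-off is arbitrary):

# Zoom in on the bulk of the distributions (sketch)
grid.arrange(plot1 + coord_cartesian(xlim = c(0, 250)),
             plot2 + coord_cartesian(xlim = c(0, 250)),
             plot3,
             ncol = 1)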

Prepare the data

To keep processing times manageable, we now work with a 1% random sample of each dataset, which we then clean.

# Choose the desired sample size
sampleSize = 0.01

set.seed(123)

# Creation of the samples
sample_blogs <- sample(blogs, length(blogs) * sampleSize, replace = FALSE)
sample_news <- sample(news, length(news) * sampleSize, replace = FALSE)
sample_twitter <- sample(twitter, length(twitter) * sampleSize, replace = FALSE)

# Remove non-ASCII (non-English) characters from the samples
sample_blogs <- iconv(sample_blogs, "latin1", "ASCII", sub = "")
sample_news <- iconv(sample_news, "latin1", "ASCII", sub = "")
sample_twitter <- iconv(sample_twitter, "latin1", "ASCII", sub = "")

# combine all three data sets into a single data set and write to disk
sample <- c(sample_blogs, sample_news, sample_twitter)
sample_file_name <- "final/en_US/en_US.sample.txt"
con <- file(sample_file_name, open = "w")
writeLines(sample, con)
close(con)

# get number of lines and words from the sample data set
sample_data_lines <- length(sample)
sample_data_words <- sum(stri_count_words(sample))
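
For reference, the size of the combined sample can be reported directly:

# Report the size of the combined 1% sample
cat("Sample size:", sample_data_lines, "lines and", sample_data_words, "words")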

Build the corpus

path <- getwd()
bad_words_url <- "https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt"

download.file(bad_words_url, file.path(path, "google_bad_words.txt"))

# Read the profanity list (one word per line) so it can be filtered out of the corpus
google_bad_words <- read.delim("google_bad_words.txt", header = FALSE)[, 1]

# Build a tm corpus from the combined sample
sample_corpus <- c(sample_blogs, sample_news, sample_twitter)
corpus <- Corpus(VectorSource(list(sample_corpus)))

corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, google_bad_words)
## Warning in tm_map.SimpleCorpus(corpus, removeWords, google_bad_words):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Flatten the cleaned corpus back to a plain character vector for the RWeka tokenizers
writeCorpus(corpus, filenames = "corpus.txt")
corpus <- readLines("corpus.txt")

N-gram analysis

unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
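
As a quick illustration, each tokenizer splits a character vector into consecutive word groups of the chosen length; for example (a sketch on a made-up string):

# Example (sketch): consecutive two-word combinations of a short string
bigramTokenizer("thanks for the follow")   # expected: "thanks for" "for the" "the follow"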

wordlist <- unigramTokenizer(corpus)
word_freq <- table(unlist(wordlist))
unigram.df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
unigram.df <- unigram.df[order(-unigram.df$freq), ]
row.names(unigram.df) <- NULL
save(unigram.df, file = "unigram.Rda")

ggplot(head(unigram.df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightblue") +
  ggtitle("Unigram - word frequency") +
  ylab("Frequency") +
  xlab("Word") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The unigram analysis shows that the most frequent word (after stop-word removal) is ‘will’, followed by ‘said’ and ‘just’.

wordlist <- bigramTokenizer(corpus)
word_freq <- table(unlist(wordlist))
bigram.df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
bigram.df <- bigram.df[order(-bigram.df$freq), ]
row.names(bigram.df) <- NULL
save(bigram.df, file = "bigram.Rda")

ggplot(head(bigram.df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightgreen") +
  ggtitle("Bigrams frequency") +
  ylab("Frequency") +
  xlab("Term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The bigram analysis shows that the most frequent two-word combination is ‘right now’, followed by ‘can’t wait’ and ‘don’t know’.

wordlist <- trigramTokenizer(corpus)
word_freq <- table(unlist(wordlist))
trigram.df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
trigram.df <- trigram.df[order(-trigram.df$freq), ]
row.names(trigram.df) <- NULL
save(trigram.df, file = "trigram.Rda")

ggplot(head(trigram.df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightsalmon2") +
  ggtitle("Trigrams frequency") +
  ylab("Frequency") +
  xlab("Term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Finally, the trigram analysis shows that the most frequent three-word combination is ‘can’t wait see’, followed by ‘happy mother’s day’ and ‘new york city’.
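
These frequency tables are the starting point for the planned prediction algorithm: given the last words typed, look up the most frequent trigram or bigram that starts with them, and fall back to the overall most frequent words when nothing matches. A minimal sketch of that backoff idea (the function predict_next_word is hypothetical, not the final implementation):

# Sketch of a simple backoff lookup over the n-gram tables built above
predict_next_word <- function(phrase, n = 3) {
  phrase <- gsub("[[:punct:]]", "", tolower(phrase))      # mirror the corpus cleaning
  words  <- tail(unlist(strsplit(phrase, "\\s+")), 2)     # keep the last two words
  # try trigrams that start with the last two words, then bigrams with the last word
  hits <- trigram.df[grepl(paste0("^", paste(words, collapse = " "), " "), trigram.df$word), ]
  if (nrow(hits) == 0) {
    hits <- bigram.df[grepl(paste0("^", tail(words, 1), " "), bigram.df$word), ]
  }
  if (nrow(hits) == 0) return(head(unigram.df$word, n))   # fall back to top unigrams
  # return the final word of the most frequent matching n-grams
  sapply(strsplit(head(hits$word, n), " "), tail, 1)
}

predict_next_word("cant wait")

The final Shiny app will need a more refined model (for example smoothing and a compact storage format for the n-gram tables), but this illustrates how the counts computed above feed the word prediction.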