The goal of this project is to demonstrate that you have become familiar with the data and that you are on track to create your prediction algorithm.

Please submit a report on R Pubs (http://rpubs.com/) that explains your exploratory analysis and your goals for the eventual app and algorithm.

This document should be concise and explain only the major features of the data you have identified and briefly summarize your plans for creating the prediction algorithm and Shiny app in a way that would be understandable to a non-data scientist manager.

You should make use of tables and plots to illustrate important summaries of the data set. The motivation for this project is to:

1. Demonstrate that you’ve downloaded the data and have successfully loaded it in.

2. Create a basic report of summary statistics about the data sets.

3. Report any interesting findings that you amassed so far.

4. Get feedback on your plans for creating a prediction algorithm and Shiny app.

Review criteria

1. Has the data scientist done basic summaries of the three files? Word counts, line counts and basic data tables?

2. Has the data scientist made basic plots, such as histograms, to illustrate features of the data?

3. Was the report written in a brief, concise style, in a way that a non-data scientist manager could appreciate?

Load the necessary packages

library(knitr)
library(stringi)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.5
library(ggplot2)
library(gridExtra)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.5
library(SnowballC)
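
The package start-up warnings and messages above are harmless; if desired, they could also be suppressed globally in the knitted report with a setup chunk, e.g.:

# Optional (sketch): hide package messages/warnings in the knitted output
knitr::opts_chunk$set(message = FALSE, warning = FALSE)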

Download the datasets

We first download the dataset archive to the working directory and unzip it.

path <- getwd()
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

# Download the SwiftKey archive and extract it into ./final/
download.file(url, file.path(path, "Coursera-SwiftKey.zip"))
unzip("Coursera-SwiftKey.zip")
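
Since the archive is large, re-knitting the report is faster if the download and extraction are skipped when the files are already on disk; a minimal guard, assuming the same file and folder names as above:

# Only download/extract if the archive or the extracted folder is missing
if (!file.exists(file.path(path, "Coursera-SwiftKey.zip"))) {
  download.file(url, file.path(path, "Coursera-SwiftKey.zip"))
}
if (!dir.exists("final")) {
  unzip("Coursera-SwiftKey.zip")
}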

Load the datasets

Let’s now load the three datasets into the environment.

# Open each file in binary mode and read it as UTF-8
file_blogs <- file("./final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(file_blogs, encoding = "UTF-8")
close(file_blogs)

file_news <- file("./final/en_US/en_US.news.txt", "rb")
news <- readLines(file_news, encoding = "UTF-8")
close(file_news)

file_twitter <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(file_twitter, encoding = "UTF-8")
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 167155 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 268547 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 1274086
## contains a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 1759032
## contains a null character
close(file_twitter)
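
The warnings above come from embedded null characters in the Twitter file and are harmless here; alternatively, readLines() can be told to drop them silently with skipNul = TRUE, e.g.:

# Alternative (sketch): silently skip embedded nulls while reading
file_twitter <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(file_twitter, encoding = "UTF-8", skipNul = TRUE)
close(file_twitter)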

Summary

We now build a table summarizing the most important characteristics of the three datasets.

MB = 1024^2

# file size
fileSizeMB <- round(file.info(c("final/en_US/en_US.blogs.txt", "final/en_US/en_US.news.txt", "final/en_US/en_US.twitter.txt"))$size / MB)
 
# num lines per file
number_of_lines <- sapply(list(blogs, news, twitter), length)

# num characters per file
number_of_chars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)

# num words per file
number_of_words <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]

# words per line
words_per_line <- lapply(list(blogs, news, twitter), function(x) stri_count_words(x))

# words per line summary
words_per_line_summary = sapply(list(blogs, news, twitter),
             function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(words_per_line_summary) = c("Min words/line", "Mean words/line", "Max words/line")

summary <- data.frame(
    File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
    FileSize = paste(fileSizeMB, "MB"),
    Lines = number_of_lines,
    Characters = number_of_chars,
    Words = number_of_words,
    t(round(words_per_line_summary)),
    check.names = FALSE
)

kable(summary,
      row.names = FALSE,
      align = c("l", rep("r", 7)),
      caption = "") %>% kable_styling(position = "left")
| File              | FileSize | Lines   | Characters | Words    | Min words/line | Mean words/line | Max words/line |
|:------------------|---------:|--------:|-----------:|---------:|---------------:|----------------:|---------------:|
| en_US.blogs.txt   | 200 MB   |  899288 |  206824505 | 37570839 |              0 |              42 |           6726 |
| en_US.news.txt    | 196 MB   | 1010242 |  203223159 | 34494539 |              1 |              34 |           1796 |
| en_US.twitter.txt | 159 MB   | 2360148 |  162096031 | 30451128 |              1 |              13 |             47 |

Plot the data

We now plot the distribution of words per line in the three datasets.

# Plot of the blogs data
plot1 <- qplot(words_per_line[[1]],
               geom = "histogram",
               main = "US Blogs",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)

# Plot of the news data
plot2 <- qplot(words_per_line[[2]],
               geom = "histogram",
               main = "US News",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 5)

# Plot of the twitter data
plot3 <- qplot(words_per_line[[3]],
               geom = "histogram",
               main = "US Twitter",
               xlab = "Words per Line",
               ylab = "Frequency",
               binwidth = 1)

plotList = list(plot1, plot2, plot3)
do.call(grid.arrange, c(plotList, list(ncol = 1)))
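
The blog and news distributions have very long right tails (maxima of 6726 and 1796 words per line), so restricting the x-axis makes the bulk of the distribution easier to see; a minimal sketch reusing the plots above (the 250-word cut-off is arbitrary):

# Zoom in on the bulk of the distributions (sketch)
grid.arrange(plot1 + coord_cartesian(xlim = c(0, 250)),
             plot2 + coord_cartesian(xlim = c(0, 250)),
             plot3,
             ncol = 1)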

Prepare the data

To keep processing times manageable, we now work with a 1% random sample of each dataset, which we then clean.

# Choose the desired sample size
sampleSize = 0.01

set.seed(123)

# Creation of the samples
sample_blogs <- sample(blogs, length(blogs) * sampleSize, replace = FALSE)
sample_news <- sample(news, length(news) * sampleSize, replace = FALSE)
sample_twitter <- sample(twitter, length(twitter) * sampleSize, replace = FALSE)

# Remove non-ASCII (non-English) characters from the samples
sample_blogs <- iconv(sample_blogs, "latin1", "ASCII", sub = "")
sample_news <- iconv(sample_news, "latin1", "ASCII", sub = "")
sample_twitter <- iconv(sample_twitter, "latin1", "ASCII", sub = "")

# combine all three data sets into a single data set and write to disk
sample <- c(sample_blogs, sample_news, sample_twitter)
sample_file_name <- "final/en_US/en_US.sample.txt"
con <- file(sample_file_name, open = "w")
writeLines(sample, con)
close(con)

# get number of lines and words from the sample data set
sample_data_lines <- length(sample)
sample_data_words <- sum(stri_count_words(sample))
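
For reference, the size of the combined sample can be reported directly:

# Report the size of the combined 1% sample
cat("Sample size:", sample_data_lines, "lines and", sample_data_words, "words")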

Build the corpus

path <- getwd()
bad_words_url <- "https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt"

download.file(bad_words_url, file.path(path, "google_bad_words.txt"))

# Read the profanity list (one word per line) so it can be filtered out of the corpus
google_bad_words <- read.delim("google_bad_words.txt", header = FALSE)[, 1]

# Build a tm corpus from the combined sample
sample_corpus <- c(sample_blogs, sample_news, sample_twitter)
corpus <- Corpus(VectorSource(list(sample_corpus)))

corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, google_bad_words)
## Warning in tm_map.SimpleCorpus(corpus, removeWords, google_bad_words):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
# Flatten the cleaned corpus back to a plain character vector for the RWeka tokenizers
writeCorpus(corpus, filenames = "corpus.txt")
corpus <- readLines("corpus.txt")

N-gram analysis

unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
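
As a quick illustration, each tokenizer splits a character vector into consecutive word groups of the chosen length; for example (a sketch on a made-up string):

# Example (sketch): consecutive two-word combinations of a short string
bigramTokenizer("thanks for the follow")   # expected: "thanks for" "for the" "the follow"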

wordlist <- unigramTokenizer(corpus)
word_freq <- table(unlist(wordlist))
unigram.df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
unigram.df <- unigram.df[order(-unigram.df$freq), ]
row.names(unigram.df) <- NULL
save(unigram.df, file = "unigram.Rda")

ggplot(head(unigram.df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightblue") +
  ggtitle("Unigram - word frequency") +
  ylab("Frequency") +
  xlab("Word") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The unigram analysis shows that the most frequent word (after stop-word removal) is ‘will’, followed by ‘said’ and ‘just’.

wordlist <- bigramTokenizer(corpus)
word_freq <- table(unlist(wordlist))
bigram.df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
bigram.df <- bigram.df[order(-bigram.df$freq), ]
row.names(bigram.df) <- NULL
save(bigram.df, file = "bigram.Rda")

ggplot(head(bigram.df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightgreen") +
  ggtitle("Bigrams frequency") +
  ylab("Frequency") +
  xlab("Term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

The bigram analysis shows that the most frequent two-word combination is ‘right now’, followed by ‘can’t wait’ and ‘don’t know’.

wordlist <- trigramTokenizer(corpus)
word_freq <- table(unlist(wordlist))
trigram.df <- data.frame(word = names(word_freq), freq = as.numeric(word_freq))
trigram.df <- trigram.df[order(-trigram.df$freq), ]
row.names(trigram.df) <- NULL
save(trigram.df, file = "trigram.Rda")

ggplot(head(trigram.df, 20), aes(x = reorder(word, -freq), y = freq)) +
  geom_col(fill = "lightsalmon2") +
  ggtitle("Trigrams frequency") +
  ylab("Frequency") +
  xlab("Term") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Finally, the trigram analysis shows that the most frequent three-word combination is ‘can’t wait see’, followed by ‘happy mother’s day’ and ‘new york city’.
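
These frequency tables are the starting point for the planned prediction algorithm: given the last words typed, look up the most frequent trigram or bigram that starts with them, and fall back to the overall most frequent words when nothing matches. A minimal sketch of that backoff idea (the function predict_next_word is hypothetical, not the final implementation):

# Sketch of a simple backoff lookup over the n-gram tables built above
predict_next_word <- function(phrase, n = 3) {
  phrase <- gsub("[[:punct:]]", "", tolower(phrase))      # mirror the corpus cleaning
  words  <- tail(unlist(strsplit(phrase, "\\s+")), 2)     # keep the last two words
  # try trigrams that start with the last two words, then bigrams with the last word
  hits <- trigram.df[grepl(paste0("^", paste(words, collapse = " "), " "), trigram.df$word), ]
  if (nrow(hits) == 0) {
    hits <- bigram.df[grepl(paste0("^", tail(words, 1), " "), bigram.df$word), ]
  }
  if (nrow(hits) == 0) return(head(unigram.df$word, n))   # fall back to top unigrams
  # return the final word of the most frequent matching n-grams
  sapply(strsplit(head(hits$word, n), " "), tail, 1)
}

predict_next_word("cant wait")

The final Shiny app will need a more refined model (for example smoothing and a compact storage format for the n-gram tables), but this illustrates how the counts computed above feed the word prediction.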