library(knitr)
library(stringi)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.5
library(ggplot2)
library(gridExtra)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(RWeka)
## Warning: package 'RWeka' was built under R version 4.0.5
library(SnowballC)
path <- getwd()
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(url, file.path(path, "Coursera-SwiftKey.zip"))
unzip("Coursera-SwiftKey.zip")
file_blogs <- file("./final/en_US/en_US.blogs.txt", "rb")
blogs <- readLines(file_blogs, encoding = "UTF-8")
file_news <- file("./final/en_US/en_US.news.txt", "rb")
news <- readLines(file_news, encoding = "UTF-8")
file_twitter <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(file_twitter, encoding = "UTF-8")
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 167155 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 268547 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 1274086 contains
## a null character
## Warning in readLines(file_twitter, encoding = "UTF-8"): line 1759032 contains
## a null character
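These warnings flag embedded null characters in the Twitter file; readLines() drops them and continues, so they are harmless here. If you prefer a clean run, readLines() has a built-in skipNul argument. A minimal sketch of that alternative (not what this report ran):

```r
# Read the Twitter file, silently skipping embedded null characters
file_twitter <- file("./final/en_US/en_US.twitter.txt", "rb")
twitter <- readLines(file_twitter, encoding = "UTF-8", skipNul = TRUE)
close(file_twitter)
```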
MB <- 1024^2
# file size in MB
fileSizeMB <- round(file.info(c("final/en_US/en_US.blogs.txt",
"final/en_US/en_US.news.txt",
"final/en_US/en_US.twitter.txt"))$size / MB)
# num lines per file
number_of_lines <- sapply(list(blogs, news, twitter), length)
# num characters per file
number_of_chars <- sapply(list(nchar(blogs), nchar(news), nchar(twitter)), sum)
# num words per file
number_of_words <- sapply(list(blogs, news, twitter), stri_stats_latex)[4,]
# words per line
words_per_line <- lapply(list(blogs, news, twitter), function(x) stri_count_words(x))
# words per line summary
words_per_line_summary = sapply(list(blogs, news, twitter),
function(x) summary(stri_count_words(x))[c('Min.', 'Mean', 'Max.')])
rownames(words_per_line_summary) <- c("Min words/line", "Mean words/line", "Max words/line")
summary <- data.frame(
File = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
FileSize = paste(fileSizeMB, "MB"),
Lines = number_of_lines,
Characters = number_of_chars,
Words = number_of_words,
t(round(words_per_line_summary)),
check.names = FALSE
)
kable(summary,
row.names = FALSE,
align = c("l", rep("r", 7)),
caption = "") %>% kable_styling(position = "left")
| File | FileSize | Lines | Characters | Words | Min words/line | Mean words/line | Max words/line |
|:---|---:|---:|---:|---:|---:|---:|---:|
| en_US.blogs.txt | 200 MB | 899288 | 206824505 | 37570839 | 0 | 42 | 6726 |
| en_US.news.txt | 196 MB | 1010242 | 203223159 | 34494539 | 1 | 34 | 1796 |
| en_US.twitter.txt | 159 MB | 2360148 | 162096031 | 30451128 | 1 | 13 | 47 |
# Histograms of words per line, one per data set
plot1 <- qplot(words_per_line[[1]],
geom = "histogram",
main = "US Blogs",
xlab = "Words per Line",
ylab = "Frequency",
binwidth = 5)
plot2 <- qplot(words_per_line[[2]],
geom = "histogram",
main = "US News",
xlab = "Words per Line",
ylab = "Frequency",
binwidth = 5)
plot3 <- qplot(words_per_line[[3]],
geom = "histogram",
main = "US Twitter",
xlab = "Words per Line",
ylab = "Frequency",
binwidth = 1)
plotList = list(plot1, plot2, plot3)
do.call(grid.arrange, c(plotList, list(ncol = 1)))
# Choose the desired sample size (1% of each file)
sampleSize <- 0.01
set.seed(123)
# Creation of the samples
sample_blogs <- sample(blogs, length(blogs) * sampleSize, replace = FALSE)
sample_news <- sample(news, length(news) * sampleSize, replace = FALSE)
sample_twitter <- sample(twitter, length(twitter) * sampleSize, replace = FALSE)
# Remove non-ASCII characters from the samples (anything without an ASCII equivalent is dropped)
sample_blogs <- iconv(sample_blogs, "latin1", "ASCII", sub = "")
sample_news <- iconv(sample_news, "latin1", "ASCII", sub = "")
sample_twitter <- iconv(sample_twitter, "latin1", "ASCII", sub = "")
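As a toy illustration (not part of the pipeline) of what this conversion does: with sub = "", iconv() simply deletes every character that has no ASCII equivalent.

```r
# Accented characters are dropped; plain ASCII passes through unchanged
iconv("café déjà vu", "latin1", "ASCII", sub = "")
## [1] "caf dj vu"
```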
# Combine all three samples into a single data set and write it to disk
# (named sample_data to avoid masking base::sample)
sample_data <- c(sample_blogs, sample_news, sample_twitter)
sample_file_name <- "final/en_US/en_US.sample.txt"
con <- file(sample_file_name, open = "w")
writeLines(sample_data, con)
close(con)
# get number of lines and words from the sample data set
sample_data_lines <- length(sample_data)
sample_data_words <- sum(stri_count_words(sample_data))
path <- getwd()
bad_words_url <- "https://raw.githubusercontent.com/RobertJGabriel/Google-profanity-words/master/list.txt"
download.file(bad_words_url, file.path(path, "google_bad_words.txt"))
# Read the profanity list (one word per line, no header row)
bad_words <- read.delim("google_bad_words.txt", header = FALSE)[, 1]
# Reuse the combined sample created above
sample_corpus <- sample_data
corpus <- Corpus(VectorSource(list(sample_corpus)))
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus <- tm_map(corpus, removeNumbers)
## Warning in tm_map.SimpleCorpus(corpus, removeNumbers): transformation drops
## documents
corpus <- tm_map(corpus, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, stopwords("english")):
## transformation drops documents
corpus <- tm_map(corpus, removeWords, bad_words)
## Warning in tm_map.SimpleCorpus(corpus, removeWords, bad_words):
## transformation drops documents
corpus <- tm_map(corpus, stripWhitespace)
## Warning in tm_map.SimpleCorpus(corpus, stripWhitespace): transformation drops
## documents
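The repeated "transformation drops documents" messages are a known quirk of tm's SimpleCorpus backend rather than a sign of data loss; the cleaned text is intact. If the noise is unwanted, each call can be wrapped in suppressWarnings(), for example:

```r
# Same transformation as above, with the benign SimpleCorpus warning silenced
corpus <- suppressWarnings(tm_map(corpus, stripWhitespace))
```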
writeCorpus(corpus, filenames="corpus.txt")
corpus <- readLines("corpus.txt")
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
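A quick sanity check on a toy string (not from the corpus) shows what these tokenizers return; the bigram tokenizer, for instance, yields every adjacent word pair:

```r
bigramTokenizer("the quick brown fox")
## [1] "the quick"   "quick brown" "brown fox"
```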
wordlist <- unigramTokenizer(corpus)
freq_table <- table(wordlist)
unigram.df <- data.frame(word = names(freq_table), freq = as.numeric(freq_table))
unigram.df <- unigram.df[order(-unigram.df$freq),]
row.names(unigram.df) <- NULL
save(unigram.df, file="unigram.Rda")
ggplot(head(unigram.df,20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="Identity", fill="lightblue") +
ggtitle("Unigram - word frequency") +
ylab("Frequency") +
xlab("Word")+ theme(axis.text.x = element_text(angle = 45, hjust = 1))
The unigram frequencies show that the most common word in the sample is "will", followed by "said" and "just".
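This is easy to check directly against the frequency table (exact counts depend on the 1% sample drawn above):

```r
# Top three unigrams in the sample
head(unigram.df, 3)
```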
wordlist <- bigramTokenizer(corpus)
freq_table <- table(wordlist)
bigram.df <- data.frame(word = names(freq_table), freq = as.numeric(freq_table))
bigram.df <- bigram.df[order(-bigram.df$freq),]
row.names(bigram.df) <- NULL
save(bigram.df, file="bigram.Rda")
ggplot(head(bigram.df, 20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="Identity", fill="lightgreen") +
ggtitle("Bigrams frequency") +
ylab("Frequency") +
xlab("Term") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
wordlist <- trigramTokenizer(corpus)
freq_table <- table(wordlist)
trigram.df <- data.frame(word = names(freq_table), freq = as.numeric(freq_table))
trigram.df <- trigram.df[order(-trigram.df$freq),]
row.names(trigram.df) <- NULL
save(trigram.df, file="trigram.Rda")
ggplot(head(trigram.df,20), aes(x=reorder(word,-freq), y=freq)) +
geom_bar(stat="Identity", fill="lightsalmon2") +
ggtitle("Trigrams frequency") +
ylab("Frequency") +
xlab("Term") + theme(axis.text.x = element_text(angle = 45, hjust = 1))
Finally, the trigram frequencies show that the most common three-word sequence is "can't wait see", followed by "happy mother's day" and "new york city".
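These trigram counts are the raw material for next-word prediction. As a rough sketch only (predict_next is a hypothetical helper, not the final model), a naive lookup treats the first two words as a prefix and returns the most frequent completion. Note that after punctuation removal the stored form of the top trigram is "cant wait see":

```r
# Naive next-word lookup from the trigram table (illustrative sketch only).
# Assumes trigram.df has columns word ("w1 w2 w3") and freq, sorted by freq.
predict_next <- function(prefix, df = trigram.df) {
  hits <- df[startsWith(df$word, paste0(tolower(prefix), " ")), ]
  if (nrow(hits) == 0) return(NA_character_)
  tail(strsplit(hits$word[1], " ")[[1]], 1)  # last word of the top match
}
predict_next("cant wait")  # expected: "see"
```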