The goal of this project is to show what I have learnt in this program and the steps taken to create my prediction algorithm. This report covers the exploratory analysis and the goals for the eventual app and algorithm. Specifically, it aims to:
1. Demonstrate that the data was downloaded and successfully loaded.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings amassed so far.
4. Get feedback on my plans for creating a prediction algorithm and Shiny app.
URL <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
download.file(URL, "./Capstone Dataset.zip")
unzip("./Capstone Dataset.zip", exdir = "./Capstone Project")
Downloading the data into the working directory is straightforward, just as in the previous projects in this program.
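To avoid re-downloading on every run, the call could be wrapped in a simple check (a minimal sketch; the paths are the same as above):
if (!file.exists("./Capstone Dataset.zip")) {   ###Only download and unzip once
  download.file(URL, "./Capstone Dataset.zip")
  unzip("./Capstone Dataset.zip", exdir = "./Capstone Project")
}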
Blog_file <- file("./Capstone Project/final/en_US/en_US.blogs.txt")
Twitter_file <- file("./Capstone Project/final/en_US/en_US.twitter.txt")
News_file <- file("./Capstone Project/final/en_US/en_US.news.txt")
Blog <- readLines(Blog_file)
Twitter <- readLines(Twitter_file)
## Warning in readLines(Twitter_file): line 167155 appears to contain an embedded
## nul
## Warning in readLines(Twitter_file): line 268547 appears to contain an embedded
## nul
## Warning in readLines(Twitter_file): line 1274086 appears to contain an embedded
## nul
## Warning in readLines(Twitter_file): line 1759032 appears to contain an embedded
## nul
News <- readLines(News_file) ###Use default settings
## Warning in readLines(News_file): incomplete final line found on './Capstone
## Project/final/en_US/en_US.news.txt'
close(Blog_file)
close(Twitter_file)
close(News_file)
To suppress these warnings you can set warn = FALSE in readLines(); they are harmless, merely flagging embedded nul characters and a missing final newline.
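For example, the Twitter file could be read as follows (a sketch; the lines obtained are the same, only the warnings are suppressed):
Twitter <- readLines("./Capstone Project/final/en_US/en_US.twitter.txt", warn = FALSE)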
The table below summarizes the basic features of the three data sets: number of lines, characters, words, and file size.
library(stringi)
Names <- c("Blog", "Twitter", "News")
Length <- c(length(Blog), length(Twitter), length(News))
Chars <- c(sum(nchar(Blog)),sum(nchar(Twitter)),sum(nchar(News)))
Max_chars <- c(max(nchar(Blog)),max(nchar(Twitter)),max(nchar(News)))
Median_chars <- c(median(nchar(Blog)),median(nchar(Twitter)),median(nchar(News)))
Words <- c(sum(stri_count_words(Blog)),sum(stri_count_words(Twitter)),sum(stri_count_words(News)))
Size <- c(file.info("./Capstone Project/final/en_US/en_US.blogs.txt")$size /1024^2, file.info("./Capstone Project/final/en_US/en_US.twitter.txt")$size /1024^2, file.info("./Capstone Project/final/en_US/en_US.news.txt")$size /1024^2) ###Divide by 1024^2 to get MB
Data_table <- data.frame("Name" = Names, "Nº Lines" = Length, "Nº Characters" = Chars, "Max char in a line" = Max_chars, "Median chars per line" = Median_chars, "Nº words" = Words, "Size in MB" = Size)
Data_table
## Name Nº.Lines Nº.Characters Max.char.in.a.line Median.chars.per.line
## 1 Blog 899288 208361438 40835 157
## 2 Twitter 2360148 162384825 213 64
## 3 News 77259 15683765 5760 186
## Nº.words Size.in.MB
## 1 38154238 200.4242
## 2 30218125 159.3641
## 3 2693898 196.2775
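If preferred, the same data frame can be rendered as a formatted table in the knitted report (a sketch, assuming the knitr package is installed):
library(knitr)
kable(Data_table, digits = 1)   ###Nicer rendering than raw console output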
As these data sets are very large, we'll take only a sample from them (3% of each one) and join the samples into a single corpus, converting the text to a common encoding (ASCII, dropping non-convertible characters).
Blog_sample <- sample(Blog, length(Blog)*0.03)
Twitter_sample <- sample(Twitter, length(Twitter)*0.03)
News_sample <- sample(News, length(News)*0.03)
Total_sample <- c(Blog_sample, Twitter_sample, News_sample)
Corpus <- iconv(Total_sample, from = "UTF-8", to = "ASCII", sub = "")
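Since sample() is random, re-running this report would draw a different corpus. Setting a seed before the sampling above makes it reproducible (a sketch; the seed value is arbitrary):
set.seed(2023)   ###Fix the RNG state; place before the sample() calls above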
We'll then perform some cleaning on this corpus. In the order written below, the steps taken to tidy the data are: convert to lowercase; remove punctuation marks, numbers, English stopwords and profanity (a sketch of the profanity step is shown after the cleaning code); and strip the extra whitespace left by these removals.
library(tm)
## Loading required package: NLP
Corpus <- VCorpus(VectorSource(Corpus))
Corpus <- tm_map(Corpus, content_transformer(tolower))
Corpus <- tm_map(Corpus, removePunctuation)
Corpus <- tm_map(Corpus, removeNumbers)
Corpus <- tm_map(Corpus, removeWords, stopwords("english"))
Corpus <- tm_map(Corpus, stripWhitespace)
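The profanity filter mentioned above uses the same removeWords transformation; a minimal sketch, where profanity_words is a hypothetical character vector (in practice it would be loaded from an external word list):
profanity_words <- c("badword1", "badword2")   ###Placeholder list, not a real lexicon
Corpus <- tm_map(Corpus, removeWords, profanity_words)
Corpus <- tm_map(Corpus, stripWhitespace)   ###Clean up the gaps left by the removal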
After cleaning the corpus, it's time to produce a few plots illustrating the most frequent n-grams (unigrams, bigrams, trigrams and quadgrams).
library(RWeka)
library(tm)
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigram_tdm <- TermDocumentMatrix(Corpus, control = list(tokenize = unigram))
unigram_freq <- findFreqTerms(unigram_tdm, lowfreq = 40)
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigram_tdm <- TermDocumentMatrix(Corpus, control = list(tokenize = bigram))
bigram_freq <- findFreqTerms(bigram_tdm, lowfreq = 40)
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigram_tdm <- TermDocumentMatrix(Corpus, control = list(tokenize = trigram))
trigram_freq <- findFreqTerms(trigram_tdm, lowfreq = 10)
quadgram <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
quadgram_tdm <- TermDocumentMatrix(Corpus, control = list(tokenize = quadgram))
quadgram_freq <- findFreqTerms(quadgram_tdm, lowfreq = 10)
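Since the tokenize/term-matrix/frequency pattern is identical for every n, it could also be written once as a helper (a sketch; the function name ngram_freq_terms is my own, not from any package):
ngram_freq_terms <- function(corpus, n, lowfreq) {
  tok <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tok))
  list(tdm = tdm, freq = findFreqTerms(tdm, lowfreq = lowfreq))   ###Keep both the matrix and the frequent terms
}
###Example, equivalent to the bigram code above:
###bigram_res <- ngram_freq_terms(Corpus, 2, 40)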
After tokenizing the corpus into these n-grams, we now produce a frequency barplot for each n-gram size.
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
unigram_freq_sums <- rowSums(as.matrix(unigram_tdm[unigram_freq,]))
unigram_ord <- order(unigram_freq_sums, decreasing = T)
unigram_freq_sums <- data.frame(word=names(unigram_freq_sums[unigram_ord]), frequency=unigram_freq_sums[unigram_ord])
ggplot(unigram_freq_sums[1:25,], aes(factor(word, levels = unique(word)), frequency)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90)) + xlab("Unigram") + ylab("Frequency")
library(ggplot2)
bigram_freq_sums <- rowSums(as.matrix(bigram_tdm[bigram_freq,]))
bigram_ord <- order(bigram_freq_sums, decreasing = T)
bigram_freq_sums <- data.frame(word=names(bigram_freq_sums[bigram_ord]), frequency=bigram_freq_sums[bigram_ord])
ggplot(bigram_freq_sums[1:25,], aes(factor(word, levels = unique(word)), frequency)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90)) + xlab("Bigram") + ylab("Frequency")
library(ggplot2)
trigram_freq_sums <- rowSums(as.matrix(trigram_tdm[trigram_freq,]))
trigram_ord <- order(trigram_freq_sums, decreasing = T)
trigram_freq_sums <- data.frame(word=names(trigram_freq_sums[trigram_ord]), frequency=trigram_freq_sums[trigram_ord])
ggplot(trigram_freq_sums[1:25,], aes(factor(word, levels = unique(word)), frequency)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90)) + xlab("Trigram") + ylab("Frequency")
quadgram_freq_sums <- rowSums(as.matrix(quadgram_tdm[quadgram_freq,]))
quadgram_ord <- order(quadgram_freq_sums, decreasing = T)
quadgram_freq_sums <- data.frame(word=names(quadgram_freq_sums[quadgram_ord]), frequency=quadgram_freq_sums[quadgram_ord])
ggplot(quadgram_freq_sums[1:25,], aes(factor(word, levels = unique(word)), frequency)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 90)) + xlab("Quadgram") + ylab("Frequency")
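The plotting code is also the same for each n-gram size, so it could be factored into a single function (a sketch; plot_top_ngrams is a name of my own, not from any package):
plot_top_ngrams <- function(tdm, terms, label, top = 25) {
  sums <- sort(rowSums(as.matrix(tdm[terms, ])), decreasing = TRUE)
  sums <- head(sums, top)   ###Keep the `top` most frequent terms
  df <- data.frame(word = names(sums), frequency = sums)
  ggplot(df, aes(factor(word, levels = unique(word)), frequency)) +
    geom_bar(stat = "identity") +
    theme(axis.text.x = element_text(angle = 90)) +
    xlab(label) + ylab("Frequency")
}
###Example: plot_top_ngrams(trigram_tdm, trigram_freq, "Trigram")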
Using quadgrams to see which expressions are most repeated is not very useful, as the expressions shown don't make much sense. Setting quadgrams aside and comparing the other barplots, we see that for unigrams the frequency decreases at a fairly constant rate, whereas for bigrams and especially trigrams the frequency distribution is more unequal: the top n-grams account for a larger share of the total.
Besides showing that quadgram analysis returns expressions that don't make much sense, these results also indicate that the cleaning step should remove further words beyond those mentioned above.