This is the milestone report for the Coursera Data Science Capstone project. It describes the major features of the training data through an exploratory data analysis and summarizes the plan for building the predictive model.
The source data is available at this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
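If the archive has not been downloaded yet, it can be fetched and unpacked directly from R. A minimal sketch (the destination file name and the check on the extracted folder are assumptions):
# Download and unpack the SwiftKey data set (sketch; destination names are assumptions)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
if (!dir.exists("final")) {
  unzip("Coursera-SwiftKey.zip")  # expands to a final/ folder containing the en_US files used here
}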
# Loading Libraries
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(widyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(ggraph)
library(tidytext)
library(tidyr)
As a first step, I load the downloaded data from my local working directory and run a simple exploratory analysis of the three English files:
setwd("~/Jupyter_Varios/R/Final_Project/data/en_US/")
twitter <- readLines("~/Jupyter_Varios/R/Final_Project/data/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
blogs <- readLines("~/Jupyter_Varios/R/Final_Project/data/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines("~/Jupyter_Varios/R/Final_Project/data/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
length(twitter)
## [1] 2360148
length(blogs)
## [1] 899288
length(news)
## [1] 1010242
twitterWords <- stri_stats_latex(twitter)
blogsWords <- stri_stats_latex(blogs)
newsWords <- stri_stats_latex(news)
ncharTwitter <- sum(nchar(twitter))
ncharBlogs <- sum(nchar(blogs))
ncharNews <- sum(nchar(news))
data.frame("File Name" = c("twitter", "blogs", "news"),
"num.lines" = c(length(twitter),length(blogs), length(news)),
"num.words" = c(sum(blogsWords), sum(newsWords), sum(twitterWords)),
"Num of character" = c(ncharBlogs,ncharNews,ncharTwitter))
## File.Name num.lines num.words Num.of.character
## 1 twitter 2360148 242672204 206824505
## 2 blogs 899288 236985627 203223159
## 3 news 1010242 191984472 162096241
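Note that sum(stri_stats_latex(x)) adds together every statistic stringi reports (word characters, whitespace characters, and the word count itself), so the num.words column above overstates the actual number of words. A stricter count could be obtained with stri_count_words(); a minimal sketch, not evaluated for this report:
# Cross-check: count words directly with stringi (sketch, not evaluated here)
wordCounts <- c(twitter = sum(stri_count_words(twitter)),
                blogs   = sum(stri_count_words(blogs)),
                news    = sum(stri_count_words(news)))
wordCounts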
Next, I convert the texts to ASCII, draw a 1% sample from each source, and run a cleaning process on the resulting corpus:
set.seed(10000)
blog.c <- iconv(blogs, "latin1", "ASCII", sub = "")
news.c <- iconv(news, "latin1", "ASCII", sub = "")
twitter.c <- iconv(twitter, "latin1", "ASCII", sub = "")
sampleData<-c(sample(twitter.c,length(twitter.c)*0.01),
sample(blog.c,length(blog.c)*0.01),
sample(news.c,length(news.c)*0.01))
corpus <- VCorpus(VectorSource(sampleData))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")  # strip URLs
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")                      # strip Twitter handles
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
corpusResult <- data.frame(text=unlist(sapply(corpus,'[',"content")),stringsAsFactors = FALSE)
head(corpusResult)
## text
## 1 tiger woods poker night phil hellmuth doyle brunson
## 2 suddenly feel start checking retirement homes
## 3 acutely aware fact major blowers building sound good microphones
## 4 seem right avatar
## 5 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi
tail(corpusResult)
## text
## 42690 look new site done persimmon walls dark wood offer additional malaysian menu carried cleveland heights coowner founder freeman ngo former singaporean space hopes offer solon location end year
## 42691 just introduced wished luck pettitte recalled just fired somebody looked watching since high school see meet awesome
## 42692 bera delman
## 42693 love books way emphasize preparing eating good food something can every single day rituals create around dinner table establish sacred bonds among family
## 42694 q working
## 42695 come wednesday night showers likely colder temps along light winds can expect lows around city around degrees outlying areas
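Sampling and cleaning the corpus is relatively slow, so it can be convenient to cache the cleaned sample and reload it in later sessions instead of recomputing it. A minimal sketch (the .rds file name is an assumption):
# Cache the cleaned sample so later sessions can skip the cleaning step (sketch)
saveRDS(corpusResult, file = "corpusResult.rds")
# corpusResult <- readRDS("corpusResult.rds")  # reload in a later session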
Next, I build unigram, bigram, and trigram frequency tables from the corpus with RWeka and plot the most frequent terms.
# Unigrams
library(RWeka)
unigram <- function(x) NGramTokenizer(x, Weka_control(min=1,max=1))
unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(Word = names(unigramcorpusnum), frequency = unigramcorpusnum)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]
# Bigrams
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(Word = names(bigramcorpusnum), frequency = bigramcorpusnum)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]
# Trigrams
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(Word = names(trigramcorpusnum), frequency = trigramcorpusnum)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency),]
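The same kind of frequency tables can also be built with the tidytext and dplyr packages loaded earlier, working directly on the corpusResult data frame rather than a term-document matrix. A minimal sketch for bigrams (column names as defined above); the counts should be broadly comparable to bigramcorpussort:
# Alternative bigram counts with tidytext/dplyr (sketch)
bigramTidy <- corpusResult %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  count(bigram, sort = TRUE)
head(bigramTidy)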
ggplot(unigramcorpussort[1:15,], aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("darkblue")) +
  labs(title = "Unigrams", x = "Word", y = "Frequency") +
  theme(axis.text = element_text(angle = 90))
ggplot(bigramcorpussort[1:15,], aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("darkgreen")) +
  labs(title = "Bigrams", x = "Bigram", y = "Frequency") +
  theme(axis.text = element_text(angle = 90))
ggplot(trigramcorpussort[1:12,], aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("darkred")) +
  labs(title = "Trigrams", x = "Trigram", y = "Frequency") +
  theme(axis.text = element_text(angle = 90))
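Since ggraph and tidyr are already loaded, the most frequent bigrams can also be displayed as a word network. A minimal sketch (igraph is an extra dependency assumed to be installed; the cut-off of 30 bigrams is arbitrary):
# Optional: word network for the 30 most frequent bigrams (sketch)
library(igraph)
bigramEdges <- bigramcorpussort[1:30, ]
bigramEdges$Word <- as.character(bigramEdges$Word)          # ensure character, not factor
bigramEdges <- separate(bigramEdges, Word, into = c("word1", "word2"), sep = " ")
bigramGraph <- graph_from_data_frame(bigramEdges)           # frequency becomes an edge attribute
ggraph(bigramGraph, layout = "fr") +
  geom_edge_link(alpha = 0.6) +
  geom_node_point(color = "darkblue", size = 3) +
  geom_node_text(aes(label = name), vjust = 1.2) +
  theme_void()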
Now we can see the most frequent words (unigrams) and word combinations (bigrams and trigrams) in the sample data.
As a next step, I will finalize the predictive algorithm and develop the Shiny app.
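To make the plan more concrete, below is a minimal sketch of how the n-gram tables built above could drive a next-word suggestion, backing off from trigrams to bigrams to unigrams. The function name predictNextWord and the naive back-off logic are illustrative assumptions, not the final algorithm:
# Illustrative back-off lookup over the n-gram tables built above (sketch only)
predictNextWord <- function(phrase, n = 3) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)
  lastWord <- function(x) sapply(strsplit(as.character(x), " "), tail, 1)
  # 1. Trigrams whose first two words match the end of the phrase
  if (length(words) == 2) {
    hits <- subset(trigramcorpussort, startsWith(as.character(Word), paste(words[1], words[2], "")))
    if (nrow(hits) > 0) return(head(lastWord(hits$Word), n))
  }
  # 2. Back off to bigrams whose first word matches the last word of the phrase
  hits <- subset(bigramcorpussort, startsWith(as.character(Word), paste(tail(words, 1), "")))
  if (nrow(hits) > 0) return(head(lastWord(hits$Word), n))
  # 3. Fall back to the most frequent unigrams overall
  head(as.character(unigramcorpussort$Word), n)
}
predictNextWord("happy new")  # might suggest "year" among its candidates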
Thank you.