Introduction

This is the Milestone Report for the Coursera Data Science Capstone project. It describes the major features of the training data through an exploratory data analysis and summarizes the plan for building the predictive model.

Loading Data

The source data is available at this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
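
For reproducibility, the archive can be downloaded and unzipped directly from R. The sketch below is illustrative rather than the exact steps used in this report: it assumes a local data/ folder and uses only base R functions (download.file and unzip).

# Hedged sketch: fetch and extract the dataset into a local "data" folder.
# The destination paths are assumptions, not the paths used in this report.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists("data")) dir.create("data")
zipFile <- file.path("data", "Coursera-SwiftKey.zip")
if (!file.exists(zipFile)) {
  download.file(url, destfile = zipFile, mode = "wb")
  unzip(zipFile, exdir = "data")
}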

# Loading Libraries

library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(stringi)
library(tm)
## Loading required package: NLP
library(NLP)
library(widyr)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(ggraph)
library(tidytext)
library(tidyr)

First Steps

As a first step, I load the downloaded data from my local working directory and run a simple exploratory analysis of the dataset:

setwd("~/Jupyter_Varios/R/Final_Project/data/en_US/")

twitter <- readLines("~/Jupyter_Varios/R/Final_Project/data/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

blogs <- readLines("~/Jupyter_Varios/R/Final_Project/data/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

news <- readLines("~/Jupyter_Varios/R/Final_Project/data/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

Number of lines in each file

length(twitter)
## [1] 2360148
length(blogs)
## [1] 899288
length(news)
## [1] 1010242

Stats by file

# Word statistics (via stri_stats_latex) and total character counts per file
twitterWords <- stri_stats_latex(twitter)
blogsWords <- stri_stats_latex(blogs)
newsWords <- stri_stats_latex(news)

ncharTwitter <- sum(nchar(twitter))
ncharBlogs <- sum(nchar(blogs))
ncharNews <- sum(nchar(news))

data.frame("File Name" = c("twitter", "blogs", "news"),
           "num.lines" = c(length(twitter),length(blogs), length(news)),
           "num.words" = c(sum(blogsWords), sum(newsWords), sum(twitterWords)),
           "Num of character" = c(ncharBlogs,ncharNews,ncharTwitter))
##   File.Name num.lines num.words Num.of.character
## 1   twitter   2360148 242672204        206824505
## 2     blogs    899288 236985627        203223159
## 3      news   1010242 191984472        162096241
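
Note that stri_stats_latex() returns several statistics per text (character counts, whitespace counts, the number of words, and so on), so summing the whole vector, as done above, mixes word and character counts and overstates the true word totals. A minimal sketch of pulling out only the Words element is shown below; the corrected totals are not reproduced here.

# Hedged sketch: keep only the "Words" statistic from stri_stats_latex()
wordCounts <- c(twitter = twitterWords[["Words"]],
                blogs = blogsWords[["Words"]],
                news = newsWords[["Words"]])
wordCounts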

Exploratory Data Analysis

Now I take a 1% random sample of each file, clean it, and run some exploratory analysis on the result:

set.seed(10000)

# Drop non-ASCII characters before sampling
blog.c <- iconv(blogs, "latin1", "ASCII", sub = "")
news.c <- iconv(news, "latin1", "ASCII", sub = "")
twitter.c <- iconv(twitter, "latin1", "ASCII", sub = "")

# Combine a 1% random sample from each source
sampleData <- c(sample(twitter.c, length(twitter.c) * 0.01),
                sample(blog.c, length(blog.c) * 0.01),
                sample(news.c, length(news.c) * 0.01))

# Build a corpus and clean it: strip URLs and Twitter handles, lower-case,
# remove English stopwords, punctuation and numbers, and collapse whitespace
corpus <- VCorpus(VectorSource(sampleData))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
corpus <- tm_map(corpus, toSpace, "@[^\\s]+")
corpus <- tm_map(corpus, tolower)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)

# Flatten the cleaned corpus back into a data frame for inspection
corpusResult <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
head(corpusResult)
##                                                                                       text
## 1                                      tiger woods poker night phil hellmuth doyle brunson
## 2                                            suddenly feel start checking retirement homes
## 3                         acutely aware fact major blowers building sound good microphones
## 4                                                                        seem right avatar
## 5                                 new empathic approach used one high school great results
## 6 many diagnosed altzheimers old head injury ever wonder pres reagan subdural hematoma tbi
tail(corpusResult)
##                                                                                                                                                                                                   text
## 42690  look new site done persimmon walls dark wood offer additional malaysian menu carried cleveland heights coowner founder freeman ngo former singaporean space hopes offer solon location end year
## 42691                                                                             just introduced wished luck pettitte recalled just fired somebody looked watching since high school see meet awesome
## 42692                                                                                                                                                                                      bera delman
## 42693                                        love books way emphasize preparing eating good food something can every single day rituals create around dinner table establish sacred bonds among family
## 42694                                                                                                                                                                                        q working
## 42695                                                                     come wednesday night showers likely colder temps along light winds can expect lows around city around degrees outlying areas

Build N-grams and Plot Graphs

Building N-grams with RWeka

Unigram

# Unigrams

library(RWeka)

# Tokenizer that produces single words
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))

unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))

# Keep unigrams that appear at least 1000 times and sort by descending frequency
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(Word = names(unigramcorpusnum), frequency = unigramcorpusnum)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]

Bigrams

# Tokenizer that produces two-word sequences
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))

# Keep bigrams that appear at least 80 times and sort by descending frequency
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(Word = names(bigramcorpusnum), frequency = bigramcorpusnum)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]

Trigrams

# Tokenizer that produces three-word sequences
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))

# Keep trigrams that appear at least 10 times and sort by descending frequency
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(Word = names(trigramcorpusnum), frequency = trigramcorpusnum)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency), ]
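
The unigram, bigram, and trigram blocks above repeat the same pattern with different n-gram sizes and frequency cut-offs, so they could be collapsed into a single helper. The sketch below only illustrates that refactoring; ngramFreq is a hypothetical name, not part of the report's code.

# Hedged sketch of a reusable frequency-table builder for any n-gram size
ngramFreq <- function(corpus, n, lowfreq) {
  tok <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tok))
  terms <- findFreqTerms(tdm, lowfreq = lowfreq)
  counts <- rowSums(as.matrix(tdm[terms, ]))
  freq <- data.frame(Word = names(counts), frequency = counts)
  freq[order(-freq$frequency), ]
}

# Example usage with the same cut-offs as above:
# unigramcorpussort <- ngramFreq(corpus, 1, 1000)
# bigramcorpussort  <- ngramFreq(corpus, 2, 80)
# trigramcorpussort <- ngramFreq(corpus, 3, 10)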

Plotting N-grams with ggplot2

Unigram

ggplot(unigramcorpussort[1:15, ], aes(x = reorder(Word, -frequency), y = frequency)) + 
  geom_bar(stat = "identity", fill = I("dark blue")) + 
  labs(title = "Unigrams", x = "Most Frequent Words", y = "Frequency") +
  theme(axis.text = element_text(angle = 90))

Bigram

ggplot(bigramcorpussort[1:15, ], aes(x = reorder(Word, -frequency), y = frequency)) + 
  geom_bar(stat = "identity", fill = I("dark green")) + 
  labs(title = "Bigrams", x = "Most Frequent Words", y = "Frequency") +
  theme(axis.text = element_text(angle = 90))

Trigram

ggplot(trigramcorpussort[1:12, ], aes(x = reorder(Word, -frequency), y = frequency)) + 
  geom_bar(stat = "identity", fill = I("dark red")) + 
  labs(title = "Trigrams", x = "Most Frequent Words", y = "Frequency") +
  theme(axis.text = element_text(angle = 90))
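
Since the three plots share the same structure, a small plotting helper could replace them as well. This is a hedged sketch; plotNgram is a hypothetical name, not part of the report's code.

# Hedged sketch of a shared plotting helper for the frequency tables above
plotNgram <- function(freqTable, topN, title, fillColour) {
  ggplot(freqTable[1:topN, ], aes(x = reorder(Word, -frequency), y = frequency)) +
    geom_bar(stat = "identity", fill = fillColour) +
    labs(title = title, x = "Most Frequent Words", y = "Frequency") +
    theme(axis.text = element_text(angle = 90))
}

# Example usage:
# plotNgram(unigramcorpussort, 15, "Unigrams", "dark blue")
# plotNgram(bigramcorpussort, 15, "Bigrams", "dark green")
# plotNgram(trigramcorpussort, 12, "Trigrams", "dark red")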

Exploratory Data Analysis Comments

Now we can see the most frequent words (unigrams) and the most frequent word combinations (bigrams and trigrams) in the sample data.

As a next step, I will finalize the predictive algorithm and develop the Shiny app.
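
While the prediction logic is not finalized here, a simple backoff over the n-gram tables built above could look roughly like the sketch below. predictNextWord is a hypothetical helper, not the final algorithm; it assumes the frequency tables keep their phrases as space-separated strings (as NGramTokenizer produces) and are sorted by descending frequency.

# Hedged sketch of a backoff next-word predictor over the frequency tables above
predictNextWord <- function(phrase, unigrams = unigramcorpussort,
                            bigrams = bigramcorpussort, trigrams = trigramcorpussort) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n == 0) return(as.character(unigrams$Word[1]))
  # Try trigrams first: match on the last two words of the input
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- trigrams[startsWith(as.character(trigrams$Word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Word[1])))
  }
  # Back off to bigrams: match on the last word only
  hits <- bigrams[startsWith(as.character(bigrams$Word), paste0(words[n], " ")), ]
  if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$Word[1])))
  # Final fallback: the single most frequent unigram
  as.character(unigrams$Word[1])
}

# Example usage:
# predictNextWord("thanks for the")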

Thank you.