Executive Summary

The goal of this project is to build a next-word prediction application based on data from SwiftKey, available at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.

In this document we present an exploratory analysis of the data and our goals for the eventual app and algorithm.

Data loading

Once the data files are in our working directory we read them. To keep memory use manageable, we load only the first 10,000 lines of each file; the commented lines read the full files.

library(tm)
## Loading required package: NLP
library(SnowballC)
library(RWeka)

blogs.file="./en_US.blogs.txt"
news.file="./en_US.news.txt"
twitter.file="en_US.twitter.txt"
blogs <- readLines(blogs.file,10000)
news <- readLines(news.file,10000)
twitter <- readLines(twitter.file,10000)
#blogs <- readLines(blogs.file)
#news <- readLines(news.file)
#twitter <- readLines(twitter.file)
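
Note that taking the first 10,000 lines biases the sample toward the start of each file. A minimal alternative sketch, which reads each full file once and keeps a random 1% of lines (the sample_lines helper and the 1% rate are illustrative, not part of the original analysis):

# Hypothetical helper: keep each line of a file with probability p
sample_lines <- function(path, p = 0.01, seed = 1234) {
  set.seed(seed)                          # make the sample reproducible
  all <- readLines(path, skipNul = TRUE)  # read the full file once
  all[rbinom(length(all), 1, p) == 1]     # coin-flip per line
}

#blogs <- sample_lines(blogs.file)
#news <- sample_lines(news.file)
#twitter <- sample_lines(twitter.file)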

Exploratory Analysis

First we check the size of each file in bytes:

c(file.info(blogs.file)$size, file.info(news.file)$size, file.info(twitter.file)$size)
## [1] 210160014 205811889 167105338
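
The raw byte counts are easier to read as megabytes, roughly 200, 196, and 159 MB (a small convenience sketch, not in the original analysis):

# file sizes in megabytes, rounded to one decimal
round(c(file.size(blogs.file), file.size(news.file), file.size(twitter.file)) / 1024^2, 1)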

Next we count the number of lines in each source file (using the Unix wc utility, so this step requires a Unix-like system):

# line counts of the 10,000-line samples:
#c(length(blogs), length(news), length(twitter))
# line counts of the full files via wc:
c(
  system(paste("wc -l", blogs.file), intern = TRUE),
  system(paste("wc -l", news.file), intern = TRUE),
  system(paste("wc -l", twitter.file), intern = TRUE)
)
## [1] "899288 ./en_US.blogs.txt"  "1010242 ./en_US.news.txt" 
## [3] "2360148 en_US.twitter.txt"

We also compute the maximum line length, in characters, within each 10,000-line sample:

c(max(nchar(blogs)), max(nchar(news)), max(nchar(twitter)))
## [1] 3831 1929  140
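
Line length alone says little about vocabulary, so a quick per-line word count over the same samples can complement it (a base-R sketch; the word_counts helper is not part of the original analysis):

# approximate words per line: split on runs of whitespace and count
word_counts <- function(x) lengths(strsplit(x, "\\s+"))
summary(word_counts(blogs))
summary(word_counts(news))
summary(word_counts(twitter))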

Corpus creation and data cleaning

Now we combine the three samples into a single corpus and clean it: we remove numbers and punctuation, convert to lower case, drop English stop words, collapse extra whitespace, and stem the words.

corpus <- VCorpus(VectorSource(c(blogs,news,twitter)))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)
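
To verify the transformations behaved as intended, we can spot-check one cleaned document (a sanity check, not part of the original pipeline):

# the first document should now be lower case, stemmed, and free of
# numbers, punctuation, and English stop words
writeLines(as.character(corpus[[1]]))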

We also filter profanity, using the word list from http://www.bannedwordlist.com/lists/swearWords.txt:

if (!file.exists("./swearWords.txt")){
  download.file(url="http://www.bannedwordlist.com/lists/swearWords.txt",
                destfile="./swearWords.txt")
}

# read the profanity word list (one word per line)
profanity <- readLines("./swearWords.txt")
# remove profanity words from the corpus (removeWords expects a
# character vector, not a VectorSource)
corpus <- tm_map(corpus, removeWords, profanity)
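
A quick check (illustrative, not in the original analysis) that the first listed word no longer appears anywhere in the corpus:

# TRUE would mean the profanity filter missed this word
any(grepl(paste0("\\b", profanity[1], "\\b"),
          unlist(sapply(corpus, as.character))))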

N-gram tokenization

# flatten the cleaned corpus back into a data frame of text
corpusdf <- data.frame(text = unlist(sapply(corpus, '[', "content")), stringsAsFactors = FALSE)
delim <- " \\t\\r\\n.!?,;\"()"
onetoken <- NGramTokenizer(corpusdf, Weka_control(min = 1, max = 1))
twotoken <- NGramTokenizer(corpusdf, Weka_control(min = 2, max = 2, delimiters = delim))
threetoken <- NGramTokenizer(corpusdf, Weka_control(min = 3, max = 3, delimiters = delim))
fourtoken <- NGramTokenizer(corpusdf, Weka_control(min = 4, max = 4, delimiters = delim))
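
To illustrate what NGramTokenizer produces, here is a toy example unrelated to the corpus (the expected output comment reflects the tokenizer's standard sliding-window behavior):

NGramTokenizer("the quick brown fox", Weka_control(min = 2, max = 2))
## expected: "the quick"   "quick brown" "brown fox"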

Tokenization graphs

topone <- as.data.frame(table(onetoken))
toponefinal <- head(topone[order(-topone$Freq), ], 10)
print(toponefinal)
##       onetoken Freq
## 28335     said 2942
## 36414     will 2776
## 23337      one 2670
## 18695     like 2383
## 17075     just 2366
## 33249     time 2254
## 12906      get 2231
## 4864       can 2115
## 13177       go 2005
## 37169     year 1962
barplot(toponefinal$Freq, names.arg = toponefinal[, 1], main = "OneGram top words", ylab = "count", xlab = "words")

toptwo <- as.data.frame(table(twotoken))
toptwofinal <- head(toptwo[order(-toptwo$Freq), ], 10)
print(toptwofinal)
##           twotoken Freq
## 184446   last year  214
## 226929    new york  181
## 94165    dont know  165
## 284477   right now  153
## 386259    year ago  144
## 197190   look like  136
## 153998 high school  134
## 117727   feel like  132
## 122046  first time  124
## 184438   last week  118
barplot(toptwofinal$Freq, names.arg = toptwofinal[, 1], main = "TwoGram top", ylab = "count", xlab = "2-grams")

topthree <- as.data.frame(table(threetoken))
topthreefinal <- head(topthree[order(-topthree$Freq), ], 10)
print(topthreefinal)
##                 threetoken Freq
## 58125        cant wait see   23
## 276938       new york citi   23
## 319588 presid barack obama   19
## 144439     first time sinc   18
## 178171    happi mother day   18
## 277019       new york time   18
## 436799               u u u   17
## 178180      happi new year   15
## 436090        two year ago   15
## 229118         let us know   13
barplot(topthreefinal$Freq, names.arg = topthreefinal[, 1], main = "ThreeGram top", ylab = "count", xlab = "3-grams")
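
The four-grams computed earlier can be summarized the same way (output omitted here; topfour and topfourfinal simply follow the naming pattern above):

topfour <- as.data.frame(table(fourtoken))
topfourfinal <- head(topfour[order(-topfour$Freq), ], 10)
print(topfourfinal)
barplot(topfourfinal$Freq, names.arg = topfourfinal[, 1], main = "FourGram top", ylab = "count", xlab = "4-grams")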

Future development plans

I plan to prune the one-gram corpus to reduce its size. It is also necessary to implement the prediction algorithm and its supporting procedures. A Shiny app will be built from the results, and I will experiment with Markov chains as well.
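
As a concrete starting point for that algorithm, here is a minimal sketch of a "stupid backoff" style next-word predictor built on the frequency tables above (the predict_next helper is hypothetical; because the corpus was stemmed and stripped of stop words, input phrases must be preprocessed the same way before lookup):

# Hypothetical sketch: back off from three-grams to two-grams to one-grams
predict_next <- function(phrase) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  stopifnot(n >= 1)
  # 1) look for three-grams starting with the last two input words
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- topthree[startsWith(as.character(topthree$threetoken),
                                paste0(prefix, " ")), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits$threetoken[which.max(hits$Freq)])
      return(tail(strsplit(best, " ")[[1]], 1))
    }
  }
  # 2) back off to two-grams starting with the last input word
  hits <- toptwo[startsWith(as.character(toptwo$twotoken),
                            paste0(words[n], " ")), ]
  if (nrow(hits) > 0) {
    best <- as.character(hits$twotoken[which.max(hits$Freq)])
    return(tail(strsplit(best, " ")[[1]], 1))
  }
  # 3) fall back to the single most frequent word overall
  as.character(topone$onetoken[which.max(topone$Freq)])
}

predict_next("happi new")   # e.g. "year", given the three-gram counts above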