The goal of this project is to create a next-word prediction application based on data from SwiftKey, available at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
In this document we present an exploratory analysis of the data and our goals for the eventual app and algorithm.
Once the data is in our working directory, we read the files.
library(tm)
## Loading required package: NLP
library(SnowballC)
library(RWeka)
blogs.file="./en_US.blogs.txt"
news.file="./en_US.news.txt"
twitter.file="en_US.twitter.txt"
# read the first 10,000 lines of each file for this exploratory analysis
blogs <- readLines(blogs.file,10000)
news <- readLines(news.file,10000)
twitter <- readLines(twitter.file,10000)
# full reads, left commented out to keep this analysis fast
#blogs <- readLines(blogs.file)
#news <- readLines(news.file)
#twitter <- readLines(twitter.file)
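Note that taking the first 10,000 lines is a convenience sample rather than a random one. A possible alternative, left commented out like the full reads above, would be to sample lines at random (this sketch assumes the full files fit in memory and are UTF-8 encoded; skipNul avoids warnings about embedded NULs):
# set.seed(1234)
# blogs <- sample(readLines(blogs.file, encoding = "UTF-8", skipNul = TRUE), 10000)
# news <- sample(readLines(news.file, encoding = "UTF-8", skipNul = TRUE), 10000)
# twitter <- sample(readLines(twitter.file, encoding = "UTF-8", skipNul = TRUE), 10000)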
First we check the size of the files (in bytes):
c(file.info(blogs.file)$size,file.info(news.file)$size,file.info(twitter.file)$size)
## [1] 210160014 205811889 167105338
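These sizes are in bytes; dividing by 1024^2 gives roughly 200 MB, 196 MB and 159 MB:
# round(c(file.info(blogs.file)$size, file.info(news.file)$size, file.info(twitter.file)$size)/1024^2, 1)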
Next we count the number of lines in each text source.
#c(length(blogs),length(news),length(twitter))
c(
system(paste("wc -l", blogs.file),intern = TRUE),
system(paste("wc -l", news.file),intern = TRUE),
system(paste("wc -l", twitter.file),intern = TRUE)
)
## [1] "899288 ./en_US.blogs.txt" "1010242 ./en_US.news.txt"
## [3] "2360148 en_US.twitter.txt"
We also compute the maximum line length (in characters) for each source:
c(max(nchar(blogs)),max(nchar(news)),max(nchar(twitter)))
## [1] 3831 1929 140
Now we combine the three samples into a single corpus and clean it: we remove numbers and punctuation, convert to lower case, drop English stop words, strip extra whitespace and stem the documents.
corpus <- VCorpus(VectorSource(c(blogs,news,twitter)))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)
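To spot-check that the transformations behaved as expected, the content of a single cleaned document can be inspected:
# content(corpus[[1]])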
Next we remove profanity, using the word list from http://www.bannedwordlist.com/lists/swearWords.txt
if (!file.exists("./swearWords.txt")){
download.file(url="http://www.bannedwordlist.com/lists/swearWords.txt",
destfile="./swearWords.txt")
}
# read the profanity word list
profanity <- readLines("./swearWords.txt")
# remove profanity words from the corpus (removeWords expects a character vector)
corpus <- tm_map(corpus, removeWords, profanity)
# convert the cleaned corpus back to a plain data frame of text
corpusdf <- data.frame(text=unlist(sapply(corpus, '[', "content")), stringsAsFactors=FALSE)
delim <- " \\t\\r\\n.!?,;\"()"
# tokenize into 1-, 2-, 3- and 4-grams with RWeka
onetoken <- NGramTokenizer(corpusdf, Weka_control(min=1, max=1))
twotoken <- NGramTokenizer(corpusdf, Weka_control(min=2, max=2, delimiters = delim))
threetoken <- NGramTokenizer(corpusdf, Weka_control(min=3, max=3, delimiters = delim))
fourtoken <- NGramTokenizer(corpusdf, Weka_control(min=4, max=4, delimiters = delim))
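NGramTokenizer depends on RWeka and a working Java installation. If Java is not available, a rougher alternative sketch uses the ngrams() helper from the NLP package (already loaded as a dependency of tm) after splitting the text on whitespace; it is left commented out because the Weka tokenizer above is what the rest of this report uses:
# tokens <- unlist(strsplit(corpusdf$text, "\\s+"))
# tokens <- tokens[tokens != ""]
# twotoken.alt <- vapply(ngrams(tokens, 2L), paste, character(1), collapse = " ")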
topone<-as.data.frame(table(onetoken))
toponefinal<-(head(topone[order(-topone$Freq),],10))
print(toponefinal)
## onetoken Freq
## 28335 said 2942
## 36414 will 2776
## 23337 one 2670
## 18695 like 2383
## 17075 just 2366
## 33249 time 2254
## 12906 get 2231
## 4864 can 2115
## 13177 go 2005
## 37169 year 1962
barplot(toponefinal$Freq,names.arg=toponefinal[,1],main="OneGram top words",ylab="times",xlab="words")
toptwo<-as.data.frame(table(twotoken))
toptwofinal<-(head(toptwo[order(-toptwo$Freq),],10))
print(toptwofinal)
## twotoken Freq
## 184446 last year 214
## 226929 new york 181
## 94165 dont know 165
## 284477 right now 153
## 386259 year ago 144
## 197190 look like 136
## 153998 high school 134
## 117727 feel like 132
## 122046 first time 124
## 184438 last week 118
barplot(toptwofinal$Freq,names.arg=toptwofinal[,1],main="TwoGram top",ylab="times",xlab="2grams")
topthree<-as.data.frame(table(threetoken))
topthreefinal<-(head(topthree[order(-topthree$Freq),],10))
print(topthreefinal)
## threetoken Freq
## 58125 cant wait see 23
## 276938 new york citi 23
## 319588 presid barack obama 19
## 144439 first time sinc 18
## 178171 happi mother day 18
## 277019 new york time 18
## 436799 u u u 17
## 178180 happi new year 15
## 436090 two year ago 15
## 229118 let us know 13
barplot(topthreefinal$Freq,names.arg=topthreefinal[,1],main="ThreeGram top",ylab="times",xlab="3grams")
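The table-sort-print-plot step above is repeated verbatim for each n-gram size; a small helper function (the name top_ngrams is just illustrative) could factor out that repetition:
top_ngrams <- function(tokens, n = 10) {
  # tabulate the n-grams and keep the n most frequent ones
  tab <- as.data.frame(table(ngram = tokens))
  head(tab[order(-tab$Freq), ], n)
}
# e.g. top_ngrams(onetoken); top_ngrams(twotoken)
The only difference from the tables above is that the first column would always be named "ngram".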
I plan to reduce the corpus used for the one-gram model; in addition, the prediction algorithm and supporting procedures still need to be implemented. A Shiny app will be created with the results achieved, and I will also work with Markov chains.
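As a first, very rough sketch of that predictor (not the final algorithm), the bigram frequencies computed above can already drive a simple Markov-chain-style lookup: given the last word typed, return the most frequent words that follow it in the corpus. The function name predict_next is just illustrative:
predict_next <- function(word, bigrams = toptwo, k = 3) {
  # split each bigram "w1 w2" into its first and second word
  parts <- strsplit(as.character(bigrams$twotoken), " ")
  first <- sapply(parts, `[`, 1)
  second <- sapply(parts, `[`, 2)
  # keep the bigrams whose first word matches the input, ordered by frequency
  idx <- which(first == tolower(word))
  idx <- idx[order(-bigrams$Freq[idx])]
  head(second[idx], k)
}
# predict_next("last")  # should rank "year" and "week" highly, per the table above
A fuller predictor would likely add back-off to shorter n-grams for words never seen in a given context, which is where the three- and four-gram tables can help.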