The goal of this milestone report is to show that we have become familiar with the SwiftKey text data and that we are on track to build our word prediction algorithm.
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)
Set the working directory and download the data from: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
setwd("C:/Users/asus/Desktop/Data Science Capstone")
if (!file.exists("./swiftkey_data")) {
  dir.create("./swiftkey_data")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("./swiftkey_data/Coursera-SwiftKey.zip")) {
  download.file(Url, destfile = "./swiftkey_data/Coursera-SwiftKey.zip", mode = "wb")
}
if (!file.exists("./swiftkey_data/final")) {
  unzip(zipfile = "./swiftkey_data/Coursera-SwiftKey.zip", exdir = "./swiftkey_data")
}
setwd("C:/Users/asus/Desktop/Data Science Capstone/swiftkey_data/final/en_US")
blogs <- readLines("en_US.blogs.txt", encoding= "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding= "UTF-8", skipNul = TRUE)
## Warning in readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE):
## incomplete final line found on 'en_US.news.txt'
twitter <- readLines("en_US.twitter.txt", encoding= "UTF-8", skipNul = TRUE)
Some general statistics for each of the three files are given below.
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_latex(blogs)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162464653 9 42636700 37570839 3
## Envirs
## 0
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
stri_stats_latex(news)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 12476453 0 3096618 2651432 0
## Envirs
## 0
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
stri_stats_latex(twitter)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 125570778 3032 35958529 30451170 963
## Envirs
## 0
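For an easier side-by-side comparison, these counts can also be gathered into a single summary table. The short sketch below is illustrative (the file_stats name is not part of the original analysis) and uses only the objects already loaded above.
# Illustrative summary of the three loaded files
file_stats <- data.frame(
  source = c("blogs", "news", "twitter"),
  lines  = sapply(list(blogs, news, twitter), length),
  words  = sapply(list(blogs, news, twitter), function(x) sum(stri_count_words(x)))
)
file_stats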
The data set is very large, so a 1% random sample of each file is taken for this exploratory analysis.
set.seed(1234)
blogs <- sample(blogs, length(blogs) * 0.01)
twitter <- sample(twitter, length(twitter) * 0.01)
news <- sample(news, length(news) * 0.01)
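As a quick sanity check (an illustrative step, not required for the analysis), the sizes of the three samples can be confirmed to be roughly 1% of the originals:
# Number of lines retained in each sample
sapply(list(blogs = blogs, news = news, twitter = twitter), length)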
The sampled data is then cleaned: the text is converted to lower case, and numbers, punctuation, English stopwords, and extra whitespace are removed.
corpus <- VCorpus(VectorSource(c(blogs, twitter, news)))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
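To spot-check the cleaning, the content of a few documents can be inspected (an illustrative step; output omitted here):
# Confirm lower case and removal of numbers, punctuation and stopwords
lapply(corpus[1:3], as.character)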
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
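Before building the term-document matrices, the tokenizers can be tried on a short example string (an illustrative call, not part of the original analysis):
# Split a sample sentence into bigrams
bigramTokenizer("this is a short example sentence")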
unigram <- as.data.frame(as.matrix(removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = unigramTokenizer)), 0.9999)))
unigram_freq <- sort(rowSums(unigram), decreasing = TRUE)
gram1 <- data.frame(word = names(unigram_freq), freq = unigram_freq)
gram1[1:20,]
## word freq
## just just 2492
## will will 2276
## like like 2236
## one one 2153
## can can 1895
## get get 1816
## time time 1583
## good good 1528
## now now 1465
## love love 1433
## know know 1429
## day day 1386
## new new 1300
## see see 1202
## dont dont 1182
## people people 1170
## back back 1160
## think think 1057
## great great 1036
## make make 1009
bigram <- as.matrix(removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer)), 0.9999))
bigram_freq <- sort(rowSums(bigram), decreasing = TRUE)
gram2 <- data.frame(word = names(bigram_freq), freq = bigram_freq)
gram2[1:20,]
## word freq
## right now right now 228
## cant wait cant wait 182
## last night last night 156
## feel like feel like 122
## dont know dont know 121
## looking forward looking forward 118
## im going im going 117
## good morning good morning 102
## looks like looks like 94
## happy birthday happy birthday 90
## make sure make sure 85
## im sure im sure 80
## new york new york 80
## can get can get 78
## first time first time 78
## let know let know 78
## just got just got 77
## years ago years ago 77
## good luck good luck 76
## last week last week 72
trigram <- as.matrix(removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer)), 0.9999))
trigram_freq <- sort(rowSums(trigram), decreasing = TRUE)
gram3 <- data.frame(word = names(trigram_freq), freq = trigram_freq)
gram3[1:20,]
## word freq
## happy mothers day happy mothers day 30
## cant wait see cant wait see 25
## happy new year happy new year 19
## let us know let us know 18
## im pretty sure im pretty sure 14
## cant wait get cant wait get 12
## dont even know dont even know 12
## new york city new york city 12
## looking forward seeing looking forward seeing 11
## feel like im feel like im 10
## new years eve new years eve 10
## ive ever seen ive ever seen 9
## new york times new york times 9
## cinco de mayo cinco de mayo 8
## come see us come see us 8
## dont know im dont know im 8
## happy valentines day happy valentines day 8
## im looking forward im looking forward 8
## cant wait til cant wait til 7
## just wanted say just wanted say 7
ggplot(gram1[1:20, ], aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "green", high = "orange") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Unigrams", x = "Words", y = "Frequencies")
ggplot(gram2[1:20, ], aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Bigrams", x = "Words", y = "Frequencies")
ggplot(gram3[1:20, ], aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "brown", high = "pink") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Trigrams", x = "Words", y = "Frequencies")
The next step of the Capstone Project is to build the word prediction algorithm and deploy it as a Shiny application. A first sketch of the prediction step is given below.
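As a first sketch of that algorithm, a simple backoff lookup over the n-gram tables built above could look like the function below. This is an assumption about the eventual approach rather than the final implementation; the predict_next_word name and the input cleaning are illustrative, and a production version would need preprocessing that is consistent with the corpus (for example, stopwords were removed above, so they should also be removed from the user's input).
# Minimal backoff sketch: try the trigram table, then the bigram table,
# then fall back to the most frequent unigram. gram1/gram2/gram3 are the
# frequency tables built above (already sorted by decreasing frequency).
predict_next_word <- function(phrase) {
  tokens <- unlist(strsplit(gsub("[^a-z ]+", " ", tolower(phrase)), "\\s+"))
  tokens <- tokens[tokens != ""]
  n <- length(tokens)
  if (n >= 2) {
    prefix <- paste(tokens[(n - 1):n], collapse = " ")
    hits <- gram3[startsWith(as.character(gram3$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  if (n >= 1) {
    hits <- gram2[startsWith(as.character(gram2$word), paste0(tokens[n], " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  as.character(gram1$word[1])  # most frequent unigram as the last resort
}
predict_next_word("happy new")  # with the trigram counts above this should return "year"
A Shiny front end would then only need to pass the user's text to this function and display the suggested word.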