The goal of this milestone report is to show that we have become familiar with the SwiftKey text data and that we are on track to build our word prediction algorithm.
library(stringi)
library(tm)
library(RWeka)
library(ggplot2)
Set the working directory and download the data from: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
setwd("C:/Users/asus/Desktop/Data Science Capstone")
if (!file.exists("./swiftkey_data")) {
  dir.create("./swiftkey_data")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("./swiftkey_data/Coursera-SwiftKey.zip")) {
  download.file(Url, destfile = "./swiftkey_data/Coursera-SwiftKey.zip", mode = "wb")
}
if (!file.exists("./swiftkey_data/final")) {
  unzip(zipfile = "./swiftkey_data/Coursera-SwiftKey.zip", exdir = "./swiftkey_data")
}
setwd("C:/Users/asus/Desktop/Data Science Capstone/swiftkey_data/final/en_US")
blogs <- readLines("en_US.blogs.txt", encoding= "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding= "UTF-8", skipNul = TRUE)
## Warning in readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE):
## incomplete final line found on 'en_US.news.txt'
twitter <- readLines("en_US.twitter.txt", encoding= "UTF-8", skipNul = TRUE)
Some general statistics for each of the three files are given below.
stri_stats_general(blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539
stri_stats_latex(blogs)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 162464653 9 42636700 37570839 3
## Envirs
## 0
stri_stats_general(news)
## Lines LinesNEmpty Chars CharsNWhite
## 77259 77259 15639408 13072698
stri_stats_latex(news)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 12476453 0 3096618 2651432 0
## Envirs
## 0
stri_stats_general(twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
stri_stats_latex(twitter)
## CharsWord CharsCmdEnvir CharsWhite Words Cmds
## 125570778 3032 35958529 30451170 963
## Envirs
## 0
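For an easier side-by-side comparison, these counts can also be gathered into a single summary table. The short sketch below is illustrative (the file_stats name is not part of the original analysis) and uses only the objects already loaded above.
# Illustrative summary of the three loaded files
file_stats <- data.frame(
  source = c("blogs", "news", "twitter"),
  lines  = sapply(list(blogs, news, twitter), length),
  words  = sapply(list(blogs, news, twitter), function(x) sum(stri_count_words(x)))
)
file_stats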
The data set is very large, so a 1% random sample of each file is taken for this exploratory analysis.
set.seed(1234)
blogs <- sample(blogs, length(blogs) * 0.01)
twitter <- sample(twitter, length(twitter) * 0.01)
news <- sample(news, length(news) * 0.01)
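As a quick sanity check (an illustrative step, not required for the analysis), the sizes of the three samples can be confirmed to be roughly 1% of the originals:
# Number of lines retained in each sample
sapply(list(blogs = blogs, news = news, twitter = twitter), length)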
The sampled data is then cleaned: the text is converted to lower case, and numbers, punctuation, English stopwords, and extra whitespace are removed.
corpus <- VCorpus(VectorSource(c(blogs, twitter, news)))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
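To spot-check the cleaning, the content of a few documents can be inspected (an illustrative step; output omitted here):
# Confirm lower case and removal of numbers, punctuation and stopwords
lapply(corpus[1:3], as.character)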
unigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
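Before building the term-document matrices, the tokenizers can be tried on a short example string (an illustrative call, not part of the original analysis):
# Split a sample sentence into bigrams
bigramTokenizer("this is a short example sentence")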
unigram <- as.data.frame(as.matrix(removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = unigramTokenizer)), 0.9999)))
unigram_freq <- sort(rowSums(unigram), decreasing = TRUE)
gram1 <- data.frame(word = names(unigram_freq), freq = unigram_freq)
gram1[1:20,]
## word freq
## just just 2492
## will will 2276
## like like 2236
## one one 2153
## can can 1895
## get get 1816
## time time 1583
## good good 1528
## now now 1465
## love love 1433
## know know 1429
## day day 1386
## new new 1300
## see see 1202
## dont dont 1182
## people people 1170
## back back 1160
## think think 1057
## great great 1036
## make make 1009
bigram <- as.matrix(removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = bigramTokenizer)), 0.9999))
bigram_freq <- sort(rowSums(bigram), decreasing = TRUE)
gram2 <- data.frame(word = names(bigram_freq), freq = bigram_freq)
gram2[1:20,]
## word freq
## right now right now 228
## cant wait cant wait 182
## last night last night 156
## feel like feel like 122
## dont know dont know 121
## looking forward looking forward 118
## im going im going 117
## good morning good morning 102
## looks like looks like 94
## happy birthday happy birthday 90
## make sure make sure 85
## im sure im sure 80
## new york new york 80
## can get can get 78
## first time first time 78
## let know let know 78
## just got just got 77
## years ago years ago 77
## good luck good luck 76
## last week last week 72
trigram <- as.matrix(removeSparseTerms(
  TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer)), 0.9999))
trigram_freq <- sort(rowSums(trigram), decreasing = TRUE)
gram3 <- data.frame(word = names(trigram_freq), freq = trigram_freq)
gram3[1:20,]
## word freq
## happy mothers day happy mothers day 30
## cant wait see cant wait see 25
## happy new year happy new year 19
## let us know let us know 18
## im pretty sure im pretty sure 14
## cant wait get cant wait get 12
## dont even know dont even know 12
## new york city new york city 12
## looking forward seeing looking forward seeing 11
## feel like im feel like im 10
## new years eve new years eve 10
## ive ever seen ive ever seen 9
## new york times new york times 9
## cinco de mayo cinco de mayo 8
## come see us come see us 8
## dont know im dont know im 8
## happy valentines day happy valentines day 8
## im looking forward im looking forward 8
## cant wait til cant wait til 7
## just wanted say just wanted say 7
ggplot(gram1[1:20, ], aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "green", high = "orange") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Unigrams", x = "Words", y = "Frequencies")
ggplot(gram2[1:20, ], aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "blue", high = "red") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Bigrams", x = "Words", y = "Frequencies")
ggplot(gram3[1:20, ], aes(x = reorder(word, -freq), y = freq, fill = freq)) +
  geom_bar(stat = "identity") +
  scale_fill_gradient(low = "brown", high = "pink") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Trigrams", x = "Words", y = "Frequencies")
The next step of the Capstone Project is to build the word prediction algorithm and deploy it as a Shiny application. A first sketch of the prediction step is given below.
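As a first sketch of that algorithm, a simple backoff lookup over the n-gram tables built above could look like the function below. This is an assumption about the eventual approach rather than the final implementation; the predict_next_word name and the input cleaning are illustrative, and a production version would need preprocessing that is consistent with the corpus (for example, stopwords were removed above, so they should also be removed from the user's input).
# Minimal backoff sketch: try the trigram table, then the bigram table,
# then fall back to the most frequent unigram. gram1/gram2/gram3 are the
# frequency tables built above (already sorted by decreasing frequency).
predict_next_word <- function(phrase) {
  tokens <- unlist(strsplit(gsub("[^a-z ]+", " ", tolower(phrase)), "\\s+"))
  tokens <- tokens[tokens != ""]
  n <- length(tokens)
  if (n >= 2) {
    prefix <- paste(tokens[(n - 1):n], collapse = " ")
    hits <- gram3[startsWith(as.character(gram3$word), paste0(prefix, " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  if (n >= 1) {
    hits <- gram2[startsWith(as.character(gram2$word), paste0(tokens[n], " ")), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  as.character(gram1$word[1])  # most frequent unigram as the last resort
}
predict_next_word("happy new")  # with the trigram counts above this should return "year"
A Shiny front end would then only need to pass the user's text to this function and display the suggested word.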