Hossam Saad

Aug 17, 2020

Introduction

This is the Milestone Report for the Coursera Data Science Capstone project. This milestone report describes the major features of the training data with our exploratory data analysis and summarizes our plans for creating the predictive model.

Load Data

The data was downloaded from this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

Load Required Packages

library(dplyr)
library(plyr)
library(downloader)
library(readr)
library(readtext)
library(stringi)
# Download (once), unpack (once) and read the three en_US SwiftKey corpora.
# All paths are built from a single base directory instead of changing the
# working directory, so the rest of the script is unaffected by where it runs.
data_dir <- "C:/Users/user/Desktop/MyProjDataSet"
zip_path <- file.path(data_dir, "Coursera-SwiftKey.zip")
en_us_dir <- file.path(data_dir, "final", "en_US")

if (!dir.exists(data_dir)) {
  dir.create(data_dir, recursive = TRUE)
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"

if (!file.exists(zip_path)) {
  # mode = "wb" keeps the archive byte-exact on Windows.
  download.file(Url, destfile = zip_path, mode = "wb")
}

if (!dir.exists(file.path(data_dir, "final"))) {
  unzip(zipfile = zip_path, exdir = data_dir)
}

# Read every line of a text file through a binary connection.
#
# A text-mode read on Windows stops at an embedded Ctrl-Z (0x1A) byte, which
# silently truncates the file; opening the connection with "rb" avoids that.
# skipNul = TRUE drops embedded NUL bytes instead of cutting the line short
# at the first one (which is what happens silently when warn = FALSE).
# NOTE(review): the news line count recorded further down in this report
# (77259) is consistent with such a truncated read -- re-knit the report to
# refresh the recorded figures.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}

twitter <- read_corpus_lines(file.path(en_us_dir, "en_US.twitter.txt"))
blogs <- read_corpus_lines(file.path(en_us_dir, "en_US.blogs.txt"))
news <- read_corpus_lines(file.path(en_us_dir, "en_US.news.txt"))
# Number of lines read from each source (echoed into the report).
length(twitter)
## [1] 2360148
length(news)
## [1] 77259
length(blogs)
## [1] 899288

# Total characters per source: per-line nchar() summed over all lines.
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
nchar_blogs <- sum(nchar(blogs))

# Word totals: the fourth statistic reported by stri_stats_latex()
# (the "Words" entry per the stringi documentation).
news_words <- stri_stats_latex(news)[4]
twitter_words <- stri_stats_latex(twitter)[4]
blogs_words <- stri_stats_latex(blogs)[4]

# One row per source, in the order news / twitter / blogs.
# sum() strips the "Words" name from each single-element count.
# data.frame() rewrites "Num of character" to the syntactic column name
# Num.of.character.
summary_sources <- c("news", "twitter", "blogs")
summary_lines <- c(length(news), length(twitter), length(blogs))
summary_words <- c(sum(news_words), sum(twitter_words), sum(blogs_words))
summary_chars <- c(nchar_news, nchar_twitter, nchar_blogs)
data.frame(
  "File_Name" = summary_sources,
  "num_lines" = summary_lines,
  "num_words" = summary_words,
  "Num of character" = summary_chars
)
##   File_Name num_lines num_words Num.of.character
## 1      news     77259   2651432         15639408
## 2   twitter   2360148  30451128        162096031
## 3     blogs    899288  37570839        206824505

Exploratory Data Analysis

In this part we run some exploratory data analysis: merging the data, sampling the data, and cleaning the data.

# Fix the RNG state so the 1% sample drawn below is reproducible.
set.seed(2020)

# Convert each source to plain ASCII. The input is declared as latin1 and
# sub = "" makes iconv() drop whatever cannot be represented in ASCII.
strip_non_ascii <- function(lines) iconv(lines, "latin1", "ASCII", sub = "")
blogs_c <- strip_non_ascii(blogs)
news_c <- strip_non_ascii(news)
twitter_c <- strip_non_ascii(twitter)
library(tm)
library(NLP)

# Draw a 1% random sample of the lines of one source.
draw_one_percent <- function(lines) sample(lines, length(lines) * 0.01)

# Merge the three samples; the draws happen in the order twitter, blogs,
# news, which matters for reproducing the same sample from the seed.
Data_Sample <- c(
  draw_one_percent(twitter_c),
  draw_one_percent(blogs_c),
  draw_one_percent(news_c)
)
# Build a tm corpus from the sampled lines (one document per line) and
# normalise it: lower-case, strip URLs and @mentions, drop English stop
# words, punctuation, digits and repeated whitespace.
corpus <- VCorpus(VectorSource(Data_Sample))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Lower-case first so the URL pattern below also catches "HTTP://...".
# Wrapping tolower() in content_transformer() keeps each element a proper
# text document, so no PlainTextDocument re-wrapping is needed afterwards.
corpus <- tm_map(corpus, content_transformer(tolower))

# A URL ends at the next whitespace. The previous greedy "(.*)[.][a-z]+"
# tail could swallow ordinary text up to the last ".word" on the line.
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")

# Twitter handles. A POSIX class is used because "\\s" inside a bracket
# expression is not reliably the whitespace class in R's default regex
# engine, which made the old "@[^\\s]+" match the wrong spans.
corpus <- tm_map(corpus, toSpace, "@[^[:space:]]+")

# Stop words are removed before punctuation so that contractions such as
# "don't" still match their entries in the stop-word list.
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# Flatten the cleaned documents into a one-column data frame for a quick
# look. Each document holds exactly one string (one sampled line), so a
# typed vapply() is safe and yields plain integer row names.
corpusresult <- data.frame(
  text = vapply(corpus, function(doc) content(doc), character(1),
                USE.NAMES = FALSE),
  stringsAsFactors = FALSE
)
head(corpusresult)
##                                                                                       text
## 1                      headed symposium think better way spread love military ccme orlando
## 2              lex tv station reports men arrested ky dialysis clinic trash talk escalated
## 3  watching msnbc twenty minutes can understand fox news consistently twice number viewers
## 4       congratulations barbara hauer mentor public library ohio winning kindle g giveaway
## 5                                      portico learning solutions wishes happiest new year
## 6        spiritual astrology path divine awakening start get natal chart cafeastrologycom
library(RWeka)
library(ggplot2)
# Unigram frequencies: tokenise into single words, keep the terms that
# occur at least 1000 times in the sample, and plot the 15 most frequent.
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)

# Only the frequent-term rows are densified, which keeps the matrix small.
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(
  Word = names(unigramcorpusnum),
  frequency = unigramcorpusnum
)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]

# head() returns at most 15 rows; indexing with [1:15, ] would pad the data
# with all-NA rows whenever fewer than 15 terms reach the threshold.
ggplot(head(unigramcorpussort, 15),
       aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("Red")) +
  labs(title = "Unigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

# Bigram frequencies: tokenise into word pairs, keep the pairs that occur
# at least 80 times in the sample, and plot the 12 most frequent.
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)

# Only the frequent-term rows are densified, which keeps the matrix small.
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(
  Word = names(bigramcorpusnum),
  frequency = bigramcorpusnum
)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]

# head() returns at most 12 rows; indexing with [1:12, ] would pad the data
# with all-NA rows whenever fewer than 12 bigrams reach the threshold.
ggplot(head(bigramcorpussort, 12),
       aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("Blue")) +
  labs(title = "Bigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

# Trigram frequencies: tokenise into word triples, keep the triples that
# occur at least 10 times in the sample, and plot up to the 10 most frequent.
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)

# Only the frequent-term rows are densified, which keeps the matrix small.
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(
  Word = names(trigramcorpusnum),
  frequency = trigramcorpusnum
)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency), ]

# head() returns at most 10 rows. The previous [1:10, ] slice produced
# all-NA rows when fewer than 10 trigrams reached the threshold, which is
# what made ggplot2 warn about removed rows with missing values.
ggplot(head(trigramcorpussort, 10),
       aes(x = reorder(Word, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = I("Green")) +
  labs(title = "Trigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))

Next Steps

I started with an exploratory analysis. The next steps of this capstone project are to finalize our predictive algorithm and to deploy it as a Shiny app. The Shiny app will consist of a simple user interface that allows a user to enter text into a single text box.