Hossam Saad
Aug 17, 2020
This is the Milestone Report for the Coursera Data Science Capstone project. This milestone report describes the major features of the training data with our exploratory data analysis and summarizes our plans for creating the predictive model.
The data was downloaded from this link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
Load Required Packages
library(dplyr)
library(plyr)
library(downloader)
library(readr)
library(readtext)
library(stringi)
# Download and unpack the SwiftKey corpus. Every step is skipped when its
# result is already on disk, so re-running the report repeats no work.
data_dir <- "C:/Users/user/Desktop/MyProjDataSet"
zip_path <- file.path(data_dir, "Coursera-SwiftKey.zip")
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!dir.exists(data_dir)) {
  # recursive = TRUE also creates any missing parent directory.
  dir.create(data_dir, recursive = TRUE)
}
if (!file.exists(zip_path)) {
  tryCatch(
    download.file(Url, destfile = zip_path, mode = "wb"),
    error = function(e) {
      # Remove a partial archive so the next run does not mistake it for a
      # complete download, then re-signal the original error.
      unlink(zip_path)
      stop(e)
    }
  )
}
if (!dir.exists(file.path(data_dir, "final"))) {
  unzip(zipfile = zip_path, exdir = data_dir)
}
# Read the three English source files into character vectors, one element per
# line of text.
#
# Files are opened through a binary ("rb") connection: in text mode on Windows,
# reading stops at an embedded Ctrl-Z (0x1A) byte and silently truncates the
# result. skipNul = TRUE drops embedded NUL bytes instead of letting them cut
# the current line short. Full paths replace setwd() so the working directory
# of the session is left untouched.
en_us_dir <- "C:/Users/user/Desktop/MyProjDataSet/final/en_US"
read_corpus_file <- function(file_name) {
  con <- file(file.path(en_us_dir, file_name), open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
}
twitter <- read_corpus_file("en_US.twitter.txt")
blogs <- read_corpus_file("en_US.blogs.txt")
news <- read_corpus_file("en_US.news.txt")
# Number of lines (vector elements) held for each source.
# The "##" lines appear to be the output recorded when the report was last run.
length(twitter)
## [1] 2360148
# NOTE(review): text-mode readLines() on Windows stops at an embedded Ctrl-Z
# (0x1A) byte, so this news count may be truncated -- confirm by re-reading the
# file through a binary connection.
length(news)
## [1] 77259
length(blogs)
## [1] 899288
# Word totals per source: the "Words" entry (4th element) of the statistics
# vector returned by stri_stats_latex().
news_words <- stri_stats_latex(news)["Words"]
twitter_words <- stri_stats_latex(twitter)["Words"]
blogs_words <- stri_stats_latex(blogs)["Words"]

# Character totals per source, summed over all lines.
nchar_news <- sum(nchar(news))
nchar_twitter <- sum(nchar(twitter))
nchar_blogs <- sum(nchar(blogs))

# Summary table, printed as the value of the expression. data.frame() rewrites
# the last column name to the syntactic form Num.of.character.
data.frame(
  "File_Name" = c("news", "twitter", "blogs"),
  "num_lines" = lengths(list(news, twitter, blogs)),
  # unname() keeps the "Words" labels from being used as row names.
  "num_words" = unname(c(news_words, twitter_words, blogs_words)),
  "Num of character" = c(nchar_news, nchar_twitter, nchar_blogs)
)
## File_Name num_lines num_words Num.of.character
## 1 news 77259 2651432 15639408
## 2 twitter 2360148 30451128 162096031
## 3 blogs 899288 37570839 206824505
In this part we run some exploratory data analysis: data merging, data sampling and data cleaning.
# Fix the RNG state so the samples drawn below are reproducible.
set.seed(2020)

# Convert each source to ASCII; sub = "" deletes whatever cannot be converted.
# NOTE(review): the conversion declares the input as "latin1" although the
# files were read with encoding = "UTF-8" -- kept exactly as in the original.
to_ascii <- function(lines) {
  iconv(lines, from = "latin1", to = "ASCII", sub = "")
}
blogs_c <- to_ascii(blogs)
news_c <- to_ascii(news)
twitter_c <- to_ascii(twitter)

library(tm)
library(NLP)

# Draw 1% of the lines of a source, without replacement.
take_one_percent <- function(lines) {
  sample(lines, length(lines) * 0.01)
}

# Merge the three samples; the draw order (twitter, blogs, news) determines
# which lines the seed selects, so it must not change.
Data_Sample <- c(
  take_one_percent(twitter_c),
  take_one_percent(blogs_c),
  take_one_percent(news_c)
)
# Build a tm corpus from the sampled lines and normalise the text.
corpus <- VCorpus(VectorSource(Data_Sample))

# Transformer that replaces every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# URLs: match up to the next whitespace only. The previous greedy "(.*)" ran to
# the last ".letters" in the document and deleted ordinary text after the URL.
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")

# Twitter handles: "\\s" inside a bracket expression is not a whitespace class
# in R's default regex engine (it means a literal backslash or "s"), so the
# POSIX class [:space:] is used instead.
corpus <- tm_map(corpus, toSpace, "@[^[:space:]]+")

# tolower is a plain character function and must be wrapped so the documents
# stay text documents; this makes a later PlainTextDocument conversion
# unnecessary.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)

# One row per document holding the cleaned text. vapply() guarantees a
# character vector; unname() keeps document ids out of the row names.
corpusresult <- data.frame(
  text = unname(vapply(
    corpus,
    function(doc) paste(as.character(doc), collapse = " "),
    character(1)
  )),
  stringsAsFactors = FALSE
)
head(corpusresult)
## text
## 1 headed symposium think better way spread love military ccme orlando
## 2 lex tv station reports men arrested ky dialysis clinic trash talk escalated
## 3 watching msnbc twenty minutes can understand fox news consistently twice number viewers
## 4 congratulations barbara hauer mentor public library ohio winning kindle g giveaway
## 5 portico learning solutions wishes happiest new year
## 6 spiritual astrology path divine awakening start get natal chart cafeastrologycom
library(RWeka)
library(ggplot2)
# Tokenizer that emits single-word terms (n-grams with n = 1).
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))

# Term-document matrix of unigrams, restricted to terms seen at least 1000
# times, then collapsed to one total frequency per term.
unigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = unigram))
unigramcorpus <- findFreqTerms(unigramtab, lowfreq = 1000)
unigramcorpusnum <- rowSums(as.matrix(unigramtab[unigramcorpus, ]))
unigramcorpustab <- data.frame(
  Word = names(unigramcorpusnum),
  frequency = unigramcorpusnum
)
unigramcorpussort <- unigramcorpustab[order(-unigramcorpustab$frequency), ]

# Bar chart of the most frequent unigrams. head() returns at most 15 rows,
# whereas indexing with 1:15 pads the data with NA rows when fewer terms pass
# the frequency threshold.
ggplot(
  head(unigramcorpussort, 15),
  aes(x = reorder(Word, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = I("Red")) +
  labs(title = "Unigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))
# Tokenizer that emits two-word terms (n-grams with n = 2).
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))

# Term-document matrix of bigrams, restricted to terms seen at least 80 times,
# then collapsed to one total frequency per term.
bigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = bigram))
bigramcorpus <- findFreqTerms(bigramtab, lowfreq = 80)
bigramcorpusnum <- rowSums(as.matrix(bigramtab[bigramcorpus, ]))
bigramcorpustab <- data.frame(
  Word = names(bigramcorpusnum),
  frequency = bigramcorpusnum
)
bigramcorpussort <- bigramcorpustab[order(-bigramcorpustab$frequency), ]

# Bar chart of the most frequent bigrams. head() returns at most 12 rows,
# whereas indexing with 1:12 pads the data with NA rows when fewer terms pass
# the frequency threshold.
ggplot(
  head(bigramcorpussort, 12),
  aes(x = reorder(Word, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = I("Blue")) +
  labs(title = "Bigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))
# Tokenizer that emits three-word terms (n-grams with n = 3).
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Term-document matrix of trigrams, restricted to terms seen at least 10 times,
# then collapsed to one total frequency per term.
trigramtab <- TermDocumentMatrix(corpus, control = list(tokenize = trigram))
trigramcorpus <- findFreqTerms(trigramtab, lowfreq = 10)
trigramcorpusnum <- rowSums(as.matrix(trigramtab[trigramcorpus, ]))
trigramcorpustab <- data.frame(
  Word = names(trigramcorpusnum),
  frequency = trigramcorpusnum
)
trigramcorpussort <- trigramcorpustab[order(-trigramcorpustab$frequency), ]

# Bar chart of the most frequent trigrams. head() returns at most 10 rows.
# Indexing with 1:10 padded the data with NA rows when fewer trigrams reached
# the threshold, which produced a "Removed rows containing missing values"
# warning from ggplot.
ggplot(
  head(trigramcorpussort, 10),
  aes(x = reorder(Word, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = I("Green")) +
  labs(title = "Trigrams", x = "Most Words", y = "Frequency") +
  theme(axis.text.x = element_text(angle = 60))
This report covers the initial exploratory analysis. The next steps of this capstone project are to finalize our predictive algorithm and deploy it as a Shiny app. The Shiny app will consist of a simple user interface that allows a user to enter text into a single text box.