The goals of this report are to: 1. Demonstrate that the data has been downloaded and successfully loaded. 2. Provide summary statistics about the data sets. 3. Report notable patterns in the data sets. 4. Outline the plan for building a prediction algorithm and Shiny app.
The data was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and unzipped into the local working directory, as sketched below.
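A minimal sketch of the download step (this assumes the English files sit under final/en_US/ inside the archive, which is why junkpaths drops the folder prefix so the later read paths match):

# Download the SwiftKey archive once and extract the three English files into
# the working directory (the final/en_US/ path inside the zip is an assumption)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
unzip("Coursera-SwiftKey.zip",
      files = c("final/en_US/en_US.twitter.txt",
                "final/en_US/en_US.blogs.txt",
                "final/en_US/en_US.news.txt"),
      junkpaths = TRUE, exdir = ".")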
# Load the three English-language corpora; passing the file path lets readLines
# open and close the connection itself
twitter <- readLines("./en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blog    <- readLines("./en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("./en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)
# File size in megabytes and line count for each source
twitter_size <- round(file.info("en_US.twitter.txt")$size / 1024^2)
blog_size    <- round(file.info("en_US.blogs.txt")$size / 1024^2)
news_size    <- round(file.info("en_US.news.txt")$size / 1024^2)
twitter_line <- length(twitter)
blog_line    <- length(blog)
news_line    <- length(news)
library(ngram)
# Word count per source via ngram::wordcount()
twitter_word <- wordcount(twitter)
blog_word    <- wordcount(blog)
news_word    <- wordcount(news)
# Combine size, line count and word count into a summary table
Size_MBs   <- c(twitter_size, blog_size, news_size)
Line_Count <- c(twitter_line, blog_line, news_line)
Word_Count <- c(twitter_word, blog_word, news_word)
summ <- data.frame(Size_MBs, Line_Count, Word_Count)
row.names(summ) <- c("Twitter", "Blog", "News")
summ
## Size_MBs Line_Count Word_Count
## Twitter 159 2360148 30373583
## Blog 200 899288 37334131
## News 196 77259 2643969
Since these three data sets are very large, I take only a small random sample of each for further analysis.
set.seed(2019)
# Take a 5% random sample of each source
sampletwitter <- sample(twitter, size = length(twitter) * 0.05)
sampleblog    <- sample(blog, size = length(blog) * 0.05)
samplenews    <- sample(news, size = length(news) * 0.05)
# Pool the three samples (c() concatenates the vectors) and draw 10,000 lines
datasample  <- sample(c(sampletwitter, sampleblog, samplenews), size = 10000, replace = TRUE)
datasample1 <- datasample[!is.na(datasample)]
Now I can build a corpus and clean the data. In this step, all text is converted to lowercase, punctuation and numbers are removed, and extra whitespace is stripped.
library(tm)
# Drop non-ASCII characters, then build and clean the corpus
datasample <- iconv(datasample1, "UTF-8", "ASCII", sub = "")
corpus <- Corpus(VectorSource(datasample))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
In this section, I tokenize the text into unigrams, bigrams and trigrams, and then summarize the frequency of each term.
library(dplyr)
library(RWeka)
# NGramTokenizer() works on character vectors, so collapse the corpus back to text first
corpus_text <- sapply(corpus, as.character)
unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))
bigram  <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))
trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))
# Count each term and sort by descending frequency
unigram_df <- data.frame(table(unigram)) %>% arrange(desc(Freq))
bigram_df  <- data.frame(table(bigram))  %>% arrange(desc(Freq))
trigram_df <- data.frame(table(trigram)) %>% arrange(desc(Freq))
unigram_15 <- head(unigram_df, 15)
bigram_15  <- head(bigram_df, 15)
trigram_15 <- head(trigram_df, 15)
Here I plot the top 15 highest-frequency terms in each case, along with word clouds of the terms whose frequency is at least 200 for unigrams and bigrams and at least 30 for trigrams.
library(ggplot2)
library(wordcloud)
library(RColorBrewer)
ggplot(unigram_15, aes(x = unigram, y = Freq, fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Word", y = "Frequency", title = "Figure 1. Top 15 Highest Frequency Words") +
  theme(legend.position = "none")
pal2 <- brewer.pal(8,"Dark2")
layout(matrix(c(1, 2), nrow=2), heights=c(1,4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Figure 2. Wordcloud of words with minimum frequency of 200")
wordcloud(unigram_df$unigram,unigram_df$Freq,min.freq=200, colors=pal2, random.order = F,scale = c(8,0.5))
ggplot(bigram_15, aes(x = bigram, y = Freq, fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Bigram", y = "Frequency", title = "Figure 3. Top 15 Highest Frequency Bigrams") +
  theme(legend.position = "none")
pal2 <- brewer.pal(8,"Dark2")
layout(matrix(c(1, 2), nrow=2), heights=c(1,4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Figure 4. Wordcloud of bigrams with minimum frequency of 200")
wordcloud(bigram_df$bigram,bigram_df$Freq,min.freq=200, colors=pal2, random.order = F,scale = c(8,0.5))
ggplot(trigram_15, aes(x = trigram, y = Freq, fill = Freq)) +
  geom_bar(stat = "identity") +
  labs(x = "Trigram", y = "Frequency", title = "Figure 5. Top 15 Highest Frequency Trigrams") +
  theme(legend.position = "none")
pal2 <- brewer.pal(8,"Dark2")
layout(matrix(c(1, 2), nrow=2), heights=c(1,4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Figure 6. Wordcloud of trigrams with minimum frequency of 30")
wordcloud(trigram_df$trigram,trigram_df$Freq,min.freq=30, colors=pal2, random.order = F,scale = c(8,0.5))
In this report, I split the sampled text into unigrams, bigrams and trigrams, and counted the frequency of each term in the three cases. The next step is to use these n-gram frequency relationships to train a model that predicts the next word, and to wrap that model in a Shiny app. If you have any suggestions about the model, please leave them in the comments. Thank you.
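As a rough illustration of the planned approach, here is a minimal sketch of a frequency-based next-word lookup with simple backoff, built on the unigram_df, bigram_df and trigram_df tables from above. The helper predict_next() and its exact behavior are my own assumptions for illustration, not the final prediction model.

# Minimal sketch of a next-word lookup with backoff (assumes the n-gram
# frequency tables from above; predict_next() is a hypothetical helper)
predict_next <- function(phrase, n = 3) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  # Try the trigram table first: match on the last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits <- trigram_df[grepl(paste0("^", prefix, " "), trigram_df$trigram), ]
    if (nrow(hits) > 0) {
      return(head(sapply(strsplit(as.character(hits$trigram), " "), tail, 1), n))
    }
  }
  # Back off to the bigram table: match on the last word only
  prefix <- tail(words, 1)
  hits <- bigram_df[grepl(paste0("^", prefix, " "), bigram_df$bigram), ]
  if (nrow(hits) > 0) {
    return(head(sapply(strsplit(as.character(hits$bigram), " "), tail, 1), n))
  }
  # Last resort: the most frequent unigrams overall
  head(as.character(unigram_df$unigram), n)
}

# Example: top 3 candidate words following "thanks for"
predict_next("thanks for", n = 3)

Because the tables are already sorted by descending frequency, the first matches returned are the most frequent continuations; the final model will need smoothing and a more memory-efficient data structure than these raw frequency tables.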