Introduction

The goals of this report are to: 1. Demonstrate that the data has been downloaded and successfully loaded. 2. Provide summary statistics about the data sets. 3. Report interesting patterns in the data sets. 4. Get feedback on the plans for building the prediction algorithm and Shiny app.

Data download and loading

The data is downloaded from the following link: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip and unzipped into the local working directory.
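A minimal sketch of that download step is shown below. Only the URL comes from the report itself; the destination file name and the flattening of the archive's internal folder structure are assumptions, so adjust the paths as needed.

# Download and unzip the data set if it is not already in the working directory
# (the internal folder layout of the zip is assumed here; adjust paths as needed)
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
if (!file.exists("./en_US.twitter.txt")) {
  unzip("Coursera-SwiftKey.zip", junkpaths = TRUE)
}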

# Read the three English text files; skipNul avoids warnings about embedded NUL characters
twitter <- readLines("./en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
blog    <- readLines("./en_US.blogs.txt",   encoding = "UTF-8", skipNul = TRUE)
news    <- readLines("./en_US.news.txt",    encoding = "UTF-8", skipNul = TRUE)

Exploratory data analysis

File Size

# File sizes in megabytes
twitter_size <- round(file.info("en_US.twitter.txt")$size / 1024^2)
blog_size    <- round(file.info("en_US.blogs.txt")$size / 1024^2)
news_size    <- round(file.info("en_US.news.txt")$size / 1024^2)

Line Count

# Number of lines in each file
twitter_line <- length(twitter)
blog_line    <- length(blog)
news_line    <- length(news)

Word Count

# Word counts using the ngram package
library(ngram)
twitter_word <- wordcount(twitter)
blog_word    <- wordcount(blog)
news_word    <- wordcount(news)

Summary

# Combine the statistics into one summary table
Size_MBs   <- c(twitter_size, blog_size, news_size)
Line_Count <- c(twitter_line, blog_line, news_line)
Word_Count <- c(twitter_word, blog_word, news_word)
summ <- data.frame(Size_MBs, Line_Count, Word_Count)
row.names(summ) <- c("Twitter", "Blog", "News")
summ
##         Size_MBs Line_Count Word_Count
## Twitter      159    2360148   30373583
## Blog         200     899288   37334131
## News         196      77259    2643969

Data Sampling and Cleaning

Since the three data sets are very large, I take only a small portion of each as a sample for further analysis.

set.seed(2019)
# Take a 5% random sample of each source
sampletwitter <- sample(twitter, size = length(twitter) * 0.05)
sampleblog    <- sample(blog, size = length(blog) * 0.05)
samplenews    <- sample(news, size = length(news) * 0.05)
# Combine the three samples and draw 10,000 lines for the analysis
datasample  <- sample(c(sampletwitter, sampleblog, samplenews), size = 10000)
datasample1 <- datasample[!is.na(datasample)]

Now I can generate a corpus and clean the data. In this step, all words are converted to lowercase, and punctuation, numbers, and extra whitespace are removed.

library(tm)
# Strip non-ASCII characters, then build and clean the corpus
datasample <- iconv(datasample1, "UTF-8", "ASCII", sub = "")
corpus <- Corpus(VectorSource(datasample))
corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase
corpus <- tm_map(corpus, removePunctuation)             # drop punctuation
corpus <- tm_map(corpus, removeNumbers)                 # drop numbers
corpus <- tm_map(corpus, stripWhitespace)               # collapse whitespace

Word Tokenization

In this section, I tokenize the text into unigrams, bigrams, and trigrams, and then summarize the frequency of each term.

library(dplyr)
library(RWeka)

# Collapse the cleaned corpus back into a character vector for tokenization
corpus_text <- sapply(corpus, as.character)

unigram <- NGramTokenizer(corpus_text, Weka_control(min = 1, max = 1))
bigram  <- NGramTokenizer(corpus_text, Weka_control(min = 2, max = 2))
trigram <- NGramTokenizer(corpus_text, Weka_control(min = 3, max = 3))

# Count and sort term frequencies for each n-gram size
unigram_df <- data.frame(table(unigram)) %>% arrange(desc(Freq))
bigram_df  <- data.frame(table(bigram))  %>% arrange(desc(Freq))
trigram_df <- data.frame(table(trigram)) %>% arrange(desc(Freq))

# Keep the 15 most frequent terms of each type
unigram_15 <- head(unigram_df, 15)
bigram_15  <- head(bigram_df, 15)
trigram_15 <- head(trigram_df, 15)

Here I plot the 15 highest-frequency terms in each case, together with a word cloud of the terms above a minimum frequency (200 for unigrams and bigrams, 30 for trigrams).

Unigram

library(ggplot2)
library(wordcloud)
library(RColorBrewer)

ggplot(unigram_15, aes(x = reorder(unigram, -Freq), y = Freq, fill = Freq)) +
        geom_bar(stat = "identity") +
        labs(x = "Word", y = "Frequency", title = "Figure 1. Top 15 Highest Frequency Words") +
        theme(legend.position = "none")

pal2 <- brewer.pal(8,"Dark2")
layout(matrix(c(1, 2), nrow=2), heights=c(1,4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Figure 2. Wordcloud of words with minimum frequency of 200")
wordcloud(unigram_df$unigram,unigram_df$Freq,min.freq=200, colors=pal2, random.order = F,scale = c(8,0.5))

Bigram

ggplot(bigram_15, aes(x = reorder(bigram, -Freq), y = Freq, fill = Freq)) +
        geom_bar(stat = "identity") +
        labs(x = "Bigram", y = "Frequency", title = "Figure 3. Top 15 Highest Frequency Bigrams") +
        theme(legend.position = "none")

pal2 <- brewer.pal(8,"Dark2")
layout(matrix(c(1, 2), nrow=2), heights=c(1,4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Figure 4. Wordcloud of bigram with minimum frequency of 200")
wordcloud(bigram_df$bigram,bigram_df$Freq,min.freq=200, colors=pal2, random.order = F,scale = c(8,0.5))

Trigram

ggplot(trigram_15, aes(x = reorder(trigram, -Freq), y = Freq, fill = Freq)) +
        geom_bar(stat = "identity") +
        labs(x = "Trigram", y = "Frequency", title = "Figure 5. Top 15 Highest Frequency Trigrams") +
        theme(legend.position = "none")

pal2 <- brewer.pal(8,"Dark2")
layout(matrix(c(1, 2), nrow=2), heights=c(1,4))
par(mar=rep(0, 4))
plot.new()
text(x=0.5, y=0.5, "Figure 6. Wordcloud of trigram with minimum frequency of 30")
wordcloud(trigram_df$trigram,trigram_df$Freq,min.freq=30, colors=pal2, random.order = F,scale = c(8,0.5))

Summary and Future Plans

In this report, I split the sentences in the sample into unigrams, bigrams, and trigrams, and count the frequency of each term in the three cases. The next step is to use these word-frequency relationships to train a model that predicts the next word, and to deploy it in a Shiny app. If you have any suggestions about the model, please leave them in the comments. Thank you.
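To illustrate the planned approach, below is a minimal sketch of a next-word predictor that backs off from the trigram table to the bigram table built above. The function name predict_next_word and the simple backoff strategy are assumptions for illustration only, not the final algorithm.

# Minimal next-word sketch using simple backoff over the frequency tables above
# (illustrative only; not the final prediction algorithm)
predict_next_word <- function(phrase, n = 3) {
  phrase <- tolower(phrase)
  words  <- unlist(strsplit(phrase, "\\s+"))

  # Try trigrams first: match the last two words of the input
  if (length(words) >= 2) {
    prefix <- paste(tail(words, 2), collapse = " ")
    hits   <- trigram_df[grepl(paste0("^", prefix, " "), trigram_df$trigram), ]
    if (nrow(hits) > 0) {
      return(head(sub(".* ", "", as.character(hits$trigram)), n))
    }
  }

  # Back off to bigrams: match the last word only
  prefix <- tail(words, 1)
  hits   <- bigram_df[grepl(paste0("^", prefix, " "), bigram_df$bigram), ]
  if (nrow(hits) > 0) {
    return(head(sub(".* ", "", as.character(hits$bigram)), n))
  }

  # Fall back to the most frequent unigrams
  head(as.character(unigram_df$unigram), n)
}

predict_next_word("thanks for the")

Because the frequency tables are already sorted by descending frequency, the first matches returned are the most likely candidates under this simple counting model.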