This is the milestone report for the Data Science Capstone. The report describes the properties of the training data (Twitter, blog, and news text in English) and summarizes a plan for building a word-prediction model.
The data can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
# Download file
setwd("C:/Users/Apple/Desktop/RStudio Tour/assignment/assignment10")
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip")
  unzip("Coursera-SwiftKey.zip")
}
# Read files
library(tidyverse)
library(tidytext)
twitter <- read_lines(file = "final/en_US/en_US.twitter.txt",
                      skip_empty_rows = TRUE)
blog <- read_lines(file = "final/en_US/en_US.blogs.txt",
                   skip_empty_rows = TRUE)
news <- read_lines(file = "final/en_US/en_US.news.txt",
                   skip_empty_rows = TRUE)
## Process Data
Summarise the data:
library(stringi)
data.frame(
  File = c("blog", "news", "twitter"),
  t(rbind(sapply(list(blog, news, twitter), stri_stats_general),
          TotalWords = sapply(list(blog, news, twitter), stri_stats_latex)[4, ]))
)
## File Lines LinesNEmpty Chars CharsNWhite TotalWords
## 1 blog 899288 899288 206824382 170389539 37570839
## 2 news 1010242 1010242 203223154 169860866 34494539
## 3 twitter 2360148 2360148 162096031 134082634 30451128
Since the data sets are quite large, we randomly sample 1% of each file to demonstrate the data cleaning and exploratory analysis.
set.seed(12345)
twitter <- tibble(twitter)
blog <- tibble(blog)
news <- tibble(news)
twitter <- sample_frac(twitter, size = 0.01)
blog <- sample_frac(blog, size = 0.01)
news <- sample_frac(news, size = 0.01)
This part counts the words in each file, shows the results in basic plots, and finds the highest-frequency words in each dataset.
# Process data: tokenize each sample and tag it with its source
data("stop_words")
twitter_word <- twitter %>%
  unnest_tokens(word, twitter) %>%
  mutate(name = "twitter")
blog_word <- blog %>%
  unnest_tokens(word, blog) %>%
  mutate(name = "blog")
news_word <- news %>%
  unnest_tokens(word, news) %>%
  mutate(name = "news")
word_data <- rbind(twitter_word, blog_word, news_word)
word_data <- word_data %>%
  filter(!grepl("[0-9]", word)) %>%        # drop tokens containing digits
  anti_join(stop_words, by = "word") %>%   # drop stop words
  group_by(name) %>%
  count(word) %>%
  arrange(desc(n), .by_group = TRUE) %>%
  slice(1:10)                              # keep the top 10 per source
p1 <- word_data %>%
  filter(name == "twitter") %>%
  ggplot(aes(n, reorder(word, n))) +
  geom_col() +
  labs(y = NULL, title = "twitter")
p2 <- word_data %>%
  filter(name == "blog") %>%
  ggplot(aes(n, reorder(word, n))) +
  geom_col() +
  labs(y = NULL, title = "blog")
p3 <- word_data %>%
  filter(name == "news") %>%
  ggplot(aes(n, reorder(word, n))) +
  geom_col() +
  labs(y = NULL, title = "news")
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
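The three plot blocks above differ only in the source name, and the bigram plots below repeat the same pattern. As a sketch of a possible refactor, a small helper could express the pattern once (plot_top is my own name, not from any package):

# Sketch: plot counts against a label column, largest at the top
plot_top <- function(counts, label_col, plot_title) {
  ggplot(counts, aes(n, reorder(.data[[label_col]], n))) +
    geom_col() +
    labs(y = NULL, title = plot_title)
}
# Example: reproduce the twitter panel
p1 <- plot_top(filter(word_data, name == "twitter"), "word", "twitter")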
Calculate the frequencies of 2-grams (bigrams):
p1 <- twitter %>%
  unnest_tokens(gram, twitter, token = "ngrams", n = 2) %>%
  filter(!is.na(gram)) %>%   # lines with fewer than two words yield NA
  separate(gram, sep = " ",
           into = c("word1", "word2"), remove = FALSE) %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !grepl("[0-9]", word1),   # drop all numeric tokens, not just 1-9
         !grepl("[0-9]", word2)) %>%
  count(gram, sort = TRUE) %>%
  slice(1:10) %>%
  ggplot(aes(y = reorder(gram, n), x = n)) +
  geom_col() +
  labs(y = NULL, title = "twitter")
p2 <- news %>%
  unnest_tokens(gram, news, token = "ngrams", n = 2) %>%
  filter(!is.na(gram)) %>%
  separate(gram, sep = " ",
           into = c("word1", "word2"), remove = FALSE) %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !grepl("[0-9]", word1),
         !grepl("[0-9]", word2)) %>%
  count(gram, sort = TRUE) %>%
  slice(1:10) %>%
  ggplot(aes(y = reorder(gram, n), x = n)) +
  geom_col() +
  labs(y = NULL, title = "news")
p3 <- blog %>%
  unnest_tokens(gram, blog, token = "ngrams", n = 2) %>%
  filter(!is.na(gram)) %>%
  separate(gram, sep = " ",
           into = c("word1", "word2"), remove = FALSE) %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !grepl("[0-9]", word1),
         !grepl("[0-9]", word2)) %>%
  count(gram, sort = TRUE) %>%
  slice(1:10) %>%
  ggplot(aes(y = reorder(gram, n), x = n)) +
  geom_col() +
  labs(y = NULL, title = "blog")
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
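The same duplication appears in the three bigram pipelines. A sketch of a counting helper that could pair with plot_top above (count_top_bigrams is my own name; it assumes each sampled tibble holds its text in a single column, as created earlier):

# Sketch: top-10 bigrams for any of the three samples
count_top_bigrams <- function(df) {
  names(df)[1] <- "text"   # each sample tibble has one text column
  df %>%
    unnest_tokens(gram, text, token = "ngrams", n = 2) %>%
    filter(!is.na(gram), !grepl("[0-9]", gram)) %>%
    separate(gram, into = c("word1", "word2"), sep = " ", remove = FALSE) %>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word) %>%
    count(gram, sort = TRUE) %>%
    slice(1:10)
}
# Example: rebuild a panel in one line
p1 <- plot_top(count_top_bigrams(twitter), "gram", "twitter")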
Based on this analysis, I plan to use n-gram data frames to calculate the probability of the next word occurring given the previous words. For the Shiny app, the plan is to create a simple interface where the user can enter a string of text; the prediction model will then return a list of suggested next words.
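As a minimal sketch of this idea, assuming bigram counts are enough for a first pass (bigram_prob and predict_next are illustrative names, not final code), the conditional probability P(word2 | word1) can be estimated by maximum likelihood and prediction reduces to a filtered lookup:

# Estimate P(word2 | word1) from bigram counts in the twitter sample
bigram_prob <- twitter %>%
  unnest_tokens(gram, twitter, token = "ngrams", n = 2) %>%
  filter(!is.na(gram)) %>%
  separate(gram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2) %>%
  group_by(word1) %>%
  mutate(prob = n / sum(n)) %>%   # MLE: count of pair / count of first word
  ungroup()

# Given the last word typed, return the k most likely next words
predict_next <- function(last_word, k = 3) {
  bigram_prob %>%
    filter(word1 == tolower(last_word)) %>%  # tokens were lowercased
    arrange(desc(prob)) %>%
    slice(1:k) %>%
    pull(word2)
}
predict_next("happy")

A real model would extend this to trigrams and higher orders, with backoff or smoothing to handle unseen word combinations; the Shiny app would call predict_next on the final word(s) of the user's input.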