Introduction

This is the milestone report for the Data Science Capstone. It describes the properties of the training data (blog, news, and Twitter text) and outlines a plan for building a word prediction model.

Getting Data

The data can be downloaded from the Coursera-SwiftKey dataset link used in the code below.

#Download file
setwd("C:/Users/Apple/Desktop/RStudio Tour/assignment/assignment10")

if(!file.exists("Coursera-SwiftKey.zip")){
    download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                  destfile = "Coursera-SwiftKey.zip")
    unzip("Coursera-SwiftKey.zip")
}

#Read file
library(tidyverse)
library(tidytext)

twitter<-read_lines(file="final/en_US/en_US.twitter.txt",
                    skip_empty_rows = TRUE)

blog<-read_lines(file="final/en_US/en_US.blogs.txt",
                    skip_empty_rows = TRUE)

news<-read_lines(file="final/en_US/en_US.news.txt",
                    skip_empty_rows = TRUE)

Process Data

Summarise the basic statistics of each data set:

library(stringi)

#Line, character, and word counts for each file
data.frame(
        File = c("blog","news","twitter"), 
        t(rbind(sapply(list(blog,news,twitter),stri_stats_general),
                TotalWords = sapply(list(blog,news,twitter),stri_stats_latex)[4,]))  #row 4 = word count
)
##      File   Lines LinesNEmpty     Chars CharsNWhite TotalWords
## 1    blog  899288      899288 206824382   170389539   37570839
## 2    news 1010242     1010242 203223154   169860866   34494539
## 3 twitter 2360148     2360148 162096031   134082634   30451128

Since the data sets are quite large, we randomly sample 1% of each one to demonstrate the data cleaning and exploratory analysis.

set.seed(12345)
twitter<-tibble(twitter)
blog<-tibble(blog)
news<-tibble(news)

twitter<-sample_frac(twitter,size=0.01)
blog<-sample_frac(blog,size=0.01)
news<-sample_frac(news,size=0.01)

Exploratory Data Analysis

This part counts the words in each file and shows the results in basic plots.

First, find the highest-frequency words in each data set.

#process data: tokenize each corpus into words and tag its source
data("stop_words")
twitter_word<- twitter %>%
    unnest_tokens(word,twitter)%>%
    mutate(name="twitter")

blog_word<-blog%>%
    unnest_tokens(word,blog)%>%
    mutate(name="blog")

news_word<-news%>%
    unnest_tokens(word,news)%>%
    mutate(name="news")

word_data<-rbind(twitter_word,blog_word,news_word)

#remove tokens containing digits and stop words, then keep the ten most frequent words per source
word_data<-word_data %>%
    filter(!grepl("[0-9]",word))%>%
    anti_join(stop_words,by="word")%>%
    group_by(name)%>%
    count(word)%>%
    arrange(desc(n),.by_group=TRUE)%>%
    slice(1:10)

p1<-word_data%>%
    filter(name=='twitter')%>%
    ggplot(aes(n,reorder(word,n)))+
    geom_col()+
    labs(y=NULL,title = "twitter")

p2<-word_data%>%
    filter(name=='blog')%>%
    ggplot(aes(n,reorder(word,n)))+
    geom_col()+
    labs(y=NULL,title = "blog")

p3<-word_data%>%
    filter(name=='news')%>%
    ggplot(aes(n,reorder(word,n)))+
    geom_col()+
    labs(y=NULL,title = "news")

gridExtra::grid.arrange(p1,p2,p3,ncol=3)  

Next, calculate the frequency of 2-grams (bigrams) in each data set.

p1<-twitter%>%
    unnest_tokens(gram,twitter,token="ngrams",n=2)%>%
    separate(gram,sep = " ",
             into = c("word1","word2"),remove = FALSE)%>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word1%in%c(1:9),
           !word2%in%c(1:9))%>%
    select(gram)%>%
    filter(complete.cases(gram)==TRUE)%>%
    count(gram,sort=TRUE)%>%
    arrange(desc(n))%>%
    slice(1:10)%>%
    ggplot(aes(y=reorder(gram,n),x=n))+
    geom_col()+
    labs(y=NULL,title="twitter")

p2<-news%>%
    unnest_tokens(gram,news,token="ngrams",n=2)%>%
    separate(gram,sep = " ",
             into = c("word1","word2"),remove = FALSE)%>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word1%in%c(1:9),
           !word2%in%c(1:9))%>%
    select(gram)%>%
    filter(complete.cases(gram)==TRUE)%>%
    count(gram,sort=TRUE)%>%
    arrange(desc(n))%>%
    slice(1:10)%>%
    ggplot(aes(y=reorder(gram,n),x=n))+
    geom_col()+
    labs(y=NULL,title="news")

p3<-blog%>%
    unnest_tokens(gram,blog,token="ngrams",n=2)%>%
    separate(gram,sep = " ",
             into = c("word1","word2"),remove = FALSE)%>%
    filter(!word1 %in% stop_words$word,
           !word2 %in% stop_words$word,
           !word1%in%c(1:9),
           !word2%in%c(1:9))%>%
    select(gram)%>%
    filter(complete.cases(gram)==TRUE)%>%
    count(gram,sort=TRUE)%>%
    arrange(desc(n))%>%
    slice(1:10)%>%
    ggplot(aes(y=reorder(gram,n),x=n))+
    geom_col()+
    labs(y=NULL,title="blog")

gridExtra::grid.arrange(p1,p2,p3,ncol=3)

Next Step

Based on this analysis, I plan to build n-gram data frames and use them to calculate the probability of the next word given the preceding words. For the Shiny app, the plan is a simple interface where the user enters a string of text and the prediction model returns a list of suggested next words.
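
As a first sketch of that idea (not the final model), the snippet below builds a bigram probability table from the sampled corpora with tidytext and uses it to suggest candidate next words. The names bigram_prob and predict_next_word are introduced here only for illustration, and the example phrase is arbitrary.

#Sketch: bigram table with conditional probabilities P(word2 | word1)
bigram_prob <- bind_rows(tibble(text = twitter$twitter),
                         tibble(text = blog$blog),
                         tibble(text = news$news)) %>%
    unnest_tokens(gram, text, token = "ngrams", n = 2) %>%
    filter(!is.na(gram)) %>%
    separate(gram, into = c("word1", "word2"), sep = " ") %>%
    count(word1, word2, sort = TRUE) %>%
    group_by(word1) %>%
    mutate(prob = n / sum(n)) %>%   #probability of word2 given word1
    ungroup()

#Hypothetical helper: suggest the most likely words after the last word typed
predict_next_word <- function(input, table = bigram_prob, top = 5) {
    last_word <- tail(str_split(str_to_lower(input), "\\s+")[[1]], 1)
    table %>%
        filter(word1 == last_word) %>%
        arrange(desc(prob)) %>%
        slice_head(n = top) %>%
        pull(word2)
}

predict_next_word("I love the")

A fuller version could extend this to trigrams and back off to shorter n-grams when a longer context is not found in the table.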