Introduction

This report is part of the Coursera data science capstone project. It describes word statistics for the three corpora (blogs, news, twitter).

1. Loading add-on packages

library(stringi)
library(ggplot2)

2. Download the data

# Remote location of the SwiftKey archive and the local file it is saved as.
file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dest_file <- "Coursera-SwiftKey.zip"

# Fetch the archive only when it is not already on disk, so re-running the
# report does not repeat the download. mode = "wb" requests a binary transfer
# explicitly, which is what a zip archive needs on every platform.
if (!file.exists(dest_file)) {
  download.file(file_url, dest_file, mode = "wb")
}

3. Unzipping and loading the data

# Paths of the three en_US corpus files inside the extracted archive.
corpus_paths <- c(
  blogs = "final/en_US/en_US.blogs.txt",
  news = "final/en_US/en_US.news.txt",
  twitter = "final/en_US/en_US.twitter.txt"
)

# Extract the archive only when at least one corpus file is missing.
if (!all(file.exists(corpus_paths))) {
  unzip(dest_file)
}

# Read one corpus file into a character vector, one element per line.
#
# The file is opened in binary mode and read as UTF-8; skipNul = TRUE makes
# readLines() skip embedded NUL bytes. The connection is closed by on.exit(),
# so it is released even when readLines() fails.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

en_blogs <- read_corpus(corpus_paths[["blogs"]])
en_news <- read_corpus(corpus_paths[["news"]])
en_twitter <- read_corpus(corpus_paths[["twitter"]])

4. Describing the data size, lines and characters

blogs

# Size of the blogs file on disk, converted from bytes to MiB.
file.size("final/en_US/en_US.blogs.txt") / 1024^2
## [1] 200.4242
# Line and character counts for the blogs corpus.
stri_stats_general(en_blogs)
##       Lines LinesNEmpty       Chars CharsNWhite 
##      899288      899288   206824382   170389539

news

# Size of the news file on disk, converted from bytes to MiB.
file.size("final/en_US/en_US.news.txt") / 1024^2
## [1] 196.2775
# Line and character counts for the news corpus.
stri_stats_general(en_news)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     1010242     1010242   203223154   169860866

twitter

# Size of the twitter file on disk, converted from bytes to MiB.
file.size("final/en_US/en_US.twitter.txt") / 1024^2
## [1] 159.3641
# Line and character counts for the twitter corpus.
stri_stats_general(en_twitter)
##       Lines LinesNEmpty       Chars CharsNWhite 
##     2360148     2360148   162096241   134082806

5. Summarize and plot the data

# Number of words in every line of each corpus.
words_en_blogs <- stri_count_words(en_blogs)
words_en_news <- stri_count_words(en_news)
words_en_twitter <- stri_count_words(en_twitter)

# Long-format table: one row per line of text, labelled with its corpus.
all <- data.frame(
  dataset = rep(
    c("blogs", "news", "twitter"),
    times = c(
      length(words_en_blogs),
      length(words_en_news),
      length(words_en_twitter)
    )
  ),
  value = c(words_en_blogs, words_en_news, words_en_twitter)
)

# Overlaid, semi-transparent histograms of words per line. The x axis is
# limited to 0-300 words, so longer lines are left out of the plot.
g <- ggplot(all, aes(x = value, fill = dataset))
g +
  geom_histogram(alpha = 0.5, position = "identity") +
  xlim(0, 300) +
  labs(x = "words", y = "counts")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1946 rows containing non-finite values (stat_bin).

6. Summary