This report is part of the Coursera Data Science Capstone project. It describes word statistics for the three corpora (blogs, news, twitter).
library(stringi)
library(ggplot2)
# Location of the capstone data set and the local name it is saved under.
file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dest_file <- "Coursera-SwiftKey.zip"

# Fetch the archive only when it is not already on disk, so re-running the
# report does not download it again. mode = "wb" requests a binary transfer
# explicitly, which a zip archive needs to arrive intact on Windows.
if (!file.exists(dest_file)) {
  download.file(file_url, dest_file, mode = "wb")
}

# Extract into the working directory; the corpora used below are read from
# final/en_US/.
unzip(dest_file)
# Read all lines of a text file, marking them as UTF-8.
#
# path: path of the file to read.
# Returns a character vector with one element per line.
#
# The file is opened in binary mode ("rb") and embedded NUL bytes are
# skipped. on.exit() closes the connection even if readLines() fails, so no
# connection is left open on error.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# The three English corpora, one character vector per source.
en_blogs <- read_corpus_lines("final/en_US/en_US.blogs.txt")
en_news <- read_corpus_lines("final/en_US/en_US.news.txt")
en_twitter <- read_corpus_lines("final/en_US/en_US.twitter.txt")
# For each corpus: size of the raw file on disk in MiB (bytes / 2^20),
# followed by stringi's line and character summary of the lines in memory.
# The "##" lines are the output recorded when the report was generated.

# Blogs
file.size("final/en_US/en_US.blogs.txt") / 2^20
## [1] 200.4242
stringi::stri_stats_general(en_blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539

# News
file.size("final/en_US/en_US.news.txt") / 2^20
## [1] 196.2775
stringi::stri_stats_general(en_news)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866

# Twitter
file.size("final/en_US/en_US.twitter.txt") / 2^20
## [1] 159.3641
stringi::stri_stats_general(en_twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
# Number of words in each line, computed separately per corpus.
words_en_blogs <- stringi::stri_count_words(en_blogs)
words_en_news <- stringi::stri_count_words(en_news)
words_en_twitter <- stringi::stri_count_words(en_twitter)

# Long-format table for plotting: one row per line of text, labelled with the
# corpus it came from. Each label is repeated once per line of its corpus, in
# the same order the counts are concatenated.
all <- data.frame(
  dataset = rep(
    c("blogs", "news", "twitter"),
    times = c(
      length(words_en_blogs),
      length(words_en_news),
      length(words_en_twitter)
    )
  ),
  value = c(words_en_blogs, words_en_news, words_en_twitter)
)
# Overlaid, semi-transparent histograms of words per line, one per corpus.
g <- ggplot(all, aes(x = value, fill = dataset))

# The bin width is set explicitly instead of relying on the default of 30
# bins, which ggplot2 itself flags with a "Pick better value" message.
# boundary = 0 aligns the bin edges with the axis limits, so the 10-word bins
# tile the 0-300 range exactly.
g + geom_histogram(alpha = 0.5, position = "identity",
                   binwidth = 10, boundary = 0) +
  xlim(0, 300) + xlab("words") + ylab("counts")
# Lines whose word count lies outside the 0-300 limits are dropped with a
# warning; the original run reported 1946 such rows.