This report is part of the Coursera Data Science Capstone project. It describes word statistics for the three English corpora (blogs, news, and Twitter).
library(stringi)
library(ggplot2)
# Fetch and unpack the Coursera SwiftKey corpus into the working directory.
file_url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
dest_file <- "Coursera-SwiftKey.zip"
# Re-use an archive that is already on disk instead of downloading it again
# every time the report is run.
if (!file.exists(dest_file)) {
  # mode = "wb" writes the zip archive in binary mode on every platform.
  download.file(file_url, dest_file, mode = "wb")
}
unzip(dest_file)
# Read every line of a text file into a character vector.
#
# path: path to the file to read.
# Returns the character vector produced by readLines(), with the strings
# marked as UTF-8 and embedded NUL bytes skipped.
#
# The connection is opened in binary mode ("rb") and is closed by on.exit(),
# so it is released even when readLines() signals an error.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

# Load the three English corpora, one element per line of text.
en_blogs <- read_corpus("final/en_US/en_US.blogs.txt")
en_news <- read_corpus("final/en_US/en_US.news.txt")
en_twitter <- read_corpus("final/en_US/en_US.twitter.txt")
# For each corpus: the file size on disk converted from bytes to MiB
# (2^20 bytes), followed by stringi's general line/character statistics.
# The "##" lines are the console output recorded in the original report.

# Blogs
file.size("final/en_US/en_US.blogs.txt") / 2^20
## [1] 200.4242
stri_stats_general(en_blogs)
## Lines LinesNEmpty Chars CharsNWhite
## 899288 899288 206824382 170389539

# News
file.size("final/en_US/en_US.news.txt") / 2^20
## [1] 196.2775
stri_stats_general(en_news)
## Lines LinesNEmpty Chars CharsNWhite
## 1010242 1010242 203223154 169860866

# Twitter
file.size("final/en_US/en_US.twitter.txt") / 2^20
## [1] 159.3641
stri_stats_general(en_twitter)
## Lines LinesNEmpty Chars CharsNWhite
## 2360148 2360148 162096241 134082806
# Number of words on each line of each corpus.
words_en_blogs <- stri_count_words(en_blogs)
words_en_news <- stri_count_words(en_news)
words_en_twitter <- stri_count_words(en_twitter)

# Stack the counts into one long data frame (one row per line of text) with a
# `dataset` label so the histogram can be filled by corpus.
# NOTE: `all` shares its name with base::all(); the name is kept because code
# outside this section may refer to it.
all <- data.frame(
  dataset = rep(
    c("blogs", "news", "twitter"),
    times = c(
      length(words_en_blogs),
      length(words_en_news),
      length(words_en_twitter)
    )
  ),
  value = c(words_en_blogs, words_en_news, words_en_twitter)
)

g <- ggplot(all, aes(x = value, fill = dataset))
# Overlaid, semi-transparent histograms of words per line.
# - binwidth/boundary are set explicitly instead of relying on the default of
#   30 bins, so each bar covers 10 words starting at 0.
# - coord_cartesian() zooms the view to 0-300 words without discarding data;
#   xlim() instead turned longer lines into NA and removed them with a
#   "non-finite values" warning before binning.
g +
  geom_histogram(
    alpha = 0.5,
    position = "identity",
    binwidth = 10,
    boundary = 0
  ) +
  coord_cartesian(xlim = c(0, 300)) +
  xlab("words") +
  ylab("counts")
Each of these data files is between roughly 160 MB and 200 MB in size.
The blogs data has around 900,000 lines and around 200 million characters.
The news data has around 1,000,000 lines and around 200 million characters.
The Twitter data has around 2,360,000 lines and around 160 million characters.
The Twitter corpus has the fewest characters overall, and the spread of its words-per-line distribution is small.
The modal number of words per line is smaller for blogs than for news, but the blogs distribution has a long tail.