# Load the three en_US corpus files, one character-vector element per line.
# `encoding = "UTF-8"` marks the strings read as UTF-8, and `skipNul = TRUE`
# skips embedded NUL bytes in the input.
blogs <- readLines(
  "final/en_US/en_US.blogs.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
news <- readLines(
  "final/en_US/en_US.news.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)
twitter <- readLines(
  "final/en_US/en_US.twitter.txt",
  encoding = "UTF-8",
  skipNul = TRUE
)

# One row per source: number of lines, total word count (via
# stri_count_words()) and total character count (via nchar()).
# NOTE(review): stri_count_words() is presumably provided by a
# library(stringi) call elsewhere in the file -- confirm.
data_summary <- data.frame(
  source = c("Blogs", "News", "Twitter"),
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(
    sum(stri_count_words(blogs)),
    sum(stri_count_words(news)),
    sum(stri_count_words(twitter))
  ),
  characters = c(
    sum(nchar(blogs)),
    sum(nchar(news)),
    sum(nchar(twitter))
  )
)
# Render the per-source summary as a table.
# NOTE(review): kable() and qplot() are presumably provided by library(knitr)
# and library(ggplot2) calls elsewhere in the file -- confirm.
kable(data_summary)

# Word count of every blog line, plotted as a 100-bin histogram.
blog_lengths <- stri_count_words(blogs)

# NOTE(review): qplot() is deprecated in ggplot2 >= 3.4.0; it is kept here so
# the rendered plot is unchanged. Consider migrating to
# ggplot() + geom_histogram() in a follow-up.
qplot(
  blog_lengths,
  bins = 100,
  main = "Blogs Word Count per Line",
  xlab = "Words",
  ylab = "Frequency"
)