# Load the three English text files; each call returns a character vector
# with one element per line of the file.
# warn = FALSE silences readLines() warnings (e.g. a missing final newline).
# skipNul = TRUE drops embedded NUL bytes instead of ending the current line
# at the first NUL, a truncation that warn = FALSE would otherwise hide.
# NOTE(review): paths are relative to the working directory - confirm the
# files live there when the script is run.
blogs <- readLines("en_US.blogs.txt", warn = FALSE, skipNul = TRUE)
news <- readLines("en_US.news.txt", warn = FALSE, skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", warn = FALSE, skipNul = TRUE)

# Basic summaries

# Per-dataset summary table: number of lines, total word count, and the
# length in characters of the longest line.
# A "word" is a maximal run of regex word characters (\w+). The backslash
# must be doubled inside an R string literal; a bare "\w" does not parse.
# str_count() is namespace-qualified so the block does not depend on
# library(stringr) having been attached earlier.
data_summary <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(
    sum(stringr::str_count(blogs, "\\w+")),
    sum(stringr::str_count(news, "\\w+")),
    sum(stringr::str_count(twitter, "\\w+"))
  ),
  Max_Char_Line = c(
    max(nchar(blogs)),
    max(nchar(news)),
    max(nchar(twitter))
  )
)

# Render the summary as a formatted table.
knitr::kable(data_summary)

library(ggplot2)

# Number of characters in each line of the blogs data.
blog_lengths <- nchar(blogs)

# Histogram of those line lengths, using 500-character-wide bins.
# Left as a top-level expression so the plot is printed when the script runs.
ggplot(data.frame(blog_lengths), aes(blog_lengths)) +
  geom_histogram(binwidth = 500) +
  ggtitle("Histogram of Blog Line Lengths") +
  xlab("Characters per Line") +
  ylab("Frequency")