# Load the three English corpora; readLines() returns one character-vector
# element per line of the file.
# encoding = "UTF-8" declares the input strings as UTF-8, and skipNul = TRUE
# skips embedded NUL bytes instead of warning about them.
# NOTE: string literals must use plain ASCII double quotes; typographic
# (curly) quotes are not valid R string delimiters and fail to parse.
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# Size summary with one row per corpus:
#   Lines      - number of elements returned by readLines()
#   Words      - total of stringi's per-line word counts
#   Characters - total of nchar() over all lines
# stri_count_words() is namespace-qualified so this statement works whether
# or not stringi has been attached with library() elsewhere in the file.
summary_df <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Lines = c(length(blogs), length(news), length(twitter)),
  Words = c(
    sum(stringi::stri_count_words(blogs)),
    sum(stringi::stri_count_words(news)),
    sum(stringi::stri_count_words(twitter))
  ),
  Characters = c(
    sum(nchar(blogs)),
    sum(nchar(news)),
    sum(nchar(twitter))
  )
)
# Auto-print the summary table (top-level evaluation of the bare name).
summary_df

# Number of words in each blog line, used as the histogram input.
blog_words <- stringi::stri_count_words(blogs)

# Distribution of per-line word counts for the blogs corpus.
hist(
  blog_words,
  breaks = 50,
  main = "Distribution of Words per Blog Entry",
  xlab = "Words per line"
)