# Download and extract the Coursera SwiftKey dataset if it is not already present
if (!dir.exists("data")) dir.create("data")

if (!file.exists("data/Coursera-SwiftKey.zip")) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  download.file(url, destfile = "data/Coursera-SwiftKey.zip", method = "auto")
}

if (!dir.exists("data/final")) {
  unzip("data/Coursera-SwiftKey.zip", exdir = "data")
}
# Paths to the English-language corpus files
blogs_file   <- "data/final/en_US/en_US.blogs.txt"
news_file    <- "data/final/en_US/en_US.news.txt"
twitter_file <- "data/final/en_US/en_US.twitter.txt"

# Read each corpus as UTF-8, skipping embedded null characters
blogs   <- readLines(blogs_file, encoding = "UTF-8", skipNul = TRUE)
news    <- readLines(news_file, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter_file, encoding = "UTF-8", skipNul = TRUE)

# Quick sanity check on the number of lines read from each file
length(blogs); length(news); length(twitter)

# stringi provides fast word counting via stri_count_words()
library(stringi)
# Aggregate statistics: line, word, and character counts per corpus
line_counts <- c(length(blogs), length(news), length(twitter))
word_counts <- c(sum(stri_count_words(blogs)), sum(stri_count_words(news)), sum(stri_count_words(twitter)))
char_counts <- c(sum(nchar(blogs)), sum(nchar(news)), sum(nchar(twitter)))
summary_table <- data.frame(
  Dataset    = c("Blogs", "News", "Twitter"),
  Lines      = line_counts,
  Words      = word_counts,
  Characters = char_counts
)
summary_table
library(ggplot2)
# Words per line in each corpus, used to compare the distributions
blogs_wc   <- stri_count_words(blogs)
news_wc    <- stri_count_words(news)
twitter_wc <- stri_count_words(twitter)

df <- data.frame(
  words  = c(blogs_wc, news_wc, twitter_wc),
  source = rep(c("Blogs", "News", "Twitter"),
               times = c(length(blogs_wc), length(news_wc), length(twitter_wc)))
)

# Overlaid histograms of words per line; xlim(0, 100) drops the long tail of very long lines
ggplot(df, aes(x = words, fill = source)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  xlim(0, 100) +
  labs(title = "Distribution of Words per Line",
       x = "Words per Line",
       y = "Frequency")
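# Note: building df from every line of all three corpora can be slow and memory-hungry.
# A minimal sketch of one common workaround is shown below, plotting a random sample
# of lines instead. The 10,000-line sample size and the seed are illustrative
# assumptions, not part of the original analysis.
set.seed(1234)
sample_size <- 10000

blogs_sample   <- sample(blogs_wc, min(sample_size, length(blogs_wc)))
news_sample    <- sample(news_wc, min(sample_size, length(news_wc)))
twitter_sample <- sample(twitter_wc, min(sample_size, length(twitter_wc)))

df_sample <- data.frame(
  words  = c(blogs_sample, news_sample, twitter_sample),
  source = rep(c("Blogs", "News", "Twitter"),
               times = c(length(blogs_sample), length(news_sample), length(twitter_sample)))
)

ggplot(df_sample, aes(x = words, fill = source)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  xlim(0, 100) +
  labs(title = "Distribution of Words per Line (sampled)",
       x = "Words per Line",
       y = "Frequency")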