# Create the data folder if it doesn't exist
if (!dir.exists("data")) {
  dir.create("data")
}

# Download the dataset if it has not already been downloaded
if (!file.exists("data/Coursera-SwiftKey.zip")) {
  url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  # mode = "wb" forces a binary transfer so the zip archive is not corrupted
  # on platforms that distinguish between text and binary files.
  download.file(
    url,
    destfile = "data/Coursera-SwiftKey.zip",
    method = "auto",
    mode = "wb"
  )
}

# Unzip the archive if it has not already been extracted
if (!dir.exists("data/final")) {
  unzip("data/Coursera-SwiftKey.zip", exdir = "data")
}

# File paths of the three en_US text files inside the extracted archive
en_us_dir <- file.path("data", "final", "en_US")
blogs_file <- file.path(en_us_dir, "en_US.blogs.txt")
news_file <- file.path(en_us_dir, "en_US.news.txt")
twitter_file <- file.path(en_us_dir, "en_US.twitter.txt")

# Load datasets

# Read every line of a text file as UTF-8, skipping embedded NUL bytes.
# The file is opened through a binary-mode connection; this is intended to
# keep a text-mode read on Windows from stopping early at an embedded
# Ctrl-Z (0x1A) byte. The connection is always closed on exit.
read_text_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_text_lines(blogs_file)
news <- read_text_lines(news_file)
twitter <- read_text_lines(twitter_file)

# Quick check: print the number of lines read from each source
length(blogs)
length(news)
length(twitter)

# stringi provides stri_count_words(), used for the word counts below
library(stringi)

# Line counts, in the order blogs, news, twitter
line_counts <- c(length(blogs), length(news), length(twitter))

# Word counts: total of the per-line word counts, in the order
# blogs, news, twitter
word_counts <- c(
  sum(stri_count_words(blogs)),
  sum(stri_count_words(news)),
  sum(stri_count_words(twitter))
)

# Character counts: total of the per-line character counts, in the order
# blogs, news, twitter
char_counts <- c(
  sum(nchar(blogs)),
  sum(nchar(news)),
  sum(nchar(twitter))
)

# Combine the per-dataset totals into one table, one row per dataset
summary_table <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  Lines = line_counts,
  Words = word_counts,
  Characters = char_counts
)

# Print the table
summary_table

library(ggplot2)

# Histogram of line lengths (in words)

# Number of words in each individual line, per dataset
blogs_wc <- stri_count_words(blogs)
news_wc <- stri_count_words(news)
twitter_wc <- stri_count_words(twitter)

# Long-format data frame: one row per line of text, holding its word count
# and a label for the dataset it came from
df <- data.frame(
  words = c(blogs_wc, news_wc, twitter_wc),
  source = rep(
    c("Blogs", "News", "Twitter"),
    times = c(length(blogs_wc), length(news_wc), length(twitter_wc))
  )
)

# Overlaid, semi-transparent histograms of words per line, one per dataset.
# The x axis is limited to 0-100 words.
ggplot(df, aes(x = words, fill = source)) +
  geom_histogram(bins = 50, alpha = 0.6, position = "identity") +
  xlim(0, 100) +
  labs(
    title = "Distribution of Words per Line",
    x = "Words per Line",
    y = "Frequency"
  )

# Exploratory analysis
#
# Author: Payal Gupta
#
# Date: 2025-10-01