Data Science Capstone R Markdown

Loading the data

# Read every line of a text corpus.
#
# The file is opened as a binary-mode connection ("rb") so that no byte inside
# the file can be interpreted as an end-of-file marker by the platform's
# text-mode layer. readLines() accepts LF, CRLF and CR line endings in either
# mode, so the returned lines are otherwise the same.
#
# path: path to the text file.
# Returns a character vector with one element per line, marked as UTF-8, with
# embedded NUL bytes skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")

# Reading this file by name (text mode) produced the warning
#   "incomplete final line found on 'en_US.news.txt'"
# and returned only 77259 lines from a file of about 196 MB, i.e. the read
# stopped early.
# NOTE(review): most likely an embedded Ctrl-Z (0x1A) byte that Windows
# text-mode connections treat as end of input -- confirm. The news line and
# word counts reported further down came from the truncated read and need to
# be regenerated.
news <- read_corpus("en_US.news.txt")

twitter <- read_corpus("en_US.twitter.txt")

Line Count

# Number of lines held in memory for each corpus (one element per line read).
NROW(blogs)

## [1] 899288

NROW(news)

## [1] 77259

NROW(twitter)

## [1] 2360148

The File Sizes

# On-disk size of each corpus file, converted from bytes to MiB (2^20 bytes).
file.size("en_US.blogs.txt") / 2^20

## [1] 200.4242

file.size("en_US.news.txt") / 2^20

## [1] 196.2775

file.size("en_US.twitter.txt") / 2^20

## [1] 159.3641

Word Count of Each Data Set

# Count whitespace-separated tokens across all lines of a corpus.
#
# lines: character vector, one element per line.
# Returns the total number of tokens as a single number; an empty vector
# gives 0 (lengths() always returns an integer vector, whereas
# sapply(list(), length) returns a list that sum() rejects).
#
# Each line is split on runs of whitespace. A line that starts with whitespace
# yields an empty first token, which is included in the count.
count_words <- function(lines) {
  sum(lengths(strsplit(lines, "\\s+")))
}

blogs_words <- count_words(blogs)
blogs_words

## [1] 37334131

news_words <- count_words(news)
news_words

## [1] 2643969

twitter_words <- count_words(twitter)
twitter_words

## [1] 30373583

Blog dataset histogram

# Distribution of characters per line in the blogs corpus.
# `blogs` is already in memory from the loading step and has not been modified
# since, so the file is not read from disk a second time here.
blogs_length <- nchar(blogs)

hist(
  blogs_length,
  main = "Blog Text Length Distribution",
  xlab = "Character Count"
)

News dataset histogram

# Distribution of characters per line in the news corpus.
# NOTE(review): ylim caps the y-axis at 5000, so any bin holding more lines
# than that is drawn cut off at the top -- confirm this zoom is intended.
news_length <- nchar(news)
hist(
  x = news_length,
  breaks = 50,
  col = "lightgreen",
  main = "News Line Length Distribution",
  xlab = "Characters per line",
  ylim = c(0, 5000)
)

Twitter dataset histogram

# Distribution of characters per line in the twitter corpus.
#
# The y-axis is left to scale automatically: with about 2.36 million lines
# spread over roughly 50 bins, the average bin holds far more than 5000 lines,
# so a fixed ylim of c(0, 5000) cut off nearly every bar.
# NOTE(review): xlim still restricts the view to lines of at most 100
# characters; widen it if longer tweets should be visible -- confirm.
twitter_length <- nchar(twitter)
hist(
  twitter_length,
  main = "Twitter Line Length Distribution",
  xlab = "Characters per line",
  col = "lightpink",
  breaks = 50,
  xlim = c(0, 100)
)

Data Science Capstone R Markdown

Bindu A M

2026-06-15