library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringi)
library(ggplot2)
library(tidyr)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate

1. Introduction

The purpose of this brief report is to:

- Show that the training data has been successfully downloaded and loaded into R
- Present an initial exploratory analysis
- Share important early findings
- Outline the plan for the prediction algorithm and Shiny application

This report is written for a non-technical manager, focusing on insights rather than code details.

# Read one corpus file in full, returning one character element per line.
#
# The file is opened as a binary connection ("rb") instead of letting
# readLines() open the path in text mode. A text-mode read can stop early at
# an embedded control character (notably 0x1A on Windows); the
# "incomplete final line found on 'en_US.news.txt'" warning and the unusually
# small News line count are consistent with that kind of truncation.
#
# path: path to a UTF-8 encoded text file.
# Returns: character vector of lines, marked as UTF-8, embedded NULs skipped.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("en_US.blogs.txt")
news <- read_corpus("en_US.news.txt")
twitter <- read_corpus("en_US.twitter.txt")

3. Basic Summaries

3.1 Line Counts

# Number of lines read from each corpus, one row per dataset.
dataset_labels <- c("Blogs", "News", "Twitter")
n_lines <- c(length(blogs), length(news), length(twitter))
line_counts <- data.frame(Dataset = dataset_labels, Lines = n_lines)

# Render the counts as a compact (not full-width) table.
kable_styling(kable(line_counts), full_width = FALSE)
Dataset Lines
Blogs 899288
News 77259
Twitter 2360148

3.2 Word Counts

# Total number of words in a character vector of lines.
total_words <- function(lines) {
  sum(stri_count_words(lines))
}

# Total word count for each corpus, one row per dataset.
word_counts <- data.frame(
  Dataset = c("Blogs", "News", "Twitter"),
  WordCount = c(total_words(blogs), total_words(news), total_words(twitter))
)

# Render the counts as a compact (not full-width) table.
kable_styling(kable(word_counts), full_width = FALSE)
Dataset WordCount
Blogs 37546806
News 2674561
Twitter 30096690

3.3 Summary Table

# Put line and word counts side by side, matched on the dataset name.
counts_joined <- left_join(line_counts, word_counts, by = "Dataset")

# Add the mean number of words per line, rounded to two decimals.
summary_table <- mutate(
  counts_joined,
  AvgWordsPerLine = round(WordCount / Lines, 2)
)

# Render the combined summary as a compact (not full-width) table.
kable_styling(kable(summary_table), full_width = FALSE)
Dataset Lines WordCount AvgWordsPerLine
Blogs 899288 37546806 41.75
News 77259 2674561 34.62
Twitter 2360148 30096690 12.75

4. Exploratory Plots

To keep processing reasonable, I sample 10,000 lines from each file.

# Fix the RNG state so the same lines are drawn on every run.
set.seed(123)

# Number of lines drawn (without replacement) from each corpus.
n_per_file <- 10000

# The draw order (blogs, news, twitter) is kept as-is: each call advances the
# RNG, so reordering would change which lines are selected.
sample_blogs <- sample(x = blogs, size = n_per_file)
sample_news <- sample(x = news, size = n_per_file)
sample_twitter <- sample(x = twitter, size = n_per_file)

4.1 Histogram of Word Counts per Line

# Words per line for each sampled corpus.
words_blogs <- stri_count_words(sample_blogs)
words_news <- stri_count_words(sample_news)
words_twitter <- stri_count_words(sample_twitter)

# Long-format data for the histogram: one row per sampled line, holding its
# word count and the corpus it came from.
# The labels are repeated according to the actual length of each sample
# instead of a hard-coded 10000, so the label column always lines up with the
# word counts; a mismatched constant could make data.frame() silently recycle
# the shorter column and mislabel rows.
df_hist <- data.frame(
  words = c(words_blogs, words_news, words_twitter),
  dataset = rep(
    c("Blogs", "News", "Twitter"),
    times = c(length(words_blogs), length(words_news), length(words_twitter))
  )
)

# Histogram of words per line, one panel per corpus. Each panel gets its own
# y-axis scale so its distribution remains readable.
hist_plot <- ggplot(data = df_hist, mapping = aes(x = words, fill = dataset)) +
  geom_histogram(bins = 50, alpha = 0.6) +
  facet_wrap(~dataset, scales = "free_y") +
  labs(
    title = "Distribution of Word Counts per Line",
    x = "Words per Line",
    y = "Frequency"
  )

# Display the plot.
hist_plot

## 5. Interesting Early Findings

Twitter lines are much shorter than Blogs or News lines, as expected given Twitter's character limit.

Blogs contain the highest average words per line, suggesting richer sentence structure.

News text is more balanced and formal, which may affect predictive word patterns.

The dataset is large (several million lines in total), so sampling will be required during model building.