library(dplyr)
library(stringi)
library(ggplot2)
library(tidyr)
library(knitr)
library(kableExtra)
library(tm)
The purpose of this brief report is to:

- Show that the training data has been successfully downloaded and loaded into R
- Present initial exploratory analysis
- Share important early findings
- Outline the plan for the prediction algorithm and Shiny application

This report is written for a non-technical manager, focusing on insights rather than code details.
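As a quick sanity check that the downloads completed (a sketch only; it assumes the three files sit in the working directory), the file sizes can be inspected before reading:

# Sanity-check sketch: confirm the three files exist and report their
# sizes in megabytes (assumes they are in the working directory).
files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
round(file.info(files)$size / 1024^2, 1)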
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE):
## incomplete final line found on 'en_US.news.txt'
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
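The warning on en_US.news.txt comes from an embedded control character in that file. One common workaround, sketched below rather than used for the counts in this report, is to read the file through a binary-mode connection (news_full is a hypothetical name):

# Workaround sketch (not used for the counts below): a binary-mode
# connection avoids the early end-of-file caused by an embedded
# control character in en_US.news.txt.
con <- file("en_US.news.txt", open = "rb")
news_full <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)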
line_counts <- data.frame(
Dataset = c("Blogs", "News", "Twitter"),
Lines = c(length(blogs), length(news), length(twitter))
)
kable(line_counts) %>% kable_styling(full_width = FALSE)
| Dataset | Lines |
|---|---|
| Blogs | 899288 |
| News | 77259 |
| Twitter | 2360148 |
word_counts <- data.frame(
Dataset = c("Blogs", "News", "Twitter"),
WordCount = c(
sum(stri_count_words(blogs)),
sum(stri_count_words(news)),
sum(stri_count_words(twitter))
)
)
kable(word_counts) %>% kable_styling(full_width = FALSE)
| Dataset | WordCount |
|---|---|
| Blogs | 37546806 |
| News | 2674561 |
| Twitter | 30096690 |
summary_table <- line_counts %>%
left_join(word_counts, by = "Dataset") %>%
mutate(
AvgWordsPerLine = round(WordCount / Lines, 2)
)
kable(summary_table) %>%
kable_styling(full_width = FALSE)
| Dataset | Lines | WordCount | AvgWordsPerLine |
|---|---|---|---|
| Blogs | 899288 | 37546806 | 41.75 |
| News | 77259 | 2674561 | 34.62 |
| Twitter | 2360148 | 30096690 | 12.75 |
To keep processing time reasonable, I sample 10,000 lines from each file.
set.seed(123)
sample_blogs <- sample(blogs, 10000)
sample_news <- sample(news, 10000)
sample_twitter <- sample(twitter, 10000)
df_hist <- data.frame(
words = c(stri_count_words(sample_blogs),
stri_count_words(sample_news),
stri_count_words(sample_twitter)),
dataset = rep(c("Blogs", "News", "Twitter"),
each = 10000)
)
ggplot(df_hist, aes(x = words, fill = dataset)) +
geom_histogram(bins = 50, alpha = 0.6) +
facet_wrap(~dataset, scales = "free_y") +
labs(
title = "Distribution of Word Counts per Line",
x = "Words per Line",
y = "Frequency"
)
## 5. Interesting Early Findings

- Twitter lines are much shorter than Blogs or News lines, as expected given Twitter's character limit.
- Blogs have the highest average words per line, suggesting richer sentence structure.
- News text is more balanced and formal, which may affect predictive word patterns.
- The combined corpus is large (over 3.3 million lines and 70 million words as loaded), so sampling will be required during model building; a sketch of the planned cleaning step follows below.
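As a rough preview of that next step (an illustrative sketch, not final pipeline code; the specific cleaning choices are assumptions), the three samples can be combined and normalized with tm before building n-gram counts:

# Preprocessing sketch (assumed steps, not final code): combine the
# three samples and normalize the text with tm, loaded above.
sample_all <- c(sample_blogs, sample_news, sample_twitter)
corpus <- VCorpus(VectorSource(sample_all))
corpus <- tm_map(corpus, content_transformer(tolower))  # lowercase everything
corpus <- tm_map(corpus, removePunctuation)             # drop punctuation
corpus <- tm_map(corpus, removeNumbers)                 # drop digits
corpus <- tm_map(corpus, stripWhitespace)               # collapse repeated spaces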